author    Dan McDonald <danmcd@mnx.io>    2022-09-22 16:09:11 -0400
committer Dan McDonald <danmcd@mnx.io>    2022-09-22 16:09:11 -0400
commit    f80fffffb15bd016da0e6de56b81eadf9c20f583
tree      5069bd6eb91f21c2bb62f6acfda9bee339e0d855
parent    d2273fdd36ffaa49593d7aead4b3a85112721aa2
parent    61b899723556289ed14dbcc0792ee6ed33c3edf9
download  illumos-joyent-f80fffffb15bd016da0e6de56b81eadf9c20f583.tar.gz
[illumos-gate merge]
commit 61b899723556289ed14dbcc0792ee6ed33c3edf9
14997 unnecessary jmp in desctbls_asm.s
commit 578d9a563f6dc0f4f8a56447931b36db474172da
14485 bhyve needs better cpuid control
21 files changed, 1702 insertions, 644 deletions
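The userspace surface of 14485 is a pair of per-vCPU ioctls, VM_SET_CPUID and VM_GET_CPUID, plus VM_LEGACY_CPUID for sampling the legacy masking behavior. A minimal sketch of programming them, based only on the structures and flags visible in the diff below (VM creation and error handling elided; the entry values are illustrative placeholders, not part of this commit):

#include <unistd.h>
#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/vmm.h>
#include <sys/vmm_dev.h>
#include <vmmapi.h>

/*
 * Sketch: hand vCPU 0 an explicit, sorted cpuid table with Intel-style
 * fallback, as the inst_emul/cpuid test in this commit does.
 */
static int
set_explicit_cpuid(struct vmctx *ctx)
{
	/* Entries must be sorted from lowest function/index to highest */
	static struct vcpu_cpuid_entry entries[] = {
		{ .vce_function = 0, .vce_eax = 1 },
		{ .vce_function = 1, .vce_eax = 0x100 },
		/* Sub-leaves are only distinguished via VCE_FLAG_MATCH_INDEX */
		{
			.vce_function = 0x80000000,
			.vce_index = 0,
			.vce_flags = VCE_FLAG_MATCH_INDEX,
			.vce_eax = 0x80000000,
		},
	};
	struct vm_vcpu_cpuid_config cfg = {
		.vvcc_vcpuid = 0,
		.vvcc_flags = VCC_FLAG_INTEL_FALLBACK,
		.vvcc_nent = ARRAY_SIZE(entries),
		.vvcc_entries = entries,
	};

	return (ioctl(vm_get_device_fd(ctx), VM_SET_CPUID, &cfg));
}

Reading the configuration back follows the usual two-call sizing pattern: VM_GET_CPUID with too small a vvcc_nent fails with E2BIG and writes the required count back into vvcc_nent, as exercised by vmm/cpuid_ioctl.c below.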
diff --git a/usr/src/pkg/manifests/system-bhyve-tests.p5m b/usr/src/pkg/manifests/system-bhyve-tests.p5m index aa9522d2d6..842a268852 100644 --- a/usr/src/pkg/manifests/system-bhyve-tests.p5m +++ b/usr/src/pkg/manifests/system-bhyve-tests.p5m @@ -31,6 +31,7 @@ dir path=opt/bhyve-tests/runfiles file path=opt/bhyve-tests/runfiles/default.run mode=0444 dir path=opt/bhyve-tests/tests dir path=opt/bhyve-tests/tests/inst_emul +file path=opt/bhyve-tests/tests/inst_emul/cpuid mode=0555 file path=opt/bhyve-tests/tests/inst_emul/rdmsr mode=0555 file path=opt/bhyve-tests/tests/inst_emul/triple_fault mode=0555 file path=opt/bhyve-tests/tests/inst_emul/wrmsr mode=0555 @@ -54,6 +55,7 @@ file path=opt/bhyve-tests/tests/viona/interface_version mode=0555 dir path=opt/bhyve-tests/tests/vmm file path=opt/bhyve-tests/tests/vmm/auto_destruct mode=0555 file path=opt/bhyve-tests/tests/vmm/check_iommu mode=0555 +file path=opt/bhyve-tests/tests/vmm/cpuid_ioctl mode=0555 file path=opt/bhyve-tests/tests/vmm/drv_hold mode=0555 file path=opt/bhyve-tests/tests/vmm/fpu_getset mode=0555 file path=opt/bhyve-tests/tests/vmm/interface_version mode=0555 diff --git a/usr/src/test/bhyve-tests/runfiles/default.run b/usr/src/test/bhyve-tests/runfiles/default.run index 30262dee86..f2d975b52c 100644 --- a/usr/src/test/bhyve-tests/runfiles/default.run +++ b/usr/src/test/bhyve-tests/runfiles/default.run @@ -27,6 +27,7 @@ user = root pre = vmm_drv_test_init tests = [ 'auto_destruct', + 'cpuid_ioctl', 'drv_hold', 'fpu_getset', 'interface_version', @@ -51,6 +52,7 @@ tests = [ [/opt/bhyve-tests/tests/inst_emul] user = root tests = [ + 'cpuid', 'rdmsr', 'wrmsr', 'triple_fault' diff --git a/usr/src/test/bhyve-tests/tests/Makefile.in_guest b/usr/src/test/bhyve-tests/tests/Makefile.in_guest index 7ce2b0d531..2c33f506de 100644 --- a/usr/src/test/bhyve-tests/tests/Makefile.in_guest +++ b/usr/src/test/bhyve-tests/tests/Makefile.in_guest @@ -18,6 +18,10 @@ PAYLOAD_CLEANFILES = payload_start.o \ $(PAYLOADS:%=pobj_%.s) $(PAYLOADS:%=payload_%.o) := AS_CPPFLAGS += -I../common +# Without a real runtime in the payload, the stack protector must be disabled +$(PAYLOADS:%=payload_%.o) := STACKPROTECT = none +# Like our own kernel, prevent the compiler from using the FPU via SIMD +$(PAYLOADS:%=payload_%.o) := CFLAGS64 += $(STAND_FLAGS_64) payload_%: payload_start.o payload_%.o $(LD) -dn -e _start -M ../common/Mapfile.payload -o $@ $^ diff --git a/usr/src/test/bhyve-tests/tests/kdev/payload_utils.h b/usr/src/test/bhyve-tests/tests/common/payload_utils.h index bbb168698a..602a86e6a4 100644 --- a/usr/src/test/bhyve-tests/tests/kdev/payload_utils.h +++ b/usr/src/test/bhyve-tests/tests/common/payload_utils.h @@ -27,5 +27,6 @@ uint16_t inw(uint16_t); uint32_t inl(uint16_t); uint64_t rdmsr(uint32_t); void wrmsr(uint32_t, uint64_t); +void cpuid(uint32_t, uint32_t, uint32_t *); #endif /* _PAYLOAD_UTILS_H_ */ diff --git a/usr/src/test/bhyve-tests/tests/kdev/payload_utils.s b/usr/src/test/bhyve-tests/tests/common/payload_utils.s index e1114f5cb1..2094f36e3b 100644 --- a/usr/src/test/bhyve-tests/tests/kdev/payload_utils.s +++ b/usr/src/test/bhyve-tests/tests/common/payload_utils.s @@ -15,6 +15,7 @@ #include <sys/asm_linkage.h> +/* void outb(uint16_t port, uint8_t val) */ ENTRY(outb) movw %di, %dx movb %sil, %al @@ -22,6 +23,7 @@ ENTRY(outb) ret SET_SIZE(outb) +/* void outw(uint16_t port, uint16_t val) */ ENTRY(outw) movw %di, %dx movw %si, %ax @@ -29,6 +31,7 @@ ENTRY(outw) ret SET_SIZE(outb) +/* void outl(uint16_t port, uint32_t val) */ ENTRY(outl) movw %di, %dx 
movl %esi, %eax @@ -36,24 +39,28 @@ ENTRY(outl) ret SET_SIZE(outl) +/* uint8_t inb(uint16_t port) */ ENTRY(inb) movw %di, %dx inb (%dx) ret SET_SIZE(inb) +/* uint16_t inw(uint16_t port) */ ENTRY(inw) movw %di, %dx inw (%dx) ret SET_SIZE(inw) +/* uint32_t inl(uint16_t port) */ ENTRY(inl) movw %di, %dx inl (%dx) ret SET_SIZE(inl) +/* uint64_t rdmsr(uint32_t msr) */ ENTRY(rdmsr) movl %edi, %ecx rdmsr @@ -62,6 +69,7 @@ ENTRY(rdmsr) ret SET_SIZE(rdmsr) +/* void wrmsr(uint32_t msr, uint64_t val) */ ENTRY(wrmsr) movq %rsi, %rdx shrq $32, %rdx @@ -70,3 +78,18 @@ ENTRY(wrmsr) wrmsr ret SET_SIZE(wrmsr) + +/* void cpuid(uint32_t in_eax, uint32_t in_ecx, uint32_t *out_regs) */ +ENTRY(cpuid) + pushq %rbx + movl %edi, %eax + movl %esi, %ecx + movq %rdx, %r8 + cpuid + movl %eax, (%r8) + movl %ebx, 4(%r8) + movl %ecx, 8(%r8) + movl %edx, 12(%r8) + popq %rbx + ret +SET_SIZE(cpuid) diff --git a/usr/src/test/bhyve-tests/tests/inst_emul/Makefile b/usr/src/test/bhyve-tests/tests/inst_emul/Makefile index c9a675a035..7a7844d65c 100644 --- a/usr/src/test/bhyve-tests/tests/inst_emul/Makefile +++ b/usr/src/test/bhyve-tests/tests/inst_emul/Makefile @@ -17,14 +17,18 @@ include $(SRC)/test/Makefile.com PROG = rdmsr \ wrmsr \ - triple_fault + triple_fault \ + cpuid + +# C-based payloads need additional utils object +CPAYLOADS = cpuid PAYLOADS = $(PROG) include ../Makefile.in_guest COMMON_OBJS = in_guest.o -CLEANFILES = $(COMMON_OBJS) $(PAYLOAD_CLEANFILES) +CLEANFILES = $(COMMON_OBJS) $(PAYLOAD_CLEANFILES) payload_utils.o CLOBBERFILES = $(PROG) ROOTOPTPKG = $(ROOT)/opt/bhyve-tests @@ -44,7 +48,6 @@ CPPFLAGS = -I$(COMPAT)/bhyve -I$(CONTRIB)/bhyve \ ASFLAGS += -P -D__STDC__ -D_ASM -CFLAGS = -m64 $(PROG) := LDLIBS += -lvmmapi all: $(PROG) @@ -71,3 +74,5 @@ $(TESTDIR)/%: % %: %.o $(LINK.c) -o $@ $^ $(LDLIBS) $(POST_PROCESS) + +$(CPAYLOADS:%=payload_%): payload_utils.o diff --git a/usr/src/test/bhyve-tests/tests/inst_emul/cpuid.c b/usr/src/test/bhyve-tests/tests/inst_emul/cpuid.c new file mode 100644 index 0000000000..72ffe966ba --- /dev/null +++ b/usr/src/test/bhyve-tests/tests/inst_emul/cpuid.c @@ -0,0 +1,176 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL.
+ */ + +/* + * Copyright 2022 Oxide Computer Company + */ + +#include <stdio.h> +#include <unistd.h> +#include <stdlib.h> +#include <strings.h> +#include <libgen.h> +#include <assert.h> + +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/debug.h> +#include <sys/vmm.h> +#include <sys/vmm_dev.h> +#include <vmmapi.h> + +#include "in_guest.h" + +static const struct vcpu_cpuid_entry test_entries[] = { + { + .vce_function = 0, + .vce_eax = 5, + .vce_ebx = 0x74737552, + .vce_edx = 0x4f206465, + .vce_ecx = 0x65646978, + }, + /* basic "std" leaf */ + { + .vce_function = 1, + .vce_eax = 0x100, + }, + + /* skip 2 for a hole */ + + /* leaf with index matching */ + { + .vce_function = 3, + .vce_index = 0, + .vce_flags = VCE_FLAG_MATCH_INDEX, + .vce_eax = 0x300, + }, + { + .vce_function = 3, + .vce_index = 1, + .vce_flags = VCE_FLAG_MATCH_INDEX, + .vce_eax = 0x301, + }, + + /* leaf with index matching and a hole */ + { + .vce_function = 4, + .vce_index = 0, + .vce_flags = VCE_FLAG_MATCH_INDEX, + .vce_eax = 0x400, + }, + { + .vce_function = 4, + .vce_index = 2, + .vce_flags = VCE_FLAG_MATCH_INDEX, + .vce_eax = 0x402, + }, + + /* terminal "std" leaf */ + { + .vce_function = 5, + .vce_eax = 5, + .vce_ebx = 5, + .vce_edx = 5, + .vce_ecx = 5, + }, + + /* base "extended" leaf */ + { + .vce_function = 0x80000000, + .vce_eax = 0x80000001, + }, + /* index-match "extended" leaves */ + { + .vce_function = 0x80000001, + .vce_index = 0x0, + .vce_flags = VCE_FLAG_MATCH_INDEX, + .vce_eax = 0x8000, + }, + { + .vce_function = 0x80000001, + .vce_index = 0x1, + .vce_flags = VCE_FLAG_MATCH_INDEX, + .vce_eax = 0x8001, + }, +}; + +int +main(int argc, char *argv[]) +{ + const char *test_suite_name = basename(argv[0]); + struct vmctx *ctx = NULL; + int err; + + ctx = test_initialize(test_suite_name); + + err = test_setup_vcpu(ctx, 0, MEM_LOC_PAYLOAD, MEM_LOC_STACK); + if (err != 0) { + test_fail_errno(err, "Could not initialize vcpu0"); + } + + + /* Start with test data using Intel-style fallback */ + int vmfd = vm_get_device_fd(ctx); + + struct vm_vcpu_cpuid_config cfg = { + .vvcc_vcpuid = 0, + .vvcc_flags = VCC_FLAG_INTEL_FALLBACK, + .vvcc_nent = ARRAY_SIZE(test_entries), + /* We trust the ioctl not to alter this const value */ + .vvcc_entries = (struct vcpu_cpuid_entry *)test_entries, + }; + err = ioctl(vmfd, VM_SET_CPUID, &cfg); + if (err != 0) { + test_fail_errno(err, "ioctl(VM_SET_CPUID) failed"); + } + + struct vm_entry ventry = { 0 }; + struct vm_exit vexit = { 0 }; + + do { + const enum vm_exit_kind kind = + test_run_vcpu(ctx, 0, &ventry, &vexit); + switch (kind) { + case VEK_REENTR: + break; + case VEK_TEST_PASS: + test_pass(); + break; + case VEK_TEST_FAIL: + test_fail_msg("failed result %rip: %x", vexit.rip); + break; + case VEK_UNHANDLED: { + uint32_t val; + if (vexit_match_inout(&vexit, false, IOP_TEST_VALUE, 4, + &val)) { + /* + * The payload has requested switch to AMD-style + * fallback to run the second half of the test. 
+ */ + cfg.vvcc_flags = 0; + err = ioctl(vmfd, VM_SET_CPUID, &cfg); + if (err != 0) { + test_fail_errno(err, + "ioctl(VM_SET_CPUID) failed"); + } + ventry_fulfill_inout(&vexit, &ventry, 0); + } else { + test_fail_vmexit(&vexit); + } + break; + } + + default: + test_fail_vmexit(&vexit); + break; + } + } while (true); +} diff --git a/usr/src/test/bhyve-tests/tests/inst_emul/payload_cpuid.c b/usr/src/test/bhyve-tests/tests/inst_emul/payload_cpuid.c new file mode 100644 index 0000000000..8c55cdbcaf --- /dev/null +++ b/usr/src/test/bhyve-tests/tests/inst_emul/payload_cpuid.c @@ -0,0 +1,169 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2022 Oxide Computer Company + */ + +#include "payload_common.h" +#include "payload_utils.h" + +int +leaf_cmp(const uint32_t *a, const uint32_t *b) +{ + return (a[0] == b[0] && a[1] == b[1] && a[2] == b[2] && a[3] == b[3]); +} + +const uint32_t expected_base[] = { 5, 0x74737552, 0x65646978, 0x4f206465 }; + +struct test_case { + uint32_t func; + uint32_t idx; + uint32_t val_eax; + int fallback; +}; + +const struct test_case cases[] = { + /* basic leaf match */ + { + .func = 1, + .val_eax = 0x100, + }, + /* index matching */ + { + .func = 3, + .idx = 0, + .val_eax = 0x300, + }, + { + .func = 3, + .idx = 1, + .val_eax = 0x301, + }, + /* leaf match with hole */ + { + .func = 4, + .idx = 0, + .val_eax = 0x400, + }, + { + .func = 4, + .idx = 2, + .val_eax = 0x402, + }, + /* last std leaf */ + { + .func = 5, + .val_eax = 0x5, + }, + + /* invalid leaf */ + { + .func = 2, + .val_eax = 0, + }, + /* invalid index */ + { + .func = 3, + .idx = 2, + .val_eax = 0, + }, + { + .func = 4, + .idx = 1, + .val_eax = 0x0, + }, + { + .func = 4, + .idx = 0xffff, + .val_eax = 0x0, + }, + + /* basic extd leaf match */ + { + .func = 0x80000000, + .val_eax = 0x80000001, + }, + /* basic extd index match */ + { + .func = 0x80000001, + .idx = 0, + .val_eax = 0x8000, + }, + { + .func = 0x80000001, + .idx = 1, + .val_eax = 0x8001, + }, + /* zeroed for invalid index */ + { + .func = 0x80000001, + .idx = 5, + .val_eax = 0, + }, + + /* fallback beyond std leaf */ + { + .func = 6, + .fallback = 1, + }, + /* fallback beyond extd leaf */ + { + .func = 0x80000002, + .fallback = 1, + }, +}; +#define NCASES (sizeof (cases) / sizeof (cases[0])) + +void +do_test(int intel_fallback) +{ + uint32_t regs[4]; + uint32_t expected_fallback[4] = { 0 }; + + cpuid(0, 0, regs); + if (!leaf_cmp(regs, expected_base)) { + outb(IOP_TEST_RESULT, TEST_RESULT_FAIL); + } + + if (intel_fallback) { + cpuid(regs[0], 0, expected_fallback); + } + + for (uint_t i = 0; i < NCASES; i++) { + cpuid(cases[i].func, cases[i].idx, regs); + if (cases[i].fallback != 0) { + if (!leaf_cmp(regs, expected_fallback)) { + outb(IOP_TEST_RESULT, TEST_RESULT_FAIL); + } + } else { + if (regs[0] != cases[i].val_eax) { + outb(IOP_TEST_RESULT, TEST_RESULT_FAIL); + } + } + } +} + +void +start(void) +{ + /* Check results expecting Intel-style fallback */ + do_test(1); + + /* Notify userspace component to change fallback style */ + outl(IOP_TEST_VALUE, 0); + + /* Check results expecting AMD-style fallback */ + do_test(0); + + /* If all is well by this point, indicate 
success */ + outb(IOP_TEST_RESULT, TEST_RESULT_PASS); +} diff --git a/usr/src/test/bhyve-tests/tests/vmm/Makefile b/usr/src/test/bhyve-tests/tests/vmm/Makefile index 51f55d0147..cd0aa81268 100644 --- a/usr/src/test/bhyve-tests/tests/vmm/Makefile +++ b/usr/src/test/bhyve-tests/tests/vmm/Makefile @@ -25,7 +25,8 @@ PROG = mem_partial \ interface_version \ check_iommu \ auto_destruct \ - drv_hold + drv_hold \ + cpuid_ioctl SCRIPT = vmm_drv_test_fini vmm_drv_test_init diff --git a/usr/src/test/bhyve-tests/tests/vmm/cpuid_ioctl.c b/usr/src/test/bhyve-tests/tests/vmm/cpuid_ioctl.c new file mode 100644 index 0000000000..21da309372 --- /dev/null +++ b/usr/src/test/bhyve-tests/tests/vmm/cpuid_ioctl.c @@ -0,0 +1,169 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2022 Oxide Computer Company + */ + +#include <stdio.h> +#include <unistd.h> +#include <stdlib.h> +#include <fcntl.h> +#include <libgen.h> +#include <err.h> +#include <errno.h> +#include <strings.h> + +#include <sys/vmm.h> +#include <sys/vmm_dev.h> +#include <vmmapi.h> + +#include "common.h" + +int +main(int argc, char *argv[]) +{ + const char *suite_name = basename(argv[0]); + struct vmctx *ctx; + + ctx = create_test_vm(suite_name); + if (ctx == NULL) { + perror("could not open test VM"); + return (EXIT_FAILURE); + } + int vmfd = vm_get_device_fd(ctx); + + struct vm_vcpu_cpuid_config cfg = { 0 }; + struct vcpu_cpuid_entry *entries = NULL; + + if (ioctl(vmfd, VM_GET_CPUID, &cfg) != 0) { + err(EXIT_FAILURE, "ioctl(VM_GET_CPUID) failed"); + } + if (cfg.vvcc_flags != VCC_FLAG_LEGACY_HANDLING) { + errx(EXIT_FAILURE, + "cpuid handling did not default to legacy-style"); + } + + cfg.vvcc_flags = ~VCC_FLAG_LEGACY_HANDLING; + if (ioctl(vmfd, VM_SET_CPUID, &cfg) == 0) { + errx(EXIT_FAILURE, + "ioctl(VM_SET_CPUID) did not reject invalid flags"); + } + + entries = calloc(VMM_MAX_CPUID_ENTRIES + 1, + sizeof (struct vcpu_cpuid_entry)); + if (entries == NULL) { + errx(EXIT_FAILURE, "could not allocate cpuid entries"); + } + + cfg.vvcc_flags = VCC_FLAG_LEGACY_HANDLING; + cfg.vvcc_nent = 1; + cfg.vvcc_entries = entries; + if (ioctl(vmfd, VM_SET_CPUID, &cfg) == 0) { + errx(EXIT_FAILURE, + "ioctl(VM_SET_CPUID) did not reject entries when " + "legacy-style handling was requested"); + } + + cfg.vvcc_flags = 0; + cfg.vvcc_nent = VMM_MAX_CPUID_ENTRIES + 1; + if (ioctl(vmfd, VM_SET_CPUID, &cfg) == 0) { + errx(EXIT_FAILURE, + "ioctl(VM_SET_CPUID) did not reject excessive entry count"); + } + + cfg.vvcc_nent = 1; + entries[0].vce_flags = ~0; + if (ioctl(vmfd, VM_SET_CPUID, &cfg) == 0) { + errx(EXIT_FAILURE, + "ioctl(VM_SET_CPUID) did not reject invalid entry flags"); + } + entries[0].vce_flags = 0; + + /* Actually set some entries to use for GET_CPUID testing */ + const uint_t valid_entries = (VMM_MAX_CPUID_ENTRIES / 2); + for (uint_t i = 0; i < valid_entries; i++) { + entries[i].vce_function = i; + } + cfg.vvcc_nent = valid_entries; + if (ioctl(vmfd, VM_SET_CPUID, &cfg) != 0) { + err(EXIT_FAILURE, + "ioctl(VM_SET_CPUID) unable to set valid entries"); + } + + /* Try with no entries buffer */ + bzero(&cfg, sizeof (cfg)); + if (ioctl(vmfd, VM_GET_CPUID, &cfg) == 0 || errno != E2BIG) { +
errx(EXIT_FAILURE, + "ioctl(VM_GET_CPUID) did not fail absent buffer"); + } + if (cfg.vvcc_nent != valid_entries) { + errx(EXIT_FAILURE, + "ioctl(VM_GET_CPUID) did not emit entry count " + "(expected %u, got %u)", valid_entries, cfg.vvcc_nent); + } + + /* Try with too-small entries buffer */ + cfg.vvcc_nent = 1; + cfg.vvcc_entries = entries; + bzero(entries, valid_entries * sizeof (struct vcpu_cpuid_entry)); + if (ioctl(vmfd, VM_GET_CPUID, &cfg) == 0 || errno != E2BIG) { + errx(EXIT_FAILURE, + "ioctl(VM_GET_CPUID) did not fail too-small buffer"); + } + if (cfg.vvcc_nent != valid_entries) { + errx(EXIT_FAILURE, + "ioctl(VM_GET_CPUID) did not emit entry count " + "(expected %u, got %u)", valid_entries, cfg.vvcc_nent); + } + + /* Try with adequate entries buffer */ + cfg.vvcc_nent = valid_entries; + if (ioctl(vmfd, VM_GET_CPUID, &cfg) != 0) { + err(EXIT_FAILURE, "ioctl(VM_GET_CPUID) failed"); + } + if (cfg.vvcc_nent != valid_entries) { + errx(EXIT_FAILURE, + "ioctl(VM_GET_CPUID) did not emit entry count " + "(expected %u, got %u)", valid_entries, cfg.vvcc_nent); + } + for (uint_t i = 0; i < valid_entries; i++) { + if (entries[i].vce_function != i) { + errx(EXIT_FAILURE, "unexpected entry contents"); + } + } + + /* + * The legacy handling is simply using the host values with certain + * modifications (masking, etc) applied. The base leaf should be + * exactly the same as we read from the host. + * + * Since a bhyve compat header has an inline-asm cpuid wrapper, use that + * for now for querying the host + */ + struct vm_legacy_cpuid legacy = { 0 }; + if (ioctl(vmfd, VM_LEGACY_CPUID, &legacy) != 0) { + err(EXIT_FAILURE, "ioctl(VM_LEGACY_CPUID) failed"); + } + + uint32_t basic_cpuid[4]; + cpuid_count(0, 0, basic_cpuid); + if (basic_cpuid[0] != legacy.vlc_eax || + basic_cpuid[1] != legacy.vlc_ebx || + basic_cpuid[2] != legacy.vlc_ecx || + basic_cpuid[3] != legacy.vlc_edx) { + errx(EXIT_FAILURE, "legacy cpuid mismatch"); + } + + vm_destroy(ctx); + (void) printf("%s\tPASS\n", suite_name); + return (EXIT_SUCCESS); +} diff --git a/usr/src/uts/intel/io/vmm/Makefile.vmm b/usr/src/uts/intel/io/vmm/Makefile.vmm index d7c26183fc..b6049af51e 100644 --- a/usr/src/uts/intel/io/vmm/Makefile.vmm +++ b/usr/src/uts/intel/io/vmm/Makefile.vmm @@ -53,6 +53,7 @@ VMM_OBJS = \ vmm_lapic.o \ vmm_stat.o \ vmm_util.o \ + vmm_cpuid.o \ x86.o \ iommu.o \ vatpic.o \ diff --git a/usr/src/uts/intel/io/vmm/amd/svm.c b/usr/src/uts/intel/io/vmm/amd/svm.c index a00b7d4c1d..5c2943f4f9 100644 --- a/usr/src/uts/intel/io/vmm/amd/svm.c +++ b/usr/src/uts/intel/io/vmm/amd/svm.c @@ -1468,8 +1468,9 @@ svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) break; case VMCB_EXIT_CPUID: vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_CPUID, 1); - handled = x86_emulate_cpuid(svm_sc->vm, vcpu, &state->rax, + vcpu_emulate_cpuid(svm_sc->vm, vcpu, &state->rax, &ctx->sctx_rbx, &ctx->sctx_rcx, &ctx->sctx_rdx); + handled = 1; break; case VMCB_EXIT_HLT: vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_HLT, 1); diff --git a/usr/src/uts/intel/io/vmm/intel/vmx.c b/usr/src/uts/intel/io/vmm/intel/vmx.c index 5d9d920ca6..3f67f0138f 100644 --- a/usr/src/uts/intel/io/vmm/intel/vmx.c +++ b/usr/src/uts/intel/io/vmm/intel/vmx.c @@ -870,17 +870,6 @@ vmx_vminit(struct vm *vm) return (vmx); } -static int -vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx) -{ - int handled; - - handled = x86_emulate_cpuid(vm, vcpu, (uint64_t *)&vmxctx->guest_rax, - (uint64_t *)&vmxctx->guest_rbx, (uint64_t *)&vmxctx->guest_rcx, - (uint64_t *)&vmxctx->guest_rdx); - return
(handled); -} - static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved"); static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done"); @@ -2357,7 +2346,12 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) case EXIT_REASON_CPUID: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1); SDT_PROBE3(vmm, vmx, exit, cpuid, vmx, vcpu, vmexit); - handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx); + vcpu_emulate_cpuid(vmx->vm, vcpu, + (uint64_t *)&vmxctx->guest_rax, + (uint64_t *)&vmxctx->guest_rbx, + (uint64_t *)&vmxctx->guest_rcx, + (uint64_t *)&vmxctx->guest_rdx); + handled = HANDLED; break; case EXIT_REASON_EXCEPTION: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1); diff --git a/usr/src/uts/intel/io/vmm/sys/vmm_kernel.h b/usr/src/uts/intel/io/vmm/sys/vmm_kernel.h index 1dba79a7bf..404942f438 100644 --- a/usr/src/uts/intel/io/vmm/sys/vmm_kernel.h +++ b/usr/src/uts/intel/io/vmm/sys/vmm_kernel.h @@ -395,8 +395,23 @@ enum vm_cpuid_capability { VCC_LAST }; -int x86_emulate_cpuid(struct vm *, int, uint64_t *, uint64_t *, uint64_t *, +/* Possible flags and entry count limit defined in sys/vmm.h */ +typedef struct vcpu_cpuid_config { + uint32_t vcc_flags; + uint32_t vcc_nent; + struct vcpu_cpuid_entry *vcc_entries; +} vcpu_cpuid_config_t; + +vcpu_cpuid_config_t *vm_cpuid_config(struct vm *, int); +int vm_get_cpuid(struct vm *, int, vcpu_cpuid_config_t *); +int vm_set_cpuid(struct vm *, int, const vcpu_cpuid_config_t *); +void vcpu_emulate_cpuid(struct vm *, int, uint64_t *, uint64_t *, uint64_t *, uint64_t *); +void legacy_emulate_cpuid(struct vm *, int, uint32_t *, uint32_t *, uint32_t *, + uint32_t *); +void vcpu_cpuid_init(vcpu_cpuid_config_t *); +void vcpu_cpuid_cleanup(vcpu_cpuid_config_t *); + bool vm_cpuid_capability(struct vm *, int, enum vm_cpuid_capability); bool validate_guest_xcr0(uint64_t, uint64_t); diff --git a/usr/src/uts/intel/io/vmm/vmm.c b/usr/src/uts/intel/io/vmm/vmm.c index e85d84d0b6..e28c235b4c 100644 --- a/usr/src/uts/intel/io/vmm/vmm.c +++ b/usr/src/uts/intel/io/vmm/vmm.c @@ -154,6 +154,7 @@ struct vcpu { vm_client_t *vmclient; /* (a) VM-system client */ uint64_t tsc_offset; /* (x) offset from host TSC */ struct vm_mtrr mtrr; /* (i) vcpu's MTRR */ + vcpu_cpuid_config_t cpuid_cfg; /* (x) cpuid configuration */ enum vcpu_ustate ustate; /* (i) microstate for the vcpu */ hrtime_t ustate_when; /* (i) time of last ustate change */ @@ -332,6 +333,8 @@ vcpu_cleanup(struct vm *vm, int i, bool destroy) if (destroy) { vmm_stat_free(vcpu->stats); + vcpu_cpuid_cleanup(&vcpu->cpuid_cfg); + hma_fpu_free(vcpu->guestfpu); vcpu->guestfpu = NULL; @@ -365,6 +368,7 @@ vcpu_init(struct vm *vm, int vcpu_id, bool create) vcpu->guestfpu = hma_fpu_alloc(KM_SLEEP); vcpu->stats = vmm_stat_alloc(); vcpu->vie_ctx = vie_alloc(); + vcpu_cpuid_init(&vcpu->cpuid_cfg); vcpu->ustate = VU_INIT; vcpu->ustate_when = gethrtime(); @@ -3043,6 +3047,15 @@ vm_set_capability(struct vm *vm, int vcpu, int type, int val) return (VMSETCAP(vm->cookie, vcpu, type, val)); } +vcpu_cpuid_config_t * +vm_cpuid_config(struct vm *vm, int vcpuid) +{ + ASSERT3S(vcpuid, >=, 0); + ASSERT3S(vcpuid, <, VM_MAXCPU); + + return (&vm->vcpu[vcpuid].cpuid_cfg); +} + struct vlapic * vm_lapic(struct vm *vm, int cpu) { diff --git a/usr/src/uts/intel/io/vmm/vmm_cpuid.c b/usr/src/uts/intel/io/vmm/vmm_cpuid.c new file mode 100644 index 0000000000..30ef28ec85 --- /dev/null +++ b/usr/src/uts/intel/io/vmm/vmm_cpuid.c @@ -0,0 +1,925 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + *
Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. + * Copyright 2022 Oxide Computer Company + */ + +#include <sys/types.h> +#include <sys/stdbool.h> +#include <sys/errno.h> + +#include <machine/md_var.h> +#include <machine/specialreg.h> + +#include <machine/vmm.h> +#include <sys/vmm_kernel.h> + +#include "vmm_host.h" +#include "vmm_util.h" + +/* + * CPUID Emulation + * + * All CPUID instruction exits are handled by the in-kernel emulation. + * + * ---------------- + * Legacy Emulation + * ---------------- + * + * Originally, the kernel vmm portion of bhyve relied on fixed logic to filter + * and/or generate CPUID results based on what was reported by the host CPU, as + * well as attributes of the VM (such as CPU topology, and enabled features). + * This is largely adequate to expose CPU capabilities to the guest in a manner + * which allows it to operate properly. + * + * ------------------------------ + * Userspace-Controlled Emulation + * ------------------------------ + * + * In certain situations, more control over the CPUID emulation results + * presented to the guest is desired. Live migration between physical hosts is + * one such example, where the underlying CPUs, or at least their microcode, may differ + * between the source and destination. In such cases, where changes to the + * CPUID results cannot be tolerated, the userspace portion of the VMM can be in + * complete control over the leaves which are presented to the guest. It may + * still consult the "legacy" CPUID data for guidance about which CPU features + * are safe to expose (due to hypervisor limitations, etc). This leaf + * information is configured on a per-vCPU basis.
+ * + * The emulation entries provided by userspace are expected to be in sorted + * order, running from lowest function and index to highest. + * + * For example: + * (func: 00h idx: 00h) -> + * (flags: 0, eax: highest std leaf, ebx-edx: vendor id) + * (func: 0Dh idx: 00h) -> + * (flags: VCE_FLAG_MATCH_INDEX, eax - edx: XCR0/XSAVE info) + * (func: 0Dh idx: 01h) -> + * (flags: VCE_FLAG_MATCH_INDEX, eax - edx: XSAVE/XSAVEOPT details) + * ... + * (func: 0Dh idx: 07H) -> + * (flags: VCE_FLAG_MATCH_INDEX, eax - edx: AVX-512 details) + * (func: 8000000h idx: 0h) -> + * (flags: 0, eax: highest extd leaf ...) + * ... + */ + + +#define CPUID_TYPE_MASK 0xf0000000 +#define CPUID_TYPE_STD 0x00000000 +#define CPUID_TYPE_EXTD 0x80000000 + +static const struct vcpu_cpuid_entry cpuid_empty_entry = { 0 }; + +/* + * Given the CPUID configuration for a vCPU, locate the entry which matches the + * provided function/index tuple. The entries list is walked in order, and the + * first valid match based on the function/index and flags will be emitted. + * + * If no match is found, but Intel-style fallback is configured, then the + * highest standard leaf encountered will be emitted. + */ +static const struct vcpu_cpuid_entry * +cpuid_find_entry(const vcpu_cpuid_config_t *cfg, uint32_t func, uint32_t idx) +{ + const struct vcpu_cpuid_entry *last_std = NULL; + const bool intel_fallback = + (cfg->vcc_flags & VCC_FLAG_INTEL_FALLBACK) != 0; + bool matched_leaf = false; + + ASSERT0(cfg->vcc_flags & VCC_FLAG_LEGACY_HANDLING); + + for (uint_t i = 0; i < cfg->vcc_nent; i++) { + const struct vcpu_cpuid_entry *ent = &cfg->vcc_entries[i]; + const bool ent_is_std = + (ent->vce_function & CPUID_TYPE_MASK) == CPUID_TYPE_STD; + const bool ent_must_match_idx = + (ent->vce_flags & VCE_FLAG_MATCH_INDEX) != 0; + + if (ent_is_std) { + /* + * Keep track of the last "standard" leaf for + * Intel-style fallback behavior. + * + * This does currently not account for the sub-leaf + * index matching behavior for fallback described in the + * SDM. It is not clear if any consumers rely on such + * matching when encountering fallback. + */ + last_std = ent; + } + if (ent->vce_function == func) { + if (ent->vce_index == idx || !ent_must_match_idx) { + return (ent); + } + /* + * Make note of when the top-level leaf matches, even + * when the index does not. + */ + matched_leaf = true; + } else if (ent->vce_function > func) { + if ((ent->vce_function & CPUID_TYPE_MASK) == + (func & CPUID_TYPE_MASK)) { + /* + * We are beyond a valid leaf to match, but have + * not exceeded the maximum leaf for this "type" + * (standard, extended, hvm, etc), so return an + * empty entry. + */ + return (&cpuid_empty_entry); + } else { + /* + * Otherwise, we can stop now, having gone + * beyond the last entry which could match the + * target function in a sorted list. 
+ */ + break; + } + } + } + + if (matched_leaf || !intel_fallback) { + return (&cpuid_empty_entry); + } else { + return (last_std); + } +} + +void +vcpu_emulate_cpuid(struct vm *vm, int vcpuid, uint64_t *rax, uint64_t *rbx, + uint64_t *rcx, uint64_t *rdx) +{ + const vcpu_cpuid_config_t *cfg = vm_cpuid_config(vm, vcpuid); + + ASSERT3P(rax, !=, NULL); + ASSERT3P(rbx, !=, NULL); + ASSERT3P(rcx, !=, NULL); + ASSERT3P(rdx, !=, NULL); + + /* Fall back to legacy handling if specified */ + if ((cfg->vcc_flags & VCC_FLAG_LEGACY_HANDLING) != 0) { + uint32_t regs[4] = { *rax, 0, *rcx, 0 }; + + legacy_emulate_cpuid(vm, vcpuid, &regs[0], &regs[1], &regs[2], + &regs[3]); + /* CPUID clears the upper 32-bits of the long-mode registers. */ + *rax = regs[0]; + *rbx = regs[1]; + *rcx = regs[2]; + *rdx = regs[3]; + return; + } + + const struct vcpu_cpuid_entry *ent = cpuid_find_entry(cfg, *rax, *rcx); + ASSERT(ent != NULL); + /* CPUID clears the upper 32-bits of the long-mode registers. */ + *rax = ent->vce_eax; + *rbx = ent->vce_ebx; + *rcx = ent->vce_ecx; + *rdx = ent->vce_edx; +} + +/* + * Get the current CPUID emulation configuration for this vCPU. + * + * Only the existing flags will be emitted if the vCPU is configured for legacy + * operation via the VCC_FLAG_LEGACY_HANDLING flag. If in userspace-controlled + * mode, then we will attempt to copy the existing entries into vcc_entries, + * its size specified by vcc_nent. + * + * Regardless of whether vcc_entries is adequately sized (or even present), + * vcc_nent will be set to the number of existing entries. + */ +int +vm_get_cpuid(struct vm *vm, int vcpuid, vcpu_cpuid_config_t *res) +{ + if (vcpuid < 0 || vcpuid > VM_MAXCPU) { + return (EINVAL); + } + + const vcpu_cpuid_config_t *src = vm_cpuid_config(vm, vcpuid); + if (src->vcc_nent > res->vcc_nent) { + res->vcc_nent = src->vcc_nent; + return (E2BIG); + } else if (src->vcc_nent != 0) { + bcopy(src->vcc_entries, res->vcc_entries, + src->vcc_nent * sizeof (struct vcpu_cpuid_entry)); + } + res->vcc_flags = src->vcc_flags; + res->vcc_nent = src->vcc_nent; + return (0); +} + +/* + * Set the CPUID emulation configuration for this vCPU. + * + * If VCC_FLAG_LEGACY_HANDLING is set in vcc_flags, then vcc_nent is expected to + * be set to 0, as configuring a list of entries would be useless when using the + * legacy handling. + * + * Any existing entries which are configured are freed, and the newly provided + * ones will be copied into their place.
+ */ +int +vm_set_cpuid(struct vm *vm, int vcpuid, const vcpu_cpuid_config_t *src) +{ + if (vcpuid < 0 || vcpuid > VM_MAXCPU) { + return (EINVAL); + } + if (src->vcc_nent > VMM_MAX_CPUID_ENTRIES) { + return (EINVAL); + } + if ((src->vcc_flags & ~VCC_FLAGS_VALID) != 0) { + return (EINVAL); + } + if ((src->vcc_flags & VCC_FLAG_LEGACY_HANDLING) != 0 && + src->vcc_nent != 0) { + /* No entries should be provided if using legacy handling */ + return (EINVAL); + } + for (uint_t i = 0; i < src->vcc_nent; i++) { + /* Ensure all entries carry valid flags */ + if ((src->vcc_entries[i].vce_flags & ~VCE_FLAGS_VALID) != 0) { + return (EINVAL); + } + } + + vcpu_cpuid_config_t *cfg = vm_cpuid_config(vm, vcpuid); + + /* Free any existing entries first */ + vcpu_cpuid_cleanup(cfg); + + /* Copy supplied entries into freshly allocated space */ + if (src->vcc_nent != 0) { + const size_t entries_sz = + src->vcc_nent * sizeof (struct vcpu_cpuid_entry); + + cfg->vcc_nent = src->vcc_nent; + cfg->vcc_entries = kmem_alloc(entries_sz, KM_SLEEP); + bcopy(src->vcc_entries, cfg->vcc_entries, entries_sz); + } + cfg->vcc_flags = src->vcc_flags; + + return (0); +} + +void +vcpu_cpuid_init(vcpu_cpuid_config_t *cfg) +{ + /* Default to legacy-style handling */ + cfg->vcc_flags = VCC_FLAG_LEGACY_HANDLING; + cfg->vcc_nent = 0; + cfg->vcc_entries = NULL; +} + +void +vcpu_cpuid_cleanup(vcpu_cpuid_config_t *cfg) +{ + if (cfg->vcc_nent != 0) { + ASSERT3P(cfg->vcc_entries, !=, NULL); + + kmem_free(cfg->vcc_entries, + cfg->vcc_nent * sizeof (struct vcpu_cpuid_entry)); + + cfg->vcc_nent = 0; + cfg->vcc_entries = NULL; + } +} + +static const char bhyve_id[12] = "bhyve bhyve "; + +/* + * Force exposition of the invariant TSC capability, regardless of whether the + * host CPU reports having it. + */ +static int vmm_force_invariant_tsc = 0; + +#define CPUID_0000_0000 (0x0) +#define CPUID_0000_0001 (0x1) +#define CPUID_0000_0002 (0x2) +#define CPUID_0000_0003 (0x3) +#define CPUID_0000_0004 (0x4) +#define CPUID_0000_0006 (0x6) +#define CPUID_0000_0007 (0x7) +#define CPUID_0000_000A (0xA) +#define CPUID_0000_000B (0xB) +#define CPUID_0000_000D (0xD) +#define CPUID_0000_000F (0xF) +#define CPUID_0000_0010 (0x10) +#define CPUID_0000_0015 (0x15) +#define CPUID_8000_0000 (0x80000000) +#define CPUID_8000_0001 (0x80000001) +#define CPUID_8000_0002 (0x80000002) +#define CPUID_8000_0003 (0x80000003) +#define CPUID_8000_0004 (0x80000004) +#define CPUID_8000_0006 (0x80000006) +#define CPUID_8000_0007 (0x80000007) +#define CPUID_8000_0008 (0x80000008) +#define CPUID_8000_001D (0x8000001D) +#define CPUID_8000_001E (0x8000001E) + +#define CPUID_VM_HIGH 0x40000000 + +/* + * CPUID instruction Fn0000_0001: + */ +#define CPUID_0000_0001_APICID_SHIFT 24 + + +/* + * Round up to the next power of two, if necessary, and then take log2. + * Returns -1 if argument is zero. + */ +static __inline int +log2(uint_t x) +{ + return (fls(x << (1 - powerof2(x))) - 1); +} + +/* + * The "legacy" bhyve cpuid emulation, which largely applies statically defined + * masks to the data provided by the host CPU.
+ */ +void +legacy_emulate_cpuid(struct vm *vm, int vcpu_id, uint32_t *eax, uint32_t *ebx, + uint32_t *ecx, uint32_t *edx) +{ + const struct xsave_limits *limits; + uint64_t cr4; + int error, enable_invpcid, level, width = 0, x2apic_id = 0; + unsigned int func, regs[4], logical_cpus = 0, param; + enum x2apic_state x2apic_state; + uint16_t cores, maxcpus, sockets, threads; + + /* + * The function of CPUID is controlled through the provided value of + * %eax (and secondarily %ecx, for certain leaf data). + */ + func = (uint32_t)*eax; + param = (uint32_t)*ecx; + + /* + * Requests for invalid CPUID levels should map to the highest + * available level instead. + */ + if (cpu_exthigh != 0 && func >= 0x80000000) { + if (func > cpu_exthigh) + func = cpu_exthigh; + } else if (func >= 0x40000000) { + if (func > CPUID_VM_HIGH) + func = CPUID_VM_HIGH; + } else if (func > cpu_high) { + func = cpu_high; + } + + /* + * In general the approach used for CPU topology is to + * advertise a flat topology where all CPUs are packages with + * no multi-core or SMT. + */ + switch (func) { + /* + * Pass these through to the guest + */ + case CPUID_0000_0000: + case CPUID_0000_0002: + case CPUID_0000_0003: + case CPUID_8000_0000: + case CPUID_8000_0002: + case CPUID_8000_0003: + case CPUID_8000_0004: + case CPUID_8000_0006: + cpuid_count(func, param, regs); + break; + case CPUID_8000_0008: + cpuid_count(func, param, regs); + if (vmm_is_svm()) { + /* + * As on Intel (0000_0007:0, EDX), mask out + * unsupported or unsafe AMD extended features + * (8000_0008 EBX). + */ + regs[1] &= (AMDFEID_CLZERO | AMDFEID_IRPERF | + AMDFEID_XSAVEERPTR); + + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + /* + * Here, width is ApicIdCoreIdSize, present on + * at least Family 15h and newer. It + * represents the "number of bits in the + * initial apicid that indicate thread id + * within a package." + * + * Our topo_probe_amd() uses it for + * pkg_id_shift and other OSes may rely on it. + */ + width = MIN(0xF, log2(threads * cores)); + if (width < 0x4) + width = 0; + logical_cpus = MIN(0xFF, threads * cores - 1); + regs[2] = (width << AMDID_COREID_SIZE_SHIFT) | + logical_cpus; + } + break; + + case CPUID_8000_0001: + cpuid_count(func, param, regs); + + /* + * Hide SVM from guest. + */ + regs[2] &= ~AMDID2_SVM; + + /* + * Don't advertise extended performance counter MSRs + * to the guest. + */ + regs[2] &= ~AMDID2_PCXC; + regs[2] &= ~AMDID2_PNXC; + regs[2] &= ~AMDID2_PTSCEL2I; + + /* + * Don't advertise Instruction Based Sampling feature. + */ + regs[2] &= ~AMDID2_IBS; + + /* NodeID MSR not available */ + regs[2] &= ~AMDID2_NODE_ID; + + /* Don't advertise the OS visible workaround feature */ + regs[2] &= ~AMDID2_OSVW; + + /* Hide mwaitx/monitorx capability from the guest */ + regs[2] &= ~AMDID2_MWAITX; + +#ifndef __FreeBSD__ + /* + * Detection routines for TCE and FFXSR are missing + * from our vm_cpuid_capability() detection logic + * today. Mask them out until that is remedied. + * They do not appear to be in common usage, so their + * absence should not cause undue trouble. + */ + regs[2] &= ~AMDID2_TCE; + regs[3] &= ~AMDID_FFXSR; +#endif + + /* + * Hide rdtscp/ia32_tsc_aux until we know how + * to deal with them. + */ + regs[3] &= ~AMDID_RDTSCP; + break; + + case CPUID_8000_0007: + cpuid_count(func, param, regs); + /* + * AMD uses this leaf to advertise the processor's + * power monitoring and RAS capabilities. These + * features are hardware-specific and exposing + * them to a guest doesn't make a lot of sense. 
+ * + * Intel uses this leaf only to advertise the + * "Invariant TSC" feature with all other bits + * being reserved (set to zero). + */ + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + + /* + * If the host system possesses an invariant TSC, then + * it is safe to expose to the guest. + * + * If there is measured skew between host TSCs, it will + * be properly offset so guests do not observe any + * change between CPU migrations. + */ + regs[3] &= AMDPM_TSC_INVARIANT; + + /* + * Since illumos avoids deep C-states on CPUs which do + * not support an invariant TSC, it may be safe (and + * desired) to unconditionally expose that capability to + * the guest. + */ + if (vmm_force_invariant_tsc != 0) { + regs[3] |= AMDPM_TSC_INVARIANT; + } + break; + + case CPUID_8000_001D: + /* AMD Cache topology, like 0000_0004 for Intel. */ + if (!vmm_is_svm()) + goto default_leaf; + + /* + * Similar to Intel, generate a fictitious cache + * topology for the guest with L3 shared by the + * package, and L1 and L2 local to a core. + */ + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + switch (param) { + case 0: + logical_cpus = threads; + level = 1; + func = 1; /* data cache */ + break; + case 1: + logical_cpus = threads; + level = 2; + func = 3; /* unified cache */ + break; + case 2: + logical_cpus = threads * cores; + level = 3; + func = 3; /* unified cache */ + break; + default: + logical_cpus = 0; + level = 0; + func = 0; + break; + } + + logical_cpus = MIN(0xfff, logical_cpus - 1); + regs[0] = (logical_cpus << 14) | (1 << 8) | + (level << 5) | func; + regs[1] = (func > 0) ? (CACHE_LINE_SIZE - 1) : 0; + regs[2] = 0; + regs[3] = 0; + break; + + case CPUID_8000_001E: + /* + * AMD Family 16h+ and Hygon Family 18h additional + * identifiers. + */ + if (!vmm_is_svm() || CPUID_TO_FAMILY(cpu_id) < 0x16) + goto default_leaf; + + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + regs[0] = vcpu_id; + threads = MIN(0xFF, threads - 1); + regs[1] = (threads << 8) | + (vcpu_id >> log2(threads + 1)); + /* + * XXX Bhyve topology cannot yet represent >1 node per + * processor. + */ + regs[2] = 0; + regs[3] = 0; + break; + + case CPUID_0000_0001: + do_cpuid(1, regs); + + error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state); + VERIFY0(error); + + /* + * Override the APIC ID only in ebx + */ + regs[1] &= ~(CPUID_LOCAL_APIC_ID); + regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT); + + /* + * Don't expose VMX, SpeedStep, TME or SMX capability. + * Advertise x2APIC capability and Hypervisor guest. + */ + regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2); + regs[2] &= ~(CPUID2_SMX); + + regs[2] |= CPUID2_HV; + + if (x2apic_state != X2APIC_DISABLED) + regs[2] |= CPUID2_X2APIC; + else + regs[2] &= ~CPUID2_X2APIC; + + /* + * Only advertise CPUID2_XSAVE in the guest if + * the host is using XSAVE. + */ + if (!(regs[2] & CPUID2_OSXSAVE)) + regs[2] &= ~CPUID2_XSAVE; + + /* + * If CPUID2_XSAVE is being advertised and the + * guest has set CR4_XSAVE, set + * CPUID2_OSXSAVE. + */ + regs[2] &= ~CPUID2_OSXSAVE; + if (regs[2] & CPUID2_XSAVE) { + error = vm_get_register(vm, vcpu_id, + VM_REG_GUEST_CR4, &cr4); + VERIFY0(error); + if (cr4 & CR4_XSAVE) + regs[2] |= CPUID2_OSXSAVE; + } + + /* + * Hide monitor/mwait until we know how to deal with + * these instructions. + */ + regs[2] &= ~CPUID2_MON; + + /* + * Hide the performance and debug features.
+ */ + regs[2] &= ~CPUID2_PDCM; + + /* + * No TSC deadline support in the APIC yet + */ + regs[2] &= ~CPUID2_TSCDLT; + + /* + * Hide thermal monitoring + */ + regs[3] &= ~(CPUID_ACPI | CPUID_TM); + + /* + * Hide the debug store capability. + */ + regs[3] &= ~CPUID_DS; + + /* + * Advertise the Machine Check and MTRR capability. + * + * Some guest OSes (e.g. Windows) will not boot if + * these features are absent. + */ + regs[3] |= (CPUID_MCA | CPUID_MCE | CPUID_MTRR); + + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + logical_cpus = threads * cores; + regs[1] &= ~CPUID_HTT_CORES; + regs[1] |= (logical_cpus & 0xff) << 16; + regs[3] |= CPUID_HTT; + break; + + case CPUID_0000_0004: + cpuid_count(func, param, regs); + + if (regs[0] || regs[1] || regs[2] || regs[3]) { + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + regs[0] &= 0x3ff; + regs[0] |= (cores - 1) << 26; + /* + * Cache topology: + * - L1 and L2 are shared only by the logical + * processors in a single core. + * - L3 and above are shared by all logical + * processors in the package. + */ + logical_cpus = threads; + level = (regs[0] >> 5) & 0x7; + if (level >= 3) + logical_cpus *= cores; + regs[0] |= (logical_cpus - 1) << 14; + } + break; + + case CPUID_0000_0007: + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + + /* leaf 0 */ + if (param == 0) { + cpuid_count(func, param, regs); + + /* Only leaf 0 is supported */ + regs[0] = 0; + + /* + * Expose known-safe features. + */ + regs[1] &= (CPUID_STDEXT_FSGSBASE | + CPUID_STDEXT_BMI1 | CPUID_STDEXT_HLE | + CPUID_STDEXT_AVX2 | CPUID_STDEXT_SMEP | + CPUID_STDEXT_BMI2 | + CPUID_STDEXT_ERMS | CPUID_STDEXT_RTM | + CPUID_STDEXT_AVX512F | + CPUID_STDEXT_RDSEED | + CPUID_STDEXT_SMAP | + CPUID_STDEXT_AVX512PF | + CPUID_STDEXT_AVX512ER | + CPUID_STDEXT_AVX512CD | CPUID_STDEXT_SHA); + regs[2] = 0; + regs[3] &= CPUID_STDEXT3_MD_CLEAR; + + /* Advertise INVPCID if it is enabled. */ + error = vm_get_capability(vm, vcpu_id, + VM_CAP_ENABLE_INVPCID, &enable_invpcid); + if (error == 0 && enable_invpcid) + regs[1] |= CPUID_STDEXT_INVPCID; + } + break; + + case CPUID_0000_0006: + regs[0] = CPUTPM1_ARAT; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + + case CPUID_0000_000A: + /* + * Handle the access, but report 0 for + * all options + */ + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + + case CPUID_0000_000B: + /* + * Intel processor topology enumeration + */ + if (vmm_is_intel()) { + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + if (param == 0) { + logical_cpus = threads; + width = log2(logical_cpus); + level = CPUID_TYPE_SMT; + x2apic_id = vcpu_id; + } + + if (param == 1) { + logical_cpus = threads * cores; + width = log2(logical_cpus); + level = CPUID_TYPE_CORE; + x2apic_id = vcpu_id; + } + + if (param >= 2) { + width = 0; + logical_cpus = 0; + level = 0; + x2apic_id = 0; + } + + regs[0] = width & 0x1f; + regs[1] = logical_cpus & 0xffff; + regs[2] = (level << 8) | (param & 0xff); + regs[3] = x2apic_id; + } else { + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + } + break; + + case CPUID_0000_000D: + limits = vmm_get_xsave_limits(); + if (!limits->xsave_enabled) { + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + } + + cpuid_count(func, param, regs); + switch (param) { + case 0: + /* + * Only permit the guest to use bits + * that are active in the host in + * %xcr0. Also, claim that the + * maximum save area size is + * equivalent to the host's current + * save area size. 
Since this runs + * "inside" of vmrun(), it runs with + * the guest's xcr0, so the current + * save area size is correct as-is. + */ + regs[0] &= limits->xcr0_allowed; + regs[2] = limits->xsave_max_size; + regs[3] &= (limits->xcr0_allowed >> 32); + break; + case 1: + /* Only permit XSAVEOPT. */ + regs[0] &= CPUID_EXTSTATE_XSAVEOPT; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + default: + /* + * If the leaf is for a permitted feature, + * pass through as-is, otherwise return + * all zeroes. + */ + if (!(limits->xcr0_allowed & (1ul << param))) { + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + } + break; + } + break; + + case CPUID_0000_000F: + case CPUID_0000_0010: + /* + * Do not report any Resource Director Technology + * capabilities. Exposing control of cache or memory + * controller resource partitioning to the guest is not + * at all sensible. + * + * This is already hidden at a high level by masking of + * leaf 0x7. Even still, a guest may look here for + * detailed capability information. + */ + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + + case CPUID_0000_0015: + /* + * Don't report CPU TSC/Crystal ratio and clock + * values since guests may use these to derive the + * local APIC frequency. + */ + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + + case 0x40000000: + regs[0] = CPUID_VM_HIGH; + bcopy(bhyve_id, &regs[1], 4); + bcopy(bhyve_id + 4, &regs[2], 4); + bcopy(bhyve_id + 8, &regs[3], 4); + break; + + default: +default_leaf: + /* + * The leaf value has already been clamped so + * simply pass this through. + */ + cpuid_count(func, param, regs); + break; + } + + *eax = regs[0]; + *ebx = regs[1]; + *ecx = regs[2]; + *edx = regs[3]; +} diff --git a/usr/src/uts/intel/io/vmm/vmm_sol_dev.c b/usr/src/uts/intel/io/vmm/vmm_sol_dev.c index e1882d7a73..72e8e6e94a 100644 --- a/usr/src/uts/intel/io/vmm/vmm_sol_dev.c +++ b/usr/src/uts/intel/io/vmm/vmm_sol_dev.c @@ -448,6 +448,9 @@ vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, case VM_SET_RUN_STATE: case VM_GET_FPU: case VM_SET_FPU: + case VM_GET_CPUID: + case VM_SET_CPUID: + case VM_LEGACY_CPUID: /* * Copy in the ID of the vCPU chosen for this operation. * Since a nefarious caller could update their struct between @@ -1197,6 +1200,117 @@ vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, kmem_free(kbuf, req.len); break; } + case VM_GET_CPUID: { + struct vm_vcpu_cpuid_config cfg; + struct vcpu_cpuid_entry *entries = NULL; + + if (ddi_copyin(datap, &cfg, sizeof (cfg), md)) { + error = EFAULT; + break; + } + if (cfg.vvcc_nent > VMM_MAX_CPUID_ENTRIES) { + error = EINVAL; + break; + } + + const size_t entries_size = + cfg.vvcc_nent * sizeof (struct vcpu_cpuid_entry); + if (entries_size != 0) { + entries = kmem_zalloc(entries_size, KM_SLEEP); + } + + vcpu_cpuid_config_t vm_cfg = { + .vcc_nent = cfg.vvcc_nent, + .vcc_entries = entries, + }; + error = vm_get_cpuid(sc->vmm_vm, vcpu, &vm_cfg); + + /* + * Only attempt to copy out the resultant entries if we were + * able to query them from the instance. The flags and number + * of entries are emitted regardless.
+ */ + cfg.vvcc_flags = vm_cfg.vcc_flags; + cfg.vvcc_nent = vm_cfg.vcc_nent; + if (entries != NULL) { + if (error == 0 && ddi_copyout(entries, cfg.vvcc_entries, + entries_size, md) != 0) { + error = EFAULT; + } + + kmem_free(entries, entries_size); + } + + if (ddi_copyout(&cfg, datap, sizeof (cfg), md) != 0) { + error = EFAULT; + } + break; + } + case VM_SET_CPUID: { + struct vm_vcpu_cpuid_config cfg; + struct vcpu_cpuid_entry *entries = NULL; + size_t entries_size = 0; + + if (ddi_copyin(datap, &cfg, sizeof (cfg), md)) { + error = EFAULT; + break; + } + if (cfg.vvcc_nent > VMM_MAX_CPUID_ENTRIES) { + error = EFBIG; + break; + } + if ((cfg.vvcc_flags & VCC_FLAG_LEGACY_HANDLING) != 0) { + /* + * If we are being instructed to use "legacy" handling, + * then no entries should be provided, since the static + * in-kernel masking will be used. + */ + if (cfg.vvcc_nent != 0) { + error = EINVAL; + break; + } + } else if (cfg.vvcc_nent != 0) { + entries_size = + cfg.vvcc_nent * sizeof (struct vcpu_cpuid_entry); + entries = kmem_alloc(entries_size, KM_SLEEP); + + if (ddi_copyin(cfg.vvcc_entries, entries, entries_size, + md) != 0) { + error = EFAULT; + kmem_free(entries, entries_size); + break; + } + } + + vcpu_cpuid_config_t vm_cfg = { + .vcc_flags = cfg.vvcc_flags, + .vcc_nent = cfg.vvcc_nent, + .vcc_entries = entries, + }; + error = vm_set_cpuid(sc->vmm_vm, vcpu, &vm_cfg); + + if (entries != NULL) { + kmem_free(entries, entries_size); + } + break; + } + case VM_LEGACY_CPUID: { + struct vm_legacy_cpuid vlc; + if (ddi_copyin(datap, &vlc, sizeof (vlc), md)) { + error = EFAULT; + break; + } + vlc.vlc_vcpuid = vcpu; + + legacy_emulate_cpuid(sc->vmm_vm, vcpu, &vlc.vlc_eax, + &vlc.vlc_ebx, &vlc.vlc_ecx, &vlc.vlc_edx); + + if (ddi_copyout(&vlc, datap, sizeof (vlc), md)) { + error = EFAULT; + break; + } + break; + } case VM_SET_KERNEMU_DEV: case VM_GET_KERNEMU_DEV: { diff --git a/usr/src/uts/intel/io/vmm/x86.c b/usr/src/uts/intel/io/vmm/x86.c index e593e0c04e..c4a4d43028 100644 --- a/usr/src/uts/intel/io/vmm/x86.c +++ b/usr/src/uts/intel/io/vmm/x86.c @@ -63,628 +63,6 @@ __FBSDID("$FreeBSD$"); #include "vmm_host.h" #include "vmm_util.h" -SYSCTL_DECL(_hw_vmm); - -#define CPUID_VM_HIGH 0x40000000 - -static const char bhyve_id[12] = "bhyve bhyve "; - -/* Number of times an unknown cpuid leaf was accessed */ -static uint64_t bhyve_xcpuids; - -static int cpuid_leaf_b = 1; - -/* - * Force exposition of the invariant TSC capability, regardless of whether the - * host CPU reports having it. 
- */ -static int vmm_force_invariant_tsc = 0; - -#define CPUID_0000_0000 (0x0) -#define CPUID_0000_0001 (0x1) -#define CPUID_0000_0002 (0x2) -#define CPUID_0000_0003 (0x3) -#define CPUID_0000_0004 (0x4) -#define CPUID_0000_0006 (0x6) -#define CPUID_0000_0007 (0x7) -#define CPUID_0000_000A (0xA) -#define CPUID_0000_000B (0xB) -#define CPUID_0000_000D (0xD) -#define CPUID_0000_000F (0xF) -#define CPUID_0000_0010 (0x10) -#define CPUID_0000_0015 (0x15) -#define CPUID_8000_0000 (0x80000000) -#define CPUID_8000_0001 (0x80000001) -#define CPUID_8000_0002 (0x80000002) -#define CPUID_8000_0003 (0x80000003) -#define CPUID_8000_0004 (0x80000004) -#define CPUID_8000_0006 (0x80000006) -#define CPUID_8000_0007 (0x80000007) -#define CPUID_8000_0008 (0x80000008) -#define CPUID_8000_001D (0x8000001D) -#define CPUID_8000_001E (0x8000001E) - -/* - * CPUID instruction Fn0000_0001: - */ -#define CPUID_0000_0001_APICID_MASK (0xff<<24) -#define CPUID_0000_0001_APICID_SHIFT 24 - -/* - * CPUID instruction Fn0000_0001 ECX - */ -#define CPUID_0000_0001_FEAT0_VMX (1<<5) - - -/* - * Round up to the next power of two, if necessary, and then take log2. - * Returns -1 if argument is zero. - */ -static __inline int -log2(uint_t x) -{ - - return (fls(x << (1 - powerof2(x))) - 1); -} - -int -x86_emulate_cpuid(struct vm *vm, int vcpu_id, uint64_t *rax, uint64_t *rbx, - uint64_t *rcx, uint64_t *rdx) -{ - const struct xsave_limits *limits; - uint64_t cr4; - int error, enable_invpcid, level, width = 0, x2apic_id = 0; - unsigned int func, regs[4], logical_cpus = 0, param; - enum x2apic_state x2apic_state; - uint16_t cores, maxcpus, sockets, threads; - - /* - * The function of CPUID is controlled through the provided value of - * %eax (and secondarily %ecx, for certain leaf data). - */ - func = (uint32_t)*rax; - param = (uint32_t)*rcx; - - /* - * Requests for invalid CPUID levels should map to the highest - * available level instead. - */ - if (cpu_exthigh != 0 && func >= 0x80000000) { - if (func > cpu_exthigh) - func = cpu_exthigh; - } else if (func >= 0x40000000) { - if (func > CPUID_VM_HIGH) - func = CPUID_VM_HIGH; - } else if (func > cpu_high) { - func = cpu_high; - } - - /* - * In general the approach used for CPU topology is to - * advertise a flat topology where all CPUs are packages with - * no multi-core or SMT. - */ - switch (func) { - /* - * Pass these through to the guest - */ - case CPUID_0000_0000: - case CPUID_0000_0002: - case CPUID_0000_0003: - case CPUID_8000_0000: - case CPUID_8000_0002: - case CPUID_8000_0003: - case CPUID_8000_0004: - case CPUID_8000_0006: - cpuid_count(func, param, regs); - break; - case CPUID_8000_0008: - cpuid_count(func, param, regs); - if (vmm_is_svm()) { - /* - * As on Intel (0000_0007:0, EDX), mask out - * unsupported or unsafe AMD extended features - * (8000_0008 EBX). - */ - regs[1] &= (AMDFEID_CLZERO | AMDFEID_IRPERF | - AMDFEID_XSAVEERPTR); - - vm_get_topology(vm, &sockets, &cores, &threads, - &maxcpus); - /* - * Here, width is ApicIdCoreIdSize, present on - * at least Family 15h and newer. It - * represents the "number of bits in the - * initial apicid that indicate thread id - * within a package." - * - * Our topo_probe_amd() uses it for - * pkg_id_shift and other OSes may rely on it. - */ - width = MIN(0xF, log2(threads * cores)); - if (width < 0x4) - width = 0; - logical_cpus = MIN(0xFF, threads * cores - 1); - regs[2] = (width << AMDID_COREID_SIZE_SHIFT) | - logical_cpus; - } - break; - - case CPUID_8000_0001: - cpuid_count(func, param, regs); - - /* - * Hide SVM from guest. 
- */ - regs[2] &= ~AMDID2_SVM; - - /* - * Don't advertise extended performance counter MSRs - * to the guest. - */ - regs[2] &= ~AMDID2_PCXC; - regs[2] &= ~AMDID2_PNXC; - regs[2] &= ~AMDID2_PTSCEL2I; - - /* - * Don't advertise Instruction Based Sampling feature. - */ - regs[2] &= ~AMDID2_IBS; - - /* NodeID MSR not available */ - regs[2] &= ~AMDID2_NODE_ID; - - /* Don't advertise the OS visible workaround feature */ - regs[2] &= ~AMDID2_OSVW; - - /* Hide mwaitx/monitorx capability from the guest */ - regs[2] &= ~AMDID2_MWAITX; - -#ifndef __FreeBSD__ - /* - * Detection routines for TCE and FFXSR are missing - * from our vm_cpuid_capability() detection logic - * today. Mask them out until that is remedied. - * They do not appear to be in common usage, so their - * absence should not cause undue trouble. - */ - regs[2] &= ~AMDID2_TCE; - regs[3] &= ~AMDID_FFXSR; -#endif - - /* - * Hide rdtscp/ia32_tsc_aux until we know how - * to deal with them. - */ - regs[3] &= ~AMDID_RDTSCP; - break; - - case CPUID_8000_0007: - cpuid_count(func, param, regs); - /* - * AMD uses this leaf to advertise the processor's - * power monitoring and RAS capabilities. These - * features are hardware-specific and exposing - * them to a guest doesn't make a lot of sense. - * - * Intel uses this leaf only to advertise the - * "Invariant TSC" feature with all other bits - * being reserved (set to zero). - */ - regs[0] = 0; - regs[1] = 0; - regs[2] = 0; - - /* - * If the host system possesses an invariant TSC, then - * it is safe to expose to the guest. - * - * If there is measured skew between host TSCs, it will - * be properly offset so guests do not observe any - * change between CPU migrations. - */ - regs[3] &= AMDPM_TSC_INVARIANT; - - /* - * Since illumos avoids deep C-states on CPUs which do - * not support an invariant TSC, it may be safe (and - * desired) to unconditionally expose that capability to - * the guest. - */ - if (vmm_force_invariant_tsc != 0) { - regs[3] |= AMDPM_TSC_INVARIANT; - } - break; - - case CPUID_8000_001D: - /* AMD Cache topology, like 0000_0004 for Intel. */ - if (!vmm_is_svm()) - goto default_leaf; - - /* - * Similar to Intel, generate a ficticious cache - * topology for the guest with L3 shared by the - * package, and L1 and L2 local to a core. - */ - vm_get_topology(vm, &sockets, &cores, &threads, - &maxcpus); - switch (param) { - case 0: - logical_cpus = threads; - level = 1; - func = 1; /* data cache */ - break; - case 1: - logical_cpus = threads; - level = 2; - func = 3; /* unified cache */ - break; - case 2: - logical_cpus = threads * cores; - level = 3; - func = 3; /* unified cache */ - break; - default: - logical_cpus = 0; - level = 0; - func = 0; - break; - } - - logical_cpus = MIN(0xfff, logical_cpus - 1); - regs[0] = (logical_cpus << 14) | (1 << 8) | - (level << 5) | func; - regs[1] = (func > 0) ? (CACHE_LINE_SIZE - 1) : 0; - regs[2] = 0; - regs[3] = 0; - break; - - case CPUID_8000_001E: - /* - * AMD Family 16h+ and Hygon Family 18h additional - * identifiers. - */ - if (!vmm_is_svm() || CPUID_TO_FAMILY(cpu_id) < 0x16) - goto default_leaf; - - vm_get_topology(vm, &sockets, &cores, &threads, - &maxcpus); - regs[0] = vcpu_id; - threads = MIN(0xFF, threads - 1); - regs[1] = (threads << 8) | - (vcpu_id >> log2(threads + 1)); - /* - * XXX Bhyve topology cannot yet represent >1 node per - * processor. 
- */ - regs[2] = 0; - regs[3] = 0; - break; - - case CPUID_0000_0001: - do_cpuid(1, regs); - - error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state); - if (error) { - panic("x86_emulate_cpuid: error %d " - "fetching x2apic state", error); - } - - /* - * Override the APIC ID only in ebx - */ - regs[1] &= ~(CPUID_LOCAL_APIC_ID); - regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT); - - /* - * Don't expose VMX, SpeedStep, TME or SMX capability. - * Advertise x2APIC capability and Hypervisor guest. - */ - regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2); - regs[2] &= ~(CPUID2_SMX); - - regs[2] |= CPUID2_HV; - - if (x2apic_state != X2APIC_DISABLED) - regs[2] |= CPUID2_X2APIC; - else - regs[2] &= ~CPUID2_X2APIC; - - /* - * Only advertise CPUID2_XSAVE in the guest if - * the host is using XSAVE. - */ - if (!(regs[2] & CPUID2_OSXSAVE)) - regs[2] &= ~CPUID2_XSAVE; - - /* - * If CPUID2_XSAVE is being advertised and the - * guest has set CR4_XSAVE, set - * CPUID2_OSXSAVE. - */ - regs[2] &= ~CPUID2_OSXSAVE; - if (regs[2] & CPUID2_XSAVE) { - error = vm_get_register(vm, vcpu_id, - VM_REG_GUEST_CR4, &cr4); - if (error) - panic("x86_emulate_cpuid: error %d " - "fetching %%cr4", error); - if (cr4 & CR4_XSAVE) - regs[2] |= CPUID2_OSXSAVE; - } - - /* - * Hide monitor/mwait until we know how to deal with - * these instructions. - */ - regs[2] &= ~CPUID2_MON; - - /* - * Hide the performance and debug features. - */ - regs[2] &= ~CPUID2_PDCM; - - /* - * No TSC deadline support in the APIC yet - */ - regs[2] &= ~CPUID2_TSCDLT; - - /* - * Hide thermal monitoring - */ - regs[3] &= ~(CPUID_ACPI | CPUID_TM); - - /* - * Hide the debug store capability. - */ - regs[3] &= ~CPUID_DS; - - /* - * Advertise the Machine Check and MTRR capability. - * - * Some guest OSes (e.g. Windows) will not boot if - * these features are absent. - */ - regs[3] |= (CPUID_MCA | CPUID_MCE | CPUID_MTRR); - - vm_get_topology(vm, &sockets, &cores, &threads, - &maxcpus); - logical_cpus = threads * cores; - regs[1] &= ~CPUID_HTT_CORES; - regs[1] |= (logical_cpus & 0xff) << 16; - regs[3] |= CPUID_HTT; - break; - - case CPUID_0000_0004: - cpuid_count(func, param, regs); - - if (regs[0] || regs[1] || regs[2] || regs[3]) { - vm_get_topology(vm, &sockets, &cores, &threads, - &maxcpus); - regs[0] &= 0x3ff; - regs[0] |= (cores - 1) << 26; - /* - * Cache topology: - * - L1 and L2 are shared only by the logical - * processors in a single core. - * - L3 and above are shared by all logical - * processors in the package. - */ - logical_cpus = threads; - level = (regs[0] >> 5) & 0x7; - if (level >= 3) - logical_cpus *= cores; - regs[0] |= (logical_cpus - 1) << 14; - } - break; - - case CPUID_0000_0007: - regs[0] = 0; - regs[1] = 0; - regs[2] = 0; - regs[3] = 0; - - /* leaf 0 */ - if (param == 0) { - cpuid_count(func, param, regs); - - /* Only leaf 0 is supported */ - regs[0] = 0; - - /* - * Expose known-safe features. - */ - regs[1] &= (CPUID_STDEXT_FSGSBASE | - CPUID_STDEXT_BMI1 | CPUID_STDEXT_HLE | - CPUID_STDEXT_AVX2 | CPUID_STDEXT_SMEP | - CPUID_STDEXT_BMI2 | - CPUID_STDEXT_ERMS | CPUID_STDEXT_RTM | - CPUID_STDEXT_AVX512F | - CPUID_STDEXT_RDSEED | - CPUID_STDEXT_SMAP | - CPUID_STDEXT_AVX512PF | - CPUID_STDEXT_AVX512ER | - CPUID_STDEXT_AVX512CD | CPUID_STDEXT_SHA); - regs[2] = 0; - regs[3] &= CPUID_STDEXT3_MD_CLEAR; - - /* Advertise INVPCID if it is enabled. 
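The leaf 1 %ebx rewrite above combines two fields: the vCPU's APIC ID in bits 31:24 and the flat logical-processor count in bits 23:16. A small sketch of that packing follows; the mask values are assumed to match CPUID_LOCAL_APIC_ID and CPUID_HTT_CORES from specialreg.h.

#include <stdio.h>
#include <stdint.h>

#define	CPUID_LOCAL_APIC_ID	0xff000000u	/* %ebx bits 31:24 */
#define	CPUID_HTT_CORES		0x00ff0000u	/* %ebx bits 23:16 */
#define	APICID_SHIFT		24

/* Rewrite leaf 1 %ebx for a given vCPU, as the emulation above does */
static uint32_t
leaf1_ebx(uint32_t host_ebx, uint32_t vcpu_id, uint32_t logical_cpus)
{
	uint32_t ebx = host_ebx;

	ebx &= ~CPUID_LOCAL_APIC_ID;
	ebx |= vcpu_id << APICID_SHIFT;
	ebx &= ~CPUID_HTT_CORES;
	ebx |= (logical_cpus & 0xff) << 16;
	return (ebx);
}

int
main(void)
{
	/* vCPU 3 in an 8-CPU flat topology, arbitrary host %ebx */
	printf("leaf 1 %%ebx = 0x%08x\n", leaf1_ebx(0x00100800, 3, 8));
	return (0);
}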
*/ - error = vm_get_capability(vm, vcpu_id, - VM_CAP_ENABLE_INVPCID, &enable_invpcid); - if (error == 0 && enable_invpcid) - regs[1] |= CPUID_STDEXT_INVPCID; - } - break; - - case CPUID_0000_0006: - regs[0] = CPUTPM1_ARAT; - regs[1] = 0; - regs[2] = 0; - regs[3] = 0; - break; - - case CPUID_0000_000A: - /* - * Handle the access, but report 0 for - * all options - */ - regs[0] = 0; - regs[1] = 0; - regs[2] = 0; - regs[3] = 0; - break; - - case CPUID_0000_000B: - /* - * Intel processor topology enumeration - */ - if (vmm_is_intel()) { - vm_get_topology(vm, &sockets, &cores, &threads, - &maxcpus); - if (param == 0) { - logical_cpus = threads; - width = log2(logical_cpus); - level = CPUID_TYPE_SMT; - x2apic_id = vcpu_id; - } - - if (param == 1) { - logical_cpus = threads * cores; - width = log2(logical_cpus); - level = CPUID_TYPE_CORE; - x2apic_id = vcpu_id; - } - - if (!cpuid_leaf_b || param >= 2) { - width = 0; - logical_cpus = 0; - level = 0; - x2apic_id = 0; - } - - regs[0] = width & 0x1f; - regs[1] = logical_cpus & 0xffff; - regs[2] = (level << 8) | (param & 0xff); - regs[3] = x2apic_id; - } else { - regs[0] = 0; - regs[1] = 0; - regs[2] = 0; - regs[3] = 0; - } - break; - - case CPUID_0000_000D: - limits = vmm_get_xsave_limits(); - if (!limits->xsave_enabled) { - regs[0] = 0; - regs[1] = 0; - regs[2] = 0; - regs[3] = 0; - break; - } - - cpuid_count(func, param, regs); - switch (param) { - case 0: - /* - * Only permit the guest to use bits - * that are active in the host in - * %xcr0. Also, claim that the - * maximum save area size is - * equivalent to the host's current - * save area size. Since this runs - * "inside" of vmrun(), it runs with - * the guest's xcr0, so the current - * save area size is correct as-is. - */ - regs[0] &= limits->xcr0_allowed; - regs[2] = limits->xsave_max_size; - regs[3] &= (limits->xcr0_allowed >> 32); - break; - case 1: - /* Only permit XSAVEOPT. */ - regs[0] &= CPUID_EXTSTATE_XSAVEOPT; - regs[1] = 0; - regs[2] = 0; - regs[3] = 0; - break; - default: - /* - * If the leaf is for a permitted feature, - * pass through as-is, otherwise return - * all zeroes. - */ - if (!(limits->xcr0_allowed & (1ul << param))) { - regs[0] = 0; - regs[1] = 0; - regs[2] = 0; - regs[3] = 0; - } - break; - } - break; - - case CPUID_0000_000F: - case CPUID_0000_0010: - /* - * Do not report any Resource Director Technology - * capabilities. Exposing control of cache or memory - * controller resource partitioning to the guest is not - * at all sensible. - * - * This is already hidden at a high level by masking of - * leaf 0x7. Even still, a guest may look here for - * detailed capability information. - */ - regs[0] = 0; - regs[1] = 0; - regs[2] = 0; - regs[3] = 0; - break; - - case CPUID_0000_0015: - /* - * Don't report CPU TSC/Crystal ratio and clock - * values since guests may use these to derive the - * local APIC frequency. - */ - regs[0] = 0; - regs[1] = 0; - regs[2] = 0; - regs[3] = 0; - break; - - case 0x40000000: - regs[0] = CPUID_VM_HIGH; - bcopy(bhyve_id, &regs[1], 4); - bcopy(bhyve_id + 4, &regs[2], 4); - bcopy(bhyve_id + 8, &regs[3], 4); - break; - - default: -default_leaf: - /* - * The leaf value has already been clamped so - * simply pass this through, keeping count of - * how many unhandled leaf values have been seen. - */ - atomic_add_long(&bhyve_xcpuids, 1); - cpuid_count(func, param, regs); - break; - } - - /* - * CPUID clears the upper 32-bits of the long-mode registers.
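The 0x40000000 case returns CPUID_VM_HIGH in %eax and spreads the 12-byte bhyve_id string across %ebx/%ecx/%edx. A guest-side sketch of reading it back, assuming gcc or clang inline assembly on x86 (on other hypervisors, or on bare metal, the leaf's contents will differ):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/*
 * Read the hypervisor identification leaf (0x40000000) the way a guest
 * would; under bhyve the 12 bytes in %ebx/%ecx/%edx spell out the
 * bhyve_id string copied in above.
 */
int
main(void)
{
	uint32_t eax, ebx, ecx, edx;
	char id[13];

	__asm__ __volatile__("cpuid"
	    : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
	    : "a" (0x40000000), "c" (0));

	memcpy(id, &ebx, 4);
	memcpy(id + 4, &ecx, 4);
	memcpy(id + 8, &edx, 4);
	id[12] = '\0';
	printf("max hypervisor leaf 0x%x, vendor \"%s\"\n", eax, id);
	return (0);
}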
- */ - *rax = regs[0]; - *rbx = regs[1]; - *rcx = regs[2]; - *rdx = regs[3]; - - return (1); -} - /* * Return 'true' if the capability 'cap' is enabled in this virtual cpu * and 'false' otherwise. diff --git a/usr/src/uts/intel/ml/desctbls_asm.s b/usr/src/uts/intel/ml/desctbls_asm.s index 4528bc07ad..230d6c4d34 100644 --- a/usr/src/uts/intel/ml/desctbls_asm.s +++ b/usr/src/uts/intel/ml/desctbls_asm.s @@ -25,6 +25,7 @@ /* * Copyright 2019 Joyent, Inc. + * Copyright 2022 Oxide Computer Company */ #include <sys/asm_linkage.h> @@ -60,9 +61,6 @@ pushq %rbp movq %rsp, %rbp lgdt (%rdi) - jmp 1f - nop -1: leave ret SET_SIZE(wr_gdtr) diff --git a/usr/src/uts/intel/sys/vmm.h b/usr/src/uts/intel/sys/vmm.h index 50d76ab17c..45e0fe8f34 100644 --- a/usr/src/uts/intel/sys/vmm.h +++ b/usr/src/uts/intel/sys/vmm.h @@ -406,4 +406,51 @@ enum vm_create_flags { VCF_RESERVOIR_MEM = (1 << 0), }; +/* + * Describes an entry for `cpuid` emulation. + * Used internally by bhyve (kernel) in addition to the exposed ioctl(2) + * interface. + */ +struct vcpu_cpuid_entry { + uint32_t vce_function; + uint32_t vce_index; + uint32_t vce_flags; + uint32_t vce_eax; + uint32_t vce_ebx; + uint32_t vce_ecx; + uint32_t vce_edx; + uint32_t _pad; +}; + +/* + * Defined flags for vcpu_cpuid_entry`vce_flags are below. + */ + +/* Use index (ecx) input value when matching entry */ +#define VCE_FLAG_MATCH_INDEX (1 << 0) + +/* All valid flags for vcpu_cpuid_entry`vce_flags */ +#define VCE_FLAGS_VALID VCE_FLAG_MATCH_INDEX + +/* + * Defined flags for vcpu_cpuid configuration are below. + * These are used by both the ioctl(2) interface via vm_vcpu_cpuid_config and + * internally in the kernel vmm. + */ + +/* Use legacy hard-coded cpuid masking tables applied to the host CPU */ +#define VCC_FLAG_LEGACY_HANDLING (1 << 0) +/* + * Emulate Intel-style fallback behavior (emit highest "standard" entry) if the + * queried function/index do not match. If not set, emulate AMD-style, where + * all zeroes are returned in such cases. + */ +#define VCC_FLAG_INTEL_FALLBACK (1 << 1) +/* All valid flags for vm_vcpu_cpuid_config`vvcc_flags */ +#define VCC_FLAGS_VALID \ + (VCC_FLAG_LEGACY_HANDLING | VCC_FLAG_INTEL_FALLBACK) + +/* Maximum vcpu_cpuid_entry records per vCPU */ +#define VMM_MAX_CPUID_ENTRIES 256 + #endif /* _VMM_H_ */ diff --git a/usr/src/uts/intel/sys/vmm_dev.h b/usr/src/uts/intel/sys/vmm_dev.h index b8c87217b4..80b8c2d7ba 100644 --- a/usr/src/uts/intel/sys/vmm_dev.h +++ b/usr/src/uts/intel/sys/vmm_dev.h @@ -370,6 +370,23 @@ struct vm_data_xfer { void *vdx_data; }; +struct vm_vcpu_cpuid_config { + int vvcc_vcpuid; + uint32_t vvcc_flags; + uint32_t vvcc_nent; + uint32_t _pad; + void *vvcc_entries; +}; + +/* Query the computed legacy cpuid value for a vcpuid with VM_LEGACY_CPUID */ +struct vm_legacy_cpuid { + int vlc_vcpuid; + uint32_t vlc_eax; + uint32_t vlc_ebx; + uint32_t vlc_ecx; + uint32_t vlc_edx; +}; + /* * VMM Interface Version * * @@ -385,7 +402,7 @@ struct vm_data_xfer { * best-effort activity. Nothing is to be inferred about the magnitude of a * change when the version is modified. It follows no rules like semver.
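The new structures make the matching rules worth spelling out: an entry matches on vce_function alone unless VCE_FLAG_MATCH_INDEX is set, in which case vce_index must also equal the %ecx input, and a miss is then resolved Intel-style or AMD-style depending on VCC_FLAG_INTEL_FALLBACK. Here is a userland sketch of that matching rule as implied by the definitions above; the kernel's actual lookup may be organized differently.

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

/* Pared-down copies of the structure and flag added above */
struct vcpu_cpuid_entry {
	uint32_t vce_function;
	uint32_t vce_index;
	uint32_t vce_flags;
	uint32_t vce_eax, vce_ebx, vce_ecx, vce_edx;
	uint32_t _pad;
};
#define	VCE_FLAG_MATCH_INDEX	(1 << 0)

/*
 * Match a function/index query against a table of entries: the index
 * (%ecx) participates in the comparison only when VCE_FLAG_MATCH_INDEX
 * is set on an entry.
 */
static const struct vcpu_cpuid_entry *
entry_match(const struct vcpu_cpuid_entry *ents, size_t nent,
    uint32_t func, uint32_t idx)
{
	for (size_t i = 0; i < nent; i++) {
		const struct vcpu_cpuid_entry *ent = &ents[i];

		if (ent->vce_function != func)
			continue;
		if ((ent->vce_flags & VCE_FLAG_MATCH_INDEX) != 0 &&
		    ent->vce_index != idx)
			continue;
		return (ent);
	}
	/* Caller applies Intel- or AMD-style fallback on NULL */
	return (NULL);
}

int
main(void)
{
	const struct vcpu_cpuid_entry table[] = {
		{ .vce_function = 0, .vce_eax = 0xd },
		{ .vce_function = 7, .vce_index = 0,
		    .vce_flags = VCE_FLAG_MATCH_INDEX },
	};

	/* Leaf 7 subleaf 1 misses: only subleaf 0 is present */
	printf("leaf 7/1 %s\n",
	    entry_match(table, 2, 7, 1) == NULL ? "missed" : "hit");
	return (0);
}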
*/ -#define VMM_CURRENT_INTERFACE_VERSION 4 +#define VMM_CURRENT_INTERFACE_VERSION 5 #define VMMCTL_IOC_BASE (('V' << 16) | ('M' << 8)) @@ -431,6 +448,9 @@ struct vm_data_xfer { #define VM_SET_RUN_STATE (VMM_CPU_IOC_BASE | 0x18) #define VM_GET_FPU (VMM_CPU_IOC_BASE | 0x19) #define VM_SET_FPU (VMM_CPU_IOC_BASE | 0x1a) +#define VM_GET_CPUID (VMM_CPU_IOC_BASE | 0x1b) +#define VM_SET_CPUID (VMM_CPU_IOC_BASE | 0x1c) +#define VM_LEGACY_CPUID (VMM_CPU_IOC_BASE | 0x1d) /* Operations requiring write-locking the VM */ #define VM_REINIT (VMM_LOCK_IOC_BASE | 0x01)
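Putting the ioctls together, a consumer might drive the new interface roughly as follows. This is a sketch only: the /dev/vmm/testvm path presumes an existing instance of that name, and treating vlc_eax (and vlc_ecx) as the leaf/subleaf inputs to VM_LEGACY_CPUID is an assumption, not something the header states.

#include <stdio.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/vmm.h>
#include <sys/vmm_dev.h>

int
main(void)
{
	int fd = open("/dev/vmm/testvm", O_RDWR);	/* hypothetical VM */
	if (fd < 0) {
		perror("open");
		return (1);
	}

	/* Install a single explicit entry for vCPU 0 (AMD-style fallback) */
	struct vcpu_cpuid_entry ent = {
		.vce_function = 0x40000000,
		.vce_eax = 0x40000000,
	};
	struct vm_vcpu_cpuid_config cfg = {
		.vvcc_vcpuid = 0,
		.vvcc_flags = 0,
		.vvcc_nent = 1,
		.vvcc_entries = &ent,
	};
	if (ioctl(fd, VM_SET_CPUID, &cfg) != 0)
		perror("VM_SET_CPUID");

	/* Ask what the legacy masking tables would compute for leaf 1 */
	struct vm_legacy_cpuid legacy = { .vlc_vcpuid = 0, .vlc_eax = 1 };
	if (ioctl(fd, VM_LEGACY_CPUID, &legacy) == 0)
		printf("legacy leaf 1: %%edx = 0x%08x\n", legacy.vlc_edx);
	return (0);
}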