diff options
Diffstat (limited to 'usr/src/uts/intel/io/vmm/x86.c')
-rw-r--r-- | usr/src/uts/intel/io/vmm/x86.c | 622 |
1 files changed, 0 insertions, 622 deletions
diff --git a/usr/src/uts/intel/io/vmm/x86.c b/usr/src/uts/intel/io/vmm/x86.c index e593e0c04e..c4a4d43028 100644 --- a/usr/src/uts/intel/io/vmm/x86.c +++ b/usr/src/uts/intel/io/vmm/x86.c @@ -63,628 +63,6 @@ __FBSDID("$FreeBSD$"); #include "vmm_host.h" #include "vmm_util.h" -SYSCTL_DECL(_hw_vmm); - -#define CPUID_VM_HIGH 0x40000000 - -static const char bhyve_id[12] = "bhyve bhyve "; - -/* Number of times an unknown cpuid leaf was accessed */ -static uint64_t bhyve_xcpuids; - -static int cpuid_leaf_b = 1; - -/* - * Force exposition of the invariant TSC capability, regardless of whether the - * host CPU reports having it. - */ -static int vmm_force_invariant_tsc = 0; - -#define CPUID_0000_0000 (0x0) -#define CPUID_0000_0001 (0x1) -#define CPUID_0000_0002 (0x2) -#define CPUID_0000_0003 (0x3) -#define CPUID_0000_0004 (0x4) -#define CPUID_0000_0006 (0x6) -#define CPUID_0000_0007 (0x7) -#define CPUID_0000_000A (0xA) -#define CPUID_0000_000B (0xB) -#define CPUID_0000_000D (0xD) -#define CPUID_0000_000F (0xF) -#define CPUID_0000_0010 (0x10) -#define CPUID_0000_0015 (0x15) -#define CPUID_8000_0000 (0x80000000) -#define CPUID_8000_0001 (0x80000001) -#define CPUID_8000_0002 (0x80000002) -#define CPUID_8000_0003 (0x80000003) -#define CPUID_8000_0004 (0x80000004) -#define CPUID_8000_0006 (0x80000006) -#define CPUID_8000_0007 (0x80000007) -#define CPUID_8000_0008 (0x80000008) -#define CPUID_8000_001D (0x8000001D) -#define CPUID_8000_001E (0x8000001E) - -/* - * CPUID instruction Fn0000_0001: - */ -#define CPUID_0000_0001_APICID_MASK (0xff<<24) -#define CPUID_0000_0001_APICID_SHIFT 24 - -/* - * CPUID instruction Fn0000_0001 ECX - */ -#define CPUID_0000_0001_FEAT0_VMX (1<<5) - - -/* - * Round up to the next power of two, if necessary, and then take log2. - * Returns -1 if argument is zero. 
- */ -static __inline int -log2(uint_t x) -{ - - return (fls(x << (1 - powerof2(x))) - 1); -} - -int -x86_emulate_cpuid(struct vm *vm, int vcpu_id, uint64_t *rax, uint64_t *rbx, - uint64_t *rcx, uint64_t *rdx) -{ - const struct xsave_limits *limits; - uint64_t cr4; - int error, enable_invpcid, level, width = 0, x2apic_id = 0; - unsigned int func, regs[4], logical_cpus = 0, param; - enum x2apic_state x2apic_state; - uint16_t cores, maxcpus, sockets, threads; - - /* - * The function of CPUID is controlled through the provided value of - * %eax (and secondarily %ecx, for certain leaf data). - */ - func = (uint32_t)*rax; - param = (uint32_t)*rcx; - - /* - * Requests for invalid CPUID levels should map to the highest - * available level instead. - */ - if (cpu_exthigh != 0 && func >= 0x80000000) { - if (func > cpu_exthigh) - func = cpu_exthigh; - } else if (func >= 0x40000000) { - if (func > CPUID_VM_HIGH) - func = CPUID_VM_HIGH; - } else if (func > cpu_high) { - func = cpu_high; - } - - /* - * In general the approach used for CPU topology is to - * advertise a flat topology where all CPUs are packages with - * no multi-core or SMT. - */ - switch (func) { - /* - * Pass these through to the guest - */ - case CPUID_0000_0000: - case CPUID_0000_0002: - case CPUID_0000_0003: - case CPUID_8000_0000: - case CPUID_8000_0002: - case CPUID_8000_0003: - case CPUID_8000_0004: - case CPUID_8000_0006: - cpuid_count(func, param, regs); - break; - case CPUID_8000_0008: - cpuid_count(func, param, regs); - if (vmm_is_svm()) { - /* - * As on Intel (0000_0007:0, EDX), mask out - * unsupported or unsafe AMD extended features - * (8000_0008 EBX). - */ - regs[1] &= (AMDFEID_CLZERO | AMDFEID_IRPERF | - AMDFEID_XSAVEERPTR); - - vm_get_topology(vm, &sockets, &cores, &threads, - &maxcpus); - /* - * Here, width is ApicIdCoreIdSize, present on - * at least Family 15h and newer. It - * represents the "number of bits in the - * initial apicid that indicate thread id - * within a package." 
- * - * Our topo_probe_amd() uses it for - * pkg_id_shift and other OSes may rely on it. - */ - width = MIN(0xF, log2(threads * cores)); - if (width < 0x4) - width = 0; - logical_cpus = MIN(0xFF, threads * cores - 1); - regs[2] = (width << AMDID_COREID_SIZE_SHIFT) | - logical_cpus; - } - break; - - case CPUID_8000_0001: - cpuid_count(func, param, regs); - - /* - * Hide SVM from guest. - */ - regs[2] &= ~AMDID2_SVM; - - /* - * Don't advertise extended performance counter MSRs - * to the guest. - */ - regs[2] &= ~AMDID2_PCXC; - regs[2] &= ~AMDID2_PNXC; - regs[2] &= ~AMDID2_PTSCEL2I; - - /* - * Don't advertise Instruction Based Sampling feature. - */ - regs[2] &= ~AMDID2_IBS; - - /* NodeID MSR not available */ - regs[2] &= ~AMDID2_NODE_ID; - - /* Don't advertise the OS visible workaround feature */ - regs[2] &= ~AMDID2_OSVW; - - /* Hide mwaitx/monitorx capability from the guest */ - regs[2] &= ~AMDID2_MWAITX; - -#ifndef __FreeBSD__ - /* - * Detection routines for TCE and FFXSR are missing - * from our vm_cpuid_capability() detection logic - * today. Mask them out until that is remedied. - * They do not appear to be in common usage, so their - * absence should not cause undue trouble. - */ - regs[2] &= ~AMDID2_TCE; - regs[3] &= ~AMDID_FFXSR; -#endif - - /* - * Hide rdtscp/ia32_tsc_aux until we know how - * to deal with them. - */ - regs[3] &= ~AMDID_RDTSCP; - break; - - case CPUID_8000_0007: - cpuid_count(func, param, regs); - /* - * AMD uses this leaf to advertise the processor's - * power monitoring and RAS capabilities. These - * features are hardware-specific and exposing - * them to a guest doesn't make a lot of sense. - * - * Intel uses this leaf only to advertise the - * "Invariant TSC" feature with all other bits - * being reserved (set to zero). - */ - regs[0] = 0; - regs[1] = 0; - regs[2] = 0; - - /* - * If the host system possesses an invariant TSC, then - * it is safe to expose to the guest. 
- * - * If there is measured skew between host TSCs, it will - * be properly offset so guests do not observe any - * change between CPU migrations. - */ - regs[3] &= AMDPM_TSC_INVARIANT; - - /* - * Since illumos avoids deep C-states on CPUs which do - * not support an invariant TSC, it may be safe (and - * desired) to unconditionally expose that capability to - * the guest. - */ - if (vmm_force_invariant_tsc != 0) { - regs[3] |= AMDPM_TSC_INVARIANT; - } - break; - - case CPUID_8000_001D: - /* AMD Cache topology, like 0000_0004 for Intel. */ - if (!vmm_is_svm()) - goto default_leaf; - - /* - * Similar to Intel, generate a ficticious cache - * topology for the guest with L3 shared by the - * package, and L1 and L2 local to a core. - */ - vm_get_topology(vm, &sockets, &cores, &threads, - &maxcpus); - switch (param) { - case 0: - logical_cpus = threads; - level = 1; - func = 1; /* data cache */ - break; - case 1: - logical_cpus = threads; - level = 2; - func = 3; /* unified cache */ - break; - case 2: - logical_cpus = threads * cores; - level = 3; - func = 3; /* unified cache */ - break; - default: - logical_cpus = 0; - level = 0; - func = 0; - break; - } - - logical_cpus = MIN(0xfff, logical_cpus - 1); - regs[0] = (logical_cpus << 14) | (1 << 8) | - (level << 5) | func; - regs[1] = (func > 0) ? (CACHE_LINE_SIZE - 1) : 0; - regs[2] = 0; - regs[3] = 0; - break; - - case CPUID_8000_001E: - /* - * AMD Family 16h+ and Hygon Family 18h additional - * identifiers. - */ - if (!vmm_is_svm() || CPUID_TO_FAMILY(cpu_id) < 0x16) - goto default_leaf; - - vm_get_topology(vm, &sockets, &cores, &threads, - &maxcpus); - regs[0] = vcpu_id; - threads = MIN(0xFF, threads - 1); - regs[1] = (threads << 8) | - (vcpu_id >> log2(threads + 1)); - /* - * XXX Bhyve topology cannot yet represent >1 node per - * processor. 
- */ - regs[2] = 0; - regs[3] = 0; - break; - - case CPUID_0000_0001: - do_cpuid(1, regs); - - error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state); - if (error) { - panic("x86_emulate_cpuid: error %d " - "fetching x2apic state", error); - } - - /* - * Override the APIC ID only in ebx - */ - regs[1] &= ~(CPUID_LOCAL_APIC_ID); - regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT); - - /* - * Don't expose VMX, SpeedStep, TME or SMX capability. - * Advertise x2APIC capability and Hypervisor guest. - */ - regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2); - regs[2] &= ~(CPUID2_SMX); - - regs[2] |= CPUID2_HV; - - if (x2apic_state != X2APIC_DISABLED) - regs[2] |= CPUID2_X2APIC; - else - regs[2] &= ~CPUID2_X2APIC; - - /* - * Only advertise CPUID2_XSAVE in the guest if - * the host is using XSAVE. - */ - if (!(regs[2] & CPUID2_OSXSAVE)) - regs[2] &= ~CPUID2_XSAVE; - - /* - * If CPUID2_XSAVE is being advertised and the - * guest has set CR4_XSAVE, set - * CPUID2_OSXSAVE. - */ - regs[2] &= ~CPUID2_OSXSAVE; - if (regs[2] & CPUID2_XSAVE) { - error = vm_get_register(vm, vcpu_id, - VM_REG_GUEST_CR4, &cr4); - if (error) - panic("x86_emulate_cpuid: error %d " - "fetching %%cr4", error); - if (cr4 & CR4_XSAVE) - regs[2] |= CPUID2_OSXSAVE; - } - - /* - * Hide monitor/mwait until we know how to deal with - * these instructions. - */ - regs[2] &= ~CPUID2_MON; - - /* - * Hide the performance and debug features. - */ - regs[2] &= ~CPUID2_PDCM; - - /* - * No TSC deadline support in the APIC yet - */ - regs[2] &= ~CPUID2_TSCDLT; - - /* - * Hide thermal monitoring - */ - regs[3] &= ~(CPUID_ACPI | CPUID_TM); - - /* - * Hide the debug store capability. - */ - regs[3] &= ~CPUID_DS; - - /* - * Advertise the Machine Check and MTRR capability. - * - * Some guest OSes (e.g. Windows) will not boot if - * these features are absent. 
- */ - regs[3] |= (CPUID_MCA | CPUID_MCE | CPUID_MTRR); - - vm_get_topology(vm, &sockets, &cores, &threads, - &maxcpus); - logical_cpus = threads * cores; - regs[1] &= ~CPUID_HTT_CORES; - regs[1] |= (logical_cpus & 0xff) << 16; - regs[3] |= CPUID_HTT; - break; - - case CPUID_0000_0004: - cpuid_count(func, param, regs); - - if (regs[0] || regs[1] || regs[2] || regs[3]) { - vm_get_topology(vm, &sockets, &cores, &threads, - &maxcpus); - regs[0] &= 0x3ff; - regs[0] |= (cores - 1) << 26; - /* - * Cache topology: - * - L1 and L2 are shared only by the logical - * processors in a single core. - * - L3 and above are shared by all logical - * processors in the package. - */ - logical_cpus = threads; - level = (regs[0] >> 5) & 0x7; - if (level >= 3) - logical_cpus *= cores; - regs[0] |= (logical_cpus - 1) << 14; - } - break; - - case CPUID_0000_0007: - regs[0] = 0; - regs[1] = 0; - regs[2] = 0; - regs[3] = 0; - - /* leaf 0 */ - if (param == 0) { - cpuid_count(func, param, regs); - - /* Only leaf 0 is supported */ - regs[0] = 0; - - /* - * Expose known-safe features. - */ - regs[1] &= (CPUID_STDEXT_FSGSBASE | - CPUID_STDEXT_BMI1 | CPUID_STDEXT_HLE | - CPUID_STDEXT_AVX2 | CPUID_STDEXT_SMEP | - CPUID_STDEXT_BMI2 | - CPUID_STDEXT_ERMS | CPUID_STDEXT_RTM | - CPUID_STDEXT_AVX512F | - CPUID_STDEXT_RDSEED | - CPUID_STDEXT_SMAP | - CPUID_STDEXT_AVX512PF | - CPUID_STDEXT_AVX512ER | - CPUID_STDEXT_AVX512CD | CPUID_STDEXT_SHA); - regs[2] = 0; - regs[3] &= CPUID_STDEXT3_MD_CLEAR; - - /* Advertise INVPCID if it is enabled. 
*/ - error = vm_get_capability(vm, vcpu_id, - VM_CAP_ENABLE_INVPCID, &enable_invpcid); - if (error == 0 && enable_invpcid) - regs[1] |= CPUID_STDEXT_INVPCID; - } - break; - - case CPUID_0000_0006: - regs[0] = CPUTPM1_ARAT; - regs[1] = 0; - regs[2] = 0; - regs[3] = 0; - break; - - case CPUID_0000_000A: - /* - * Handle the access, but report 0 for - * all options - */ - regs[0] = 0; - regs[1] = 0; - regs[2] = 0; - regs[3] = 0; - break; - - case CPUID_0000_000B: - /* - * Intel processor topology enumeration - */ - if (vmm_is_intel()) { - vm_get_topology(vm, &sockets, &cores, &threads, - &maxcpus); - if (param == 0) { - logical_cpus = threads; - width = log2(logical_cpus); - level = CPUID_TYPE_SMT; - x2apic_id = vcpu_id; - } - - if (param == 1) { - logical_cpus = threads * cores; - width = log2(logical_cpus); - level = CPUID_TYPE_CORE; - x2apic_id = vcpu_id; - } - - if (!cpuid_leaf_b || param >= 2) { - width = 0; - logical_cpus = 0; - level = 0; - x2apic_id = 0; - } - - regs[0] = width & 0x1f; - regs[1] = logical_cpus & 0xffff; - regs[2] = (level << 8) | (param & 0xff); - regs[3] = x2apic_id; - } else { - regs[0] = 0; - regs[1] = 0; - regs[2] = 0; - regs[3] = 0; - } - break; - - case CPUID_0000_000D: - limits = vmm_get_xsave_limits(); - if (!limits->xsave_enabled) { - regs[0] = 0; - regs[1] = 0; - regs[2] = 0; - regs[3] = 0; - break; - } - - cpuid_count(func, param, regs); - switch (param) { - case 0: - /* - * Only permit the guest to use bits - * that are active in the host in - * %xcr0. Also, claim that the - * maximum save area size is - * equivalent to the host's current - * save area size. Since this runs - * "inside" of vmrun(), it runs with - * the guest's xcr0, so the current - * save area size is correct as-is. - */ - regs[0] &= limits->xcr0_allowed; - regs[2] = limits->xsave_max_size; - regs[3] &= (limits->xcr0_allowed >> 32); - break; - case 1: - /* Only permit XSAVEOPT. 
*/ - regs[0] &= CPUID_EXTSTATE_XSAVEOPT; - regs[1] = 0; - regs[2] = 0; - regs[3] = 0; - break; - default: - /* - * If the leaf is for a permitted feature, - * pass through as-is, otherwise return - * all zeroes. - */ - if (!(limits->xcr0_allowed & (1ul << param))) { - regs[0] = 0; - regs[1] = 0; - regs[2] = 0; - regs[3] = 0; - } - break; - } - break; - - case CPUID_0000_000F: - case CPUID_0000_0010: - /* - * Do not report any Resource Director Technology - * capabilities. Exposing control of cache or memory - * controller resource partitioning to the guest is not - * at all sensible. - * - * This is already hidden at a high level by masking of - * leaf 0x7. Even still, a guest may look here for - * detailed capability information. - */ - regs[0] = 0; - regs[1] = 0; - regs[2] = 0; - regs[3] = 0; - break; - - case CPUID_0000_0015: - /* - * Don't report CPU TSC/Crystal ratio and clock - * values since guests may use these to derive the - * local APIC frequency.. - */ - regs[0] = 0; - regs[1] = 0; - regs[2] = 0; - regs[3] = 0; - break; - - case 0x40000000: - regs[0] = CPUID_VM_HIGH; - bcopy(bhyve_id, &regs[1], 4); - bcopy(bhyve_id + 4, &regs[2], 4); - bcopy(bhyve_id + 8, &regs[3], 4); - break; - - default: -default_leaf: - /* - * The leaf value has already been clamped so - * simply pass this through, keeping count of - * how many unhandled leaf values have been seen. - */ - atomic_add_long(&bhyve_xcpuids, 1); - cpuid_count(func, param, regs); - break; - } - - /* - * CPUID clears the upper 32-bits of the long-mode registers. - */ - *rax = regs[0]; - *rbx = regs[1]; - *rcx = regs[2]; - *rdx = regs[3]; - - return (1); -} - /* * Return 'true' if the capability 'cap' is enabled in this virtual cpu * and 'false' otherwise. |