author     Patrick Mooney <pmooney@pfmooney.com>  2018-12-14 23:15:07 +0000
committer  Patrick Mooney <pmooney@pfmooney.com>  2019-03-15 15:01:23 +0000
commit     2453029c010976e95241a5f5244e86d44dc6194c (patch)
tree       ee57e43a87769a4489b772635f14c4e153205ac5 /usr
parent     7afbf32617941421fcf18c3c37cbce5f7d30a8fc (diff)
download   illumos-joyent-2453029c010976e95241a5f5244e86d44dc6194c.tar.gz
OS-7170 bhyve should support AMD
Reviewed by: John Levon <john.levon@joyent.com>
Reviewed by: Hans Rosenfeld <hans.rosenfeld@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Diffstat (limited to 'usr')
-rw-r--r--  usr/src/uts/i86pc/Makefile.files              |   2
-rw-r--r--  usr/src/uts/i86pc/Makefile.rules              |   3
-rw-r--r--  usr/src/uts/i86pc/io/vmm/amd/svm.c            | 122
-rw-r--r--  usr/src/uts/i86pc/io/vmm/amd/svm_msr.c        |  26
-rw-r--r--  usr/src/uts/i86pc/io/vmm/amd/svm_softc.h      |  15
-rw-r--r--  usr/src/uts/i86pc/io/vmm/amd/svm_support.s    | 188
-rw-r--r--  usr/src/uts/i86pc/io/vmm/intel/vmx.c          |   6
-rw-r--r--  usr/src/uts/i86pc/io/vmm/intel/vmx.h          |   3
-rw-r--r--  usr/src/uts/i86pc/io/vmm/intel/vmx_support.s  |  40
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vm/vm_glue.h         |   1
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c        |  15
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c        | 297
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c         |  16
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vmm_support.s        |  54
-rw-r--r--  usr/src/uts/i86pc/io/vmm/x86.c                |  12
-rw-r--r--  usr/src/uts/i86pc/os/hma.c                    | 273
-rw-r--r--  usr/src/uts/i86pc/sys/hma.h                   |   9
-rw-r--r--  usr/src/uts/i86pc/sys/vmm.h                   |   4
-rw-r--r--  usr/src/uts/intel/sys/controlregs.h           |  12
-rw-r--r--  usr/src/uts/intel/sys/x86_archext.h           |  12

20 files changed, 942 insertions(+), 168 deletions(-)
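The diff below splits the AMD work into two layers: the platform HMA code (usr/src/uts/i86pc/os/hma.c) enables SVM on every online CPU and hands out ASIDs, while bhyve's AMD-specific code consumes those services through the new sys/hma.h interfaces. As a rough, hedged sketch of the consumer side — only the hma_*() calls and types come from this commit; the surrounding names are illustrative:

```c
#include <sys/types.h>
#include <sys/hma.h>

static hma_reg_t *example_reg;

int
example_attach(void)
{
	/*
	 * hma_register() now succeeds on AMD hosts once hma_svm_init()
	 * has enabled SVM (EFER.SVME + VM_HSAVE_PA) on all online CPUs.
	 */
	example_reg = hma_register("example-consumer");
	return (example_reg == NULL ? -1 : 0);
}

void
example_detach(void)
{
	hma_unregister(example_reg);
}
```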
diff --git a/usr/src/uts/i86pc/Makefile.files b/usr/src/uts/i86pc/Makefile.files index 8eadfcb56f..3979ddaef7 100644 --- a/usr/src/uts/i86pc/Makefile.files +++ b/usr/src/uts/i86pc/Makefile.files @@ -279,6 +279,8 @@ VMM_OBJS += vmm.o \ vmm_sol_vm.o \ vmm_sol_glue.o \ vmm_sol_ept.o \ + vmm_sol_rvi.o \ + vmm_support.o \ vmm_zsd.o VIONA_OBJS += viona.o diff --git a/usr/src/uts/i86pc/Makefile.rules b/usr/src/uts/i86pc/Makefile.rules index 3ea69978ce..0e3ea556ea 100644 --- a/usr/src/uts/i86pc/Makefile.rules +++ b/usr/src/uts/i86pc/Makefile.rules @@ -232,6 +232,9 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/vmm/io/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/vmm/%.s + $(COMPILE.s) -o $@ $< + $(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/vmm/intel/%.s $(COMPILE.s) -o $@ $< diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm.c b/usr/src/uts/i86pc/io/vmm/amd/svm.c index e921383d22..ca9ed9e4e1 100644 --- a/usr/src/uts/i86pc/io/vmm/amd/svm.c +++ b/usr/src/uts/i86pc/io/vmm/amd/svm.c @@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$"); #ifndef __FreeBSD__ #include <sys/x86_archext.h> +#include <sys/trap.h> #endif #include <vm/vm.h> @@ -123,6 +124,7 @@ static int disable_npf_assist; SYSCTL_INT(_hw_vmm_svm, OID_AUTO, disable_npf_assist, CTLFLAG_RWTUN, &disable_npf_assist, 0, NULL); +#ifdef __FreeBSD__ /* Maximum ASIDs supported by the processor */ static uint32_t nasid; SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, num_asids, CTLFLAG_RDTUN, &nasid, 0, @@ -135,6 +137,7 @@ static struct asid asid[MAXCPU]; * SVM host state saved area of size 4KB for each core. */ static uint8_t hsave[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); +#endif /* __FreeBSD__ */ static VMM_STAT_AMD(VCPU_EXITINTINFO, "VM exits during event delivery"); static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "Events pending at VM entry"); @@ -156,6 +159,7 @@ decode_assist(void) return (svm_feature & AMD_CPUID_SVM_DECODE_ASSIST); } +#ifdef __FreeBSD__ static void svm_disable(void *arg __unused) { @@ -298,6 +302,31 @@ svm_restore(void) svm_enable(NULL); } +#else /* __FreeBSD__ */ +static int +svm_cleanup(void) +{ + /* This is taken care of by the hma registration */ + return (0); +} + +static int +svm_init(int ipinum) +{ + vmcb_clean &= VMCB_CACHE_DEFAULT; + + svm_msr_init(); + svm_npt_init(ipinum); + + return (0); +} + +static void +svm_restore(void) +{ + /* No-op on illumos */ +} +#endif /* __FreeBSD__ */ /* Pentium compatible MSRs */ #define MSR_PENTIUM_START 0 @@ -1309,7 +1338,11 @@ svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) struct svm_regctx *ctx; uint64_t code, info1, info2, val; uint32_t eax, ecx, edx; +#ifdef __FreeBSD__ int error, errcode_valid, handled, idtvec, reflect; +#else + int error, errcode_valid = 0, handled, idtvec, reflect; +#endif bool retu; ctx = svm_get_guest_regctx(svm_sc, vcpu); @@ -1380,8 +1413,11 @@ svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) */ reflect = 0; VCPU_CTR0(svm_sc->vm, vcpu, "Vectoring to MCE handler"); - /* XXXJOY: we will need equivalent of vmx_call_trap */ +#ifdef __FreeBSD__ __asm __volatile("int $18"); +#else + vmm_call_trap(T_MCE); +#endif break; case IDT_PF: error = svm_setreg(svm_sc, vcpu, VM_REG_GUEST_CR2, @@ -1772,11 +1808,14 @@ restore_host_tss(void) tss_sd->sd_type = SDT_SYSTSS; ltr(GSEL(GPROC0_SEL, SEL_KPL)); #else - /* XXXJOY: Add logic to restore TSS for us */ - panic("SVM Restore system TSS"); + system_desc_t *tss = (system_desc_t *)&CPU->cpu_gdt[GDT_KTSS]; + + tss->ssd_type = SDT_SYSTSS; + wr_tsr(KTSS_SEL); #endif } +#ifdef __FreeBSD__ static void 
check_asid(struct svm_softc *sc, int vcpuid, pmap_t pmap, u_int thiscpu) { @@ -1879,6 +1918,27 @@ check_asid(struct svm_softc *sc, int vcpuid, pmap_t pmap, u_int thiscpu) KASSERT(ctrl->asid == vcpustate->asid.num, ("ASID mismatch: %u/%u", ctrl->asid, vcpustate->asid.num)); } +#else /* __FreeBSD__ */ +static void +check_asid(struct svm_softc *sc, int vcpuid, pmap_t pmap, u_int thiscpu) +{ + struct svm_vcpu *vcpustate = svm_get_vcpu(sc, vcpuid); + struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpuid); + long eptgen; + uint8_t flush; + + eptgen = pmap->pm_eptgen; + flush = hma_svm_asid_update(&vcpustate->hma_asid, flush_by_asid(), + vcpustate->eptgen == eptgen); + + if (flush != VMCB_TLB_FLUSH_NOTHING) { + ctrl->asid = vcpustate->hma_asid.hsa_asid; + svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID); + } + ctrl->tlb_ctrl = flush; + vcpustate->eptgen = eptgen; +} +#endif /* __FreeBSD__ */ static __inline void disable_gintr(void) @@ -1983,7 +2043,11 @@ svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap, /* * Force new ASID allocation by invalidating the generation. */ +#ifdef __FreeBSD__ vcpustate->asid.gen = 0; +#else + vcpustate->hma_asid.hsa_gen = 0; +#endif /* * Invalidate the VMCB state cache by marking all fields dirty. @@ -2006,10 +2070,25 @@ svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap, svm_msr_guest_enter(svm_sc, vcpu); +#ifndef __FreeBSD__ + VERIFY(!vcpustate->loaded && curthread->t_preempt != 0); + vcpustate->loaded = B_TRUE; +#endif + /* Update Guest RIP */ state->rip = rip; do { +#ifndef __FreeBSD__ + /* + * Interrupt injection may involve mutex contention which, on + * illumos bhyve, are blocking/non-spin. Doing so with global + * interrupts disabled is a recipe for deadlock, so it is + * performed here. + */ + svm_inj_interrupts(svm_sc, vcpu, vlapic); +#endif + /* * Disable global interrupts to guarantee atomicity during * loading of guest state. This includes not only the state @@ -2059,7 +2138,9 @@ svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap, */ ldt_sel = sldt(); +#ifdef __FreeBSD__ svm_inj_interrupts(svm_sc, vcpu, vlapic); +#endif /* Activate the nested pmap on 'curcpu' */ CPU_SET_ATOMIC_ACQ(curcpu, &pmap->pm_active); @@ -2108,6 +2189,11 @@ svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap, svm_msr_guest_exit(svm_sc, vcpu); +#ifndef __FreeBSD__ + VERIFY(vcpustate->loaded && curthread->t_preempt != 0); + vcpustate->loaded = B_FALSE; +#endif + return (0); } @@ -2309,6 +2395,28 @@ svm_vlapic_cleanup(void *arg, struct vlapic *vlapic) free(vlapic, M_SVM_VLAPIC); } +#ifndef __FreeBSD__ +static void +svm_savectx(void *arg, int vcpu) +{ + struct svm_softc *sc = arg; + + if (sc->vcpu[vcpu].loaded) { + svm_msr_guest_exit(sc, vcpu); + } +} + +static void +svm_restorectx(void *arg, int vcpu) +{ + struct svm_softc *sc = arg; + + if (sc->vcpu[vcpu].loaded) { + svm_msr_guest_enter(sc, vcpu); + } +} +#endif /* __FreeBSD__ */ + struct vmm_ops vmm_ops_amd = { svm_init, svm_cleanup, @@ -2328,11 +2436,7 @@ struct vmm_ops vmm_ops_amd = { svm_vlapic_cleanup, #ifndef __FreeBSD__ - /* - * When SVM support is wired up and tested, it is likely to require - * savectx/restorectx functions similar to VMX. 
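The svm_vmrun() changes above are spread across several hunks; the condensed, hedged view below shows the resulting per-iteration ordering on illumos (not literal source — error paths, dirty-bit handling, and pmap activation are elided). The key reordering is that event injection now happens before global interrupts are disabled, since illumos mutexes are blocking rather than spin locks.

```c
static void
svm_vmrun_shape(struct svm_softc *svm_sc, int vcpu, struct vlapic *vlapic,
    pmap_t pmap, uint64_t vmcb_pa, struct svm_regctx *gctx)
{
	int handled;

	do {
		/*
		 * Event injection may take blocking (non-spin) mutexes on
		 * illumos, so it must occur before global interrupts go off.
		 */
		svm_inj_interrupts(svm_sc, vcpu, vlapic);

		disable_gintr();			/* CLGI */

		/* ASID/TLB maintenance is delegated to hma_svm_asid_update() */
		check_asid(svm_sc, vcpu, pmap, CPU->cpu_seqid);

		svm_launch(vmcb_pa, gctx, CPU);		/* VMLOAD/VMRUN/VMSAVE */

		enable_gintr();				/* STGI */

		handled = svm_vmexit(svm_sc, vcpu,
		    vm_exitinfo(svm_sc->vm, vcpu));
	} while (handled);
}
```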
- */ - NULL, - NULL, + svm_savectx, + svm_restorectx, #endif }; diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm_msr.c b/usr/src/uts/i86pc/io/vmm/amd/svm_msr.c index 67c43100f1..0c1ce0e4e0 100644 --- a/usr/src/uts/i86pc/io/vmm/amd/svm_msr.c +++ b/usr/src/uts/i86pc/io/vmm/amd/svm_msr.c @@ -54,6 +54,7 @@ enum { HOST_MSR_NUM /* must be the last enumeration */ }; +#ifdef __FreeBSD__ static uint64_t host_msrs[HOST_MSR_NUM]; void @@ -68,6 +69,19 @@ svm_msr_init(void) host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); } +#else + +CTASSERT(HOST_MSR_NUM == SVM_HOST_MSR_NUM); + +void +svm_msr_init(void) +{ + /* + * These MSRs do vary between CPUs on illumos, so saving system-wide + * values for them serves no purpose. + */ +} +#endif /* __FreeBSD__ */ void svm_msr_guest_init(struct svm_softc *sc, int vcpu) @@ -89,11 +103,23 @@ svm_msr_guest_enter(struct svm_softc *sc, int vcpu) /* * Save host MSRs (if any) and restore guest MSRs (if any). */ +#ifndef __FreeBSD__ + uint64_t *host_msrs = sc->host_msrs[vcpu]; + + /* Save host MSRs */ + host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR); + host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR); + host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); + host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); +#endif /* __FreeBSD__ */ } void svm_msr_guest_exit(struct svm_softc *sc, int vcpu) { +#ifndef __FreeBSD__ + uint64_t *host_msrs = sc->host_msrs[vcpu]; +#endif /* * Save guest MSRs (if any) and restore host MSRs. */ diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm_softc.h b/usr/src/uts/i86pc/io/vmm/amd/svm_softc.h index 8735353bb4..b5ac1903e7 100644 --- a/usr/src/uts/i86pc/io/vmm/amd/svm_softc.h +++ b/usr/src/uts/i86pc/io/vmm/amd/svm_softc.h @@ -34,10 +34,17 @@ #define SVM_IO_BITMAP_SIZE (3 * PAGE_SIZE) #define SVM_MSR_BITMAP_SIZE (2 * PAGE_SIZE) +#ifdef __FreeBSD__ struct asid { uint64_t gen; /* range is [1, ~0UL] */ uint32_t num; /* range is [1, nasid - 1] */ }; +#else +#include <sys/hma.h> + +/* This must match HOST_MSR_NUM in svm_msr.c (where it is CTASSERTed) */ +#define SVM_HOST_MSR_NUM 4 +#endif /* __FreeBSD__ */ /* * XXX separate out 'struct vmcb' from 'svm_vcpu' to avoid wasting space @@ -51,7 +58,12 @@ struct svm_vcpu { int lastcpu; /* host cpu that the vcpu last ran on */ uint32_t dirty; /* state cache bits that must be cleared */ long eptgen; /* pmap->pm_eptgen when the vcpu last ran */ +#ifdef __FreeBSD__ struct asid asid; +#else + hma_svm_asid_t hma_asid; + boolean_t loaded; +#endif } __aligned(PAGE_SIZE); /* @@ -64,6 +76,9 @@ struct svm_softc { uint8_t *iopm_bitmap; /* shared by all vcpus */ uint8_t *msr_bitmap; /* shared by all vcpus */ struct vm *vm; +#ifndef __FreeBSD__ + uint64_t host_msrs[VM_MAXCPU][SVM_HOST_MSR_NUM]; +#endif }; CTASSERT((offsetof(struct svm_softc, nptp) & PAGE_MASK) == 0); diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm_support.s b/usr/src/uts/i86pc/io/vmm/amd/svm_support.s index 4258c95d70..fad994b09c 100644 --- a/usr/src/uts/i86pc/io/vmm/amd/svm_support.s +++ b/usr/src/uts/i86pc/io/vmm/amd/svm_support.s @@ -25,7 +25,12 @@ * * $FreeBSD$ */ -#include <machine/asmacros.h> + +/* + * Copyright 2019 Joyent, Inc. + */ + +#include <sys/asm_linkage.h> #include "svm_assym.h" @@ -34,115 +39,126 @@ #if defined(lint) struct svm_regctx; -struct pcpu; +struct cpu; /*ARGSUSED*/ void -svm_launch(uint64_t pa, struct svm_regctx *gctx, struct pcpu *pcpu) +svm_launch(uint64_t pa, struct svm_regctx *gctx, struct cpu *cpu) {} #else /* lint */ -/* - * Be friendly to DTrace FBT's prologue/epilogue pattern matching. 
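A note on the svm_msr.c hunks above: since the host values of the syscall MSRs can differ between CPUs on illumos and the vCPU thread may migrate, the save area moves from a single system-wide array into the per-vCPU sc->host_msrs[vcpu] slot. The hunk only shows the declaration on the exit side; a hedged sketch of the matching restore (the actual svm_msr_guest_exit() also handles guest-visible MSR state elsewhere):

```c
void
svm_msr_guest_exit_sketch(struct svm_softc *sc, int vcpu)
{
	uint64_t *host_msrs = sc->host_msrs[vcpu];

	/* Restore the host values saved by svm_msr_guest_enter(). */
	wrmsr(MSR_LSTAR, host_msrs[IDX_MSR_LSTAR]);
	wrmsr(MSR_CSTAR, host_msrs[IDX_MSR_CSTAR]);
	wrmsr(MSR_STAR, host_msrs[IDX_MSR_STAR]);
	wrmsr(MSR_SF_MASK, host_msrs[IDX_MSR_SF_MASK]);
}
```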
- * - * They are also responsible for saving/restoring the host %rbp across VMRUN. - */ -#define VENTER push %rbp ; mov %rsp,%rbp -#define VLEAVE pop %rbp - #define VMLOAD .byte 0x0f, 0x01, 0xda #define VMRUN .byte 0x0f, 0x01, 0xd8 #define VMSAVE .byte 0x0f, 0x01, 0xdb + +/* + * Flush scratch registers to avoid lingering guest state being used for + * Spectre v1 attacks when returning from guest entry. + */ +#define SVM_GUEST_FLUSH_SCRATCH \ + xorl %edi, %edi; \ + xorl %esi, %esi; \ + xorl %edx, %edx; \ + xorl %ecx, %ecx; \ + xorl %r8d, %r8d; \ + xorl %r9d, %r9d; \ + xorl %r10d, %r10d; \ + xorl %r11d, %r11d; + +/* Stack layout (offset from %rsp) for svm_launch */ +#define SVMSTK_R15 0x00 /* callee saved %r15 */ +#define SVMSTK_R14 0x08 /* callee saved %r14 */ +#define SVMSTK_R13 0x10 /* callee saved %r13 */ +#define SVMSTK_R12 0x18 /* callee saved %r12 */ +#define SVMSTK_RBX 0x20 /* callee saved %rbx */ +#define SVMSTK_RDX 0x28 /* save-args %rdx (struct cpu *) */ +#define SVMSTK_RSI 0x30 /* save-args %rsi (struct svm_regctx *) */ +#define SVMSTK_RDI 0x38 /* save-args %rdi (uint64_t vmcb_pa) */ +#define SVMSTK_FP 0x40 /* frame pointer %rbp */ +#define SVMSTKSIZE SVMSTK_FP + /* * svm_launch(uint64_t vmcb, struct svm_regctx *gctx, struct pcpu *pcpu) * %rdi: physical address of VMCB * %rsi: pointer to guest context * %rdx: pointer to the pcpu data */ -ENTRY(svm_launch) - VENTER - - /* save pointer to the pcpu data */ - push %rdx - - /* - * Host register state saved across a VMRUN. - * - * All "callee saved registers" except: - * %rsp: because it is preserved by the processor across VMRUN. - * %rbp: because it is saved/restored by the function prologue/epilogue. - */ - push %rbx - push %r12 - push %r13 - push %r14 - push %r15 - - /* Save the physical address of the VMCB in %rax */ - movq %rdi, %rax - - push %rsi /* push guest context pointer on the stack */ - - /* - * Restore guest state. - */ - movq SCTX_R8(%rsi), %r8 - movq SCTX_R9(%rsi), %r9 - movq SCTX_R10(%rsi), %r10 - movq SCTX_R11(%rsi), %r11 - movq SCTX_R12(%rsi), %r12 - movq SCTX_R13(%rsi), %r13 - movq SCTX_R14(%rsi), %r14 - movq SCTX_R15(%rsi), %r15 - movq SCTX_RBP(%rsi), %rbp - movq SCTX_RBX(%rsi), %rbx - movq SCTX_RCX(%rsi), %rcx - movq SCTX_RDX(%rsi), %rdx - movq SCTX_RDI(%rsi), %rdi - movq SCTX_RSI(%rsi), %rsi /* %rsi must be restored last */ +ENTRY_NP(svm_launch) + pushq %rbp + movq %rsp, %rbp + subq $SVMSTKSIZE, %rsp + movq %r15, SVMSTK_R15(%rsp) + movq %r14, SVMSTK_R14(%rsp) + movq %r13, SVMSTK_R13(%rsp) + movq %r12, SVMSTK_R12(%rsp) + movq %rbx, SVMSTK_RBX(%rsp) + movq %rdx, SVMSTK_RDX(%rsp) + movq %rsi, SVMSTK_RSI(%rsp) + movq %rdi, SVMSTK_RDI(%rsp) + + /* VMLOAD and VMRUN expect the VMCB physaddr in %rax */ + movq %rdi, %rax + + /* Restore guest state. */ + movq SCTX_R8(%rsi), %r8 + movq SCTX_R9(%rsi), %r9 + movq SCTX_R10(%rsi), %r10 + movq SCTX_R11(%rsi), %r11 + movq SCTX_R12(%rsi), %r12 + movq SCTX_R13(%rsi), %r13 + movq SCTX_R14(%rsi), %r14 + movq SCTX_R15(%rsi), %r15 + movq SCTX_RBP(%rsi), %rbp + movq SCTX_RBX(%rsi), %rbx + movq SCTX_RCX(%rsi), %rcx + movq SCTX_RDX(%rsi), %rdx + movq SCTX_RDI(%rsi), %rdi + movq SCTX_RSI(%rsi), %rsi /* %rsi must be restored last */ VMLOAD VMRUN VMSAVE - pop %rax /* pop guest context pointer from the stack */ - - /* - * Save guest state. 
- */ - movq %r8, SCTX_R8(%rax) - movq %r9, SCTX_R9(%rax) - movq %r10, SCTX_R10(%rax) - movq %r11, SCTX_R11(%rax) - movq %r12, SCTX_R12(%rax) - movq %r13, SCTX_R13(%rax) - movq %r14, SCTX_R14(%rax) - movq %r15, SCTX_R15(%rax) - movq %rbp, SCTX_RBP(%rax) - movq %rbx, SCTX_RBX(%rax) - movq %rcx, SCTX_RCX(%rax) - movq %rdx, SCTX_RDX(%rax) - movq %rdi, SCTX_RDI(%rax) - movq %rsi, SCTX_RSI(%rax) - - /* Restore host state */ - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbx - - /* Restore %GS.base to point to the host's pcpu data */ - pop %rdx - mov %edx, %eax - shr $32, %rdx - mov $MSR_GSBASE, %ecx + /* Grab the svm_regctx pointer */ + movq SVMSTK_RSI(%rsp), %rax + + /* Save guest state. */ + movq %r8, SCTX_R8(%rax) + movq %r9, SCTX_R9(%rax) + movq %r10, SCTX_R10(%rax) + movq %r11, SCTX_R11(%rax) + movq %r12, SCTX_R12(%rax) + movq %r13, SCTX_R13(%rax) + movq %r14, SCTX_R14(%rax) + movq %r15, SCTX_R15(%rax) + movq %rbp, SCTX_RBP(%rax) + movq %rbx, SCTX_RBX(%rax) + movq %rcx, SCTX_RCX(%rax) + movq %rdx, SCTX_RDX(%rax) + movq %rdi, SCTX_RDI(%rax) + movq %rsi, SCTX_RSI(%rax) + + /* Restore callee-saved registers */ + movq SVMSTK_R15(%rsp), %r15 + movq SVMSTK_R14(%rsp), %r14 + movq SVMSTK_R13(%rsp), %r13 + movq SVMSTK_R12(%rsp), %r12 + movq SVMSTK_RBX(%rsp), %rbx + + /* Fix %gsbase to point back to the correct 'struct cpu *' */ + movq SVMSTK_RDX(%rsp), %rdx + movl %edx, %eax + shrq $32, %rdx + movl $MSR_GSBASE, %ecx wrmsr - VLEAVE + SVM_GUEST_FLUSH_SCRATCH + + addq $SVMSTKSIZE, %rsp + popq %rbp ret -END(svm_launch) +SET_SIZE(svm_launch) #endif /* lint */ diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx.c b/usr/src/uts/i86pc/io/vmm/intel/vmx.c index a723be0d28..a039455be7 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmx.c +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx.c @@ -2640,7 +2640,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) #ifdef __FreeBSD__ __asm __volatile("int $18"); #else - vmx_call_trap(T_MCE); + vmm_call_trap(T_MCE); #endif return (1); } @@ -2929,7 +2929,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) #ifdef __FreeBSD__ __asm __volatile("int $18"); #else - vmx_call_trap(T_MCE); + vmm_call_trap(T_MCE); #endif return (1); } @@ -3147,7 +3147,7 @@ vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) #ifdef __FreeBSD__ __asm __volatile("int $2"); #else - vmx_call_trap(T_NMIFLT); + vmm_call_trap(T_NMIFLT); #endif } } diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx.h b/usr/src/uts/i86pc/io/vmm/intel/vmx.h index 9766e6b749..2d16799bdd 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmx.h +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx.h @@ -164,9 +164,6 @@ CTASSERT((offsetof(struct vmx, pir_desc[0]) & 63) == 0); #define VMX_VMWRITE_ERROR 4 int vmx_enter_guest(struct vmxctx *ctx, struct vmx *vmx, int launched); void vmx_call_isr(uintptr_t entry); -#ifndef __FreeBSD__ -void vmx_call_trap(uint64_t); -#endif u_long vmx_fix_cr0(u_long cr0); u_long vmx_fix_cr4(u_long cr4); diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s b/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s index a2375e3a6c..0130f88dd6 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s @@ -381,44 +381,4 @@ ENTRY_NP(vmx_call_isr) ret SET_SIZE(vmx_call_isr) -/* - * %rdi = trapno - * - * This variant is for any explicit exception injection that we need: in this - * case, we can't just, for example, do a direct "int $2", as that will then - * trash our %cr3 via tr_nmiint due to KPTI. 
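For readers following the svm_launch() rewrite above: the SVMSTK_* offsets describe a fixed scratch area carved out with `subq $SVMSTKSIZE, %rsp`, replacing the FreeBSD version's individual pushes. Viewed as a C struct (illustrative only — the assembly never declares such a type):

```c
struct svm_launch_scratch {
	uint64_t r15;	/* SVMSTK_R15 0x00, callee-saved */
	uint64_t r14;	/* SVMSTK_R14 0x08 */
	uint64_t r13;	/* SVMSTK_R13 0x10 */
	uint64_t r12;	/* SVMSTK_R12 0x18 */
	uint64_t rbx;	/* SVMSTK_RBX 0x20 */
	uint64_t rdx;	/* SVMSTK_RDX 0x28, struct cpu * argument */
	uint64_t rsi;	/* SVMSTK_RSI 0x30, struct svm_regctx * argument */
	uint64_t rdi;	/* SVMSTK_RDI 0x38, VMCB physical address */
};	/* sizeof == 0x40 == SVMSTKSIZE; the saved %rbp sits just above it */
```

The fixed frame lets the guest-context pointer be reloaded from SVMSTK_RSI after VMSAVE without juggling pushes and pops, and SVM_GUEST_FLUSH_SCRATCH zeroes the caller-saved registers before return so stale guest values cannot feed speculative (Spectre v1) gadgets in host code.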
So we have to fake a trap frame in - * a similar fashion to vmx_call_isr(). Both NMIs and MCEs don't push an 'err' - * into the frame. - */ -ENTRY_NP(vmx_call_trap) - pushq %rbp - movq %rsp, %rbp - movq %rsp, %r11 - andq $~0xf, %rsp /* align stack */ - pushq $KDS_SEL /* %ss */ - pushq %r11 /* %rsp */ - pushfq /* %rflags */ - pushq $KCS_SEL /* %cs */ - leaq .trap_iret_dest(%rip), %rcx - pushq %rcx /* %rip */ - cli - cmpq $T_NMIFLT, %rdi - je nmiint - cmpq $T_MCE, %rdi - je mcetrap - - pushq %rdi /* save our bad trapno... */ - leaq __vmx_call_bad_trap(%rip), %rdi - xorl %eax, %eax - call panic - /*NOTREACHED*/ - -.trap_iret_dest: - popq %rbp - ret -SET_SIZE(vmx_call_trap) - -__vmx_call_bad_trap: - .string "bad trapno for vmx_call_trap()" - #endif /* lint */ diff --git a/usr/src/uts/i86pc/io/vmm/vm/vm_glue.h b/usr/src/uts/i86pc/io/vmm/vm/vm_glue.h index 7ac745f509..600872c321 100644 --- a/usr/src/uts/i86pc/io/vmm/vm/vm_glue.h +++ b/usr/src/uts/i86pc/io/vmm/vm/vm_glue.h @@ -93,6 +93,7 @@ struct vmm_pt_ops { }; extern struct vmm_pt_ops ept_ops; +extern struct vmm_pt_ops rvi_ops; #endif /* _VM_GLUE_ */ diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c index 2fa0267f72..4b759b44e9 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c @@ -1628,10 +1628,19 @@ vmm_is_supported(intptr_t arg) int r; const char *msg; - if (!vmm_is_intel()) - return (ENXIO); + if (vmm_is_intel()) { + r = vmx_x86_supported(&msg); + } else if (vmm_is_amd()) { + /* + * HMA already ensured that the features necessary for SVM + * operation were present and online during vmm_attach(). + */ + r = 0; + } else { + r = ENXIO; + msg = "Unsupported CPU vendor"; + } - r = vmx_x86_supported(&msg); if (r != 0 && arg != NULL) { if (copyoutstr(msg, (char *)arg, strlen(msg), NULL) != 0) return (EFAULT); diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c new file mode 100644 index 0000000000..d630d32630 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c @@ -0,0 +1,297 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/kmem.h> +#include <sys/machsystm.h> +#include <sys/x86_archext.h> + +#include <sys/gipt.h> +#include <vm/vm_glue.h> + + +struct rvi_map { + gipt_map_t rm_gipt; + uint64_t rm_wired_page_count; +}; +typedef struct rvi_map rvi_map_t; + +#define RVI_LOCK(m) (&(m)->rm_gipt.giptm_lock) + +#define RVI_MAX_LEVELS 4 + +CTASSERT(RVI_MAX_LEVELS <= GIPT_MAX_LEVELS); + +#define RVI_PRESENT PT_VALID +#define RVI_WRITABLE PT_WRITABLE +#define RVI_ACCESSED PT_REF +#define RVI_DIRTY PT_MOD +#define RVI_LGPG PT_PAGESIZE +#define RVI_NX PT_NX +#define RVI_USER PT_USER +#define RVI_PWT PT_WRITETHRU +#define RVI_PCD PT_NOCACHE + +#define RVI_PA_MASK PT_PADDR + +#define RVI_PAT(attr) rvi_attr_to_pat(attr) +#define RVI_PADDR(addr) ((addr) & RVI_PA_MASK) +#define RVI_PROT(prot) \ + ((((prot) & PROT_WRITE) != 0 ? RVI_WRITABLE : 0) | \ + (((prot) & PROT_EXEC) == 0 ? 
RVI_NX : 0)) + +#define RVI_IS_ABSENT(pte) (((pte) & RVI_PRESENT) == 0) +#define RVI_PTE_PFN(pte) mmu_btop(RVI_PADDR(pte)) +#define RVI_MAPS_PAGE(pte, lvl) \ + (!RVI_IS_ABSENT(pte) && (((pte) & RVI_LGPG) != 0 || (lvl) == 0)) +#define RVI_PTE_PROT(pte) \ + (RVI_IS_ABSENT(pte) ? 0 : ( \ + PROT_READ | \ + (((pte) & RVI_NX) == 0 ? PROT_EXEC : 0) | \ + (((pte) & RVI_WRITABLE) != 0 ? PROT_WRITE : 0))) + +#define RVI_PTE_ASSIGN_PAGE(lvl, pfn, prot, attr) \ + (RVI_PADDR(pfn_to_pa(pfn)) | \ + (((lvl) != 0) ? RVI_LGPG : 0) | \ + RVI_USER | RVI_ACCESSED | RVI_PRESENT | \ + RVI_PAT(attr) | \ + RVI_PROT(prot)) + +#define RVI_PTE_ASSIGN_TABLE(pfn) \ + (RVI_PADDR(pfn_to_pa(pfn)) | \ + RVI_USER | RVI_ACCESSED | RVI_PRESENT | \ + RVI_PAT(MTRR_TYPE_WB) | \ + RVI_PROT(PROT_READ | PROT_WRITE | PROT_EXEC)) + + +/* Make sure that PAT indexes line up as expected */ +CTASSERT((PAT_DEFAULT_ATTRIBUTE & 0xf) == MTRR_TYPE_WB); +CTASSERT(((PAT_DEFAULT_ATTRIBUTE >> 24) & 0xf) == MTRR_TYPE_UC); + +static inline uint64_t +rvi_attr_to_pat(const uint8_t attr) +{ + if (attr == MTRR_TYPE_UC) { + /* !PAT + PCD + PWT -> PAT3 -> MTRR_TYPE_UC */ + return (RVI_PCD|RVI_PWT); + } else if (attr == MTRR_TYPE_WB) { + /* !PAT + !PCD + !PWT -> PAT0 -> MTRR_TYPE_WB */ + return (0); + } + + panic("unexpected memattr %x", attr); + return (0); +} + +static gipt_pte_type_t +rvi_pte_type(uint64_t pte, uint_t level) +{ + if (RVI_IS_ABSENT(pte)) { + return (PTET_EMPTY); + } else if (RVI_MAPS_PAGE(pte, level)) { + return (PTET_PAGE); + } else { + return (PTET_LINK); + } +} + +static uint64_t +rvi_pte_map(uint64_t pfn) +{ + return (RVI_PTE_ASSIGN_TABLE(pfn)); +} + +static void * +rvi_create(uintptr_t *pml4_kaddr) +{ + rvi_map_t *rmap; + gipt_map_t *map; + gipt_t *root; + struct gipt_cbs cbs = { + .giptc_pte_type = rvi_pte_type, + .giptc_pte_map = rvi_pte_map, + }; + + rmap = kmem_zalloc(sizeof (*rmap), KM_SLEEP); + map = &rmap->rm_gipt; + root = gipt_alloc(); + root->gipt_level = RVI_MAX_LEVELS - 1; + gipt_map_init(map, RVI_MAX_LEVELS, GIPT_HASH_SIZE_DEFAULT, &cbs, root); + + *pml4_kaddr = (uintptr_t)root->gipt_kva; + return (rmap); +} + +static void +rvi_destroy(void *arg) +{ + rvi_map_t *rmap = arg; + + if (rmap != NULL) { + gipt_map_t *map = &rmap->rm_gipt; + + gipt_map_fini(map); + kmem_free(rmap, sizeof (*rmap)); + } +} + +static uint64_t +rvi_wired_count(void *arg) +{ + rvi_map_t *rmap = arg; + uint64_t res; + + mutex_enter(RVI_LOCK(rmap)); + res = rmap->rm_wired_page_count; + mutex_exit(RVI_LOCK(rmap)); + + return (res); +} + +static int +rvi_is_wired(void *arg, uint64_t va, uint_t *protp) +{ + rvi_map_t *rmap = arg; + gipt_t *pt; + int rv = -1; + + mutex_enter(RVI_LOCK(rmap)); + pt = gipt_map_lookup_deepest(&rmap->rm_gipt, va); + if (pt != NULL) { + const uint64_t pte = GIPT_VA2PTE(pt, va); + + if (RVI_MAPS_PAGE(pte, pt->gipt_level)) { + *protp = RVI_PTE_PROT(pte); + rv = 0; + } + } + mutex_exit(RVI_LOCK(rmap)); + + return (rv); +} + +static int +rvi_map(void *arg, uint64_t va, pfn_t pfn, uint_t lvl, uint_t prot, + uint8_t attr) +{ + rvi_map_t *rmap = arg; + gipt_map_t *map = &rmap->rm_gipt; + gipt_t *pt; + uint64_t *ptep, pte; + + ASSERT((prot & PROT_READ) != 0); + ASSERT3U((prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)), ==, 0); + ASSERT3U(lvl, <, RVI_MAX_LEVELS); + + mutex_enter(RVI_LOCK(rmap)); + pt = gipt_map_lookup(map, va, lvl); + if (pt == NULL) { + /* + * A table at the appropriate VA/level that would house this + * mapping does not currently exist. Try to walk down to that + * point, creating any necessary parent(s). 
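To make the RVI_PTE_ASSIGN_PAGE() composition above concrete, here is a small self-contained model of the leaf entry produced for a writable, non-executable, write-back 4K guest page. The numeric constants are the standard x86-64 page-table bit positions that sit behind the PT_* symbols; they are repeated only so the example stands alone.

```c
#include <stdint.h>
#include <stdio.h>

#define PTE_P	(1ULL << 0)	/* RVI_PRESENT  (PT_VALID) */
#define PTE_W	(1ULL << 1)	/* RVI_WRITABLE (PT_WRITABLE) */
#define PTE_U	(1ULL << 2)	/* RVI_USER     (PT_USER) */
#define PTE_A	(1ULL << 5)	/* RVI_ACCESSED (PT_REF) */
#define PTE_NX	(1ULL << 63)	/* RVI_NX       (PT_NX) */

int
main(void)
{
	uint64_t pfn = 0x12345;		/* example page frame */

	/*
	 * Level-0 mapping, PROT_READ|PROT_WRITE, MTRR_TYPE_WB:
	 * no large-page bit, NX set (no PROT_EXEC), PAT bits all clear.
	 */
	uint64_t pte = (pfn << 12) | PTE_U | PTE_A | PTE_P | PTE_W | PTE_NX;

	printf("leaf pte: %#llx\n", (unsigned long long)pte);
	return (0);
}
```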
+ */ + pt = gipt_map_create_parents(map, va, lvl); + + /* + * There was a large page mapping in the way of creating the + * necessary parent table(s). + */ + if (pt == NULL) { + panic("unexpected large page @ %08lx", va); + } + } + ptep = GIPT_VA2PTEP(pt, va); + + pte = *ptep; + if (!RVI_IS_ABSENT(pte)) { + if (!RVI_MAPS_PAGE(pte, lvl)) { + panic("unexpected PT link @ %08lx in %p", va, pt); + } else { + panic("unexpected page mapped @ %08lx in %p", va, pt); + } + } + + pte = RVI_PTE_ASSIGN_PAGE(lvl, pfn, prot, attr); + *ptep = pte; + pt->gipt_valid_cnt++; + rmap->rm_wired_page_count += gipt_level_count[lvl]; + + mutex_exit(RVI_LOCK(rmap)); + return (0); +} + +static uint64_t +rvi_unmap(void *arg, uint64_t va, uint64_t end_va) +{ + rvi_map_t *rmap = arg; + gipt_map_t *map = &rmap->rm_gipt; + gipt_t *pt; + uint64_t cur_va = va; + uint64_t unmapped = 0; + + mutex_enter(RVI_LOCK(rmap)); + + pt = gipt_map_lookup_deepest(map, cur_va); + if (pt == NULL) { + mutex_exit(RVI_LOCK(rmap)); + return (0); + } + if (!RVI_MAPS_PAGE(GIPT_VA2PTE(pt, cur_va), pt->gipt_level)) { + cur_va = gipt_map_next_page(map, cur_va, end_va, &pt); + if (cur_va == 0) { + mutex_exit(RVI_LOCK(rmap)); + return (0); + } + } + + while (cur_va < end_va) { + uint64_t *ptep = GIPT_VA2PTEP(pt, cur_va); + const uint_t lvl = pt->gipt_level; + + ASSERT(RVI_MAPS_PAGE(*ptep, lvl)); + *ptep = 0; + pt->gipt_valid_cnt--; + unmapped += gipt_level_count[pt->gipt_level]; + + gipt_t *next_pt = pt; + uint64_t next_va; + next_va = gipt_map_next_page(map, cur_va, end_va, &next_pt); + + if (pt->gipt_valid_cnt == 0) { + gipt_map_clean_parents(map, pt); + } + if (next_va == 0) { + break; + } + pt = next_pt; + cur_va = next_va; + } + rmap->rm_wired_page_count -= unmapped; + + mutex_exit(RVI_LOCK(rmap)); + + return (unmapped); +} + +struct vmm_pt_ops rvi_ops = { + .vpo_init = rvi_create, + .vpo_free = rvi_destroy, + .vpo_wired_cnt = rvi_wired_count, + .vpo_is_wired = rvi_is_wired, + .vpo_map = rvi_map, + .vpo_unmap = rvi_unmap, +}; diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c index 8d5051144c..58a62586a1 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c @@ -24,6 +24,7 @@ #include <sys/machsystm.h> #include <sys/vmsystm.h> #include <sys/malloc.h> +#include <sys/x86_archext.h> #include <vm/as.h> #include <vm/seg_vn.h> #include <vm/seg_kmem.h> @@ -208,8 +209,17 @@ pmap_pinit_type(pmap_t pmap, enum pmap_type type, int flags) pmap->pm_pml4 = pml4; return (1); } - case PT_RVI: - /* RVI support not yet implemented */ + case PT_RVI: { + struct vmm_pt_ops *ops = &rvi_ops; + void *pml4, *pmi; + + pmi = ops->vpo_init((uintptr_t *)&pml4); + + pmap->pm_ops = ops; + pmap->pm_impl = pmi; + pmap->pm_pml4 = pml4; + return (1); + } default: panic("unsupported pmap type: %x", type); break; @@ -537,6 +547,8 @@ vm_object_deallocate(vm_object_t vmo) kmem_free(vmo, sizeof (*vmo)); } +CTASSERT(VM_MEMATTR_UNCACHEABLE == MTRR_TYPE_UC); +CTASSERT(VM_MEMATTR_WRITE_BACK == MTRR_TYPE_WB); int vm_object_set_memattr(vm_object_t vmo, vm_memattr_t attr) { diff --git a/usr/src/uts/i86pc/io/vmm/vmm_support.s b/usr/src/uts/i86pc/io/vmm/vmm_support.s new file mode 100644 index 0000000000..5777d46959 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_support.s @@ -0,0 +1,54 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. 
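The rvi_ops table above plugs into the same vmm_pt_ops interface that the EPT backend already uses; pmap_pinit_type() only exercises vpo_init, so a hedged sketch of the full lifecycle may help (illumos bhyve actually reaches vpo_map/vpo_unmap through the pmap_* wrappers in vmm_sol_vm.c, and the local names here are illustrative):

```c
#include <vm/vm_glue.h>

void
rvi_ops_example(pfn_t pfn)
{
	struct vmm_pt_ops *ops = &rvi_ops;
	uintptr_t pml4_kva;	/* root PML4 kva, for programming nCR3 */
	void *pti;
	uint_t prot;

	pti = ops->vpo_init(&pml4_kva);		/* allocate root table */

	/* Map one 4K page (level 0) at guest-physical 1 MiB, RW, write-back */
	(void) ops->vpo_map(pti, 0x100000, pfn, 0,
	    PROT_READ | PROT_WRITE, MTRR_TYPE_WB);

	if (ops->vpo_is_wired(pti, 0x100000, &prot) == 0) {
		/* prot now reports PROT_READ | PROT_WRITE (NX set, no exec) */
		ASSERT((prot & PROT_WRITE) != 0);
	}
	/* vpo_wired_cnt(pti) reflects the pages accounted by vpo_map() */

	/* Unmap the same range and tear the table down */
	(void) ops->vpo_unmap(pti, 0x100000, 0x101000);
	ops->vpo_free(pti);
}
```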
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2019 Joyent, Inc. + */ + +#include <sys/asm_linkage.h> +#include <sys/segments.h> + +/* + * %rdi = trapno + * + * This variant is for any explicit exception injection that we need: in this + * case, we can't just, for example, do a direct "int $2", as that will then + * trash our %cr3 via tr_nmiint due to KPTI, so we have to fake a trap frame. + * Both NMIs and MCEs don't push an 'err' into the frame. + */ +ENTRY_NP(vmm_call_trap) + pushq %rbp + movq %rsp, %rbp + movq %rsp, %r11 + andq $~0xf, %rsp /* align stack */ + pushq $KDS_SEL /* %ss */ + pushq %r11 /* %rsp */ + pushfq /* %rflags */ + pushq $KCS_SEL /* %cs */ + leaq .trap_iret_dest(%rip), %rcx + pushq %rcx /* %rip */ + cli + cmpq $T_NMIFLT, %rdi + je nmiint + cmpq $T_MCE, %rdi + je mcetrap + + pushq %rdi /* save our bad trapno... */ + leaq __vmm_call_bad_trap(%rip), %rdi + xorl %eax, %eax + call panic + /*NOTREACHED*/ + +.trap_iret_dest: + popq %rbp + ret +SET_SIZE(vmm_call_trap) + +__vmm_call_bad_trap: + .string "bad trapno for vmm_call_trap()" diff --git a/usr/src/uts/i86pc/io/vmm/x86.c b/usr/src/uts/i86pc/io/vmm/x86.c index b02142e7e5..b126e96f2c 100644 --- a/usr/src/uts/i86pc/io/vmm/x86.c +++ b/usr/src/uts/i86pc/io/vmm/x86.c @@ -198,6 +198,18 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id, /* Hide mwaitx/monitorx capability from the guest */ regs[2] &= ~AMDID2_MWAITX; +#ifndef __FreeBSD__ + /* + * Detection routines for TCE and FFXSR are missing + * from our vm_cpuid_capability() detection logic + * today. Mask them out until that is remedied. + * They do not appear to be in common usage, so their + * absence should not cause undue trouble. + */ + regs[2] &= ~AMDID2_TCE; + regs[3] &= ~AMDID_FFXSR; +#endif + /* * Hide rdtscp/ia32_tsc_aux until we know how * to deal with them. diff --git a/usr/src/uts/i86pc/os/hma.c b/usr/src/uts/i86pc/os/hma.c index 34ae85086d..ecaf9c17cb 100644 --- a/usr/src/uts/i86pc/os/hma.c +++ b/usr/src/uts/i86pc/os/hma.c @@ -37,23 +37,33 @@ static boolean_t hma_vmx_ready = B_FALSE; static const char *hma_vmx_error = NULL; static id_space_t *hma_vmx_vpid; -typedef enum vmx_cpu_state { - VCS_UNINITIALIZED = 0, - VCS_READY, - VCS_ERROR -} vmx_cpu_state_t; - /* - * The bulk of VMX-related HMA state is protected by cpu_lock, rather than a + * The bulk of HMA state (VMX & SVM) is protected by cpu_lock, rather than a * mutex specific to the module. It (cpu_lock) is already required for the * state needed to perform setup on all CPUs, so it was a natural fit to * protect this data too. 
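The vmm_call_trap() routine above is the former vmx_call_trap(), hoisted out of vmx_support.s into the shared vmm_support.s so both the VMX and SVM exit paths can reflect NMIs and machine checks to the host. The hand-built frame mimics exactly what the CPU pushes for an interrupt taken at CPL 0 (neither #NMI nor #MC supplies an error code). Viewed as a C struct, lowest address first — illustrative only, no such structure exists in the source:

```c
struct fake_intr_frame {
	uint64_t rip;		/* pushed last: .trap_iret_dest */
	uint64_t cs;		/* KCS_SEL */
	uint64_t rflags;	/* saved by pushfq */
	uint64_t rsp;		/* pre-alignment %rsp copy (via %r11) */
	uint64_t ss;		/* KDS_SEL, pushed first */
};
```

The indirection exists because a literal `int $2` or `int $18` would bounce through the KPTI trampolines (tr_nmiint and friends) and swap %cr3 underneath the VMM.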
*/ +typedef enum hma_cpu_state { + HCS_UNINITIALIZED = 0, + HCS_READY, + HCS_ERROR +} hma_cpu_state_t; +static hma_cpu_state_t hma_cpu_status[NCPU]; + static void *hma_vmx_vmxon_page[NCPU]; static uintptr_t hma_vmx_vmxon_pa[NCPU]; -static vmx_cpu_state_t hma_vmx_status[NCPU]; static uint32_t hma_vmx_revision; +static boolean_t hma_svm_ready = B_FALSE; +static const char *hma_svm_error = NULL; +static uint32_t hma_svm_features; +static uint32_t hma_svm_max_asid; + +static void *hma_svm_hsave_page[NCPU]; +static uintptr_t hma_svm_hsave_pa[NCPU]; + +static hma_svm_asid_t hma_svm_cpu_asid[NCPU]; + static int hma_vmx_init(void); static int hma_svm_init(void); @@ -94,8 +104,7 @@ hma_register(const char *name) is_ready = hma_vmx_ready; break; case X86_VENDOR_AMD: - /* Punt on SVM support for now */ - is_ready = B_FALSE; + is_ready = hma_svm_ready; break; default: is_ready = B_FALSE; @@ -156,9 +165,9 @@ hma_vmx_vpid_free(uint16_t vpid) extern int hma_vmx_vmxon(uintptr_t); -/* ARGSUSED */ static int -hma_vmx_cpu_vmxon(xc_arg_t arg1, xc_arg_t arg2, xc_arg_t arg3) +hma_vmx_cpu_vmxon(xc_arg_t arg1 __unused, xc_arg_t arg2 __unused, + xc_arg_t arg3 __unused) { uint64_t fctrl; processorid_t id = CPU->cpu_seqid; @@ -181,9 +190,9 @@ hma_vmx_cpu_vmxon(xc_arg_t arg1, xc_arg_t arg2, xc_arg_t arg3) setcr4(getcr4() | CR4_VMXE); if (hma_vmx_vmxon(vmxon_pa) == 0) { - hma_vmx_status[id] = VCS_READY; + hma_cpu_status[id] = HCS_READY; } else { - hma_vmx_status[id] = VCS_ERROR; + hma_cpu_status[id] = HCS_ERROR; /* * If VMX has already been marked active and available for the @@ -198,9 +207,8 @@ hma_vmx_cpu_vmxon(xc_arg_t arg1, xc_arg_t arg2, xc_arg_t arg3) return (0); } -/* ARGSUSED2 */ static int -hma_vmx_cpu_setup(cpu_setup_t what, int id, void *arg) +hma_vmx_cpu_setup(cpu_setup_t what, int id, void *arg __unused) { ASSERT(MUTEX_HELD(&cpu_lock)); ASSERT(id >= 0 && id < NCPU); @@ -223,8 +231,8 @@ hma_vmx_cpu_setup(cpu_setup_t what, int id, void *arg) } /* Perform initialization if it has not been previously attempted. */ - if (hma_vmx_status[id] != VCS_UNINITIALIZED) { - return ((hma_vmx_status[id] == VCS_READY) ? 0 : -1); + if (hma_cpu_status[id] != HCS_UNINITIALIZED) { + return ((hma_cpu_status[id] == HCS_READY) ? 0 : -1); } /* Allocate the VMXON page for this CPU */ @@ -265,7 +273,7 @@ hma_vmx_cpu_setup(cpu_setup_t what, int id, void *arg) xc_sync(0, 0, 0, CPUSET2BV(set), hma_vmx_cpu_vmxon); } - return (hma_vmx_status[id] != VCS_READY); + return (hma_cpu_status[id] != HCS_READY); } static int @@ -329,10 +337,233 @@ bail: return (-1); } +#define VMCB_FLUSH_NOTHING 0x0 +#define VMCB_FLUSH_ALL 0x1 +#define VMCB_FLUSH_ASID 0x3 + +void +hma_svm_asid_init(hma_svm_asid_t *vcp) +{ + /* + * Initialize the generation to 0, forcing an ASID allocation on first + * entry. Leave the ASID at 0, so if the host forgoes the call to + * hma_svm_asid_update(), SVM will bail on the invalid vcpu state. + */ + vcp->hsa_gen = 0; + vcp->hsa_asid = 0; +} + +uint8_t +hma_svm_asid_update(hma_svm_asid_t *vcp, boolean_t flush_by_asid, + boolean_t npt_flush) +{ + hma_svm_asid_t *hcp = &hma_svm_cpu_asid[CPU->cpu_seqid]; + + ASSERT(curthread->t_preempt != 0); + + /* + * If NPT changes dictate a TLB flush and by-ASID flushing is not + * supported/used, force a fresh ASID allocation. 
+ */ + if (npt_flush && !flush_by_asid) { + vcp->hsa_gen = 0; + } + + if (vcp->hsa_gen != hcp->hsa_gen) { + hcp->hsa_asid++; + + if (hcp->hsa_asid >= hma_svm_max_asid) { + /* Keep the ASID properly constrained */ + hcp->hsa_asid = 1; + hcp->hsa_gen++; + if (hcp->hsa_gen == 0) { + /* + * Stay clear of the '0' sentinel value for + * generation, if wrapping around. + */ + hcp->hsa_gen = 1; + } + } + vcp->hsa_gen = hcp->hsa_gen; + vcp->hsa_asid = hcp->hsa_asid; + + ASSERT(vcp->hsa_asid != 0); + ASSERT3U(vcp->hsa_asid, <, hma_svm_max_asid); + + if (flush_by_asid) { + return (VMCB_FLUSH_ASID); + } + return (VMCB_FLUSH_ALL); + } else if (npt_flush) { + ASSERT(flush_by_asid); + return (VMCB_FLUSH_ASID); + } + return (VMCB_FLUSH_NOTHING); +} + +static int +hma_svm_cpu_activate(xc_arg_t arg1 __unused, xc_arg_t arg2 __unused, + xc_arg_t arg3 __unused) +{ + const processorid_t id = CPU->cpu_seqid; + const uintptr_t hsave_pa = hma_svm_hsave_pa[id]; + uint64_t efer; + + VERIFY(hsave_pa != 0); + + /* Enable SVM via EFER */ + efer = rdmsr(MSR_AMD_EFER); + efer |= AMD_EFER_SVME; + wrmsr(MSR_AMD_EFER, efer); + + /* Setup hsave area */ + wrmsr(MSR_AMD_VM_HSAVE_PA, hsave_pa); + + hma_cpu_status[id] = HCS_READY; + return (0); +} + +static int +hma_svm_cpu_setup(cpu_setup_t what, int id, void *arg __unused) +{ + ASSERT(MUTEX_HELD(&cpu_lock)); + ASSERT(id >= 0 && id < NCPU); + + switch (what) { + case CPU_CONFIG: + case CPU_ON: + case CPU_INIT: + break; + default: + /* + * Other events, such as CPU offlining, are of no interest. + * Letting the SVM state linger should not cause any harm. + * + * This logic assumes that any offlining activity is strictly + * administrative in nature and will not alter any existing + * configuration (such as EFER bits previously set). + */ + return (0); + } + + /* Perform initialization if it has not been previously attempted. */ + if (hma_cpu_status[id] != HCS_UNINITIALIZED) { + return ((hma_cpu_status[id] == HCS_READY) ? 0 : -1); + } + + /* Allocate the hsave page for this CPU */ + if (hma_svm_hsave_page[id] == NULL) { + caddr_t va; + pfn_t pfn; + + va = kmem_alloc(PAGESIZE, KM_SLEEP); + VERIFY0((uintptr_t)va & PAGEOFFSET); + hma_svm_hsave_page[id] = va; + + /* + * Cache the physical address of the hsave page rather than + * looking it up later when the potential blocking of + * hat_getpfnum would be less acceptable. 
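To make the generation scheme in hma_svm_asid_update() above concrete, the following self-contained simulation models two vCPUs sharing one host CPU's ASID counter. It is a user-space re-implementation of the logic for illustration only, with a tiny ASID limit to force rollover; it does not call the kernel code.

```c
#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

/* Return codes mirroring VMCB_FLUSH_* in hma.c */
enum { FLUSH_NOTHING = 0x0, FLUSH_ALL = 0x1, FLUSH_ASID = 0x3 };

typedef struct { uint64_t gen; uint32_t asid; } asid_t;

static asid_t cpu_asid = { .gen = 1, .asid = 0 };	/* per host CPU */
static const uint32_t max_asid = 8;			/* tiny, to force rollover */

static int
asid_update(asid_t *vcpu, bool flush_by_asid, bool npt_flush)
{
	if (npt_flush && !flush_by_asid)
		vcpu->gen = 0;			/* force a fresh allocation */

	if (vcpu->gen != cpu_asid.gen) {
		if (++cpu_asid.asid >= max_asid) {
			cpu_asid.asid = 1;	/* skip ASID 0 (host) */
			if (++cpu_asid.gen == 0)
				cpu_asid.gen = 1;
		}
		*vcpu = cpu_asid;
		return (flush_by_asid ? FLUSH_ASID : FLUSH_ALL);
	} else if (npt_flush) {
		return (FLUSH_ASID);		/* same ASID, flush its entries */
	}
	return (FLUSH_NOTHING);
}

int
main(void)
{
	asid_t a = { 0, 0 }, b = { 0, 0 };	/* hma_svm_asid_init() equivalent */

	printf("vcpu A first run: flush=%d asid=%u\n", asid_update(&a, true, false), a.asid);
	printf("vcpu B first run: flush=%d asid=%u\n", asid_update(&b, true, false), b.asid);
	printf("vcpu A again:     flush=%d asid=%u\n", asid_update(&a, true, false), a.asid);
	printf("vcpu A NPT flush: flush=%d asid=%u\n", asid_update(&a, true, true), a.asid);
	return (0);
}
```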
+ */ + pfn = hat_getpfnum(kas.a_hat, va); + hma_svm_hsave_pa[id] = (pfn << PAGESHIFT); + } else { + VERIFY(hma_svm_hsave_pa[id] != 0); + } + + kpreempt_disable(); + if (CPU->cpu_seqid == id) { + /* Perform svm setup directly if this CPU is the target */ + (void) hma_svm_cpu_activate(0, 0, 0); + kpreempt_enable(); + } else { + cpuset_t set; + + /* Use a cross-call if a remote CPU is the target */ + kpreempt_enable(); + cpuset_zero(&set); + cpuset_add(&set, id); + xc_sync(0, 0, 0, CPUSET2BV(set), hma_svm_cpu_activate); + } + + return (hma_cpu_status[id] != HCS_READY); +} static int hma_svm_init(void) { - /* punt on AMD for now */ - return (ENOTSUP); + uint64_t msr; + const char *msg = NULL; + struct cpuid_regs regs; + cpu_t *cp; + + if (!is_x86_feature(x86_featureset, X86FSET_SVM)) { + msg = "CPU does not support SVM"; + goto bail; + } + + msr = rdmsr(MSR_AMD_VM_CR); + if ((msr & AMD_VM_CR_SVMDIS) != 0) { + msg = "SVM disabled by BIOS"; + goto bail; + } + + regs.cp_eax = 0x8000000a; + (void) cpuid_insn(NULL, ®s); + const uint32_t nasid = regs.cp_ebx; + const uint32_t feat = regs.cp_edx; + + if (nasid == 0) { + msg = "Not enough ASIDs for guests"; + goto bail; + } + if ((feat & CPUID_AMD_EDX_NESTED_PAGING) == 0) { + msg = "CPU does not support nested paging"; + goto bail; + } + if ((feat & CPUID_AMD_EDX_NRIPS) == 0) { + msg = "CPU does not support NRIP save"; + goto bail; + } + + hma_svm_features = feat; + hma_svm_max_asid = nasid; + + mutex_enter(&cpu_lock); + /* Perform SVM configuration for already-online CPUs. */ + cp = cpu_active; + do { + int err = hma_svm_cpu_setup(CPU_ON, cp->cpu_seqid, NULL); + if (err != 0) { + msg = "failure during SVM setup"; + mutex_exit(&cpu_lock); + goto bail; + } + } while ((cp = cp->cpu_next_onln) != cpu_active); + + /* + * Register callback for later-onlined CPUs and perform other remaining + * resource allocation. + */ + register_cpu_setup_func(hma_svm_cpu_setup, NULL); + mutex_exit(&cpu_lock); + + /* Initialize per-CPU ASID state. */ + for (uint_t i = 0; i < NCPU; i++) { + /* + * Skip past sentinel 0 value for generation. Doing so for + * ASID is unneeded, since it will be incremented during the + * first allocation. + */ + hma_svm_cpu_asid[i].hsa_gen = 1; + hma_svm_cpu_asid[i].hsa_asid = 0; + } + + hma_svm_ready = B_TRUE; + return (0); + +bail: + hma_svm_error = msg; + cmn_err(CE_NOTE, "hma_svm_init: %s", msg); + return (-1); } diff --git a/usr/src/uts/i86pc/sys/hma.h b/usr/src/uts/i86pc/sys/hma.h index 0c6161fdfc..86099b79e1 100644 --- a/usr/src/uts/i86pc/sys/hma.h +++ b/usr/src/uts/i86pc/sys/hma.h @@ -49,6 +49,15 @@ extern void hma_unregister(hma_reg_t *); extern uint16_t hma_vmx_vpid_alloc(void); extern void hma_vmx_vpid_free(uint16_t); +struct hma_svm_asid { + uint64_t hsa_gen; + uint32_t hsa_asid; +}; +typedef struct hma_svm_asid hma_svm_asid_t; + +extern void hma_svm_asid_init(hma_svm_asid_t *); +extern uint8_t hma_svm_asid_update(hma_svm_asid_t *, boolean_t, boolean_t); + /* * FPU related management. These functions provide a set of APIs to manage the * FPU state and switch between host and guest management of this state. diff --git a/usr/src/uts/i86pc/sys/vmm.h b/usr/src/uts/i86pc/sys/vmm.h index 163c0781cf..e5e5460211 100644 --- a/usr/src/uts/i86pc/sys/vmm.h +++ b/usr/src/uts/i86pc/sys/vmm.h @@ -38,7 +38,7 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2015 Pluribus Networks Inc. - * Copyright 2018 Joyent, Inc. + * Copyright 2019 Joyent, Inc. 
*/ #ifndef _VMM_H_ @@ -741,6 +741,8 @@ void vmm_sol_glue_cleanup(void); int vmm_mod_load(void); int vmm_mod_unload(void); +void vmm_call_trap(uint64_t); + /* * Because of tangled headers, these are mirrored by vmm_drv.h to present the * interface to driver consumers. diff --git a/usr/src/uts/intel/sys/controlregs.h b/usr/src/uts/intel/sys/controlregs.h index babf036e0b..0be7b3b650 100644 --- a/usr/src/uts/intel/sys/controlregs.h +++ b/usr/src/uts/intel/sys/controlregs.h @@ -200,6 +200,18 @@ extern "C" { #define MSR_AMD_KGSBASE 0xc0000102 /* swapgs swaps this with gsbase */ #define MSR_AMD_TSCAUX 0xc0000103 /* %ecx value on rdtscp insn */ + +/* AMD's SVM MSRs */ + +#define MSR_AMD_VM_CR 0xc0010114 /* SVM global control */ +#define MSR_AMD_VM_HSAVE_PA 0xc0010117 /* SVM host save area address */ + +#define AMD_VM_CR_DPD (1 << 0) +#define AMD_VM_CR_R_INIT (1 << 1) +#define AMD_VM_CR_DIS_A20M (1 << 2) +#define AMD_VM_CR_LOCK (1 << 3) +#define AMD_VM_CR_SVMDIS (1 << 4) + /* AMD's configuration MSRs, weakly documented in the revision guide */ #define MSR_AMD_DC_CFG 0xc0011022 diff --git a/usr/src/uts/intel/sys/x86_archext.h b/usr/src/uts/intel/sys/x86_archext.h index 943bdd8203..59a974dfd2 100644 --- a/usr/src/uts/intel/sys/x86_archext.h +++ b/usr/src/uts/intel/sys/x86_archext.h @@ -210,6 +210,18 @@ extern "C" { #define CPUID_AMD_EBX_SSB_NO 0x004000000 /* AMD: SSB Fixed */ /* + * AMD SVM features (extended function 0x8000000A). + */ +#define CPUID_AMD_EDX_NESTED_PAGING 0x000000001 /* AMD: SVM NP */ +#define CPUID_AMD_EDX_LBR_VIRT 0x000000002 /* AMD: LBR virt. */ +#define CPUID_AMD_EDX_SVML 0x000000004 /* AMD: SVM lock */ +#define CPUID_AMD_EDX_NRIPS 0x000000008 /* AMD: NRIP save */ +#define CPUID_AMD_EDX_TSC_RATE_MSR 0x000000010 /* AMD: MSR TSC ctrl */ +#define CPUID_AMD_EDX_VMCB_CLEAN 0x000000020 /* AMD: VMCB clean bits */ +#define CPUID_AMD_EDX_FLUSH_ASID 0x000000040 /* AMD: flush by ASID */ +#define CPUID_AMD_EDX_DECODE_ASSISTS 0x000000080 /* AMD: decode assists */ + +/* * Intel now seems to have claimed part of the "extended" function * space that we previously for non-Intel implementors to use. * More excitingly still, they've claimed bit 20 to mean LAHF/SAHF |
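As a closing aside, the new CPUID_AMD_EDX_* bits above (CPUID extended function 0x8000000A) can also be inspected from user space; a small, hedged example using GCC/Clang's <cpuid.h>, mirroring the checks hma_svm_init() performs in the kernel. The bit values and field meanings match the definitions added in x86_archext.h.

```c
#include <cpuid.h>
#include <stdio.h>

#define SVM_FEAT_NP		0x00000001	/* nested paging */
#define SVM_FEAT_NRIPS		0x00000008	/* NRIP save on #VMEXIT */
#define SVM_FEAT_FLUSH_ASID	0x00000040	/* TLB flush by ASID */
#define SVM_FEAT_DECODE		0x00000080	/* decode assists */

int
main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(0x8000000a, &eax, &ebx, &ecx, &edx)) {
		puts("CPUID leaf 0x8000000A not available (no SVM)");
		return (1);
	}
	printf("SVM revision:   %u\n", eax & 0xff);
	printf("ASIDs:          %u\n", ebx);
	printf("nested paging:  %s\n", (edx & SVM_FEAT_NP) ? "yes" : "no");
	printf("NRIP save:      %s\n", (edx & SVM_FEAT_NRIPS) ? "yes" : "no");
	printf("flush-by-ASID:  %s\n", (edx & SVM_FEAT_FLUSH_ASID) ? "yes" : "no");
	printf("decode assists: %s\n", (edx & SVM_FEAT_DECODE) ? "yes" : "no");
	return (0);
}
```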