author     Patrick Mooney <pmooney@pfmooney.com>  2018-12-14 23:15:07 +0000
committer  Patrick Mooney <pmooney@pfmooney.com>  2019-03-15 15:01:23 +0000
commit     2453029c010976e95241a5f5244e86d44dc6194c (patch)
tree       ee57e43a87769a4489b772635f14c4e153205ac5 /usr
parent     7afbf32617941421fcf18c3c37cbce5f7d30a8fc (diff)
download   illumos-joyent-2453029c010976e95241a5f5244e86d44dc6194c.tar.gz
OS-7170 bhyve should support AMD
Reviewed by: John Levon <john.levon@joyent.com>
Reviewed by: Hans Rosenfeld <hans.rosenfeld@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Diffstat (limited to 'usr')
-rw-r--r--  usr/src/uts/i86pc/Makefile.files              |   2
-rw-r--r--  usr/src/uts/i86pc/Makefile.rules              |   3
-rw-r--r--  usr/src/uts/i86pc/io/vmm/amd/svm.c            | 122
-rw-r--r--  usr/src/uts/i86pc/io/vmm/amd/svm_msr.c        |  26
-rw-r--r--  usr/src/uts/i86pc/io/vmm/amd/svm_softc.h      |  15
-rw-r--r--  usr/src/uts/i86pc/io/vmm/amd/svm_support.s    | 188
-rw-r--r--  usr/src/uts/i86pc/io/vmm/intel/vmx.c          |   6
-rw-r--r--  usr/src/uts/i86pc/io/vmm/intel/vmx.h          |   3
-rw-r--r--  usr/src/uts/i86pc/io/vmm/intel/vmx_support.s  |  40
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vm/vm_glue.h         |   1
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c        |  15
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c        | 297
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c         |  16
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vmm_support.s        |  54
-rw-r--r--  usr/src/uts/i86pc/io/vmm/x86.c                |  12
-rw-r--r--  usr/src/uts/i86pc/os/hma.c                    | 273
-rw-r--r--  usr/src/uts/i86pc/sys/hma.h                   |   9
-rw-r--r--  usr/src/uts/i86pc/sys/vmm.h                   |   4
-rw-r--r--  usr/src/uts/intel/sys/controlregs.h           |  12
-rw-r--r--  usr/src/uts/intel/sys/x86_archext.h           |  12

20 files changed, 942 insertions(+), 168 deletions(-)
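The diff below splits the AMD work into two layers: the platform HMA code (usr/src/uts/i86pc/os/hma.c) enables SVM on every online CPU and hands out ASIDs, while bhyve's AMD-specific code consumes those services through the new sys/hma.h interfaces. As a rough, hedged sketch of the consumer side — only the hma_*() calls and types come from this commit; the surrounding names are illustrative:

```c
#include <sys/types.h>
#include <sys/hma.h>

static hma_reg_t *example_reg;

int
example_attach(void)
{
	/*
	 * hma_register() now succeeds on AMD hosts once hma_svm_init()
	 * has enabled SVM (EFER.SVME + VM_HSAVE_PA) on all online CPUs.
	 */
	example_reg = hma_register("example-consumer");
	return (example_reg == NULL ? -1 : 0);
}

void
example_detach(void)
{
	hma_unregister(example_reg);
}
```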
diff --git a/usr/src/uts/i86pc/Makefile.files b/usr/src/uts/i86pc/Makefile.files index 8eadfcb56f..3979ddaef7 100644 --- a/usr/src/uts/i86pc/Makefile.files +++ b/usr/src/uts/i86pc/Makefile.files @@ -279,6 +279,8 @@ VMM_OBJS += vmm.o \ vmm_sol_vm.o \ vmm_sol_glue.o \ vmm_sol_ept.o \ + vmm_sol_rvi.o \ + vmm_support.o \ vmm_zsd.o VIONA_OBJS += viona.o diff --git a/usr/src/uts/i86pc/Makefile.rules b/usr/src/uts/i86pc/Makefile.rules index 3ea69978ce..0e3ea556ea 100644 --- a/usr/src/uts/i86pc/Makefile.rules +++ b/usr/src/uts/i86pc/Makefile.rules @@ -232,6 +232,9 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/vmm/io/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/vmm/%.s + $(COMPILE.s) -o $@ $< + $(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/vmm/intel/%.s $(COMPILE.s) -o $@ $< diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm.c b/usr/src/uts/i86pc/io/vmm/amd/svm.c index e921383d22..ca9ed9e4e1 100644 --- a/usr/src/uts/i86pc/io/vmm/amd/svm.c +++ b/usr/src/uts/i86pc/io/vmm/amd/svm.c @@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$"); #ifndef __FreeBSD__ #include <sys/x86_archext.h> +#include <sys/trap.h> #endif #include <vm/vm.h> @@ -123,6 +124,7 @@ static int disable_npf_assist; SYSCTL_INT(_hw_vmm_svm, OID_AUTO, disable_npf_assist, CTLFLAG_RWTUN, &disable_npf_assist, 0, NULL); +#ifdef __FreeBSD__ /* Maximum ASIDs supported by the processor */ static uint32_t nasid; SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, num_asids, CTLFLAG_RDTUN, &nasid, 0, @@ -135,6 +137,7 @@ static struct asid asid[MAXCPU]; * SVM host state saved area of size 4KB for each core. */ static uint8_t hsave[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); +#endif /* __FreeBSD__ */ static VMM_STAT_AMD(VCPU_EXITINTINFO, "VM exits during event delivery"); static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "Events pending at VM entry"); @@ -156,6 +159,7 @@ decode_assist(void) return (svm_feature & AMD_CPUID_SVM_DECODE_ASSIST); } +#ifdef __FreeBSD__ static void svm_disable(void *arg __unused) { @@ -298,6 +302,31 @@ svm_restore(void) svm_enable(NULL); } +#else /* __FreeBSD__ */ +static int +svm_cleanup(void) +{ + /* This is taken care of by the hma registration */ + return (0); +} + +static int +svm_init(int ipinum) +{ + vmcb_clean &= VMCB_CACHE_DEFAULT; + + svm_msr_init(); + svm_npt_init(ipinum); + + return (0); +} + +static void +svm_restore(void) +{ + /* No-op on illumos */ +} +#endif /* __FreeBSD__ */ /* Pentium compatible MSRs */ #define MSR_PENTIUM_START 0 @@ -1309,7 +1338,11 @@ svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) struct svm_regctx *ctx; uint64_t code, info1, info2, val; uint32_t eax, ecx, edx; +#ifdef __FreeBSD__ int error, errcode_valid, handled, idtvec, reflect; +#else + int error, errcode_valid = 0, handled, idtvec, reflect; +#endif bool retu; ctx = svm_get_guest_regctx(svm_sc, vcpu); @@ -1380,8 +1413,11 @@ svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) */ reflect = 0; VCPU_CTR0(svm_sc->vm, vcpu, "Vectoring to MCE handler"); - /* XXXJOY: we will need equivalent of vmx_call_trap */ +#ifdef __FreeBSD__ __asm __volatile("int $18"); +#else + vmm_call_trap(T_MCE); +#endif break; case IDT_PF: error = svm_setreg(svm_sc, vcpu, VM_REG_GUEST_CR2, @@ -1772,11 +1808,14 @@ restore_host_tss(void) tss_sd->sd_type = SDT_SYSTSS; ltr(GSEL(GPROC0_SEL, SEL_KPL)); #else - /* XXXJOY: Add logic to restore TSS for us */ - panic("SVM Restore system TSS"); + system_desc_t *tss = (system_desc_t *)&CPU->cpu_gdt[GDT_KTSS]; + + tss->ssd_type = SDT_SYSTSS; + wr_tsr(KTSS_SEL); #endif } +#ifdef __FreeBSD__ static void 
check_asid(struct svm_softc *sc, int vcpuid, pmap_t pmap, u_int thiscpu) { @@ -1879,6 +1918,27 @@ check_asid(struct svm_softc *sc, int vcpuid, pmap_t pmap, u_int thiscpu) KASSERT(ctrl->asid == vcpustate->asid.num, ("ASID mismatch: %u/%u", ctrl->asid, vcpustate->asid.num)); } +#else /* __FreeBSD__ */ +static void +check_asid(struct svm_softc *sc, int vcpuid, pmap_t pmap, u_int thiscpu) +{ + struct svm_vcpu *vcpustate = svm_get_vcpu(sc, vcpuid); + struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpuid); + long eptgen; + uint8_t flush; + + eptgen = pmap->pm_eptgen; + flush = hma_svm_asid_update(&vcpustate->hma_asid, flush_by_asid(), + vcpustate->eptgen == eptgen); + + if (flush != VMCB_TLB_FLUSH_NOTHING) { + ctrl->asid = vcpustate->hma_asid.hsa_asid; + svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID); + } + ctrl->tlb_ctrl = flush; + vcpustate->eptgen = eptgen; +} +#endif /* __FreeBSD__ */ static __inline void disable_gintr(void) @@ -1983,7 +2043,11 @@ svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap, /* * Force new ASID allocation by invalidating the generation. */ +#ifdef __FreeBSD__ vcpustate->asid.gen = 0; +#else + vcpustate->hma_asid.hsa_gen = 0; +#endif /* * Invalidate the VMCB state cache by marking all fields dirty. @@ -2006,10 +2070,25 @@ svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap, svm_msr_guest_enter(svm_sc, vcpu); +#ifndef __FreeBSD__ + VERIFY(!vcpustate->loaded && curthread->t_preempt != 0); + vcpustate->loaded = B_TRUE; +#endif + /* Update Guest RIP */ state->rip = rip; do { +#ifndef __FreeBSD__ + /* + * Interrupt injection may involve mutex contention which, on + * illumos bhyve, are blocking/non-spin. Doing so with global + * interrupts disabled is a recipe for deadlock, so it is + * performed here. + */ + svm_inj_interrupts(svm_sc, vcpu, vlapic); +#endif + /* * Disable global interrupts to guarantee atomicity during * loading of guest state. This includes not only the state @@ -2059,7 +2138,9 @@ svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap, */ ldt_sel = sldt(); +#ifdef __FreeBSD__ svm_inj_interrupts(svm_sc, vcpu, vlapic); +#endif /* Activate the nested pmap on 'curcpu' */ CPU_SET_ATOMIC_ACQ(curcpu, &pmap->pm_active); @@ -2108,6 +2189,11 @@ svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap, svm_msr_guest_exit(svm_sc, vcpu); +#ifndef __FreeBSD__ + VERIFY(vcpustate->loaded && curthread->t_preempt != 0); + vcpustate->loaded = B_FALSE; +#endif + return (0); } @@ -2309,6 +2395,28 @@ svm_vlapic_cleanup(void *arg, struct vlapic *vlapic) free(vlapic, M_SVM_VLAPIC); } +#ifndef __FreeBSD__ +static void +svm_savectx(void *arg, int vcpu) +{ + struct svm_softc *sc = arg; + + if (sc->vcpu[vcpu].loaded) { + svm_msr_guest_exit(sc, vcpu); + } +} + +static void +svm_restorectx(void *arg, int vcpu) +{ + struct svm_softc *sc = arg; + + if (sc->vcpu[vcpu].loaded) { + svm_msr_guest_enter(sc, vcpu); + } +} +#endif /* __FreeBSD__ */ + struct vmm_ops vmm_ops_amd = { svm_init, svm_cleanup, @@ -2328,11 +2436,7 @@ struct vmm_ops vmm_ops_amd = { svm_vlapic_cleanup, #ifndef __FreeBSD__ - /* - * When SVM support is wired up and tested, it is likely to require - * savectx/restorectx functions similar to VMX. 
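The svm_vmrun() changes above are spread across several hunks; the condensed, hedged view below shows the resulting per-iteration ordering on illumos (not literal source — error paths, dirty-bit handling, and pmap activation are elided). The key reordering is that event injection now happens before global interrupts are disabled, since illumos mutexes are blocking rather than spin locks.

```c
static void
svm_vmrun_shape(struct svm_softc *svm_sc, int vcpu, struct vlapic *vlapic,
    pmap_t pmap, uint64_t vmcb_pa, struct svm_regctx *gctx)
{
	int handled;

	do {
		/*
		 * Event injection may take blocking (non-spin) mutexes on
		 * illumos, so it must occur before global interrupts go off.
		 */
		svm_inj_interrupts(svm_sc, vcpu, vlapic);

		disable_gintr();			/* CLGI */

		/* ASID/TLB maintenance is delegated to hma_svm_asid_update() */
		check_asid(svm_sc, vcpu, pmap, CPU->cpu_seqid);

		svm_launch(vmcb_pa, gctx, CPU);		/* VMLOAD/VMRUN/VMSAVE */

		enable_gintr();				/* STGI */

		handled = svm_vmexit(svm_sc, vcpu,
		    vm_exitinfo(svm_sc->vm, vcpu));
	} while (handled);
}
```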
- */ - NULL, - NULL, + svm_savectx, + svm_restorectx, #endif }; diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm_msr.c b/usr/src/uts/i86pc/io/vmm/amd/svm_msr.c index 67c43100f1..0c1ce0e4e0 100644 --- a/usr/src/uts/i86pc/io/vmm/amd/svm_msr.c +++ b/usr/src/uts/i86pc/io/vmm/amd/svm_msr.c @@ -54,6 +54,7 @@ enum { HOST_MSR_NUM /* must be the last enumeration */ }; +#ifdef __FreeBSD__ static uint64_t host_msrs[HOST_MSR_NUM]; void @@ -68,6 +69,19 @@ svm_msr_init(void) host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); } +#else + +CTASSERT(HOST_MSR_NUM == SVM_HOST_MSR_NUM); + +void +svm_msr_init(void) +{ + /* + * These MSRs do vary between CPUs on illumos, so saving system-wide + * values for them serves no purpose. + */ +} +#endif /* __FreeBSD__ */ void svm_msr_guest_init(struct svm_softc *sc, int vcpu) @@ -89,11 +103,23 @@ svm_msr_guest_enter(struct svm_softc *sc, int vcpu) /* * Save host MSRs (if any) and restore guest MSRs (if any). */ +#ifndef __FreeBSD__ + uint64_t *host_msrs = sc->host_msrs[vcpu]; + + /* Save host MSRs */ + host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR); + host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR); + host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); + host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); +#endif /* __FreeBSD__ */ } void svm_msr_guest_exit(struct svm_softc *sc, int vcpu) { +#ifndef __FreeBSD__ + uint64_t *host_msrs = sc->host_msrs[vcpu]; +#endif /* * Save guest MSRs (if any) and restore host MSRs. */ diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm_softc.h b/usr/src/uts/i86pc/io/vmm/amd/svm_softc.h index 8735353bb4..b5ac1903e7 100644 --- a/usr/src/uts/i86pc/io/vmm/amd/svm_softc.h +++ b/usr/src/uts/i86pc/io/vmm/amd/svm_softc.h @@ -34,10 +34,17 @@ #define SVM_IO_BITMAP_SIZE (3 * PAGE_SIZE) #define SVM_MSR_BITMAP_SIZE (2 * PAGE_SIZE) +#ifdef __FreeBSD__ struct asid { uint64_t gen; /* range is [1, ~0UL] */ uint32_t num; /* range is [1, nasid - 1] */ }; +#else +#include <sys/hma.h> + +/* This must match HOST_MSR_NUM in svm_msr.c (where it is CTASSERTed) */ +#define SVM_HOST_MSR_NUM 4 +#endif /* __FreeBSD__ */ /* * XXX separate out 'struct vmcb' from 'svm_vcpu' to avoid wasting space @@ -51,7 +58,12 @@ struct svm_vcpu { int lastcpu; /* host cpu that the vcpu last ran on */ uint32_t dirty; /* state cache bits that must be cleared */ long eptgen; /* pmap->pm_eptgen when the vcpu last ran */ +#ifdef __FreeBSD__ struct asid asid; +#else + hma_svm_asid_t hma_asid; + boolean_t loaded; +#endif } __aligned(PAGE_SIZE); /* @@ -64,6 +76,9 @@ struct svm_softc { uint8_t *iopm_bitmap; /* shared by all vcpus */ uint8_t *msr_bitmap; /* shared by all vcpus */ struct vm *vm; +#ifndef __FreeBSD__ + uint64_t host_msrs[VM_MAXCPU][SVM_HOST_MSR_NUM]; +#endif }; CTASSERT((offsetof(struct svm_softc, nptp) & PAGE_MASK) == 0); diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm_support.s b/usr/src/uts/i86pc/io/vmm/amd/svm_support.s index 4258c95d70..fad994b09c 100644 --- a/usr/src/uts/i86pc/io/vmm/amd/svm_support.s +++ b/usr/src/uts/i86pc/io/vmm/amd/svm_support.s @@ -25,7 +25,12 @@ * * $FreeBSD$ */ -#include <machine/asmacros.h> + +/* + * Copyright 2019 Joyent, Inc. + */ + +#include <sys/asm_linkage.h> #include "svm_assym.h" @@ -34,115 +39,126 @@ #if defined(lint) struct svm_regctx; -struct pcpu; +struct cpu; /*ARGSUSED*/ void -svm_launch(uint64_t pa, struct svm_regctx *gctx, struct pcpu *pcpu) +svm_launch(uint64_t pa, struct svm_regctx *gctx, struct cpu *cpu) {} #else /* lint */ -/* - * Be friendly to DTrace FBT's prologue/epilogue pattern matching. 
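A note on the svm_msr.c hunks above: since the host values of the syscall MSRs can differ between CPUs on illumos and the vCPU thread may migrate, the save area moves from a single system-wide array into the per-vCPU sc->host_msrs[vcpu] slot. The hunk only shows the declaration on the exit side; a hedged sketch of the matching restore (the actual svm_msr_guest_exit() also handles guest-visible MSR state elsewhere):

```c
void
svm_msr_guest_exit_sketch(struct svm_softc *sc, int vcpu)
{
	uint64_t *host_msrs = sc->host_msrs[vcpu];

	/* Restore the host values saved by svm_msr_guest_enter(). */
	wrmsr(MSR_LSTAR, host_msrs[IDX_MSR_LSTAR]);
	wrmsr(MSR_CSTAR, host_msrs[IDX_MSR_CSTAR]);
	wrmsr(MSR_STAR, host_msrs[IDX_MSR_STAR]);
	wrmsr(MSR_SF_MASK, host_msrs[IDX_MSR_SF_MASK]);
}
```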
- * - * They are also responsible for saving/restoring the host %rbp across VMRUN. - */ -#define VENTER push %rbp ; mov %rsp,%rbp -#define VLEAVE pop %rbp - #define VMLOAD .byte 0x0f, 0x01, 0xda #define VMRUN .byte 0x0f, 0x01, 0xd8 #define VMSAVE .byte 0x0f, 0x01, 0xdb + +/* + * Flush scratch registers to avoid lingering guest state being used for + * Spectre v1 attacks when returning from guest entry. + */ +#define SVM_GUEST_FLUSH_SCRATCH \ + xorl %edi, %edi; \ + xorl %esi, %esi; \ + xorl %edx, %edx; \ + xorl %ecx, %ecx; \ + xorl %r8d, %r8d; \ + xorl %r9d, %r9d; \ + xorl %r10d, %r10d; \ + xorl %r11d, %r11d; + +/* Stack layout (offset from %rsp) for svm_launch */ +#define SVMSTK_R15 0x00 /* callee saved %r15 */ +#define SVMSTK_R14 0x08 /* callee saved %r14 */ +#define SVMSTK_R13 0x10 /* callee saved %r13 */ +#define SVMSTK_R12 0x18 /* callee saved %r12 */ +#define SVMSTK_RBX 0x20 /* callee saved %rbx */ +#define SVMSTK_RDX 0x28 /* save-args %rdx (struct cpu *) */ +#define SVMSTK_RSI 0x30 /* save-args %rsi (struct svm_regctx *) */ +#define SVMSTK_RDI 0x38 /* save-args %rdi (uint64_t vmcb_pa) */ +#define SVMSTK_FP 0x40 /* frame pointer %rbp */ +#define SVMSTKSIZE SVMSTK_FP + /* * svm_launch(uint64_t vmcb, struct svm_regctx *gctx, struct pcpu *pcpu) * %rdi: physical address of VMCB * %rsi: pointer to guest context * %rdx: pointer to the pcpu data */ -ENTRY(svm_launch) - VENTER - - /* save pointer to the pcpu data */ - push %rdx - - /* - * Host register state saved across a VMRUN. - * - * All "callee saved registers" except: - * %rsp: because it is preserved by the processor across VMRUN. - * %rbp: because it is saved/restored by the function prologue/epilogue. - */ - push %rbx - push %r12 - push %r13 - push %r14 - push %r15 - - /* Save the physical address of the VMCB in %rax */ - movq %rdi, %rax - - push %rsi /* push guest context pointer on the stack */ - - /* - * Restore guest state. - */ - movq SCTX_R8(%rsi), %r8 - movq SCTX_R9(%rsi), %r9 - movq SCTX_R10(%rsi), %r10 - movq SCTX_R11(%rsi), %r11 - movq SCTX_R12(%rsi), %r12 - movq SCTX_R13(%rsi), %r13 - movq SCTX_R14(%rsi), %r14 - movq SCTX_R15(%rsi), %r15 - movq SCTX_RBP(%rsi), %rbp - movq SCTX_RBX(%rsi), %rbx - movq SCTX_RCX(%rsi), %rcx - movq SCTX_RDX(%rsi), %rdx - movq SCTX_RDI(%rsi), %rdi - movq SCTX_RSI(%rsi), %rsi /* %rsi must be restored last */ +ENTRY_NP(svm_launch) + pushq %rbp + movq %rsp, %rbp + subq $SVMSTKSIZE, %rsp + movq %r15, SVMSTK_R15(%rsp) + movq %r14, SVMSTK_R14(%rsp) + movq %r13, SVMSTK_R13(%rsp) + movq %r12, SVMSTK_R12(%rsp) + movq %rbx, SVMSTK_RBX(%rsp) + movq %rdx, SVMSTK_RDX(%rsp) + movq %rsi, SVMSTK_RSI(%rsp) + movq %rdi, SVMSTK_RDI(%rsp) + + /* VMLOAD and VMRUN expect the VMCB physaddr in %rax */ + movq %rdi, %rax + + /* Restore guest state. */ + movq SCTX_R8(%rsi), %r8 + movq SCTX_R9(%rsi), %r9 + movq SCTX_R10(%rsi), %r10 + movq SCTX_R11(%rsi), %r11 + movq SCTX_R12(%rsi), %r12 + movq SCTX_R13(%rsi), %r13 + movq SCTX_R14(%rsi), %r14 + movq SCTX_R15(%rsi), %r15 + movq SCTX_RBP(%rsi), %rbp + movq SCTX_RBX(%rsi), %rbx + movq SCTX_RCX(%rsi), %rcx + movq SCTX_RDX(%rsi), %rdx + movq SCTX_RDI(%rsi), %rdi + movq SCTX_RSI(%rsi), %rsi /* %rsi must be restored last */ VMLOAD VMRUN VMSAVE - pop %rax /* pop guest context pointer from the stack */ - - /* - * Save guest state. 
- */ - movq %r8, SCTX_R8(%rax) - movq %r9, SCTX_R9(%rax) - movq %r10, SCTX_R10(%rax) - movq %r11, SCTX_R11(%rax) - movq %r12, SCTX_R12(%rax) - movq %r13, SCTX_R13(%rax) - movq %r14, SCTX_R14(%rax) - movq %r15, SCTX_R15(%rax) - movq %rbp, SCTX_RBP(%rax) - movq %rbx, SCTX_RBX(%rax) - movq %rcx, SCTX_RCX(%rax) - movq %rdx, SCTX_RDX(%rax) - movq %rdi, SCTX_RDI(%rax) - movq %rsi, SCTX_RSI(%rax) - - /* Restore host state */ - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbx - - /* Restore %GS.base to point to the host's pcpu data */ - pop %rdx - mov %edx, %eax - shr $32, %rdx - mov $MSR_GSBASE, %ecx + /* Grab the svm_regctx pointer */ + movq SVMSTK_RSI(%rsp), %rax + + /* Save guest state. */ + movq %r8, SCTX_R8(%rax) + movq %r9, SCTX_R9(%rax) + movq %r10, SCTX_R10(%rax) + movq %r11, SCTX_R11(%rax) + movq %r12, SCTX_R12(%rax) + movq %r13, SCTX_R13(%rax) + movq %r14, SCTX_R14(%rax) + movq %r15, SCTX_R15(%rax) + movq %rbp, SCTX_RBP(%rax) + movq %rbx, SCTX_RBX(%rax) + movq %rcx, SCTX_RCX(%rax) + movq %rdx, SCTX_RDX(%rax) + movq %rdi, SCTX_RDI(%rax) + movq %rsi, SCTX_RSI(%rax) + + /* Restore callee-saved registers */ + movq SVMSTK_R15(%rsp), %r15 + movq SVMSTK_R14(%rsp), %r14 + movq SVMSTK_R13(%rsp), %r13 + movq SVMSTK_R12(%rsp), %r12 + movq SVMSTK_RBX(%rsp), %rbx + + /* Fix %gsbase to point back to the correct 'struct cpu *' */ + movq SVMSTK_RDX(%rsp), %rdx + movl %edx, %eax + shrq $32, %rdx + movl $MSR_GSBASE, %ecx wrmsr - VLEAVE + SVM_GUEST_FLUSH_SCRATCH + + addq $SVMSTKSIZE, %rsp + popq %rbp ret -END(svm_launch) +SET_SIZE(svm_launch) #endif /* lint */ diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx.c b/usr/src/uts/i86pc/io/vmm/intel/vmx.c index a723be0d28..a039455be7 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmx.c +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx.c @@ -2640,7 +2640,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) #ifdef __FreeBSD__ __asm __volatile("int $18"); #else - vmx_call_trap(T_MCE); + vmm_call_trap(T_MCE); #endif return (1); } @@ -2929,7 +2929,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) #ifdef __FreeBSD__ __asm __volatile("int $18"); #else - vmx_call_trap(T_MCE); + vmm_call_trap(T_MCE); #endif return (1); } @@ -3147,7 +3147,7 @@ vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) #ifdef __FreeBSD__ __asm __volatile("int $2"); #else - vmx_call_trap(T_NMIFLT); + vmm_call_trap(T_NMIFLT); #endif } } diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx.h b/usr/src/uts/i86pc/io/vmm/intel/vmx.h index 9766e6b749..2d16799bdd 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmx.h +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx.h @@ -164,9 +164,6 @@ CTASSERT((offsetof(struct vmx, pir_desc[0]) & 63) == 0); #define VMX_VMWRITE_ERROR 4 int vmx_enter_guest(struct vmxctx *ctx, struct vmx *vmx, int launched); void vmx_call_isr(uintptr_t entry); -#ifndef __FreeBSD__ -void vmx_call_trap(uint64_t); -#endif u_long vmx_fix_cr0(u_long cr0); u_long vmx_fix_cr4(u_long cr4); diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s b/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s index a2375e3a6c..0130f88dd6 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s @@ -381,44 +381,4 @@ ENTRY_NP(vmx_call_isr) ret SET_SIZE(vmx_call_isr) -/* - * %rdi = trapno - * - * This variant is for any explicit exception injection that we need: in this - * case, we can't just, for example, do a direct "int $2", as that will then - * trash our %cr3 via tr_nmiint due to KPTI. 
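For readers following the svm_launch() rewrite above: the SVMSTK_* offsets describe a fixed scratch area carved out with `subq $SVMSTKSIZE, %rsp`, replacing the FreeBSD version's individual pushes. Viewed as a C struct (illustrative only — the assembly never declares such a type):

```c
struct svm_launch_scratch {
	uint64_t r15;	/* SVMSTK_R15 0x00, callee-saved */
	uint64_t r14;	/* SVMSTK_R14 0x08 */
	uint64_t r13;	/* SVMSTK_R13 0x10 */
	uint64_t r12;	/* SVMSTK_R12 0x18 */
	uint64_t rbx;	/* SVMSTK_RBX 0x20 */
	uint64_t rdx;	/* SVMSTK_RDX 0x28, struct cpu * argument */
	uint64_t rsi;	/* SVMSTK_RSI 0x30, struct svm_regctx * argument */
	uint64_t rdi;	/* SVMSTK_RDI 0x38, VMCB physical address */
};	/* sizeof == 0x40 == SVMSTKSIZE; the saved %rbp sits just above it */
```

The fixed frame lets the guest-context pointer be reloaded from SVMSTK_RSI after VMSAVE without juggling pushes and pops, and SVM_GUEST_FLUSH_SCRATCH zeroes the caller-saved registers before return so stale guest values cannot feed speculative (Spectre v1) gadgets in host code.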
So we have to fake a trap frame in - * a similar fashion to vmx_call_isr(). Both NMIs and MCEs don't push an 'err' - * into the frame. - */ -ENTRY_NP(vmx_call_trap) - pushq %rbp - movq %rsp, %rbp - movq %rsp, %r11 - andq $~0xf, %rsp /* align stack */ - pushq $KDS_SEL /* %ss */ - pushq %r11 /* %rsp */ - pushfq /* %rflags */ - pushq $KCS_SEL /* %cs */ - leaq .trap_iret_dest(%rip), %rcx - pushq %rcx /* %rip */ - cli - cmpq $T_NMIFLT, %rdi - je nmiint - cmpq $T_MCE, %rdi - je mcetrap - - pushq %rdi /* save our bad trapno... */ - leaq __vmx_call_bad_trap(%rip), %rdi - xorl %eax, %eax - call panic - /*NOTREACHED*/ - -.trap_iret_dest: - popq %rbp - ret -SET_SIZE(vmx_call_trap) - -__vmx_call_bad_trap: - .string "bad trapno for vmx_call_trap()" - #endif /* lint */ diff --git a/usr/src/uts/i86pc/io/vmm/vm/vm_glue.h b/usr/src/uts/i86pc/io/vmm/vm/vm_glue.h index 7ac745f509..600872c321 100644 --- a/usr/src/uts/i86pc/io/vmm/vm/vm_glue.h +++ b/usr/src/uts/i86pc/io/vmm/vm/vm_glue.h @@ -93,6 +93,7 @@ struct vmm_pt_ops { }; extern struct vmm_pt_ops ept_ops; +extern struct vmm_pt_ops rvi_ops; #endif /* _VM_GLUE_ */ diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c index 2fa0267f72..4b759b44e9 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c @@ -1628,10 +1628,19 @@ vmm_is_supported(intptr_t arg) int r; const char *msg; - if (!vmm_is_intel()) - return (ENXIO); + if (vmm_is_intel()) { + r = vmx_x86_supported(&msg); + } else if (vmm_is_amd()) { + /* + * HMA already ensured that the features necessary for SVM + * operation were present and online during vmm_attach(). + */ + r = 0; + } else { + r = ENXIO; + msg = "Unsupported CPU vendor"; + } - r = vmx_x86_supported(&msg); if (r != 0 && arg != NULL) { if (copyoutstr(msg, (char *)arg, strlen(msg), NULL) != 0) return (EFAULT); diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c new file mode 100644 index 0000000000..d630d32630 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c @@ -0,0 +1,297 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/kmem.h> +#include <sys/machsystm.h> +#include <sys/x86_archext.h> + +#include <sys/gipt.h> +#include <vm/vm_glue.h> + + +struct rvi_map { + gipt_map_t rm_gipt; + uint64_t rm_wired_page_count; +}; +typedef struct rvi_map rvi_map_t; + +#define RVI_LOCK(m) (&(m)->rm_gipt.giptm_lock) + +#define RVI_MAX_LEVELS 4 + +CTASSERT(RVI_MAX_LEVELS <= GIPT_MAX_LEVELS); + +#define RVI_PRESENT PT_VALID +#define RVI_WRITABLE PT_WRITABLE +#define RVI_ACCESSED PT_REF +#define RVI_DIRTY PT_MOD +#define RVI_LGPG PT_PAGESIZE +#define RVI_NX PT_NX +#define RVI_USER PT_USER +#define RVI_PWT PT_WRITETHRU +#define RVI_PCD PT_NOCACHE + +#define RVI_PA_MASK PT_PADDR + +#define RVI_PAT(attr) rvi_attr_to_pat(attr) +#define RVI_PADDR(addr) ((addr) & RVI_PA_MASK) +#define RVI_PROT(prot) \ + ((((prot) & PROT_WRITE) != 0 ? RVI_WRITABLE : 0) | \ + (((prot) & PROT_EXEC) == 0 ? 
RVI_NX : 0)) + +#define RVI_IS_ABSENT(pte) (((pte) & RVI_PRESENT) == 0) +#define RVI_PTE_PFN(pte) mmu_btop(RVI_PADDR(pte)) +#define RVI_MAPS_PAGE(pte, lvl) \ + (!RVI_IS_ABSENT(pte) && (((pte) & RVI_LGPG) != 0 || (lvl) == 0)) +#define RVI_PTE_PROT(pte) \ + (RVI_IS_ABSENT(pte) ? 0 : ( \ + PROT_READ | \ + (((pte) & RVI_NX) == 0 ? PROT_EXEC : 0) | \ + (((pte) & RVI_WRITABLE) != 0 ? PROT_WRITE : 0))) + +#define RVI_PTE_ASSIGN_PAGE(lvl, pfn, prot, attr) \ + (RVI_PADDR(pfn_to_pa(pfn)) | \ + (((lvl) != 0) ? RVI_LGPG : 0) | \ + RVI_USER | RVI_ACCESSED | RVI_PRESENT | \ + RVI_PAT(attr) | \ + RVI_PROT(prot)) + +#define RVI_PTE_ASSIGN_TABLE(pfn) \ + (RVI_PADDR(pfn_to_pa(pfn)) | \ + RVI_USER | RVI_ACCESSED | RVI_PRESENT | \ + RVI_PAT(MTRR_TYPE_WB) | \ + RVI_PROT(PROT_READ | PROT_WRITE | PROT_EXEC)) + + +/* Make sure that PAT indexes line up as expected */ +CTASSERT((PAT_DEFAULT_ATTRIBUTE & 0xf) == MTRR_TYPE_WB); +CTASSERT(((PAT_DEFAULT_ATTRIBUTE >> 24) & 0xf) == MTRR_TYPE_UC); + +static inline uint64_t +rvi_attr_to_pat(const uint8_t attr) +{ + if (attr == MTRR_TYPE_UC) { + /* !PAT + PCD + PWT -> PAT3 -> MTRR_TYPE_UC */ + return (RVI_PCD|RVI_PWT); + } else if (attr == MTRR_TYPE_WB) { + /* !PAT + !PCD + !PWT -> PAT0 -> MTRR_TYPE_WB */ + return (0); + } + + panic("unexpected memattr %x", attr); + return (0); +} + +static gipt_pte_type_t +rvi_pte_type(uint64_t pte, uint_t level) +{ + if (RVI_IS_ABSENT(pte)) { + return (PTET_EMPTY); + } else if (RVI_MAPS_PAGE(pte, level)) { + return (PTET_PAGE); + } else { + return (PTET_LINK); + } +} + +static uint64_t +rvi_pte_map(uint64_t pfn) +{ + return (RVI_PTE_ASSIGN_TABLE(pfn)); +} + +static void * +rvi_create(uintptr_t *pml4_kaddr) +{ + rvi_map_t *rmap; + gipt_map_t *map; + gipt_t *root; + struct gipt_cbs cbs = { + .giptc_pte_type = rvi_pte_type, + .giptc_pte_map = rvi_pte_map, + }; + + rmap = kmem_zalloc(sizeof (*rmap), KM_SLEEP); + map = &rmap->rm_gipt; + root = gipt_alloc(); + root->gipt_level = RVI_MAX_LEVELS - 1; + gipt_map_init(map, RVI_MAX_LEVELS, GIPT_HASH_SIZE_DEFAULT, &cbs, root); + + *pml4_kaddr = (uintptr_t)root->gipt_kva; + return (rmap); +} + +static void +rvi_destroy(void *arg) +{ + rvi_map_t *rmap = arg; + + if (rmap != NULL) { + gipt_map_t *map = &rmap->rm_gipt; + + gipt_map_fini(map); + kmem_free(rmap, sizeof (*rmap)); + } +} + +static uint64_t +rvi_wired_count(void *arg) +{ + rvi_map_t *rmap = arg; + uint64_t res; + + mutex_enter(RVI_LOCK(rmap)); + res = rmap->rm_wired_page_count; + mutex_exit(RVI_LOCK(rmap)); + + return (res); +} + +static int +rvi_is_wired(void *arg, uint64_t va, uint_t *protp) +{ + rvi_map_t *rmap = arg; + gipt_t *pt; + int rv = -1; + + mutex_enter(RVI_LOCK(rmap)); + pt = gipt_map_lookup_deepest(&rmap->rm_gipt, va); + if (pt != NULL) { + const uint64_t pte = GIPT_VA2PTE(pt, va); + + if (RVI_MAPS_PAGE(pte, pt->gipt_level)) { + *protp = RVI_PTE_PROT(pte); + rv = 0; + } + } + mutex_exit(RVI_LOCK(rmap)); + + return (rv); +} + +static int +rvi_map(void *arg, uint64_t va, pfn_t pfn, uint_t lvl, uint_t prot, + uint8_t attr) +{ + rvi_map_t *rmap = arg; + gipt_map_t *map = &rmap->rm_gipt; + gipt_t *pt; + uint64_t *ptep, pte; + + ASSERT((prot & PROT_READ) != 0); + ASSERT3U((prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)), ==, 0); + ASSERT3U(lvl, <, RVI_MAX_LEVELS); + + mutex_enter(RVI_LOCK(rmap)); + pt = gipt_map_lookup(map, va, lvl); + if (pt == NULL) { + /* + * A table at the appropriate VA/level that would house this + * mapping does not currently exist. Try to walk down to that + * point, creating any necessary parent(s). 
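To make the RVI_PTE_ASSIGN_PAGE() composition above concrete, here is a small self-contained model of the leaf entry produced for a writable, non-executable, write-back 4K guest page. The numeric constants are the standard x86-64 page-table bit positions that sit behind the PT_* symbols; they are repeated only so the example stands alone.

```c
#include <stdint.h>
#include <stdio.h>

#define PTE_P	(1ULL << 0)	/* RVI_PRESENT  (PT_VALID) */
#define PTE_W	(1ULL << 1)	/* RVI_WRITABLE (PT_WRITABLE) */
#define PTE_U	(1ULL << 2)	/* RVI_USER     (PT_USER) */
#define PTE_A	(1ULL << 5)	/* RVI_ACCESSED (PT_REF) */
#define PTE_NX	(1ULL << 63)	/* RVI_NX       (PT_NX) */

int
main(void)
{
	uint64_t pfn = 0x12345;		/* example page frame */

	/*
	 * Level-0 mapping, PROT_READ|PROT_WRITE, MTRR_TYPE_WB:
	 * no large-page bit, NX set (no PROT_EXEC), PAT bits all clear.
	 */
	uint64_t pte = (pfn << 12) | PTE_U | PTE_A | PTE_P | PTE_W | PTE_NX;

	printf("leaf pte: %#llx\n", (unsigned long long)pte);
	return (0);
}
```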
+ */ + pt = gipt_map_create_parents(map, va, lvl); + + /* + * There was a large page mapping in the way of creating the + * necessary parent table(s). + */ + if (pt == NULL) { + panic("unexpected large page @ %08lx", va); + } + } + ptep = GIPT_VA2PTEP(pt, va); + + pte = *ptep; + if (!RVI_IS_ABSENT(pte)) { + if (!RVI_MAPS_PAGE(pte, lvl)) { + panic("unexpected PT link @ %08lx in %p", va, pt); + } else { + panic("unexpected page mapped @ %08lx in %p", va, pt); + } + } + + pte = RVI_PTE_ASSIGN_PAGE(lvl, pfn, prot, attr); + *ptep = pte; + pt->gipt_valid_cnt++; + rmap->rm_wired_page_count += gipt_level_count[lvl]; + + mutex_exit(RVI_LOCK(rmap)); + return (0); +} + +static uint64_t +rvi_unmap(void *arg, uint64_t va, uint64_t end_va) +{ + rvi_map_t *rmap = arg; + gipt_map_t *map = &rmap->rm_gipt; + gipt_t *pt; + uint64_t cur_va = va; + uint64_t unmapped = 0; + + mutex_enter(RVI_LOCK(rmap)); + + pt = gipt_map_lookup_deepest(map, cur_va); + if (pt == NULL) { + mutex_exit(RVI_LOCK(rmap)); + return (0); + } + if (!RVI_MAPS_PAGE(GIPT_VA2PTE(pt, cur_va), pt->gipt_level)) { + cur_va = gipt_map_next_page(map, cur_va, end_va, &pt); + if (cur_va == 0) { + mutex_exit(RVI_LOCK(rmap)); + return (0); + } + } + + while (cur_va < end_va) { + uint64_t *ptep = GIPT_VA2PTEP(pt, cur_va); + const uint_t lvl = pt->gipt_level; + + ASSERT(RVI_MAPS_PAGE(*ptep, lvl)); + *ptep = 0; + pt->gipt_valid_cnt--; + unmapped += gipt_level_count[pt->gipt_level]; + + gipt_t *next_pt = pt; + uint64_t next_va; + next_va = gipt_map_next_page(map, cur_va, end_va, &next_pt); + + if (pt->gipt_valid_cnt == 0) { + gipt_map_clean_parents(map, pt); + } + if (next_va == 0) { + break; + } + pt = next_pt; + cur_va = next_va; + } + rmap->rm_wired_page_count -= unmapped; + + mutex_exit(RVI_LOCK(rmap)); + + return (unmapped); +} + +struct vmm_pt_ops rvi_ops = { + .vpo_init = rvi_create, + .vpo_free = rvi_destroy, + .vpo_wired_cnt = rvi_wired_count, + .vpo_is_wired = rvi_is_wired, + .vpo_map = rvi_map, + .vpo_unmap = rvi_unmap, +}; diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c index 8d5051144c..58a62586a1 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c @@ -24,6 +24,7 @@ #include <sys/machsystm.h> #include <sys/vmsystm.h> #include <sys/malloc.h> +#include <sys/x86_archext.h> #include <vm/as.h> #include <vm/seg_vn.h> #include <vm/seg_kmem.h> @@ -208,8 +209,17 @@ pmap_pinit_type(pmap_t pmap, enum pmap_type type, int flags) pmap->pm_pml4 = pml4; return (1); } - case PT_RVI: - /* RVI support not yet implemented */ + case PT_RVI: { + struct vmm_pt_ops *ops = &rvi_ops; + void *pml4, *pmi; + + pmi = ops->vpo_init((uintptr_t *)&pml4); + + pmap->pm_ops = ops; + pmap->pm_impl = pmi; + pmap->pm_pml4 = pml4; + return (1); + } default: panic("unsupported pmap type: %x", type); break; @@ -537,6 +547,8 @@ vm_object_deallocate(vm_object_t vmo) kmem_free(vmo, sizeof (*vmo)); } +CTASSERT(VM_MEMATTR_UNCACHEABLE == MTRR_TYPE_UC); +CTASSERT(VM_MEMATTR_WRITE_BACK == MTRR_TYPE_WB); int vm_object_set_memattr(vm_object_t vmo, vm_memattr_t attr) { diff --git a/usr/src/uts/i86pc/io/vmm/vmm_support.s b/usr/src/uts/i86pc/io/vmm/vmm_support.s new file mode 100644 index 0000000000..5777d46959 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_support.s @@ -0,0 +1,54 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. 
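The rvi_ops table above plugs into the same vmm_pt_ops interface that the EPT backend already uses; pmap_pinit_type() only exercises vpo_init, so a hedged sketch of the full lifecycle may help (illumos bhyve actually reaches vpo_map/vpo_unmap through the pmap_* wrappers in vmm_sol_vm.c, and the local names here are illustrative):

```c
#include <vm/vm_glue.h>

void
rvi_ops_example(pfn_t pfn)
{
	struct vmm_pt_ops *ops = &rvi_ops;
	uintptr_t pml4_kva;	/* root PML4 kva, for programming nCR3 */
	void *pti;
	uint_t prot;

	pti = ops->vpo_init(&pml4_kva);		/* allocate root table */

	/* Map one 4K page (level 0) at guest-physical 1 MiB, RW, write-back */
	(void) ops->vpo_map(pti, 0x100000, pfn, 0,
	    PROT_READ | PROT_WRITE, MTRR_TYPE_WB);

	if (ops->vpo_is_wired(pti, 0x100000, &prot) == 0) {
		/* prot now reports PROT_READ | PROT_WRITE (NX set, no exec) */
		ASSERT((prot & PROT_WRITE) != 0);
	}
	/* vpo_wired_cnt(pti) reflects the pages accounted by vpo_map() */

	/* Unmap the same range and tear the table down */
	(void) ops->vpo_unmap(pti, 0x100000, 0x101000);
	ops->vpo_free(pti);
}
```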
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2019 Joyent, Inc. + */ + +#include <sys/asm_linkage.h> +#include <sys/segments.h> + +/* + * %rdi = trapno + * + * This variant is for any explicit exception injection that we need: in this + * case, we can't just, for example, do a direct "int $2", as that will then + * trash our %cr3 via tr_nmiint due to KPTI, so we have to fake a trap frame. + * Both NMIs and MCEs don't push an 'err' into the frame. + */ +ENTRY_NP(vmm_call_trap) + pushq %rbp + movq %rsp, %rbp + movq %rsp, %r11 + andq $~0xf, %rsp /* align stack */ + pushq $KDS_SEL /* %ss */ + pushq %r11 /* %rsp */ + pushfq /* %rflags */ + pushq $KCS_SEL /* %cs */ + leaq .trap_iret_dest(%rip), %rcx + pushq %rcx /* %rip */ + cli + cmpq $T_NMIFLT, %rdi + je nmiint + cmpq $T_MCE, %rdi + je mcetrap + + pushq %rdi /* save our bad trapno... */ + leaq __vmm_call_bad_trap(%rip), %rdi + xorl %eax, %eax + call panic + /*NOTREACHED*/ + +.trap_iret_dest: + popq %rbp + ret +SET_SIZE(vmm_call_trap) + +__vmm_call_bad_trap: + .string "bad trapno for vmm_call_trap()" diff --git a/usr/src/uts/i86pc/io/vmm/x86.c b/usr/src/uts/i86pc/io/vmm/x86.c index b02142e7e5..b126e96f2c 100644 --- a/usr/src/uts/i86pc/io/vmm/x86.c +++ b/usr/src/uts/i86pc/io/vmm/x86.c @@ -198,6 +198,18 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id, /* Hide mwaitx/monitorx capability from the guest */ regs[2] &= ~AMDID2_MWAITX; +#ifndef __FreeBSD__ + /* + * Detection routines for TCE and FFXSR are missing + * from our vm_cpuid_capability() detection logic + * today. Mask them out until that is remedied. + * They do not appear to be in common usage, so their + * absence should not cause undue trouble. + */ + regs[2] &= ~AMDID2_TCE; + regs[3] &= ~AMDID_FFXSR; +#endif + /* * Hide rdtscp/ia32_tsc_aux until we know how * to deal with them. diff --git a/usr/src/uts/i86pc/os/hma.c b/usr/src/uts/i86pc/os/hma.c index 34ae85086d..ecaf9c17cb 100644 --- a/usr/src/uts/i86pc/os/hma.c +++ b/usr/src/uts/i86pc/os/hma.c @@ -37,23 +37,33 @@ static boolean_t hma_vmx_ready = B_FALSE; static const char *hma_vmx_error = NULL; static id_space_t *hma_vmx_vpid; -typedef enum vmx_cpu_state { - VCS_UNINITIALIZED = 0, - VCS_READY, - VCS_ERROR -} vmx_cpu_state_t; - /* - * The bulk of VMX-related HMA state is protected by cpu_lock, rather than a + * The bulk of HMA state (VMX & SVM) is protected by cpu_lock, rather than a * mutex specific to the module. It (cpu_lock) is already required for the * state needed to perform setup on all CPUs, so it was a natural fit to * protect this data too. 
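The vmm_call_trap() routine above is the former vmx_call_trap(), hoisted out of vmx_support.s into the shared vmm_support.s so both the VMX and SVM exit paths can reflect NMIs and machine checks to the host. The hand-built frame mimics exactly what the CPU pushes for an interrupt taken at CPL 0 (neither #NMI nor #MC supplies an error code). Viewed as a C struct, lowest address first — illustrative only, no such structure exists in the source:

```c
struct fake_intr_frame {
	uint64_t rip;		/* pushed last: .trap_iret_dest */
	uint64_t cs;		/* KCS_SEL */
	uint64_t rflags;	/* saved by pushfq */
	uint64_t rsp;		/* pre-alignment %rsp copy (via %r11) */
	uint64_t ss;		/* KDS_SEL, pushed first */
};
```

The indirection exists because a literal `int $2` or `int $18` would bounce through the KPTI trampolines (tr_nmiint and friends) and swap %cr3 underneath the VMM.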
*/ +typedef enum hma_cpu_state { + HCS_UNINITIALIZED = 0, + HCS_READY, + HCS_ERROR +} hma_cpu_state_t; +static hma_cpu_state_t hma_cpu_status[NCPU]; + static void *hma_vmx_vmxon_page[NCPU]; static uintptr_t hma_vmx_vmxon_pa[NCPU]; -static vmx_cpu_state_t hma_vmx_status[NCPU]; static uint32_t hma_vmx_revision; +static boolean_t hma_svm_ready = B_FALSE; +static const char *hma_svm_error = NULL; +static uint32_t hma_svm_features; +static uint32_t hma_svm_max_asid; + +static void *hma_svm_hsave_page[NCPU]; +static uintptr_t hma_svm_hsave_pa[NCPU]; + +static hma_svm_asid_t hma_svm_cpu_asid[NCPU]; + static int hma_vmx_init(void); static int hma_svm_init(void); @@ -94,8 +104,7 @@ hma_register(const char *name) is_ready = hma_vmx_ready; break; case X86_VENDOR_AMD: - /* Punt on SVM support for now */ - is_ready = B_FALSE; + is_ready = hma_svm_ready; break; default: is_ready = B_FALSE; @@ -156,9 +165,9 @@ hma_vmx_vpid_free(uint16_t vpid) extern int hma_vmx_vmxon(uintptr_t); -/* ARGSUSED */ static int -hma_vmx_cpu_vmxon(xc_arg_t arg1, xc_arg_t arg2, xc_arg_t arg3) +hma_vmx_cpu_vmxon(xc_arg_t arg1 __unused, xc_arg_t arg2 __unused, + xc_arg_t arg3 __unused) { uint64_t fctrl; processorid_t id = CPU->cpu_seqid; @@ -181,9 +190,9 @@ hma_vmx_cpu_vmxon(xc_arg_t arg1, xc_arg_t arg2, xc_arg_t arg3) setcr4(getcr4() | CR4_VMXE); if (hma_vmx_vmxon(vmxon_pa) == 0) { - hma_vmx_status[id] = VCS_READY; + hma_cpu_status[id] = HCS_READY; } else { - hma_vmx_status[id] = VCS_ERROR; + hma_cpu_status[id] = HCS_ERROR; /* * If VMX has already been marked active and available for the @@ -198,9 +207,8 @@ hma_vmx_cpu_vmxon(xc_arg_t arg1, xc_arg_t arg2, xc_arg_t arg3) return (0); } -/* ARGSUSED2 */ static int -hma_vmx_cpu_setup(cpu_setup_t what, int id, void *arg) +hma_vmx_cpu_setup(cpu_setup_t what, int id, void *arg __unused) { ASSERT(MUTEX_HELD(&cpu_lock)); ASSERT(id >= 0 && id < NCPU); @@ -223,8 +231,8 @@ hma_vmx_cpu_setup(cpu_setup_t what, int id, void *arg) } /* Perform initialization if it has not been previously attempted. */ - if (hma_vmx_status[id] != VCS_UNINITIALIZED) { - return ((hma_vmx_status[id] == VCS_READY) ? 0 : -1); + if (hma_cpu_status[id] != HCS_UNINITIALIZED) { + return ((hma_cpu_status[id] == HCS_READY) ? 0 : -1); } /* Allocate the VMXON page for this CPU */ @@ -265,7 +273,7 @@ hma_vmx_cpu_setup(cpu_setup_t what, int id, void *arg) xc_sync(0, 0, 0, CPUSET2BV(set), hma_vmx_cpu_vmxon); } - return (hma_vmx_status[id] != VCS_READY); + return (hma_cpu_status[id] != HCS_READY); } static int @@ -329,10 +337,233 @@ bail: return (-1); } +#define VMCB_FLUSH_NOTHING 0x0 +#define VMCB_FLUSH_ALL 0x1 +#define VMCB_FLUSH_ASID 0x3 + +void +hma_svm_asid_init(hma_svm_asid_t *vcp) +{ + /* + * Initialize the generation to 0, forcing an ASID allocation on first + * entry. Leave the ASID at 0, so if the host forgoes the call to + * hma_svm_asid_update(), SVM will bail on the invalid vcpu state. + */ + vcp->hsa_gen = 0; + vcp->hsa_asid = 0; +} + +uint8_t +hma_svm_asid_update(hma_svm_asid_t *vcp, boolean_t flush_by_asid, + boolean_t npt_flush) +{ + hma_svm_asid_t *hcp = &hma_svm_cpu_asid[CPU->cpu_seqid]; + + ASSERT(curthread->t_preempt != 0); + + /* + * If NPT changes dictate a TLB flush and by-ASID flushing is not + * supported/used, force a fresh ASID allocation. 
+ */ + if (npt_flush && !flush_by_asid) { + vcp->hsa_gen = 0; + } + + if (vcp->hsa_gen != hcp->hsa_gen) { + hcp->hsa_asid++; + + if (hcp->hsa_asid >= hma_svm_max_asid) { + /* Keep the ASID properly constrained */ + hcp->hsa_asid = 1; + hcp->hsa_gen++; + if (hcp->hsa_gen == 0) { + /* + * Stay clear of the '0' sentinel value for + * generation, if wrapping around. + */ + hcp->hsa_gen = 1; + } + } + vcp->hsa_gen = hcp->hsa_gen; + vcp->hsa_asid = hcp->hsa_asid; + + ASSERT(vcp->hsa_asid != 0); + ASSERT3U(vcp->hsa_asid, <, hma_svm_max_asid); + + if (flush_by_asid) { + return (VMCB_FLUSH_ASID); + } + return (VMCB_FLUSH_ALL); + } else if (npt_flush) { + ASSERT(flush_by_asid); + return (VMCB_FLUSH_ASID); + } + return (VMCB_FLUSH_NOTHING); +} + +static int +hma_svm_cpu_activate(xc_arg_t arg1 __unused, xc_arg_t arg2 __unused, + xc_arg_t arg3 __unused) +{ + const processorid_t id = CPU->cpu_seqid; + const uintptr_t hsave_pa = hma_svm_hsave_pa[id]; + uint64_t efer; + + VERIFY(hsave_pa != 0); + + /* Enable SVM via EFER */ + efer = rdmsr(MSR_AMD_EFER); + efer |= AMD_EFER_SVME; + wrmsr(MSR_AMD_EFER, efer); + + /* Setup hsave area */ + wrmsr(MSR_AMD_VM_HSAVE_PA, hsave_pa); + + hma_cpu_status[id] = HCS_READY; + return (0); +} + +static int +hma_svm_cpu_setup(cpu_setup_t what, int id, void *arg __unused) +{ + ASSERT(MUTEX_HELD(&cpu_lock)); + ASSERT(id >= 0 && id < NCPU); + + switch (what) { + case CPU_CONFIG: + case CPU_ON: + case CPU_INIT: + break; + default: + /* + * Other events, such as CPU offlining, are of no interest. + * Letting the SVM state linger should not cause any harm. + * + * This logic assumes that any offlining activity is strictly + * administrative in nature and will not alter any existing + * configuration (such as EFER bits previously set). + */ + return (0); + } + + /* Perform initialization if it has not been previously attempted. */ + if (hma_cpu_status[id] != HCS_UNINITIALIZED) { + return ((hma_cpu_status[id] == HCS_READY) ? 0 : -1); + } + + /* Allocate the hsave page for this CPU */ + if (hma_svm_hsave_page[id] == NULL) { + caddr_t va; + pfn_t pfn; + + va = kmem_alloc(PAGESIZE, KM_SLEEP); + VERIFY0((uintptr_t)va & PAGEOFFSET); + hma_svm_hsave_page[id] = va; + + /* + * Cache the physical address of the hsave page rather than + * looking it up later when the potential blocking of + * hat_getpfnum would be less acceptable. 
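To make the generation scheme in hma_svm_asid_update() above concrete, the following self-contained simulation models two vCPUs sharing one host CPU's ASID counter. It is a user-space re-implementation of the logic for illustration only, with a tiny ASID limit to force rollover; it does not call the kernel code.

```c
#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

/* Return codes mirroring VMCB_FLUSH_* in hma.c */
enum { FLUSH_NOTHING = 0x0, FLUSH_ALL = 0x1, FLUSH_ASID = 0x3 };

typedef struct { uint64_t gen; uint32_t asid; } asid_t;

static asid_t cpu_asid = { .gen = 1, .asid = 0 };	/* per host CPU */
static const uint32_t max_asid = 8;			/* tiny, to force rollover */

static int
asid_update(asid_t *vcpu, bool flush_by_asid, bool npt_flush)
{
	if (npt_flush && !flush_by_asid)
		vcpu->gen = 0;			/* force a fresh allocation */

	if (vcpu->gen != cpu_asid.gen) {
		if (++cpu_asid.asid >= max_asid) {
			cpu_asid.asid = 1;	/* skip ASID 0 (host) */
			if (++cpu_asid.gen == 0)
				cpu_asid.gen = 1;
		}
		*vcpu = cpu_asid;
		return (flush_by_asid ? FLUSH_ASID : FLUSH_ALL);
	} else if (npt_flush) {
		return (FLUSH_ASID);		/* same ASID, flush its entries */
	}
	return (FLUSH_NOTHING);
}

int
main(void)
{
	asid_t a = { 0, 0 }, b = { 0, 0 };	/* hma_svm_asid_init() equivalent */

	printf("vcpu A first run: flush=%d asid=%u\n", asid_update(&a, true, false), a.asid);
	printf("vcpu B first run: flush=%d asid=%u\n", asid_update(&b, true, false), b.asid);
	printf("vcpu A again:     flush=%d asid=%u\n", asid_update(&a, true, false), a.asid);
	printf("vcpu A NPT flush: flush=%d asid=%u\n", asid_update(&a, true, true), a.asid);
	return (0);
}
```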
+ */ + pfn = hat_getpfnum(kas.a_hat, va); + hma_svm_hsave_pa[id] = (pfn << PAGESHIFT); + } else { + VERIFY(hma_svm_hsave_pa[id] != 0); + } + + kpreempt_disable(); + if (CPU->cpu_seqid == id) { + /* Perform svm setup directly if this CPU is the target */ + (void) hma_svm_cpu_activate(0, 0, 0); + kpreempt_enable(); + } else { + cpuset_t set; + + /* Use a cross-call if a remote CPU is the target */ + kpreempt_enable(); + cpuset_zero(&set); + cpuset_add(&set, id); + xc_sync(0, 0, 0, CPUSET2BV(set), hma_svm_cpu_activate); + } + + return (hma_cpu_status[id] != HCS_READY); +} static int hma_svm_init(void) { - /* punt on AMD for now */ - return (ENOTSUP); + uint64_t msr; + const char *msg = NULL; + struct cpuid_regs regs; + cpu_t *cp; + + if (!is_x86_feature(x86_featureset, X86FSET_SVM)) { + msg = "CPU does not support SVM"; + goto bail; + } + + msr = rdmsr(MSR_AMD_VM_CR); + if ((msr & AMD_VM_CR_SVMDIS) != 0) { + msg = "SVM disabled by BIOS"; + goto bail; + } + + regs.cp_eax = 0x8000000a; + (void) cpuid_insn(NULL, ®s); + const uint32_t nasid = regs.cp_ebx; + const uint32_t feat = regs.cp_edx; + + if (nasid == 0) { + msg = "Not enough ASIDs for guests"; + goto bail; + } + if ((feat & CPUID_AMD_EDX_NESTED_PAGING) == 0) { + msg = "CPU does not support nested paging"; + goto bail; + } + if ((feat & CPUID_AMD_EDX_NRIPS) == 0) { + msg = "CPU does not support NRIP save"; + goto bail; + } + + hma_svm_features = feat; + hma_svm_max_asid = nasid; + + mutex_enter(&cpu_lock); + /* Perform SVM configuration for already-online CPUs. */ + cp = cpu_active; + do { + int err = hma_svm_cpu_setup(CPU_ON, cp->cpu_seqid, NULL); + if (err != 0) { + msg = "failure during SVM setup"; + mutex_exit(&cpu_lock); + goto bail; + } + } while ((cp = cp->cpu_next_onln) != cpu_active); + + /* + * Register callback for later-onlined CPUs and perform other remaining + * resource allocation. + */ + register_cpu_setup_func(hma_svm_cpu_setup, NULL); + mutex_exit(&cpu_lock); + + /* Initialize per-CPU ASID state. */ + for (uint_t i = 0; i < NCPU; i++) { + /* + * Skip past sentinel 0 value for generation. Doing so for + * ASID is unneeded, since it will be incremented during the + * first allocation. + */ + hma_svm_cpu_asid[i].hsa_gen = 1; + hma_svm_cpu_asid[i].hsa_asid = 0; + } + + hma_svm_ready = B_TRUE; + return (0); + +bail: + hma_svm_error = msg; + cmn_err(CE_NOTE, "hma_svm_init: %s", msg); + return (-1); } diff --git a/usr/src/uts/i86pc/sys/hma.h b/usr/src/uts/i86pc/sys/hma.h index 0c6161fdfc..86099b79e1 100644 --- a/usr/src/uts/i86pc/sys/hma.h +++ b/usr/src/uts/i86pc/sys/hma.h @@ -49,6 +49,15 @@ extern void hma_unregister(hma_reg_t *); extern uint16_t hma_vmx_vpid_alloc(void); extern void hma_vmx_vpid_free(uint16_t); +struct hma_svm_asid { + uint64_t hsa_gen; + uint32_t hsa_asid; +}; +typedef struct hma_svm_asid hma_svm_asid_t; + +extern void hma_svm_asid_init(hma_svm_asid_t *); +extern uint8_t hma_svm_asid_update(hma_svm_asid_t *, boolean_t, boolean_t); + /* * FPU related management. These functions provide a set of APIs to manage the * FPU state and switch between host and guest management of this state. diff --git a/usr/src/uts/i86pc/sys/vmm.h b/usr/src/uts/i86pc/sys/vmm.h index 163c0781cf..e5e5460211 100644 --- a/usr/src/uts/i86pc/sys/vmm.h +++ b/usr/src/uts/i86pc/sys/vmm.h @@ -38,7 +38,7 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2015 Pluribus Networks Inc. - * Copyright 2018 Joyent, Inc. + * Copyright 2019 Joyent, Inc. 
*/ #ifndef _VMM_H_ @@ -741,6 +741,8 @@ void vmm_sol_glue_cleanup(void); int vmm_mod_load(void); int vmm_mod_unload(void); +void vmm_call_trap(uint64_t); + /* * Because of tangled headers, these are mirrored by vmm_drv.h to present the * interface to driver consumers. diff --git a/usr/src/uts/intel/sys/controlregs.h b/usr/src/uts/intel/sys/controlregs.h index babf036e0b..0be7b3b650 100644 --- a/usr/src/uts/intel/sys/controlregs.h +++ b/usr/src/uts/intel/sys/controlregs.h @@ -200,6 +200,18 @@ extern "C" { #define MSR_AMD_KGSBASE 0xc0000102 /* swapgs swaps this with gsbase */ #define MSR_AMD_TSCAUX 0xc0000103 /* %ecx value on rdtscp insn */ + +/* AMD's SVM MSRs */ + +#define MSR_AMD_VM_CR 0xc0010114 /* SVM global control */ +#define MSR_AMD_VM_HSAVE_PA 0xc0010117 /* SVM host save area address */ + +#define AMD_VM_CR_DPD (1 << 0) +#define AMD_VM_CR_R_INIT (1 << 1) +#define AMD_VM_CR_DIS_A20M (1 << 2) +#define AMD_VM_CR_LOCK (1 << 3) +#define AMD_VM_CR_SVMDIS (1 << 4) + /* AMD's configuration MSRs, weakly documented in the revision guide */ #define MSR_AMD_DC_CFG 0xc0011022 diff --git a/usr/src/uts/intel/sys/x86_archext.h b/usr/src/uts/intel/sys/x86_archext.h index 943bdd8203..59a974dfd2 100644 --- a/usr/src/uts/intel/sys/x86_archext.h +++ b/usr/src/uts/intel/sys/x86_archext.h @@ -210,6 +210,18 @@ extern "C" { #define CPUID_AMD_EBX_SSB_NO 0x004000000 /* AMD: SSB Fixed */ /* + * AMD SVM features (extended function 0x8000000A). + */ +#define CPUID_AMD_EDX_NESTED_PAGING 0x000000001 /* AMD: SVM NP */ +#define CPUID_AMD_EDX_LBR_VIRT 0x000000002 /* AMD: LBR virt. */ +#define CPUID_AMD_EDX_SVML 0x000000004 /* AMD: SVM lock */ +#define CPUID_AMD_EDX_NRIPS 0x000000008 /* AMD: NRIP save */ +#define CPUID_AMD_EDX_TSC_RATE_MSR 0x000000010 /* AMD: MSR TSC ctrl */ +#define CPUID_AMD_EDX_VMCB_CLEAN 0x000000020 /* AMD: VMCB clean bits */ +#define CPUID_AMD_EDX_FLUSH_ASID 0x000000040 /* AMD: flush by ASID */ +#define CPUID_AMD_EDX_DECODE_ASSISTS 0x000000080 /* AMD: decode assists */ + +/* * Intel now seems to have claimed part of the "extended" function * space that we previously for non-Intel implementors to use. * More excitingly still, they've claimed bit 20 to mean LAHF/SAHF |
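As a closing aside, the new CPUID_AMD_EDX_* bits above (CPUID extended function 0x8000000A) can also be inspected from user space; a small, hedged example using GCC/Clang's <cpuid.h>, mirroring the checks hma_svm_init() performs in the kernel. The bit values and field meanings match the definitions added in x86_archext.h.

```c
#include <cpuid.h>
#include <stdio.h>

#define SVM_FEAT_NP		0x00000001	/* nested paging */
#define SVM_FEAT_NRIPS		0x00000008	/* NRIP save on #VMEXIT */
#define SVM_FEAT_FLUSH_ASID	0x00000040	/* TLB flush by ASID */
#define SVM_FEAT_DECODE		0x00000080	/* decode assists */

int
main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(0x8000000a, &eax, &ebx, &ecx, &edx)) {
		puts("CPUID leaf 0x8000000A not available (no SVM)");
		return (1);
	}
	printf("SVM revision:   %u\n", eax & 0xff);
	printf("ASIDs:          %u\n", ebx);
	printf("nested paging:  %s\n", (edx & SVM_FEAT_NP) ? "yes" : "no");
	printf("NRIP save:      %s\n", (edx & SVM_FEAT_NRIPS) ? "yes" : "no");
	printf("flush-by-ASID:  %s\n", (edx & SVM_FEAT_FLUSH_ASID) ? "yes" : "no");
	printf("decode assists: %s\n", (edx & SVM_FEAT_DECODE) ? "yes" : "no");
	return (0);
}
```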