| author | Patrick Mooney <pmooney@pfmooney.com> | 2022-06-10 23:05:32 +0000 |
|---|---|---|
| committer | Patrick Mooney <pmooney@oxide.computer> | 2022-06-27 23:20:35 +0000 |
| commit | 54cf5b63effe805271443d5dd7afd37ec184fbab (patch) | |
| tree | c2da22ed12b56879537c652b894932f257462d7b /usr | |
| parent | ea962d11118b10579c946c4ac15559148ddf3cf8 (diff) | |
| download | illumos-joyent-54cf5b63effe805271443d5dd7afd37ec184fbab.tar.gz | |
14635 bhyve should expose additional vcpu state
Reviewed by: Luqman Aden <luqman@oxide.computer>
Reviewed by: Jordan Paige Hendricks <jordan@oxidecomputer.com>
Approved by: Robert Mustacchi <rm@fingolfin.org>
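
Reviewer note: this change reworks the vmm data-transfer interface so that guest MSR state (along with XCR0 and the VM-wide timing fields) can be read and written from userspace. Consumers issue the VM_DATA_READ ioctl twice: once with a zero-length buffer, which fails with ENOSPC and reports the required size in vdx_result_len, and again with an adequately sized buffer. Below is a minimal sketch of that pattern, modeled on the show_msrs() helper added to bhyvectl.c in this diff; error handling is abbreviated, and the device fd is assumed to come from vm_get_device_fd().

```c
/*
 * Minimal sketch of the two-pass VDC_MSR read added in this change,
 * following show_msrs() from bhyvectl.c below.  Assumes the
 * <sys/vmm_dev.h> and <sys/vmm_data.h> definitions from this commit.
 */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/vmm_dev.h>
#include <sys/vmm_data.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static int
dump_guest_msrs(int vmfd, int vcpu)
{
	struct vm_data_xfer xfer = {
		.vdx_vcpuid = vcpu,
		.vdx_class = VDC_MSR,
		.vdx_version = 1,
		.vdx_len = 0,
		.vdx_data = NULL,
	};

	/* Pass 1: zero-length read; kernel sets vdx_result_len, ENOSPC */
	if (ioctl(vmfd, VM_DATA_READ, &xfer) == 0 || errno != ENOSPC)
		return (EINVAL);

	struct vdi_field_entry_v1 *msrs = calloc(1, xfer.vdx_result_len);
	if (msrs == NULL)
		return (ENOMEM);
	xfer.vdx_data = msrs;
	xfer.vdx_len = xfer.vdx_result_len;

	/* Pass 2: buffer is now adequately sized for every exported MSR */
	if (ioctl(vmfd, VM_DATA_READ, &xfer) != 0) {
		free(msrs);
		return (errno);
	}

	const uint_t count =
	    xfer.vdx_result_len / sizeof (struct vdi_field_entry_v1);
	for (uint_t i = 0; i < count; i++) {
		printf("msr[%08x] = %016lx\n",
		    msrs[i].vfe_ident, msrs[i].vfe_value);
	}
	free(msrs);
	return (0);
}
```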
Diffstat (limited to 'usr')
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | usr/src/cmd/bhyvectl/bhyvectl.c | 447 |
| -rw-r--r-- | usr/src/uts/intel/io/vmm/amd/svm.c | 64 |
| -rw-r--r-- | usr/src/uts/intel/io/vmm/amd/vmcb.c | 61 |
| -rw-r--r-- | usr/src/uts/intel/io/vmm/amd/vmcb.h | 1 |
| -rw-r--r-- | usr/src/uts/intel/io/vmm/intel/vmcs.c | 28 |
| -rw-r--r-- | usr/src/uts/intel/io/vmm/intel/vmcs.h | 1 |
| -rw-r--r-- | usr/src/uts/intel/io/vmm/intel/vmx.c | 242 |
| -rw-r--r-- | usr/src/uts/intel/io/vmm/sys/vmm_kernel.h | 23 |
| -rw-r--r-- | usr/src/uts/intel/io/vmm/vmm.c | 482 |
| -rw-r--r-- | usr/src/uts/intel/io/vmm/vmm_sol_dev.c | 65 |
| -rw-r--r-- | usr/src/uts/intel/io/vmm/x86.c | 62 |
| -rw-r--r-- | usr/src/uts/intel/io/vmm/x86.h | 85 |
| -rw-r--r-- | usr/src/uts/intel/sys/vmm.h | 1 |
| -rw-r--r-- | usr/src/uts/intel/sys/vmm_data.h | 42 |
| -rw-r--r-- | usr/src/uts/intel/sys/vmm_dev.h | 3 |
15 files changed, 1029 insertions, 578 deletions
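
The new VDC_VMM_ARCH class carries the VM-wide timing state (VAI_TSC_BOOT_OFFSET, VAI_BOOT_HRTIME, VAI_TSC_FREQ) used when saving or restoring an instance. When VDX_FLAG_READ_COPYIN is set, the caller pre-populates vfe_ident in each entry and the kernel fills in only those values; without the flag, all defined fields are emitted (see vmm_data_read_vmm_arch() in the diff below). A hedged sketch of a targeted read follows; passing -1 for vdx_vcpuid is an assumption, on the basis that the vcpu id should be ignored for VM-wide classes.

```c
/*
 * Sketch: targeted read of VM-wide arch fields via VDX_FLAG_READ_COPYIN.
 * Assumes the same headers as the MSR sketch above.  The -1 vcpuid is an
 * assumption; VDC_VMM_ARCH is VM-wide in this version, so the vcpu id
 * should not be consulted (see vmm_data_from_class() below).
 */
static int
read_boot_time_fields(int vmfd, uint64_t *tsc_off, uint64_t *boot_hrt)
{
	struct vdi_field_entry_v1 fields[2] = {
		{ .vfe_ident = VAI_TSC_BOOT_OFFSET },
		{ .vfe_ident = VAI_BOOT_HRTIME },
	};
	struct vm_data_xfer xfer = {
		.vdx_vcpuid = -1,
		.vdx_class = VDC_VMM_ARCH,
		.vdx_version = 1,
		.vdx_flags = VDX_FLAG_READ_COPYIN,
		.vdx_len = sizeof (fields),
		.vdx_data = fields,
	};

	if (ioctl(vmfd, VM_DATA_READ, &xfer) != 0)
		return (errno);

	/* The kernel filled vfe_value for exactly the idents we named */
	*tsc_off = fields[0].vfe_value;
	*boot_hrt = fields[1].vfe_value;
	return (0);
}
```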
diff --git a/usr/src/cmd/bhyvectl/bhyvectl.c b/usr/src/cmd/bhyvectl/bhyvectl.c index a6c86fd5fc..3b3caf0d20 100644 --- a/usr/src/cmd/bhyvectl/bhyvectl.c +++ b/usr/src/cmd/bhyvectl/bhyvectl.c @@ -51,9 +51,7 @@ __FBSDID("$FreeBSD$"); #include <sys/errno.h> #include <sys/mman.h> #include <sys/cpuset.h> -#ifndef __FreeBSD__ #include <sys/fp.h> -#endif /* __FreeBSD__ */ #include <stdio.h> #include <stdlib.h> @@ -72,6 +70,7 @@ __FBSDID("$FreeBSD$"); #include <machine/specialreg.h> #include <machine/vmm.h> #include <machine/vmm_dev.h> +#include <sys/vmm_data.h> #include <vmmapi.h> #include "amd/vmcb.h" @@ -95,10 +94,8 @@ usage(bool cpu_intel) " [--cpu=<vcpu_number>]\n" " [--create]\n" " [--destroy]\n" -#ifndef __FreeBSD__ " [--pmtmr-port=ioport]\n" " [--wrlock-cycle]\n" -#endif " [--get-all]\n" " [--get-stats]\n" " [--set-desc-ds]\n" @@ -186,9 +183,6 @@ usage(bool cpu_intel) " [--get-ldtr]\n" " [--set-x2apic-state=<state>]\n" " [--get-x2apic-state]\n" -#ifdef __FreeBSD__ - " [--unassign-pptdev=<bus/slot/func>]\n" -#endif " [--set-mem=<memory in units of MB>]\n" " [--get-lowmem]\n" " [--get-highmem]\n" @@ -307,16 +301,11 @@ static int set_cs, set_ds, set_es, set_fs, set_gs, set_ss, set_tr, set_ldtr; static int get_cs, get_ds, get_es, get_fs, get_gs, get_ss, get_tr, get_ldtr; static int set_x2apic_state, get_x2apic_state; enum x2apic_state x2apic_state; -#ifdef __FreeBSD__ -static int unassign_pptdev, bus, slot, func; -#endif static int run; static int get_cpu_topology; -#ifndef __FreeBSD__ static int pmtmr_port; static int wrlock_cycle; static int get_fpu; -#endif /* * VMCB specific. @@ -339,12 +328,13 @@ static int get_cr4_mask, get_cr4_shadow; static int get_cr3_targets; static int get_apic_access_addr, get_virtual_apic_addr, get_tpr_threshold; static int get_msr_bitmap, get_msr_bitmap_address; +static int get_guest_msrs; static int get_vpid_asid; static int get_inst_err, get_exit_ctls, get_entry_ctls; static int get_host_cr0, get_host_cr3, get_host_cr4; static int get_host_rip, get_host_rsp; -static int get_guest_pat, get_host_pat; -static int get_guest_sysenter, get_vmcs_link; +static int get_host_pat; +static int get_vmcs_link; static int get_exit_reason, get_vmcs_exit_qualification; static int get_vmcs_exit_interruption_info, get_vmcs_exit_interruption_error; static int get_vmcs_exit_inst_length; @@ -406,172 +396,7 @@ dump_vm_run_exitcode(struct vm_exit *vmexit, int vcpu) #define MSR_AMD7TH_START 0xC0010000 #define MSR_AMD7TH_END 0xC0011FFF -#ifdef __FreeBSD__ -static const char * -msr_name(uint32_t msr) -{ - static char buf[32]; - - switch(msr) { - case MSR_TSC: - return ("MSR_TSC"); - case MSR_EFER: - return ("MSR_EFER"); - case MSR_STAR: - return ("MSR_STAR"); - case MSR_LSTAR: - return ("MSR_LSTAR"); - case MSR_CSTAR: - return ("MSR_CSTAR"); - case MSR_SF_MASK: - return ("MSR_SF_MASK"); - case MSR_FSBASE: - return ("MSR_FSBASE"); - case MSR_GSBASE: - return ("MSR_GSBASE"); - case MSR_KGSBASE: - return ("MSR_KGSBASE"); - case MSR_SYSENTER_CS_MSR: - return ("MSR_SYSENTER_CS_MSR"); - case MSR_SYSENTER_ESP_MSR: - return ("MSR_SYSENTER_ESP_MSR"); - case MSR_SYSENTER_EIP_MSR: - return ("MSR_SYSENTER_EIP_MSR"); - case MSR_PAT: - return ("MSR_PAT"); - } - snprintf(buf, sizeof(buf), "MSR %#08x", msr); - - return (buf); -} - -static inline void -print_msr_pm(uint64_t msr, int vcpu, int readable, int writeable) -{ - - if (readable || writeable) { - printf("%-20s[%d]\t\t%c%c\n", msr_name(msr), vcpu, - readable ? 'R' : '-', writeable ? 
'W' : '-'); - } -} - -/* - * Reference APM vol2, section 15.11 MSR Intercepts. - */ -static void -dump_amd_msr_pm(const char *bitmap, int vcpu) -{ - int byte, bit, readable, writeable; - uint32_t msr; - - for (msr = 0; msr < 0x2000; msr++) { - byte = msr / 4; - bit = (msr % 4) * 2; - - /* Look at MSRs in the range 0x00000000 to 0x00001FFF */ - readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; - writeable = (bitmap[byte] & (2 << bit)) ? 0 : 1; - print_msr_pm(msr, vcpu, readable, writeable); - - /* Look at MSRs in the range 0xC0000000 to 0xC0001FFF */ - byte += 2048; - readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; - writeable = (bitmap[byte] & (2 << bit)) ? 0 : 1; - print_msr_pm(msr + MSR_AMD6TH_START, vcpu, readable, - writeable); - - /* MSR 0xC0010000 to 0xC0011FF is only for AMD */ - byte += 4096; - readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; - writeable = (bitmap[byte] & (2 << bit)) ? 0 : 1; - print_msr_pm(msr + MSR_AMD7TH_START, vcpu, readable, - writeable); - } -} - -/* - * Reference Intel SDM Vol3 Section 24.6.9 MSR-Bitmap Address - */ -static void -dump_intel_msr_pm(const char *bitmap, int vcpu) -{ - int byte, bit, readable, writeable; - uint32_t msr; - - for (msr = 0; msr < 0x2000; msr++) { - byte = msr / 8; - bit = msr & 0x7; - - /* Look at MSRs in the range 0x00000000 to 0x00001FFF */ - readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; - writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1; - print_msr_pm(msr, vcpu, readable, writeable); - - /* Look at MSRs in the range 0xC0000000 to 0xC0001FFF */ - byte += 1024; - readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; - writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1; - print_msr_pm(msr + MSR_AMD6TH_START, vcpu, readable, - writeable); - } -} - -static int -dump_msr_bitmap(int vcpu, uint64_t addr, bool cpu_intel) -{ - int error, fd, map_size; - const char *bitmap; - - error = -1; - bitmap = MAP_FAILED; - - fd = open("/dev/mem", O_RDONLY, 0); - if (fd < 0) { - perror("Couldn't open /dev/mem"); - goto done; - } - - if (cpu_intel) - map_size = PAGE_SIZE; - else - map_size = 2 * PAGE_SIZE; - - bitmap = mmap(NULL, map_size, PROT_READ, MAP_SHARED, fd, addr); - if (bitmap == MAP_FAILED) { - perror("mmap failed"); - goto done; - } - - if (cpu_intel) - dump_intel_msr_pm(bitmap, vcpu); - else - dump_amd_msr_pm(bitmap, vcpu); - - error = 0; -done: - if (bitmap != MAP_FAILED) - munmap((void *)bitmap, map_size); - if (fd >= 0) - close(fd); - - return (error); -} - -static int -vm_get_vmcs_field(struct vmctx *ctx, int vcpu, int field, uint64_t *ret_val) -{ - - return (vm_get_register(ctx, vcpu, VMCS_IDENT(field), ret_val)); -} - -static int -vm_set_vmcs_field(struct vmctx *ctx, int vcpu, int field, uint64_t val) -{ - - return (vm_set_register(ctx, vcpu, VMCS_IDENT(field), val)); -} -#else /* __FreeBSD__ */ -/* VMCS does not allow arbitrary reads/writes */ +/* Until a safe method is created, arbitrary VMCS reads/writes are forbidden */ static int vm_get_vmcs_field(struct vmctx *ctx, int vcpu, int field, uint64_t *ret_val) { @@ -584,29 +409,11 @@ vm_set_vmcs_field(struct vmctx *ctx, int vcpu, int field, uint64_t val) { return (EINVAL); } -#endif /* __FreeBSD__ */ - -#ifdef __FreeBSD__ -static int -vm_get_vmcb_field(struct vmctx *ctx, int vcpu, int off, int bytes, - uint64_t *ret_val) -{ - - return (vm_get_register(ctx, vcpu, VMCB_ACCESS(off, bytes), ret_val)); -} -static int -vm_set_vmcb_field(struct vmctx *ctx, int vcpu, int off, int bytes, - uint64_t val) -{ - - return (vm_set_register(ctx, vcpu, VMCB_ACCESS(off, bytes), val)); -} -#else 
/* __FreeBSD__ */ -/* Arbitrary VMCB read/write is not allowed */ +/* Until a safe method is created, arbitrary VMCB reads/writes are forbidden */ static int vm_get_vmcb_field(struct vmctx *ctx, int vcpu, int off, int bytes, - uint64_t *ret_val) + uint64_t *ret_val) { *ret_val = 0; return (0); @@ -614,11 +421,10 @@ vm_get_vmcb_field(struct vmctx *ctx, int vcpu, int off, int bytes, static int vm_set_vmcb_field(struct vmctx *ctx, int vcpu, int off, int bytes, - uint64_t val) + uint64_t val) { return (EINVAL); } -#endif /* __FreeBSD__ */ enum { VMNAME = 1000, /* avoid collision with return values from getopt */ @@ -661,9 +467,7 @@ enum { SET_RTC_TIME, SET_RTC_NVRAM, RTC_NVRAM_OFFSET, -#ifndef __FreeBSD__ PMTMR_PORT, -#endif }; static void @@ -686,38 +490,6 @@ print_cpus(const char *banner, const cpuset_t *cpus) printf("\n"); } -#ifdef __FreeBSD__ -static void -print_intinfo(const char *banner, uint64_t info) -{ - int type; - - printf("%s:\t", banner); - if (info & VM_INTINFO_VALID) { - type = info & VM_INTINFO_TYPE; - switch (type) { - case VM_INTINFO_HWINTR: - printf("extint"); - break; - case VM_INTINFO_NMI: - printf("nmi"); - break; - case VM_INTINFO_SWINTR: - printf("swint"); - break; - default: - printf("exception"); - break; - } - printf(" vector %d", (int)VM_INTINFO_VECTOR(info)); - if (info & VM_INTINFO_DEL_ERRCODE) - printf(" errcode %#x", (u_int)(info >> 32)); - } else { - printf("n/a"); - } - printf("\n"); -} -#else /* __FreeBSD__ */ static void print_intinfo(const char *banner, uint64_t info) { @@ -746,7 +518,6 @@ print_intinfo(const char *banner, uint64_t info) } printf("\n"); } -#endif /* __FreeBSD__ */ static bool cpu_vendor_intel(void) @@ -1141,7 +912,7 @@ get_misc_vmcs(struct vmctx *ctx, int vcpu) if (error == 0) printf("cr4_shadow[%d]\t\t0x%016lx\n", vcpu, cr4shadow); } - + if (!error && (get_cr3_targets || get_all)) { uint64_t target_count, target_addr; error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET_COUNT, @@ -1214,7 +985,7 @@ get_misc_vmcs(struct vmctx *ctx, int vcpu) printf("gpa[%d]\t\t0x%016lx\n", vcpu, u64); } - if (!error && (get_vmcs_entry_interruption_info || + if (!error && (get_vmcs_entry_interruption_info || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO,&u64); if (error == 0) { @@ -1336,7 +1107,7 @@ get_misc_vmcs(struct vmctx *ctx, int vcpu) printf("vmcs_exit_qualification[%d]\t0x%016lx\n", vcpu, u64); } - + return (error); } @@ -1549,9 +1320,7 @@ setup_options(bool cpu_intel) NO_ARG, &get_msr_bitmap, 1 }, { "get-msr-bitmap-address", NO_ARG, &get_msr_bitmap_address, 1 }, - { "get-guest-pat", NO_ARG, &get_guest_pat, 1 }, - { "get-guest-sysenter", - NO_ARG, &get_guest_sysenter, 1 }, + { "get-guest-msrs", NO_ARG, &get_guest_msrs, 1 }, { "get-exit-reason", NO_ARG, &get_exit_reason, 1 }, { "get-x2apic-state", NO_ARG, &get_x2apic_state, 1 }, @@ -1566,11 +1335,9 @@ setup_options(bool cpu_intel) { "get-suspended-cpus", NO_ARG, &get_suspended_cpus, 1 }, { "get-intinfo", NO_ARG, &get_intinfo, 1 }, { "get-cpu-topology", NO_ARG, &get_cpu_topology, 1 }, -#ifndef __FreeBSD__ { "pmtmr-port", REQ_ARG, 0, PMTMR_PORT }, { "wrlock-cycle", NO_ARG, &wrlock_cycle, 1 }, { "get-fpu", NO_ARG, &get_fpu, 1 }, -#endif }; const struct option intel_opts[] = { @@ -1632,7 +1399,7 @@ setup_options(bool cpu_intel) const struct option amd_opts[] = { { "get-vmcb-intercepts", NO_ARG, &get_vmcb_intercept, 1 }, - { "get-vmcb-asid", + { "get-vmcb-asid", NO_ARG, &get_vpid_asid, 1 }, { "get-vmcb-exit-details", NO_ARG, &get_vmcb_exit_details, 1 }, @@ -1788,7 +1555,6 @@ 
show_memseg(struct vmctx *ctx) } } -#ifndef __FreeBSD__ static int show_fpu(struct vmctx *ctx, int vcpu) { @@ -1873,7 +1639,87 @@ show_fpu(struct vmctx *ctx, int vcpu) free(buf); return (0); } -#endif /*__FreeBSD__ */ + +static const char * +msr_name(uint32_t msr) +{ +#define MSR_IDENT_MAP(x) case x: return (#x); + switch (msr) { + MSR_IDENT_MAP(MSR_PAT) + MSR_IDENT_MAP(MSR_SYSENTER_CS_MSR) + MSR_IDENT_MAP(MSR_SYSENTER_ESP_MSR) + MSR_IDENT_MAP(MSR_SYSENTER_EIP_MSR) + MSR_IDENT_MAP(MSR_STAR) + MSR_IDENT_MAP(MSR_LSTAR) + MSR_IDENT_MAP(MSR_CSTAR) + MSR_IDENT_MAP(MSR_SF_MASK) + MSR_IDENT_MAP(MSR_FSBASE) + MSR_IDENT_MAP(MSR_GSBASE) + MSR_IDENT_MAP(MSR_KGSBASE) + MSR_IDENT_MAP(MSR_EFER) + MSR_IDENT_MAP(MSR_MTRRcap) + MSR_IDENT_MAP(MSR_MTRRdefType) + case MSR_TSC: + return ("MSR_TSC (offset from system boot)"); + default: + return (NULL); + } +} + +static int +show_msrs(struct vmctx *ctx, int vcpu) +{ + struct vdi_field_entry_v1 *msrs; + struct vm_data_xfer xfer = { + .vdx_vcpuid = vcpu, + .vdx_class = VDC_MSR, + .vdx_version = 1, + .vdx_len = 0, + .vdx_data = &msrs, + }; + int fd = vm_get_device_fd(ctx); + int res; + + /* Figure out how many entries we need to alloc for */ + res = ioctl(fd, VM_DATA_READ, &xfer); + if (res == 0) { + return (EINVAL); + } else if (errno != ENOSPC) { + return (errno); + } + const uint32_t len = xfer.vdx_result_len; + msrs = malloc(len); + if (msrs == NULL) { + return (ENOMEM); + } + bzero(msrs, len); + xfer.vdx_data = msrs; + xfer.vdx_len = len; + + /* Query the actual data, now that we should have an adequate buffer */ + res = ioctl(fd, VM_DATA_READ, &xfer); + if (res != 0) { + free(msrs); + return (errno); + } + + const uint_t count = + xfer.vdx_result_len / sizeof (struct vdi_field_entry_v1); + for (uint_t i = 0; i < count; i++) { + const uint32_t ident = msrs[i].vfe_ident; + const uint64_t value = msrs[i].vfe_value; + + const char *name = msr_name(ident); + + if (name != NULL) { + printf("msr[%s]\t = %x\n", name, value); + } else { + printf("msr[%08x]\t = %x\n", ident, value); + } + } + free(msrs); + return (0); +} int main(int argc, char *argv[]) @@ -1883,7 +1729,7 @@ main(int argc, char *argv[]) vm_paddr_t gpa_pmap; struct vm_exit vmexit; uint64_t rax, cr0, cr2, cr3, cr4, dr0, dr1, dr2, dr3, dr6, dr7; - uint64_t rsp, rip, rflags, efer, pat; + uint64_t rsp, rip, rflags, efer; uint64_t eptp, bm, addr, u64, pteval[4], *pte, info[2]; struct vmctx *ctx; cpuset_t cpus; @@ -2049,21 +1895,12 @@ main(int argc, char *argv[]) case CAPNAME: capname = optarg; break; -#ifdef __FreeBSD__ - case UNASSIGN_PPTDEV: - unassign_pptdev = 1; - if (sscanf(optarg, "%d/%d/%d", &bus, &slot, &func) != 3) - usage(cpu_intel); - break; -#endif case ASSERT_LAPIC_LVT: assert_lapic_lvt = atoi(optarg); break; -#ifndef __FreeBSD__ case PMTMR_PORT: pmtmr_port = strtoul(optarg, NULL, 16); break; -#endif default: usage(cpu_intel); } @@ -2076,13 +1913,8 @@ main(int argc, char *argv[]) error = 0; -#ifndef __FreeBSD__ if (!error && create) error = vm_create(vmname, 0); -# else - if (!error && create) - error = vm_create(vmname); -#endif /* __FreeBSD__ */ if (!error) { ctx = vm_open(vmname); @@ -2094,16 +1926,15 @@ main(int argc, char *argv[]) } } -#ifndef __FreeBSD__ if (!error && pmtmr_port) { error = vm_pmtmr_set_location(ctx, pmtmr_port); exit(error); } + if (!error && wrlock_cycle) { error = vm_wrlock_cycle(ctx); exit(error); } -#endif /* __FreeBSD__ */ if (!error && memsize) error = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); @@ -2232,11 +2063,6 @@ main(int argc, char *argv[]) if (!error && 
set_x2apic_state) error = vm_set_x2apic_state(ctx, vcpu, x2apic_state); -#ifdef __FreeBSD__ - if (!error && unassign_pptdev) - error = vm_unassign_pptdev(ctx, bus, slot, func); -#endif /* __FreeBSD__ */ - if (!error && set_exception_bitmap) { if (cpu_intel) error = vm_set_vmcs_field(ctx, vcpu, @@ -2273,11 +2099,9 @@ main(int argc, char *argv[]) if (!error) error = get_all_segments(ctx, vcpu); -#ifndef __FreeBSD__ if (!error && (get_fpu || get_all)) { error = show_fpu(ctx, vcpu); } -#endif /* __FreeBSD__ */ if (!error) { if (cpu_intel) @@ -2285,7 +2109,7 @@ main(int argc, char *argv[]) else error = get_misc_vmcb(ctx, vcpu); } - + if (!error && (get_x2apic_state || get_all)) { error = vm_get_x2apic_state(ctx, vcpu, &x2apic_state); if (error == 0) @@ -2340,7 +2164,7 @@ main(int argc, char *argv[]) &tscoff); else error = vm_get_vmcb_field(ctx, vcpu, - VMCB_OFF_TSC_OFFSET, + VMCB_OFF_TSC_OFFSET, 8, &tscoff); if (error == 0) printf("tsc_offset[%d]\t0x%016lx\n", vcpu, tscoff); @@ -2348,7 +2172,7 @@ main(int argc, char *argv[]) if (!error && (get_msr_bitmap_address || get_all)) { if (cpu_intel) - error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, + error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, &addr); else error = vm_get_vmcb_field(ctx, vcpu, @@ -2357,90 +2181,27 @@ main(int argc, char *argv[]) printf("msr_bitmap[%d]\t\t%#lx\n", vcpu, addr); } - if (!error && (get_msr_bitmap || get_all)) { - if (cpu_intel) { - error = vm_get_vmcs_field(ctx, vcpu, - VMCS_MSR_BITMAP, &addr); - } else { - error = vm_get_vmcb_field(ctx, vcpu, - VMCB_OFF_MSR_PERM, 8, - &addr); - } - -#ifdef __FreeBSD__ - if (error == 0) - error = dump_msr_bitmap(vcpu, addr, cpu_intel); -#else - /* - * Skip dumping the MSR bitmap since raw access to the VMCS is - * currently not possible. - */ -#endif /* __FreeBSD__ */ - } - if (!error && (get_vpid_asid || get_all)) { uint64_t vpid; if (cpu_intel) error = vm_get_vmcs_field(ctx, vcpu, VMCS_VPID, &vpid); else - error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_ASID, + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_ASID, 4, &vpid); if (error == 0) - printf("%s[%d]\t\t0x%04lx\n", + printf("%s[%d]\t\t0x%04lx\n", cpu_intel ? 
"vpid" : "asid", vcpu, vpid); } - if (!error && (get_guest_pat || get_all)) { - if (cpu_intel) - error = vm_get_vmcs_field(ctx, vcpu, - VMCS_GUEST_IA32_PAT, &pat); - else - error = vm_get_vmcb_field(ctx, vcpu, - VMCB_OFF_GUEST_PAT, 8, &pat); - if (error == 0) - printf("guest_pat[%d]\t\t0x%016lx\n", vcpu, pat); - } - - if (!error && (get_guest_sysenter || get_all)) { - if (cpu_intel) - error = vm_get_vmcs_field(ctx, vcpu, - VMCS_GUEST_IA32_SYSENTER_CS, - &cs); - else - error = vm_get_vmcb_field(ctx, vcpu, - VMCB_OFF_SYSENTER_CS, 8, - &cs); - - if (error == 0) - printf("guest_sysenter_cs[%d]\t%#lx\n", vcpu, cs); - if (cpu_intel) - error = vm_get_vmcs_field(ctx, vcpu, - VMCS_GUEST_IA32_SYSENTER_ESP, - &rsp); - else - error = vm_get_vmcb_field(ctx, vcpu, - VMCB_OFF_SYSENTER_ESP, 8, - &rsp); - - if (error == 0) - printf("guest_sysenter_sp[%d]\t%#lx\n", vcpu, rsp); - if (cpu_intel) - error = vm_get_vmcs_field(ctx, vcpu, - VMCS_GUEST_IA32_SYSENTER_EIP, - &rip); - else - error = vm_get_vmcb_field(ctx, vcpu, - VMCB_OFF_SYSENTER_EIP, 8, - &rip); - if (error == 0) - printf("guest_sysenter_ip[%d]\t%#lx\n", vcpu, rip); + if (!error && (get_guest_msrs || get_all)) { + error = show_msrs(ctx, vcpu); } if (!error && (get_exit_reason || get_all)) { if (cpu_intel) error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_REASON, &u64); - else + else error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_EXIT_REASON, 8, &u64); diff --git a/usr/src/uts/intel/io/vmm/amd/svm.c b/usr/src/uts/intel/io/vmm/amd/svm.c index b699d57991..de4a492ae9 100644 --- a/usr/src/uts/intel/io/vmm/amd/svm.c +++ b/usr/src/uts/intel/io/vmm/amd/svm.c @@ -72,7 +72,6 @@ __FBSDID("$FreeBSD$"); #include "vlapic.h" #include "vlapic_priv.h" -#include "x86.h" #include "vmcb.h" #include "svm.h" #include "svm_softc.h" @@ -2257,6 +2256,17 @@ svm_setdesc(void *arg, int vcpu, int reg, const struct seg_desc *desc) if (SEG_DESC_UNUSABLE(desc->access)) { seg->attrib &= ~0x80; } + /* + * Keep CPL synced with the DPL specified for %ss. + * + * KVM notes that a SYSRET to non-cpl-3 is possible on AMD + * (unlike Intel), but accepts such a possible deviation for + * what is otherwise unreasonable behavior for a guest OS, since + * they do the same synchronization. 
+ */ + if (reg == VM_REG_GUEST_SS) { + vmcb->state.cpl = SEG_DESC_DPL(desc->access); + } break; case VM_REG_GUEST_GDTR: @@ -2339,6 +2349,55 @@ svm_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) } static int +svm_get_msr(void *arg, int vcpu, uint32_t msr, uint64_t *valp) +{ + struct svm_softc *sc = arg; + struct vmcb *vmcb = svm_get_vmcb(sc, vcpu); + const uint64_t *msrp = vmcb_msr_ptr(vmcb, msr, NULL); + + if (msrp != NULL) { + *valp = *msrp; + return (0); + } + + return (EINVAL); +} + +static int +svm_set_msr(void *arg, int vcpu, uint32_t msr, uint64_t val) +{ + struct svm_softc *sc = arg; + struct vmcb *vmcb = svm_get_vmcb(sc, vcpu); + + uint32_t dirty = 0; + uint64_t *msrp = vmcb_msr_ptr(vmcb, msr, &dirty); + if (msrp == NULL) { + return (EINVAL); + } + switch (msr) { + case MSR_EFER: + /* + * For now, just clone the logic from + * svm_setreg(): + * + * EFER_SVM must always be set when the guest is + * executing + */ + *msrp = val | EFER_SVM; + break; + /* TODO: other necessary MSR masking */ + default: + *msrp = val; + break; + } + if (dirty != 0) { + svm_set_dirty(sc, vcpu, dirty); + } + return (0); + +} + +static int svm_setcap(void *arg, int vcpu, int type, int val) { struct svm_softc *sc; @@ -2450,4 +2509,7 @@ struct vmm_ops vmm_ops_amd = { .vmsavectx = svm_savectx, .vmrestorectx = svm_restorectx, + + .vmgetmsr = svm_get_msr, + .vmsetmsr = svm_set_msr, }; diff --git a/usr/src/uts/intel/io/vmm/amd/vmcb.c b/usr/src/uts/intel/io/vmm/amd/vmcb.c index 5be5240129..ec2c9674c0 100644 --- a/usr/src/uts/intel/io/vmm/amd/vmcb.c +++ b/usr/src/uts/intel/io/vmm/amd/vmcb.c @@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> +#include <machine/specialreg.h> #include <machine/vmm.h> #include "vmcb.h" @@ -148,3 +149,63 @@ vmcb_regptr(struct vmcb *vmcb, int ident, uint32_t *dirtyp) } return (res); } + +uint64_t * +vmcb_msr_ptr(struct vmcb *vmcb, uint32_t msr, uint32_t *dirtyp) +{ + uint64_t *res = NULL; + uint32_t dirty = 0; + struct vmcb_state *state = &vmcb->state; + + switch (msr) { + case MSR_EFER: + res = &state->efer; + dirty = VMCB_CACHE_CR; + break; + + case MSR_GSBASE: + res = &state->gs.base; + dirty = VMCB_CACHE_SEG; + break; + case MSR_FSBASE: + res = &state->fs.base; + dirty = VMCB_CACHE_SEG; + break; + case MSR_KGSBASE: + res = &state->kernelgsbase; + break; + + case MSR_STAR: + res = &state->star; + break; + case MSR_LSTAR: + res = &state->lstar; + break; + case MSR_CSTAR: + res = &state->cstar; + break; + case MSR_SF_MASK: + res = &state->sfmask; + break; + + case MSR_SYSENTER_CS_MSR: + res = &state->sysenter_cs; + break; + case MSR_SYSENTER_ESP_MSR: + res = &state->sysenter_esp; + break; + case MSR_SYSENTER_EIP_MSR: + res = &state->sysenter_eip; + break; + + case MSR_PAT: + res = &state->g_pat; + dirty = VMCB_CACHE_NP; + break; + } + + if (res != NULL && dirtyp != NULL) { + *dirtyp = dirty; + } + return (res); +} diff --git a/usr/src/uts/intel/io/vmm/amd/vmcb.h b/usr/src/uts/intel/io/vmm/amd/vmcb.h index da0f08445c..7a57979d56 100644 --- a/usr/src/uts/intel/io/vmm/amd/vmcb.h +++ b/usr/src/uts/intel/io/vmm/amd/vmcb.h @@ -397,6 +397,7 @@ CTASSERT(offsetof(struct vmcb, state) == 0x400); struct vmcb_segment *vmcb_segptr(struct vmcb *vmcb, int type); uint64_t *vmcb_regptr(struct vmcb *vmcb, int ident, uint32_t *dirtyp); +uint64_t *vmcb_msr_ptr(struct vmcb *vmcb, uint32_t ident, uint32_t *dirtyp); #endif /* _KERNEL */ #endif /* _VMCB_H_ */ diff --git a/usr/src/uts/intel/io/vmm/intel/vmcs.c b/usr/src/uts/intel/io/vmm/intel/vmcs.c index 
7fabba79f7..b5bc8130d9 100644 --- a/usr/src/uts/intel/io/vmm/intel/vmcs.c +++ b/usr/src/uts/intel/io/vmm/intel/vmcs.c @@ -165,6 +165,34 @@ vmcs_seg_desc_encoding(int seg, uint32_t *base, uint32_t *lim, uint32_t *acc) } } +uint32_t +vmcs_msr_encoding(uint32_t msr) +{ + switch (msr) { + case MSR_PAT: + return (VMCS_GUEST_IA32_PAT); + case MSR_EFER: + return (VMCS_GUEST_IA32_EFER); + case MSR_SYSENTER_CS_MSR: + return (VMCS_GUEST_IA32_SYSENTER_CS); + case MSR_SYSENTER_ESP_MSR: + return (VMCS_GUEST_IA32_SYSENTER_ESP); + case MSR_SYSENTER_EIP_MSR: + return (VMCS_GUEST_IA32_SYSENTER_EIP); + /* + * While fsbase and gsbase are expected to be accessed (by the VMM) via + * the segment descriptor interfaces, we still make it available as MSR + * contents as well. + */ + case MSR_FSBASE: + return (VMCS_GUEST_FS_BASE); + case MSR_GSBASE: + return (VMCS_GUEST_GS_BASE); + default: + return (VMCS_INVALID_ENCODING); + } +} + void vmcs_clear(uintptr_t vmcs_pa) { diff --git a/usr/src/uts/intel/io/vmm/intel/vmcs.h b/usr/src/uts/intel/io/vmm/intel/vmcs.h index 24dc2dd574..9e4a9e3282 100644 --- a/usr/src/uts/intel/io/vmm/intel/vmcs.h +++ b/usr/src/uts/intel/io/vmm/intel/vmcs.h @@ -48,6 +48,7 @@ CTASSERT(sizeof (struct vmcs) == PAGE_SIZE); uint32_t vmcs_field_encoding(int ident); void vmcs_seg_desc_encoding(int seg, uint32_t *base, uint32_t *lim, uint32_t *acc); +uint32_t vmcs_msr_encoding(uint32_t msr); void vmcs_initialize(struct vmcs *vmcs, uintptr_t vmcs_pa); diff --git a/usr/src/uts/intel/io/vmm/intel/vmx.c b/usr/src/uts/intel/io/vmm/intel/vmx.c index 360cec1056..4ef51259ab 100644 --- a/usr/src/uts/intel/io/vmm/intel/vmx.c +++ b/usr/src/uts/intel/io/vmm/intel/vmx.c @@ -85,7 +85,6 @@ __FBSDID("$FreeBSD$"); #include "vmcs.h" #include "vmx.h" #include "vmx_msr.h" -#include "x86.h" #include "vmx_controls.h" #define PINBASED_CTLS_ONE_SETTING \ @@ -1629,6 +1628,25 @@ vmx_set_guest_reg(struct vmx *vmx, int vcpu, int ident, uint64_t regval) } } +static void +vmx_sync_efer_state(struct vmx *vmx, int vcpu, uint64_t efer) +{ + uint64_t ctrl; + + /* + * If the "load EFER" VM-entry control is 1 (which we require) then the + * value of EFER.LMA must be identical to "IA-32e mode guest" bit in the + * VM-entry control. + */ + ctrl = vmcs_read(VMCS_ENTRY_CTLS); + if ((efer & EFER_LMA) != 0) { + ctrl |= VM_ENTRY_GUEST_LMA; + } else { + ctrl &= ~VM_ENTRY_GUEST_LMA; + } + vmcs_write(VMCS_ENTRY_CTLS, ctrl); +} + static int vmx_emulate_cr0_access(struct vmx *vmx, int vcpu, uint64_t exitqual) { @@ -1655,20 +1673,14 @@ vmx_emulate_cr0_access(struct vmx *vmx, int vcpu, uint64_t exitqual) vmcs_write(VMCS_GUEST_CR0, crval); if (regval & CR0_PG) { - uint64_t efer, entry_ctls; + uint64_t efer; - /* - * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and - * the "IA-32e mode guest" bit in VM-entry control must be - * equal. - */ + /* Keep EFER.LMA properly updated if paging is enabled */ efer = vmcs_read(VMCS_GUEST_IA32_EFER); if (efer & EFER_LME) { efer |= EFER_LMA; vmcs_write(VMCS_GUEST_IA32_EFER, efer); - entry_ctls = vmcs_read(VMCS_ENTRY_CTLS); - entry_ctls |= VM_ENTRY_GUEST_LMA; - vmcs_write(VMCS_ENTRY_CTLS, entry_ctls); + vmx_sync_efer_state(vmx, vcpu, efer); } } @@ -2934,6 +2946,44 @@ vmx_vmcleanup(void *arg) kmem_free(vmx, sizeof (*vmx)); } +/* + * Ensure that the VMCS for this vcpu is loaded. + * Returns true if a VMCS load was required. 
+ */ +static bool +vmx_vmcs_access_ensure(struct vmx *vmx, int vcpu) +{ + int hostcpu; + + if (vcpu_is_running(vmx->vm, vcpu, &hostcpu)) { + if (hostcpu != curcpu) { + panic("unexpected vcpu migration %d != %d", + hostcpu, curcpu); + } + /* Earlier logic already took care of the load */ + return (false); + } else { + vmcs_load(vmx->vmcs_pa[vcpu]); + return (true); + } +} + +static void +vmx_vmcs_access_done(struct vmx *vmx, int vcpu) +{ + int hostcpu; + + if (vcpu_is_running(vmx->vm, vcpu, &hostcpu)) { + if (hostcpu != curcpu) { + panic("unexpected vcpu migration %d != %d", + hostcpu, curcpu); + } + /* Later logic will take care of the unload */ + } else { + vmcs_clear(vmx->vmcs_pa[vcpu]); + } +} + static uint64_t * vmxctx_regptr(struct vmxctx *vmxctx, int reg) { @@ -2989,25 +3039,18 @@ vmxctx_regptr(struct vmxctx *vmxctx, int reg) static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval) { - int running, hostcpu, err; struct vmx *vmx = arg; uint64_t *regp; - running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); - if (running && hostcpu != curcpu) - panic("vmx_getreg: %d is running", vcpu); - /* VMCS access not required for ctx reads */ if ((regp = vmxctx_regptr(&vmx->ctx[vcpu], reg)) != NULL) { *retval = *regp; return (0); } - if (!running) { - vmcs_load(vmx->vmcs_pa[vcpu]); - } + bool vmcs_loaded = vmx_vmcs_access_ensure(vmx, vcpu); + int err = 0; - err = 0; if (reg == VM_REG_GUEST_INTR_SHADOW) { uint64_t gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); *retval = (gi & HWINTR_BLOCKING) ? 1 : 0; @@ -3035,33 +3078,26 @@ vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval) } } - if (!running) { - vmcs_clear(vmx->vmcs_pa[vcpu]); + if (vmcs_loaded) { + vmx_vmcs_access_done(vmx, vcpu); } - return (err); } static int vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) { - int running, hostcpu, error; struct vmx *vmx = arg; uint64_t *regp; - running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); - if (running && hostcpu != curcpu) - panic("vmx_setreg: %d is running", vcpu); - /* VMCS access not required for ctx writes */ if ((regp = vmxctx_regptr(&vmx->ctx[vcpu], reg)) != NULL) { *regp = val; return (0); } - if (!running) { - vmcs_load(vmx->vmcs_pa[vcpu]); - } + bool vmcs_loaded = vmx_vmcs_access_ensure(vmx, vcpu); + int err = 0; if (reg == VM_REG_GUEST_INTR_SHADOW) { if (val != 0) { @@ -3069,39 +3105,24 @@ vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) * Forcing the vcpu into an interrupt shadow is not * presently supported. */ - error = EINVAL; + err = EINVAL; } else { uint64_t gi; gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); gi &= ~HWINTR_BLOCKING; vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); - error = 0; + err = 0; } } else { uint32_t encoding; - error = 0; + err = 0; encoding = vmcs_field_encoding(reg); switch (encoding) { case VMCS_GUEST_IA32_EFER: - /* - * If the "load EFER" VM-entry control is 1 then the - * value of EFER.LMA must be identical to "IA-32e mode - * guest" bit in the VM-entry control. - */ - if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0) { - uint64_t ctls; - - ctls = vmcs_read(VMCS_ENTRY_CTLS); - if (val & EFER_LMA) { - ctls |= VM_ENTRY_GUEST_LMA; - } else { - ctls &= ~VM_ENTRY_GUEST_LMA; - } - vmcs_write(VMCS_ENTRY_CTLS, ctls); - } vmcs_write(encoding, val); + vmx_sync_efer_state(vmx, vcpu, val); break; case VMCS_GUEST_CR0: /* @@ -3130,10 +3151,11 @@ vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) * XXX the processor retains global mappings when %cr3 * is updated but vmx_invvpid() does not. 
*/ - vmx_invvpid(vmx, vcpu, running); + vmx_invvpid(vmx, vcpu, + vcpu_is_running(vmx->vm, vcpu, NULL)); break; case VMCS_INVALID_ENCODING: - error = EINVAL; + err = EINVAL; break; default: vmcs_write(encoding, val); @@ -3141,27 +3163,19 @@ vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) } } - if (!running) { - vmcs_clear(vmx->vmcs_pa[vcpu]); + if (vmcs_loaded) { + vmx_vmcs_access_done(vmx, vcpu); } - - return (error); + return (err); } static int vmx_getdesc(void *arg, int vcpu, int seg, struct seg_desc *desc) { - int hostcpu, running; struct vmx *vmx = arg; uint32_t base, limit, access; - running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); - if (running && hostcpu != curcpu) - panic("vmx_getdesc: %d is running", vcpu); - - if (!running) { - vmcs_load(vmx->vmcs_pa[vcpu]); - } + bool vmcs_loaded = vmx_vmcs_access_ensure(vmx, vcpu); vmcs_seg_desc_encoding(seg, &base, &limit, &access); desc->base = vmcs_read(base); @@ -3172,8 +3186,8 @@ vmx_getdesc(void *arg, int vcpu, int seg, struct seg_desc *desc) desc->access = 0; } - if (!running) { - vmcs_clear(vmx->vmcs_pa[vcpu]); + if (vmcs_loaded) { + vmx_vmcs_access_done(vmx, vcpu); } return (0); } @@ -3181,17 +3195,10 @@ vmx_getdesc(void *arg, int vcpu, int seg, struct seg_desc *desc) static int vmx_setdesc(void *arg, int vcpu, int seg, const struct seg_desc *desc) { - int hostcpu, running; struct vmx *vmx = arg; uint32_t base, limit, access; - running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); - if (running && hostcpu != curcpu) - panic("vmx_setdesc: %d is running", vcpu); - - if (!running) { - vmcs_load(vmx->vmcs_pa[vcpu]); - } + bool vmcs_loaded = vmx_vmcs_access_ensure(vmx, vcpu); vmcs_seg_desc_encoding(seg, &base, &limit, &access); vmcs_write(base, desc->base); @@ -3200,12 +3207,94 @@ vmx_setdesc(void *arg, int vcpu, int seg, const struct seg_desc *desc) vmcs_write(access, desc->access); } - if (!running) { - vmcs_clear(vmx->vmcs_pa[vcpu]); + if (vmcs_loaded) { + vmx_vmcs_access_done(vmx, vcpu); } return (0); } +static uint64_t * +vmx_msr_ptr(struct vmx *vmx, int vcpu, uint32_t msr) +{ + uint64_t *guest_msrs = vmx->guest_msrs[vcpu]; + + switch (msr) { + case MSR_LSTAR: + return (&guest_msrs[IDX_MSR_LSTAR]); + case MSR_CSTAR: + return (&guest_msrs[IDX_MSR_CSTAR]); + case MSR_STAR: + return (&guest_msrs[IDX_MSR_STAR]); + case MSR_SF_MASK: + return (&guest_msrs[IDX_MSR_SF_MASK]); + case MSR_KGSBASE: + return (&guest_msrs[IDX_MSR_KGSBASE]); + case MSR_PAT: + return (&guest_msrs[IDX_MSR_PAT]); + default: + return (NULL); + } +} + +static int +vmx_msr_get(void *arg, int vcpu, uint32_t msr, uint64_t *valp) +{ + struct vmx *vmx = arg; + + ASSERT(valp != NULL); + + const uint64_t *msrp = vmx_msr_ptr(vmx, vcpu, msr); + if (msrp != NULL) { + *valp = *msrp; + return (0); + } + + const uint32_t vmcs_enc = vmcs_msr_encoding(msr); + if (vmcs_enc != VMCS_INVALID_ENCODING) { + bool vmcs_loaded = vmx_vmcs_access_ensure(vmx, vcpu); + + *valp = vmcs_read(vmcs_enc); + + if (vmcs_loaded) { + vmx_vmcs_access_done(vmx, vcpu); + } + return (0); + } + + return (EINVAL); +} + +static int +vmx_msr_set(void *arg, int vcpu, uint32_t msr, uint64_t val) +{ + struct vmx *vmx = arg; + + /* TODO: mask value */ + + uint64_t *msrp = vmx_msr_ptr(vmx, vcpu, msr); + if (msrp != NULL) { + *msrp = val; + return (0); + } + + const uint32_t vmcs_enc = vmcs_msr_encoding(msr); + if (vmcs_enc != VMCS_INVALID_ENCODING) { + bool vmcs_loaded = vmx_vmcs_access_ensure(vmx, vcpu); + + vmcs_write(vmcs_enc, val); + + if (msr == MSR_EFER) { + vmx_sync_efer_state(vmx, vcpu, val); + } + + if 
(vmcs_loaded) { + vmx_vmcs_access_done(vmx, vcpu); + } + return (0); + } + return (EINVAL); +} + static int vmx_getcap(void *arg, int vcpu, int type, int *retval) { @@ -3711,6 +3800,9 @@ struct vmm_ops vmm_ops_intel = { .vmsavectx = vmx_savectx, .vmrestorectx = vmx_restorectx, + + .vmgetmsr = vmx_msr_get, + .vmsetmsr = vmx_msr_set, }; /* Side-effect free HW validation derived from checks in vmx_init. */ diff --git a/usr/src/uts/intel/io/vmm/sys/vmm_kernel.h b/usr/src/uts/intel/io/vmm/sys/vmm_kernel.h index bc7f1bb0f2..1dba79a7bf 100644 --- a/usr/src/uts/intel/io/vmm/sys/vmm_kernel.h +++ b/usr/src/uts/intel/io/vmm/sys/vmm_kernel.h @@ -90,6 +90,11 @@ typedef void (*vmi_vlapic_cleanup)(void *vmi, struct vlapic *vlapic); typedef void (*vmi_savectx)(void *vmi, int vcpu); typedef void (*vmi_restorectx)(void *vmi, int vcpu); +typedef int (*vmi_get_msr_t)(void *vmi, int vcpu, uint32_t msr, + uint64_t *valp); +typedef int (*vmi_set_msr_t)(void *vmi, int vcpu, uint32_t msr, + uint64_t val); + struct vmm_ops { vmm_init_func_t init; /* module wide initialization */ vmm_cleanup_func_t cleanup; @@ -109,6 +114,9 @@ struct vmm_ops { vmi_savectx vmsavectx; vmi_restorectx vmrestorectx; + + vmi_get_msr_t vmgetmsr; + vmi_set_msr_t vmsetmsr; }; extern struct vmm_ops vmm_ops_intel; @@ -379,6 +387,19 @@ typedef enum vm_msr_result { VMR_UNHANLDED = 2, /* handle in userspace, kernel cannot emulate */ } vm_msr_result_t; +enum vm_cpuid_capability { + VCC_NONE, + VCC_NO_EXECUTE, + VCC_FFXSR, + VCC_TCE, + VCC_LAST +}; + +int x86_emulate_cpuid(struct vm *, int, uint64_t *, uint64_t *, uint64_t *, + uint64_t *); +bool vm_cpuid_capability(struct vm *, int, enum vm_cpuid_capability); +bool validate_guest_xcr0(uint64_t, uint64_t); + void vmm_sol_glue_init(void); void vmm_sol_glue_cleanup(void); @@ -445,6 +466,7 @@ typedef struct vmm_data_req { uint32_t vdr_flags; uint32_t vdr_len; void *vdr_data; + uint32_t *vdr_result_len; } vmm_data_req_t; typedef struct vmm_data_req vmm_data_req_t; @@ -455,6 +477,7 @@ typedef struct vmm_data_version_entry { uint16_t vdve_class; uint16_t vdve_version; uint16_t vdve_len_expect; + uint16_t vdve_len_per_item; vmm_data_readf_t vdve_readf; vmm_data_writef_t vdve_writef; } vmm_data_version_entry_t; diff --git a/usr/src/uts/intel/io/vmm/vmm.c b/usr/src/uts/intel/io/vmm/vmm.c index 565dcbbe0a..e85d84d0b6 100644 --- a/usr/src/uts/intel/io/vmm/vmm.c +++ b/usr/src/uts/intel/io/vmm/vmm.c @@ -248,6 +248,8 @@ static struct vmm_ops vmm_ops_null = { .vlapic_cleanup = (vmi_vlapic_cleanup)nullop_panic, .vmsavectx = (vmi_savectx)nullop_panic, .vmrestorectx = (vmi_restorectx)nullop_panic, + .vmgetmsr = (vmi_get_msr_t)nullop_panic, + .vmsetmsr = (vmi_set_msr_t)nullop_panic, }; static struct vmm_ops *ops = &vmm_ops_null; @@ -1102,38 +1104,51 @@ vm_assign_pptdev(struct vm *vm, int pptfd) } int -vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval) +vm_get_register(struct vm *vm, int vcpuid, int reg, uint64_t *retval) { - - if (vcpu < 0 || vcpu >= vm->maxcpus) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); if (reg >= VM_REG_LAST) return (EINVAL); - return (VMGETREG(vm->cookie, vcpu, reg, retval)); + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + switch (reg) { + case VM_REG_GUEST_XCR0: + *retval = vcpu->guest_xcr0; + return (0); + default: + return (VMGETREG(vm->cookie, vcpuid, reg, retval)); + } } int vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val) { - struct vcpu *vcpu; - int error; - if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); if (reg >= VM_REG_LAST) 
return (EINVAL); - error = VMSETREG(vm->cookie, vcpuid, reg, val); - if (error || reg != VM_REG_GUEST_RIP) + int error; + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + switch (reg) { + case VM_REG_GUEST_RIP: + error = VMSETREG(vm->cookie, vcpuid, reg, val); + if (error == 0) { + vcpu->nextrip = val; + } return (error); - - /* Set 'nextrip' to match the value of %rip */ - vcpu = &vm->vcpu[vcpuid]; - vcpu->nextrip = val; - return (0); + case VM_REG_GUEST_XCR0: + if (!validate_guest_xcr0(val, vmm_get_host_xcr0())) { + return (EINVAL); + } + vcpu->guest_xcr0 = val; + return (0); + default: + return (VMSETREG(vm->cookie, vcpuid, reg, val)); + } } static bool @@ -1864,7 +1879,7 @@ vm_handle_run_state(struct vm *vm, int vcpuid) } static int -vm_rdmtrr(struct vm_mtrr *mtrr, uint32_t num, uint64_t *val) +vm_rdmtrr(const struct vm_mtrr *mtrr, uint32_t num, uint64_t *val) { switch (num) { case MSR_MTRRcap: @@ -1945,6 +1960,22 @@ vm_wrmtrr(struct vm_mtrr *mtrr, uint32_t num, uint64_t val) return (0); } +static bool +is_mtrr_msr(uint32_t msr) +{ + switch (msr) { + case MSR_MTRRcap: + case MSR_MTRRdefType: + case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7: + case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: + case MSR_MTRR64kBase: + case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: + return (true); + default: + return (false); + } +} + static int vm_handle_rdmsr(struct vm *vm, int vcpuid, struct vm_exit *vme) { @@ -3702,17 +3733,20 @@ vmm_data_is_cpu_specific(uint16_t data_class) case VDC_MSR: case VDC_FPU: case VDC_LAPIC: - case VDC_VMM_ARCH: return (true); default: return (false); } } -static const vmm_data_version_entry_t * -vmm_data_find(const vmm_data_req_t *req, int *err) +static int +vmm_data_find(const vmm_data_req_t *req, const vmm_data_version_entry_t **resp) { const vmm_data_version_entry_t **vdpp, *vdp; + + ASSERT(resp != NULL); + ASSERT(req->vdr_result_len != NULL); + SET_FOREACH(vdpp, vmm_data_version_entries) { vdp = *vdpp; if (vdp->vdve_class == req->vdr_class && @@ -3722,15 +3756,15 @@ vmm_data_find(const vmm_data_req_t *req, int *err) * provider for this data. */ if (vdp->vdve_len_expect != 0 && - vdp->vdve_len_expect != req->vdr_len) { - *err = ENOSPC; - return (NULL); + vdp->vdve_len_expect > req->vdr_len) { + *req->vdr_result_len = vdp->vdve_len_expect; + return (ENOSPC); } - return (vdp); + *resp = vdp; + return (0); } } - *err = EINVAL; - return (NULL); + return (EINVAL); } static void * @@ -3740,10 +3774,11 @@ vmm_data_from_class(const vmm_data_req_t *req, struct vm *vm, int vcpuid) /* per-cpu data/devices */ case VDC_LAPIC: return (vm_lapic(vm, vcpuid)); + case VDC_VMM_ARCH: + return (vm); case VDC_FPU: case VDC_REGISTER: - case VDC_VMM_ARCH: case VDC_MSR: /* * These have per-CPU handling which is dispatched outside @@ -3771,6 +3806,356 @@ vmm_data_from_class(const vmm_data_req_t *req, struct vm *vm, int vcpuid) } } +const uint32_t arch_msr_iter[] = { + MSR_EFER, + + /* + * While gsbase and fsbase are accessible via the MSR accessors, they + * are not included in MSR iteration since they are covered by the + * segment descriptor interface too. 
+ */ + MSR_KGSBASE, + + MSR_STAR, + MSR_LSTAR, + MSR_CSTAR, + MSR_SF_MASK, + + MSR_SYSENTER_CS_MSR, + MSR_SYSENTER_ESP_MSR, + MSR_SYSENTER_EIP_MSR, + MSR_PAT, +}; +const uint32_t generic_msr_iter[] = { + MSR_TSC, + MSR_MTRRcap, + MSR_MTRRdefType, + + MSR_MTRR4kBase, MSR_MTRR4kBase + 1, MSR_MTRR4kBase + 2, + MSR_MTRR4kBase + 3, MSR_MTRR4kBase + 4, MSR_MTRR4kBase + 5, + MSR_MTRR4kBase + 6, MSR_MTRR4kBase + 7, + + MSR_MTRR16kBase, MSR_MTRR16kBase + 1, + + MSR_MTRR64kBase, +}; + +static int +vmm_data_read_msrs(struct vm *vm, int vcpuid, const vmm_data_req_t *req) +{ + VERIFY3U(req->vdr_class, ==, VDC_MSR); + VERIFY3U(req->vdr_version, ==, 1); + + const uint_t num_msrs = nitems(arch_msr_iter) + nitems(generic_msr_iter) + + (VMM_MTRR_VAR_MAX * 2); + const uint32_t output_len = + num_msrs * sizeof (struct vdi_field_entry_v1); + *req->vdr_result_len = output_len; + + if (req->vdr_len < output_len) { + return (ENOSPC); + } + + struct vdi_field_entry_v1 *entryp = req->vdr_data; + for (uint_t i = 0; i < nitems(arch_msr_iter); i++, entryp++) { + const uint32_t msr = arch_msr_iter[i]; + uint64_t val = 0; + + int err = ops->vmgetmsr(vm->cookie, vcpuid, msr, &val); + /* All of these MSRs are expected to work */ + VERIFY0(err); + entryp->vfe_ident = msr; + entryp->vfe_value = val; + } + + struct vm_mtrr *mtrr = &vm->vcpu[vcpuid].mtrr; + for (uint_t i = 0; i < nitems(generic_msr_iter); i++, entryp++) { + const uint32_t msr = generic_msr_iter[i]; + + entryp->vfe_ident = msr; + switch (msr) { + case MSR_TSC: + /* + * Communicate this as the difference from the VM-wide + * offset of the boot time. + */ + entryp->vfe_value = vm->vcpu[vcpuid].tsc_offset; + break; + case MSR_MTRRcap: + case MSR_MTRRdefType: + case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7: + case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: + case MSR_MTRR64kBase: { + int err = vm_rdmtrr(mtrr, msr, &entryp->vfe_value); + VERIFY0(err); + break; + } + default: + panic("unexpected msr export %x", msr); + } + } + /* Copy the variable MTRRs */ + for (uint_t i = 0; i < (VMM_MTRR_VAR_MAX * 2); i++, entryp++) { + const uint32_t msr = MSR_MTRRVarBase + i; + + entryp->vfe_ident = msr; + int err = vm_rdmtrr(mtrr, msr, &entryp->vfe_value); + VERIFY0(err); + } + return (0); +} + +static int +vmm_data_write_msrs(struct vm *vm, int vcpuid, const vmm_data_req_t *req) +{ + VERIFY3U(req->vdr_class, ==, VDC_MSR); + VERIFY3U(req->vdr_version, ==, 1); + + const struct vdi_field_entry_v1 *entryp = req->vdr_data; + const uint_t entry_count = + req->vdr_len / sizeof (struct vdi_field_entry_v1); + struct vm_mtrr *mtrr = &vm->vcpu[vcpuid].mtrr; + + /* + * First make sure that all of the MSRs can be manipulated. + * For now, this check is done by going though the getmsr handler + */ + for (uint_t i = 0; i < entry_count; i++, entryp++) { + const uint32_t msr = entryp->vfe_ident; + uint64_t val; + int err = 0; + + switch (msr) { + case MSR_TSC: + break; + default: + if (is_mtrr_msr(msr)) { + err = vm_rdmtrr(mtrr, msr, &val); + } else { + err = ops->vmgetmsr(vm->cookie, vcpuid, msr, + &val); + } + break; + } + if (err != 0) { + return (err); + } + } + + /* + * Fairly confident that all of the 'set' operations are at least + * targeting valid MSRs, continue on. 
+ */ + entryp = req->vdr_data; + for (uint_t i = 0; i < entry_count; i++, entryp++) { + const uint32_t msr = entryp->vfe_ident; + const uint64_t val = entryp->vfe_value; + int err = 0; + + switch (msr) { + case MSR_TSC: + vm->vcpu[vcpuid].tsc_offset = entryp->vfe_value; + break; + default: + if (is_mtrr_msr(msr)) { + if (msr == MSR_MTRRcap) { + /* + * MTRRcap is read-only. If the current + * value matches the incoming one, + * consider it a success + */ + uint64_t comp; + err = vm_rdmtrr(mtrr, msr, &comp); + if (err != 0 || comp != val) { + err = EINVAL; + } + } else { + err = vm_wrmtrr(mtrr, msr, val); + } + } else { + err = ops->vmsetmsr(vm->cookie, vcpuid, msr, + val); + } + break; + } + if (err != 0) { + return (err); + } + } + *req->vdr_result_len = entry_count * sizeof (struct vdi_field_entry_v1); + + return (0); +} + +static const vmm_data_version_entry_t msr_v1 = { + .vdve_class = VDC_MSR, + .vdve_version = 1, + .vdve_len_per_item = sizeof (struct vdi_field_entry_v1), + /* Requires backend-specific dispatch */ + .vdve_readf = NULL, + .vdve_writef = NULL, +}; +VMM_DATA_VERSION(msr_v1); + +static const uint32_t vmm_arch_v1_fields[] = { + VAI_TSC_BOOT_OFFSET, + VAI_BOOT_HRTIME, + VAI_TSC_FREQ, +}; + +static bool +vmm_read_arch_field(struct vm *vm, uint32_t ident, uint64_t *valp) +{ + ASSERT(valp != NULL); + + switch (ident) { + case VAI_TSC_BOOT_OFFSET: + *valp = vm->boot_tsc_offset; + return (true); + case VAI_BOOT_HRTIME: + *valp = vm->boot_hrtime; + return (true); + case VAI_TSC_FREQ: + /* + * Since the system TSC calibration is not public, just derive + * it from the scaling functions available. + */ + *valp = unscalehrtime(NANOSEC); + return (true); + default: + break; + } + return (false); +} + +static int +vmm_data_read_vmm_arch(void *arg, const vmm_data_req_t *req) +{ + struct vm *vm = arg; + + VERIFY3U(req->vdr_class, ==, VDC_VMM_ARCH); + VERIFY3U(req->vdr_version, ==, 1); + + struct vdi_field_entry_v1 *entryp = req->vdr_data; + + /* Specific fields requested */ + if ((req->vdr_flags & VDX_FLAG_READ_COPYIN) != 0) { + const uint_t count = + req->vdr_len / sizeof (struct vdi_field_entry_v1); + + for (uint_t i = 0; i < count; i++, entryp++) { + if (!vmm_read_arch_field(vm, entryp->vfe_ident, + &entryp->vfe_value)) { + return (EINVAL); + } + } + *req->vdr_result_len = + count * sizeof (struct vdi_field_entry_v1); + return (0); + } + + /* Emit all of the possible values */ + const uint32_t total_size = nitems(vmm_arch_v1_fields) * + sizeof (struct vdi_field_entry_v1); + *req->vdr_result_len = total_size; + if (req->vdr_len < total_size) { + return (ENOSPC); + } + for (uint_t i = 0; i < nitems(vmm_arch_v1_fields); i++, entryp++) { + entryp->vfe_ident = vmm_arch_v1_fields[i]; + VERIFY(vmm_read_arch_field(vm, entryp->vfe_ident, + &entryp->vfe_value)); + } + return (0); +} + +static int +vmm_data_write_vmm_arch(void *arg, const vmm_data_req_t *req) +{ + struct vm *vm = arg; + + VERIFY3U(req->vdr_class, ==, VDC_VMM_ARCH); + VERIFY3U(req->vdr_version, ==, 1); + + const struct vdi_field_entry_v1 *entryp = req->vdr_data; + const uint_t entry_count = + req->vdr_len / sizeof (struct vdi_field_entry_v1); + + for (uint_t i = 0; i < entry_count; i++, entryp++) { + const uint64_t val = entryp->vfe_value; + + switch (entryp->vfe_ident) { + case VAI_TSC_BOOT_OFFSET: + vm->boot_tsc_offset = val; + break; + case VAI_BOOT_HRTIME: + vm->boot_hrtime = val; + break; + case VAI_TSC_FREQ: + /* Guest TSC frequency not (currently) adjustable */ + return (EPERM); + default: + return (EINVAL); + } + } + 
*req->vdr_result_len = entry_count * sizeof (struct vdi_field_entry_v1); + return (0); +} + +static const vmm_data_version_entry_t vmm_arch_v1 = { + .vdve_class = VDC_VMM_ARCH, + .vdve_version = 1, + .vdve_len_per_item = sizeof (struct vdi_field_entry_v1), + .vdve_readf = vmm_data_read_vmm_arch, + .vdve_writef = vmm_data_write_vmm_arch, +}; +VMM_DATA_VERSION(vmm_arch_v1); + +static int +vmm_data_read_versions(void *arg, const vmm_data_req_t *req) +{ + VERIFY3U(req->vdr_class, ==, VDC_VERSION); + VERIFY3U(req->vdr_version, ==, 1); + + const uint32_t total_size = SET_COUNT(vmm_data_version_entries) * + sizeof (struct vdi_version_entry_v1); + + /* Make sure there is room for all of the entries */ + *req->vdr_result_len = total_size; + if (req->vdr_len < *req->vdr_result_len) { + return (ENOSPC); + } + + struct vdi_version_entry_v1 *entryp = req->vdr_data; + const vmm_data_version_entry_t **vdpp; + SET_FOREACH(vdpp, vmm_data_version_entries) { + const vmm_data_version_entry_t *vdp = *vdpp; + + entryp->vve_class = vdp->vdve_class; + entryp->vve_version = vdp->vdve_version; + entryp->vve_len_expect = vdp->vdve_len_expect; + entryp->vve_len_per_item = vdp->vdve_len_per_item; + entryp++; + } + return (0); +} + +static int +vmm_data_write_versions(void *arg, const vmm_data_req_t *req) +{ + /* Writing to the version information makes no sense */ + return (EPERM); +} + +static const vmm_data_version_entry_t versions_v1 = { + .vdve_class = VDC_VERSION, + .vdve_version = 1, + .vdve_len_per_item = sizeof (struct vdi_version_entry_v1), + .vdve_readf = vmm_data_read_versions, + .vdve_writef = vmm_data_write_versions, +}; +VMM_DATA_VERSION(versions_v1); + int vmm_data_read(struct vm *vm, int vcpuid, const vmm_data_req_t *req) { @@ -3782,28 +4167,34 @@ vmm_data_read(struct vm *vm, int vcpuid, const vmm_data_req_t *req) } } - const vmm_data_version_entry_t *entry; - entry = vmm_data_find(req, &err); - if (entry == NULL) { - ASSERT(err != 0); + const vmm_data_version_entry_t *entry = NULL; + err = vmm_data_find(req, &entry); + if (err != 0) { return (err); } + ASSERT(entry != NULL); void *datap = vmm_data_from_class(req, vm, vcpuid); if (datap != NULL) { err = entry->vdve_readf(datap, req); + + /* + * Successful reads of fixed-length data should populate the + * length of that result. + */ + if (err == 0 && entry->vdve_len_expect != 0) { + *req->vdr_result_len = entry->vdve_len_expect; + } } else { switch (req->vdr_class) { + case VDC_MSR: + err = vmm_data_read_msrs(vm, vcpuid, req); + break; case VDC_FPU: /* TODO: wire up to xsave export via hma_fpu iface */ err = EINVAL; break; case VDC_REGISTER: - case VDC_VMM_ARCH: - case VDC_MSR: - /* TODO: implement */ - err = EINVAL; - break; default: err = EINVAL; break; @@ -3824,28 +4215,33 @@ vmm_data_write(struct vm *vm, int vcpuid, const vmm_data_req_t *req) } } - const vmm_data_version_entry_t *entry; - entry = vmm_data_find(req, &err); - if (entry == NULL) { - ASSERT(err != 0); + const vmm_data_version_entry_t *entry = NULL; + err = vmm_data_find(req, &entry); + if (err != 0) { return (err); } + ASSERT(entry != NULL); void *datap = vmm_data_from_class(req, vm, vcpuid); if (datap != NULL) { err = entry->vdve_writef(datap, req); + /* + * Successful writes of fixed-length data should populate the + * length of that result. 
+ */ + if (err == 0 && entry->vdve_len_expect != 0) { + *req->vdr_result_len = entry->vdve_len_expect; + } } else { switch (req->vdr_class) { + case VDC_MSR: + err = vmm_data_write_msrs(vm, vcpuid, req); + break; case VDC_FPU: /* TODO: wire up to xsave import via hma_fpu iface */ err = EINVAL; break; case VDC_REGISTER: - case VDC_VMM_ARCH: - case VDC_MSR: - /* TODO: implement */ - err = EINVAL; - break; default: err = EINVAL; break; diff --git a/usr/src/uts/intel/io/vmm/vmm_sol_dev.c b/usr/src/uts/intel/io/vmm/vmm_sol_dev.c index 9a4693fc78..ee07779b21 100644 --- a/usr/src/uts/intel/io/vmm/vmm_sol_dev.c +++ b/usr/src/uts/intel/io/vmm/vmm_sol_dev.c @@ -1550,32 +1550,48 @@ vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, } const size_t len = vdx.vdx_len; - void *buf = kmem_alloc(len, KM_SLEEP); - if ((vdx.vdx_flags & VDX_FLAG_READ_COPYIN) != 0) { - if (ddi_copyin(vdx.vdx_data, buf, len, md) != 0) { + void *buf = NULL; + if (len != 0) { + buf = kmem_alloc(len, KM_SLEEP); + if ((vdx.vdx_flags & VDX_FLAG_READ_COPYIN) != 0 && + ddi_copyin(vdx.vdx_data, buf, len, md) != 0) { kmem_free(buf, len); error = EFAULT; break; + } else { + bzero(buf, len); } - } else { - bzero(buf, len); } + vdx.vdx_result_len = 0; vmm_data_req_t req = { .vdr_class = vdx.vdx_class, .vdr_version = vdx.vdx_version, .vdr_flags = vdx.vdx_flags, - .vdr_len = vdx.vdx_len, + .vdr_len = len, .vdr_data = buf, + .vdr_result_len = &vdx.vdx_result_len, }; error = vmm_data_read(sc->vmm_vm, vdx.vdx_vcpuid, &req); - if (error == 0) { + if (error == 0 && buf != NULL) { if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) { error = EFAULT; } } - kmem_free(buf, len); + + /* + * Copy out the transfer request so that the value of + * vdx_result_len can be made available, regardless of any + * error(s) which may have occurred. + */ + if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) { + error = (error != 0) ? error : EFAULT; + } + + if (buf != NULL) { + kmem_free(buf, len); + } break; } case VM_DATA_WRITE: { @@ -1595,19 +1611,24 @@ vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, } const size_t len = vdx.vdx_len; - void *buf = kmem_alloc(len, KM_SLEEP); - if (ddi_copyin(vdx.vdx_data, buf, len, md) != 0) { - kmem_free(buf, len); - error = EFAULT; - break; + void *buf = NULL; + if (len != 0) { + buf = kmem_alloc(len, KM_SLEEP); + if (ddi_copyin(vdx.vdx_data, buf, len, md) != 0) { + kmem_free(buf, len); + error = EFAULT; + break; + } } + vdx.vdx_result_len = 0; vmm_data_req_t req = { .vdr_class = vdx.vdx_class, .vdr_version = vdx.vdx_version, .vdr_flags = vdx.vdx_flags, - .vdr_len = vdx.vdx_len, + .vdr_len = len, .vdr_data = buf, + .vdr_result_len = &vdx.vdx_result_len, }; if (vmm_allow_state_writes == 0) { /* XXX: Play it safe for now */ @@ -1617,13 +1638,25 @@ vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, &req); } - if (error == 0 && + if (error == 0 && buf != NULL && (vdx.vdx_flags & VDX_FLAG_WRITE_COPYOUT) != 0) { if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) { error = EFAULT; } } - kmem_free(buf, len); + + /* + * Copy out the transfer request so that the value of + * vdx_result_len can be made available, regardless of any + * error(s) which may have occurred. + */ + if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) { + error = (error != 0) ? 
error : EFAULT; + } + + if (buf != NULL) { + kmem_free(buf, len); + } break; } diff --git a/usr/src/uts/intel/io/vmm/x86.c b/usr/src/uts/intel/io/vmm/x86.c index de48ba1d48..e593e0c04e 100644 --- a/usr/src/uts/intel/io/vmm/x86.c +++ b/usr/src/uts/intel/io/vmm/x86.c @@ -58,10 +58,10 @@ __FBSDID("$FreeBSD$"); #include <machine/specialreg.h> #include <machine/vmm.h> +#include <sys/vmm_kernel.h> #include "vmm_host.h" #include "vmm_util.h" -#include "x86.h" SYSCTL_DECL(_hw_vmm); @@ -80,6 +80,42 @@ static int cpuid_leaf_b = 1; */ static int vmm_force_invariant_tsc = 0; +#define CPUID_0000_0000 (0x0) +#define CPUID_0000_0001 (0x1) +#define CPUID_0000_0002 (0x2) +#define CPUID_0000_0003 (0x3) +#define CPUID_0000_0004 (0x4) +#define CPUID_0000_0006 (0x6) +#define CPUID_0000_0007 (0x7) +#define CPUID_0000_000A (0xA) +#define CPUID_0000_000B (0xB) +#define CPUID_0000_000D (0xD) +#define CPUID_0000_000F (0xF) +#define CPUID_0000_0010 (0x10) +#define CPUID_0000_0015 (0x15) +#define CPUID_8000_0000 (0x80000000) +#define CPUID_8000_0001 (0x80000001) +#define CPUID_8000_0002 (0x80000002) +#define CPUID_8000_0003 (0x80000003) +#define CPUID_8000_0004 (0x80000004) +#define CPUID_8000_0006 (0x80000006) +#define CPUID_8000_0007 (0x80000007) +#define CPUID_8000_0008 (0x80000008) +#define CPUID_8000_001D (0x8000001D) +#define CPUID_8000_001E (0x8000001E) + +/* + * CPUID instruction Fn0000_0001: + */ +#define CPUID_0000_0001_APICID_MASK (0xff<<24) +#define CPUID_0000_0001_APICID_SHIFT 24 + +/* + * CPUID instruction Fn0000_0001 ECX + */ +#define CPUID_0000_0001_FEAT0_VMX (1<<5) + + /* * Round up to the next power of two, if necessary, and then take log2. * Returns -1 if argument is zero. @@ -649,6 +685,10 @@ default_leaf: return (1); } +/* + * Return 'true' if the capability 'cap' is enabled in this virtual cpu + * and 'false' otherwise. + */ bool vm_cpuid_capability(struct vm *vm, int vcpuid, enum vm_cpuid_capability cap) { @@ -690,3 +730,23 @@ vm_cpuid_capability(struct vm *vm, int vcpuid, enum vm_cpuid_capability cap) } return (rv); } + +bool +validate_guest_xcr0(uint64_t val, uint64_t limit_mask) +{ + /* x87 feature must be enabled */ + if ((val & XFEATURE_ENABLED_X87) == 0) { + return (false); + } + /* AVX cannot be enabled without SSE */ + if ((val & (XFEATURE_ENABLED_SSE | XFEATURE_ENABLED_AVX)) == + XFEATURE_ENABLED_SSE) { + return (false); + } + /* No bits should be outside what we dictate to be allowed */ + if ((val & ~limit_mask) != 0) { + return (false); + } + + return (true); +} diff --git a/usr/src/uts/intel/io/vmm/x86.h b/usr/src/uts/intel/io/vmm/x86.h deleted file mode 100644 index f3459e4f8a..0000000000 --- a/usr/src/uts/intel/io/vmm/x86.h +++ /dev/null @@ -1,85 +0,0 @@ -/*- - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD - * - * Copyright (c) 2011 NetApp, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * - * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - -#ifndef _X86_H_ -#define _X86_H_ - -#define CPUID_0000_0000 (0x0) -#define CPUID_0000_0001 (0x1) -#define CPUID_0000_0002 (0x2) -#define CPUID_0000_0003 (0x3) -#define CPUID_0000_0004 (0x4) -#define CPUID_0000_0006 (0x6) -#define CPUID_0000_0007 (0x7) -#define CPUID_0000_000A (0xA) -#define CPUID_0000_000B (0xB) -#define CPUID_0000_000D (0xD) -#define CPUID_0000_000F (0xF) -#define CPUID_0000_0010 (0x10) -#define CPUID_0000_0015 (0x15) -#define CPUID_8000_0000 (0x80000000) -#define CPUID_8000_0001 (0x80000001) -#define CPUID_8000_0002 (0x80000002) -#define CPUID_8000_0003 (0x80000003) -#define CPUID_8000_0004 (0x80000004) -#define CPUID_8000_0006 (0x80000006) -#define CPUID_8000_0007 (0x80000007) -#define CPUID_8000_0008 (0x80000008) -#define CPUID_8000_001D (0x8000001D) -#define CPUID_8000_001E (0x8000001E) - -/* - * CPUID instruction Fn0000_0001: - */ -#define CPUID_0000_0001_APICID_MASK (0xff<<24) -#define CPUID_0000_0001_APICID_SHIFT 24 - -/* - * CPUID instruction Fn0000_0001 ECX - */ -#define CPUID_0000_0001_FEAT0_VMX (1<<5) - -int x86_emulate_cpuid(struct vm *vm, int vcpu_id, uint64_t *rax, uint64_t *rbx, - uint64_t *rcx, uint64_t *rdx); - -enum vm_cpuid_capability { - VCC_NONE, - VCC_NO_EXECUTE, - VCC_FFXSR, - VCC_TCE, - VCC_LAST -}; - -/* - * Return 'true' if the capability 'cap' is enabled in this virtual cpu - * and 'false' otherwise. - */ -bool vm_cpuid_capability(struct vm *vm, int vcpuid, enum vm_cpuid_capability); -#endif diff --git a/usr/src/uts/intel/sys/vmm.h b/usr/src/uts/intel/sys/vmm.h index 268b2e82ce..50d76ab17c 100644 --- a/usr/src/uts/intel/sys/vmm.h +++ b/usr/src/uts/intel/sys/vmm.h @@ -103,6 +103,7 @@ enum vm_reg_name { VM_REG_GUEST_DR3, VM_REG_GUEST_DR6, VM_REG_GUEST_ENTRY_INST_LENGTH, + VM_REG_GUEST_XCR0, VM_REG_LAST }; diff --git a/usr/src/uts/intel/sys/vmm_data.h b/usr/src/uts/intel/sys/vmm_data.h index 1b8614543c..9ba385c5d6 100644 --- a/usr/src/uts/intel/sys/vmm_data.h +++ b/usr/src/uts/intel/sys/vmm_data.h @@ -18,7 +18,6 @@ #define _VMM_DATA_H_ /* VMM Data Classes */ -#define VDC_META 0 /* Meta information about data system */ #define VDC_VERSION 1 /* Version information for each data class */ /* Classes bearing per-CPU data */ @@ -42,21 +41,27 @@ /* VMM Data Identifiers */ - -/* - * VDC_REGISTER: - */ - /* - * VDC_MSR: + * Generic field encoding for 64-bit (or smaller) data which are identified by a + * 32-bit (or smaller) name. 
* - * Use MSR identifiers directly + * Used by the following classes/version: + * - VDC_REGISTER v1: `vm_reg_name` identifiers + * - VDC_MSR v1: MSR identifiers + * - VDC_VMM_ARCH v1: Identifiers described below */ - -struct vdi_msr_entry_v1 { - uint32_t vme_msr; +struct vdi_field_entry_v1 { + uint32_t vfe_ident; uint32_t _pad; - uint64_t vme_value; + uint64_t vfe_value; +}; + +/* VDC_VERSION */ +struct vdi_version_entry_v1 { + uint16_t vve_class; + uint16_t vve_version; + uint16_t vve_len_expect; + uint16_t vve_len_per_item; }; /* @@ -98,11 +103,22 @@ struct vdi_lapic_v1 { uint32_t vl_esr_pending; }; - /* * VDC_VMM_ARCH: */ +/* + * Version 1 identifiers: + */ + +/* Offset of guest TSC from system at time of boot */ +#define VAI_TSC_BOOT_OFFSET 1 +/* Time that guest (nominally) booted, as hrtime */ +#define VAI_BOOT_HRTIME 2 +/* Guest TSC frequency measured by hrtime (not effected by wall clock adj.) */ +#define VAI_TSC_FREQ 3 + + /* VDC_IOAPIC: */ struct vdi_ioapic_v1 { diff --git a/usr/src/uts/intel/sys/vmm_dev.h b/usr/src/uts/intel/sys/vmm_dev.h index fc8ccf406e..8d1b2713dd 100644 --- a/usr/src/uts/intel/sys/vmm_dev.h +++ b/usr/src/uts/intel/sys/vmm_dev.h @@ -366,6 +366,7 @@ struct vm_data_xfer { uint16_t vdx_version; uint32_t vdx_flags; uint32_t vdx_len; + uint32_t vdx_result_len; void *vdx_data; }; @@ -384,7 +385,7 @@ struct vm_data_xfer { * best-effort activity. Nothing is to be inferred about the magnitude of a * change when the version is modified. It follows no rules like semver. */ -#define VMM_CURRENT_INTERFACE_VERSION 2 +#define VMM_CURRENT_INTERFACE_VERSION 3 #define VMMCTL_IOC_BASE (('V' << 16) | ('M' << 8)) |
