From 761dea5e2659dff417ad5cdddda547702c3966ae Mon Sep 17 00:00:00 2001 From: Patrick Mooney Date: Fri, 13 Oct 2017 22:24:22 +0000 Subject: 12528 expand hypervisor management API Reviewed by: John Levon Reviewed by: Robert Mustacchi Reviewed by: Toomas Soome Reviewed by: Andy Fiddaman Approved by: Dan McDonald --- usr/src/uts/i86pc/Makefile.files | 2 + usr/src/uts/i86pc/ml/hma_asm.s | 52 +++ usr/src/uts/i86pc/os/hma.c | 690 ++++++++++++++++++++++++++++++++++++++ usr/src/uts/i86pc/os/mp_startup.c | 10 + usr/src/uts/i86pc/sys/hma.h | 38 ++- 5 files changed, 791 insertions(+), 1 deletion(-) create mode 100644 usr/src/uts/i86pc/ml/hma_asm.s create mode 100644 usr/src/uts/i86pc/os/hma.c (limited to 'usr/src/uts/i86pc') diff --git a/usr/src/uts/i86pc/Makefile.files b/usr/src/uts/i86pc/Makefile.files index 8cb50417d7..879b8d86cb 100644 --- a/usr/src/uts/i86pc/Makefile.files +++ b/usr/src/uts/i86pc/Makefile.files @@ -65,6 +65,8 @@ CORE_OBJS += \ hardclk.o \ hat_i86.o \ hat_kdi.o \ + hma.o \ + hma_asm.o \ hma_fpu.o \ hment.o \ hold_page.o \ diff --git a/usr/src/uts/i86pc/ml/hma_asm.s b/usr/src/uts/i86pc/ml/hma_asm.s new file mode 100644 index 0000000000..49afbdd240 --- /dev/null +++ b/usr/src/uts/i86pc/ml/hma_asm.s @@ -0,0 +1,52 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + + +#include + + ENTRY_NP(hma_vmx_vmxon) + push %rbp + movq %rsp, %rbp + pushq %rdi + + xorl %eax, %eax + vmxon -0x8(%rbp) + ja 1f /* CF=0, ZF=0 (success) */ + incl %eax +1: + + leave + ret + SET_SIZE(hma_vmx_vmxon) + + ENTRY_NP(hma_vmx_do_invept) + push %rbp + movq %rsp, %rbp + pushq %rdi + pushq %rsi + + /* build INVEPT descriptor on stack */ + xorl %eax, %eax + pushq %rax; + pushq %rsi + + invept (%rsp), %rdi + ja 1f /* CF=0, ZF=0 (success) */ + incl %eax +1: + + leave + ret + SET_SIZE(hma_vmx_do_invept) diff --git a/usr/src/uts/i86pc/os/hma.c b/usr/src/uts/i86pc/os/hma.c new file mode 100644 index 0000000000..9678894da0 --- /dev/null +++ b/usr/src/uts/i86pc/os/hma.c @@ -0,0 +1,690 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct hma_reg { + const char *hr_name; + list_node_t hr_node; +}; + +static kmutex_t hma_lock; +static list_t hma_registrations; + +static boolean_t hma_vmx_ready = B_FALSE; +static const char *hma_vmx_error = NULL; +static id_space_t *hma_vmx_vpid; + +/* + * The bulk of HMA state (VMX & SVM) is protected by cpu_lock, rather than a + * mutex specific to the module. It (cpu_lock) is already required for the + * state needed to perform setup on all CPUs, so it was a natural fit to + * protect this data too. 
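+ *
+ * As an illustration: the per-CPU hma_cpu_status[] array below is
+ * written only by the CPU setup functions and the cross-calls they
+ * issue, all of which run while the caller holds cpu_lock.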
+ */ +typedef enum hma_cpu_state { + HCS_UNINITIALIZED = 0, + HCS_READY, + HCS_ERROR +} hma_cpu_state_t; +static hma_cpu_state_t hma_cpu_status[NCPU]; + +/* HMA-internal tracking of optional VMX capabilities */ +typedef enum { + HVC_EPT = (1 << 0), + HVC_VPID = (1 << 1), + HVC_INVEPT_ONE = (1 << 2), + HVC_INVEPT_ALL = (1 << 3), +} hma_vmx_capab_t; + +static void *hma_vmx_vmxon_page[NCPU]; +static uintptr_t hma_vmx_vmxon_pa[NCPU]; +static uint32_t hma_vmx_revision; +static hma_vmx_capab_t hma_vmx_capabs = 0; + +static boolean_t hma_svm_ready = B_FALSE; +static const char *hma_svm_error = NULL; +static uint32_t hma_svm_features; +static uint32_t hma_svm_max_asid; + +static void *hma_svm_hsave_page[NCPU]; +static uintptr_t hma_svm_hsave_pa[NCPU]; + +static hma_svm_asid_t hma_svm_cpu_asid[NCPU]; + + +static int hma_vmx_init(void); +static int hma_svm_init(void); + +/* Helpers from ml/hma_asm.s */ +int hma_vmx_do_invept(int, uintptr_t); +int hma_vmx_vmxon(uintptr_t); + +void +hma_init(void) +{ + mutex_init(&hma_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&hma_registrations, sizeof (struct hma_reg), + offsetof(struct hma_reg, hr_node)); + + switch (cpuid_getvendor(CPU)) { + case X86_VENDOR_Intel: + (void) hma_vmx_init(); + break; + case X86_VENDOR_AMD: + (void) hma_svm_init(); + break; + default: + break; + } +} + +hma_reg_t * +hma_register(const char *name) +{ + struct hma_reg *reg; + boolean_t is_ready; + + VERIFY(name != NULL); + + reg = kmem_zalloc(sizeof (*reg), KM_SLEEP); + reg->hr_name = name; + + mutex_enter(&hma_lock); + switch (cpuid_getvendor(CPU)) { + case X86_VENDOR_Intel: + is_ready = hma_vmx_ready; + break; + case X86_VENDOR_AMD: + is_ready = hma_svm_ready; + break; + default: + is_ready = B_FALSE; + break; + } + + if (!is_ready) { + kmem_free(reg, sizeof (*reg)); + reg = NULL; + } else { + list_insert_tail(&hma_registrations, reg); + } + mutex_exit(&hma_lock); + + return (reg); +} + +void +hma_unregister(hma_reg_t *reg) +{ + VERIFY(reg != NULL); + VERIFY(!list_is_empty(&hma_registrations)); + + mutex_enter(&hma_lock); + list_remove(&hma_registrations, reg); + mutex_exit(&hma_lock); + kmem_free(reg, sizeof (*reg)); +} + +/* + * VPID 0 is reserved for instances where VPID is disabled. Some hypervisors + * (read: bhyve) reserve lower-order VPIDs for use in fallback behavior if + * unique VPIDs could not be allocated for all the vCPUs belonging to a VM. 
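+ *
+ * To stay clear of any such fallback ranges, the id space created in
+ * hma_vmx_init() only hands out VPIDs above HMA_VPID_RESERVED (NCPU).
+ * A hypervisor is expected to treat a 0 result from hma_vmx_vpid_alloc()
+ * as "no unique VPID available" and fall back accordingly.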
+ */ +#define HMA_VPID_RESERVED NCPU + +uint16_t +hma_vmx_vpid_alloc(void) +{ + id_t res; + + /* Do not bother if the CPU lacks support */ + if ((hma_vmx_capabs & HVC_VPID) == 0) { + return (0); + } + + res = id_alloc_nosleep(hma_vmx_vpid); + if (res == -1) { + return (0); + } else { + ASSERT(res > HMA_VPID_RESERVED && res <= UINT16_MAX); + return (res); + } +} + +void +hma_vmx_vpid_free(uint16_t vpid) +{ + VERIFY(vpid > HMA_VPID_RESERVED); + id_free(hma_vmx_vpid, (id_t)vpid); +} + +#define INVEPT_SINGLE_CONTEXT 1 +#define INVEPT_ALL_CONTEXTS 2 + +static int +hma_vmx_invept_xcall(xc_arg_t arg1, xc_arg_t arg2, xc_arg_t arg3 __unused) +{ + int flag = (int)arg1; + uintptr_t eptp = (uintptr_t)arg2; + + ASSERT(flag == INVEPT_SINGLE_CONTEXT || flag == INVEPT_ALL_CONTEXTS); + + VERIFY0(hma_vmx_do_invept(flag, eptp)); + return (0); +} + +void +hma_vmx_invept_allcpus(uintptr_t eptp) +{ + int flag = -1; + cpuset_t set; + + if ((hma_vmx_capabs & HVC_INVEPT_ONE) != 0) { + flag = INVEPT_SINGLE_CONTEXT; + } else if ((hma_vmx_capabs & HVC_INVEPT_ALL) != 0) { + flag = INVEPT_ALL_CONTEXTS; + eptp = 0; + } else { + return; + } + + cpuset_zero(&set); + mutex_enter(&cpu_lock); + + cpuset_or(&set, &cpu_active_set); + xc_call((xc_arg_t)flag, (xc_arg_t)eptp, 0, CPUSET2BV(set), + hma_vmx_invept_xcall); + + mutex_exit(&cpu_lock); +} + +static int +hma_vmx_cpu_vmxon(xc_arg_t arg1 __unused, xc_arg_t arg2 __unused, + xc_arg_t arg3 __unused) +{ + uint64_t fctrl; + processorid_t id = CPU->cpu_seqid; + void *vmxon_region = hma_vmx_vmxon_page[id]; + uintptr_t vmxon_pa = hma_vmx_vmxon_pa[id]; + + VERIFY(vmxon_region != NULL && vmxon_pa != 0); + + /* + * Ensure that the VMX support and lock bits are enabled in the + * feature-control MSR. + */ + fctrl = rdmsr(MSR_IA32_FEAT_CTRL); + if ((fctrl & IA32_FEAT_CTRL_LOCK) == 0 || + (fctrl & IA32_FEAT_CTRL_VMX_EN) == 0) { + fctrl = fctrl | IA32_FEAT_CTRL_VMX_EN | IA32_FEAT_CTRL_LOCK; + wrmsr(MSR_IA32_FEAT_CTRL, fctrl); + } + + setcr4(getcr4() | CR4_VMXE); + + if (hma_vmx_vmxon(vmxon_pa) == 0) { + hma_cpu_status[id] = HCS_READY; + } else { + hma_cpu_status[id] = HCS_ERROR; + + /* + * If VMX has already been marked active and available for the + * system, then failure to perform VMXON on a newly-onlined CPU + * represents a fatal problem. Continuing on would mean + * failure for any hypervisor thread which landed here. + */ + if (hma_vmx_ready) { + panic("VMXON failure after VMX marked ready"); + } + } + return (0); +} + +static int +hma_vmx_cpu_setup(cpu_setup_t what, int id, void *arg __unused) +{ + hma_cpu_state_t state; + + ASSERT(MUTEX_HELD(&cpu_lock)); + ASSERT(id >= 0 && id < NCPU); + + if (what != CPU_ON) { + /* + * For the purposes of VMX setup, only the CPU_ON event is of + * interest. Letting VMX state linger on an offline CPU should + * not cause any harm. + * + * This logic assumes that any offlining activity is strictly + * administrative in nature and will not alter any existing + * configuration (such as %cr4 bits previously set). 
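+	 *
+	 * A CPU which is offlined and later onlined again therefore still
+	 * has CR4_VMXE set and remains in HCS_READY state; the re-online
+	 * path below only needs to invalidate stale EPT state via invept.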
+ */ + return (0); + } + + state = hma_cpu_status[id]; + if (state == HCS_ERROR) { + return (-1); + } + + /* Allocate the VMXON page for this CPU, if not already done */ + if (hma_vmx_vmxon_page[id] == NULL) { + caddr_t va; + pfn_t pfn; + + va = kmem_alloc(PAGESIZE, KM_SLEEP); + VERIFY0((uintptr_t)va & PAGEOFFSET); + hma_vmx_vmxon_page[id] = va; + + /* Initialize the VMX revision field as expected */ + bcopy(&hma_vmx_revision, va, sizeof (hma_vmx_revision)); + + /* + * Cache the physical address of the VMXON page rather than + * looking it up later when the potential blocking of + * hat_getpfnum would be less acceptable. + */ + pfn = hat_getpfnum(kas.a_hat, va); + hma_vmx_vmxon_pa[id] = (pfn << PAGESHIFT); + } else { + VERIFY(hma_vmx_vmxon_pa[id] != 0); + } + + if (state == HCS_UNINITIALIZED) { + cpuset_t set; + + /* Activate VMX on this CPU */ + cpuset_zero(&set); + cpuset_add(&set, id); + xc_call(0, 0, 0, CPUSET2BV(set), hma_vmx_cpu_vmxon); + } else { + VERIFY3U(state, ==, HCS_READY); + + /* + * If an already-initialized CPU is going back online, perform + * an all-contexts invept to eliminate the possibility of + * cached EPT state causing issues. + */ + if ((hma_vmx_capabs & HVC_INVEPT_ALL) != 0) { + cpuset_t set; + + cpuset_zero(&set); + cpuset_add(&set, id); + xc_call((xc_arg_t)INVEPT_ALL_CONTEXTS, 0, 0, + CPUSET2BV(set), hma_vmx_invept_xcall); + } + } + + return (hma_cpu_status[id] != HCS_READY); +} + +/* + * Determining the availability of VM execution controls is somewhat different + * from conventional means, where one simply checks for asserted bits in the + * MSR value. Instead, these execution control MSRs are split into two halves: + * the lower 32-bits indicating capabilities which can be zeroed in the VMCS + * field and the upper 32-bits indicating capabilities which can be set to one. + * + * It is described in detail in Appendix A.3 of SDM volume 3. + */ +#define VMX_CTL_ONE_SETTING(val, flag) \ + (((val) & ((uint64_t)(flag) << 32)) != 0) + +static const char * +hma_vmx_query_details(void) +{ + boolean_t query_true_ctl = B_FALSE; + uint64_t msr; + + /* The basic INS/OUTS functionality is cited as a necessary prereq */ + msr = rdmsr(MSR_IA32_VMX_BASIC); + if ((msr & IA32_VMX_BASIC_INS_OUTS) == 0) { + return ("VMX does not support INS/OUTS"); + } + + /* Record the VMX revision for later VMXON usage */ + hma_vmx_revision = (uint32_t)msr; + + /* + * Bit 55 in the VMX_BASIC MSR determines how VMX control information + * can be queried. + */ + query_true_ctl = (msr & IA32_VMX_BASIC_TRUE_CTRLS) != 0; + + /* Check for EPT and VPID support */ + msr = rdmsr(query_true_ctl ? 
+ MSR_IA32_VMX_TRUE_PROCBASED_CTLS : MSR_IA32_VMX_PROCBASED_CTLS); + if (VMX_CTL_ONE_SETTING(msr, IA32_VMX_PROCBASED_2ND_CTLS)) { + msr = rdmsr(MSR_IA32_VMX_PROCBASED2_CTLS); + if (VMX_CTL_ONE_SETTING(msr, IA32_VMX_PROCBASED2_EPT)) { + hma_vmx_capabs |= HVC_EPT; + } + if (VMX_CTL_ONE_SETTING(msr, IA32_VMX_PROCBASED2_VPID)) { + hma_vmx_capabs |= HVC_VPID; + } + } + + /* Check for INVEPT support */ + if ((hma_vmx_capabs & HVC_EPT) != 0) { + msr = rdmsr(MSR_IA32_VMX_EPT_VPID_CAP); + if ((msr & IA32_VMX_EPT_VPID_INVEPT) != 0) { + if ((msr & IA32_VMX_EPT_VPID_INVEPT_SINGLE) != 0) { + hma_vmx_capabs |= HVC_INVEPT_ONE; + } + if ((msr & IA32_VMX_EPT_VPID_INVEPT_ALL) != 0) { + hma_vmx_capabs |= HVC_INVEPT_ALL; + } + } + } + + return (NULL); +} + +static int +hma_vmx_init(void) +{ + cpu_t *cp; + uint64_t msr; + int err = 0; + const char *msg = NULL; + + if (!is_x86_feature(x86_featureset, X86FSET_VMX)) { + msg = "CPU does not support VMX"; + goto bail; + } + + /* Has the BIOS set the feature-control lock bit without VMX enabled? */ + msr = rdmsr(MSR_IA32_FEAT_CTRL); + if ((msr & IA32_FEAT_CTRL_LOCK) != 0 && + (msr & IA32_FEAT_CTRL_VMX_EN) == 0) { + msg = "VMX support disabled by BIOS"; + goto bail; + } + + msg = hma_vmx_query_details(); + if (msg != NULL) { + goto bail; + } + + mutex_enter(&cpu_lock); + /* Perform VMX configuration for already-online CPUs. */ + cp = cpu_active; + do { + err = hma_vmx_cpu_setup(CPU_ON, cp->cpu_seqid, NULL); + if (err != 0) { + msg = "failure during VMXON setup"; + mutex_exit(&cpu_lock); + goto bail; + } + } while ((cp = cp->cpu_next_onln) != cpu_active); + + /* + * Register callback for later-onlined CPUs and perform other remaining + * resource allocation. + */ + register_cpu_setup_func(hma_vmx_cpu_setup, NULL); + mutex_exit(&cpu_lock); + + hma_vmx_vpid = id_space_create("hma_vmx_vpid", HMA_VPID_RESERVED + 1, + UINT16_MAX); + hma_vmx_ready = B_TRUE; + + return (0); + +bail: + hma_vmx_error = msg; + cmn_err(CE_NOTE, "hma_vmx_init: %s", msg); + return (-1); +} + +#define VMCB_FLUSH_NOTHING 0x0 +#define VMCB_FLUSH_ALL 0x1 +#define VMCB_FLUSH_ASID 0x3 + +void +hma_svm_asid_init(hma_svm_asid_t *vcp) +{ + /* + * Initialize the generation to 0, forcing an ASID allocation on first + * entry. Leave the ASID at 0, so if the host forgoes the call to + * hma_svm_asid_update(), SVM will bail on the invalid vcpu state. + */ + vcp->hsa_gen = 0; + vcp->hsa_asid = 0; +} + +uint8_t +hma_svm_asid_update(hma_svm_asid_t *vcp, boolean_t flush_by_asid, + boolean_t npt_flush) +{ + hma_svm_asid_t *hcp = &hma_svm_cpu_asid[CPU->cpu_seqid]; + + ASSERT(curthread->t_preempt != 0); + + /* + * If NPT changes dictate a TLB flush and by-ASID flushing is not + * supported/used, force a fresh ASID allocation. + */ + if (npt_flush && !flush_by_asid) { + vcp->hsa_gen = 0; + } + + if (vcp->hsa_gen != hcp->hsa_gen) { + hcp->hsa_asid++; + + if (hcp->hsa_asid >= hma_svm_max_asid) { + /* Keep the ASID properly constrained */ + hcp->hsa_asid = 1; + hcp->hsa_gen++; + if (hcp->hsa_gen == 0) { + /* + * Stay clear of the '0' sentinel value for + * generation, if wrapping around. 
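+				 * (A generation of 0 is also what
+				 * hma_svm_asid_init() seeds into each
+				 * vCPU, so a wrapped counter left at 0
+				 * could falsely match a never-updated
+				 * vCPU.)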
+ */ + hcp->hsa_gen = 1; + } + } + vcp->hsa_gen = hcp->hsa_gen; + vcp->hsa_asid = hcp->hsa_asid; + + ASSERT(vcp->hsa_asid != 0); + ASSERT3U(vcp->hsa_asid, <, hma_svm_max_asid); + + if (flush_by_asid) { + return (VMCB_FLUSH_ASID); + } + return (VMCB_FLUSH_ALL); + } else if (npt_flush) { + ASSERT(flush_by_asid); + return (VMCB_FLUSH_ASID); + } + return (VMCB_FLUSH_NOTHING); +} + +static int +hma_svm_cpu_activate(xc_arg_t arg1 __unused, xc_arg_t arg2 __unused, + xc_arg_t arg3 __unused) +{ + const processorid_t id = CPU->cpu_seqid; + const uintptr_t hsave_pa = hma_svm_hsave_pa[id]; + uint64_t efer; + + VERIFY(hsave_pa != 0); + + /* Enable SVM via EFER */ + efer = rdmsr(MSR_AMD_EFER); + efer |= AMD_EFER_SVME; + wrmsr(MSR_AMD_EFER, efer); + + /* Setup hsave area */ + wrmsr(MSR_AMD_VM_HSAVE_PA, hsave_pa); + + hma_cpu_status[id] = HCS_READY; + return (0); +} + +static int +hma_svm_cpu_setup(cpu_setup_t what, int id, void *arg __unused) +{ + ASSERT(MUTEX_HELD(&cpu_lock)); + ASSERT(id >= 0 && id < NCPU); + + switch (what) { + case CPU_CONFIG: + case CPU_ON: + case CPU_INIT: + break; + default: + /* + * Other events, such as CPU offlining, are of no interest. + * Letting the SVM state linger should not cause any harm. + * + * This logic assumes that any offlining activity is strictly + * administrative in nature and will not alter any existing + * configuration (such as EFER bits previously set). + */ + return (0); + } + + /* Perform initialization if it has not been previously attempted. */ + if (hma_cpu_status[id] != HCS_UNINITIALIZED) { + return ((hma_cpu_status[id] == HCS_READY) ? 0 : -1); + } + + /* Allocate the hsave page for this CPU */ + if (hma_svm_hsave_page[id] == NULL) { + caddr_t va; + pfn_t pfn; + + va = kmem_alloc(PAGESIZE, KM_SLEEP); + VERIFY0((uintptr_t)va & PAGEOFFSET); + hma_svm_hsave_page[id] = va; + + /* + * Cache the physical address of the hsave page rather than + * looking it up later when the potential blocking of + * hat_getpfnum would be less acceptable. + */ + pfn = hat_getpfnum(kas.a_hat, va); + hma_svm_hsave_pa[id] = (pfn << PAGESHIFT); + } else { + VERIFY(hma_svm_hsave_pa[id] != 0); + } + + kpreempt_disable(); + if (CPU->cpu_seqid == id) { + /* Perform svm setup directly if this CPU is the target */ + (void) hma_svm_cpu_activate(0, 0, 0); + kpreempt_enable(); + } else { + cpuset_t set; + + /* Use a cross-call if a remote CPU is the target */ + kpreempt_enable(); + cpuset_zero(&set); + cpuset_add(&set, id); + xc_call(0, 0, 0, CPUSET2BV(set), hma_svm_cpu_activate); + } + + return (hma_cpu_status[id] != HCS_READY); +} + +static int +hma_svm_init(void) +{ + uint64_t msr; + const char *msg = NULL; + struct cpuid_regs regs; + cpu_t *cp; + + if (!is_x86_feature(x86_featureset, X86FSET_SVM)) { + msg = "CPU does not support SVM"; + goto bail; + } + + msr = rdmsr(MSR_AMD_VM_CR); + if ((msr & AMD_VM_CR_SVMDIS) != 0) { + msg = "SVM disabled by BIOS"; + goto bail; + } + + regs.cp_eax = 0x8000000a; + (void) cpuid_insn(NULL, ®s); + const uint32_t nasid = regs.cp_ebx; + const uint32_t feat = regs.cp_edx; + + if (nasid == 0) { + msg = "Not enough ASIDs for guests"; + goto bail; + } + if ((feat & CPUID_AMD_EDX_NESTED_PAGING) == 0) { + msg = "CPU does not support nested paging"; + goto bail; + } + if ((feat & CPUID_AMD_EDX_NRIPS) == 0) { + msg = "CPU does not support NRIP save"; + goto bail; + } + + hma_svm_features = feat; + hma_svm_max_asid = nasid; + + mutex_enter(&cpu_lock); + /* Perform SVM configuration for already-online CPUs. 
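+	 * A failure for any of them aborts SVM initialization below.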
*/ + cp = cpu_active; + do { + int err = hma_svm_cpu_setup(CPU_ON, cp->cpu_seqid, NULL); + if (err != 0) { + msg = "failure during SVM setup"; + mutex_exit(&cpu_lock); + goto bail; + } + } while ((cp = cp->cpu_next_onln) != cpu_active); + + /* + * Register callback for later-onlined CPUs and perform other remaining + * resource allocation. + */ + register_cpu_setup_func(hma_svm_cpu_setup, NULL); + mutex_exit(&cpu_lock); + + /* Initialize per-CPU ASID state. */ + for (uint_t i = 0; i < NCPU; i++) { + /* + * Skip past sentinel 0 value for generation. Doing so for + * ASID is unneeded, since it will be incremented during the + * first allocation. + */ + hma_svm_cpu_asid[i].hsa_gen = 1; + hma_svm_cpu_asid[i].hsa_asid = 0; + } + + hma_svm_ready = B_TRUE; + return (0); + +bail: + hma_svm_error = msg; + cmn_err(CE_NOTE, "hma_svm_init: %s", msg); + return (-1); +} diff --git a/usr/src/uts/i86pc/os/mp_startup.c b/usr/src/uts/i86pc/os/mp_startup.c index 61bcf9ef05..ffc8ee84aa 100644 --- a/usr/src/uts/i86pc/os/mp_startup.c +++ b/usr/src/uts/i86pc/os/mp_startup.c @@ -77,6 +77,8 @@ #include #if defined(__xpv) #include +#else +#include #endif #include #include @@ -1615,6 +1617,14 @@ done: workaround_errata_end(); cmi_post_mpstartup(); +#if !defined(__xpv) + /* + * Once other CPUs have completed startup procedures, perform + * initialization of hypervisor resources for HMA. + */ + hma_init(); +#endif + if (use_mp && ncpus != boot_max_ncpus) { cmn_err(CE_NOTE, "System detected %d cpus, but " diff --git a/usr/src/uts/i86pc/sys/hma.h b/usr/src/uts/i86pc/sys/hma.h index 00009cf439..688f09bfb7 100644 --- a/usr/src/uts/i86pc/sys/hma.h +++ b/usr/src/uts/i86pc/sys/hma.h @@ -10,7 +10,7 @@ */ /* - * Copyright (c) 2018, Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ #ifndef _SYS_HMA_H @@ -30,6 +30,39 @@ extern "C" { #endif + +/* + * Register a hypervisor with HMA. On success, a pointer to the opaque + * registration token will be returned, indicating that proper host setup has + * occurred for further hypervisor actions. + */ +typedef struct hma_reg hma_reg_t; +extern hma_reg_t *hma_register(const char *); +extern void hma_unregister(hma_reg_t *); + +/* + * Allocate or free a VPID for use with VMX. + * + * This must not be performed by a hypervisor until it has successfully + * registered via hma_register(). + */ +extern uint16_t hma_vmx_vpid_alloc(void); +extern void hma_vmx_vpid_free(uint16_t); + +/* + * On all active CPUs, perform a single-context INVEPT on the given EPTP. + */ +extern void hma_vmx_invept_allcpus(uintptr_t); + +struct hma_svm_asid { + uint64_t hsa_gen; + uint32_t hsa_asid; +}; +typedef struct hma_svm_asid hma_svm_asid_t; + +extern void hma_svm_asid_init(hma_svm_asid_t *); +extern uint8_t hma_svm_asid_update(hma_svm_asid_t *, boolean_t, boolean_t); + /* * FPU related management. These functions provide a set of APIs to manage the * FPU state and switch between host and guest management of this state. @@ -96,6 +129,9 @@ extern void hma_fpu_stop_guest(hma_fpu_t *); extern void hma_fpu_get_fxsave_state(const hma_fpu_t *, struct fxsave_state *); extern int hma_fpu_set_fxsave_state(hma_fpu_t *, const struct fxsave_state *); +/* Perform HMA initialization steps during boot-up. 
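+ * It is called once from mp_startup, after the other CPUs have completed
+ * their startup procedures; see the i86pc/os/mp_startup.c change above.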
*/ +extern void hma_init(void); + #ifdef __cplusplus } #endif -- cgit v1.2.3 From 454f0c49f9d5b08ab88fe3db5788d9e5e6a7cf0f Mon Sep 17 00:00:00 2001 From: Andy Fiddaman Date: Thu, 7 Feb 2019 00:02:27 +0000 Subject: 12529 want exclusive hma registration Reviewed by: Patrick Mooney Reviewed by: Jerry Jelinek Reviewed by: Toomas Soome Approved by: Dan McDonald --- usr/src/pkg/manifests/system-header.mf | 1 + usr/src/uts/i86pc/os/hma.c | 59 +++++++++++++++++++++++++++------- usr/src/uts/i86pc/sys/Makefile | 1 + usr/src/uts/i86pc/sys/hma.h | 1 + 4 files changed, 50 insertions(+), 12 deletions(-) (limited to 'usr/src/uts/i86pc') diff --git a/usr/src/pkg/manifests/system-header.mf b/usr/src/pkg/manifests/system-header.mf index bb961d21ce..0a3e045e1a 100644 --- a/usr/src/pkg/manifests/system-header.mf +++ b/usr/src/pkg/manifests/system-header.mf @@ -1677,6 +1677,7 @@ $(i386_ONLY)file path=usr/platform/i86pc/include/sys/cram.h $(i386_ONLY)file path=usr/platform/i86pc/include/sys/ddi_subrdefs.h $(i386_ONLY)file path=usr/platform/i86pc/include/sys/debug_info.h $(i386_ONLY)file path=usr/platform/i86pc/include/sys/fastboot.h +$(i386_ONLY)file path=usr/platform/i86pc/include/sys/hma.h $(i386_ONLY)file path=usr/platform/i86pc/include/sys/mach_mmu.h $(i386_ONLY)file path=usr/platform/i86pc/include/sys/machclock.h $(i386_ONLY)file path=usr/platform/i86pc/include/sys/machcpuvar.h diff --git a/usr/src/uts/i86pc/os/hma.c b/usr/src/uts/i86pc/os/hma.c index 9678894da0..a41ff3e0d1 100644 --- a/usr/src/uts/i86pc/os/hma.c +++ b/usr/src/uts/i86pc/os/hma.c @@ -32,6 +32,7 @@ struct hma_reg { static kmutex_t hma_lock; static list_t hma_registrations; +static boolean_t hma_exclusive = B_FALSE; static boolean_t hma_vmx_ready = B_FALSE; static const char *hma_vmx_error = NULL; @@ -100,18 +101,14 @@ hma_init(void) } } -hma_reg_t * -hma_register(const char *name) +static hma_reg_t * +hma_register_backend(const char *name) { struct hma_reg *reg; boolean_t is_ready; - VERIFY(name != NULL); - - reg = kmem_zalloc(sizeof (*reg), KM_SLEEP); - reg->hr_name = name; + ASSERT(MUTEX_HELD(&hma_lock)); - mutex_enter(&hma_lock); switch (cpuid_getvendor(CPU)) { case X86_VENDOR_Intel: is_ready = hma_vmx_ready; @@ -124,12 +121,48 @@ hma_register(const char *name) break; } - if (!is_ready) { - kmem_free(reg, sizeof (*reg)); - reg = NULL; - } else { - list_insert_tail(&hma_registrations, reg); + if (!is_ready) + return (NULL); + + reg = kmem_zalloc(sizeof (*reg), KM_SLEEP); + reg->hr_name = name; + list_insert_tail(&hma_registrations, reg); + + return (reg); +} + +hma_reg_t * +hma_register(const char *name) +{ + struct hma_reg *reg = NULL; + + VERIFY(name != NULL); + + mutex_enter(&hma_lock); + + if (!hma_exclusive) + reg = hma_register_backend(name); + + mutex_exit(&hma_lock); + + return (reg); +} + +hma_reg_t * +hma_register_exclusive(const char *name) +{ + struct hma_reg *reg = NULL; + + VERIFY(name != NULL); + + mutex_enter(&hma_lock); + + if (list_is_empty(&hma_registrations)) { + reg = hma_register_backend(name); + if (reg != NULL) + hma_exclusive = B_TRUE; } + mutex_exit(&hma_lock); return (reg); @@ -143,6 +176,8 @@ hma_unregister(hma_reg_t *reg) mutex_enter(&hma_lock); list_remove(&hma_registrations, reg); + if (hma_exclusive && list_is_empty(&hma_registrations)) + hma_exclusive = B_FALSE; mutex_exit(&hma_lock); kmem_free(reg, sizeof (*reg)); } diff --git a/usr/src/uts/i86pc/sys/Makefile b/usr/src/uts/i86pc/sys/Makefile index 292cd04c2b..8eff4c905c 100644 --- a/usr/src/uts/i86pc/sys/Makefile +++ b/usr/src/uts/i86pc/sys/Makefile @@ 
-46,6 +46,7 @@ HDRS= \ ddi_subrdefs.h \ debug_info.h \ fastboot.h \ + hma.h \ mach_mmu.h \ machclock.h \ machcpuvar.h \ diff --git a/usr/src/uts/i86pc/sys/hma.h b/usr/src/uts/i86pc/sys/hma.h index 688f09bfb7..16ab708896 100644 --- a/usr/src/uts/i86pc/sys/hma.h +++ b/usr/src/uts/i86pc/sys/hma.h @@ -38,6 +38,7 @@ extern "C" { */ typedef struct hma_reg hma_reg_t; extern hma_reg_t *hma_register(const char *); +extern hma_reg_t *hma_register_exclusive(const char *); extern void hma_unregister(hma_reg_t *); /* -- cgit v1.2.3 From 04909c8c9ef61a86dd44bdaf341a1d9a2f0206e5 Mon Sep 17 00:00:00 2001 From: John Levon Date: Fri, 2 Mar 2018 17:34:28 +0000 Subject: 12608 want memory arena for vmm applications 12609 x86 memory DR should be disabled Reviewed by: Jerry Jelinek Reviewed by: Patrick Mooney Reviewed by: Toomas Soome Approved by: Gordon Ross --- usr/src/cmd/mdb/common/modules/genunix/memory.c | 35 ++-- usr/src/uts/common/sys/vnode.h | 3 +- usr/src/uts/common/vm/page_lock.c | 9 +- usr/src/uts/common/vm/page_retire.c | 7 +- usr/src/uts/common/vm/seg_kmem.c | 75 +++++---- usr/src/uts/common/vm/seg_kmem.h | 18 +- usr/src/uts/i86pc/dboot/dboot_startkern.c | 8 +- usr/src/uts/i86pc/os/ddi_impl.c | 5 +- usr/src/uts/i86pc/os/startup.c | 213 ++++++++++-------------- usr/src/uts/i86pc/sys/machparam.h | 190 ++++++--------------- usr/src/uts/i86pc/vm/seg_vmm.c | 7 +- usr/src/uts/sun4/os/startup.c | 3 +- 12 files changed, 235 insertions(+), 338 deletions(-) (limited to 'usr/src/uts/i86pc') diff --git a/usr/src/cmd/mdb/common/modules/genunix/memory.c b/usr/src/cmd/mdb/common/modules/genunix/memory.c index 3810c9e506..d5ffa1537a 100644 --- a/usr/src/cmd/mdb/common/modules/genunix/memory.c +++ b/usr/src/cmd/mdb/common/modules/genunix/memory.c @@ -443,11 +443,11 @@ vn_get(vn_htable_t *hp, struct vnode *vp, uintptr_t ptr) /* Summary statistics of pages */ typedef struct memstat { - struct vnode *ms_kvp; /* Cached address of kernel vnode */ struct vnode *ms_unused_vp; /* Unused pages vnode pointer */ - struct vnode *ms_zvp; /* Cached address of zio vnode */ + struct vnode *ms_kvps; /* Cached address of vnode array */ uint64_t ms_kmem; /* Pages of kernel memory */ uint64_t ms_zfs_data; /* Pages of zfs data */ + uint64_t ms_vmm_mem; /* Pages of VMM mem */ uint64_t ms_anon; /* Pages of anonymous memory */ uint64_t ms_vnode; /* Pages of named (vnode) memory */ uint64_t ms_exec; /* Pages of exec/library memory */ @@ -458,11 +458,8 @@ typedef struct memstat { struct vnode ms_vn; /* vnode buffer */ } memstat_t; -#define MS_PP_ISKAS(pp, stats) \ - ((pp)->p_vnode == (stats)->ms_kvp) - -#define MS_PP_ISZFS_DATA(pp, stats) \ - (((stats)->ms_zvp != NULL) && ((pp)->p_vnode == (stats)->ms_zvp)) +#define MS_PP_ISTYPE(pp, stats, index) \ + ((pp)->p_vnode == &(stats->ms_kvps[index])) /* * Summarize pages by type and update stat information @@ -478,10 +475,12 @@ memstat_callback(page_t *page, page_t *pp, memstat_t *stats) stats->ms_bootpages++; else if (pp->p_vnode == NULL || pp->p_vnode == stats->ms_unused_vp) return (WALK_NEXT); - else if (MS_PP_ISKAS(pp, stats)) + else if (MS_PP_ISTYPE(pp, stats, KV_KVP)) stats->ms_kmem++; - else if (MS_PP_ISZFS_DATA(pp, stats)) + else if (MS_PP_ISTYPE(pp, stats, KV_ZVP)) stats->ms_zfs_data++; + else if (MS_PP_ISTYPE(pp, stats, KV_VVP)) + stats->ms_vmm_mem++; else if (PP_ISFREE(pp)) stats->ms_cachelist++; else if (vn_get(stats->ms_vn_htable, vp, (uintptr_t)pp->p_vnode)) @@ -507,7 +506,6 @@ memstat(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) memstat_t stats; GElf_Sym sym; 
vn_htable_t ht; - struct vnode *kvps; uintptr_t vn_size = 0; #if defined(__i386) || defined(__amd64) bln_stats_t bln_stats; @@ -548,16 +546,10 @@ memstat(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) /* read kernel vnode array pointer */ if (mdb_lookup_by_obj(MDB_OBJ_EXEC, "kvps", (GElf_Sym *)&sym) == -1) { - mdb_warn("unable to read kvps"); + mdb_warn("unable to look up kvps"); return (DCMD_ERR); } - kvps = (struct vnode *)(uintptr_t)sym.st_value; - stats.ms_kvp = &kvps[KV_KVP]; - - /* - * Read the zio vnode pointer. - */ - stats.ms_zvp = &kvps[KV_ZVP]; + stats.ms_kvps = (struct vnode *)(uintptr_t)sym.st_value; /* * If physmem != total_pages, then the administrator has limited the @@ -605,6 +597,13 @@ memstat(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) MS_PCT_TOTAL(stats.ms_zfs_data)); } + if (stats.ms_vmm_mem != 0) { + mdb_printf("VMM Memory %16llu %16llu %3lu%%\n", + stats.ms_vmm_mem, + (uint64_t)stats.ms_vmm_mem * PAGESIZE / (1024 * 1024), + MS_PCT_TOTAL(stats.ms_vmm_mem)); + } + mdb_printf("Anon %16llu %16llu %3lu%%\n", stats.ms_anon, (uint64_t)stats.ms_anon * PAGESIZE / (1024 * 1024), diff --git a/usr/src/uts/common/sys/vnode.h b/usr/src/uts/common/sys/vnode.h index b48db0afd6..b8702bc8f5 100644 --- a/usr/src/uts/common/sys/vnode.h +++ b/usr/src/uts/common/sys/vnode.h @@ -21,7 +21,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2017, Joyent, Inc. + * Copyright (c) 2018, Joyent, Inc. * Copyright (c) 2011, 2017 by Delphix. All rights reserved. * Copyright 2017 RackTop Systems. */ @@ -1489,6 +1489,7 @@ extern struct vnode kvps[]; typedef enum { KV_KVP, /* vnode for all segkmem pages */ KV_ZVP, /* vnode for all ZFS pages */ + KV_VVP, /* vnode for all VMM pages */ #if defined(__sparc) KV_MPVP, /* vnode for all page_t meta-pages */ KV_PROMVP, /* vnode for all PROM pages */ diff --git a/usr/src/uts/common/vm/page_lock.c b/usr/src/uts/common/vm/page_lock.c index 2a9b2e1f5a..a5a39d04c1 100644 --- a/usr/src/uts/common/vm/page_lock.c +++ b/usr/src/uts/common/vm/page_lock.c @@ -141,9 +141,8 @@ static pad_mutex_t pszc_mutex[PSZC_MTX_TABLE_SIZE]; & (VPH_TABLE_SIZE - 1)) /* - * Two slots after VPH_TABLE_SIZE are reserved in vph_mutex for kernel vnodes. - * The lock for kvp is VPH_TABLE_SIZE + 0, and the lock for zvp is - * VPH_TABLE_SIZE + 1. + * Two slots after VPH_TABLE_SIZE are reserved in vph_mutex for kernel vnodes, + * one for kvps[KV_ZVP], and one for other kvps[] users. */ kmutex_t vph_mutex[VPH_TABLE_SIZE + 2]; @@ -880,10 +879,10 @@ static int page_vnode_mutex_stress = 0; kmutex_t * page_vnode_mutex(vnode_t *vp) { - if (vp == &kvp) + if (vp == &kvp || vp == &kvps[KV_VVP]) return (&vph_mutex[VPH_TABLE_SIZE + 0]); - if (vp == &zvp) + if (vp == &kvps[KV_ZVP]) return (&vph_mutex[VPH_TABLE_SIZE + 1]); #ifdef DEBUG if (page_vnode_mutex_stress != 0) diff --git a/usr/src/uts/common/vm/page_retire.c b/usr/src/uts/common/vm/page_retire.c index 76be970a45..f4e8d0737f 100644 --- a/usr/src/uts/common/vm/page_retire.c +++ b/usr/src/uts/common/vm/page_retire.c @@ -22,6 +22,7 @@ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright 2018 Joyent, Inc. 
*/ /* @@ -851,9 +852,8 @@ page_retire_incr_pend_count(void *datap) { PR_INCR_KSTAT(pr_pending); - if ((datap == &kvp) || (datap == &zvp)) { + if (datap == &kvp || datap == &kvps[KV_ZVP] || datap == &kvps[KV_VVP]) PR_INCR_KSTAT(pr_pending_kas); - } } void @@ -861,9 +861,8 @@ page_retire_decr_pend_count(void *datap) { PR_DECR_KSTAT(pr_pending); - if ((datap == &kvp) || (datap == &zvp)) { + if (datap == &kvp || datap == &kvps[KV_ZVP] || datap == &kvps[KV_VVP]) PR_DECR_KSTAT(pr_pending_kas); - } } /* diff --git a/usr/src/uts/common/vm/seg_kmem.c b/usr/src/uts/common/vm/seg_kmem.c index 3ed5660a9f..540f2b251b 100644 --- a/usr/src/uts/common/vm/seg_kmem.c +++ b/usr/src/uts/common/vm/seg_kmem.c @@ -122,6 +122,11 @@ vmem_t *static_alloc_arena; /* arena for allocating static memory */ vmem_t *zio_arena = NULL; /* arena for allocating zio memory */ vmem_t *zio_alloc_arena = NULL; /* arena for allocating zio memory */ +#if defined(__amd64) +vmem_t *kvmm_arena; /* arena for vmm VA */ +struct seg kvmmseg; /* Segment for vmm memory */ +#endif + /* * seg_kmem driver can map part of the kernel heap with large pages. * Currently this functionality is implemented for sparc platforms only. @@ -655,13 +660,19 @@ segkmem_dump(struct seg *seg) segkmem_dump_range, seg->s_as); vmem_walk(heaptext_arena, VMEM_ALLOC | VMEM_REENTRANT, segkmem_dump_range, seg->s_as); + /* + * We don't want to dump pages attached to kzioseg since they + * contain file data from ZFS. If this page's segment is + * kzioseg return instead of writing it to the dump device. + * + * Same applies to VM memory allocations. + */ } else if (seg == &kzioseg) { - /* - * We don't want to dump pages attached to kzioseg since they - * contain file data from ZFS. If this page's segment is - * kzioseg return instead of writing it to the dump device. - */ return; +#if defined(__amd64) + } else if (seg == &kvmmseg) { + return; +#endif } else { segkmem_dump_range(seg->s_as, seg->s_base, seg->s_size); } @@ -801,22 +812,19 @@ struct seg_ops segkmem_ops = { seg_inherit_notsup /* inherit */ }; -int -segkmem_zio_create(struct seg *seg) -{ - ASSERT(seg->s_as == &kas && RW_WRITE_HELD(&kas.a_lock)); - seg->s_ops = &segkmem_ops; - seg->s_data = &zvp; - kas.a_size += seg->s_size; - return (0); -} - int segkmem_create(struct seg *seg) { ASSERT(seg->s_as == &kas && RW_WRITE_HELD(&kas.a_lock)); seg->s_ops = &segkmem_ops; - seg->s_data = &kvp; + if (seg == &kzioseg) + seg->s_data = &kvps[KV_ZVP]; +#if defined(__amd64) + else if (seg == &kvmmseg) + seg->s_data = &kvps[KV_VVP]; +#endif + else + seg->s_data = &kvps[KV_KVP]; kas.a_size += seg->s_size; return (0); } @@ -967,10 +975,10 @@ segkmem_alloc(vmem_t *vmp, size_t size, int vmflag) return (segkmem_alloc_vn(vmp, size, vmflag, &kvp)); } -void * +static void * segkmem_zio_alloc(vmem_t *vmp, size_t size, int vmflag) { - return (segkmem_alloc_vn(vmp, size, vmflag, &zvp)); + return (segkmem_alloc_vn(vmp, size, vmflag, &kvps[KV_ZVP])); } /* @@ -979,8 +987,8 @@ segkmem_zio_alloc(vmem_t *vmp, size_t size, int vmflag) * we currently don't have a special kernel segment for non-paged * kernel memory that is exported by drivers to user space. 
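+ *
+ * Callers now name the backing vnode explicitly: segkmem_free() passes
+ * &kvp, while segkmem_zio_free() passes &kvps[KV_ZVP].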
*/ -static void -segkmem_free_vn(vmem_t *vmp, void *inaddr, size_t size, struct vnode *vp, +void +segkmem_xfree(vmem_t *vmp, void *inaddr, size_t size, struct vnode *vp, void (*func)(page_t *)) { page_t *pp; @@ -1036,22 +1044,16 @@ segkmem_free_vn(vmem_t *vmp, void *inaddr, size_t size, struct vnode *vp, } -void -segkmem_xfree(vmem_t *vmp, void *inaddr, size_t size, void (*func)(page_t *)) -{ - segkmem_free_vn(vmp, inaddr, size, &kvp, func); -} - void segkmem_free(vmem_t *vmp, void *inaddr, size_t size) { - segkmem_free_vn(vmp, inaddr, size, &kvp, NULL); + segkmem_xfree(vmp, inaddr, size, &kvp, NULL); } -void +static void segkmem_zio_free(vmem_t *vmp, void *inaddr, size_t size) { - segkmem_free_vn(vmp, inaddr, size, &zvp, NULL); + segkmem_xfree(vmp, inaddr, size, &kvps[KV_ZVP], NULL); } void @@ -1533,8 +1535,21 @@ segkmem_zio_init(void *zio_mem_base, size_t zio_mem_size) ASSERT(zio_alloc_arena != NULL); } -#ifdef __sparc +#if defined(__amd64) + +void +segkmem_kvmm_init(void *base, size_t size) +{ + ASSERT(base != NULL); + ASSERT(size != 0); + + kvmm_arena = vmem_create("kvmm_arena", base, size, 1024 * 1024, + NULL, NULL, NULL, 0, VM_SLEEP); + + ASSERT(kvmm_arena != NULL); +} +#elif defined(__sparc) static void * segkmem_alloc_ppa(vmem_t *vmp, size_t size, int vmflag) diff --git a/usr/src/uts/common/vm/seg_kmem.h b/usr/src/uts/common/vm/seg_kmem.h index 1db85826b1..9a20101670 100644 --- a/usr/src/uts/common/vm/seg_kmem.h +++ b/usr/src/uts/common/vm/seg_kmem.h @@ -21,7 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2016 Joyent, Inc. + * Copyright 2018 Joyent, Inc. * Copyright 2017 RackTop Systems. */ @@ -65,12 +65,18 @@ extern vmem_t *static_arena; /* arena for caches to import static memory */ extern vmem_t *static_alloc_arena; /* arena for allocating static memory */ extern vmem_t *zio_arena; /* arena for zio caches */ extern vmem_t *zio_alloc_arena; /* arena for zio caches */ + +#if defined(__amd64) +extern struct seg kvmmseg; /* Segment for vmm mappings */ +extern vmem_t *kvmm_arena; /* arena for vmm VA */ +extern void segkmem_kvmm_init(void *, size_t); +#endif + extern struct vnode kvps[]; /* - * segkmem page vnodes + * segkmem page vnodes (please don't add more defines here...) 
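+ * New consumers should index kvps[] directly instead, as the KV_ZVP
+ * and KV_VVP users now do.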
*/ #define kvp (kvps[KV_KVP]) -#define zvp (kvps[KV_ZVP]) #if defined(__sparc) #define mpvp (kvps[KV_MPVP]) #define promvp (kvps[KV_PROMVP]) @@ -83,16 +89,14 @@ extern void *segkmem_xalloc(vmem_t *, void *, size_t, int, uint_t, extern void *segkmem_alloc(vmem_t *, size_t, int); extern void *segkmem_alloc_permanent(vmem_t *, size_t, int); extern void segkmem_free(vmem_t *, void *, size_t); -extern void segkmem_xfree(vmem_t *, void *, size_t, void (*)(page_t *)); +extern void segkmem_xfree(vmem_t *, void *, size_t, + struct vnode *, void (*)(page_t *)); extern void *boot_alloc(void *, size_t, uint_t); extern void boot_mapin(caddr_t addr, size_t size); extern void kernelheap_init(void *, void *, char *, void *, void *); extern void segkmem_gc(void); -extern void *segkmem_zio_alloc(vmem_t *, size_t, int); -extern int segkmem_zio_create(struct seg *); -extern void segkmem_zio_free(vmem_t *, void *, size_t); extern void segkmem_zio_init(void *, size_t); /* diff --git a/usr/src/uts/i86pc/dboot/dboot_startkern.c b/usr/src/uts/i86pc/dboot/dboot_startkern.c index 13c6868cba..e6a1127941 100644 --- a/usr/src/uts/i86pc/dboot/dboot_startkern.c +++ b/usr/src/uts/i86pc/dboot/dboot_startkern.c @@ -2303,13 +2303,7 @@ startup_kernel(void) /* * Need correct target_kernel_text value */ -#if defined(_BOOT_TARGET_amd64) - target_kernel_text = KERNEL_TEXT_amd64; -#elif defined(__xpv) - target_kernel_text = KERNEL_TEXT_i386_xpv; -#else - target_kernel_text = KERNEL_TEXT_i386; -#endif + target_kernel_text = KERNEL_TEXT; DBG(target_kernel_text); #if defined(__xpv) diff --git a/usr/src/uts/i86pc/os/ddi_impl.c b/usr/src/uts/i86pc/os/ddi_impl.c index c86cfcdaeb..e5a2aad350 100644 --- a/usr/src/uts/i86pc/os/ddi_impl.c +++ b/usr/src/uts/i86pc/os/ddi_impl.c @@ -24,6 +24,7 @@ * Copyright 2012 Garrett D'Amore * Copyright 2014 Pluribus Networks, Inc. * Copyright 2016 Nexenta Systems, Inc. + * Copyright 2018 Joyent, Inc. 
*/ /* @@ -1010,10 +1011,10 @@ page_create_io_wrapper(void *addr, size_t len, int vmflag, void *arg) #ifdef __xpv static void -segkmem_free_io(vmem_t *vmp, void * ptr, size_t size) +segkmem_free_io(vmem_t *vmp, void *ptr, size_t size) { extern void page_destroy_io(page_t *); - segkmem_xfree(vmp, ptr, size, page_destroy_io); + segkmem_xfree(vmp, ptr, size, &kvp, page_destroy_io); } #endif diff --git a/usr/src/uts/i86pc/os/startup.c b/usr/src/uts/i86pc/os/startup.c index 07a08658c9..4771824703 100644 --- a/usr/src/uts/i86pc/os/startup.c +++ b/usr/src/uts/i86pc/os/startup.c @@ -319,22 +319,16 @@ static struct seg *segmap = &kmapseg; /* easier to use name for in here */ struct seg *segkp = &kpseg; /* Pageable kernel virtual memory segment */ -#if defined(__amd64) struct seg kvseg_core; /* Segment used for the core heap */ struct seg kpmseg; /* Segment used for physical mapping */ struct seg *segkpm = &kpmseg; /* 64bit kernel physical mapping segment */ -#else -struct seg *segkpm = NULL; /* Unused on IA32 */ -#endif caddr_t segkp_base; /* Base address of segkp */ caddr_t segzio_base; /* Base address of segzio */ -#if defined(__amd64) pgcnt_t segkpsize = btop(SEGKPDEFSIZE); /* size of segkp segment in pages */ -#else -pgcnt_t segkpsize = 0; -#endif -pgcnt_t segziosize = 0; /* size of zio segment in pages */ +caddr_t segkvmm_base; +pgcnt_t segkvmmsize; +pgcnt_t segziosize; /* * A static DR page_t VA map is reserved that can map the page structures @@ -455,23 +449,32 @@ static pgcnt_t kphysm_init(page_t *, pgcnt_t); * 0xFFFFFFFF.C0000000 |-----------------------|- core_base / ekernelheap * | Kernel | * | heap | + * | | + * | | * 0xFFFFFXXX.XXX00000 |-----------------------|- kernelheap (floating) * | segmap | * 0xFFFFFXXX.XXX00000 |-----------------------|- segmap_start (floating) * | device mappings | * 0xFFFFFXXX.XXX00000 |-----------------------|- toxic_addr (floating) - * | segzio | + * | segzio | * 0xFFFFFXXX.XXX00000 |-----------------------|- segzio_base (floating) - * | segkp | - * --- |-----------------------|- segkp_base (floating) + * | segkvmm | + * | | + * | | + * | | + * 0xFFFFFXXX.XXX00000 |-----------------------|- segkvmm_base (floating) + * | segkp | + * |-----------------------|- segkp_base (floating) * | page_t structures | valloc_base + valloc_sz * | memsegs, memlists, | * | page hash, etc. | - * 0xFFFFFF00.00000000 |-----------------------|- valloc_base (lower if >256GB) + * 0xFFFFFE00.00000000 |-----------------------|- valloc_base (lower if >256GB) * | segkpm | - * 0xFFFFFE00.00000000 |-----------------------| + * | | + * 0xFFFFFD00.00000000 |-----------------------|- SEGKPM_BASE (lower if >256GB) * | Red Zone | - * 0xFFFFFD80.00000000 |-----------------------|- KERNELBASE (lower if >256GB) + * 0xFFFFFC80.00000000 |-----------------------|- KERNELBASE (lower if >256GB) + * 0xFFFFFC7F.FFE00000 |-----------------------|- USERLIMIT (lower if >256GB) * | User stack |- User space memory * | | * | shared objects, etc | (grows downwards) @@ -1084,22 +1087,9 @@ startup_memlist(void) PRM_DEBUG(memblocks); /* - * Compute maximum physical address for memory DR operations. - * Memory DR operations are unsupported on xpv or 32bit OSes. + * We no longer support any form of memory DR. 
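+	 * Hence plat_dr_physmax is simply forced to zero below, and the
+	 * VA layout no longer reserves room for hot-added memory.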
*/ -#ifdef __amd64 - if (plat_dr_support_memory()) { - if (plat_dr_physmax == 0) { - uint_t pabits = UINT_MAX; - - cpuid_get_addrsize(CPU, &pabits, NULL); - plat_dr_physmax = btop(1ULL << pabits); - } - if (plat_dr_physmax > PHYSMEM_MAX64) - plat_dr_physmax = PHYSMEM_MAX64; - } else -#endif - plat_dr_physmax = 0; + plat_dr_physmax = 0; /* * Examine the bios reserved memory to find out: @@ -1260,68 +1250,55 @@ startup_memlist(void) pse_table_alloc_size = pse_table_size * sizeof (pad_mutex_t); ADD_TO_ALLOCATIONS(pse_mutex, pse_table_alloc_size); -#if defined(__amd64) valloc_sz = ROUND_UP_LPAGE(valloc_sz); valloc_base = VALLOC_BASE; /* - * The default values of VALLOC_BASE and SEGKPM_BASE should work - * for values of physmax up to 256GB (1/4 TB). They need adjusting when - * memory is at addresses above 256GB. When adjusted, segkpm_base must - * be aligned on KERNEL_REDZONE_SIZE boundary (span of top level pte). + * The signicant memory-sized regions are roughly sized as follows in + * the default layout with max physmem: + * segkpm: 1x physmem allocated (but 1Tb room, below VALLOC_BASE) + * segzio: 1.5x physmem + * segkvmm: 4x physmem + * heap: whatever's left up to COREHEAP_BASE, at least 1.5x physmem + * + * The idea is that we leave enough room to avoid fragmentation issues, + * so we would like the VA arenas to have some extra. + * + * Ignoring the loose change of segkp, valloc, and such, this means that + * as COREHEAP_BASE-VALLOC_BASE=2Tb, we can accommodate a physmem up to + * about (2Tb / 7.0), rounded down to 256Gb in the check below. * - * In the general case (>256GB), we use (4 * physmem) for the - * kernel's virtual addresses, which is divided approximately - * as follows: - * - 1 * physmem for segkpm - * - 1.5 * physmem for segzio - * - 1.5 * physmem for heap - * Total: 4.0 * physmem + * Note that KPM lives below VALLOC_BASE, but we want to include it in + * adjustments, hence the 8 below. * - * Note that the segzio and heap sizes are more than physmem so that - * VA fragmentation does not prevent either of them from being - * able to use nearly all of physmem. The value of 1.5x is determined - * experimentally and may need to change if the workload changes. + * Beyond 256Gb, we push segkpm_base (and hence kernelbase and + * _userlimit) down to accommodate the VA requirements above. */ - if (physmax + 1 > mmu_btop(TERABYTE / 4) || - plat_dr_physmax > mmu_btop(TERABYTE / 4)) { - uint64_t kpm_resv_amount = mmu_ptob(physmax + 1); + if (physmax + 1 > mmu_btop(TERABYTE / 4)) { + uint64_t physmem_bytes = mmu_ptob(physmax + 1); + uint64_t adjustment = 8 * (physmem_bytes - (TERABYTE / 4)); - if (kpm_resv_amount < mmu_ptob(plat_dr_physmax)) { - kpm_resv_amount = mmu_ptob(plat_dr_physmax); - } + PRM_DEBUG(adjustment); /* - * This is what actually controls the KVA : UVA split. - * The kernel uses high VA, and this is lowering the - * boundary, thus increasing the amount of VA for the kernel. - * This gives the kernel 4 * (amount of physical memory) VA. - * - * The maximum VA is UINT64_MAX and we are using - * 64-bit 2's complement math, so e.g. if you have 512GB - * of memory, segkpm_base = -(4 * 512GB) == -2TB == - * UINT64_MAX - 2TB (approximately). So the kernel's - * VA is [UINT64_MAX-2TB to UINT64_MAX]. + * segkpm_base is always aligned on a L3 PTE boundary. 
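+		 * As a worked example: with 512GB of physical memory,
+		 * adjustment = 8 * (512GB - 256GB) = 2TB, so segkpm_base
+		 * (and with it kernelbase and _userlimit) moves down 2TB.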
*/ - segkpm_base = -(P2ROUNDUP((4 * kpm_resv_amount), - KERNEL_REDZONE_SIZE)); + segkpm_base -= P2ROUNDUP(adjustment, KERNEL_REDZONE_SIZE); - /* make sure we leave some space for user apps above hole */ + /* + * But make sure we leave some space for user apps above hole. + */ segkpm_base = MAX(segkpm_base, AMD64_VA_HOLE_END + TERABYTE); - if (segkpm_base > SEGKPM_BASE) - segkpm_base = SEGKPM_BASE; - PRM_DEBUG(segkpm_base); - valloc_base = segkpm_base + P2ROUNDUP(kpm_resv_amount, ONE_GIG); + ASSERT(segkpm_base <= SEGKPM_BASE); + + valloc_base = segkpm_base + P2ROUNDUP(physmem_bytes, ONE_GIG); if (valloc_base < segkpm_base) panic("not enough kernel VA to support memory size"); - PRM_DEBUG(valloc_base); } -#else /* __i386 */ - valloc_base = (uintptr_t)(MISC_VA_BASE - valloc_sz); - valloc_base = P2ALIGN(valloc_base, mmu.level_size[1]); + + PRM_DEBUG(segkpm_base); PRM_DEBUG(valloc_base); -#endif /* __i386 */ /* * do all the initial allocations @@ -1909,73 +1886,70 @@ protect_boot_range(uintptr_t low, uintptr_t high, int setaside) } /* - * + * Establish the final size of the kernel's heap, size of segmap, segkp, etc. */ static void layout_kernel_va(void) { - PRM_POINT("layout_kernel_va() starting..."); - /* - * Establish the final size of the kernel's heap, size of segmap, - * segkp, etc. - */ + const size_t physmem_size = mmu_ptob(physmem); + size_t size; -#if defined(__amd64) + PRM_POINT("layout_kernel_va() starting..."); kpm_vbase = (caddr_t)segkpm_base; - if (physmax + 1 < plat_dr_physmax) { - kpm_size = ROUND_UP_LPAGE(mmu_ptob(plat_dr_physmax)); - } else { - kpm_size = ROUND_UP_LPAGE(mmu_ptob(physmax + 1)); - } + kpm_size = ROUND_UP_LPAGE(mmu_ptob(physmax + 1)); if ((uintptr_t)kpm_vbase + kpm_size > (uintptr_t)valloc_base) panic("not enough room for kpm!"); PRM_DEBUG(kpm_size); PRM_DEBUG(kpm_vbase); - /* - * By default we create a seg_kp in 64 bit kernels, it's a little - * faster to access than embedding it in the heap. - */ segkp_base = (caddr_t)valloc_base + valloc_sz; if (!segkp_fromheap) { - size_t sz = mmu_ptob(segkpsize); + size = mmu_ptob(segkpsize); /* * determine size of segkp */ - if (sz < SEGKPMINSIZE || sz > SEGKPMAXSIZE) { - sz = SEGKPDEFSIZE; + if (size < SEGKPMINSIZE || size > SEGKPMAXSIZE) { + size = SEGKPDEFSIZE; cmn_err(CE_WARN, "!Illegal value for segkpsize. " "segkpsize has been reset to %ld pages", - mmu_btop(sz)); + mmu_btop(size)); } - sz = MIN(sz, MAX(SEGKPMINSIZE, mmu_ptob(physmem))); + size = MIN(size, MAX(SEGKPMINSIZE, physmem_size)); - segkpsize = mmu_btop(ROUND_UP_LPAGE(sz)); + segkpsize = mmu_btop(ROUND_UP_LPAGE(size)); } PRM_DEBUG(segkp_base); PRM_DEBUG(segkpsize); /* - * segzio is used for ZFS cached data. It uses a distinct VA - * segment (from kernel heap) so that we can easily tell not to - * include it in kernel crash dumps on 64 bit kernels. The trick is - * to give it lots of VA, but not constrain the kernel heap. - * We can use 1.5x physmem for segzio, leaving approximately - * another 1.5x physmem for heap. See also the comment in - * startup_memlist(). + * segkvmm: backing for vmm guest memory. Like segzio, we have a + * separate segment for two reasons: it makes it easy to skip our pages + * on kernel crash dumps, and it helps avoid fragmentation. With this + * segment, we're expecting significantly-sized allocations only; we'll + * default to 4x the size of physmem. + */ + segkvmm_base = segkp_base + mmu_ptob(segkpsize); + size = segkvmmsize != 0 ? 
mmu_ptob(segkvmmsize) : (physmem_size * 4); + + size = MAX(size, SEGVMMMINSIZE); + segkvmmsize = mmu_btop(ROUND_UP_LPAGE(size)); + + PRM_DEBUG(segkvmmsize); + PRM_DEBUG(segkvmm_base); + + /* + * segzio is used for ZFS cached data. For segzio, we use 1.5x physmem. */ - segzio_base = segkp_base + mmu_ptob(segkpsize); + segzio_base = segkvmm_base + mmu_ptob(segkvmmsize); if (segzio_fromheap) { segziosize = 0; } else { - size_t physmem_size = mmu_ptob(physmem); - size_t size = (segziosize == 0) ? - physmem_size * 3 / 2 : mmu_ptob(segziosize); + size = (segziosize != 0) ? mmu_ptob(segziosize) : + (physmem_size * 3) / 2; - if (size < SEGZIOMINSIZE) - size = SEGZIOMINSIZE; + size = MAX(size, SEGZIOMINSIZE); segziosize = mmu_btop(ROUND_UP_LPAGE(size)); } PRM_DEBUG(segziosize); @@ -1989,10 +1963,6 @@ layout_kernel_va(void) ROUND_UP_LPAGE((uintptr_t)segzio_base + mmu_ptob(segziosize)); PRM_DEBUG(toxic_addr); segmap_start = ROUND_UP_LPAGE(toxic_addr + toxic_size); -#else /* __i386 */ - segmap_start = ROUND_UP_LPAGE(kernelbase); -#endif /* __i386 */ - PRM_DEBUG(segmap_start); /* * Users can change segmapsize through eeprom. If the variable @@ -2001,16 +1971,6 @@ layout_kernel_va(void) */ segmapsize = MAX(ROUND_UP_LPAGE(segmapsize), SEGMAPDEFAULT); -#if defined(__i386) - /* - * 32-bit systems don't have segkpm or segkp, so segmap appears at - * the bottom of the kernel's address range. Set aside space for a - * small red zone just below the start of segmap. - */ - segmap_start += KERNEL_REDZONE_SIZE; - segmapsize -= KERNEL_REDZONE_SIZE; -#endif - PRM_DEBUG(segmap_start); PRM_DEBUG(segmapsize); kernelheap = (caddr_t)ROUND_UP_LPAGE(segmap_start + segmapsize); @@ -2801,11 +2761,16 @@ kvm_init(void) (void) segkmem_create(&kvseg_core); } + PRM_POINT("attaching segkvmm"); + (void) seg_attach(&kas, segkvmm_base, mmu_ptob(segkvmmsize), &kvmmseg); + (void) segkmem_create(&kvmmseg); + segkmem_kvmm_init(segkvmm_base, mmu_ptob(segkvmmsize)); + if (segziosize > 0) { PRM_POINT("attaching segzio"); (void) seg_attach(&kas, segzio_base, mmu_ptob(segziosize), &kzioseg); - (void) segkmem_zio_create(&kzioseg); + (void) segkmem_create(&kzioseg); /* create zio area covering new segment */ segkmem_zio_init(segzio_base, mmu_ptob(segziosize)); diff --git a/usr/src/uts/i86pc/sys/machparam.h b/usr/src/uts/i86pc/sys/machparam.h index 51d7559483..3728f30ca6 100644 --- a/usr/src/uts/i86pc/sys/machparam.h +++ b/usr/src/uts/i86pc/sys/machparam.h @@ -31,14 +31,15 @@ #ifndef _SYS_MACHPARAM_H #define _SYS_MACHPARAM_H -#if !defined(_ASM) +#ifndef _ASM + #include #if defined(__xpv) #include #endif -#endif +#endif /* !_ASM */ #ifdef __cplusplus extern "C" { @@ -54,17 +55,12 @@ extern "C" { * Machine dependent parameters and limits. */ -#if defined(__amd64) /* * If NCPU grows beyond 256, sizing for the x86 comm page will require * adjustment. */ #define NCPU 256 #define NCPU_LOG2 8 -#elif defined(__i386) -#define NCPU 32 -#define NCPU_LOG2 5 -#endif /* NCPU_P2 is NCPU rounded to a power of 2 */ #define NCPU_P2 (1 << NCPU_LOG2) @@ -116,11 +112,7 @@ extern "C" { /* * DEFAULT KERNEL THREAD stack size (in pages). */ -#if defined(__amd64) #define DEFAULTSTKSZ_NPGS 5 -#elif defined(__i386) -#define DEFAULTSTKSZ_NPGS 3 -#endif #if !defined(_ASM) #define DEFAULTSTKSZ (DEFAULTSTKSZ_NPGS * PAGESIZE) @@ -129,43 +121,42 @@ extern "C" { #endif /* !_ASM */ /* - * KERNELBASE is the virtual address at which the kernel segments start in - * all contexts. - * - * KERNELBASE is not fixed. 
The value of KERNELBASE can change with - * installed memory or on 32 bit systems the eprom variable 'eprom_kernelbase'. - * - * common/conf/param.c requires a compile time defined value for KERNELBASE. - * This value is save in the variable _kernelbase. _kernelbase may then be - * modified with to a different value in i86pc/os/startup.c. - * - * Most code should be using kernelbase, which resolves to a reference to - * _kernelbase. + * During intial boot we limit heap to the top 4Gig. */ -#define KERNEL_TEXT_amd64 UINT64_C(0xfffffffffb800000) - -#ifdef __i386 - -#define KERNEL_TEXT_i386 ADDRESS_C(0xfe800000) +#define BOOT_KERNELHEAP_BASE ADDRESS_C(0xffffffff00000000) /* - * We don't use HYPERVISOR_VIRT_START, as we need both the PAE and non-PAE - * versions in our code. We always compile based on the lower PAE address. + * VMWare works best if we don't use the top 64Meg of memory for amd64. + * Set KERNEL_TEXT to top_o_memory - 64Meg - 8 Meg for 8Meg of nucleus pages. */ -#define KERNEL_TEXT_i386_xpv \ - (HYPERVISOR_VIRT_START_PAE - 3 * ADDRESS_C(0x400000)) - -#endif /* __i386 */ +#define PROMSTART ADDRESS_C(0xffc00000) -#if defined(__amd64) +/* + * Virtual address range available to the debugger + */ +#define SEGDEBUGBASE ADDRESS_C(0xffffffffff800000) +#define SEGDEBUGSIZE ADDRESS_C(0x400000) -#define KERNELBASE ADDRESS_C(0xfffffd8000000000) +#define KERNEL_TEXT UINT64_C(0xfffffffffb800000) /* - * Size of the unmapped "red zone" at the very bottom of the kernel's - * address space. Corresponds to 1 slot in the toplevel pagetable. + * Reserve pages just below KERNEL_TEXT for the GDT, IDT, LDT, TSS and debug + * info. + * + * For now, DEBUG_INFO_VA must be first in this list for "xm" initiated dumps + * of solaris domUs to be usable with mdb. Relying on a fixed VA is not viable + * long term, but it's the best we've got for now. */ -#define KERNEL_REDZONE_SIZE ((uintptr_t)1 << 39) +#if !defined(_ASM) +#define DEBUG_INFO_VA (KERNEL_TEXT - MMU_PAGESIZE) +#define GDT_VA (DEBUG_INFO_VA - MMU_PAGESIZE) +#define IDT_VA (GDT_VA - MMU_PAGESIZE) +#define LDT_VA (IDT_VA - (16 * MMU_PAGESIZE)) +#define KTSS_VA (IDT_VA - MMU_PAGESIZE) +#define DFTSS_VA (KTSS_VA - MMU_PAGESIZE) +#define MISC_VA_BASE (DFTSS_VA) +#define MISC_VA_SIZE (KERNEL_TEXT - MISC_VA_BASE) +#endif /* !_ASM */ /* * Base of 'core' heap area, which is used for kernel and module text/data @@ -173,53 +164,48 @@ extern "C" { */ #define COREHEAP_BASE ADDRESS_C(0xffffffffc0000000) -/* - * Beginning of the segkpm window. A lower value than this is used if - * physical addresses exceed 1TB. See i86pc/os/startup.c - */ -#define SEGKPM_BASE ADDRESS_C(0xfffffe0000000000) - /* * This is valloc_base, above seg_kpm, but below everything else. * A lower value than this may be used if SEGKPM_BASE is adjusted. * See i86pc/os/startup.c */ -#define VALLOC_BASE ADDRESS_C(0xffffff0000000000) +#define VALLOC_BASE ADDRESS_C(0xfffffe0000000000) + +#define SEGZIOMINSIZE (400L * 1024 * 1024L) /* 400M */ +#define SEGVMMMINSIZE (4096L * 1024 * 1024L) /* 4G */ -/* - * default and boundary sizes for segkp - */ #define SEGKPDEFSIZE (2L * 1024L * 1024L * 1024L) /* 2G */ #define SEGKPMAXSIZE (8L * 1024L * 1024L * 1024L) /* 8G */ #define SEGKPMINSIZE (200L * 1024 * 1024L) /* 200M */ -/* - * minimum size for segzio - */ -#define SEGZIOMINSIZE (400L * 1024 * 1024L) /* 400M */ - -/* - * During intial boot we limit heap to the top 4Gig. 
- */ -#define BOOT_KERNELHEAP_BASE ADDRESS_C(0xffffffff00000000) +#define SEGKPM_BASE ADDRESS_C(0xfffffd0000000000) /* - * VMWare works best if we don't use the top 64Meg of memory for amd64. - * Set KERNEL_TEXT to top_o_memory - 64Meg - 8 Meg for 8Meg of nucleus pages. + * KERNELBASE is the virtual address at which the kernel segments start in + * all contexts. + * + * KERNELBASE is not fixed. The value of KERNELBASE can change with + * installed memory size. + * + * common/conf/param.c requires a compile time defined value for KERNELBASE. + * This value is save in the variable _kernelbase. _kernelbase may then be + * modified with to a different value in i86pc/os/startup.c. + * + * Most code should be using kernelbase, which resolves to a reference to + * _kernelbase. */ -#define PROMSTART ADDRESS_C(0xffc00000) -#define KERNEL_TEXT KERNEL_TEXT_amd64 +#define KERNELBASE ADDRESS_C(0xfffffc8000000000) /* - * Virtual address range available to the debugger + * Size of the unmapped "red zone" at the very bottom of the kernel's + * address space. Corresponds to 1 slot in the toplevel pagetable. */ -#define SEGDEBUGBASE ADDRESS_C(0xffffffffff800000) -#define SEGDEBUGSIZE ADDRESS_C(0x400000) +#define KERNEL_REDZONE_SIZE ((uintptr_t)1 << 39) /* * Define upper limit on user address space * - * In amd64, the upper limit on a 64-bit user address space is 1 large page + * The upper limit on a 64-bit user address space is 1 large page * (2MB) below kernelbase. The upper limit for a 32-bit user address space * is 1 small page (4KB) below the top of the 32-bit range. The 64-bit * limit give dtrace the red zone it needs below kernelbase. The 32-bit @@ -232,7 +218,7 @@ extern "C" { #if defined(__xpv) #define USERLIMIT ADDRESS_C(0x00007fffffe00000) #else -#define USERLIMIT ADDRESS_C(0xfffffd7fffe00000) +#define USERLIMIT ADDRESS_C(0xfffffc7fffe00000) #endif #ifdef bug_5074717_is_fixed @@ -241,76 +227,6 @@ extern "C" { #define USERLIMIT32 ADDRESS_C(0xfefff000) #endif -#elif defined(__i386) - -#ifdef DEBUG -#define KERNELBASE ADDRESS_C(0xc8000000) -#else -#define KERNELBASE ADDRESS_C(0xd4000000) -#endif - -#define KERNELBASE_MAX ADDRESS_C(0xe0000000) - -/* - * The i386 ABI requires that the user address space be at least 3Gb - * in size. KERNELBASE_ABI_MIN is used as the default KERNELBASE for - * physical memory configurations > 4gb. - */ -#define KERNELBASE_ABI_MIN ADDRESS_C(0xc0000000) - -/* - * Size of the unmapped "red zone" at the very bottom of the kernel's - * address space. Since segmap start immediately above the red zone, this - * needs to be MAXBSIZE aligned. - */ -#define KERNEL_REDZONE_SIZE MAXBSIZE - -/* - * This is the last 4MB of the 4G address space. Some psm modules - * need this region of virtual address space mapped 1-1 - * The top 64MB of the address space is reserved for the hypervisor. - */ -#define PROMSTART ADDRESS_C(0xffc00000) -#ifdef __xpv -#define KERNEL_TEXT KERNEL_TEXT_i386_xpv -#else -#define KERNEL_TEXT KERNEL_TEXT_i386 -#endif - -/* - * Virtual address range available to the debugger - * We place it just above the kernel text (4M) and kernel data (4M). - */ -#define SEGDEBUGBASE (KERNEL_TEXT + ADDRESS_C(0x800000)) -#define SEGDEBUGSIZE ADDRESS_C(0x400000) - -/* - * Define upper limit on user address space - */ -#define USERLIMIT KERNELBASE -#define USERLIMIT32 USERLIMIT - -#endif /* __i386 */ - -/* - * Reserve pages just below KERNEL_TEXT for the GDT, IDT, LDT, TSS and debug - * info. 
- * - * For now, DEBUG_INFO_VA must be first in this list for "xm" initiated dumps - * of solaris domUs to be usable with mdb. Relying on a fixed VA is not viable - * long term, but it's the best we've got for now. - */ -#if !defined(_ASM) -#define DEBUG_INFO_VA (KERNEL_TEXT - MMU_PAGESIZE) -#define GDT_VA (DEBUG_INFO_VA - MMU_PAGESIZE) -#define IDT_VA (GDT_VA - MMU_PAGESIZE) -#define LDT_VA (IDT_VA - (16 * MMU_PAGESIZE)) -#define KTSS_VA (LDT_VA - MMU_PAGESIZE) -#define DFTSS_VA (KTSS_VA - MMU_PAGESIZE) -#define MISC_VA_BASE (DFTSS_VA) -#define MISC_VA_SIZE (KERNEL_TEXT - MISC_VA_BASE) -#endif /* !_ASM */ - #if !defined(_ASM) && !defined(_KMDB) extern uintptr_t kernelbase, segmap_start, segmapsize; #endif diff --git a/usr/src/uts/i86pc/vm/seg_vmm.c b/usr/src/uts/i86pc/vm/seg_vmm.c index faebf9ac36..beb5e81d53 100644 --- a/usr/src/uts/i86pc/vm/seg_vmm.c +++ b/usr/src/uts/i86pc/vm/seg_vmm.c @@ -14,12 +14,15 @@ */ /* - * VM - Virtual-Machine-Memory segment + * segvmm - Virtual-Machine-Memory segment * * The vmm segment driver was designed for mapping regions of kernel memory * allocated to an HVM instance into userspace for manipulation there. It * draws direct lineage from the umap segment driver, but meant for larger * mappings with fewer restrictions. + * + * seg*k*vmm, in contrast, has mappings for every VMM into kas. We use its + * mappings here only to find the relevant PFNs in segvmm_fault_in(). */ @@ -93,7 +96,7 @@ static struct seg_ops segvmm_ops = { /* - * Create a kernel/user-mapped segment. + * Create a kernel/user-mapped segment. ->kaddr is the segkvmm mapping. */ int segvmm_create(struct seg **segpp, void *argsp) diff --git a/usr/src/uts/sun4/os/startup.c b/usr/src/uts/sun4/os/startup.c index c74590b002..759802b2cd 100644 --- a/usr/src/uts/sun4/os/startup.c +++ b/usr/src/uts/sun4/os/startup.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright 2018 Joyent, Inc. * Copyright 2019 Peter Tribble. 
*/ @@ -2026,7 +2027,7 @@ startup_vm(void) (void) seg_attach(&kas, segzio_base, mmu_ptob(segziosize), &kzioseg); - (void) segkmem_zio_create(&kzioseg); + (void) segkmem_create(&kzioseg); /* create zio area covering new segment */ segkmem_zio_init(segzio_base, mmu_ptob(segziosize)); -- cgit v1.2.3 From 4c7b9a81057545d490dd52cf823de529d8137a5b Mon Sep 17 00:00:00 2001 From: Patrick Mooney Date: Sun, 10 May 2020 15:56:29 +0000 Subject: 12718 LDT overlap with TSS reintroduced in 12608 Reviewed by: John Levon Reviewed by: Andy Fiddaman Approved by: Robert Mustacchi --- usr/src/uts/i86pc/sys/machparam.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'usr/src/uts/i86pc') diff --git a/usr/src/uts/i86pc/sys/machparam.h b/usr/src/uts/i86pc/sys/machparam.h index 3728f30ca6..f79b582df4 100644 --- a/usr/src/uts/i86pc/sys/machparam.h +++ b/usr/src/uts/i86pc/sys/machparam.h @@ -152,7 +152,7 @@ extern "C" { #define GDT_VA (DEBUG_INFO_VA - MMU_PAGESIZE) #define IDT_VA (GDT_VA - MMU_PAGESIZE) #define LDT_VA (IDT_VA - (16 * MMU_PAGESIZE)) -#define KTSS_VA (IDT_VA - MMU_PAGESIZE) +#define KTSS_VA (LDT_VA - MMU_PAGESIZE) #define DFTSS_VA (KTSS_VA - MMU_PAGESIZE) #define MISC_VA_BASE (DFTSS_VA) #define MISC_VA_SIZE (KERNEL_TEXT - MISC_VA_BASE) -- cgit v1.2.3 From bf21cd9318e0a3a51b7f02c14a7c1b1aef2dc861 Mon Sep 17 00:00:00 2001 From: Tycho Nightingale Date: Tue, 26 Sep 2017 12:19:41 +0200 Subject: 12612 import Pluribus bhyve port Authored by: Krupal Joshi Contributed by: Pluribus Networks Inc. Reviewed by: Jerry Jelinek Reviewed by: Dan McDonald Reviewed by: Mike Gerdts Reviewed by: Patrick Mooney Reviewed by: Toomas Soome Approved by: Robert Mustacchi --- usr/contrib/freebsd/amd64/machine/_types.h | 6 + usr/contrib/freebsd/amd64/machine/psl.h | 6 + usr/contrib/freebsd/amd64/machine/specialreg.h | 6 + usr/contrib/freebsd/amd64/machine/timerreg.h | 54 + usr/contrib/freebsd/amd64/machine/vm.h | 45 + usr/contrib/freebsd/dev/acpica/acpi_hpet.h | 67 + usr/contrib/freebsd/dev/ic/i8253reg.h | 78 + usr/contrib/freebsd/dev/ic/i8259.h | 86 + usr/contrib/freebsd/dev/ic/ns16550.h | 240 ++ usr/contrib/freebsd/dev/pci/pcireg.h | 922 +++++++ usr/contrib/freebsd/isa/isareg.h | 70 + usr/contrib/freebsd/lib/libutil/expand_number.c | 93 + usr/contrib/freebsd/sys/ata.h | 635 +++++ usr/contrib/freebsd/sys/linker_set.h | 119 + usr/contrib/freebsd/sys/tree.h | 765 ++++++ usr/contrib/freebsd/x86/apicreg.h | 455 ++++ usr/contrib/freebsd/x86/mptable.h | 204 ++ usr/contrib/freebsd/x86/psl.h | 92 + usr/contrib/freebsd/x86/specialreg.h | 839 ++++++ usr/src/cmd/bhyve/Makefile | 41 + usr/src/cmd/bhyve/Makefile.com | 94 + usr/src/cmd/bhyve/acpi.h | 54 + usr/src/cmd/bhyve/ahci.h | 304 +++ usr/src/cmd/bhyve/amd64/Makefile | 21 + usr/src/cmd/bhyve/atkbdc.c | 576 ++++ usr/src/cmd/bhyve/atkbdc.h | 38 + usr/src/cmd/bhyve/bhyve_sol_glue.c | 86 + usr/src/cmd/bhyve/bhyvegc.c | 78 + usr/src/cmd/bhyve/bhyvegc.h | 44 + usr/src/cmd/bhyve/bhyverun.c | 820 ++++++ usr/src/cmd/bhyve/bhyverun.h | 73 + usr/src/cmd/bhyve/block_if.c | 625 +++++ usr/src/cmd/bhyve/block_if.h | 70 + usr/src/cmd/bhyve/console.c | 101 + usr/src/cmd/bhyve/console.h | 50 + usr/src/cmd/bhyve/consport.c | 155 ++ usr/src/cmd/bhyve/dbgport.h | 34 + usr/src/cmd/bhyve/inout.c | 297 ++ usr/src/cmd/bhyve/inout.h | 91 + usr/src/cmd/bhyve/ioapic.c | 74 + usr/src/cmd/bhyve/ioapic.h | 39 + usr/src/cmd/bhyve/mem.c | 291 ++ usr/src/cmd/bhyve/mem.h | 61 + usr/src/cmd/bhyve/mptbl.c | 377 +++ usr/src/cmd/bhyve/mptbl.h | 35 + usr/src/cmd/bhyve/pci_ahci.c | 2009 ++++++++++++++ 
usr/src/cmd/bhyve/pci_emul.c | 2103 +++++++++++++++ usr/src/cmd/bhyve/pci_emul.h | 283 ++ usr/src/cmd/bhyve/pci_hostbridge.c | 70 + usr/src/cmd/bhyve/pci_irq.c | 351 +++ usr/src/cmd/bhyve/pci_irq.h | 45 + usr/src/cmd/bhyve/pci_lpc.c | 433 +++ usr/src/cmd/bhyve/pci_lpc.h | 72 + usr/src/cmd/bhyve/pci_virtio_block.c | 392 +++ usr/src/cmd/bhyve/pci_virtio_net.c | 870 ++++++ usr/src/cmd/bhyve/pci_virtio_viona.c | 706 +++++ usr/src/cmd/bhyve/pm.c | 333 +++ usr/src/cmd/bhyve/pmtmr.c | 212 ++ usr/src/cmd/bhyve/post.c | 53 + usr/src/cmd/bhyve/ps2kbd.c | 418 +++ usr/src/cmd/bhyve/ps2kbd.h | 39 + usr/src/cmd/bhyve/ps2mouse.c | 371 +++ usr/src/cmd/bhyve/ps2mouse.h | 39 + usr/src/cmd/bhyve/rfb.c | 420 +++ usr/src/cmd/bhyve/rfb.h | 36 + usr/src/cmd/bhyve/rtc.c | 380 +++ usr/src/cmd/bhyve/rtc.h | 34 + usr/src/cmd/bhyve/smbiostbl.c | 827 ++++++ usr/src/cmd/bhyve/smbiostbl.h | 36 + usr/src/cmd/bhyve/spinup_ap.c | 104 + usr/src/cmd/bhyve/spinup_ap.h | 34 + usr/src/cmd/bhyve/uart_emul.c | 1042 +++++++ usr/src/cmd/bhyve/uart_emul.h | 45 + usr/src/cmd/bhyve/vga.c | 1289 +++++++++ usr/src/cmd/bhyve/vga.h | 160 ++ usr/src/cmd/bhyve/virtio.c | 755 ++++++ usr/src/cmd/bhyve/virtio.h | 475 ++++ usr/src/cmd/bhyve/xmsr.c | 237 ++ usr/src/cmd/bhyve/xmsr.h | 36 + usr/src/cmd/bhyveconsole/Makefile | 41 + usr/src/cmd/bhyveconsole/bhyveconsole.c | 360 +++ usr/src/cmd/bhyveconsole/i386/Makefile | 43 + usr/src/cmd/bhyvectl/Makefile | 41 + usr/src/cmd/bhyvectl/Makefile.com | 48 + usr/src/cmd/bhyvectl/amd64/Makefile | 21 + usr/src/cmd/bhyvectl/bhyvectl.c | 1523 +++++++++++ usr/src/cmd/bhyveload-uefi/Makefile | 41 + usr/src/cmd/bhyveload-uefi/Makefile.com | 52 + usr/src/cmd/bhyveload-uefi/amd64/Makefile | 21 + usr/src/cmd/bhyveload-uefi/bhyveload-uefi.c | 190 ++ usr/src/cmd/bhyveload-uefi/i386/Makefile | 18 + usr/src/cmd/mdb/intel/amd64/vmm/Makefile | 20 + usr/src/cmd/mdb/intel/amd64/vmm/amd64/Makefile | 32 + usr/src/cmd/mdb/intel/amd64/vmm/vmm.c | 238 ++ usr/src/compat/freebsd/amd64/machine/asmacros.h | 28 + usr/src/compat/freebsd/amd64/machine/atomic.h | 244 ++ usr/src/compat/freebsd/amd64/machine/clock.h | 23 + usr/src/compat/freebsd/amd64/machine/cpufunc.h | 165 ++ usr/src/compat/freebsd/amd64/machine/fpu.h | 29 + usr/src/compat/freebsd/amd64/machine/md_var.h | 24 + usr/src/compat/freebsd/amd64/machine/param.h | 39 + usr/src/compat/freebsd/amd64/machine/pcb.h | 21 + usr/src/compat/freebsd/amd64/machine/pmap.h | 44 + usr/src/compat/freebsd/amd64/machine/segments.h | 21 + usr/src/compat/freebsd/amd64/machine/smp.h | 19 + usr/src/compat/freebsd/amd64/machine/vmm.h | 21 + usr/src/compat/freebsd/amd64/machine/vmm_dev.h | 21 + .../freebsd/amd64/machine/vmm_instruction_emul.h | 21 + usr/src/compat/freebsd/amd64/machine/vmparam.h | 19 + usr/src/compat/freebsd/libutil.h | 21 + usr/src/compat/freebsd/net/ethernet.h | 21 + usr/src/compat/freebsd/paths.h | 21 + usr/src/compat/freebsd/pthread_np.h | 28 + usr/src/compat/freebsd/string.h | 26 + usr/src/compat/freebsd/strings.h | 23 + usr/src/compat/freebsd/sys/_iovec.h | 24 + usr/src/compat/freebsd/sys/_pthreadtypes.h | 19 + usr/src/compat/freebsd/sys/_types.h | 22 + usr/src/compat/freebsd/sys/callout.h | 70 + usr/src/compat/freebsd/sys/cdefs.h | 58 + usr/src/compat/freebsd/sys/cpuset.h | 44 + usr/src/compat/freebsd/sys/disk.h | 19 + usr/src/compat/freebsd/sys/endian.h | 125 + usr/src/compat/freebsd/sys/errno.h | 27 + usr/src/compat/freebsd/sys/fcntl.h | 23 + usr/src/compat/freebsd/sys/ioctl.h | 22 + usr/src/compat/freebsd/sys/kernel.h | 25 + usr/src/compat/freebsd/sys/ktr.h | 27 + 
usr/src/compat/freebsd/sys/libkern.h | 25 + usr/src/compat/freebsd/sys/limits.h | 19 + usr/src/compat/freebsd/sys/malloc.h | 44 + usr/src/compat/freebsd/sys/module.h | 19 + usr/src/compat/freebsd/sys/mutex.h | 81 + usr/src/compat/freebsd/sys/param.h | 48 + usr/src/compat/freebsd/sys/pcpu.h | 21 + usr/src/compat/freebsd/sys/sched.h | 19 + usr/src/compat/freebsd/sys/select.h | 23 + usr/src/compat/freebsd/sys/smp.h | 28 + usr/src/compat/freebsd/sys/sysctl.h | 27 + usr/src/compat/freebsd/sys/systm.h | 53 + usr/src/compat/freebsd/sys/time.h | 104 + usr/src/compat/freebsd/sys/types.h | 74 + usr/src/compat/freebsd/sys/uio.h | 26 + usr/src/compat/freebsd/termios.h | 23 + usr/src/compat/freebsd/uuid.h | 55 + usr/src/compat/freebsd/vm/pmap.h | 21 + usr/src/compat/freebsd/vm/vm.h | 39 + usr/src/compat/freebsd/x86/_types.h | 49 + usr/src/compat/freebsd/x86/segments.h | 28 + usr/src/head/bhyve.h | 25 + usr/src/lib/libvmmapi/Makefile | 49 + usr/src/lib/libvmmapi/Makefile.com | 53 + usr/src/lib/libvmmapi/amd64/Makefile | 21 + usr/src/lib/libvmmapi/common/llib-lvmmapi | 2 + usr/src/lib/libvmmapi/common/mapfile-vers | 77 + usr/src/lib/libvmmapi/common/vmmapi.c | 1257 +++++++++ usr/src/lib/libvmmapi/common/vmmapi.h | 159 ++ usr/src/tools/scripts/gensetdefs.pl | 31 + usr/src/uts/i86pc/io/viona/viona.c | 1404 ++++++++++ usr/src/uts/i86pc/io/viona/viona.conf | 14 + usr/src/uts/i86pc/io/vmm/amd/amdv.c | 271 ++ usr/src/uts/i86pc/io/vmm/intel/ept.c | 452 ++++ usr/src/uts/i86pc/io/vmm/intel/ept.h | 43 + usr/src/uts/i86pc/io/vmm/intel/vmcs.c | 597 ++++ usr/src/uts/i86pc/io/vmm/intel/vmcs.h | 410 +++ usr/src/uts/i86pc/io/vmm/intel/vmx.c | 2842 ++++++++++++++++++++ usr/src/uts/i86pc/io/vmm/intel/vmx.h | 156 ++ usr/src/uts/i86pc/io/vmm/intel/vmx_controls.h | 96 + usr/src/uts/i86pc/io/vmm/intel/vmx_cpufunc.h | 245 ++ usr/src/uts/i86pc/io/vmm/intel/vmx_msr.c | 445 +++ usr/src/uts/i86pc/io/vmm/intel/vmx_msr.h | 70 + usr/src/uts/i86pc/io/vmm/intel/vmx_support.s | 271 ++ usr/src/uts/i86pc/io/vmm/io/vatpic.c | 809 ++++++ usr/src/uts/i86pc/io/vmm/io/vatpic.h | 57 + usr/src/uts/i86pc/io/vmm/io/vatpit.c | 458 ++++ usr/src/uts/i86pc/io/vmm/io/vatpit.h | 45 + usr/src/uts/i86pc/io/vmm/io/vdev.c | 282 ++ usr/src/uts/i86pc/io/vmm/io/vdev.h | 96 + usr/src/uts/i86pc/io/vmm/io/vhpet.c | 821 ++++++ usr/src/uts/i86pc/io/vmm/io/vhpet.h | 44 + usr/src/uts/i86pc/io/vmm/io/vioapic.c | 514 ++++ usr/src/uts/i86pc/io/vmm/io/vioapic.h | 66 + usr/src/uts/i86pc/io/vmm/io/vlapic.c | 1687 ++++++++++++ usr/src/uts/i86pc/io/vmm/io/vlapic.h | 109 + usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h | 190 ++ usr/src/uts/i86pc/io/vmm/offsets.in | 72 + usr/src/uts/i86pc/io/vmm/vmm.c | 1894 +++++++++++++ usr/src/uts/i86pc/io/vmm/vmm.conf | 1 + usr/src/uts/i86pc/io/vmm/vmm_host.c | 160 ++ usr/src/uts/i86pc/io/vmm/vmm_host.h | 119 + usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c | 2370 ++++++++++++++++ usr/src/uts/i86pc/io/vmm/vmm_ioport.c | 174 ++ usr/src/uts/i86pc/io/vmm/vmm_ioport.h | 37 + usr/src/uts/i86pc/io/vmm/vmm_ipi.h | 37 + usr/src/uts/i86pc/io/vmm/vmm_ktr.h | 69 + usr/src/uts/i86pc/io/vmm/vmm_lapic.c | 256 ++ usr/src/uts/i86pc/io/vmm/vmm_lapic.h | 87 + usr/src/uts/i86pc/io/vmm/vmm_mem.h | 49 + usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c | 1040 +++++++ usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c | 779 ++++++ usr/src/uts/i86pc/io/vmm/vmm_sol_mem.c | 111 + usr/src/uts/i86pc/io/vmm/vmm_stat.h | 127 + usr/src/uts/i86pc/io/vmm/vmm_util.c | 125 + usr/src/uts/i86pc/io/vmm/vmm_util.h | 40 + usr/src/uts/i86pc/io/vmm/vmx_assym.s | 1 + usr/src/uts/i86pc/io/vmm/x86.c | 276 ++ 
usr/src/uts/i86pc/io/vmm/x86.h | 65 + usr/src/uts/i86pc/sys/viona_io.h | 45 + usr/src/uts/i86pc/sys/vmm.h | 565 ++++ usr/src/uts/i86pc/sys/vmm_dev.h | 334 +++ usr/src/uts/i86pc/sys/vmm_impl.h | 86 + usr/src/uts/i86pc/sys/vmm_instruction_emul.h | 126 + usr/src/uts/i86pc/viona/Makefile | 72 + usr/src/uts/i86pc/vmm/Makefile | 94 + 214 files changed, 52298 insertions(+) create mode 100644 usr/contrib/freebsd/amd64/machine/_types.h create mode 100644 usr/contrib/freebsd/amd64/machine/psl.h create mode 100644 usr/contrib/freebsd/amd64/machine/specialreg.h create mode 100644 usr/contrib/freebsd/amd64/machine/timerreg.h create mode 100644 usr/contrib/freebsd/amd64/machine/vm.h create mode 100644 usr/contrib/freebsd/dev/acpica/acpi_hpet.h create mode 100644 usr/contrib/freebsd/dev/ic/i8253reg.h create mode 100644 usr/contrib/freebsd/dev/ic/i8259.h create mode 100644 usr/contrib/freebsd/dev/ic/ns16550.h create mode 100644 usr/contrib/freebsd/dev/pci/pcireg.h create mode 100644 usr/contrib/freebsd/isa/isareg.h create mode 100644 usr/contrib/freebsd/lib/libutil/expand_number.c create mode 100644 usr/contrib/freebsd/sys/ata.h create mode 100644 usr/contrib/freebsd/sys/linker_set.h create mode 100644 usr/contrib/freebsd/sys/tree.h create mode 100644 usr/contrib/freebsd/x86/apicreg.h create mode 100644 usr/contrib/freebsd/x86/mptable.h create mode 100644 usr/contrib/freebsd/x86/psl.h create mode 100644 usr/contrib/freebsd/x86/specialreg.h create mode 100644 usr/src/cmd/bhyve/Makefile create mode 100644 usr/src/cmd/bhyve/Makefile.com create mode 100644 usr/src/cmd/bhyve/acpi.h create mode 100644 usr/src/cmd/bhyve/ahci.h create mode 100644 usr/src/cmd/bhyve/amd64/Makefile create mode 100644 usr/src/cmd/bhyve/atkbdc.c create mode 100644 usr/src/cmd/bhyve/atkbdc.h create mode 100644 usr/src/cmd/bhyve/bhyve_sol_glue.c create mode 100644 usr/src/cmd/bhyve/bhyvegc.c create mode 100644 usr/src/cmd/bhyve/bhyvegc.h create mode 100644 usr/src/cmd/bhyve/bhyverun.c create mode 100644 usr/src/cmd/bhyve/bhyverun.h create mode 100644 usr/src/cmd/bhyve/block_if.c create mode 100644 usr/src/cmd/bhyve/block_if.h create mode 100644 usr/src/cmd/bhyve/console.c create mode 100644 usr/src/cmd/bhyve/console.h create mode 100644 usr/src/cmd/bhyve/consport.c create mode 100644 usr/src/cmd/bhyve/dbgport.h create mode 100644 usr/src/cmd/bhyve/inout.c create mode 100644 usr/src/cmd/bhyve/inout.h create mode 100644 usr/src/cmd/bhyve/ioapic.c create mode 100644 usr/src/cmd/bhyve/ioapic.h create mode 100644 usr/src/cmd/bhyve/mem.c create mode 100644 usr/src/cmd/bhyve/mem.h create mode 100644 usr/src/cmd/bhyve/mptbl.c create mode 100644 usr/src/cmd/bhyve/mptbl.h create mode 100644 usr/src/cmd/bhyve/pci_ahci.c create mode 100644 usr/src/cmd/bhyve/pci_emul.c create mode 100644 usr/src/cmd/bhyve/pci_emul.h create mode 100644 usr/src/cmd/bhyve/pci_hostbridge.c create mode 100644 usr/src/cmd/bhyve/pci_irq.c create mode 100644 usr/src/cmd/bhyve/pci_irq.h create mode 100644 usr/src/cmd/bhyve/pci_lpc.c create mode 100644 usr/src/cmd/bhyve/pci_lpc.h create mode 100644 usr/src/cmd/bhyve/pci_virtio_block.c create mode 100644 usr/src/cmd/bhyve/pci_virtio_net.c create mode 100644 usr/src/cmd/bhyve/pci_virtio_viona.c create mode 100644 usr/src/cmd/bhyve/pm.c create mode 100644 usr/src/cmd/bhyve/pmtmr.c create mode 100644 usr/src/cmd/bhyve/post.c create mode 100644 usr/src/cmd/bhyve/ps2kbd.c create mode 100644 usr/src/cmd/bhyve/ps2kbd.h create mode 100644 usr/src/cmd/bhyve/ps2mouse.c create mode 100644 usr/src/cmd/bhyve/ps2mouse.h create mode 100644 
usr/src/cmd/bhyve/rfb.c create mode 100644 usr/src/cmd/bhyve/rfb.h create mode 100644 usr/src/cmd/bhyve/rtc.c create mode 100644 usr/src/cmd/bhyve/rtc.h create mode 100644 usr/src/cmd/bhyve/smbiostbl.c create mode 100644 usr/src/cmd/bhyve/smbiostbl.h create mode 100644 usr/src/cmd/bhyve/spinup_ap.c create mode 100644 usr/src/cmd/bhyve/spinup_ap.h create mode 100644 usr/src/cmd/bhyve/uart_emul.c create mode 100644 usr/src/cmd/bhyve/uart_emul.h create mode 100644 usr/src/cmd/bhyve/vga.c create mode 100644 usr/src/cmd/bhyve/vga.h create mode 100644 usr/src/cmd/bhyve/virtio.c create mode 100644 usr/src/cmd/bhyve/virtio.h create mode 100644 usr/src/cmd/bhyve/xmsr.c create mode 100644 usr/src/cmd/bhyve/xmsr.h create mode 100644 usr/src/cmd/bhyveconsole/Makefile create mode 100644 usr/src/cmd/bhyveconsole/bhyveconsole.c create mode 100644 usr/src/cmd/bhyveconsole/i386/Makefile create mode 100644 usr/src/cmd/bhyvectl/Makefile create mode 100644 usr/src/cmd/bhyvectl/Makefile.com create mode 100644 usr/src/cmd/bhyvectl/amd64/Makefile create mode 100644 usr/src/cmd/bhyvectl/bhyvectl.c create mode 100644 usr/src/cmd/bhyveload-uefi/Makefile create mode 100644 usr/src/cmd/bhyveload-uefi/Makefile.com create mode 100644 usr/src/cmd/bhyveload-uefi/amd64/Makefile create mode 100644 usr/src/cmd/bhyveload-uefi/bhyveload-uefi.c create mode 100644 usr/src/cmd/bhyveload-uefi/i386/Makefile create mode 100644 usr/src/cmd/mdb/intel/amd64/vmm/Makefile create mode 100644 usr/src/cmd/mdb/intel/amd64/vmm/amd64/Makefile create mode 100644 usr/src/cmd/mdb/intel/amd64/vmm/vmm.c create mode 100644 usr/src/compat/freebsd/amd64/machine/asmacros.h create mode 100644 usr/src/compat/freebsd/amd64/machine/atomic.h create mode 100644 usr/src/compat/freebsd/amd64/machine/clock.h create mode 100644 usr/src/compat/freebsd/amd64/machine/cpufunc.h create mode 100644 usr/src/compat/freebsd/amd64/machine/fpu.h create mode 100644 usr/src/compat/freebsd/amd64/machine/md_var.h create mode 100644 usr/src/compat/freebsd/amd64/machine/param.h create mode 100644 usr/src/compat/freebsd/amd64/machine/pcb.h create mode 100644 usr/src/compat/freebsd/amd64/machine/pmap.h create mode 100644 usr/src/compat/freebsd/amd64/machine/segments.h create mode 100644 usr/src/compat/freebsd/amd64/machine/smp.h create mode 100644 usr/src/compat/freebsd/amd64/machine/vmm.h create mode 100644 usr/src/compat/freebsd/amd64/machine/vmm_dev.h create mode 100644 usr/src/compat/freebsd/amd64/machine/vmm_instruction_emul.h create mode 100644 usr/src/compat/freebsd/amd64/machine/vmparam.h create mode 100644 usr/src/compat/freebsd/libutil.h create mode 100644 usr/src/compat/freebsd/net/ethernet.h create mode 100644 usr/src/compat/freebsd/paths.h create mode 100644 usr/src/compat/freebsd/pthread_np.h create mode 100644 usr/src/compat/freebsd/string.h create mode 100644 usr/src/compat/freebsd/strings.h create mode 100644 usr/src/compat/freebsd/sys/_iovec.h create mode 100644 usr/src/compat/freebsd/sys/_pthreadtypes.h create mode 100644 usr/src/compat/freebsd/sys/_types.h create mode 100644 usr/src/compat/freebsd/sys/callout.h create mode 100644 usr/src/compat/freebsd/sys/cdefs.h create mode 100644 usr/src/compat/freebsd/sys/cpuset.h create mode 100644 usr/src/compat/freebsd/sys/disk.h create mode 100644 usr/src/compat/freebsd/sys/endian.h create mode 100644 usr/src/compat/freebsd/sys/errno.h create mode 100644 usr/src/compat/freebsd/sys/fcntl.h create mode 100644 usr/src/compat/freebsd/sys/ioctl.h create mode 100644 usr/src/compat/freebsd/sys/kernel.h create mode 100644 
usr/src/compat/freebsd/sys/ktr.h create mode 100644 usr/src/compat/freebsd/sys/libkern.h create mode 100644 usr/src/compat/freebsd/sys/limits.h create mode 100644 usr/src/compat/freebsd/sys/malloc.h create mode 100644 usr/src/compat/freebsd/sys/module.h create mode 100644 usr/src/compat/freebsd/sys/mutex.h create mode 100644 usr/src/compat/freebsd/sys/param.h create mode 100644 usr/src/compat/freebsd/sys/pcpu.h create mode 100644 usr/src/compat/freebsd/sys/sched.h create mode 100644 usr/src/compat/freebsd/sys/select.h create mode 100644 usr/src/compat/freebsd/sys/smp.h create mode 100644 usr/src/compat/freebsd/sys/sysctl.h create mode 100644 usr/src/compat/freebsd/sys/systm.h create mode 100644 usr/src/compat/freebsd/sys/time.h create mode 100644 usr/src/compat/freebsd/sys/types.h create mode 100644 usr/src/compat/freebsd/sys/uio.h create mode 100644 usr/src/compat/freebsd/termios.h create mode 100644 usr/src/compat/freebsd/uuid.h create mode 100644 usr/src/compat/freebsd/vm/pmap.h create mode 100644 usr/src/compat/freebsd/vm/vm.h create mode 100644 usr/src/compat/freebsd/x86/_types.h create mode 100644 usr/src/compat/freebsd/x86/segments.h create mode 100644 usr/src/head/bhyve.h create mode 100644 usr/src/lib/libvmmapi/Makefile create mode 100644 usr/src/lib/libvmmapi/Makefile.com create mode 100644 usr/src/lib/libvmmapi/amd64/Makefile create mode 100644 usr/src/lib/libvmmapi/common/llib-lvmmapi create mode 100644 usr/src/lib/libvmmapi/common/mapfile-vers create mode 100644 usr/src/lib/libvmmapi/common/vmmapi.c create mode 100644 usr/src/lib/libvmmapi/common/vmmapi.h create mode 100644 usr/src/tools/scripts/gensetdefs.pl create mode 100644 usr/src/uts/i86pc/io/viona/viona.c create mode 100644 usr/src/uts/i86pc/io/viona/viona.conf create mode 100644 usr/src/uts/i86pc/io/vmm/amd/amdv.c create mode 100644 usr/src/uts/i86pc/io/vmm/intel/ept.c create mode 100644 usr/src/uts/i86pc/io/vmm/intel/ept.h create mode 100644 usr/src/uts/i86pc/io/vmm/intel/vmcs.c create mode 100644 usr/src/uts/i86pc/io/vmm/intel/vmcs.h create mode 100644 usr/src/uts/i86pc/io/vmm/intel/vmx.c create mode 100644 usr/src/uts/i86pc/io/vmm/intel/vmx.h create mode 100644 usr/src/uts/i86pc/io/vmm/intel/vmx_controls.h create mode 100644 usr/src/uts/i86pc/io/vmm/intel/vmx_cpufunc.h create mode 100644 usr/src/uts/i86pc/io/vmm/intel/vmx_msr.c create mode 100644 usr/src/uts/i86pc/io/vmm/intel/vmx_msr.h create mode 100644 usr/src/uts/i86pc/io/vmm/intel/vmx_support.s create mode 100644 usr/src/uts/i86pc/io/vmm/io/vatpic.c create mode 100644 usr/src/uts/i86pc/io/vmm/io/vatpic.h create mode 100644 usr/src/uts/i86pc/io/vmm/io/vatpit.c create mode 100644 usr/src/uts/i86pc/io/vmm/io/vatpit.h create mode 100644 usr/src/uts/i86pc/io/vmm/io/vdev.c create mode 100644 usr/src/uts/i86pc/io/vmm/io/vdev.h create mode 100644 usr/src/uts/i86pc/io/vmm/io/vhpet.c create mode 100644 usr/src/uts/i86pc/io/vmm/io/vhpet.h create mode 100644 usr/src/uts/i86pc/io/vmm/io/vioapic.c create mode 100644 usr/src/uts/i86pc/io/vmm/io/vioapic.h create mode 100644 usr/src/uts/i86pc/io/vmm/io/vlapic.c create mode 100644 usr/src/uts/i86pc/io/vmm/io/vlapic.h create mode 100644 usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h create mode 100644 usr/src/uts/i86pc/io/vmm/offsets.in create mode 100644 usr/src/uts/i86pc/io/vmm/vmm.c create mode 100644 usr/src/uts/i86pc/io/vmm/vmm.conf create mode 100644 usr/src/uts/i86pc/io/vmm/vmm_host.c create mode 100644 usr/src/uts/i86pc/io/vmm/vmm_host.h create mode 100644 usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c create mode 100644 
usr/src/uts/i86pc/io/vmm/vmm_ioport.c create mode 100644 usr/src/uts/i86pc/io/vmm/vmm_ioport.h create mode 100644 usr/src/uts/i86pc/io/vmm/vmm_ipi.h create mode 100644 usr/src/uts/i86pc/io/vmm/vmm_ktr.h create mode 100644 usr/src/uts/i86pc/io/vmm/vmm_lapic.c create mode 100644 usr/src/uts/i86pc/io/vmm/vmm_lapic.h create mode 100644 usr/src/uts/i86pc/io/vmm/vmm_mem.h create mode 100644 usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c create mode 100644 usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c create mode 100644 usr/src/uts/i86pc/io/vmm/vmm_sol_mem.c create mode 100644 usr/src/uts/i86pc/io/vmm/vmm_stat.h create mode 100644 usr/src/uts/i86pc/io/vmm/vmm_util.c create mode 100644 usr/src/uts/i86pc/io/vmm/vmm_util.h create mode 100644 usr/src/uts/i86pc/io/vmm/vmx_assym.s create mode 100644 usr/src/uts/i86pc/io/vmm/x86.c create mode 100644 usr/src/uts/i86pc/io/vmm/x86.h create mode 100644 usr/src/uts/i86pc/sys/viona_io.h create mode 100644 usr/src/uts/i86pc/sys/vmm.h create mode 100644 usr/src/uts/i86pc/sys/vmm_dev.h create mode 100644 usr/src/uts/i86pc/sys/vmm_impl.h create mode 100644 usr/src/uts/i86pc/sys/vmm_instruction_emul.h create mode 100644 usr/src/uts/i86pc/viona/Makefile create mode 100644 usr/src/uts/i86pc/vmm/Makefile (limited to 'usr/src/uts/i86pc') diff --git a/usr/contrib/freebsd/amd64/machine/_types.h b/usr/contrib/freebsd/amd64/machine/_types.h new file mode 100644 index 0000000000..59994352b5 --- /dev/null +++ b/usr/contrib/freebsd/amd64/machine/_types.h @@ -0,0 +1,6 @@ +/*- + * This file is in the public domain. + */ +/* $FreeBSD: head/sys/amd64/include/_types.h 232261 2012-02-28 18:15:28Z tijl $ */ + +#include <x86/_types.h> diff --git a/usr/contrib/freebsd/amd64/machine/psl.h b/usr/contrib/freebsd/amd64/machine/psl.h new file mode 100644 index 0000000000..c660bfbab0 --- /dev/null +++ b/usr/contrib/freebsd/amd64/machine/psl.h @@ -0,0 +1,6 @@ +/*- + * This file is in the public domain. + */ +/* $FreeBSD: head/sys/amd64/include/psl.h 233204 2012-03-19 21:29:57Z tijl $ */ + +#include <x86/psl.h> diff --git a/usr/contrib/freebsd/amd64/machine/specialreg.h b/usr/contrib/freebsd/amd64/machine/specialreg.h new file mode 100644 index 0000000000..41d4125cb9 --- /dev/null +++ b/usr/contrib/freebsd/amd64/machine/specialreg.h @@ -0,0 +1,6 @@ +/*- + * This file is in the public domain. + */ +/* $FreeBSD: head/sys/amd64/include/specialreg.h 233207 2012-03-19 21:34:11Z tijl $ */ + +#include <x86/specialreg.h> diff --git a/usr/contrib/freebsd/amd64/machine/timerreg.h b/usr/contrib/freebsd/amd64/machine/timerreg.h new file mode 100644 index 0000000000..bca7b4dd19 --- /dev/null +++ b/usr/contrib/freebsd/amd64/machine/timerreg.h @@ -0,0 +1,54 @@ +/*- + * Copyright (C) 2005 TAKAHASHI Yoshihiro. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/include/timerreg.h 177642 2008-03-26 20:09:21Z phk $ + */ + +/* + * The outputs of the three timers are connected as follows: + * + * timer 0 -> irq 0 + * timer 1 -> dma chan 0 (for dram refresh) + * timer 2 -> speaker (via keyboard controller) + * + * Timer 0 is used to call hardclock. + * Timer 2 is used to generate console beeps. + */ + +#ifndef _MACHINE_TIMERREG_H_ +#define _MACHINE_TIMERREG_H_ + +#ifdef _KERNEL + +#include <dev/ic/i8253reg.h> + +#define IO_TIMER1 0x40 /* 8253 Timer #1 */ +#define TIMER_CNTR0 (IO_TIMER1 + TIMER_REG_CNTR0) +#define TIMER_CNTR1 (IO_TIMER1 + TIMER_REG_CNTR1) +#define TIMER_CNTR2 (IO_TIMER1 + TIMER_REG_CNTR2) +#define TIMER_MODE (IO_TIMER1 + TIMER_REG_MODE) + +#endif /* _KERNEL */ + +#endif /* _MACHINE_TIMERREG_H_ */ diff --git a/usr/contrib/freebsd/amd64/machine/vm.h b/usr/contrib/freebsd/amd64/machine/vm.h new file mode 100644 index 0000000000..885c1607ea --- /dev/null +++ b/usr/contrib/freebsd/amd64/machine/vm.h @@ -0,0 +1,45 @@ +/*- + * Copyright (c) 2009 Advanced Computing Technologies LLC + * Written by: John H. Baldwin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/include/vm.h 233671 2012-03-29 16:51:22Z jhb $ + */ + +#ifndef _MACHINE_VM_H_ +#define _MACHINE_VM_H_ + +#include <machine/specialreg.h> + +/* Memory attributes.
*/ +#define VM_MEMATTR_UNCACHEABLE ((vm_memattr_t)PAT_UNCACHEABLE) +#define VM_MEMATTR_WRITE_COMBINING ((vm_memattr_t)PAT_WRITE_COMBINING) +#define VM_MEMATTR_WRITE_THROUGH ((vm_memattr_t)PAT_WRITE_THROUGH) +#define VM_MEMATTR_WRITE_PROTECTED ((vm_memattr_t)PAT_WRITE_PROTECTED) +#define VM_MEMATTR_WRITE_BACK ((vm_memattr_t)PAT_WRITE_BACK) +#define VM_MEMATTR_WEAK_UNCACHEABLE ((vm_memattr_t)PAT_UNCACHED) + +#define VM_MEMATTR_DEFAULT VM_MEMATTR_WRITE_BACK + +#endif /* !_MACHINE_VM_H_ */ diff --git a/usr/contrib/freebsd/dev/acpica/acpi_hpet.h b/usr/contrib/freebsd/dev/acpica/acpi_hpet.h new file mode 100644 index 0000000000..df817b7a2b --- /dev/null +++ b/usr/contrib/freebsd/dev/acpica/acpi_hpet.h @@ -0,0 +1,67 @@ +/*- + * Copyright (c) 2005 Poul-Henning Kamp + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD: head/sys/dev/acpica/acpi_hpet.h 224919 2011-08-16 21:51:29Z mav $ + */ + +#ifndef __ACPI_HPET_H__ +#define __ACPI_HPET_H__ + +#define HPET_MEM_WIDTH 0x400 /* Expected memory region size */ + +/* General registers */ +#define HPET_CAPABILITIES 0x0 /* General capabilities and ID */ +#define HPET_CAP_VENDOR_ID 0xffff0000 +#define HPET_CAP_LEG_RT 0x00008000 +#define HPET_CAP_COUNT_SIZE 0x00002000 /* 1 = 64-bit, 0 = 32-bit */ +#define HPET_CAP_NUM_TIM 0x00001f00 +#define HPET_CAP_REV_ID 0x000000ff +#define HPET_PERIOD 0x4 /* Period (1/hz) of timer */ +#define HPET_CONFIG 0x10 /* General configuration register */ +#define HPET_CNF_LEG_RT 0x00000002 +#define HPET_CNF_ENABLE 0x00000001 +#define HPET_ISR 0x20 /* General interrupt status register */ +#define HPET_MAIN_COUNTER 0xf0 /* Main counter register */ + +/* Timer registers */ +#define HPET_TIMER_CAP_CNF(x) ((x) * 0x20 + 0x100) +#define HPET_TCAP_INT_ROUTE 0xffffffff00000000 +#define HPET_TCAP_FSB_INT_DEL 0x00008000 +#define HPET_TCNF_FSB_EN 0x00004000 +#define HPET_TCNF_INT_ROUTE 0x00003e00 +#define HPET_TCNF_32MODE 0x00000100 +#define HPET_TCNF_VAL_SET 0x00000040 +#define HPET_TCAP_SIZE 0x00000020 /* 1 = 64-bit, 0 = 32-bit */ +#define HPET_TCAP_PER_INT 0x00000010 /* Supports periodic interrupts */ +#define HPET_TCNF_TYPE 0x00000008 /* 1 = periodic, 0 = one-shot */ +#define HPET_TCNF_INT_ENB 0x00000004 +#define HPET_TCNF_INT_TYPE 0x00000002 /* 1 = level triggered, 0 = edge */ +#define HPET_TIMER_COMPARATOR(x) ((x) * 0x20 + 0x108) +#define HPET_TIMER_FSB_VAL(x) ((x) * 0x20 + 0x110) +#define HPET_TIMER_FSB_ADDR(x) ((x) * 0x20 + 0x114) + +#define HPET_MIN_CYCLES 128 /* Period considered reliable. */ + +#endif /* !__ACPI_HPET_H__ */ diff --git a/usr/contrib/freebsd/dev/ic/i8253reg.h b/usr/contrib/freebsd/dev/ic/i8253reg.h new file mode 100644 index 0000000000..47568b3436 --- /dev/null +++ b/usr/contrib/freebsd/dev/ic/i8253reg.h @@ -0,0 +1,78 @@ +/*- + * Copyright (c) 1993 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * from: Header: timerreg.h,v 1.2 93/02/28 15:08:58 mccanne Exp + * $FreeBSD: head/sys/dev/ic/i8253reg.h 146215 2005-05-14 10:26:31Z nyan $ + */ + +/* + * Register definitions for the Intel 8253 Programmable Interval Timer. + * + * This chip has three independent 16-bit down counters that can be + * read on the fly. There are three mode registers and three countdown + * registers. The countdown registers are addressed directly, via the + * first three I/O ports. The three mode registers are accessed via + * the fourth I/O port, with two bits in the mode byte indicating the + * register. (Why are hardware interfaces always so braindead?). + * + * To write a value into the countdown register, the mode register + * is first programmed with a command indicating which byte of + * the two byte register is to be modified. The three possibilities + * are load msb (TIMER_MSB), load lsb (TIMER_LSB), or load lsb then + * msb (TIMER_16BIT). + * + * To read the current value ("on the fly") from the countdown register, + * you write a "latch" command into the mode register, then read the stable + * value from the corresponding I/O port. For example, you write + * TIMER_LATCH into the corresponding mode register. Presumably, + * after doing this, a write operation to the I/O port would result + * in undefined behavior (but hopefully not fry the chip). + * Reading in this manner has no side effects. + */ + +/* + * Macros for specifying values to be written into a mode register. + */ +#define TIMER_REG_CNTR0 0 /* timer 0 counter port */ +#define TIMER_REG_CNTR1 1 /* timer 1 counter port */ +#define TIMER_REG_CNTR2 2 /* timer 2 counter port */ +#define TIMER_REG_MODE 3 /* timer mode port */ +#define TIMER_SEL0 0x00 /* select counter 0 */ +#define TIMER_SEL1 0x40 /* select counter 1 */ +#define TIMER_SEL2 0x80 /* select counter 2 */ +#define TIMER_INTTC 0x00 /* mode 0, intr on terminal cnt */ +#define TIMER_ONESHOT 0x02 /* mode 1, one shot */ +#define TIMER_RATEGEN 0x04 /* mode 2, rate generator */ +#define TIMER_SQWAVE 0x06 /* mode 3, square wave */ +#define TIMER_SWSTROBE 0x08 /* mode 4, s/w triggered strobe */ +#define TIMER_HWSTROBE 0x0a /* mode 5, h/w triggered strobe */ +#define TIMER_LATCH 0x00 /* latch counter for reading */ +#define TIMER_LSB 0x10 /* r/w counter LSB */ +#define TIMER_MSB 0x20 /* r/w counter MSB */ +#define TIMER_16BIT 0x30 /* r/w counter 16 bits, LSB first */ +#define TIMER_BCD 0x01 /* count in BCD */ diff --git a/usr/contrib/freebsd/dev/ic/i8259.h b/usr/contrib/freebsd/dev/ic/i8259.h new file mode 100644 index 0000000000..be523c1df4 --- /dev/null +++ b/usr/contrib/freebsd/dev/ic/i8259.h @@ -0,0 +1,86 @@ +/*- + * Copyright (c) 2003 Peter Wemm + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/dev/ic/i8259.h 151580 2005-10-23 09:05:51Z glebius $ + */ + +/* + * Register definitions for the i8259A programmable interrupt controller. + */ + +#ifndef _DEV_IC_I8259_H_ +#define _DEV_IC_I8259_H_ + +/* Initialization control word 1. Written to even address. */ +#define ICW1_IC4 0x01 /* ICW4 present */ +#define ICW1_SNGL 0x02 /* 1 = single, 0 = cascaded */ +#define ICW1_ADI 0x04 /* 1 = 4, 0 = 8 byte vectors */ +#define ICW1_LTIM 0x08 /* 1 = level trigger, 0 = edge */ +#define ICW1_RESET 0x10 /* must be 1 */ +/* 0x20 - 0x80 - in 8080/8085 mode only */ + +/* Initialization control word 2. Written to the odd address. */ +/* No definitions, it is the base vector of the IDT for 8086 mode */ + +/* Initialization control word 3. Written to the odd address. */ +/* For a master PIC, bitfield indicating a slave 8259 on given input */ +/* For slave, lower 3 bits are the slave's binary ID on the master */ + +/* Initialization control word 4. Written to the odd address. */ +#define ICW4_8086 0x01 /* 1 = 8086, 0 = 8080 */ +#define ICW4_AEOI 0x02 /* 1 = Auto EOI */ +#define ICW4_MS 0x04 /* 1 = buffered master, 0 = slave */ +#define ICW4_BUF 0x08 /* 1 = enable buffer mode */ +#define ICW4_SFNM 0x10 /* 1 = special fully nested mode */ + +/* Operation control words. Written after initialization. */ + +/* Operation control word type 1 */ +/* + * No definitions. Written to the odd address. Bitmask for interrupts. + * 1 = disabled. + */ + +/* Operation control word type 2. Bit 3 (0x08) must be zero. Even address. */ +#define OCW2_L0 0x01 /* Level */ +#define OCW2_L1 0x02 +#define OCW2_L2 0x04 +/* 0x08 must be 0 to select OCW2 vs OCW3 */ +/* 0x10 must be 0 to select OCW2 vs ICW1 */ +#define OCW2_EOI 0x20 /* 1 = EOI */ +#define OCW2_SL 0x40 /* EOI mode */ +#define OCW2_R 0x80 /* EOI mode */ + +/* Operation control word type 3. Bit 3 (0x08) must be set. Even address. */ +#define OCW3_RIS 0x01 /* 1 = read IS, 0 = read IR */ +#define OCW3_RR 0x02 /* register read */ +#define OCW3_P 0x04 /* poll mode command */ +/* 0x08 must be 1 to select OCW3 vs OCW2 */ +#define OCW3_SEL 0x08 /* must be 1 */ +/* 0x10 must be 0 to select OCW3 vs ICW1 */ +#define OCW3_SMM 0x20 /* special mode mask */ +#define OCW3_ESMM 0x40 /* enable SMM */ + +#endif /* !_DEV_IC_I8259_H_ */ diff --git a/usr/contrib/freebsd/dev/ic/ns16550.h b/usr/contrib/freebsd/dev/ic/ns16550.h new file mode 100644 index 0000000000..5e8f30e3e8 --- /dev/null +++ b/usr/contrib/freebsd/dev/ic/ns16550.h @@ -0,0 +1,240 @@ +/*- + * Copyright (c) 1991 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)ns16550.h 7.1 (Berkeley) 5/9/91 + * $FreeBSD: head/sys/dev/ic/ns16550.h 257170 2013-10-26 17:24:59Z zbb $ + */ + +/* + * NS8250... UART registers. + */ + +/* 8250 registers #[0-6]. */ + +#define com_data 0 /* data register (R/W) */ +#define REG_DATA com_data + +#define com_ier 1 /* interrupt enable register (W) */ +#define REG_IER com_ier +#define IER_ERXRDY 0x1 +#define IER_ETXRDY 0x2 +#define IER_ERLS 0x4 +#define IER_EMSC 0x8 + +#define IER_BITS "\20\1ERXRDY\2ETXRDY\3ERLS\4EMSC" + +#define com_iir 2 /* interrupt identification register (R) */ +#define REG_IIR com_iir +#define IIR_IMASK 0xf +#define IIR_RXTOUT 0xc +#define IIR_BUSY 0x7 +#define IIR_RLS 0x6 +#define IIR_RXRDY 0x4 +#define IIR_TXRDY 0x2 +#define IIR_NOPEND 0x1 +#define IIR_MLSC 0x0 +#define IIR_FIFO_MASK 0xc0 /* set if FIFOs are enabled */ + +#define IIR_BITS "\20\1NOPEND\2TXRDY\3RXRDY" + +#define com_lcr 3 /* line control register (R/W) */ +#define com_cfcr com_lcr /* character format control register (R/W) */ +#define REG_LCR com_lcr +#define LCR_DLAB 0x80 +#define CFCR_DLAB LCR_DLAB +#define LCR_EFR_ENABLE 0xbf /* magic to enable EFR on 16650 up */ +#define CFCR_EFR_ENABLE LCR_EFR_ENABLE +#define LCR_SBREAK 0x40 +#define CFCR_SBREAK LCR_SBREAK +#define LCR_PZERO 0x30 +#define CFCR_PZERO LCR_PZERO +#define LCR_PONE 0x20 +#define CFCR_PONE LCR_PONE +#define LCR_PEVEN 0x10 +#define CFCR_PEVEN LCR_PEVEN +#define LCR_PODD 0x00 +#define CFCR_PODD LCR_PODD +#define LCR_PENAB 0x08 +#define CFCR_PENAB LCR_PENAB +#define LCR_STOPB 0x04 +#define CFCR_STOPB LCR_STOPB +#define LCR_8BITS 0x03 +#define CFCR_8BITS LCR_8BITS +#define LCR_7BITS 0x02 +#define CFCR_7BITS LCR_7BITS +#define LCR_6BITS 0x01 +#define CFCR_6BITS LCR_6BITS +#define LCR_5BITS 0x00 +#define CFCR_5BITS LCR_5BITS + +#define com_mcr 4 /* modem control register (R/W) */ +#define REG_MCR com_mcr +#define MCR_PRESCALE 0x80 /* only available on 16650 up */ +#define MCR_LOOPBACK 0x10 +#define MCR_IE 0x08 +#define MCR_IENABLE MCR_IE +#define MCR_DRS 0x04 +#define MCR_RTS 0x02 +#define MCR_DTR 0x01 + +#define MCR_BITS "\20\1DTR\2RTS\3DRS\4IE\5LOOPBACK\10PRESCALE" + +#define com_lsr 5 /* line status register (R/W) */ +#define REG_LSR com_lsr +#define LSR_RCV_FIFO 0x80 +#define LSR_TEMT 0x40 +#define LSR_TSRE LSR_TEMT +#define LSR_THRE 0x20 +#define LSR_TXRDY LSR_THRE +#define LSR_BI 
0x10 +#define LSR_FE 0x08 +#define LSR_PE 0x04 +#define LSR_OE 0x02 +#define LSR_RXRDY 0x01 +#define LSR_RCV_MASK 0x1f + +#define LSR_BITS "\20\1RXRDY\2OE\3PE\4FE\5BI\6THRE\7TEMT\10RCV_FIFO" + +#define com_msr 6 /* modem status register (R/W) */ +#define REG_MSR com_msr +#define MSR_DCD 0x80 +#define MSR_RI 0x40 +#define MSR_DSR 0x20 +#define MSR_CTS 0x10 +#define MSR_DDCD 0x08 +#define MSR_TERI 0x04 +#define MSR_DDSR 0x02 +#define MSR_DCTS 0x01 + +#define MSR_BITS "\20\1DCTS\2DDSR\3TERI\4DDCD\5CTS\6DSR\7RI\10DCD" + +/* 8250 multiplexed registers #[0-1]. Access enabled by LCR[7]. */ +#define com_dll 0 /* divisor latch low (R/W) */ +#define com_dlbl com_dll +#define com_dlm 1 /* divisor latch high (R/W) */ +#define com_dlbh com_dlm +#define REG_DLL com_dll +#define REG_DLH com_dlm + +/* 16450 register #7. Not multiplexed. */ +#define com_scr 7 /* scratch register (R/W) */ + +/* 16550 register #2. Not multiplexed. */ +#define com_fcr 2 /* FIFO control register (W) */ +#define com_fifo com_fcr +#define REG_FCR com_fcr +#define FCR_ENABLE 0x01 +#define FIFO_ENABLE FCR_ENABLE +#define FCR_RCV_RST 0x02 +#define FIFO_RCV_RST FCR_RCV_RST +#define FCR_XMT_RST 0x04 +#define FIFO_XMT_RST FCR_XMT_RST +#define FCR_DMA 0x08 +#define FIFO_DMA_MODE FCR_DMA +#define FCR_RX_LOW 0x00 +#define FIFO_RX_LOW FCR_RX_LOW +#define FCR_RX_MEDL 0x40 +#define FIFO_RX_MEDL FCR_RX_MEDL +#define FCR_RX_MEDH 0x80 +#define FIFO_RX_MEDH FCR_RX_MEDH +#define FCR_RX_HIGH 0xc0 +#define FIFO_RX_HIGH FCR_RX_HIGH + +#define FCR_BITS "\20\1ENABLE\2RCV_RST\3XMT_RST\4DMA" + +/* 16650 registers #2,[4-7]. Access enabled by LCR_EFR_ENABLE. */ + +#define com_efr 2 /* enhanced features register (R/W) */ +#define REG_EFR com_efr +#define EFR_CTS 0x80 +#define EFR_AUTOCTS EFR_CTS +#define EFR_RTS 0x40 +#define EFR_AUTORTS EFR_RTS +#define EFR_EFE 0x10 /* enhanced functions enable */ + +#define com_xon1 4 /* XON 1 character (R/W) */ +#define com_xon2 5 /* XON 2 character (R/W) */ +#define com_xoff1 6 /* XOFF 1 character (R/W) */ +#define com_xoff2 7 /* XOFF 2 character (R/W) */ + +#define DW_REG_USR 31 /* DesignWare derived Uart Status Reg */ +#define com_usr 39 /* Octeon 16750/16550 Uart Status Reg */ +#define REG_USR com_usr +#define USR_BUSY 1 /* Uart Busy. Serial transfer in progress */ +#define USR_TXFIFO_NOTFULL 2 /* Uart TX FIFO Not full */ + +/* 16950 register #1. Access enabled by ACR[7]. Also requires !LCR[7]. */ +#define com_asr 1 /* additional status register (R[0-7]/W[0-1]) */ + +/* 16950 register #3. R/W access enabled by ACR[7]. */ +#define com_rfl 3 /* receiver fifo level (R) */ + +/* + * 16950 register #4. Access enabled by ACR[7]. Also requires + * !LCR_EFR_ENABLE. + */ +#define com_tfl 4 /* transmitter fifo level (R) */ + +/* + * 16950 register #5. Accessible if !LCR_EFR_ENABLE. Read access also + * requires ACR[6]. + */ +#define com_icr 5 /* index control register (R/W) */ + +/* + * 16950 register #7. It is the same as com_scr except it has a different + * abbreviation in the manufacturer's data sheet and it also serves as an + * index into the Indexed Control register set. + */ +#define com_spr com_scr /* scratch pad (and index) register (R/W) */ +#define REG_SPR com_scr + +/* + * 16950 indexed control registers #[0-0x13]. Access is via index in SPR, + * data in ICR (if ICR is accessible). 
+ */ + +#define com_acr 0 /* additional control register (R/W) */ +#define ACR_ASE 0x80 /* ASR/RFL/TFL enable */ +#define ACR_ICRE 0x40 /* ICR enable */ +#define ACR_TLE 0x20 /* TTL/RTL enable */ + +#define com_cpr 1 /* clock prescaler register (R/W) */ +#define com_tcr 2 /* times clock register (R/W) */ +#define com_ttl 4 /* transmitter trigger level (R/W) */ +#define com_rtl 5 /* receiver trigger level (R/W) */ +/* ... */ + +/* Hardware extension mode register for RSB-2000/3000. */ +#define com_emr com_msr +#define EMR_EXBUFF 0x04 +#define EMR_CTSFLW 0x08 +#define EMR_DSRFLW 0x10 +#define EMR_RTSFLW 0x20 +#define EMR_DTRFLW 0x40 +#define EMR_EFMODE 0x80 diff --git a/usr/contrib/freebsd/dev/pci/pcireg.h b/usr/contrib/freebsd/dev/pci/pcireg.h new file mode 100644 index 0000000000..32a569dbd4 --- /dev/null +++ b/usr/contrib/freebsd/dev/pci/pcireg.h @@ -0,0 +1,922 @@ +/*- + * Copyright (c) 1997, Stefan Esser + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD: head/sys/dev/pci/pcireg.h 266468 2014-05-20 14:39:22Z mav $ + * + */ + +/* + * PCIM_xxx: mask to locate subfield in register + * PCIR_xxx: config register offset + * PCIC_xxx: device class + * PCIS_xxx: device subclass + * PCIP_xxx: device programming interface + * PCIV_xxx: PCI vendor ID (only required to fixup ancient devices) + * PCID_xxx: device ID + * PCIY_xxx: capability identification number + * PCIZ_xxx: extended capability identification number + */ + +/* some PCI bus constants */ +#define PCI_DOMAINMAX 65535 /* highest supported domain number */ +#define PCI_BUSMAX 255 /* highest supported bus number */ +#define PCI_SLOTMAX 31 /* highest supported slot number */ +#define PCI_FUNCMAX 7 /* highest supported function number */ +#define PCI_REGMAX 255 /* highest supported config register addr. */ +#define PCIE_REGMAX 4095 /* highest supported config register addr. 
*/ +#define PCI_MAXHDRTYPE 2 + +#define PCIE_ARI_SLOTMAX 0 +#define PCIE_ARI_FUNCMAX 255 + +#define PCI_RID_BUS_SHIFT 8 +#define PCI_RID_SLOT_SHIFT 3 +#define PCI_RID_FUNC_SHIFT 0 + +#define PCI_RID(bus, slot, func) \ + ((((bus) & PCI_BUSMAX) << PCI_RID_BUS_SHIFT) | \ + (((slot) & PCI_SLOTMAX) << PCI_RID_SLOT_SHIFT) | \ + (((func) & PCI_FUNCMAX) << PCI_RID_FUNC_SHIFT)) + +#define PCI_ARI_RID(bus, func) \ + ((((bus) & PCI_BUSMAX) << PCI_RID_BUS_SHIFT) | \ + (((func) & PCIE_ARI_FUNCMAX) << PCI_RID_FUNC_SHIFT)) + +#define PCI_RID2BUS(rid) (((rid) >> PCI_RID_BUS_SHIFT) & PCI_BUSMAX) +#define PCI_RID2SLOT(rid) (((rid) >> PCI_RID_SLOT_SHIFT) & PCI_SLOTMAX) +#define PCI_RID2FUNC(rid) (((rid) >> PCI_RID_FUNC_SHIFT) & PCI_FUNCMAX) + +#define PCIE_ARI_SLOT(func) (((func) >> PCI_RID_SLOT_SHIFT) & PCI_SLOTMAX) +#define PCIE_ARI_FUNC(func) (((func) >> PCI_RID_FUNC_SHIFT) & PCI_FUNCMAX) + +/* PCI config header registers for all devices */ + +#define PCIR_DEVVENDOR 0x00 +#define PCIR_VENDOR 0x00 +#define PCIR_DEVICE 0x02 +#define PCIR_COMMAND 0x04 +#define PCIM_CMD_PORTEN 0x0001 +#define PCIM_CMD_MEMEN 0x0002 +#define PCIM_CMD_BUSMASTEREN 0x0004 +#define PCIM_CMD_SPECIALEN 0x0008 +#define PCIM_CMD_MWRICEN 0x0010 +#define PCIM_CMD_PERRESPEN 0x0040 +#define PCIM_CMD_SERRESPEN 0x0100 +#define PCIM_CMD_BACKTOBACK 0x0200 +#define PCIM_CMD_INTxDIS 0x0400 +#define PCIR_STATUS 0x06 +#define PCIM_STATUS_INTxSTATE 0x0008 +#define PCIM_STATUS_CAPPRESENT 0x0010 +#define PCIM_STATUS_66CAPABLE 0x0020 +#define PCIM_STATUS_BACKTOBACK 0x0080 +#define PCIM_STATUS_MDPERR 0x0100 +#define PCIM_STATUS_SEL_FAST 0x0000 +#define PCIM_STATUS_SEL_MEDIMUM 0x0200 +#define PCIM_STATUS_SEL_SLOW 0x0400 +#define PCIM_STATUS_SEL_MASK 0x0600 +#define PCIM_STATUS_STABORT 0x0800 +#define PCIM_STATUS_RTABORT 0x1000 +#define PCIM_STATUS_RMABORT 0x2000 +#define PCIM_STATUS_SERR 0x4000 +#define PCIM_STATUS_PERR 0x8000 +#define PCIR_REVID 0x08 +#define PCIR_PROGIF 0x09 +#define PCIR_SUBCLASS 0x0a +#define PCIR_CLASS 0x0b +#define PCIR_CACHELNSZ 0x0c +#define PCIR_LATTIMER 0x0d +#define PCIR_HDRTYPE 0x0e +#define PCIM_HDRTYPE 0x7f +#define PCIM_HDRTYPE_NORMAL 0x00 +#define PCIM_HDRTYPE_BRIDGE 0x01 +#define PCIM_HDRTYPE_CARDBUS 0x02 +#define PCIM_MFDEV 0x80 +#define PCIR_BIST 0x0f + +/* Capability Register Offsets */ + +#define PCICAP_ID 0x0 +#define PCICAP_NEXTPTR 0x1 + +/* Capability Identification Numbers */ + +#define PCIY_PMG 0x01 /* PCI Power Management */ +#define PCIY_AGP 0x02 /* AGP */ +#define PCIY_VPD 0x03 /* Vital Product Data */ +#define PCIY_SLOTID 0x04 /* Slot Identification */ +#define PCIY_MSI 0x05 /* Message Signaled Interrupts */ +#define PCIY_CHSWP 0x06 /* CompactPCI Hot Swap */ +#define PCIY_PCIX 0x07 /* PCI-X */ +#define PCIY_HT 0x08 /* HyperTransport */ +#define PCIY_VENDOR 0x09 /* Vendor Unique */ +#define PCIY_DEBUG 0x0a /* Debug port */ +#define PCIY_CRES 0x0b /* CompactPCI central resource control */ +#define PCIY_HOTPLUG 0x0c /* PCI Hot-Plug */ +#define PCIY_SUBVENDOR 0x0d /* PCI-PCI bridge subvendor ID */ +#define PCIY_AGP8X 0x0e /* AGP 8x */ +#define PCIY_SECDEV 0x0f /* Secure Device */ +#define PCIY_EXPRESS 0x10 /* PCI Express */ +#define PCIY_MSIX 0x11 /* MSI-X */ +#define PCIY_SATA 0x12 /* SATA */ +#define PCIY_PCIAF 0x13 /* PCI Advanced Features */ + +/* Extended Capability Register Fields */ + +#define PCIR_EXTCAP 0x100 +#define PCIM_EXTCAP_ID 0x0000ffff +#define PCIM_EXTCAP_VER 0x000f0000 +#define PCIM_EXTCAP_NEXTPTR 0xfff00000 +#define PCI_EXTCAP_ID(ecap) ((ecap) & PCIM_EXTCAP_ID) +#define PCI_EXTCAP_VER(ecap) 
(((ecap) & PCIM_EXTCAP_VER) >> 16) +#define PCI_EXTCAP_NEXTPTR(ecap) (((ecap) & PCIM_EXTCAP_NEXTPTR) >> 20) + +/* Extended Capability Identification Numbers */ + +#define PCIZ_AER 0x0001 /* Advanced Error Reporting */ +#define PCIZ_VC 0x0002 /* Virtual Channel if MFVC Ext Cap not set */ +#define PCIZ_SERNUM 0x0003 /* Device Serial Number */ +#define PCIZ_PWRBDGT 0x0004 /* Power Budgeting */ +#define PCIZ_RCLINK_DCL 0x0005 /* Root Complex Link Declaration */ +#define PCIZ_RCLINK_CTL 0x0006 /* Root Complex Internal Link Control */ +#define PCIZ_RCEC_ASSOC 0x0007 /* Root Complex Event Collector Association */ +#define PCIZ_MFVC 0x0008 /* Multi-Function Virtual Channel */ +#define PCIZ_VC2 0x0009 /* Virtual Channel if MFVC Ext Cap set */ +#define PCIZ_RCRB 0x000a /* RCRB Header */ +#define PCIZ_VENDOR 0x000b /* Vendor Unique */ +#define PCIZ_CAC 0x000c /* Configuration Access Correction -- obsolete */ +#define PCIZ_ACS 0x000d /* Access Control Services */ +#define PCIZ_ARI 0x000e /* Alternative Routing-ID Interpretation */ +#define PCIZ_ATS 0x000f /* Address Translation Services */ +#define PCIZ_SRIOV 0x0010 /* Single Root IO Virtualization */ +#define PCIZ_MRIOV 0x0011 /* Multiple Root IO Virtualization */ +#define PCIZ_MULTICAST 0x0012 /* Multicast */ +#define PCIZ_PAGE_REQ 0x0013 /* Page Request */ +#define PCIZ_AMD 0x0014 /* Reserved for AMD */ +#define PCIZ_RESIZE_BAR 0x0015 /* Resizable BAR */ +#define PCIZ_DPA 0x0016 /* Dynamic Power Allocation */ +#define PCIZ_TPH_REQ 0x0017 /* TPH Requester */ +#define PCIZ_LTR 0x0018 /* Latency Tolerance Reporting */ +#define PCIZ_SEC_PCIE 0x0019 /* Secondary PCI Express */ +#define PCIZ_PMUX 0x001a /* Protocol Multiplexing */ +#define PCIZ_PASID 0x001b /* Process Address Space ID */ +#define PCIZ_LN_REQ 0x001c /* LN Requester */ +#define PCIZ_DPC 0x001d /* Downstream Port Containment */ +#define PCIZ_L1PM 0x001e /* L1 PM Substates */ + +/* config registers for header type 0 devices */ + +#define PCIR_BARS 0x10 +#define PCIR_BAR(x) (PCIR_BARS + (x) * 4) +#define PCIR_MAX_BAR_0 5 +#define PCI_RID2BAR(rid) (((rid) - PCIR_BARS) / 4) +#define PCI_BAR_IO(x) (((x) & PCIM_BAR_SPACE) == PCIM_BAR_IO_SPACE) +#define PCI_BAR_MEM(x) (((x) & PCIM_BAR_SPACE) == PCIM_BAR_MEM_SPACE) +#define PCIM_BAR_SPACE 0x00000001 +#define PCIM_BAR_MEM_SPACE 0 +#define PCIM_BAR_IO_SPACE 1 +#define PCIM_BAR_MEM_TYPE 0x00000006 +#define PCIM_BAR_MEM_32 0 +#define PCIM_BAR_MEM_1MB 2 /* Locate below 1MB in PCI <= 2.1 */ +#define PCIM_BAR_MEM_64 4 +#define PCIM_BAR_MEM_PREFETCH 0x00000008 +#define PCIM_BAR_MEM_BASE 0xfffffffffffffff0ULL +#define PCIM_BAR_IO_RESERVED 0x00000002 +#define PCIM_BAR_IO_BASE 0xfffffffc +#define PCIR_CIS 0x28 +#define PCIM_CIS_ASI_MASK 0x00000007 +#define PCIM_CIS_ASI_CONFIG 0 +#define PCIM_CIS_ASI_BAR0 1 +#define PCIM_CIS_ASI_BAR1 2 +#define PCIM_CIS_ASI_BAR2 3 +#define PCIM_CIS_ASI_BAR3 4 +#define PCIM_CIS_ASI_BAR4 5 +#define PCIM_CIS_ASI_BAR5 6 +#define PCIM_CIS_ASI_ROM 7 +#define PCIM_CIS_ADDR_MASK 0x0ffffff8 +#define PCIM_CIS_ROM_MASK 0xf0000000 +#define PCIM_CIS_CONFIG_MASK 0xff +#define PCIR_SUBVEND_0 0x2c +#define PCIR_SUBDEV_0 0x2e +#define PCIR_BIOS 0x30 +#define PCIM_BIOS_ENABLE 0x01 +#define PCIM_BIOS_ADDR_MASK 0xfffff800 +#define PCIR_CAP_PTR 0x34 +#define PCIR_INTLINE 0x3c +#define PCIR_INTPIN 0x3d +#define PCIR_MINGNT 0x3e +#define PCIR_MAXLAT 0x3f + +/* config registers for header type 1 (PCI-to-PCI bridge) devices */ + +#define PCIR_MAX_BAR_1 1 +#define PCIR_SECSTAT_1 0x1e + +#define PCIR_PRIBUS_1 0x18 +#define PCIR_SECBUS_1 0x19 +#define
PCIR_SUBBUS_1 0x1a +#define PCIR_SECLAT_1 0x1b + +#define PCIR_IOBASEL_1 0x1c +#define PCIR_IOLIMITL_1 0x1d +#define PCIR_IOBASEH_1 0x30 +#define PCIR_IOLIMITH_1 0x32 +#define PCIM_BRIO_16 0x0 +#define PCIM_BRIO_32 0x1 +#define PCIM_BRIO_MASK 0xf + +#define PCIR_MEMBASE_1 0x20 +#define PCIR_MEMLIMIT_1 0x22 + +#define PCIR_PMBASEL_1 0x24 +#define PCIR_PMLIMITL_1 0x26 +#define PCIR_PMBASEH_1 0x28 +#define PCIR_PMLIMITH_1 0x2c +#define PCIM_BRPM_32 0x0 +#define PCIM_BRPM_64 0x1 +#define PCIM_BRPM_MASK 0xf + +#define PCIR_BIOS_1 0x38 +#define PCIR_BRIDGECTL_1 0x3e + +/* config registers for header type 2 (CardBus) devices */ + +#define PCIR_MAX_BAR_2 0 +#define PCIR_CAP_PTR_2 0x14 +#define PCIR_SECSTAT_2 0x16 + +#define PCIR_PRIBUS_2 0x18 +#define PCIR_SECBUS_2 0x19 +#define PCIR_SUBBUS_2 0x1a +#define PCIR_SECLAT_2 0x1b + +#define PCIR_MEMBASE0_2 0x1c +#define PCIR_MEMLIMIT0_2 0x20 +#define PCIR_MEMBASE1_2 0x24 +#define PCIR_MEMLIMIT1_2 0x28 +#define PCIR_IOBASE0_2 0x2c +#define PCIR_IOLIMIT0_2 0x30 +#define PCIR_IOBASE1_2 0x34 +#define PCIR_IOLIMIT1_2 0x38 + +#define PCIR_BRIDGECTL_2 0x3e + +#define PCIR_SUBVEND_2 0x40 +#define PCIR_SUBDEV_2 0x42 + +#define PCIR_PCCARDIF_2 0x44 + +/* PCI device class, subclass and programming interface definitions */ + +#define PCIC_OLD 0x00 +#define PCIS_OLD_NONVGA 0x00 +#define PCIS_OLD_VGA 0x01 + +#define PCIC_STORAGE 0x01 +#define PCIS_STORAGE_SCSI 0x00 +#define PCIS_STORAGE_IDE 0x01 +#define PCIP_STORAGE_IDE_MODEPRIM 0x01 +#define PCIP_STORAGE_IDE_PROGINDPRIM 0x02 +#define PCIP_STORAGE_IDE_MODESEC 0x04 +#define PCIP_STORAGE_IDE_PROGINDSEC 0x08 +#define PCIP_STORAGE_IDE_MASTERDEV 0x80 +#define PCIS_STORAGE_FLOPPY 0x02 +#define PCIS_STORAGE_IPI 0x03 +#define PCIS_STORAGE_RAID 0x04 +#define PCIS_STORAGE_ATA_ADMA 0x05 +#define PCIS_STORAGE_SATA 0x06 +#define PCIP_STORAGE_SATA_AHCI_1_0 0x01 +#define PCIS_STORAGE_SAS 0x07 +#define PCIS_STORAGE_NVM 0x08 +#define PCIP_STORAGE_NVM_NVMHCI_1_0 0x01 +#define PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0 0x02 +#define PCIS_STORAGE_OTHER 0x80 + +#define PCIC_NETWORK 0x02 +#define PCIS_NETWORK_ETHERNET 0x00 +#define PCIS_NETWORK_TOKENRING 0x01 +#define PCIS_NETWORK_FDDI 0x02 +#define PCIS_NETWORK_ATM 0x03 +#define PCIS_NETWORK_ISDN 0x04 +#define PCIS_NETWORK_WORLDFIP 0x05 +#define PCIS_NETWORK_PICMG 0x06 +#define PCIS_NETWORK_OTHER 0x80 + +#define PCIC_DISPLAY 0x03 +#define PCIS_DISPLAY_VGA 0x00 +#define PCIS_DISPLAY_XGA 0x01 +#define PCIS_DISPLAY_3D 0x02 +#define PCIS_DISPLAY_OTHER 0x80 + +#define PCIC_MULTIMEDIA 0x04 +#define PCIS_MULTIMEDIA_VIDEO 0x00 +#define PCIS_MULTIMEDIA_AUDIO 0x01 +#define PCIS_MULTIMEDIA_TELE 0x02 +#define PCIS_MULTIMEDIA_HDA 0x03 +#define PCIS_MULTIMEDIA_OTHER 0x80 + +#define PCIC_MEMORY 0x05 +#define PCIS_MEMORY_RAM 0x00 +#define PCIS_MEMORY_FLASH 0x01 +#define PCIS_MEMORY_OTHER 0x80 + +#define PCIC_BRIDGE 0x06 +#define PCIS_BRIDGE_HOST 0x00 +#define PCIS_BRIDGE_ISA 0x01 +#define PCIS_BRIDGE_EISA 0x02 +#define PCIS_BRIDGE_MCA 0x03 +#define PCIS_BRIDGE_PCI 0x04 +#define PCIP_BRIDGE_PCI_SUBTRACTIVE 0x01 +#define PCIS_BRIDGE_PCMCIA 0x05 +#define PCIS_BRIDGE_NUBUS 0x06 +#define PCIS_BRIDGE_CARDBUS 0x07 +#define PCIS_BRIDGE_RACEWAY 0x08 +#define PCIS_BRIDGE_PCI_TRANSPARENT 0x09 +#define PCIS_BRIDGE_INFINIBAND 0x0a +#define PCIS_BRIDGE_OTHER 0x80 + +#define PCIC_SIMPLECOMM 0x07 +#define PCIS_SIMPLECOMM_UART 0x00 +#define PCIP_SIMPLECOMM_UART_8250 0x00 +#define PCIP_SIMPLECOMM_UART_16450A 0x01 +#define PCIP_SIMPLECOMM_UART_16550A 0x02 +#define PCIP_SIMPLECOMM_UART_16650A 0x03 +#define 
PCIP_SIMPLECOMM_UART_16750A 0x04 +#define PCIP_SIMPLECOMM_UART_16850A 0x05 +#define PCIP_SIMPLECOMM_UART_16950A 0x06 +#define PCIS_SIMPLECOMM_PAR 0x01 +#define PCIS_SIMPLECOMM_MULSER 0x02 +#define PCIS_SIMPLECOMM_MODEM 0x03 +#define PCIS_SIMPLECOMM_GPIB 0x04 +#define PCIS_SIMPLECOMM_SMART_CARD 0x05 +#define PCIS_SIMPLECOMM_OTHER 0x80 + +#define PCIC_BASEPERIPH 0x08 +#define PCIS_BASEPERIPH_PIC 0x00 +#define PCIP_BASEPERIPH_PIC_8259A 0x00 +#define PCIP_BASEPERIPH_PIC_ISA 0x01 +#define PCIP_BASEPERIPH_PIC_EISA 0x02 +#define PCIP_BASEPERIPH_PIC_IO_APIC 0x10 +#define PCIP_BASEPERIPH_PIC_IOX_APIC 0x20 +#define PCIS_BASEPERIPH_DMA 0x01 +#define PCIS_BASEPERIPH_TIMER 0x02 +#define PCIS_BASEPERIPH_RTC 0x03 +#define PCIS_BASEPERIPH_PCIHOT 0x04 +#define PCIS_BASEPERIPH_SDHC 0x05 +#define PCIS_BASEPERIPH_IOMMU 0x06 +#define PCIS_BASEPERIPH_OTHER 0x80 + +#define PCIC_INPUTDEV 0x09 +#define PCIS_INPUTDEV_KEYBOARD 0x00 +#define PCIS_INPUTDEV_DIGITIZER 0x01 +#define PCIS_INPUTDEV_MOUSE 0x02 +#define PCIS_INPUTDEV_SCANNER 0x03 +#define PCIS_INPUTDEV_GAMEPORT 0x04 +#define PCIS_INPUTDEV_OTHER 0x80 + +#define PCIC_DOCKING 0x0a +#define PCIS_DOCKING_GENERIC 0x00 +#define PCIS_DOCKING_OTHER 0x80 + +#define PCIC_PROCESSOR 0x0b +#define PCIS_PROCESSOR_386 0x00 +#define PCIS_PROCESSOR_486 0x01 +#define PCIS_PROCESSOR_PENTIUM 0x02 +#define PCIS_PROCESSOR_ALPHA 0x10 +#define PCIS_PROCESSOR_POWERPC 0x20 +#define PCIS_PROCESSOR_MIPS 0x30 +#define PCIS_PROCESSOR_COPROC 0x40 + +#define PCIC_SERIALBUS 0x0c +#define PCIS_SERIALBUS_FW 0x00 +#define PCIS_SERIALBUS_ACCESS 0x01 +#define PCIS_SERIALBUS_SSA 0x02 +#define PCIS_SERIALBUS_USB 0x03 +#define PCIP_SERIALBUS_USB_UHCI 0x00 +#define PCIP_SERIALBUS_USB_OHCI 0x10 +#define PCIP_SERIALBUS_USB_EHCI 0x20 +#define PCIP_SERIALBUS_USB_XHCI 0x30 +#define PCIP_SERIALBUS_USB_DEVICE 0xfe +#define PCIS_SERIALBUS_FC 0x04 +#define PCIS_SERIALBUS_SMBUS 0x05 +#define PCIS_SERIALBUS_INFINIBAND 0x06 +#define PCIS_SERIALBUS_IPMI 0x07 +#define PCIP_SERIALBUS_IPMI_SMIC 0x00 +#define PCIP_SERIALBUS_IPMI_KCS 0x01 +#define PCIP_SERIALBUS_IPMI_BT 0x02 +#define PCIS_SERIALBUS_SERCOS 0x08 +#define PCIS_SERIALBUS_CANBUS 0x09 + +#define PCIC_WIRELESS 0x0d +#define PCIS_WIRELESS_IRDA 0x00 +#define PCIS_WIRELESS_IR 0x01 +#define PCIS_WIRELESS_RF 0x10 +#define PCIS_WIRELESS_BLUETOOTH 0x11 +#define PCIS_WIRELESS_BROADBAND 0x12 +#define PCIS_WIRELESS_80211A 0x20 +#define PCIS_WIRELESS_80211B 0x21 +#define PCIS_WIRELESS_OTHER 0x80 + +#define PCIC_INTELLIIO 0x0e +#define PCIS_INTELLIIO_I2O 0x00 + +#define PCIC_SATCOM 0x0f +#define PCIS_SATCOM_TV 0x01 +#define PCIS_SATCOM_AUDIO 0x02 +#define PCIS_SATCOM_VOICE 0x03 +#define PCIS_SATCOM_DATA 0x04 + +#define PCIC_CRYPTO 0x10 +#define PCIS_CRYPTO_NETCOMP 0x00 +#define PCIS_CRYPTO_ENTERTAIN 0x10 +#define PCIS_CRYPTO_OTHER 0x80 + +#define PCIC_DASP 0x11 +#define PCIS_DASP_DPIO 0x00 +#define PCIS_DASP_PERFCNTRS 0x01 +#define PCIS_DASP_COMM_SYNC 0x10 +#define PCIS_DASP_MGMT_CARD 0x20 +#define PCIS_DASP_OTHER 0x80 + +#define PCIC_OTHER 0xff + +/* Bridge Control Values. 
 */
+#define PCIB_BCR_PERR_ENABLE 0x0001
+#define PCIB_BCR_SERR_ENABLE 0x0002
+#define PCIB_BCR_ISA_ENABLE 0x0004
+#define PCIB_BCR_VGA_ENABLE 0x0008
+#define PCIB_BCR_MASTER_ABORT_MODE 0x0020
+#define PCIB_BCR_SECBUS_RESET 0x0040
+#define PCIB_BCR_SECBUS_BACKTOBACK 0x0080
+#define PCIB_BCR_PRI_DISCARD_TIMEOUT 0x0100
+#define PCIB_BCR_SEC_DISCARD_TIMEOUT 0x0200
+#define PCIB_BCR_DISCARD_TIMER_STATUS 0x0400
+#define PCIB_BCR_DISCARD_TIMER_SERREN 0x0800
+
+/* PCI power management */
+#define PCIR_POWER_CAP 0x2
+#define PCIM_PCAP_SPEC 0x0007
+#define PCIM_PCAP_PMEREQCLK 0x0008
+#define PCIM_PCAP_DEVSPECINIT 0x0020
+#define PCIM_PCAP_AUXPWR_0 0x0000
+#define PCIM_PCAP_AUXPWR_55 0x0040
+#define PCIM_PCAP_AUXPWR_100 0x0080
+#define PCIM_PCAP_AUXPWR_160 0x00c0
+#define PCIM_PCAP_AUXPWR_220 0x0100
+#define PCIM_PCAP_AUXPWR_270 0x0140
+#define PCIM_PCAP_AUXPWR_320 0x0180
+#define PCIM_PCAP_AUXPWR_375 0x01c0
+#define PCIM_PCAP_AUXPWRMASK 0x01c0
+#define PCIM_PCAP_D1SUPP 0x0200
+#define PCIM_PCAP_D2SUPP 0x0400
+#define PCIM_PCAP_D0PME 0x0800
+#define PCIM_PCAP_D1PME 0x1000
+#define PCIM_PCAP_D2PME 0x2000
+#define PCIM_PCAP_D3PME_HOT 0x4000
+#define PCIM_PCAP_D3PME_COLD 0x8000
+
+#define PCIR_POWER_STATUS 0x4
+#define PCIM_PSTAT_D0 0x0000
+#define PCIM_PSTAT_D1 0x0001
+#define PCIM_PSTAT_D2 0x0002
+#define PCIM_PSTAT_D3 0x0003
+#define PCIM_PSTAT_DMASK 0x0003
+#define PCIM_PSTAT_NOSOFTRESET 0x0008
+#define PCIM_PSTAT_PMEENABLE 0x0100
+#define PCIM_PSTAT_D0POWER 0x0000
+#define PCIM_PSTAT_D1POWER 0x0200
+#define PCIM_PSTAT_D2POWER 0x0400
+#define PCIM_PSTAT_D3POWER 0x0600
+#define PCIM_PSTAT_D0HEAT 0x0800
+#define PCIM_PSTAT_D1HEAT 0x0a00
+#define PCIM_PSTAT_D2HEAT 0x0c00
+#define PCIM_PSTAT_D3HEAT 0x0e00
+#define PCIM_PSTAT_DATASELMASK 0x1e00
+#define PCIM_PSTAT_DATAUNKN 0x0000
+#define PCIM_PSTAT_DATADIV10 0x2000
+#define PCIM_PSTAT_DATADIV100 0x4000
+#define PCIM_PSTAT_DATADIV1000 0x6000
+#define PCIM_PSTAT_DATADIVMASK 0x6000
+#define PCIM_PSTAT_PME 0x8000
+
+#define PCIR_POWER_BSE 0x6
+#define PCIM_PMCSR_BSE_D3B3 0x00
+#define PCIM_PMCSR_BSE_D3B2 0x40
+#define PCIM_PMCSR_BSE_BPCCE 0x80
+
+#define PCIR_POWER_DATA 0x7
+
+/* VPD capability registers */
+#define PCIR_VPD_ADDR 0x2
+#define PCIR_VPD_DATA 0x4
+
+/* PCI Message Signalled Interrupts (MSI) */
+#define PCIR_MSI_CTRL 0x2
+#define PCIM_MSICTRL_VECTOR 0x0100
+#define PCIM_MSICTRL_64BIT 0x0080
+#define PCIM_MSICTRL_MME_MASK 0x0070
+#define PCIM_MSICTRL_MME_1 0x0000
+#define PCIM_MSICTRL_MME_2 0x0010
+#define PCIM_MSICTRL_MME_4 0x0020
+#define PCIM_MSICTRL_MME_8 0x0030
+#define PCIM_MSICTRL_MME_16 0x0040
+#define PCIM_MSICTRL_MME_32 0x0050
+#define PCIM_MSICTRL_MMC_MASK 0x000E
+#define PCIM_MSICTRL_MMC_1 0x0000
+#define PCIM_MSICTRL_MMC_2 0x0002
+#define PCIM_MSICTRL_MMC_4 0x0004
+#define PCIM_MSICTRL_MMC_8 0x0006
+#define PCIM_MSICTRL_MMC_16 0x0008
+#define PCIM_MSICTRL_MMC_32 0x000A
+#define PCIM_MSICTRL_MSI_ENABLE 0x0001
+#define PCIR_MSI_ADDR 0x4
+#define PCIR_MSI_ADDR_HIGH 0x8
+#define PCIR_MSI_DATA 0x8
+#define PCIR_MSI_DATA_64BIT 0xc
+#define PCIR_MSI_MASK 0x10
+#define PCIR_MSI_PENDING 0x14
+
+/* PCI-X definitions */
+
+/* For header type 0 devices */
+#define PCIXR_COMMAND 0x2
+#define PCIXM_COMMAND_DPERR_E 0x0001 /* Data Parity Error Recovery */
+#define PCIXM_COMMAND_ERO 0x0002 /* Enable Relaxed Ordering */
+#define PCIXM_COMMAND_MAX_READ 0x000c /* Maximum Burst Read Count */
+#define PCIXM_COMMAND_MAX_READ_512 0x0000
+#define PCIXM_COMMAND_MAX_READ_1024 0x0004
+#define PCIXM_COMMAND_MAX_READ_2048 0x0008
+#define
PCIXM_COMMAND_MAX_READ_4096 0x000c +#define PCIXM_COMMAND_MAX_SPLITS 0x0070 /* Maximum Split Transactions */ +#define PCIXM_COMMAND_MAX_SPLITS_1 0x0000 +#define PCIXM_COMMAND_MAX_SPLITS_2 0x0010 +#define PCIXM_COMMAND_MAX_SPLITS_3 0x0020 +#define PCIXM_COMMAND_MAX_SPLITS_4 0x0030 +#define PCIXM_COMMAND_MAX_SPLITS_8 0x0040 +#define PCIXM_COMMAND_MAX_SPLITS_12 0x0050 +#define PCIXM_COMMAND_MAX_SPLITS_16 0x0060 +#define PCIXM_COMMAND_MAX_SPLITS_32 0x0070 +#define PCIXM_COMMAND_VERSION 0x3000 +#define PCIXR_STATUS 0x4 +#define PCIXM_STATUS_DEVFN 0x000000FF +#define PCIXM_STATUS_BUS 0x0000FF00 +#define PCIXM_STATUS_64BIT 0x00010000 +#define PCIXM_STATUS_133CAP 0x00020000 +#define PCIXM_STATUS_SC_DISCARDED 0x00040000 +#define PCIXM_STATUS_UNEXP_SC 0x00080000 +#define PCIXM_STATUS_COMPLEX_DEV 0x00100000 +#define PCIXM_STATUS_MAX_READ 0x00600000 +#define PCIXM_STATUS_MAX_READ_512 0x00000000 +#define PCIXM_STATUS_MAX_READ_1024 0x00200000 +#define PCIXM_STATUS_MAX_READ_2048 0x00400000 +#define PCIXM_STATUS_MAX_READ_4096 0x00600000 +#define PCIXM_STATUS_MAX_SPLITS 0x03800000 +#define PCIXM_STATUS_MAX_SPLITS_1 0x00000000 +#define PCIXM_STATUS_MAX_SPLITS_2 0x00800000 +#define PCIXM_STATUS_MAX_SPLITS_3 0x01000000 +#define PCIXM_STATUS_MAX_SPLITS_4 0x01800000 +#define PCIXM_STATUS_MAX_SPLITS_8 0x02000000 +#define PCIXM_STATUS_MAX_SPLITS_12 0x02800000 +#define PCIXM_STATUS_MAX_SPLITS_16 0x03000000 +#define PCIXM_STATUS_MAX_SPLITS_32 0x03800000 +#define PCIXM_STATUS_MAX_CUM_READ 0x1C000000 +#define PCIXM_STATUS_RCVD_SC_ERR 0x20000000 +#define PCIXM_STATUS_266CAP 0x40000000 +#define PCIXM_STATUS_533CAP 0x80000000 + +/* For header type 1 devices (PCI-X bridges) */ +#define PCIXR_SEC_STATUS 0x2 +#define PCIXM_SEC_STATUS_64BIT 0x0001 +#define PCIXM_SEC_STATUS_133CAP 0x0002 +#define PCIXM_SEC_STATUS_SC_DISC 0x0004 +#define PCIXM_SEC_STATUS_UNEXP_SC 0x0008 +#define PCIXM_SEC_STATUS_SC_OVERRUN 0x0010 +#define PCIXM_SEC_STATUS_SR_DELAYED 0x0020 +#define PCIXM_SEC_STATUS_BUS_MODE 0x03c0 +#define PCIXM_SEC_STATUS_VERSION 0x3000 +#define PCIXM_SEC_STATUS_266CAP 0x4000 +#define PCIXM_SEC_STATUS_533CAP 0x8000 +#define PCIXR_BRIDGE_STATUS 0x4 +#define PCIXM_BRIDGE_STATUS_DEVFN 0x000000FF +#define PCIXM_BRIDGE_STATUS_BUS 0x0000FF00 +#define PCIXM_BRIDGE_STATUS_64BIT 0x00010000 +#define PCIXM_BRIDGE_STATUS_133CAP 0x00020000 +#define PCIXM_BRIDGE_STATUS_SC_DISCARDED 0x00040000 +#define PCIXM_BRIDGE_STATUS_UNEXP_SC 0x00080000 +#define PCIXM_BRIDGE_STATUS_SC_OVERRUN 0x00100000 +#define PCIXM_BRIDGE_STATUS_SR_DELAYED 0x00200000 +#define PCIXM_BRIDGE_STATUS_DEVID_MSGCAP 0x20000000 +#define PCIXM_BRIDGE_STATUS_266CAP 0x40000000 +#define PCIXM_BRIDGE_STATUS_533CAP 0x80000000 + +/* HT (HyperTransport) Capability definitions */ +#define PCIR_HT_COMMAND 0x2 +#define PCIM_HTCMD_CAP_MASK 0xf800 /* Capability type. 
*/ +#define PCIM_HTCAP_SLAVE 0x0000 /* 000xx */ +#define PCIM_HTCAP_HOST 0x2000 /* 001xx */ +#define PCIM_HTCAP_SWITCH 0x4000 /* 01000 */ +#define PCIM_HTCAP_INTERRUPT 0x8000 /* 10000 */ +#define PCIM_HTCAP_REVISION_ID 0x8800 /* 10001 */ +#define PCIM_HTCAP_UNITID_CLUMPING 0x9000 /* 10010 */ +#define PCIM_HTCAP_EXT_CONFIG_SPACE 0x9800 /* 10011 */ +#define PCIM_HTCAP_ADDRESS_MAPPING 0xa000 /* 10100 */ +#define PCIM_HTCAP_MSI_MAPPING 0xa800 /* 10101 */ +#define PCIM_HTCAP_DIRECT_ROUTE 0xb000 /* 10110 */ +#define PCIM_HTCAP_VCSET 0xb800 /* 10111 */ +#define PCIM_HTCAP_RETRY_MODE 0xc000 /* 11000 */ +#define PCIM_HTCAP_X86_ENCODING 0xc800 /* 11001 */ +#define PCIM_HTCAP_GEN3 0xd000 /* 11010 */ +#define PCIM_HTCAP_FLE 0xd800 /* 11011 */ +#define PCIM_HTCAP_PM 0xe000 /* 11100 */ +#define PCIM_HTCAP_HIGH_NODE_COUNT 0xe800 /* 11101 */ + +/* HT MSI Mapping Capability definitions. */ +#define PCIM_HTCMD_MSI_ENABLE 0x0001 +#define PCIM_HTCMD_MSI_FIXED 0x0002 +#define PCIR_HTMSI_ADDRESS_LO 0x4 +#define PCIR_HTMSI_ADDRESS_HI 0x8 + +/* PCI Vendor capability definitions */ +#define PCIR_VENDOR_LENGTH 0x2 +#define PCIR_VENDOR_DATA 0x3 + +/* PCI EHCI Debug Port definitions */ +#define PCIR_DEBUG_PORT 0x2 +#define PCIM_DEBUG_PORT_OFFSET 0x1FFF +#define PCIM_DEBUG_PORT_BAR 0xe000 + +/* PCI-PCI Bridge Subvendor definitions */ +#define PCIR_SUBVENDCAP_ID 0x4 + +/* PCI Express definitions */ +#define PCIER_FLAGS 0x2 +#define PCIEM_FLAGS_VERSION 0x000F +#define PCIEM_FLAGS_TYPE 0x00F0 +#define PCIEM_TYPE_ENDPOINT 0x0000 +#define PCIEM_TYPE_LEGACY_ENDPOINT 0x0010 +#define PCIEM_TYPE_ROOT_PORT 0x0040 +#define PCIEM_TYPE_UPSTREAM_PORT 0x0050 +#define PCIEM_TYPE_DOWNSTREAM_PORT 0x0060 +#define PCIEM_TYPE_PCI_BRIDGE 0x0070 +#define PCIEM_TYPE_PCIE_BRIDGE 0x0080 +#define PCIEM_TYPE_ROOT_INT_EP 0x0090 +#define PCIEM_TYPE_ROOT_EC 0x00a0 +#define PCIEM_FLAGS_SLOT 0x0100 +#define PCIEM_FLAGS_IRQ 0x3e00 +#define PCIER_DEVICE_CAP 0x4 +#define PCIEM_CAP_MAX_PAYLOAD 0x00000007 +#define PCIEM_CAP_PHANTHOM_FUNCS 0x00000018 +#define PCIEM_CAP_EXT_TAG_FIELD 0x00000020 +#define PCIEM_CAP_L0S_LATENCY 0x000001c0 +#define PCIEM_CAP_L1_LATENCY 0x00000e00 +#define PCIEM_CAP_ROLE_ERR_RPT 0x00008000 +#define PCIEM_CAP_SLOT_PWR_LIM_VAL 0x03fc0000 +#define PCIEM_CAP_SLOT_PWR_LIM_SCALE 0x0c000000 +#define PCIEM_CAP_FLR 0x10000000 +#define PCIER_DEVICE_CTL 0x8 +#define PCIEM_CTL_COR_ENABLE 0x0001 +#define PCIEM_CTL_NFER_ENABLE 0x0002 +#define PCIEM_CTL_FER_ENABLE 0x0004 +#define PCIEM_CTL_URR_ENABLE 0x0008 +#define PCIEM_CTL_RELAXED_ORD_ENABLE 0x0010 +#define PCIEM_CTL_MAX_PAYLOAD 0x00e0 +#define PCIEM_CTL_EXT_TAG_FIELD 0x0100 +#define PCIEM_CTL_PHANTHOM_FUNCS 0x0200 +#define PCIEM_CTL_AUX_POWER_PM 0x0400 +#define PCIEM_CTL_NOSNOOP_ENABLE 0x0800 +#define PCIEM_CTL_MAX_READ_REQUEST 0x7000 +#define PCIEM_CTL_BRDG_CFG_RETRY 0x8000 /* PCI-E - PCI/PCI-X bridges */ +#define PCIEM_CTL_INITIATE_FLR 0x8000 /* FLR capable endpoints */ +#define PCIER_DEVICE_STA 0xa +#define PCIEM_STA_CORRECTABLE_ERROR 0x0001 +#define PCIEM_STA_NON_FATAL_ERROR 0x0002 +#define PCIEM_STA_FATAL_ERROR 0x0004 +#define PCIEM_STA_UNSUPPORTED_REQ 0x0008 +#define PCIEM_STA_AUX_POWER 0x0010 +#define PCIEM_STA_TRANSACTION_PND 0x0020 +#define PCIER_LINK_CAP 0xc +#define PCIEM_LINK_CAP_MAX_SPEED 0x0000000f +#define PCIEM_LINK_CAP_MAX_WIDTH 0x000003f0 +#define PCIEM_LINK_CAP_ASPM 0x00000c00 +#define PCIEM_LINK_CAP_L0S_EXIT 0x00007000 +#define PCIEM_LINK_CAP_L1_EXIT 0x00038000 +#define PCIEM_LINK_CAP_CLOCK_PM 0x00040000 +#define PCIEM_LINK_CAP_SURPRISE_DOWN 0x00080000 +#define 
PCIEM_LINK_CAP_DL_ACTIVE 0x00100000 +#define PCIEM_LINK_CAP_LINK_BW_NOTIFY 0x00200000 +#define PCIEM_LINK_CAP_ASPM_COMPLIANCE 0x00400000 +#define PCIEM_LINK_CAP_PORT 0xff000000 +#define PCIER_LINK_CTL 0x10 +#define PCIEM_LINK_CTL_ASPMC_DIS 0x0000 +#define PCIEM_LINK_CTL_ASPMC_L0S 0x0001 +#define PCIEM_LINK_CTL_ASPMC_L1 0x0002 +#define PCIEM_LINK_CTL_ASPMC 0x0003 +#define PCIEM_LINK_CTL_RCB 0x0008 +#define PCIEM_LINK_CTL_LINK_DIS 0x0010 +#define PCIEM_LINK_CTL_RETRAIN_LINK 0x0020 +#define PCIEM_LINK_CTL_COMMON_CLOCK 0x0040 +#define PCIEM_LINK_CTL_EXTENDED_SYNC 0x0080 +#define PCIEM_LINK_CTL_ECPM 0x0100 +#define PCIEM_LINK_CTL_HAWD 0x0200 +#define PCIEM_LINK_CTL_LBMIE 0x0400 +#define PCIEM_LINK_CTL_LABIE 0x0800 +#define PCIER_LINK_STA 0x12 +#define PCIEM_LINK_STA_SPEED 0x000f +#define PCIEM_LINK_STA_WIDTH 0x03f0 +#define PCIEM_LINK_STA_TRAINING_ERROR 0x0400 +#define PCIEM_LINK_STA_TRAINING 0x0800 +#define PCIEM_LINK_STA_SLOT_CLOCK 0x1000 +#define PCIEM_LINK_STA_DL_ACTIVE 0x2000 +#define PCIEM_LINK_STA_LINK_BW_MGMT 0x4000 +#define PCIEM_LINK_STA_LINK_AUTO_BW 0x8000 +#define PCIER_SLOT_CAP 0x14 +#define PCIEM_SLOT_CAP_APB 0x00000001 +#define PCIEM_SLOT_CAP_PCP 0x00000002 +#define PCIEM_SLOT_CAP_MRLSP 0x00000004 +#define PCIEM_SLOT_CAP_AIP 0x00000008 +#define PCIEM_SLOT_CAP_PIP 0x00000010 +#define PCIEM_SLOT_CAP_HPS 0x00000020 +#define PCIEM_SLOT_CAP_HPC 0x00000040 +#define PCIEM_SLOT_CAP_SPLV 0x00007f80 +#define PCIEM_SLOT_CAP_SPLS 0x00018000 +#define PCIEM_SLOT_CAP_EIP 0x00020000 +#define PCIEM_SLOT_CAP_NCCS 0x00040000 +#define PCIEM_SLOT_CAP_PSN 0xfff80000 +#define PCIER_SLOT_CTL 0x18 +#define PCIEM_SLOT_CTL_ABPE 0x0001 +#define PCIEM_SLOT_CTL_PFDE 0x0002 +#define PCIEM_SLOT_CTL_MRLSCE 0x0004 +#define PCIEM_SLOT_CTL_PDCE 0x0008 +#define PCIEM_SLOT_CTL_CCIE 0x0010 +#define PCIEM_SLOT_CTL_HPIE 0x0020 +#define PCIEM_SLOT_CTL_AIC 0x00c0 +#define PCIEM_SLOT_CTL_PIC 0x0300 +#define PCIEM_SLOT_CTL_PCC 0x0400 +#define PCIEM_SLOT_CTL_EIC 0x0800 +#define PCIEM_SLOT_CTL_DLLSCE 0x1000 +#define PCIER_SLOT_STA 0x1a +#define PCIEM_SLOT_STA_ABP 0x0001 +#define PCIEM_SLOT_STA_PFD 0x0002 +#define PCIEM_SLOT_STA_MRLSC 0x0004 +#define PCIEM_SLOT_STA_PDC 0x0008 +#define PCIEM_SLOT_STA_CC 0x0010 +#define PCIEM_SLOT_STA_MRLSS 0x0020 +#define PCIEM_SLOT_STA_PDS 0x0040 +#define PCIEM_SLOT_STA_EIS 0x0080 +#define PCIEM_SLOT_STA_DLLSC 0x0100 +#define PCIER_ROOT_CTL 0x1c +#define PCIEM_ROOT_CTL_SERR_CORR 0x0001 +#define PCIEM_ROOT_CTL_SERR_NONFATAL 0x0002 +#define PCIEM_ROOT_CTL_SERR_FATAL 0x0004 +#define PCIEM_ROOT_CTL_PME 0x0008 +#define PCIEM_ROOT_CTL_CRS_VIS 0x0010 +#define PCIER_ROOT_CAP 0x1e +#define PCIEM_ROOT_CAP_CRS_VIS 0x0001 +#define PCIER_ROOT_STA 0x20 +#define PCIEM_ROOT_STA_PME_REQID_MASK 0x0000ffff +#define PCIEM_ROOT_STA_PME_STATUS 0x00010000 +#define PCIEM_ROOT_STA_PME_PEND 0x00020000 +#define PCIER_DEVICE_CAP2 0x24 +#define PCIEM_CAP2_ARI 0x20 +#define PCIER_DEVICE_CTL2 0x28 +#define PCIEM_CTL2_COMP_TIMEOUT_VAL 0x000f +#define PCIEM_CTL2_COMP_TIMEOUT_DIS 0x0010 +#define PCIEM_CTL2_ARI 0x0020 +#define PCIEM_CTL2_ATOMIC_REQ_ENABLE 0x0040 +#define PCIEM_CTL2_ATOMIC_EGR_BLOCK 0x0080 +#define PCIEM_CTL2_ID_ORDERED_REQ_EN 0x0100 +#define PCIEM_CTL2_ID_ORDERED_CMP_EN 0x0200 +#define PCIEM_CTL2_LTR_ENABLE 0x0400 +#define PCIEM_CTL2_OBFF 0x6000 +#define PCIEM_OBFF_DISABLE 0x0000 +#define PCIEM_OBFF_MSGA_ENABLE 0x2000 +#define PCIEM_OBFF_MSGB_ENABLE 0x4000 +#define PCIEM_OBFF_WAKE_ENABLE 0x6000 +#define PCIEM_CTL2_END2END_TLP 0x8000 +#define PCIER_DEVICE_STA2 0x2a +#define PCIER_LINK_CAP2 0x2c +#define 
PCIER_LINK_CTL2 0x30 +#define PCIER_LINK_STA2 0x32 +#define PCIER_SLOT_CAP2 0x34 +#define PCIER_SLOT_CTL2 0x38 +#define PCIER_SLOT_STA2 0x3a + +/* MSI-X definitions */ +#define PCIR_MSIX_CTRL 0x2 +#define PCIM_MSIXCTRL_MSIX_ENABLE 0x8000 +#define PCIM_MSIXCTRL_FUNCTION_MASK 0x4000 +#define PCIM_MSIXCTRL_TABLE_SIZE 0x07FF +#define PCIR_MSIX_TABLE 0x4 +#define PCIR_MSIX_PBA 0x8 +#define PCIM_MSIX_BIR_MASK 0x7 +#define PCIM_MSIX_BIR_BAR_10 0 +#define PCIM_MSIX_BIR_BAR_14 1 +#define PCIM_MSIX_BIR_BAR_18 2 +#define PCIM_MSIX_BIR_BAR_1C 3 +#define PCIM_MSIX_BIR_BAR_20 4 +#define PCIM_MSIX_BIR_BAR_24 5 +#define PCIM_MSIX_VCTRL_MASK 0x1 + +/* PCI Advanced Features definitions */ +#define PCIR_PCIAF_CAP 0x3 +#define PCIM_PCIAFCAP_TP 0x01 +#define PCIM_PCIAFCAP_FLR 0x02 +#define PCIR_PCIAF_CTRL 0x4 +#define PCIR_PCIAFCTRL_FLR 0x01 +#define PCIR_PCIAF_STATUS 0x5 +#define PCIR_PCIAFSTATUS_TP 0x01 + +/* Advanced Error Reporting */ +#define PCIR_AER_UC_STATUS 0x04 +#define PCIM_AER_UC_TRAINING_ERROR 0x00000001 +#define PCIM_AER_UC_DL_PROTOCOL_ERROR 0x00000010 +#define PCIM_AER_UC_SURPRISE_LINK_DOWN 0x00000020 +#define PCIM_AER_UC_POISONED_TLP 0x00001000 +#define PCIM_AER_UC_FC_PROTOCOL_ERROR 0x00002000 +#define PCIM_AER_UC_COMPLETION_TIMEOUT 0x00004000 +#define PCIM_AER_UC_COMPLETER_ABORT 0x00008000 +#define PCIM_AER_UC_UNEXPECTED_COMPLETION 0x00010000 +#define PCIM_AER_UC_RECEIVER_OVERFLOW 0x00020000 +#define PCIM_AER_UC_MALFORMED_TLP 0x00040000 +#define PCIM_AER_UC_ECRC_ERROR 0x00080000 +#define PCIM_AER_UC_UNSUPPORTED_REQUEST 0x00100000 +#define PCIM_AER_UC_ACS_VIOLATION 0x00200000 +#define PCIM_AER_UC_INTERNAL_ERROR 0x00400000 +#define PCIM_AER_UC_MC_BLOCKED_TLP 0x00800000 +#define PCIM_AER_UC_ATOMIC_EGRESS_BLK 0x01000000 +#define PCIM_AER_UC_TLP_PREFIX_BLOCKED 0x02000000 +#define PCIR_AER_UC_MASK 0x08 /* Shares bits with UC_STATUS */ +#define PCIR_AER_UC_SEVERITY 0x0c /* Shares bits with UC_STATUS */ +#define PCIR_AER_COR_STATUS 0x10 +#define PCIM_AER_COR_RECEIVER_ERROR 0x00000001 +#define PCIM_AER_COR_BAD_TLP 0x00000040 +#define PCIM_AER_COR_BAD_DLLP 0x00000080 +#define PCIM_AER_COR_REPLAY_ROLLOVER 0x00000100 +#define PCIM_AER_COR_REPLAY_TIMEOUT 0x00001000 +#define PCIM_AER_COR_ADVISORY_NF_ERROR 0x00002000 +#define PCIM_AER_COR_INTERNAL_ERROR 0x00004000 +#define PCIM_AER_COR_HEADER_LOG_OVFLOW 0x00008000 +#define PCIR_AER_COR_MASK 0x14 /* Shares bits with COR_STATUS */ +#define PCIR_AER_CAP_CONTROL 0x18 +#define PCIM_AER_FIRST_ERROR_PTR 0x0000001f +#define PCIM_AER_ECRC_GEN_CAPABLE 0x00000020 +#define PCIM_AER_ECRC_GEN_ENABLE 0x00000040 +#define PCIM_AER_ECRC_CHECK_CAPABLE 0x00000080 +#define PCIM_AER_ECRC_CHECK_ENABLE 0x00000100 +#define PCIM_AER_MULT_HDR_CAPABLE 0x00000200 +#define PCIM_AER_MULT_HDR_ENABLE 0x00000400 +#define PCIM_AER_TLP_PREFIX_LOG_PRESENT 0x00000800 +#define PCIR_AER_HEADER_LOG 0x1c +#define PCIR_AER_ROOTERR_CMD 0x2c /* Only for root complex ports */ +#define PCIM_AER_ROOTERR_COR_ENABLE 0x00000001 +#define PCIM_AER_ROOTERR_NF_ENABLE 0x00000002 +#define PCIM_AER_ROOTERR_F_ENABLE 0x00000004 +#define PCIR_AER_ROOTERR_STATUS 0x30 /* Only for root complex ports */ +#define PCIM_AER_ROOTERR_COR_ERR 0x00000001 +#define PCIM_AER_ROOTERR_MULTI_COR_ERR 0x00000002 +#define PCIM_AER_ROOTERR_UC_ERR 0x00000004 +#define PCIM_AER_ROOTERR_MULTI_UC_ERR 0x00000008 +#define PCIM_AER_ROOTERR_FIRST_UC_FATAL 0x00000010 +#define PCIM_AER_ROOTERR_NF_ERR 0x00000020 +#define PCIM_AER_ROOTERR_F_ERR 0x00000040 +#define PCIM_AER_ROOTERR_INT_MESSAGE 0xf8000000 +#define PCIR_AER_COR_SOURCE_ID 0x34 /* Only for 
root complex ports */ +#define PCIR_AER_ERR_SOURCE_ID 0x36 /* Only for root complex ports */ +#define PCIR_AER_TLP_PREFIX_LOG 0x38 /* Only for TLP prefix functions */ + +/* Virtual Channel definitions */ +#define PCIR_VC_CAP1 0x04 +#define PCIM_VC_CAP1_EXT_COUNT 0x00000007 +#define PCIM_VC_CAP1_LOWPRI_EXT_COUNT 0x00000070 +#define PCIR_VC_CAP2 0x08 +#define PCIR_VC_CONTROL 0x0C +#define PCIR_VC_STATUS 0x0E +#define PCIR_VC_RESOURCE_CAP(n) (0x10 + (n) * 0x0C) +#define PCIR_VC_RESOURCE_CTL(n) (0x14 + (n) * 0x0C) +#define PCIR_VC_RESOURCE_STA(n) (0x18 + (n) * 0x0C) + +/* Serial Number definitions */ +#define PCIR_SERIAL_LOW 0x04 +#define PCIR_SERIAL_HIGH 0x08 + diff --git a/usr/contrib/freebsd/isa/isareg.h b/usr/contrib/freebsd/isa/isareg.h new file mode 100644 index 0000000000..e83e34674f --- /dev/null +++ b/usr/contrib/freebsd/isa/isareg.h @@ -0,0 +1,70 @@ +/*- + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ * from: @(#)isa.h 5.7 (Berkeley) 5/9/91
+ * $FreeBSD: head/sys/isa/isareg.h 263379 2014-03-19 21:03:04Z imp $
+ */
+
+#ifdef PC98
+#error isareg.h is included from PC-9801 source
+#endif
+
+#ifndef _ISA_ISA_H_
+#define _ISA_ISA_H_
+
+/*
+ * ISA Bus conventions
+ */
+
+/*
+ * Input / Output Port Assignments
+ */
+#ifndef IO_ISABEGIN
+#define IO_ISABEGIN 0x000 /* 0x000 - Beginning of I/O Registers */
+#define IO_ICU1 0x020 /* 8259A Interrupt Controller #1 */
+#define IO_KBD 0x060 /* 8042 Keyboard */
+#define IO_RTC 0x070 /* RTC */
+#define IO_ICU2 0x0A0 /* 8259A Interrupt Controller #2 */
+
+#define IO_MDA 0x3B0 /* Monochrome Adapter */
+#define IO_VGA 0x3C0 /* E/VGA Ports */
+#define IO_CGA 0x3D0 /* CGA Ports */
+
+#endif /* !IO_ISABEGIN */
+
+/*
+ * Input / Output Port Sizes
+ */
+#define IO_CGASIZE 12 /* CGA controllers */
+#define IO_MDASIZE 12 /* Monochrome display controllers */
+#define IO_VGASIZE 16 /* VGA controllers */
+
+#endif /* !_ISA_ISA_H_ */
diff --git a/usr/contrib/freebsd/lib/libutil/expand_number.c b/usr/contrib/freebsd/lib/libutil/expand_number.c
new file mode 100644
index 0000000000..f3b4da89f9
--- /dev/null
+++ b/usr/contrib/freebsd/lib/libutil/expand_number.c
@@ -0,0 +1,93 @@
+/*-
+ * Copyright (c) 2007 Eric Anderson
+ * Copyright (c) 2007 Pawel Jakub Dawidek
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+__FBSDID("$FreeBSD: head/lib/libutil/expand_number.c 255069 2013-08-30 11:21:52Z pluknet $");
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+int
+expand_number(const char *buf, uint64_t *num)
+{
+ char *endptr;
+ uintmax_t umaxval;
+ uint64_t number;
+ unsigned shift;
+ int serrno;
+
+ serrno = errno;
+ errno = 0;
+ umaxval = strtoumax(buf, &endptr, 0);
+ if (umaxval > UINT64_MAX)
+ errno = ERANGE;
+ if (errno != 0)
+ return (-1);
+ errno = serrno;
+ number = umaxval;
+
+ switch (tolower((unsigned char)*endptr)) {
+ case 'e':
+ shift = 60;
+ break;
+ case 'p':
+ shift = 50;
+ break;
+ case 't':
+ shift = 40;
+ break;
+ case 'g':
+ shift = 30;
+ break;
+ case 'm':
+ shift = 20;
+ break;
+ case 'k':
+ shift = 10;
+ break;
+ case 'b':
+ case '\0': /* No unit. */
+ *num = number;
+ return (0);
+ default:
+ /* Unrecognized unit. */
+ errno = EINVAL;
+ return (-1);
+ }
+
+ if ((number << shift) >> shift != number) {
+ /* Overflow */
+ errno = ERANGE;
+ return (-1);
+ }
+ *num = number << shift;
+ return (0);
+}
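
Editor's note: the suffix handling in expand_number() above amounts to a power-of-two shift per unit letter (k=10, m=20, g=30, and so on). A minimal caller sketch, not part of the patch, with the expected values worked out from that shift table (the forward declaration stands in for the libutil header this file normally pairs with):

/*
 * Editor's sketch (not part of the imported file): exercising
 * expand_number() with a few of the suffixes handled above.
 */
#include <stdint.h>
#include <stdio.h>

int expand_number(const char *buf, uint64_t *num);

int
main(void)
{
	uint64_t n;

	if (expand_number("64k", &n) == 0)
		printf("%llu\n", (unsigned long long)n);	/* 65536 */
	if (expand_number("2m", &n) == 0)
		printf("%llu\n", (unsigned long long)n);	/* 2097152 */
	if (expand_number("1g", &n) == 0)
		printf("%llu\n", (unsigned long long)n);	/* 1073741824 */
	return (0);
}
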
diff --git a/usr/contrib/freebsd/sys/ata.h b/usr/contrib/freebsd/sys/ata.h
new file mode 100644
index 0000000000..705460355f
--- /dev/null
+++ b/usr/contrib/freebsd/sys/ata.h
@@ -0,0 +1,635 @@
+/*-
+ * Copyright (c) 2000 - 2008 Søren Schmidt
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer,
+ * without modification, immediately at the beginning of the file.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/sys/ata.h 264853 2014-04-24 01:28:14Z smh $
+ */
+
+#ifndef _SYS_ATA_H_
+#define _SYS_ATA_H_
+
+#include
+
+/* ATA/ATAPI device parameters */
+struct ata_params {
+/*000*/ u_int16_t config; /* configuration info */
+#define ATA_PROTO_MASK 0x8003
+#define ATA_PROTO_ATAPI 0x8000
+#define ATA_PROTO_ATAPI_12 0x8000
+#define ATA_PROTO_ATAPI_16 0x8001
+#define ATA_PROTO_CFA 0x848a
+#define ATA_ATAPI_TYPE_MASK 0x1f00
+#define ATA_ATAPI_TYPE_DIRECT 0x0000 /* disk/floppy */
+#define ATA_ATAPI_TYPE_TAPE 0x0100 /* streaming tape */
+#define ATA_ATAPI_TYPE_CDROM 0x0500 /* CD-ROM device */
+#define ATA_ATAPI_TYPE_OPTICAL 0x0700 /* optical disk */
+#define ATA_DRQ_MASK 0x0060
+#define ATA_DRQ_SLOW 0x0000 /* cpu 3 ms delay */
+#define ATA_DRQ_INTR 0x0020 /* interrupt 10 ms delay */
+#define ATA_DRQ_FAST 0x0040 /* accel 50 us delay */
+#define ATA_RESP_INCOMPLETE 0x0004
+
+/*001*/ u_int16_t cylinders; /* # of cylinders */
+/*002*/ u_int16_t specconf; /* specific configuration */
+/*003*/ u_int16_t heads; /* # heads */
+ u_int16_t obsolete4;
+ u_int16_t obsolete5;
+/*006*/ u_int16_t sectors; /* # sectors/track */
+/*007*/ u_int16_t vendor7[3];
+/*010*/ u_int8_t serial[20]; /* serial number */
+/*020*/ u_int16_t retired20;
+ u_int16_t retired21;
+ u_int16_t obsolete22;
+/*023*/ u_int8_t revision[8]; /* firmware revision */
+/*027*/ u_int8_t model[40]; /* model name */
+/*047*/ u_int16_t sectors_intr; /* sectors per interrupt */
+/*048*/ u_int16_t usedmovsd; /* double word read/write?
*/ +/*049*/ u_int16_t capabilities1; +#define ATA_SUPPORT_DMA 0x0100 +#define ATA_SUPPORT_LBA 0x0200 +#define ATA_SUPPORT_IORDY 0x0400 +#define ATA_SUPPORT_IORDYDIS 0x0800 +#define ATA_SUPPORT_OVERLAP 0x4000 + +/*050*/ u_int16_t capabilities2; +/*051*/ u_int16_t retired_piomode; /* PIO modes 0-2 */ +#define ATA_RETIRED_PIO_MASK 0x0300 + +/*052*/ u_int16_t retired_dmamode; /* DMA modes */ +#define ATA_RETIRED_DMA_MASK 0x0003 + +/*053*/ u_int16_t atavalid; /* fields valid */ +#define ATA_FLAG_54_58 0x0001 /* words 54-58 valid */ +#define ATA_FLAG_64_70 0x0002 /* words 64-70 valid */ +#define ATA_FLAG_88 0x0004 /* word 88 valid */ + +/*054*/ u_int16_t current_cylinders; +/*055*/ u_int16_t current_heads; +/*056*/ u_int16_t current_sectors; +/*057*/ u_int16_t current_size_1; +/*058*/ u_int16_t current_size_2; +/*059*/ u_int16_t multi; +#define ATA_MULTI_VALID 0x0100 + +/*060*/ u_int16_t lba_size_1; + u_int16_t lba_size_2; + u_int16_t obsolete62; +/*063*/ u_int16_t mwdmamodes; /* multiword DMA modes */ +/*064*/ u_int16_t apiomodes; /* advanced PIO modes */ + +/*065*/ u_int16_t mwdmamin; /* min. M/W DMA time/word ns */ +/*066*/ u_int16_t mwdmarec; /* rec. M/W DMA time ns */ +/*067*/ u_int16_t pioblind; /* min. PIO cycle w/o flow */ +/*068*/ u_int16_t pioiordy; /* min. PIO cycle IORDY flow */ +/*069*/ u_int16_t support3; +#define ATA_SUPPORT_RZAT 0x0020 +#define ATA_SUPPORT_DRAT 0x4000 + u_int16_t reserved70; +/*071*/ u_int16_t rlsovlap; /* rel time (us) for overlap */ +/*072*/ u_int16_t rlsservice; /* rel time (us) for service */ + u_int16_t reserved73; + u_int16_t reserved74; +/*075*/ u_int16_t queue; +#define ATA_QUEUE_LEN(x) ((x) & 0x001f) + +/*76*/ u_int16_t satacapabilities; +#define ATA_SATA_GEN1 0x0002 +#define ATA_SATA_GEN2 0x0004 +#define ATA_SATA_GEN3 0x0008 +#define ATA_SUPPORT_NCQ 0x0100 +#define ATA_SUPPORT_IFPWRMNGTRCV 0x0200 +#define ATA_SUPPORT_PHYEVENTCNT 0x0400 +#define ATA_SUPPORT_NCQ_UNLOAD 0x0800 +#define ATA_SUPPORT_NCQ_PRIO 0x1000 +#define ATA_SUPPORT_HAPST 0x2000 +#define ATA_SUPPORT_DAPST 0x4000 +#define ATA_SUPPORT_READLOGDMAEXT 0x8000 + +/*77*/ u_int16_t satacapabilities2; +#define ATA_SATA_CURR_GEN_MASK 0x0006 +#define ATA_SUPPORT_NCQ_STREAM 0x0010 +#define ATA_SUPPORT_NCQ_QMANAGEMENT 0x0020 +#define ATA_SUPPORT_RCVSND_FPDMA_QUEUED 0x0040 +/*78*/ u_int16_t satasupport; +#define ATA_SUPPORT_NONZERO 0x0002 +#define ATA_SUPPORT_AUTOACTIVATE 0x0004 +#define ATA_SUPPORT_IFPWRMNGT 0x0008 +#define ATA_SUPPORT_INORDERDATA 0x0010 +#define ATA_SUPPORT_ASYNCNOTIF 0x0020 +#define ATA_SUPPORT_SOFTSETPRESERVE 0x0040 +/*79*/ u_int16_t sataenabled; +#define ATA_ENABLED_DAPST 0x0080 + +/*080*/ u_int16_t version_major; +/*081*/ u_int16_t version_minor; + + struct { +/*082/085*/ u_int16_t command1; +#define ATA_SUPPORT_SMART 0x0001 +#define ATA_SUPPORT_SECURITY 0x0002 +#define ATA_SUPPORT_REMOVABLE 0x0004 +#define ATA_SUPPORT_POWERMGT 0x0008 +#define ATA_SUPPORT_PACKET 0x0010 +#define ATA_SUPPORT_WRITECACHE 0x0020 +#define ATA_SUPPORT_LOOKAHEAD 0x0040 +#define ATA_SUPPORT_RELEASEIRQ 0x0080 +#define ATA_SUPPORT_SERVICEIRQ 0x0100 +#define ATA_SUPPORT_RESET 0x0200 +#define ATA_SUPPORT_PROTECTED 0x0400 +#define ATA_SUPPORT_WRITEBUFFER 0x1000 +#define ATA_SUPPORT_READBUFFER 0x2000 +#define ATA_SUPPORT_NOP 0x4000 + +/*083/086*/ u_int16_t command2; +#define ATA_SUPPORT_MICROCODE 0x0001 +#define ATA_SUPPORT_QUEUED 0x0002 +#define ATA_SUPPORT_CFA 0x0004 +#define ATA_SUPPORT_APM 0x0008 +#define ATA_SUPPORT_NOTIFY 0x0010 +#define ATA_SUPPORT_STANDBY 0x0020 +#define ATA_SUPPORT_SPINUP 0x0040 
+#define ATA_SUPPORT_MAXSECURITY 0x0100 +#define ATA_SUPPORT_AUTOACOUSTIC 0x0200 +#define ATA_SUPPORT_ADDRESS48 0x0400 +#define ATA_SUPPORT_OVERLAY 0x0800 +#define ATA_SUPPORT_FLUSHCACHE 0x1000 +#define ATA_SUPPORT_FLUSHCACHE48 0x2000 + +/*084/087*/ u_int16_t extension; +#define ATA_SUPPORT_SMARTLOG 0x0001 +#define ATA_SUPPORT_SMARTTEST 0x0002 +#define ATA_SUPPORT_MEDIASN 0x0004 +#define ATA_SUPPORT_MEDIAPASS 0x0008 +#define ATA_SUPPORT_STREAMING 0x0010 +#define ATA_SUPPORT_GENLOG 0x0020 +#define ATA_SUPPORT_WRITEDMAFUAEXT 0x0040 +#define ATA_SUPPORT_WRITEDMAQFUAEXT 0x0080 +#define ATA_SUPPORT_64BITWWN 0x0100 +#define ATA_SUPPORT_UNLOAD 0x2000 + } __packed support, enabled; + +/*088*/ u_int16_t udmamodes; /* UltraDMA modes */ +/*089*/ u_int16_t erase_time; /* time req'd in 2min units */ +/*090*/ u_int16_t enhanced_erase_time; /* time req'd in 2min units */ +/*091*/ u_int16_t apm_value; +/*092*/ u_int16_t master_passwd_revision; /* password revision code */ +/*093*/ u_int16_t hwres; +#define ATA_CABLE_ID 0x2000 + +/*094*/ u_int16_t acoustic; +#define ATA_ACOUSTIC_CURRENT(x) ((x) & 0x00ff) +#define ATA_ACOUSTIC_VENDOR(x) (((x) & 0xff00) >> 8) + +/*095*/ u_int16_t stream_min_req_size; +/*096*/ u_int16_t stream_transfer_time; +/*097*/ u_int16_t stream_access_latency; +/*098*/ u_int32_t stream_granularity; +/*100*/ u_int16_t lba_size48_1; + u_int16_t lba_size48_2; + u_int16_t lba_size48_3; + u_int16_t lba_size48_4; + u_int16_t reserved104; +/*105*/ u_int16_t max_dsm_blocks; +/*106*/ u_int16_t pss; +#define ATA_PSS_LSPPS 0x000F +#define ATA_PSS_LSSABOVE512 0x1000 +#define ATA_PSS_MULTLS 0x2000 +#define ATA_PSS_VALID_MASK 0xC000 +#define ATA_PSS_VALID_VALUE 0x4000 +/*107*/ u_int16_t isd; +/*108*/ u_int16_t wwn[4]; + u_int16_t reserved112[5]; +/*117*/ u_int16_t lss_1; +/*118*/ u_int16_t lss_2; +/*119*/ u_int16_t support2; +#define ATA_SUPPORT_WRITEREADVERIFY 0x0002 +#define ATA_SUPPORT_WRITEUNCORREXT 0x0004 +#define ATA_SUPPORT_RWLOGDMAEXT 0x0008 +#define ATA_SUPPORT_MICROCODE3 0x0010 +#define ATA_SUPPORT_FREEFALL 0x0020 +/*120*/ u_int16_t enabled2; + u_int16_t reserved121[6]; +/*127*/ u_int16_t removable_status; +/*128*/ u_int16_t security_status; +#define ATA_SECURITY_LEVEL 0x0100 /* 0: high, 1: maximum */ +#define ATA_SECURITY_ENH_SUPP 0x0020 /* enhanced erase supported */ +#define ATA_SECURITY_COUNT_EXP 0x0010 /* count expired */ +#define ATA_SECURITY_FROZEN 0x0008 /* security config is frozen */ +#define ATA_SECURITY_LOCKED 0x0004 /* drive is locked */ +#define ATA_SECURITY_ENABLED 0x0002 /* ATA Security is enabled */ +#define ATA_SECURITY_SUPPORTED 0x0001 /* ATA Security is supported */ + + u_int16_t reserved129[31]; +/*160*/ u_int16_t cfa_powermode1; + u_int16_t reserved161; +/*162*/ u_int16_t cfa_kms_support; +/*163*/ u_int16_t cfa_trueide_modes; +/*164*/ u_int16_t cfa_memory_modes; + u_int16_t reserved165[4]; +/*169*/ u_int16_t support_dsm; +#define ATA_SUPPORT_DSM_TRIM 0x0001 + u_int16_t reserved170[6]; +/*176*/ u_int8_t media_serial[60]; +/*206*/ u_int16_t sct; + u_int16_t reserved206[2]; +/*209*/ u_int16_t lsalign; +/*210*/ u_int16_t wrv_sectors_m3_1; + u_int16_t wrv_sectors_m3_2; +/*212*/ u_int16_t wrv_sectors_m2_1; + u_int16_t wrv_sectors_m2_2; +/*214*/ u_int16_t nv_cache_caps; +/*215*/ u_int16_t nv_cache_size_1; + u_int16_t nv_cache_size_2; +/*217*/ u_int16_t media_rotation_rate; +#define ATA_RATE_NOT_REPORTED 0x0000 +#define ATA_RATE_NON_ROTATING 0x0001 + u_int16_t reserved218; +/*219*/ u_int16_t nv_cache_opt; +/*220*/ u_int16_t wrv_mode; + u_int16_t reserved221; +/*222*/ u_int16_t 
transport_major;
+/*223*/ u_int16_t transport_minor;
+ u_int16_t reserved224[31];
+/*255*/ u_int16_t integrity;
+} __packed;
+
+/* ATA Dataset Management */
+#define ATA_DSM_BLK_SIZE 512
+#define ATA_DSM_BLK_RANGES 64
+#define ATA_DSM_RANGE_SIZE 8
+#define ATA_DSM_RANGE_MAX 65535
+
+/*
+ * ATA Device Register
+ *
+ * bit 7 Obsolete (was 1 in early ATA specs)
+ * bit 6 Sets LBA/CHS mode. 1=LBA, 0=CHS
+ * bit 5 Obsolete (was 1 in early ATA specs)
+ * bit 4 1 = Slave Drive, 0 = Master Drive
+ * bits 3-0 In LBA mode, bits 27-24 of the address. In CHS mode, the head number
+ */
+
+#define ATA_DEV_MASTER 0x00
+#define ATA_DEV_SLAVE 0x10
+#define ATA_DEV_LBA 0x40
+
+/* ATA limits */
+#define ATA_MAX_28BIT_LBA 268435455UL
+
+/* ATA Status Register */
+#define ATA_STATUS_ERROR 0x01
+#define ATA_STATUS_DEVICE_FAULT 0x20
+
+/* ATA Error Register */
+#define ATA_ERROR_ABORT 0x04
+#define ATA_ERROR_ID_NOT_FOUND 0x10
+
+/* ATA HPA Features */
+#define ATA_HPA_FEAT_MAX_ADDR 0x00
+#define ATA_HPA_FEAT_SET_PWD 0x01
+#define ATA_HPA_FEAT_LOCK 0x02
+#define ATA_HPA_FEAT_UNLOCK 0x03
+#define ATA_HPA_FEAT_FREEZE 0x04
+
+/* ATA transfer modes */
+#define ATA_MODE_MASK 0x0f
+#define ATA_DMA_MASK 0xf0
+#define ATA_PIO 0x00
+#define ATA_PIO0 0x08
+#define ATA_PIO1 0x09
+#define ATA_PIO2 0x0a
+#define ATA_PIO3 0x0b
+#define ATA_PIO4 0x0c
+#define ATA_PIO_MAX 0x0f
+#define ATA_DMA 0x10
+#define ATA_WDMA0 0x20
+#define ATA_WDMA1 0x21
+#define ATA_WDMA2 0x22
+#define ATA_UDMA0 0x40
+#define ATA_UDMA1 0x41
+#define ATA_UDMA2 0x42
+#define ATA_UDMA3 0x43
+#define ATA_UDMA4 0x44
+#define ATA_UDMA5 0x45
+#define ATA_UDMA6 0x46
+#define ATA_SA150 0x47
+#define ATA_SA300 0x48
+#define ATA_DMA_MAX 0x4f
+
+
+/* ATA commands */
+#define ATA_NOP 0x00 /* NOP */
+#define ATA_NF_FLUSHQUEUE 0x00 /* flush queued cmd's */
+#define ATA_NF_AUTOPOLL 0x01 /* start autopoll function */
+#define ATA_DATA_SET_MANAGEMENT 0x06
+#define ATA_DSM_TRIM 0x01
+#define ATA_DEVICE_RESET 0x08 /* reset device */
+#define ATA_READ 0x20 /* read */
+#define ATA_READ48 0x24 /* read 48bit LBA */
+#define ATA_READ_DMA48 0x25 /* read DMA 48bit LBA */
+#define ATA_READ_DMA_QUEUED48 0x26 /* read DMA QUEUED 48bit LBA */
+#define ATA_READ_NATIVE_MAX_ADDRESS48 0x27 /* read native max addr 48bit */
+#define ATA_READ_MUL48 0x29 /* read multi 48bit LBA */
+#define ATA_READ_STREAM_DMA48 0x2a /* read DMA stream 48bit LBA */
+#define ATA_READ_LOG_EXT 0x2f /* read log ext - PIO Data-In */
+#define ATA_READ_STREAM48 0x2b /* read stream 48bit LBA */
+#define ATA_WRITE 0x30 /* write */
+#define ATA_WRITE48 0x34 /* write 48bit LBA */
+#define ATA_WRITE_DMA48 0x35 /* write DMA 48bit LBA */
+#define ATA_WRITE_DMA_QUEUED48 0x36 /* write DMA QUEUED 48bit LBA */
+#define ATA_SET_MAX_ADDRESS48 0x37 /* set max address 48bit */
+#define ATA_WRITE_MUL48 0x39 /* write multi 48bit LBA */
+#define ATA_WRITE_STREAM_DMA48 0x3a
+#define ATA_WRITE_STREAM48 0x3b
+#define ATA_WRITE_DMA_FUA48 0x3d
+#define ATA_WRITE_DMA_QUEUED_FUA48 0x3e
+#define ATA_WRITE_LOG_EXT 0x3f
+#define ATA_READ_VERIFY 0x40
+#define ATA_READ_VERIFY48 0x42
+#define ATA_READ_LOG_DMA_EXT 0x47 /* read log DMA ext - PIO Data-In */
+#define ATA_READ_FPDMA_QUEUED 0x60 /* read DMA NCQ */
+#define ATA_WRITE_FPDMA_QUEUED 0x61 /* write DMA NCQ */
+#define ATA_SEND_FPDMA_QUEUED 0x64 /* send DMA NCQ */
+#define ATA_RECV_FPDMA_QUEUED 0x65 /* receive DMA NCQ */
+#define ATA_SEP_ATTN 0x67 /* SEP request */
+#define ATA_SEEK 0x70 /* seek */
+#define ATA_PACKET_CMD 0xa0 /* packet command */
+#define ATA_ATAPI_IDENTIFY 0xa1 /* get ATAPI params */
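
Editor's aside: the identify data that struct ata_params above describes splits its larger values across 16-bit words, so callers have to reassemble them. A minimal sketch (not part of the imported header; it assumes the struct and support bits defined above) of recovering the usable sector count, preferring the 48-bit count in words 100-103 when the device advertises 48-bit addressing:

/*
 * Editor's sketch (not part of the imported header): reassemble the
 * 48-bit sector count from identify words 100-103, falling back to
 * the 28-bit count in words 60-61 when ADDRESS48 is unsupported.
 */
static inline u_int64_t
ata_total_sectors(const struct ata_params *p)
{
	if (p->support.command2 & ATA_SUPPORT_ADDRESS48) {
		return ((u_int64_t)p->lba_size48_1 |
		    ((u_int64_t)p->lba_size48_2 << 16) |
		    ((u_int64_t)p->lba_size48_3 << 32) |
		    ((u_int64_t)p->lba_size48_4 << 48));
	}
	return ((u_int64_t)p->lba_size_1 |
	    ((u_int64_t)p->lba_size_2 << 16));
}
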
+#define ATA_SERVICE 0xa2 /* service command */ +#define ATA_SMART_CMD 0xb0 /* SMART command */ +#define ATA_CFA_ERASE 0xc0 /* CFA erase */ +#define ATA_READ_MUL 0xc4 /* read multi */ +#define ATA_WRITE_MUL 0xc5 /* write multi */ +#define ATA_SET_MULTI 0xc6 /* set multi size */ +#define ATA_READ_DMA_QUEUED 0xc7 /* read DMA QUEUED */ +#define ATA_READ_DMA 0xc8 /* read DMA */ +#define ATA_WRITE_DMA 0xca /* write DMA */ +#define ATA_WRITE_DMA_QUEUED 0xcc /* write DMA QUEUED */ +#define ATA_WRITE_MUL_FUA48 0xce +#define ATA_STANDBY_IMMEDIATE 0xe0 /* standby immediate */ +#define ATA_IDLE_IMMEDIATE 0xe1 /* idle immediate */ +#define ATA_STANDBY_CMD 0xe2 /* standby */ +#define ATA_IDLE_CMD 0xe3 /* idle */ +#define ATA_READ_BUFFER 0xe4 /* read buffer */ +#define ATA_READ_PM 0xe4 /* read portmultiplier */ +#define ATA_SLEEP 0xe6 /* sleep */ +#define ATA_FLUSHCACHE 0xe7 /* flush cache to disk */ +#define ATA_WRITE_PM 0xe8 /* write portmultiplier */ +#define ATA_FLUSHCACHE48 0xea /* flush cache to disk */ +#define ATA_ATA_IDENTIFY 0xec /* get ATA params */ +#define ATA_SETFEATURES 0xef /* features command */ +#define ATA_SF_SETXFER 0x03 /* set transfer mode */ +#define ATA_SF_ENAB_WCACHE 0x02 /* enable write cache */ +#define ATA_SF_DIS_WCACHE 0x82 /* disable write cache */ +#define ATA_SF_ENAB_PUIS 0x06 /* enable PUIS */ +#define ATA_SF_DIS_PUIS 0x86 /* disable PUIS */ +#define ATA_SF_PUIS_SPINUP 0x07 /* PUIS spin-up */ +#define ATA_SF_ENAB_RCACHE 0xaa /* enable readahead cache */ +#define ATA_SF_DIS_RCACHE 0x55 /* disable readahead cache */ +#define ATA_SF_ENAB_RELIRQ 0x5d /* enable release interrupt */ +#define ATA_SF_DIS_RELIRQ 0xdd /* disable release interrupt */ +#define ATA_SF_ENAB_SRVIRQ 0x5e /* enable service interrupt */ +#define ATA_SF_DIS_SRVIRQ 0xde /* disable service interrupt */ +#define ATA_SECURITY_SET_PASSWORD 0xf1 /* set drive password */ +#define ATA_SECURITY_UNLOCK 0xf2 /* unlock drive using passwd */ +#define ATA_SECURITY_ERASE_PREPARE 0xf3 /* prepare to erase drive */ +#define ATA_SECURITY_ERASE_UNIT 0xf4 /* erase all blocks on drive */ +#define ATA_SECURITY_FREEZE_LOCK 0xf5 /* freeze security config */ +#define ATA_SECURITY_DISABLE_PASSWORD 0xf6 /* disable drive password */ +#define ATA_READ_NATIVE_MAX_ADDRESS 0xf8 /* read native max address */ +#define ATA_SET_MAX_ADDRESS 0xf9 /* set max address */ + + +/* ATAPI commands */ +#define ATAPI_TEST_UNIT_READY 0x00 /* check if device is ready */ +#define ATAPI_REZERO 0x01 /* rewind */ +#define ATAPI_REQUEST_SENSE 0x03 /* get sense data */ +#define ATAPI_FORMAT 0x04 /* format unit */ +#define ATAPI_READ 0x08 /* read data */ +#define ATAPI_WRITE 0x0a /* write data */ +#define ATAPI_WEOF 0x10 /* write filemark */ +#define ATAPI_WF_WRITE 0x01 +#define ATAPI_SPACE 0x11 /* space command */ +#define ATAPI_SP_FM 0x01 +#define ATAPI_SP_EOD 0x03 +#define ATAPI_INQUIRY 0x12 /* get inquiry data */ +#define ATAPI_MODE_SELECT 0x15 /* mode select */ +#define ATAPI_ERASE 0x19 /* erase */ +#define ATAPI_MODE_SENSE 0x1a /* mode sense */ +#define ATAPI_START_STOP 0x1b /* start/stop unit */ +#define ATAPI_SS_LOAD 0x01 +#define ATAPI_SS_RETENSION 0x02 +#define ATAPI_SS_EJECT 0x04 +#define ATAPI_PREVENT_ALLOW 0x1e /* media removal */ +#define ATAPI_READ_FORMAT_CAPACITIES 0x23 /* get format capacities */ +#define ATAPI_READ_CAPACITY 0x25 /* get volume capacity */ +#define ATAPI_READ_BIG 0x28 /* read data */ +#define ATAPI_WRITE_BIG 0x2a /* write data */ +#define ATAPI_LOCATE 0x2b /* locate to position */ +#define ATAPI_READ_POSITION 0x34 /* read 
position */
+#define ATAPI_SYNCHRONIZE_CACHE 0x35 /* flush buf, close channel */
+#define ATAPI_WRITE_BUFFER 0x3b /* write device buffer */
+#define ATAPI_READ_BUFFER 0x3c /* read device buffer */
+#define ATAPI_READ_SUBCHANNEL 0x42 /* get subchannel info */
+#define ATAPI_READ_TOC 0x43 /* get table of contents */
+#define ATAPI_PLAY_10 0x45 /* play by lba */
+#define ATAPI_PLAY_MSF 0x47 /* play by MSF address */
+#define ATAPI_PLAY_TRACK 0x48 /* play by track number */
+#define ATAPI_PAUSE 0x4b /* pause audio operation */
+#define ATAPI_READ_DISK_INFO 0x51 /* get disk info structure */
+#define ATAPI_READ_TRACK_INFO 0x52 /* get track info structure */
+#define ATAPI_RESERVE_TRACK 0x53 /* reserve track */
+#define ATAPI_SEND_OPC_INFO 0x54 /* send OPC structure */
+#define ATAPI_MODE_SELECT_BIG 0x55 /* set device parameters */
+#define ATAPI_REPAIR_TRACK 0x58 /* repair track */
+#define ATAPI_READ_MASTER_CUE 0x59 /* read master CUE info */
+#define ATAPI_MODE_SENSE_BIG 0x5a /* get device parameters */
+#define ATAPI_CLOSE_TRACK 0x5b /* close track/session */
+#define ATAPI_READ_BUFFER_CAPACITY 0x5c /* get buffer capacity */
+#define ATAPI_SEND_CUE_SHEET 0x5d /* send CUE sheet */
+#define ATAPI_SERVICE_ACTION_IN 0x96 /* get service data */
+#define ATAPI_BLANK 0xa1 /* blank the media */
+#define ATAPI_SEND_KEY 0xa3 /* send DVD key structure */
+#define ATAPI_REPORT_KEY 0xa4 /* get DVD key structure */
+#define ATAPI_PLAY_12 0xa5 /* play by lba */
+#define ATAPI_LOAD_UNLOAD 0xa6 /* changer control command */
+#define ATAPI_READ_STRUCTURE 0xad /* get DVD structure */
+#define ATAPI_PLAY_CD 0xb4 /* universal play command */
+#define ATAPI_SET_SPEED 0xbb /* set drive speed */
+#define ATAPI_MECH_STATUS 0xbd /* get changer status */
+#define ATAPI_READ_CD 0xbe /* read data */
+#define ATAPI_POLL_DSC 0xff /* poll DSC status bit */
+
+
+struct ata_ioc_devices {
+ int channel;
+ char name[2][32];
+ struct ata_params params[2];
+};
+
+/* per-channel ATA ioctl calls */
+#define IOCATAGMAXCHANNEL _IOR('a', 1, int)
+#define IOCATAREINIT _IOW('a', 2, int)
+#define IOCATAATTACH _IOW('a', 3, int)
+#define IOCATADETACH _IOW('a', 4, int)
+#define IOCATADEVICES _IOWR('a', 5, struct ata_ioc_devices)
+
+/* ATAPI request sense structure */
+struct atapi_sense {
+ u_int8_t error; /* current or deferred errors */
+#define ATA_SENSE_VALID 0x80
+
+ u_int8_t segment; /* segment number */
+ u_int8_t key; /* sense key */
+#define ATA_SENSE_KEY_MASK 0x0f /* sense key mask */
+#define ATA_SENSE_NO_SENSE 0x00 /* no specific sense key info */
+#define ATA_SENSE_RECOVERED_ERROR 0x01 /* command OK, data recovered */
+#define ATA_SENSE_NOT_READY 0x02 /* no access to drive */
+#define ATA_SENSE_MEDIUM_ERROR 0x03 /* non-recovered data error */
+#define ATA_SENSE_HARDWARE_ERROR 0x04 /* non-recoverable HW failure */
+#define ATA_SENSE_ILLEGAL_REQUEST 0x05 /* invalid command param(s) */
+#define ATA_SENSE_UNIT_ATTENTION 0x06 /* media changed */
+#define ATA_SENSE_DATA_PROTECT 0x07 /* write protect */
+#define ATA_SENSE_BLANK_CHECK 0x08 /* blank check */
+#define ATA_SENSE_VENDOR_SPECIFIC 0x09 /* vendor specific skey */
+#define ATA_SENSE_COPY_ABORTED 0x0a /* copy aborted */
+#define ATA_SENSE_ABORTED_COMMAND 0x0b /* command aborted, try again */
+#define ATA_SENSE_EQUAL 0x0c /* equal */
+#define ATA_SENSE_VOLUME_OVERFLOW 0x0d /* volume overflow */
+#define ATA_SENSE_MISCOMPARE 0x0e /* data don't match the medium */
+#define ATA_SENSE_RESERVED 0x0f
+#define ATA_SENSE_ILI 0x20
+#define ATA_SENSE_EOM 0x40
+#define ATA_SENSE_FILEMARK 0x80
+
+ u_int32_t cmd_info; /* cmd information */
+ u_int8_t sense_length; /* additional sense len (n-7) */
+ u_int32_t cmd_specific_info; /* additional cmd spec info */
+ u_int8_t asc; /* additional sense code */
+ u_int8_t ascq; /* additional sense code qual */
+ u_int8_t replaceable_unit_code; /* replaceable unit code */
+ u_int8_t specific; /* sense key specific */
+#define ATA_SENSE_SPEC_VALID 0x80
+#define ATA_SENSE_SPEC_MASK 0x7f
+
+ u_int8_t specific1; /* sense key specific */
+ u_int8_t specific2; /* sense key specific */
+} __packed;
+
+struct ata_ioc_request {
+ union {
+ struct {
+ u_int8_t command;
+ u_int8_t feature;
+ u_int64_t lba;
+ u_int16_t count;
+ } ata;
+ struct {
+ char ccb[16];
+ struct atapi_sense sense;
+ } atapi;
+ } u;
+ caddr_t data;
+ int count;
+ int flags;
+#define ATA_CMD_CONTROL 0x01
+#define ATA_CMD_READ 0x02
+#define ATA_CMD_WRITE 0x04
+#define ATA_CMD_ATAPI 0x08
+
+ int timeout;
+ int error;
+};
+
+struct ata_security_password {
+ u_int16_t ctrl;
+#define ATA_SECURITY_PASSWORD_USER 0x0000
+#define ATA_SECURITY_PASSWORD_MASTER 0x0001
+#define ATA_SECURITY_ERASE_NORMAL 0x0000
+#define ATA_SECURITY_ERASE_ENHANCED 0x0002
+#define ATA_SECURITY_LEVEL_HIGH 0x0000
+#define ATA_SECURITY_LEVEL_MAXIMUM 0x0100
+
+ u_int8_t password[32];
+ u_int16_t revision;
+ u_int16_t reserved[238];
+};
+
+/* per-device ATA ioctl calls */
+#define IOCATAREQUEST _IOWR('a', 100, struct ata_ioc_request)
+#define IOCATAGPARM _IOR('a', 101, struct ata_params)
+#define IOCATAGMODE _IOR('a', 102, int)
+#define IOCATASMODE _IOW('a', 103, int)
+
+#define IOCATAGSPINDOWN _IOR('a', 104, int)
+#define IOCATASSPINDOWN _IOW('a', 105, int)
+
+
+struct ata_ioc_raid_config {
+ int lun;
+ int type;
+#define AR_JBOD 0x0001
+#define AR_SPAN 0x0002
+#define AR_RAID0 0x0004
+#define AR_RAID1 0x0008
+#define AR_RAID01 0x0010
+#define AR_RAID3 0x0020
+#define AR_RAID4 0x0040
+#define AR_RAID5 0x0080
+
+ int interleave;
+ int status;
+#define AR_READY 1
+#define AR_DEGRADED 2
+#define AR_REBUILDING 4
+
+ int progress;
+ int total_disks;
+ int disks[16];
+};
+
+struct ata_ioc_raid_status {
+ int lun;
+ int type;
+ int interleave;
+ int status;
+ int progress;
+ int total_disks;
+ struct {
+ int state;
+#define AR_DISK_ONLINE 0x01
+#define AR_DISK_PRESENT 0x02
+#define AR_DISK_SPARE 0x04
+ int lun;
+ } disks[16];
+};
+
+/* ATA RAID ioctl calls */
+#define IOCATARAIDCREATE _IOWR('a', 200, struct ata_ioc_raid_config)
+#define IOCATARAIDDELETE _IOW('a', 201, int)
+#define IOCATARAIDSTATUS _IOWR('a', 202, struct ata_ioc_raid_status)
+#define IOCATARAIDADDSPARE _IOW('a', 203, struct ata_ioc_raid_config)
+#define IOCATARAIDREBUILD _IOW('a', 204, int)
+
+#endif /* _SYS_ATA_H_ */
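
Editor's aside: the per-device ioctl surface above is driven through struct ata_ioc_request. A minimal sketch of issuing IDENTIFY DEVICE through IOCATAREQUEST (not part of the patch; the flag and timeout usage reflect the common FreeBSD atacontrol pattern and are an assumption, as is the already-open descriptor):

/*
 * Editor's sketch (not part of the imported header): read the 512-byte
 * identify block via IOCATAREQUEST on an already-open ATA device fd.
 */
#include <string.h>
#include <sys/ioctl.h>

static int
ata_identify(int fd, struct ata_params *parms)
{
	struct ata_ioc_request req;

	memset(&req, 0, sizeof (req));
	req.u.ata.command = ATA_ATA_IDENTIFY;	/* 0xec, defined above */
	req.data = (caddr_t)parms;
	req.count = sizeof (*parms);		/* one 512-byte sector */
	req.flags = ATA_CMD_READ;		/* data-in transfer */
	req.timeout = 5;			/* assumed: seconds */

	return (ioctl(fd, IOCATAREQUEST, &req));
}
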
diff --git a/usr/contrib/freebsd/sys/linker_set.h b/usr/contrib/freebsd/sys/linker_set.h
new file mode 100644
index 0000000000..393dfbc131
--- /dev/null
+++ b/usr/contrib/freebsd/sys/linker_set.h
@@ -0,0 +1,119 @@
+/*-
+ * Copyright (c) 1999 John D. Polstra
+ * Copyright (c) 1999,2001 Peter Wemm
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/sys/linker_set.h 215701 2010-11-22 19:32:54Z dim $
+ */
+
+#ifndef _SYS_LINKER_SET_H_
+#define _SYS_LINKER_SET_H_
+
+#ifdef __FreeBSD__
+#ifndef _SYS_CDEFS_H_
+#error this file needs sys/cdefs.h as a prerequisite
+#endif
+#else
+#ifndef _COMPAT_FREEBSD_SYS_CDEFS_H_
+#error this file needs sys/cdefs.h as a prerequisite
+#endif
+#endif
+
+/*
+ * The following macros are used to declare global sets of objects, which
+ * are collected by the linker into a `linker_set' as defined below.
+ * For ELF, this is done by constructing a separate segment for each set.
+ */
+
+/*
+ * Private macros, not to be used outside this header file.
+ */
+#ifdef __GNUCLIKE___SECTION
+#ifdef __FreeBSD__
+#define __MAKE_SET(set, sym) \
+ __GLOBL(__CONCAT(__start_set_,set)); \
+ __GLOBL(__CONCAT(__stop_set_,set)); \
+ static void const * const __set_##set##_sym_##sym \
+ __section("set_" #set) __used = &sym
+#else
+#define __MAKE_SET(set, sym) \
+ static void const * const __set_##set##_sym_##sym \
+ __section("set_" #set) __used = &sym
+#endif
+#else /* !__GNUCLIKE___SECTION */
+#ifndef lint
+#error this file needs to be ported to your compiler
+#endif /* lint */
+#define __MAKE_SET(set, sym) extern void const * const (__set_##set##_sym_##sym)
+#endif /* __GNUCLIKE___SECTION */
+
+/*
+ * Public macros.
+ */
+#define TEXT_SET(set, sym) __MAKE_SET(set, sym)
+#define DATA_SET(set, sym) __MAKE_SET(set, sym)
+#define BSS_SET(set, sym) __MAKE_SET(set, sym)
+#define ABS_SET(set, sym) __MAKE_SET(set, sym)
+#define SET_ENTRY(set, sym) __MAKE_SET(set, sym)
+
+/*
+ * Initialize before referring to a given linker set.
+ */
+#ifdef __FreeBSD__
+#define SET_DECLARE(set, ptype) \
+ extern ptype *__CONCAT(__start_set_,set); \
+ extern ptype *__CONCAT(__stop_set_,set)
+#else
+#define SET_DECLARE(set, ptype) \
+ _Pragma(__XSTRING(weak __CONCAT(__start_set_,set))) \
+ _Pragma(__XSTRING(weak __CONCAT(__stop_set_,set))) \
+ extern ptype *__CONCAT(__start_set_,set); \
+ extern ptype *__CONCAT(__stop_set_,set)
+#endif
+
+#define SET_BEGIN(set) \
+ (&__CONCAT(__start_set_,set))
+#define SET_LIMIT(set) \
+ (&__CONCAT(__stop_set_,set))
+
+/*
+ * Iterate over all the elements of a set.
+ *
+ * Sets always contain addresses of things, and "pvar" points to words
+ * containing those addresses. Thus it must be declared as "type **pvar",
+ * and the address of each set item is obtained inside the loop by "*pvar".
+ */
+#define SET_FOREACH(pvar, set) \
+ for (pvar = SET_BEGIN(set); pvar < SET_LIMIT(set); pvar++)
+
+#define SET_ITEM(set, i) \
+ ((SET_BEGIN(set))[i])
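
Editor's aside: the comments above describe the mechanics; for concreteness, here is a usage sketch (not part of the imported header; the set name initfns and the functions are hypothetical) that declares a set of init-function pointers, populates it, and walks it with SET_FOREACH:

/*
 * Editor's sketch (not part of the imported header): declare, populate,
 * and walk a hypothetical linker set of init functions.
 */
typedef void (initfn_t)(void);

static void
say_hello(void)
{
	/* arbitrary work */
}
DATA_SET(initfns, say_hello);	/* records &say_hello in section "set_initfns" */

SET_DECLARE(initfns, initfn_t);

static void
run_all_initfns(void)
{
	initfn_t **fnp;

	/* pvar is "type **", per the comment above; each item is *fnp */
	SET_FOREACH(fnp, initfns) {
		(*fnp)();
	}
}
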
+ */
+#define SET_COUNT(set) \
+	(SET_LIMIT(set) - SET_BEGIN(set))
+
+#endif /* _SYS_LINKER_SET_H_ */
diff --git a/usr/contrib/freebsd/sys/tree.h b/usr/contrib/freebsd/sys/tree.h
new file mode 100644
index 0000000000..6b47e247bb
--- /dev/null
+++ b/usr/contrib/freebsd/sys/tree.h
@@ -0,0 +1,765 @@
+/*	$NetBSD: tree.h,v 1.8 2004/03/28 19:38:30 provos Exp $	*/
+/*	$OpenBSD: tree.h,v 1.7 2002/10/17 21:51:54 art Exp $	*/
+/* $FreeBSD: head/sys/sys/tree.h 189204 2009-03-01 04:57:23Z bms $ */
+
+/*-
+ * Copyright 2002 Niels Provos <provos@citi.umich.edu>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _SYS_TREE_H_
+#define _SYS_TREE_H_
+
+#include <sys/cdefs.h>
+
+/*
+ * This file defines data structures for different types of trees:
+ * splay trees and red-black trees.
+ *
+ * A splay tree is a self-organizing data structure.  Every operation
+ * on the tree causes a splay to happen.  The splay moves the requested
+ * node to the root of the tree and partly rebalances it.
+ *
+ * This has the benefit that request locality causes faster lookups as
+ * the requested nodes move to the top of the tree.  On the other hand,
+ * every lookup causes memory writes.
+ *
+ * The Balance Theorem bounds the total access time for m operations
+ * and n inserts on an initially empty tree as O((m + n)lg n).  The
+ * amortized cost for a sequence of m accesses to a splay tree is O(lg n);
+ *
+ * A red-black tree is a binary search tree with the node color as an
+ * extra attribute.  It fulfills a set of conditions:
+ *	- every search path from the root to a leaf consists of the
+ *	  same number of black nodes,
+ *	- each red node (except for the root) has a black parent,
+ *	- each leaf node is black.
+ *
+ * Every operation on a red-black tree is bounded as O(lg n).
+ * The maximum height of a red-black tree is 2lg (n+1).
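
A usage sketch may help orient readers of the macro machinery that follows. This is illustrative only, not part of the imported header; the names node, node_cmp, and node_tree are invented for the example, and it assumes the header is reachable as <sys/tree.h>:

#include <sys/tree.h>
#include <stdio.h>

struct node {
	RB_ENTRY(node) entry;	/* embedded left/right/parent/color links */
	int key;
};

/* Comparator contract: negative, zero, or positive, like strcmp(). */
static int
node_cmp(struct node *a, struct node *b)
{
	return (a->key < b->key ? -1 : a->key > b->key);
}

RB_HEAD(node_tree, node);
RB_PROTOTYPE(node_tree, node, entry, node_cmp)
RB_GENERATE(node_tree, node, entry, node_cmp)

int
main(void)
{
	struct node_tree head = RB_INITIALIZER(&head);
	struct node nodes[] = { { .key = 5 }, { .key = 1 }, { .key = 9 } };
	struct node *n;

	for (size_t i = 0; i < 3; i++)
		(void) RB_INSERT(node_tree, &head, &nodes[i]);

	/* RB_FOREACH walks the tree in comparator order: prints 1, 5, 9. */
	RB_FOREACH(n, node_tree, &head)
		printf("%d\n", n->key);
	return (0);
}

Note the design choice: RB_GENERATE expands the full set of name##_RB_* functions at compile time with the comparator baked in, so there is no function-pointer indirection on the hot path.
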
+ */ + +#define SPLAY_HEAD(name, type) \ +struct name { \ + struct type *sph_root; /* root of the tree */ \ +} + +#define SPLAY_INITIALIZER(root) \ + { NULL } + +#define SPLAY_INIT(root) do { \ + (root)->sph_root = NULL; \ +} while (/*CONSTCOND*/ 0) + +#define SPLAY_ENTRY(type) \ +struct { \ + struct type *spe_left; /* left element */ \ + struct type *spe_right; /* right element */ \ +} + +#define SPLAY_LEFT(elm, field) (elm)->field.spe_left +#define SPLAY_RIGHT(elm, field) (elm)->field.spe_right +#define SPLAY_ROOT(head) (head)->sph_root +#define SPLAY_EMPTY(head) (SPLAY_ROOT(head) == NULL) + +/* SPLAY_ROTATE_{LEFT,RIGHT} expect that tmp hold SPLAY_{RIGHT,LEFT} */ +#define SPLAY_ROTATE_RIGHT(head, tmp, field) do { \ + SPLAY_LEFT((head)->sph_root, field) = SPLAY_RIGHT(tmp, field); \ + SPLAY_RIGHT(tmp, field) = (head)->sph_root; \ + (head)->sph_root = tmp; \ +} while (/*CONSTCOND*/ 0) + +#define SPLAY_ROTATE_LEFT(head, tmp, field) do { \ + SPLAY_RIGHT((head)->sph_root, field) = SPLAY_LEFT(tmp, field); \ + SPLAY_LEFT(tmp, field) = (head)->sph_root; \ + (head)->sph_root = tmp; \ +} while (/*CONSTCOND*/ 0) + +#define SPLAY_LINKLEFT(head, tmp, field) do { \ + SPLAY_LEFT(tmp, field) = (head)->sph_root; \ + tmp = (head)->sph_root; \ + (head)->sph_root = SPLAY_LEFT((head)->sph_root, field); \ +} while (/*CONSTCOND*/ 0) + +#define SPLAY_LINKRIGHT(head, tmp, field) do { \ + SPLAY_RIGHT(tmp, field) = (head)->sph_root; \ + tmp = (head)->sph_root; \ + (head)->sph_root = SPLAY_RIGHT((head)->sph_root, field); \ +} while (/*CONSTCOND*/ 0) + +#define SPLAY_ASSEMBLE(head, node, left, right, field) do { \ + SPLAY_RIGHT(left, field) = SPLAY_LEFT((head)->sph_root, field); \ + SPLAY_LEFT(right, field) = SPLAY_RIGHT((head)->sph_root, field);\ + SPLAY_LEFT((head)->sph_root, field) = SPLAY_RIGHT(node, field); \ + SPLAY_RIGHT((head)->sph_root, field) = SPLAY_LEFT(node, field); \ +} while (/*CONSTCOND*/ 0) + +/* Generates prototypes and inline functions */ + +#define SPLAY_PROTOTYPE(name, type, field, cmp) \ +void name##_SPLAY(struct name *, struct type *); \ +void name##_SPLAY_MINMAX(struct name *, int); \ +struct type *name##_SPLAY_INSERT(struct name *, struct type *); \ +struct type *name##_SPLAY_REMOVE(struct name *, struct type *); \ + \ +/* Finds the node with the same key as elm */ \ +static __inline struct type * \ +name##_SPLAY_FIND(struct name *head, struct type *elm) \ +{ \ + if (SPLAY_EMPTY(head)) \ + return(NULL); \ + name##_SPLAY(head, elm); \ + if ((cmp)(elm, (head)->sph_root) == 0) \ + return (head->sph_root); \ + return (NULL); \ +} \ + \ +static __inline struct type * \ +name##_SPLAY_NEXT(struct name *head, struct type *elm) \ +{ \ + name##_SPLAY(head, elm); \ + if (SPLAY_RIGHT(elm, field) != NULL) { \ + elm = SPLAY_RIGHT(elm, field); \ + while (SPLAY_LEFT(elm, field) != NULL) { \ + elm = SPLAY_LEFT(elm, field); \ + } \ + } else \ + elm = NULL; \ + return (elm); \ +} \ + \ +static __inline struct type * \ +name##_SPLAY_MIN_MAX(struct name *head, int val) \ +{ \ + name##_SPLAY_MINMAX(head, val); \ + return (SPLAY_ROOT(head)); \ +} + +/* Main splay operation. 
+ * Moves node close to the key of elm to top + */ +#define SPLAY_GENERATE(name, type, field, cmp) \ +struct type * \ +name##_SPLAY_INSERT(struct name *head, struct type *elm) \ +{ \ + if (SPLAY_EMPTY(head)) { \ + SPLAY_LEFT(elm, field) = SPLAY_RIGHT(elm, field) = NULL; \ + } else { \ + int __comp; \ + name##_SPLAY(head, elm); \ + __comp = (cmp)(elm, (head)->sph_root); \ + if(__comp < 0) { \ + SPLAY_LEFT(elm, field) = SPLAY_LEFT((head)->sph_root, field);\ + SPLAY_RIGHT(elm, field) = (head)->sph_root; \ + SPLAY_LEFT((head)->sph_root, field) = NULL; \ + } else if (__comp > 0) { \ + SPLAY_RIGHT(elm, field) = SPLAY_RIGHT((head)->sph_root, field);\ + SPLAY_LEFT(elm, field) = (head)->sph_root; \ + SPLAY_RIGHT((head)->sph_root, field) = NULL; \ + } else \ + return ((head)->sph_root); \ + } \ + (head)->sph_root = (elm); \ + return (NULL); \ +} \ + \ +struct type * \ +name##_SPLAY_REMOVE(struct name *head, struct type *elm) \ +{ \ + struct type *__tmp; \ + if (SPLAY_EMPTY(head)) \ + return (NULL); \ + name##_SPLAY(head, elm); \ + if ((cmp)(elm, (head)->sph_root) == 0) { \ + if (SPLAY_LEFT((head)->sph_root, field) == NULL) { \ + (head)->sph_root = SPLAY_RIGHT((head)->sph_root, field);\ + } else { \ + __tmp = SPLAY_RIGHT((head)->sph_root, field); \ + (head)->sph_root = SPLAY_LEFT((head)->sph_root, field);\ + name##_SPLAY(head, elm); \ + SPLAY_RIGHT((head)->sph_root, field) = __tmp; \ + } \ + return (elm); \ + } \ + return (NULL); \ +} \ + \ +void \ +name##_SPLAY(struct name *head, struct type *elm) \ +{ \ + struct type __node, *__left, *__right, *__tmp; \ + int __comp; \ +\ + SPLAY_LEFT(&__node, field) = SPLAY_RIGHT(&__node, field) = NULL;\ + __left = __right = &__node; \ +\ + while ((__comp = (cmp)(elm, (head)->sph_root)) != 0) { \ + if (__comp < 0) { \ + __tmp = SPLAY_LEFT((head)->sph_root, field); \ + if (__tmp == NULL) \ + break; \ + if ((cmp)(elm, __tmp) < 0){ \ + SPLAY_ROTATE_RIGHT(head, __tmp, field); \ + if (SPLAY_LEFT((head)->sph_root, field) == NULL)\ + break; \ + } \ + SPLAY_LINKLEFT(head, __right, field); \ + } else if (__comp > 0) { \ + __tmp = SPLAY_RIGHT((head)->sph_root, field); \ + if (__tmp == NULL) \ + break; \ + if ((cmp)(elm, __tmp) > 0){ \ + SPLAY_ROTATE_LEFT(head, __tmp, field); \ + if (SPLAY_RIGHT((head)->sph_root, field) == NULL)\ + break; \ + } \ + SPLAY_LINKRIGHT(head, __left, field); \ + } \ + } \ + SPLAY_ASSEMBLE(head, &__node, __left, __right, field); \ +} \ + \ +/* Splay with either the minimum or the maximum element \ + * Used to find minimum or maximum element in tree. 
\ + */ \ +void name##_SPLAY_MINMAX(struct name *head, int __comp) \ +{ \ + struct type __node, *__left, *__right, *__tmp; \ +\ + SPLAY_LEFT(&__node, field) = SPLAY_RIGHT(&__node, field) = NULL;\ + __left = __right = &__node; \ +\ + while (1) { \ + if (__comp < 0) { \ + __tmp = SPLAY_LEFT((head)->sph_root, field); \ + if (__tmp == NULL) \ + break; \ + if (__comp < 0){ \ + SPLAY_ROTATE_RIGHT(head, __tmp, field); \ + if (SPLAY_LEFT((head)->sph_root, field) == NULL)\ + break; \ + } \ + SPLAY_LINKLEFT(head, __right, field); \ + } else if (__comp > 0) { \ + __tmp = SPLAY_RIGHT((head)->sph_root, field); \ + if (__tmp == NULL) \ + break; \ + if (__comp > 0) { \ + SPLAY_ROTATE_LEFT(head, __tmp, field); \ + if (SPLAY_RIGHT((head)->sph_root, field) == NULL)\ + break; \ + } \ + SPLAY_LINKRIGHT(head, __left, field); \ + } \ + } \ + SPLAY_ASSEMBLE(head, &__node, __left, __right, field); \ +} + +#define SPLAY_NEGINF -1 +#define SPLAY_INF 1 + +#define SPLAY_INSERT(name, x, y) name##_SPLAY_INSERT(x, y) +#define SPLAY_REMOVE(name, x, y) name##_SPLAY_REMOVE(x, y) +#define SPLAY_FIND(name, x, y) name##_SPLAY_FIND(x, y) +#define SPLAY_NEXT(name, x, y) name##_SPLAY_NEXT(x, y) +#define SPLAY_MIN(name, x) (SPLAY_EMPTY(x) ? NULL \ + : name##_SPLAY_MIN_MAX(x, SPLAY_NEGINF)) +#define SPLAY_MAX(name, x) (SPLAY_EMPTY(x) ? NULL \ + : name##_SPLAY_MIN_MAX(x, SPLAY_INF)) + +#define SPLAY_FOREACH(x, name, head) \ + for ((x) = SPLAY_MIN(name, head); \ + (x) != NULL; \ + (x) = SPLAY_NEXT(name, head, x)) + +/* Macros that define a red-black tree */ +#define RB_HEAD(name, type) \ +struct name { \ + struct type *rbh_root; /* root of the tree */ \ +} + +#define RB_INITIALIZER(root) \ + { NULL } + +#define RB_INIT(root) do { \ + (root)->rbh_root = NULL; \ +} while (/*CONSTCOND*/ 0) + +#define RB_BLACK 0 +#define RB_RED 1 +#define RB_ENTRY(type) \ +struct { \ + struct type *rbe_left; /* left element */ \ + struct type *rbe_right; /* right element */ \ + struct type *rbe_parent; /* parent element */ \ + int rbe_color; /* node color */ \ +} + +#define RB_LEFT(elm, field) (elm)->field.rbe_left +#define RB_RIGHT(elm, field) (elm)->field.rbe_right +#define RB_PARENT(elm, field) (elm)->field.rbe_parent +#define RB_COLOR(elm, field) (elm)->field.rbe_color +#define RB_ROOT(head) (head)->rbh_root +#define RB_EMPTY(head) (RB_ROOT(head) == NULL) + +#define RB_SET(elm, parent, field) do { \ + RB_PARENT(elm, field) = parent; \ + RB_LEFT(elm, field) = RB_RIGHT(elm, field) = NULL; \ + RB_COLOR(elm, field) = RB_RED; \ +} while (/*CONSTCOND*/ 0) + +#define RB_SET_BLACKRED(black, red, field) do { \ + RB_COLOR(black, field) = RB_BLACK; \ + RB_COLOR(red, field) = RB_RED; \ +} while (/*CONSTCOND*/ 0) + +#ifndef RB_AUGMENT +#define RB_AUGMENT(x) do {} while (0) +#endif + +#define RB_ROTATE_LEFT(head, elm, tmp, field) do { \ + (tmp) = RB_RIGHT(elm, field); \ + if ((RB_RIGHT(elm, field) = RB_LEFT(tmp, field)) != NULL) { \ + RB_PARENT(RB_LEFT(tmp, field), field) = (elm); \ + } \ + RB_AUGMENT(elm); \ + if ((RB_PARENT(tmp, field) = RB_PARENT(elm, field)) != NULL) { \ + if ((elm) == RB_LEFT(RB_PARENT(elm, field), field)) \ + RB_LEFT(RB_PARENT(elm, field), field) = (tmp); \ + else \ + RB_RIGHT(RB_PARENT(elm, field), field) = (tmp); \ + } else \ + (head)->rbh_root = (tmp); \ + RB_LEFT(tmp, field) = (elm); \ + RB_PARENT(elm, field) = (tmp); \ + RB_AUGMENT(tmp); \ + if ((RB_PARENT(tmp, field))) \ + RB_AUGMENT(RB_PARENT(tmp, field)); \ +} while (/*CONSTCOND*/ 0) + +#define RB_ROTATE_RIGHT(head, elm, tmp, field) do { \ + (tmp) = RB_LEFT(elm, field); \ + if 
((RB_LEFT(elm, field) = RB_RIGHT(tmp, field)) != NULL) { \ + RB_PARENT(RB_RIGHT(tmp, field), field) = (elm); \ + } \ + RB_AUGMENT(elm); \ + if ((RB_PARENT(tmp, field) = RB_PARENT(elm, field)) != NULL) { \ + if ((elm) == RB_LEFT(RB_PARENT(elm, field), field)) \ + RB_LEFT(RB_PARENT(elm, field), field) = (tmp); \ + else \ + RB_RIGHT(RB_PARENT(elm, field), field) = (tmp); \ + } else \ + (head)->rbh_root = (tmp); \ + RB_RIGHT(tmp, field) = (elm); \ + RB_PARENT(elm, field) = (tmp); \ + RB_AUGMENT(tmp); \ + if ((RB_PARENT(tmp, field))) \ + RB_AUGMENT(RB_PARENT(tmp, field)); \ +} while (/*CONSTCOND*/ 0) + +/* Generates prototypes and inline functions */ +#define RB_PROTOTYPE(name, type, field, cmp) \ + RB_PROTOTYPE_INTERNAL(name, type, field, cmp,) +#define RB_PROTOTYPE_STATIC(name, type, field, cmp) \ + RB_PROTOTYPE_INTERNAL(name, type, field, cmp, __unused static) +#define RB_PROTOTYPE_INTERNAL(name, type, field, cmp, attr) \ +attr void name##_RB_INSERT_COLOR(struct name *, struct type *); \ +attr void name##_RB_REMOVE_COLOR(struct name *, struct type *, struct type *);\ +attr struct type *name##_RB_REMOVE(struct name *, struct type *); \ +attr struct type *name##_RB_INSERT(struct name *, struct type *); \ +attr struct type *name##_RB_FIND(struct name *, struct type *); \ +attr struct type *name##_RB_NFIND(struct name *, struct type *); \ +attr struct type *name##_RB_NEXT(struct type *); \ +attr struct type *name##_RB_PREV(struct type *); \ +attr struct type *name##_RB_MINMAX(struct name *, int); \ + \ + +/* Main rb operation. + * Moves node close to the key of elm to top + */ +#define RB_GENERATE(name, type, field, cmp) \ + RB_GENERATE_INTERNAL(name, type, field, cmp,) +#define RB_GENERATE_STATIC(name, type, field, cmp) \ + RB_GENERATE_INTERNAL(name, type, field, cmp, __unused static) +#define RB_GENERATE_INTERNAL(name, type, field, cmp, attr) \ +attr void \ +name##_RB_INSERT_COLOR(struct name *head, struct type *elm) \ +{ \ + struct type *parent, *gparent, *tmp; \ + while ((parent = RB_PARENT(elm, field)) != NULL && \ + RB_COLOR(parent, field) == RB_RED) { \ + gparent = RB_PARENT(parent, field); \ + if (parent == RB_LEFT(gparent, field)) { \ + tmp = RB_RIGHT(gparent, field); \ + if (tmp && RB_COLOR(tmp, field) == RB_RED) { \ + RB_COLOR(tmp, field) = RB_BLACK; \ + RB_SET_BLACKRED(parent, gparent, field);\ + elm = gparent; \ + continue; \ + } \ + if (RB_RIGHT(parent, field) == elm) { \ + RB_ROTATE_LEFT(head, parent, tmp, field);\ + tmp = parent; \ + parent = elm; \ + elm = tmp; \ + } \ + RB_SET_BLACKRED(parent, gparent, field); \ + RB_ROTATE_RIGHT(head, gparent, tmp, field); \ + } else { \ + tmp = RB_LEFT(gparent, field); \ + if (tmp && RB_COLOR(tmp, field) == RB_RED) { \ + RB_COLOR(tmp, field) = RB_BLACK; \ + RB_SET_BLACKRED(parent, gparent, field);\ + elm = gparent; \ + continue; \ + } \ + if (RB_LEFT(parent, field) == elm) { \ + RB_ROTATE_RIGHT(head, parent, tmp, field);\ + tmp = parent; \ + parent = elm; \ + elm = tmp; \ + } \ + RB_SET_BLACKRED(parent, gparent, field); \ + RB_ROTATE_LEFT(head, gparent, tmp, field); \ + } \ + } \ + RB_COLOR(head->rbh_root, field) = RB_BLACK; \ +} \ + \ +attr void \ +name##_RB_REMOVE_COLOR(struct name *head, struct type *parent, struct type *elm) \ +{ \ + struct type *tmp; \ + while ((elm == NULL || RB_COLOR(elm, field) == RB_BLACK) && \ + elm != RB_ROOT(head)) { \ + if (RB_LEFT(parent, field) == elm) { \ + tmp = RB_RIGHT(parent, field); \ + if (RB_COLOR(tmp, field) == RB_RED) { \ + RB_SET_BLACKRED(tmp, parent, field); \ + RB_ROTATE_LEFT(head, parent, tmp, 
field);\ + tmp = RB_RIGHT(parent, field); \ + } \ + if ((RB_LEFT(tmp, field) == NULL || \ + RB_COLOR(RB_LEFT(tmp, field), field) == RB_BLACK) &&\ + (RB_RIGHT(tmp, field) == NULL || \ + RB_COLOR(RB_RIGHT(tmp, field), field) == RB_BLACK)) {\ + RB_COLOR(tmp, field) = RB_RED; \ + elm = parent; \ + parent = RB_PARENT(elm, field); \ + } else { \ + if (RB_RIGHT(tmp, field) == NULL || \ + RB_COLOR(RB_RIGHT(tmp, field), field) == RB_BLACK) {\ + struct type *oleft; \ + if ((oleft = RB_LEFT(tmp, field)) \ + != NULL) \ + RB_COLOR(oleft, field) = RB_BLACK;\ + RB_COLOR(tmp, field) = RB_RED; \ + RB_ROTATE_RIGHT(head, tmp, oleft, field);\ + tmp = RB_RIGHT(parent, field); \ + } \ + RB_COLOR(tmp, field) = RB_COLOR(parent, field);\ + RB_COLOR(parent, field) = RB_BLACK; \ + if (RB_RIGHT(tmp, field)) \ + RB_COLOR(RB_RIGHT(tmp, field), field) = RB_BLACK;\ + RB_ROTATE_LEFT(head, parent, tmp, field);\ + elm = RB_ROOT(head); \ + break; \ + } \ + } else { \ + tmp = RB_LEFT(parent, field); \ + if (RB_COLOR(tmp, field) == RB_RED) { \ + RB_SET_BLACKRED(tmp, parent, field); \ + RB_ROTATE_RIGHT(head, parent, tmp, field);\ + tmp = RB_LEFT(parent, field); \ + } \ + if ((RB_LEFT(tmp, field) == NULL || \ + RB_COLOR(RB_LEFT(tmp, field), field) == RB_BLACK) &&\ + (RB_RIGHT(tmp, field) == NULL || \ + RB_COLOR(RB_RIGHT(tmp, field), field) == RB_BLACK)) {\ + RB_COLOR(tmp, field) = RB_RED; \ + elm = parent; \ + parent = RB_PARENT(elm, field); \ + } else { \ + if (RB_LEFT(tmp, field) == NULL || \ + RB_COLOR(RB_LEFT(tmp, field), field) == RB_BLACK) {\ + struct type *oright; \ + if ((oright = RB_RIGHT(tmp, field)) \ + != NULL) \ + RB_COLOR(oright, field) = RB_BLACK;\ + RB_COLOR(tmp, field) = RB_RED; \ + RB_ROTATE_LEFT(head, tmp, oright, field);\ + tmp = RB_LEFT(parent, field); \ + } \ + RB_COLOR(tmp, field) = RB_COLOR(parent, field);\ + RB_COLOR(parent, field) = RB_BLACK; \ + if (RB_LEFT(tmp, field)) \ + RB_COLOR(RB_LEFT(tmp, field), field) = RB_BLACK;\ + RB_ROTATE_RIGHT(head, parent, tmp, field);\ + elm = RB_ROOT(head); \ + break; \ + } \ + } \ + } \ + if (elm) \ + RB_COLOR(elm, field) = RB_BLACK; \ +} \ + \ +attr struct type * \ +name##_RB_REMOVE(struct name *head, struct type *elm) \ +{ \ + struct type *child, *parent, *old = elm; \ + int color; \ + if (RB_LEFT(elm, field) == NULL) \ + child = RB_RIGHT(elm, field); \ + else if (RB_RIGHT(elm, field) == NULL) \ + child = RB_LEFT(elm, field); \ + else { \ + struct type *left; \ + elm = RB_RIGHT(elm, field); \ + while ((left = RB_LEFT(elm, field)) != NULL) \ + elm = left; \ + child = RB_RIGHT(elm, field); \ + parent = RB_PARENT(elm, field); \ + color = RB_COLOR(elm, field); \ + if (child) \ + RB_PARENT(child, field) = parent; \ + if (parent) { \ + if (RB_LEFT(parent, field) == elm) \ + RB_LEFT(parent, field) = child; \ + else \ + RB_RIGHT(parent, field) = child; \ + RB_AUGMENT(parent); \ + } else \ + RB_ROOT(head) = child; \ + if (RB_PARENT(elm, field) == old) \ + parent = elm; \ + (elm)->field = (old)->field; \ + if (RB_PARENT(old, field)) { \ + if (RB_LEFT(RB_PARENT(old, field), field) == old)\ + RB_LEFT(RB_PARENT(old, field), field) = elm;\ + else \ + RB_RIGHT(RB_PARENT(old, field), field) = elm;\ + RB_AUGMENT(RB_PARENT(old, field)); \ + } else \ + RB_ROOT(head) = elm; \ + RB_PARENT(RB_LEFT(old, field), field) = elm; \ + if (RB_RIGHT(old, field)) \ + RB_PARENT(RB_RIGHT(old, field), field) = elm; \ + if (parent) { \ + left = parent; \ + do { \ + RB_AUGMENT(left); \ + } while ((left = RB_PARENT(left, field)) != NULL); \ + } \ + goto color; \ + } \ + parent = RB_PARENT(elm, field); \ 
+ color = RB_COLOR(elm, field); \ + if (child) \ + RB_PARENT(child, field) = parent; \ + if (parent) { \ + if (RB_LEFT(parent, field) == elm) \ + RB_LEFT(parent, field) = child; \ + else \ + RB_RIGHT(parent, field) = child; \ + RB_AUGMENT(parent); \ + } else \ + RB_ROOT(head) = child; \ +color: \ + if (color == RB_BLACK) \ + name##_RB_REMOVE_COLOR(head, parent, child); \ + return (old); \ +} \ + \ +/* Inserts a node into the RB tree */ \ +attr struct type * \ +name##_RB_INSERT(struct name *head, struct type *elm) \ +{ \ + struct type *tmp; \ + struct type *parent = NULL; \ + int comp = 0; \ + tmp = RB_ROOT(head); \ + while (tmp) { \ + parent = tmp; \ + comp = (cmp)(elm, parent); \ + if (comp < 0) \ + tmp = RB_LEFT(tmp, field); \ + else if (comp > 0) \ + tmp = RB_RIGHT(tmp, field); \ + else \ + return (tmp); \ + } \ + RB_SET(elm, parent, field); \ + if (parent != NULL) { \ + if (comp < 0) \ + RB_LEFT(parent, field) = elm; \ + else \ + RB_RIGHT(parent, field) = elm; \ + RB_AUGMENT(parent); \ + } else \ + RB_ROOT(head) = elm; \ + name##_RB_INSERT_COLOR(head, elm); \ + return (NULL); \ +} \ + \ +/* Finds the node with the same key as elm */ \ +attr struct type * \ +name##_RB_FIND(struct name *head, struct type *elm) \ +{ \ + struct type *tmp = RB_ROOT(head); \ + int comp; \ + while (tmp) { \ + comp = cmp(elm, tmp); \ + if (comp < 0) \ + tmp = RB_LEFT(tmp, field); \ + else if (comp > 0) \ + tmp = RB_RIGHT(tmp, field); \ + else \ + return (tmp); \ + } \ + return (NULL); \ +} \ + \ +/* Finds the first node greater than or equal to the search key */ \ +attr struct type * \ +name##_RB_NFIND(struct name *head, struct type *elm) \ +{ \ + struct type *tmp = RB_ROOT(head); \ + struct type *res = NULL; \ + int comp; \ + while (tmp) { \ + comp = cmp(elm, tmp); \ + if (comp < 0) { \ + res = tmp; \ + tmp = RB_LEFT(tmp, field); \ + } \ + else if (comp > 0) \ + tmp = RB_RIGHT(tmp, field); \ + else \ + return (tmp); \ + } \ + return (res); \ +} \ + \ +/* ARGSUSED */ \ +attr struct type * \ +name##_RB_NEXT(struct type *elm) \ +{ \ + if (RB_RIGHT(elm, field)) { \ + elm = RB_RIGHT(elm, field); \ + while (RB_LEFT(elm, field)) \ + elm = RB_LEFT(elm, field); \ + } else { \ + if (RB_PARENT(elm, field) && \ + (elm == RB_LEFT(RB_PARENT(elm, field), field))) \ + elm = RB_PARENT(elm, field); \ + else { \ + while (RB_PARENT(elm, field) && \ + (elm == RB_RIGHT(RB_PARENT(elm, field), field)))\ + elm = RB_PARENT(elm, field); \ + elm = RB_PARENT(elm, field); \ + } \ + } \ + return (elm); \ +} \ + \ +/* ARGSUSED */ \ +attr struct type * \ +name##_RB_PREV(struct type *elm) \ +{ \ + if (RB_LEFT(elm, field)) { \ + elm = RB_LEFT(elm, field); \ + while (RB_RIGHT(elm, field)) \ + elm = RB_RIGHT(elm, field); \ + } else { \ + if (RB_PARENT(elm, field) && \ + (elm == RB_RIGHT(RB_PARENT(elm, field), field))) \ + elm = RB_PARENT(elm, field); \ + else { \ + while (RB_PARENT(elm, field) && \ + (elm == RB_LEFT(RB_PARENT(elm, field), field)))\ + elm = RB_PARENT(elm, field); \ + elm = RB_PARENT(elm, field); \ + } \ + } \ + return (elm); \ +} \ + \ +attr struct type * \ +name##_RB_MINMAX(struct name *head, int val) \ +{ \ + struct type *tmp = RB_ROOT(head); \ + struct type *parent = NULL; \ + while (tmp) { \ + parent = tmp; \ + if (val < 0) \ + tmp = RB_LEFT(tmp, field); \ + else \ + tmp = RB_RIGHT(tmp, field); \ + } \ + return (parent); \ +} + +#define RB_NEGINF -1 +#define RB_INF 1 + +#define RB_INSERT(name, x, y) name##_RB_INSERT(x, y) +#define RB_REMOVE(name, x, y) name##_RB_REMOVE(x, y) +#define RB_FIND(name, x, y) name##_RB_FIND(x, y) 
+#define RB_NFIND(name, x, y) name##_RB_NFIND(x, y) +#define RB_NEXT(name, x, y) name##_RB_NEXT(y) +#define RB_PREV(name, x, y) name##_RB_PREV(y) +#define RB_MIN(name, x) name##_RB_MINMAX(x, RB_NEGINF) +#define RB_MAX(name, x) name##_RB_MINMAX(x, RB_INF) + +#define RB_FOREACH(x, name, head) \ + for ((x) = RB_MIN(name, head); \ + (x) != NULL; \ + (x) = name##_RB_NEXT(x)) + +#define RB_FOREACH_FROM(x, name, y) \ + for ((x) = (y); \ + ((x) != NULL) && ((y) = name##_RB_NEXT(x), (x) != NULL); \ + (x) = (y)) + +#define RB_FOREACH_SAFE(x, name, head, y) \ + for ((x) = RB_MIN(name, head); \ + ((x) != NULL) && ((y) = name##_RB_NEXT(x), (x) != NULL); \ + (x) = (y)) + +#define RB_FOREACH_REVERSE(x, name, head) \ + for ((x) = RB_MAX(name, head); \ + (x) != NULL; \ + (x) = name##_RB_PREV(x)) + +#define RB_FOREACH_REVERSE_FROM(x, name, y) \ + for ((x) = (y); \ + ((x) != NULL) && ((y) = name##_RB_PREV(x), (x) != NULL); \ + (x) = (y)) + +#define RB_FOREACH_REVERSE_SAFE(x, name, head, y) \ + for ((x) = RB_MAX(name, head); \ + ((x) != NULL) && ((y) = name##_RB_PREV(x), (x) != NULL); \ + (x) = (y)) + +#endif /* _SYS_TREE_H_ */ diff --git a/usr/contrib/freebsd/x86/apicreg.h b/usr/contrib/freebsd/x86/apicreg.h new file mode 100644 index 0000000000..24006e2733 --- /dev/null +++ b/usr/contrib/freebsd/x86/apicreg.h @@ -0,0 +1,455 @@ +/*- + * Copyright (c) 1996, by Peter Wemm and Steve Passe + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. The name of the developer may NOT be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/x86/include/apicreg.h 259140 2013-12-09 21:08:52Z jhb $ + */ + +#ifndef _X86_APICREG_H_ +#define _X86_APICREG_H_ + +/* + * Local && I/O APIC definitions. + */ + +/* + * Pentium P54C+ Built-in APIC + * (Advanced programmable Interrupt Controller) + * + * Base Address of Built-in APIC in memory location + * is 0xfee00000. + * + * Map of APIC Registers: + * + * Offset (hex) Description Read/Write state + * 000 Reserved + * 010 Reserved + * 020 ID Local APIC ID R/W + * 030 VER Local APIC Version R + * 040 Reserved + * 050 Reserved + * 060 Reserved + * 070 Reserved + * 080 Task Priority Register R/W + * 090 Arbitration Priority Register R + * 0A0 Processor Priority Register R + * 0B0 EOI Register W + * 0C0 RRR Remote read R + * 0D0 Logical Destination R/W + * 0E0 Destination Format Register 0..27 R; 28..31 R/W + * 0F0 SVR Spurious Interrupt Vector Reg. 
0..3 R; 4..9 R/W
+ *	100	ISR	000-031			R
+ *	110	ISR	032-063			R
+ *	120	ISR	064-095			R
+ *	130	ISR	096-127			R
+ *	140	ISR	128-159			R
+ *	150	ISR	160-191			R
+ *	160	ISR	192-223			R
+ *	170	ISR	224-255			R
+ *	180	TMR	000-031			R
+ *	190	TMR	032-063			R
+ *	1A0	TMR	064-095			R
+ *	1B0	TMR	096-127			R
+ *	1C0	TMR	128-159			R
+ *	1D0	TMR	160-191			R
+ *	1E0	TMR	192-223			R
+ *	1F0	TMR	224-255			R
+ *	200	IRR	000-031			R
+ *	210	IRR	032-063			R
+ *	220	IRR	064-095			R
+ *	230	IRR	096-127			R
+ *	240	IRR	128-159			R
+ *	250	IRR	160-191			R
+ *	260	IRR	192-223			R
+ *	270	IRR	224-255			R
+ *	280	Error Status Register		R
+ *	290	Reserved
+ *	2A0	Reserved
+ *	2B0	Reserved
+ *	2C0	Reserved
+ *	2D0	Reserved
+ *	2E0	Reserved
+ *	2F0	Local Vector Table (CMCI)	R/W
+ *	300 ICR_LOW	Interrupt Command Reg. (0-31)	R/W
+ *	310 ICR_HI	Interrupt Command Reg. (32-63)	R/W
+ *	320	Local Vector Table (Timer)	R/W
+ *	330	Local Vector Table (Thermal)	R/W (PIV+)
+ *	340	Local Vector Table (Performance) R/W (P6+)
+ *	350 LVT1	Local Vector Table (LINT0)	R/W
+ *	360 LVT2	Local Vector Table (LINT1)	R/W
+ *	370 LVT3	Local Vector Table (ERROR)	R/W
+ *	380	Initial Count Reg. for Timer	R/W
+ *	390	Current Count of Timer		R
+ *	3A0	Reserved
+ *	3B0	Reserved
+ *	3C0	Reserved
+ *	3D0	Reserved
+ *	3E0	Timer Divide Configuration Reg.	R/W
+ *	3F0	Reserved
+ */
+
+
+/******************************************************************************
+ * global defines, etc.
+ */
+
+
+/******************************************************************************
+ * LOCAL APIC structure
+ */
+
+#ifndef LOCORE
+#include <sys/types.h>
+
+#define PAD3	int : 32; int : 32; int : 32
+#define PAD4	int : 32; int : 32; int : 32; int : 32
+
+struct LAPIC {
+	/* reserved */		PAD4;
+	/* reserved */		PAD4;
+	u_int32_t id;		PAD3;
+	u_int32_t version;	PAD3;
+	/* reserved */		PAD4;
+	/* reserved */		PAD4;
+	/* reserved */		PAD4;
+	/* reserved */		PAD4;
+	u_int32_t tpr;		PAD3;
+	u_int32_t apr;		PAD3;
+	u_int32_t ppr;		PAD3;
+	u_int32_t eoi;		PAD3;
+	/* reserved */		PAD4;
+	u_int32_t ldr;		PAD3;
+	u_int32_t dfr;		PAD3;
+	u_int32_t svr;		PAD3;
+	u_int32_t isr0;		PAD3;
+	u_int32_t isr1;		PAD3;
+	u_int32_t isr2;		PAD3;
+	u_int32_t isr3;		PAD3;
+	u_int32_t isr4;		PAD3;
+	u_int32_t isr5;		PAD3;
+	u_int32_t isr6;		PAD3;
+	u_int32_t isr7;		PAD3;
+	u_int32_t tmr0;		PAD3;
+	u_int32_t tmr1;		PAD3;
+	u_int32_t tmr2;		PAD3;
+	u_int32_t tmr3;		PAD3;
+	u_int32_t tmr4;		PAD3;
+	u_int32_t tmr5;		PAD3;
+	u_int32_t tmr6;		PAD3;
+	u_int32_t tmr7;		PAD3;
+	u_int32_t irr0;		PAD3;
+	u_int32_t irr1;		PAD3;
+	u_int32_t irr2;		PAD3;
+	u_int32_t irr3;		PAD3;
+	u_int32_t irr4;		PAD3;
+	u_int32_t irr5;		PAD3;
+	u_int32_t irr6;		PAD3;
+	u_int32_t irr7;		PAD3;
+	u_int32_t esr;		PAD3;
+	/* reserved */		PAD4;
+	/* reserved */		PAD4;
+	/* reserved */		PAD4;
+	/* reserved */		PAD4;
+	/* reserved */		PAD4;
+	/* reserved */		PAD4;
+	u_int32_t lvt_cmci;	PAD3;
+	u_int32_t icr_lo;	PAD3;
+	u_int32_t icr_hi;	PAD3;
+	u_int32_t lvt_timer;	PAD3;
+	u_int32_t lvt_thermal;	PAD3;
+	u_int32_t lvt_pcint;	PAD3;
+	u_int32_t lvt_lint0;	PAD3;
+	u_int32_t lvt_lint1;	PAD3;
+	u_int32_t lvt_error;	PAD3;
+	u_int32_t icr_timer;	PAD3;
+	u_int32_t ccr_timer;	PAD3;
+	/* reserved */		PAD4;
+	/* reserved */		PAD4;
+	/* reserved */		PAD4;
+	/* reserved */		PAD4;
+	u_int32_t dcr_timer;	PAD3;
+	/* reserved */		PAD4;
+};
+
+typedef struct LAPIC lapic_t;
+
+/******************************************************************************
+ * I/O APIC structure
+ */
+
+struct IOAPIC {
+	u_int32_t ioregsel;	PAD3;
+	u_int32_t iowin;	PAD3;
+};
+
+typedef struct IOAPIC ioapic_t;
+
+#undef PAD4
+#undef PAD3
+
+#endif /* !LOCORE */
+
+
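
One detail worth calling out in struct LAPIC above: the PAD3/PAD4 unnamed bit-fields exist solely to give each 32-bit register its own 16-byte slot, so that the struct field offsets reproduce the register map at the top of the file. A hypothetical sanity check of that layout (illustration only, assuming a C11 compiler for _Static_assert):

#include <stddef.h>

/* Offsets must match the memory-mapped register layout documented above. */
_Static_assert(offsetof(struct LAPIC, id) == 0x20, "APIC ID register");
_Static_assert(offsetof(struct LAPIC, svr) == 0xf0, "spurious vector register");
_Static_assert(offsetof(struct LAPIC, icr_lo) == 0x300, "ICR bits 0-31");
_Static_assert(offsetof(struct LAPIC, dcr_timer) == 0x3e0, "timer divide register");

This relies on each unnamed int : 32 bit-field consuming exactly four bytes, which holds on the i386/amd64 ABIs this header targets.
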
+/****************************************************************************** + * various code 'logical' values + */ + +/****************************************************************************** + * LOCAL APIC defines + */ + +/* default physical locations of LOCAL (CPU) APICs */ +#define DEFAULT_APIC_BASE 0xfee00000 + +/* constants relating to APIC ID registers */ +#define APIC_ID_MASK 0xff000000 +#define APIC_ID_SHIFT 24 +#define APIC_ID_CLUSTER 0xf0 +#define APIC_ID_CLUSTER_ID 0x0f +#define APIC_MAX_CLUSTER 0xe +#define APIC_MAX_INTRACLUSTER_ID 3 +#define APIC_ID_CLUSTER_SHIFT 4 + +/* fields in VER */ +#define APIC_VER_VERSION 0x000000ff +#define APIC_VER_MAXLVT 0x00ff0000 +#define MAXLVTSHIFT 16 +#define APIC_VER_EOI_SUPPRESSION 0x01000000 + +/* fields in LDR */ +#define APIC_LDR_RESERVED 0x00ffffff + +/* fields in DFR */ +#define APIC_DFR_RESERVED 0x0fffffff +#define APIC_DFR_MODEL_MASK 0xf0000000 +#define APIC_DFR_MODEL_FLAT 0xf0000000 +#define APIC_DFR_MODEL_CLUSTER 0x00000000 + +/* fields in SVR */ +#define APIC_SVR_VECTOR 0x000000ff +#define APIC_SVR_VEC_PROG 0x000000f0 +#define APIC_SVR_VEC_FIX 0x0000000f +#define APIC_SVR_ENABLE 0x00000100 +# define APIC_SVR_SWDIS 0x00000000 +# define APIC_SVR_SWEN 0x00000100 +#define APIC_SVR_FOCUS 0x00000200 +# define APIC_SVR_FEN 0x00000000 +# define APIC_SVR_FDIS 0x00000200 +#define APIC_SVR_EOI_SUPPRESSION 0x00001000 + +/* fields in TPR */ +#define APIC_TPR_PRIO 0x000000ff +# define APIC_TPR_INT 0x000000f0 +# define APIC_TPR_SUB 0x0000000f + +/* fields in ESR */ +#define APIC_ESR_SEND_CS_ERROR 0x00000001 +#define APIC_ESR_RECEIVE_CS_ERROR 0x00000002 +#define APIC_ESR_SEND_ACCEPT 0x00000004 +#define APIC_ESR_RECEIVE_ACCEPT 0x00000008 +#define APIC_ESR_SEND_ILLEGAL_VECTOR 0x00000020 +#define APIC_ESR_RECEIVE_ILLEGAL_VECTOR 0x00000040 +#define APIC_ESR_ILLEGAL_REGISTER 0x00000080 + +/* fields in ICR_LOW */ +#define APIC_VECTOR_MASK 0x000000ff + +#define APIC_DELMODE_MASK 0x00000700 +# define APIC_DELMODE_FIXED 0x00000000 +# define APIC_DELMODE_LOWPRIO 0x00000100 +# define APIC_DELMODE_SMI 0x00000200 +# define APIC_DELMODE_RR 0x00000300 +# define APIC_DELMODE_NMI 0x00000400 +# define APIC_DELMODE_INIT 0x00000500 +# define APIC_DELMODE_STARTUP 0x00000600 +# define APIC_DELMODE_RESV 0x00000700 + +#define APIC_DESTMODE_MASK 0x00000800 +# define APIC_DESTMODE_PHY 0x00000000 +# define APIC_DESTMODE_LOG 0x00000800 + +#define APIC_DELSTAT_MASK 0x00001000 +# define APIC_DELSTAT_IDLE 0x00000000 +# define APIC_DELSTAT_PEND 0x00001000 + +#define APIC_RESV1_MASK 0x00002000 + +#define APIC_LEVEL_MASK 0x00004000 +# define APIC_LEVEL_DEASSERT 0x00000000 +# define APIC_LEVEL_ASSERT 0x00004000 + +#define APIC_TRIGMOD_MASK 0x00008000 +# define APIC_TRIGMOD_EDGE 0x00000000 +# define APIC_TRIGMOD_LEVEL 0x00008000 + +#define APIC_RRSTAT_MASK 0x00030000 +# define APIC_RRSTAT_INVALID 0x00000000 +# define APIC_RRSTAT_INPROG 0x00010000 +# define APIC_RRSTAT_VALID 0x00020000 +# define APIC_RRSTAT_RESV 0x00030000 + +#define APIC_DEST_MASK 0x000c0000 +# define APIC_DEST_DESTFLD 0x00000000 +# define APIC_DEST_SELF 0x00040000 +# define APIC_DEST_ALLISELF 0x00080000 +# define APIC_DEST_ALLESELF 0x000c0000 + +#define APIC_RESV2_MASK 0xfff00000 + +#define APIC_ICRLO_RESV_MASK (APIC_RESV1_MASK | APIC_RESV2_MASK) + +/* fields in LVT1/2 */ +#define APIC_LVT_VECTOR 0x000000ff +#define APIC_LVT_DM 0x00000700 +# define APIC_LVT_DM_FIXED 0x00000000 +# define APIC_LVT_DM_SMI 0x00000200 +# define APIC_LVT_DM_NMI 0x00000400 +# define APIC_LVT_DM_INIT 0x00000500 +# define 
APIC_LVT_DM_EXTINT 0x00000700 +#define APIC_LVT_DS 0x00001000 +#define APIC_LVT_IIPP 0x00002000 +#define APIC_LVT_IIPP_INTALO 0x00002000 +#define APIC_LVT_IIPP_INTAHI 0x00000000 +#define APIC_LVT_RIRR 0x00004000 +#define APIC_LVT_TM 0x00008000 +#define APIC_LVT_M 0x00010000 + + +/* fields in LVT Timer */ +#define APIC_LVTT_VECTOR 0x000000ff +#define APIC_LVTT_DS 0x00001000 +#define APIC_LVTT_M 0x00010000 +#define APIC_LVTT_TM 0x00020000 +# define APIC_LVTT_TM_ONE_SHOT 0x00000000 +# define APIC_LVTT_TM_PERIODIC 0x00020000 + + +/* APIC timer current count */ +#define APIC_TIMER_MAX_COUNT 0xffffffff + +/* fields in TDCR */ +#define APIC_TDCR_2 0x00 +#define APIC_TDCR_4 0x01 +#define APIC_TDCR_8 0x02 +#define APIC_TDCR_16 0x03 +#define APIC_TDCR_32 0x08 +#define APIC_TDCR_64 0x09 +#define APIC_TDCR_128 0x0a +#define APIC_TDCR_1 0x0b + +/* LVT table indices */ +#define APIC_LVT_LINT0 0 +#define APIC_LVT_LINT1 1 +#define APIC_LVT_TIMER 2 +#define APIC_LVT_ERROR 3 +#define APIC_LVT_PMC 4 +#define APIC_LVT_THERMAL 5 +#define APIC_LVT_CMCI 6 +#define APIC_LVT_MAX APIC_LVT_CMCI + +/****************************************************************************** + * I/O APIC defines + */ + +/* default physical locations of an IO APIC */ +#define DEFAULT_IO_APIC_BASE 0xfec00000 + +/* window register offset */ +#define IOAPIC_WINDOW 0x10 +#define IOAPIC_EOIR 0x40 + +/* indexes into IO APIC */ +#define IOAPIC_ID 0x00 +#define IOAPIC_VER 0x01 +#define IOAPIC_ARB 0x02 +#define IOAPIC_REDTBL 0x10 +#define IOAPIC_REDTBL0 IOAPIC_REDTBL +#define IOAPIC_REDTBL1 (IOAPIC_REDTBL+0x02) +#define IOAPIC_REDTBL2 (IOAPIC_REDTBL+0x04) +#define IOAPIC_REDTBL3 (IOAPIC_REDTBL+0x06) +#define IOAPIC_REDTBL4 (IOAPIC_REDTBL+0x08) +#define IOAPIC_REDTBL5 (IOAPIC_REDTBL+0x0a) +#define IOAPIC_REDTBL6 (IOAPIC_REDTBL+0x0c) +#define IOAPIC_REDTBL7 (IOAPIC_REDTBL+0x0e) +#define IOAPIC_REDTBL8 (IOAPIC_REDTBL+0x10) +#define IOAPIC_REDTBL9 (IOAPIC_REDTBL+0x12) +#define IOAPIC_REDTBL10 (IOAPIC_REDTBL+0x14) +#define IOAPIC_REDTBL11 (IOAPIC_REDTBL+0x16) +#define IOAPIC_REDTBL12 (IOAPIC_REDTBL+0x18) +#define IOAPIC_REDTBL13 (IOAPIC_REDTBL+0x1a) +#define IOAPIC_REDTBL14 (IOAPIC_REDTBL+0x1c) +#define IOAPIC_REDTBL15 (IOAPIC_REDTBL+0x1e) +#define IOAPIC_REDTBL16 (IOAPIC_REDTBL+0x20) +#define IOAPIC_REDTBL17 (IOAPIC_REDTBL+0x22) +#define IOAPIC_REDTBL18 (IOAPIC_REDTBL+0x24) +#define IOAPIC_REDTBL19 (IOAPIC_REDTBL+0x26) +#define IOAPIC_REDTBL20 (IOAPIC_REDTBL+0x28) +#define IOAPIC_REDTBL21 (IOAPIC_REDTBL+0x2a) +#define IOAPIC_REDTBL22 (IOAPIC_REDTBL+0x2c) +#define IOAPIC_REDTBL23 (IOAPIC_REDTBL+0x2e) + +/* fields in VER */ +#define IOART_VER_VERSION 0x000000ff +#define IOART_VER_MAXREDIR 0x00ff0000 +#define MAXREDIRSHIFT 16 + +/* + * fields in the IO APIC's redirection table entries + */ +#define IOART_DEST APIC_ID_MASK /* broadcast addr: all APICs */ + +#define IOART_RESV 0x00fe0000 /* reserved */ + +#define IOART_INTMASK 0x00010000 /* R/W: INTerrupt mask */ +# define IOART_INTMCLR 0x00000000 /* clear, allow INTs */ +# define IOART_INTMSET 0x00010000 /* set, inhibit INTs */ + +#define IOART_TRGRMOD 0x00008000 /* R/W: trigger mode */ +# define IOART_TRGREDG 0x00000000 /* edge */ +# define IOART_TRGRLVL 0x00008000 /* level */ + +#define IOART_REM_IRR 0x00004000 /* RO: remote IRR */ + +#define IOART_INTPOL 0x00002000 /* R/W: INT input pin polarity */ +# define IOART_INTAHI 0x00000000 /* active high */ +# define IOART_INTALO 0x00002000 /* active low */ + +#define IOART_DELIVS 0x00001000 /* RO: delivery status */ + +#define IOART_DESTMOD 0x00000800 
/* R/W: destination mode */ +# define IOART_DESTPHY 0x00000000 /* physical */ +# define IOART_DESTLOG 0x00000800 /* logical */ + +#define IOART_DELMOD 0x00000700 /* R/W: delivery mode */ +# define IOART_DELFIXED 0x00000000 /* fixed */ +# define IOART_DELLOPRI 0x00000100 /* lowest priority */ +# define IOART_DELSMI 0x00000200 /* System Management INT */ +# define IOART_DELRSV1 0x00000300 /* reserved */ +# define IOART_DELNMI 0x00000400 /* NMI signal */ +# define IOART_DELINIT 0x00000500 /* INIT signal */ +# define IOART_DELRSV2 0x00000600 /* reserved */ +# define IOART_DELEXINT 0x00000700 /* External INTerrupt */ + +#define IOART_INTVEC 0x000000ff /* R/W: INTerrupt vector field */ + +#endif /* _X86_APICREG_H_ */ diff --git a/usr/contrib/freebsd/x86/mptable.h b/usr/contrib/freebsd/x86/mptable.h new file mode 100644 index 0000000000..8f3c62a295 --- /dev/null +++ b/usr/contrib/freebsd/x86/mptable.h @@ -0,0 +1,204 @@ +/*- + * Copyright (c) 1996, by Steve Passe + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. The name of the developer may NOT be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD: head/sys/x86/include/mptable.h 259228 2013-12-11 21:19:04Z jhb $ + */ + +#ifndef __MACHINE_MPTABLE_H__ +#define __MACHINE_MPTABLE_H__ + +enum busTypes { + NOBUS = 0, + CBUS = 1, + CBUSII = 2, + EISA = 3, + ISA = 6, + MCA = 9, + PCI = 13, + XPRESS = 18, + MAX_BUSTYPE = 18, + UNKNOWN_BUSTYPE = 0xff +}; + +/* MP Floating Pointer Structure */ +typedef struct MPFPS { + uint8_t signature[4]; + uint32_t pap; + uint8_t length; + uint8_t spec_rev; + uint8_t checksum; + uint8_t config_type; + uint8_t mpfb2; + uint8_t mpfb3; + uint8_t mpfb4; + uint8_t mpfb5; +} __packed *mpfps_t; + +#define MPFB2_IMCR_PRESENT 0x80 +#define MPFB2_MUL_CLK_SRCS 0x40 + +/* MP Configuration Table Header */ +typedef struct MPCTH { + uint8_t signature[4]; + uint16_t base_table_length; + uint8_t spec_rev; + uint8_t checksum; + uint8_t oem_id[8]; + uint8_t product_id[12]; + uint32_t oem_table_pointer; + uint16_t oem_table_size; + uint16_t entry_count; + uint32_t apic_address; + uint16_t extended_table_length; + uint8_t extended_table_checksum; + uint8_t reserved; +} __packed *mpcth_t; + +/* Base table entries */ + +#define MPCT_ENTRY_PROCESSOR 0 +#define MPCT_ENTRY_BUS 1 +#define MPCT_ENTRY_IOAPIC 2 +#define MPCT_ENTRY_INT 3 +#define MPCT_ENTRY_LOCAL_INT 4 + +typedef struct PROCENTRY { + uint8_t type; + uint8_t apic_id; + uint8_t apic_version; + uint8_t cpu_flags; + uint32_t cpu_signature; + uint32_t feature_flags; + uint32_t reserved1; + uint32_t reserved2; +} __packed *proc_entry_ptr; + +#define PROCENTRY_FLAG_EN 0x01 +#define PROCENTRY_FLAG_BP 0x02 + +typedef struct BUSENTRY { + uint8_t type; + uint8_t bus_id; + uint8_t bus_type[6]; +} __packed *bus_entry_ptr; + +typedef struct IOAPICENTRY { + uint8_t type; + uint8_t apic_id; + uint8_t apic_version; + uint8_t apic_flags; + uint32_t apic_address; +} __packed *io_apic_entry_ptr; + +#define IOAPICENTRY_FLAG_EN 0x01 + +typedef struct INTENTRY { + uint8_t type; + uint8_t int_type; + uint16_t int_flags; + uint8_t src_bus_id; + uint8_t src_bus_irq; + uint8_t dst_apic_id; + uint8_t dst_apic_int; +} __packed *int_entry_ptr; + +#define INTENTRY_TYPE_INT 0 +#define INTENTRY_TYPE_NMI 1 +#define INTENTRY_TYPE_SMI 2 +#define INTENTRY_TYPE_EXTINT 3 + +#define INTENTRY_FLAGS_POLARITY 0x3 +#define INTENTRY_FLAGS_POLARITY_CONFORM 0x0 +#define INTENTRY_FLAGS_POLARITY_ACTIVEHI 0x1 +#define INTENTRY_FLAGS_POLARITY_ACTIVELO 0x3 +#define INTENTRY_FLAGS_TRIGGER 0xc +#define INTENTRY_FLAGS_TRIGGER_CONFORM 0x0 +#define INTENTRY_FLAGS_TRIGGER_EDGE 0x4 +#define INTENTRY_FLAGS_TRIGGER_LEVEL 0xc + +/* Extended table entries */ + +typedef struct EXTENTRY { + uint8_t type; + uint8_t length; +} __packed *ext_entry_ptr; + +#define MPCT_EXTENTRY_SAS 0x80 +#define MPCT_EXTENTRY_BHD 0x81 +#define MPCT_EXTENTRY_CBASM 0x82 + +typedef struct SASENTRY { + uint8_t type; + uint8_t length; + uint8_t bus_id; + uint8_t address_type; + uint64_t address_base; + uint64_t address_length; +} __packed *sas_entry_ptr; + +#define SASENTRY_TYPE_IO 0 +#define SASENTRY_TYPE_MEMORY 1 +#define SASENTRY_TYPE_PREFETCH 2 + +typedef struct BHDENTRY { + uint8_t type; + uint8_t length; + uint8_t bus_id; + uint8_t bus_info; + uint8_t parent_bus; + uint8_t reserved[3]; +} __packed *bhd_entry_ptr; + +#define BHDENTRY_INFO_SUBTRACTIVE_DECODE 0x1 + +typedef struct CBASMENTRY { + uint8_t type; + uint8_t length; + uint8_t bus_id; + uint8_t address_mod; + uint32_t predefined_range; +} __packed *cbasm_entry_ptr; + +#define CBASMENTRY_ADDRESS_MOD_ADD 0x0 +#define CBASMENTRY_ADDRESS_MOD_SUBTRACT 0x1 + +#define 
CBASMENTRY_RANGE_ISA_IO 0 +#define CBASMENTRY_RANGE_VGA_IO 1 + +#ifdef _KERNEL +struct mptable_hostb_softc { +#ifdef NEW_PCIB + struct pcib_host_resources sc_host_res; + int sc_decodes_vga_io; + int sc_decodes_isa_io; +#endif +}; + +#ifdef NEW_PCIB +void mptable_pci_host_res_init(device_t pcib); +#endif +int mptable_pci_probe_table(int bus); +int mptable_pci_route_interrupt(device_t pcib, device_t dev, int pin); +#endif +#endif /* !__MACHINE_MPTABLE_H__ */ diff --git a/usr/contrib/freebsd/x86/psl.h b/usr/contrib/freebsd/x86/psl.h new file mode 100644 index 0000000000..6934b4feb7 --- /dev/null +++ b/usr/contrib/freebsd/x86/psl.h @@ -0,0 +1,92 @@ +/*- + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)psl.h 5.2 (Berkeley) 1/18/91 + * $FreeBSD: head/sys/x86/include/psl.h 258135 2013-11-14 15:37:20Z emaste $ + */ + +#ifndef _MACHINE_PSL_H_ +#define _MACHINE_PSL_H_ + +/* + * 386 processor status longword. 
+ */ +#define PSL_C 0x00000001 /* carry bit */ +#define PSL_PF 0x00000004 /* parity bit */ +#define PSL_AF 0x00000010 /* bcd carry bit */ +#define PSL_Z 0x00000040 /* zero bit */ +#define PSL_N 0x00000080 /* negative bit */ +#define PSL_T 0x00000100 /* trace enable bit */ +#define PSL_I 0x00000200 /* interrupt enable bit */ +#define PSL_D 0x00000400 /* string instruction direction bit */ +#define PSL_V 0x00000800 /* overflow bit */ +#define PSL_IOPL 0x00003000 /* i/o privilege level */ +#define PSL_NT 0x00004000 /* nested task bit */ +#define PSL_RF 0x00010000 /* resume flag bit */ +#define PSL_VM 0x00020000 /* virtual 8086 mode bit */ +#define PSL_AC 0x00040000 /* alignment checking */ +#define PSL_VIF 0x00080000 /* virtual interrupt enable */ +#define PSL_VIP 0x00100000 /* virtual interrupt pending */ +#define PSL_ID 0x00200000 /* identification bit */ + +/* + * The i486 manual says that we are not supposed to change reserved flags, + * but this is too much trouble since the reserved flags depend on the cpu + * and setting them to their historical values works in practice. + */ +#define PSL_RESERVED_DEFAULT 0x00000002 + +/* + * Initial flags for kernel and user mode. The kernel later inherits + * PSL_I and some other flags from user mode. + */ +#define PSL_KERNEL PSL_RESERVED_DEFAULT +#define PSL_USER (PSL_RESERVED_DEFAULT | PSL_I) + +/* + * Bits that can be changed in user mode on 486's. We allow these bits + * to be changed using ptrace(), sigreturn() and procfs. Setting PS_NT + * is undesirable but it may as well be allowed since users can inflict + * it on the kernel directly. Changes to PSL_AC are silently ignored on + * 386's. + * + * Users are allowed to change the privileged flag PSL_RF. The cpu sets PSL_RF + * in tf_eflags for faults. Debuggers should sometimes set it there too. + * tf_eflags is kept in the signal context during signal handling and there is + * no other place to remember it, so the PSL_RF bit may be corrupted by the + * signal handler without us knowing. Corruption of the PSL_RF bit at worst + * causes one more or one less debugger trap, so allowing it is fairly + * harmless. + */ +#define PSL_USERCHANGE (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_T \ + | PSL_D | PSL_V | PSL_NT | PSL_RF | PSL_AC | PSL_ID) + +#endif /* !_MACHINE_PSL_H_ */ diff --git a/usr/contrib/freebsd/x86/specialreg.h b/usr/contrib/freebsd/x86/specialreg.h new file mode 100644 index 0000000000..bea3122423 --- /dev/null +++ b/usr/contrib/freebsd/x86/specialreg.h @@ -0,0 +1,839 @@ +/*- + * Copyright (c) 1991 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)specialreg.h 7.1 (Berkeley) 5/9/91 + * $FreeBSD: head/sys/x86/include/specialreg.h 273338 2014-10-20 18:09:33Z neel $ + */ + +#ifndef _MACHINE_SPECIALREG_H_ +#define _MACHINE_SPECIALREG_H_ + +/* + * Bits in 386 special registers: + */ +#define CR0_PE 0x00000001 /* Protected mode Enable */ +#define CR0_MP 0x00000002 /* "Math" (fpu) Present */ +#define CR0_EM 0x00000004 /* EMulate FPU instructions. (trap ESC only) */ +#define CR0_TS 0x00000008 /* Task Switched (if MP, trap ESC and WAIT) */ +#define CR0_PG 0x80000000 /* PaGing enable */ + +/* + * Bits in 486 special registers: + */ +#define CR0_NE 0x00000020 /* Numeric Error enable (EX16 vs IRQ13) */ +#define CR0_WP 0x00010000 /* Write Protect (honor page protect in + all modes) */ +#define CR0_AM 0x00040000 /* Alignment Mask (set to enable AC flag) */ +#define CR0_NW 0x20000000 /* Not Write-through */ +#define CR0_CD 0x40000000 /* Cache Disable */ + +#define CR3_PCID_SAVE 0x8000000000000000 + +/* + * Bits in PPro special registers + */ +#define CR4_VME 0x00000001 /* Virtual 8086 mode extensions */ +#define CR4_PVI 0x00000002 /* Protected-mode virtual interrupts */ +#define CR4_TSD 0x00000004 /* Time stamp disable */ +#define CR4_DE 0x00000008 /* Debugging extensions */ +#define CR4_PSE 0x00000010 /* Page size extensions */ +#define CR4_PAE 0x00000020 /* Physical address extension */ +#define CR4_MCE 0x00000040 /* Machine check enable */ +#define CR4_PGE 0x00000080 /* Page global enable */ +#define CR4_PCE 0x00000100 /* Performance monitoring counter enable */ +#define CR4_FXSR 0x00000200 /* Fast FPU save/restore used by OS */ +#define CR4_XMM 0x00000400 /* enable SIMD/MMX2 to use except 16 */ +#define CR4_VMXE 0x00002000 /* enable VMX operation (Intel-specific) */ +#define CR4_FSGSBASE 0x00010000 /* Enable FS/GS BASE accessing instructions */ +#define CR4_PCIDE 0x00020000 /* Enable Context ID */ +#define CR4_XSAVE 0x00040000 /* XSETBV/XGETBV */ +#define CR4_SMEP 0x00100000 /* Supervisor-Mode Execution Prevention */ + +/* + * Bits in AMD64 special registers. EFER is 64 bits wide. 
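
The EFER bits defined just below are what host virtualization code toggles to light up the hardware features; enabling AMD SVM on a CPU, for instance, is a read-modify-write of EFER. A hedged sketch follows: rdmsr()/wrmsr() stand in for the kernel's MSR accessors, MSR_EFER (0xc0000080) is defined with the other amd64 MSRs later in this header, and a real implementation must also honor the VM_CR SVM-disable lock:

#include <stdint.h>

extern uint64_t rdmsr(uint32_t msr);		/* assumed kernel helpers */
extern void wrmsr(uint32_t msr, uint64_t val);

static void
svm_enable_cpu(void)
{
	uint64_t efer = rdmsr(MSR_EFER);

	/* Set EFER.SVME (EFER_SVM below) if it is not already on. */
	if ((efer & EFER_SVM) == 0)
		wrmsr(MSR_EFER, efer | EFER_SVM);
}
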
+ */ +#define EFER_SCE 0x000000001 /* System Call Extensions (R/W) */ +#define EFER_LME 0x000000100 /* Long mode enable (R/W) */ +#define EFER_LMA 0x000000400 /* Long mode active (R) */ +#define EFER_NXE 0x000000800 /* PTE No-Execute bit enable (R/W) */ +#define EFER_SVM 0x000001000 /* SVM enable bit for AMD, reserved for Intel */ + +/* + * Intel Extended Features registers + */ +#define XCR0 0 /* XFEATURE_ENABLED_MASK register */ + +#define XFEATURE_ENABLED_X87 0x00000001 +#define XFEATURE_ENABLED_SSE 0x00000002 +#define XFEATURE_ENABLED_YMM_HI128 0x00000004 +#define XFEATURE_ENABLED_AVX XFEATURE_ENABLED_YMM_HI128 +#define XFEATURE_ENABLED_BNDREGS 0x00000008 +#define XFEATURE_ENABLED_BNDCSR 0x00000010 +#define XFEATURE_ENABLED_OPMASK 0x00000020 +#define XFEATURE_ENABLED_ZMM_HI256 0x00000040 +#define XFEATURE_ENABLED_HI16_ZMM 0x00000080 + +#define XFEATURE_AVX \ + (XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE | XFEATURE_ENABLED_AVX) +#define XFEATURE_AVX512 \ + (XFEATURE_ENABLED_OPMASK | XFEATURE_ENABLED_ZMM_HI256 | \ + XFEATURE_ENABLED_HI16_ZMM) +#define XFEATURE_MPX \ + (XFEATURE_ENABLED_BNDREGS | XFEATURE_ENABLED_BNDCSR) + +/* + * CPUID instruction features register + */ +#define CPUID_FPU 0x00000001 +#define CPUID_VME 0x00000002 +#define CPUID_DE 0x00000004 +#define CPUID_PSE 0x00000008 +#define CPUID_TSC 0x00000010 +#define CPUID_MSR 0x00000020 +#define CPUID_PAE 0x00000040 +#define CPUID_MCE 0x00000080 +#define CPUID_CX8 0x00000100 +#define CPUID_APIC 0x00000200 +#define CPUID_B10 0x00000400 +#define CPUID_SEP 0x00000800 +#define CPUID_MTRR 0x00001000 +#define CPUID_PGE 0x00002000 +#define CPUID_MCA 0x00004000 +#define CPUID_CMOV 0x00008000 +#define CPUID_PAT 0x00010000 +#define CPUID_PSE36 0x00020000 +#define CPUID_PSN 0x00040000 +#define CPUID_CLFSH 0x00080000 +#define CPUID_B20 0x00100000 +#define CPUID_DS 0x00200000 +#define CPUID_ACPI 0x00400000 +#define CPUID_MMX 0x00800000 +#define CPUID_FXSR 0x01000000 +#define CPUID_SSE 0x02000000 +#define CPUID_XMM 0x02000000 +#define CPUID_SSE2 0x04000000 +#define CPUID_SS 0x08000000 +#define CPUID_HTT 0x10000000 +#define CPUID_TM 0x20000000 +#define CPUID_IA64 0x40000000 +#define CPUID_PBE 0x80000000 + +#define CPUID2_SSE3 0x00000001 +#define CPUID2_PCLMULQDQ 0x00000002 +#define CPUID2_DTES64 0x00000004 +#define CPUID2_MON 0x00000008 +#define CPUID2_DS_CPL 0x00000010 +#define CPUID2_VMX 0x00000020 +#define CPUID2_SMX 0x00000040 +#define CPUID2_EST 0x00000080 +#define CPUID2_TM2 0x00000100 +#define CPUID2_SSSE3 0x00000200 +#define CPUID2_CNXTID 0x00000400 +#define CPUID2_FMA 0x00001000 +#define CPUID2_CX16 0x00002000 +#define CPUID2_XTPR 0x00004000 +#define CPUID2_PDCM 0x00008000 +#define CPUID2_PCID 0x00020000 +#define CPUID2_DCA 0x00040000 +#define CPUID2_SSE41 0x00080000 +#define CPUID2_SSE42 0x00100000 +#define CPUID2_X2APIC 0x00200000 +#define CPUID2_MOVBE 0x00400000 +#define CPUID2_POPCNT 0x00800000 +#define CPUID2_TSCDLT 0x01000000 +#define CPUID2_AESNI 0x02000000 +#define CPUID2_XSAVE 0x04000000 +#define CPUID2_OSXSAVE 0x08000000 +#define CPUID2_AVX 0x10000000 +#define CPUID2_F16C 0x20000000 +#define CPUID2_RDRAND 0x40000000 +#define CPUID2_HV 0x80000000 + +/* + * Important bits in the Thermal and Power Management flags + * CPUID.6 EAX and ECX. 
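
Given the CPUID2_* bits above (CPUID leaf 1, %ecx), feature detection reduces to a single mask test. An illustrative sketch, not from the import, using GCC-style inline assembly; cpuid_leaf1_ecx is an invented helper, and kernel code would normally go through its own cpuid wrappers:

#include <stdint.h>

static inline uint32_t
cpuid_leaf1_ecx(void)
{
	uint32_t eax, ebx, ecx, edx;

	__asm__ __volatile__("cpuid"
	    : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
	    : "a" (1), "c" (0));
	return (ecx);
}

static int
vmx_supported(void)
{
	return ((cpuid_leaf1_ecx() & CPUID2_VMX) != 0);
}
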
+ */ +#define CPUTPM1_SENSOR 0x00000001 +#define CPUTPM1_TURBO 0x00000002 +#define CPUTPM1_ARAT 0x00000004 +#define CPUTPM2_EFFREQ 0x00000001 + +/* + * Important bits in the AMD extended cpuid flags + */ +#define AMDID_SYSCALL 0x00000800 +#define AMDID_MP 0x00080000 +#define AMDID_NX 0x00100000 +#define AMDID_EXT_MMX 0x00400000 +#define AMDID_FFXSR 0x01000000 +#define AMDID_PAGE1GB 0x04000000 +#define AMDID_RDTSCP 0x08000000 +#define AMDID_LM 0x20000000 +#define AMDID_EXT_3DNOW 0x40000000 +#define AMDID_3DNOW 0x80000000 + +#define AMDID2_LAHF 0x00000001 +#define AMDID2_CMP 0x00000002 +#define AMDID2_SVM 0x00000004 +#define AMDID2_EXT_APIC 0x00000008 +#define AMDID2_CR8 0x00000010 +#define AMDID2_ABM 0x00000020 +#define AMDID2_SSE4A 0x00000040 +#define AMDID2_MAS 0x00000080 +#define AMDID2_PREFETCH 0x00000100 +#define AMDID2_OSVW 0x00000200 +#define AMDID2_IBS 0x00000400 +#define AMDID2_XOP 0x00000800 +#define AMDID2_SKINIT 0x00001000 +#define AMDID2_WDT 0x00002000 +#define AMDID2_LWP 0x00008000 +#define AMDID2_FMA4 0x00010000 +#define AMDID2_TCE 0x00020000 +#define AMDID2_NODE_ID 0x00080000 +#define AMDID2_TBM 0x00200000 +#define AMDID2_TOPOLOGY 0x00400000 +#define AMDID2_PCXC 0x00800000 +#define AMDID2_PNXC 0x01000000 +#define AMDID2_DBE 0x04000000 +#define AMDID2_PTSC 0x08000000 +#define AMDID2_PTSCEL2I 0x10000000 + +/* + * CPUID instruction 1 eax info + */ +#define CPUID_STEPPING 0x0000000f +#define CPUID_MODEL 0x000000f0 +#define CPUID_FAMILY 0x00000f00 +#define CPUID_EXT_MODEL 0x000f0000 +#define CPUID_EXT_FAMILY 0x0ff00000 +#ifdef __i386__ +#define CPUID_TO_MODEL(id) \ + ((((id) & CPUID_MODEL) >> 4) | \ + ((((id) & CPUID_FAMILY) >= 0x600) ? \ + (((id) & CPUID_EXT_MODEL) >> 12) : 0)) +#define CPUID_TO_FAMILY(id) \ + ((((id) & CPUID_FAMILY) >> 8) + \ + ((((id) & CPUID_FAMILY) == 0xf00) ? \ + (((id) & CPUID_EXT_FAMILY) >> 20) : 0)) +#else +#define CPUID_TO_MODEL(id) \ + ((((id) & CPUID_MODEL) >> 4) | \ + (((id) & CPUID_EXT_MODEL) >> 12)) +#define CPUID_TO_FAMILY(id) \ + ((((id) & CPUID_FAMILY) >> 8) + \ + (((id) & CPUID_EXT_FAMILY) >> 20)) +#endif + +/* + * CPUID instruction 1 ebx info + */ +#define CPUID_BRAND_INDEX 0x000000ff +#define CPUID_CLFUSH_SIZE 0x0000ff00 +#define CPUID_HTT_CORES 0x00ff0000 +#define CPUID_LOCAL_APIC_ID 0xff000000 + +/* + * CPUID instruction 5 info + */ +#define CPUID5_MON_MIN_SIZE 0x0000ffff /* eax */ +#define CPUID5_MON_MAX_SIZE 0x0000ffff /* ebx */ +#define CPUID5_MON_MWAIT_EXT 0x00000001 /* ecx */ +#define CPUID5_MWAIT_INTRBREAK 0x00000002 /* ecx */ + +/* + * MWAIT cpu power states. Lower 4 bits are sub-states. + */ +#define MWAIT_C0 0xf0 +#define MWAIT_C1 0x00 +#define MWAIT_C2 0x10 +#define MWAIT_C3 0x20 +#define MWAIT_C4 0x30 + +/* + * MWAIT extensions. + */ +/* Interrupt breaks MWAIT even when masked. */ +#define MWAIT_INTRBREAK 0x00000001 + +/* + * CPUID instruction 6 ecx info + */ +#define CPUID_PERF_STAT 0x00000001 +#define CPUID_PERF_BIAS 0x00000008 + +/* + * CPUID instruction 0xb ebx info. 
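
A worked example of the CPUID_TO_FAMILY/CPUID_TO_MODEL macros above: on amd64 they splice the extended family and model fields back into the base values. For a leaf-1 %eax of 0x000306c3 (a Haswell-class part, chosen purely for illustration):

#include <assert.h>
#include <stdint.h>

static void
cpuid_decode_example(void)
{
	uint32_t id = 0x000306c3;	/* sample CPUID.1 %eax */

	/* family = 0x6; model = 0xc | (extended model 0x3 << 4) = 0x3c */
	assert(CPUID_TO_FAMILY(id) == 0x6);
	assert(CPUID_TO_MODEL(id) == 0x3c);
}
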
+ */ +#define CPUID_TYPE_INVAL 0 +#define CPUID_TYPE_SMT 1 +#define CPUID_TYPE_CORE 2 + +/* + * CPUID instruction 0xd Processor Extended State Enumeration Sub-leaf 1 + */ +#define CPUID_EXTSTATE_XSAVEOPT 0x00000001 +#define CPUID_EXTSTATE_XSAVEC 0x00000002 +#define CPUID_EXTSTATE_XINUSE 0x00000004 +#define CPUID_EXTSTATE_XSAVES 0x00000008 + +/* + * AMD extended function 8000_0007h edx info + */ +#define AMDPM_TS 0x00000001 +#define AMDPM_FID 0x00000002 +#define AMDPM_VID 0x00000004 +#define AMDPM_TTP 0x00000008 +#define AMDPM_TM 0x00000010 +#define AMDPM_STC 0x00000020 +#define AMDPM_100MHZ_STEPS 0x00000040 +#define AMDPM_HW_PSTATE 0x00000080 +#define AMDPM_TSC_INVARIANT 0x00000100 +#define AMDPM_CPB 0x00000200 + +/* + * AMD extended function 8000_0008h ecx info + */ +#define AMDID_CMP_CORES 0x000000ff +#define AMDID_COREID_SIZE 0x0000f000 +#define AMDID_COREID_SIZE_SHIFT 12 + +/* + * CPUID instruction 7 Structured Extended Features, leaf 0 ebx info + */ +#define CPUID_STDEXT_FSGSBASE 0x00000001 +#define CPUID_STDEXT_TSC_ADJUST 0x00000002 +#define CPUID_STDEXT_BMI1 0x00000008 +#define CPUID_STDEXT_HLE 0x00000010 +#define CPUID_STDEXT_AVX2 0x00000020 +#define CPUID_STDEXT_SMEP 0x00000080 +#define CPUID_STDEXT_BMI2 0x00000100 +#define CPUID_STDEXT_ERMS 0x00000200 +#define CPUID_STDEXT_INVPCID 0x00000400 +#define CPUID_STDEXT_RTM 0x00000800 +#define CPUID_STDEXT_MPX 0x00004000 +#define CPUID_STDEXT_AVX512F 0x00010000 +#define CPUID_STDEXT_RDSEED 0x00040000 +#define CPUID_STDEXT_ADX 0x00080000 +#define CPUID_STDEXT_SMAP 0x00100000 +#define CPUID_STDEXT_CLFLUSHOPT 0x00800000 +#define CPUID_STDEXT_PROCTRACE 0x02000000 +#define CPUID_STDEXT_AVX512PF 0x04000000 +#define CPUID_STDEXT_AVX512ER 0x08000000 +#define CPUID_STDEXT_AVX512CD 0x10000000 +#define CPUID_STDEXT_SHA 0x20000000 + +/* + * CPUID manufacturers identifiers + */ +#define AMD_VENDOR_ID "AuthenticAMD" +#define CENTAUR_VENDOR_ID "CentaurHauls" +#define CYRIX_VENDOR_ID "CyrixInstead" +#define INTEL_VENDOR_ID "GenuineIntel" +#define NEXGEN_VENDOR_ID "NexGenDriven" +#define NSC_VENDOR_ID "Geode by NSC" +#define RISE_VENDOR_ID "RiseRiseRise" +#define SIS_VENDOR_ID "SiS SiS SiS " +#define TRANSMETA_VENDOR_ID "GenuineTMx86" +#define UMC_VENDOR_ID "UMC UMC UMC " + +/* + * Model-specific registers for the i386 family + */ +#define MSR_P5_MC_ADDR 0x000 +#define MSR_P5_MC_TYPE 0x001 +#define MSR_TSC 0x010 +#define MSR_P5_CESR 0x011 +#define MSR_P5_CTR0 0x012 +#define MSR_P5_CTR1 0x013 +#define MSR_IA32_PLATFORM_ID 0x017 +#define MSR_APICBASE 0x01b +#define MSR_EBL_CR_POWERON 0x02a +#define MSR_TEST_CTL 0x033 +#define MSR_IA32_FEATURE_CONTROL 0x03a +#define MSR_BIOS_UPDT_TRIG 0x079 +#define MSR_BBL_CR_D0 0x088 +#define MSR_BBL_CR_D1 0x089 +#define MSR_BBL_CR_D2 0x08a +#define MSR_BIOS_SIGN 0x08b +#define MSR_PERFCTR0 0x0c1 +#define MSR_PERFCTR1 0x0c2 +#define MSR_PLATFORM_INFO 0x0ce +#define MSR_MPERF 0x0e7 +#define MSR_APERF 0x0e8 +#define MSR_IA32_EXT_CONFIG 0x0ee /* Undocumented. 
Core Solo/Duo only */ +#define MSR_MTRRcap 0x0fe +#define MSR_BBL_CR_ADDR 0x116 +#define MSR_BBL_CR_DECC 0x118 +#define MSR_BBL_CR_CTL 0x119 +#define MSR_BBL_CR_TRIG 0x11a +#define MSR_BBL_CR_BUSY 0x11b +#define MSR_BBL_CR_CTL3 0x11e +#define MSR_SYSENTER_CS_MSR 0x174 +#define MSR_SYSENTER_ESP_MSR 0x175 +#define MSR_SYSENTER_EIP_MSR 0x176 +#define MSR_MCG_CAP 0x179 +#define MSR_MCG_STATUS 0x17a +#define MSR_MCG_CTL 0x17b +#define MSR_EVNTSEL0 0x186 +#define MSR_EVNTSEL1 0x187 +#define MSR_THERM_CONTROL 0x19a +#define MSR_THERM_INTERRUPT 0x19b +#define MSR_THERM_STATUS 0x19c +#define MSR_IA32_MISC_ENABLE 0x1a0 +#define MSR_IA32_TEMPERATURE_TARGET 0x1a2 +#define MSR_TURBO_RATIO_LIMIT 0x1ad +#define MSR_TURBO_RATIO_LIMIT1 0x1ae +#define MSR_DEBUGCTLMSR 0x1d9 +#define MSR_LASTBRANCHFROMIP 0x1db +#define MSR_LASTBRANCHTOIP 0x1dc +#define MSR_LASTINTFROMIP 0x1dd +#define MSR_LASTINTTOIP 0x1de +#define MSR_ROB_CR_BKUPTMPDR6 0x1e0 +#define MSR_MTRRVarBase 0x200 +#define MSR_MTRR64kBase 0x250 +#define MSR_MTRR16kBase 0x258 +#define MSR_MTRR4kBase 0x268 +#define MSR_PAT 0x277 +#define MSR_MC0_CTL2 0x280 +#define MSR_MTRRdefType 0x2ff +#define MSR_MC0_CTL 0x400 +#define MSR_MC0_STATUS 0x401 +#define MSR_MC0_ADDR 0x402 +#define MSR_MC0_MISC 0x403 +#define MSR_MC1_CTL 0x404 +#define MSR_MC1_STATUS 0x405 +#define MSR_MC1_ADDR 0x406 +#define MSR_MC1_MISC 0x407 +#define MSR_MC2_CTL 0x408 +#define MSR_MC2_STATUS 0x409 +#define MSR_MC2_ADDR 0x40a +#define MSR_MC2_MISC 0x40b +#define MSR_MC3_CTL 0x40c +#define MSR_MC3_STATUS 0x40d +#define MSR_MC3_ADDR 0x40e +#define MSR_MC3_MISC 0x40f +#define MSR_MC4_CTL 0x410 +#define MSR_MC4_STATUS 0x411 +#define MSR_MC4_ADDR 0x412 +#define MSR_MC4_MISC 0x413 +#define MSR_RAPL_POWER_UNIT 0x606 +#define MSR_PKG_ENERGY_STATUS 0x611 +#define MSR_DRAM_ENERGY_STATUS 0x619 +#define MSR_PP0_ENERGY_STATUS 0x639 +#define MSR_PP1_ENERGY_STATUS 0x641 + +/* + * VMX MSRs + */ +#define MSR_VMX_BASIC 0x480 +#define MSR_VMX_PINBASED_CTLS 0x481 +#define MSR_VMX_PROCBASED_CTLS 0x482 +#define MSR_VMX_EXIT_CTLS 0x483 +#define MSR_VMX_ENTRY_CTLS 0x484 +#define MSR_VMX_CR0_FIXED0 0x486 +#define MSR_VMX_CR0_FIXED1 0x487 +#define MSR_VMX_CR4_FIXED0 0x488 +#define MSR_VMX_CR4_FIXED1 0x489 +#define MSR_VMX_PROCBASED_CTLS2 0x48b +#define MSR_VMX_EPT_VPID_CAP 0x48c +#define MSR_VMX_TRUE_PINBASED_CTLS 0x48d +#define MSR_VMX_TRUE_PROCBASED_CTLS 0x48e +#define MSR_VMX_TRUE_EXIT_CTLS 0x48f +#define MSR_VMX_TRUE_ENTRY_CTLS 0x490 + +/* + * X2APIC MSRs + */ +#define MSR_APIC_ID 0x802 +#define MSR_APIC_VERSION 0x803 +#define MSR_APIC_TPR 0x808 +#define MSR_APIC_EOI 0x80b +#define MSR_APIC_LDR 0x80d +#define MSR_APIC_SVR 0x80f +#define MSR_APIC_ISR0 0x810 +#define MSR_APIC_ISR1 0x811 +#define MSR_APIC_ISR2 0x812 +#define MSR_APIC_ISR3 0x813 +#define MSR_APIC_ISR4 0x814 +#define MSR_APIC_ISR5 0x815 +#define MSR_APIC_ISR6 0x816 +#define MSR_APIC_ISR7 0x817 +#define MSR_APIC_TMR0 0x818 +#define MSR_APIC_IRR0 0x820 +#define MSR_APIC_ESR 0x828 +#define MSR_APIC_LVT_CMCI 0x82F +#define MSR_APIC_ICR 0x830 +#define MSR_APIC_LVT_TIMER 0x832 +#define MSR_APIC_LVT_THERMAL 0x833 +#define MSR_APIC_LVT_PCINT 0x834 +#define MSR_APIC_LVT_LINT0 0x835 +#define MSR_APIC_LVT_LINT1 0x836 +#define MSR_APIC_LVT_ERROR 0x837 +#define MSR_APIC_ICR_TIMER 0x838 +#define MSR_APIC_CCR_TIMER 0x839 +#define MSR_APIC_DCR_TIMER 0x83e +#define MSR_APIC_SELF_IPI 0x83f + +#define MSR_IA32_XSS 0xda0 + +/* + * Constants related to MSR's. 
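+ *
+ * For example, the APICBASE_* masks below decompose a value read from
+ * MSR_APICBASE; extracting the physical base might look like this
+ * (illustrative only, rdmsr() standing in for whatever MSR accessor
+ * the consumer uses):
+ *
+ *	base_pa = rdmsr(MSR_APICBASE) & APICBASE_ADDRESS;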
+ */ +#define APICBASE_RESERVED 0x000002ff +#define APICBASE_BSP 0x00000100 +#define APICBASE_X2APIC 0x00000400 +#define APICBASE_ENABLED 0x00000800 +#define APICBASE_ADDRESS 0xfffff000 + +/* MSR_IA32_FEATURE_CONTROL related */ +#define IA32_FEATURE_CONTROL_LOCK 0x01 /* lock bit */ +#define IA32_FEATURE_CONTROL_SMX_EN 0x02 /* enable VMX inside SMX */ +#define IA32_FEATURE_CONTROL_VMX_EN 0x04 /* enable VMX outside SMX */ + +/* + * PAT modes. + */ +#define PAT_UNCACHEABLE 0x00 +#define PAT_WRITE_COMBINING 0x01 +#define PAT_WRITE_THROUGH 0x04 +#define PAT_WRITE_PROTECTED 0x05 +#define PAT_WRITE_BACK 0x06 +#define PAT_UNCACHED 0x07 +#define PAT_VALUE(i, m) ((long long)(m) << (8 * (i))) +#define PAT_MASK(i) PAT_VALUE(i, 0xff) + +/* + * Constants related to MTRRs + */ +#define MTRR_UNCACHEABLE 0x00 +#define MTRR_WRITE_COMBINING 0x01 +#define MTRR_WRITE_THROUGH 0x04 +#define MTRR_WRITE_PROTECTED 0x05 +#define MTRR_WRITE_BACK 0x06 +#define MTRR_N64K 8 /* numbers of fixed-size entries */ +#define MTRR_N16K 16 +#define MTRR_N4K 64 +#define MTRR_CAP_WC 0x0000000000000400 +#define MTRR_CAP_FIXED 0x0000000000000100 +#define MTRR_CAP_VCNT 0x00000000000000ff +#define MTRR_DEF_ENABLE 0x0000000000000800 +#define MTRR_DEF_FIXED_ENABLE 0x0000000000000400 +#define MTRR_DEF_TYPE 0x00000000000000ff +#define MTRR_PHYSBASE_PHYSBASE 0x000ffffffffff000 +#define MTRR_PHYSBASE_TYPE 0x00000000000000ff +#define MTRR_PHYSMASK_PHYSMASK 0x000ffffffffff000 +#define MTRR_PHYSMASK_VALID 0x0000000000000800 + +/* + * Cyrix configuration registers, accessible as IO ports. + */ +#define CCR0 0xc0 /* Configuration control register 0 */ +#define CCR0_NC0 0x01 /* First 64K of each 1M memory region is + non-cacheable */ +#define CCR0_NC1 0x02 /* 640K-1M region is non-cacheable */ +#define CCR0_A20M 0x04 /* Enables A20M# input pin */ +#define CCR0_KEN 0x08 /* Enables KEN# input pin */ +#define CCR0_FLUSH 0x10 /* Enables FLUSH# input pin */ +#define CCR0_BARB 0x20 /* Flushes internal cache when entering hold + state */ +#define CCR0_CO 0x40 /* Cache org: 1=direct mapped, 0=2x set + assoc */ +#define CCR0_SUSPEND 0x80 /* Enables SUSP# and SUSPA# pins */ + +#define CCR1 0xc1 /* Configuration control register 1 */ +#define CCR1_RPL 0x01 /* Enables RPLSET and RPLVAL# pins */ +#define CCR1_SMI 0x02 /* Enables SMM pins */ +#define CCR1_SMAC 0x04 /* System management memory access */ +#define CCR1_MMAC 0x08 /* Main memory access */ +#define CCR1_NO_LOCK 0x10 /* Negate LOCK# */ +#define CCR1_SM3 0x80 /* SMM address space address region 3 */ + +#define CCR2 0xc2 +#define CCR2_WB 0x02 /* Enables WB cache interface pins */ +#define CCR2_SADS 0x02 /* Slow ADS */ +#define CCR2_LOCK_NW 0x04 /* LOCK NW Bit */ +#define CCR2_SUSP_HLT 0x08 /* Suspend on HALT */ +#define CCR2_WT1 0x10 /* WT region 1 */ +#define CCR2_WPR1 0x10 /* Write-protect region 1 */ +#define CCR2_BARB 0x20 /* Flushes write-back cache when entering + hold state. 
*/
+#define CCR2_BWRT 0x40 /* Enables burst write cycles */
+#define CCR2_USE_SUSP 0x80 /* Enables suspend pins */
+
+#define CCR3 0xc3
+#define CCR3_SMILOCK 0x01 /* SMM register lock */
+#define CCR3_NMI 0x02 /* Enables NMI during SMM */
+#define CCR3_LINBRST 0x04 /* Linear address burst cycles */
+#define CCR3_SMMMODE 0x08 /* SMM Mode */
+#define CCR3_MAPEN0 0x10 /* Enables Map0 */
+#define CCR3_MAPEN1 0x20 /* Enables Map1 */
+#define CCR3_MAPEN2 0x40 /* Enables Map2 */
+#define CCR3_MAPEN3 0x80 /* Enables Map3 */
+
+#define CCR4 0xe8
+#define CCR4_IOMASK 0x07
+#define CCR4_MEM 0x08 /* Enables memory bypassing */
+#define CCR4_DTE 0x10 /* Enables directory table entry cache */
+#define CCR4_FASTFPE 0x20 /* Fast FPU exception */
+#define CCR4_CPUID 0x80 /* Enables CPUID instruction */
+
+#define CCR5 0xe9
+#define CCR5_WT_ALLOC 0x01 /* Write-through allocate */
+#define CCR5_SLOP 0x02 /* LOOP instruction slowed down */
+#define CCR5_LBR1 0x10 /* Local bus region 1 */
+#define CCR5_ARREN 0x20 /* Enables ARR region */
+
+#define CCR6 0xea
+
+#define CCR7 0xeb
+
+/* Performance Control Register (5x86 only). */
+#define PCR0 0x20
+#define PCR0_RSTK 0x01 /* Enables return stack */
+#define PCR0_BTB 0x02 /* Enables branch target buffer */
+#define PCR0_LOOP 0x04 /* Enables loop */
+#define PCR0_AIS 0x08 /* Enables all instructions stalled to
+ serialize pipe. */
+#define PCR0_MLR 0x10 /* Enables reordering of misaligned loads */
+#define PCR0_BTBRT 0x40 /* Enables BTB test register. */
+#define PCR0_LSSER 0x80 /* Disable reorder */
+
+/* Device Identification Registers */
+#define DIR0 0xfe
+#define DIR1 0xff
+
+/*
+ * Machine Check register constants.
+ */
+#define MCG_CAP_COUNT 0x000000ff
+#define MCG_CAP_CTL_P 0x00000100
+#define MCG_CAP_EXT_P 0x00000200
+#define MCG_CAP_CMCI_P 0x00000400
+#define MCG_CAP_TES_P 0x00000800
+#define MCG_CAP_EXT_CNT 0x00ff0000
+#define MCG_CAP_SER_P 0x01000000
+#define MCG_STATUS_RIPV 0x00000001
+#define MCG_STATUS_EIPV 0x00000002
+#define MCG_STATUS_MCIP 0x00000004
+#define MCG_CTL_ENABLE 0xffffffffffffffff
+#define MCG_CTL_DISABLE 0x0000000000000000
+#define MSR_MC_CTL(x) (MSR_MC0_CTL + (x) * 4)
+#define MSR_MC_STATUS(x) (MSR_MC0_STATUS + (x) * 4)
+#define MSR_MC_ADDR(x) (MSR_MC0_ADDR + (x) * 4)
+#define MSR_MC_MISC(x) (MSR_MC0_MISC + (x) * 4)
+#define MSR_MC_CTL2(x) (MSR_MC0_CTL2 + (x)) /* If MCG_CAP_CMCI_P */
+#define MC_STATUS_MCA_ERROR 0x000000000000ffff
+#define MC_STATUS_MODEL_ERROR 0x00000000ffff0000
+#define MC_STATUS_OTHER_INFO 0x01ffffff00000000
+#define MC_STATUS_COR_COUNT 0x001fffc000000000 /* If MCG_CAP_CMCI_P */
+#define MC_STATUS_TES_STATUS 0x0060000000000000 /* If MCG_CAP_TES_P */
+#define MC_STATUS_AR 0x0080000000000000 /* If MCG_CAP_TES_P */
+#define MC_STATUS_S 0x0100000000000000 /* If MCG_CAP_TES_P */
+#define MC_STATUS_PCC 0x0200000000000000
+#define MC_STATUS_ADDRV 0x0400000000000000
+#define MC_STATUS_MISCV 0x0800000000000000
+#define MC_STATUS_EN 0x1000000000000000
+#define MC_STATUS_UC 0x2000000000000000
+#define MC_STATUS_OVER 0x4000000000000000
+#define MC_STATUS_VAL 0x8000000000000000
+#define MC_MISC_RA_LSB 0x000000000000003f /* If MCG_CAP_SER_P */
+#define MC_MISC_ADDRESS_MODE 0x00000000000001c0 /* If MCG_CAP_SER_P */
+#define MC_CTL2_THRESHOLD 0x0000000000007fff
+#define MC_CTL2_CMCI_EN 0x0000000040000000
+
+/*
+ * The following four 3-byte registers control the non-cacheable regions.
+ * These registers must be written as three separate bytes.
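+ * For example (illustrative only), a 64K non-cacheable region starting
+ * at 0x000f0000 would be programmed by writing 0x00, 0x0f and
+ * (0x0 | NCR_SIZE_64K) = 0x05 to NCRx+0, NCRx+1 and NCRx+2
+ * respectively, per the byte layout below.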
+ *
+ * NCRx+0: A31-A24 of starting address
+ * NCRx+1: A23-A16 of starting address
+ * NCRx+2: A15-A12 of starting address | NCR_SIZE_xx.
+ *
+ * The non-cacheable region's starting address must be aligned to the
+ * size indicated by the NCR_SIZE_xx field.
+ */
+#define NCR1 0xc4
+#define NCR2 0xc7
+#define NCR3 0xca
+#define NCR4 0xcd
+
+#define NCR_SIZE_0K 0
+#define NCR_SIZE_4K 1
+#define NCR_SIZE_8K 2
+#define NCR_SIZE_16K 3
+#define NCR_SIZE_32K 4
+#define NCR_SIZE_64K 5
+#define NCR_SIZE_128K 6
+#define NCR_SIZE_256K 7
+#define NCR_SIZE_512K 8
+#define NCR_SIZE_1M 9
+#define NCR_SIZE_2M 10
+#define NCR_SIZE_4M 11
+#define NCR_SIZE_8M 12
+#define NCR_SIZE_16M 13
+#define NCR_SIZE_32M 14
+#define NCR_SIZE_4G 15
+
+/*
+ * The address region registers are used to specify the location and
+ * size for the eight address regions.
+ *
+ * ARRx + 0: A31-A24 of start address
+ * ARRx + 1: A23-A16 of start address
+ * ARRx + 2: A15-A12 of start address | ARR_SIZE_xx
+ */
+#define ARR0 0xc4
+#define ARR1 0xc7
+#define ARR2 0xca
+#define ARR3 0xcd
+#define ARR4 0xd0
+#define ARR5 0xd3
+#define ARR6 0xd6
+#define ARR7 0xd9
+
+#define ARR_SIZE_0K 0
+#define ARR_SIZE_4K 1
+#define ARR_SIZE_8K 2
+#define ARR_SIZE_16K 3
+#define ARR_SIZE_32K 4
+#define ARR_SIZE_64K 5
+#define ARR_SIZE_128K 6
+#define ARR_SIZE_256K 7
+#define ARR_SIZE_512K 8
+#define ARR_SIZE_1M 9
+#define ARR_SIZE_2M 10
+#define ARR_SIZE_4M 11
+#define ARR_SIZE_8M 12
+#define ARR_SIZE_16M 13
+#define ARR_SIZE_32M 14
+#define ARR_SIZE_4G 15
+
+/*
+ * The region control registers specify the attributes associated with
+ * the ARRx address regions.
+ */
+#define RCR0 0xdc
+#define RCR1 0xdd
+#define RCR2 0xde
+#define RCR3 0xdf
+#define RCR4 0xe0
+#define RCR5 0xe1
+#define RCR6 0xe2
+#define RCR7 0xe3
+
+#define RCR_RCD 0x01 /* Disables caching for ARRx (x = 0-6). */
+#define RCR_RCE 0x01 /* Enables caching for ARR7. */
+#define RCR_WWO 0x02 /* Weak write ordering. */
+#define RCR_WL 0x04 /* Weak locking. */
+#define RCR_WG 0x08 /* Write gathering. */
+#define RCR_WT 0x10 /* Write-through. */
+#define RCR_NLB 0x20 /* LBA# pin is not asserted.
*/ + +/* AMD Write Allocate Top-Of-Memory and Control Register */ +#define AMD_WT_ALLOC_TME 0x40000 /* top-of-memory enable */ +#define AMD_WT_ALLOC_PRE 0x20000 /* programmable range enable */ +#define AMD_WT_ALLOC_FRE 0x10000 /* fixed (A0000-FFFFF) range enable */ + +/* AMD64 MSR's */ +#define MSR_EFER 0xc0000080 /* extended features */ +#define MSR_STAR 0xc0000081 /* legacy mode SYSCALL target/cs/ss */ +#define MSR_LSTAR 0xc0000082 /* long mode SYSCALL target rip */ +#define MSR_CSTAR 0xc0000083 /* compat mode SYSCALL target rip */ +#define MSR_SF_MASK 0xc0000084 /* syscall flags mask */ +#define MSR_FSBASE 0xc0000100 /* base address of the %fs "segment" */ +#define MSR_GSBASE 0xc0000101 /* base address of the %gs "segment" */ +#define MSR_KGSBASE 0xc0000102 /* base address of the kernel %gs */ +#define MSR_PERFEVSEL0 0xc0010000 +#define MSR_PERFEVSEL1 0xc0010001 +#define MSR_PERFEVSEL2 0xc0010002 +#define MSR_PERFEVSEL3 0xc0010003 +#define MSR_K7_PERFCTR0 0xc0010004 +#define MSR_K7_PERFCTR1 0xc0010005 +#define MSR_K7_PERFCTR2 0xc0010006 +#define MSR_K7_PERFCTR3 0xc0010007 +#define MSR_SYSCFG 0xc0010010 +#define MSR_HWCR 0xc0010015 +#define MSR_IORRBASE0 0xc0010016 +#define MSR_IORRMASK0 0xc0010017 +#define MSR_IORRBASE1 0xc0010018 +#define MSR_IORRMASK1 0xc0010019 +#define MSR_TOP_MEM 0xc001001a /* boundary for ram below 4G */ +#define MSR_TOP_MEM2 0xc001001d /* boundary for ram above 4G */ +#define MSR_NB_CFG1 0xc001001f /* NB configuration 1 */ +#define MSR_P_STATE_LIMIT 0xc0010061 /* P-state Current Limit Register */ +#define MSR_P_STATE_CONTROL 0xc0010062 /* P-state Control Register */ +#define MSR_P_STATE_STATUS 0xc0010063 /* P-state Status Register */ +#define MSR_P_STATE_CONFIG(n) (0xc0010064 + (n)) /* P-state Config */ +#define MSR_SMM_ADDR 0xc0010112 /* SMM TSEG base address */ +#define MSR_SMM_MASK 0xc0010113 /* SMM TSEG address mask */ +#define MSR_IC_CFG 0xc0011021 /* Instruction Cache Configuration */ +#define MSR_K8_UCODE_UPDATE 0xc0010020 /* update microcode */ +#define MSR_MC0_CTL_MASK 0xc0010044 +#define MSR_VM_CR 0xc0010114 /* SVM: feature control */ +#define MSR_VM_HSAVE_PA 0xc0010117 /* SVM: host save area address */ + +/* MSR_VM_CR related */ +#define VM_CR_SVMDIS 0x10 /* SVM: disabled by BIOS */ + +/* VIA ACE crypto featureset: for via_feature_rng */ +#define VIA_HAS_RNG 1 /* cpu has RNG */ + +/* VIA ACE crypto featureset: for via_feature_xcrypt */ +#define VIA_HAS_AES 1 /* cpu has AES */ +#define VIA_HAS_SHA 2 /* cpu has SHA1 & SHA256 */ +#define VIA_HAS_MM 4 /* cpu has RSA instructions */ +#define VIA_HAS_AESCTR 8 /* cpu has AES-CTR instructions */ + +/* Centaur Extended Feature flags */ +#define VIA_CPUID_HAS_RNG 0x000004 +#define VIA_CPUID_DO_RNG 0x000008 +#define VIA_CPUID_HAS_ACE 0x000040 +#define VIA_CPUID_DO_ACE 0x000080 +#define VIA_CPUID_HAS_ACE2 0x000100 +#define VIA_CPUID_DO_ACE2 0x000200 +#define VIA_CPUID_HAS_PHE 0x000400 +#define VIA_CPUID_DO_PHE 0x000800 +#define VIA_CPUID_HAS_PMM 0x001000 +#define VIA_CPUID_DO_PMM 0x002000 + +/* VIA ACE xcrypt-* instruction context control options */ +#define VIA_CRYPT_CWLO_ROUND_M 0x0000000f +#define VIA_CRYPT_CWLO_ALG_M 0x00000070 +#define VIA_CRYPT_CWLO_ALG_AES 0x00000000 +#define VIA_CRYPT_CWLO_KEYGEN_M 0x00000080 +#define VIA_CRYPT_CWLO_KEYGEN_HW 0x00000000 +#define VIA_CRYPT_CWLO_KEYGEN_SW 0x00000080 +#define VIA_CRYPT_CWLO_NORMAL 0x00000000 +#define VIA_CRYPT_CWLO_INTERMEDIATE 0x00000100 +#define VIA_CRYPT_CWLO_ENCRYPT 0x00000000 +#define VIA_CRYPT_CWLO_DECRYPT 0x00000200 +#define VIA_CRYPT_CWLO_KEY128 
0x0000000a /* 128bit, 10 rds */
+#define VIA_CRYPT_CWLO_KEY192 0x0000040c /* 192bit, 12 rds */
+#define VIA_CRYPT_CWLO_KEY256 0x0000080e /* 256bit, 14 rds */
+
+#endif /* !_MACHINE_SPECIALREG_H_ */
diff --git a/usr/src/cmd/bhyve/Makefile b/usr/src/cmd/bhyve/Makefile
new file mode 100644
index 0000000000..f47daead31
--- /dev/null
+++ b/usr/src/cmd/bhyve/Makefile
@@ -0,0 +1,41 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2014 Pluribus Networks Inc.
+#
+
+PROG = bhyve
+
+include ../Makefile.cmd
+
+$(BUILD64)SUBDIRS += $(MACH64)
+
+all := TARGET = all
+install := TARGET = install
+clean := TARGET = clean
+clobber := TARGET = clobber
+lint := TARGET = lint
+
+.KEEP_STATE:
+
+all clean clobber lint: $(SUBDIRS)
+
+install: $(SUBDIRS)
+ -$(RM) $(ROOTUSRSBINPROG)
+ -$(LN) $(ISAEXEC) $(ROOTUSRSBINPROG)
+
+$(SUBDIRS): FRC
+ @cd $@; pwd; $(MAKE) CW_NO_SHADOW=true __GNUC= $(TARGET)
+
+FRC:
+
+include ../Makefile.targ
diff --git a/usr/src/cmd/bhyve/Makefile.com b/usr/src/cmd/bhyve/Makefile.com
new file mode 100644
index 0000000000..4a92b622ab
--- /dev/null
+++ b/usr/src/cmd/bhyve/Makefile.com
@@ -0,0 +1,94 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2015 Pluribus Networks Inc.
+# + +PROG= bhyve + +SRCS = atkbdc.c \ + bhyvegc.c \ + bhyverun.c \ + block_if.c \ + console.c \ + consport.c \ + inout.c \ + ioapic.c \ + mem.c \ + mptbl.c \ + pci_ahci.c \ + pci_emul.c \ + pci_hostbridge.c \ + pci_irq.c \ + pci_lpc.c \ + pci_virtio_block.c \ + pci_virtio_net.c \ + pci_virtio_viona.c \ + pm.c \ + pmtmr.c \ + post.c \ + ps2kbd.c \ + ps2mouse.c \ + rfb.c \ + rtc.c \ + smbiostbl.c \ + uart_emul.c \ + vga.c \ + virtio.c \ + vmm_instruction_emul.c \ + xmsr.c \ + spinup_ap.c \ + bhyve_sol_glue.c + +OBJS = $(SRCS:.c=.o) + +include ../../Makefile.cmd + +.KEEP_STATE: + +CFLAGS += $(CCVERBOSE) -_gcc=-Wimplicit-function-declaration +CFLAGS64 += $(CCVERBOSE) -_gcc=-Wimplicit-function-declaration +CPPFLAGS = -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd $(CPPFLAGS.master) \ + -I$(ROOT)/usr/platform/i86pc/include \ + -I$(SRC)/uts/i86pc/io/vmm \ + -I$(SRC)/uts/common \ + -I$(SRC)/uts/i86pc \ + -I$(SRC)/lib/libdladm/common +LDLIBS += -lsocket -lnsl -ldlpi -ldladm -lkstat -lmd -luuid -lvmmapi + +POST_PROCESS += ; $(GENSETDEFS) $@ + +all: $(PROG) + +$(PROG): $(OBJS) + $(LINK.c) -o $@ $(OBJS) $(LDFLAGS) $(LDLIBS) + $(POST_PROCESS) + +install: all $(ROOTUSRSBINPROG) + +clean: + $(RM) $(OBJS) + +lint: lint_SRCS + +include ../../Makefile.targ + +%.o: ../%.c + $(COMPILE.c) $< + $(POST_PROCESS_O) + +%.o: $(SRC)/uts/i86pc/io/vmm/%.c + $(COMPILE.c) $< + $(POST_PROCESS_O) + +%.o: ../%.s + $(COMPILE.s) $< diff --git a/usr/src/cmd/bhyve/acpi.h b/usr/src/cmd/bhyve/acpi.h new file mode 100644 index 0000000000..477f827286 --- /dev/null +++ b/usr/src/cmd/bhyve/acpi.h @@ -0,0 +1,54 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD: head/usr.sbin/bhyve/acpi.h 266125 2014-05-15 14:16:55Z jhb $ + */ + +#ifndef _ACPI_H_ +#define _ACPI_H_ + +#define SCI_INT 9 + +#define SMI_CMD 0xb2 +#define BHYVE_ACPI_ENABLE 0xa0 +#define BHYVE_ACPI_DISABLE 0xa1 + +#define PM1A_EVT_ADDR 0x400 +#define PM1A_CNT_ADDR 0x404 + +#define IO_PMTMR 0x408 /* 4-byte i/o port for the timer */ + +struct vmctx; + +int acpi_build(struct vmctx *ctx, int ncpu); +void dsdt_line(const char *fmt, ...); +void dsdt_fixed_ioport(uint16_t iobase, uint16_t length); +void dsdt_fixed_irq(uint8_t irq); +void dsdt_fixed_mem32(uint32_t base, uint32_t length); +void dsdt_indent(int levels); +void dsdt_unindent(int levels); +void sci_init(struct vmctx *ctx); + +#endif /* _ACPI_H_ */ diff --git a/usr/src/cmd/bhyve/ahci.h b/usr/src/cmd/bhyve/ahci.h new file mode 100644 index 0000000000..1cf09adcbf --- /dev/null +++ b/usr/src/cmd/bhyve/ahci.h @@ -0,0 +1,304 @@ +/*- + * Copyright (c) 1998 - 2008 Søren Schmidt + * Copyright (c) 2009-2012 Alexander Motin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer, + * without modification, immediately at the beginning of the file. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * $FreeBSD: head/usr.sbin/bhyve/ahci.h 256056 2013-10-04 18:31:38Z grehan $ + */ + +#ifndef _AHCI_H_ +#define _AHCI_H_ + +/* ATA register defines */ +#define ATA_DATA 0 /* (RW) data */ + +#define ATA_FEATURE 1 /* (W) feature */ +#define ATA_F_DMA 0x01 /* enable DMA */ +#define ATA_F_OVL 0x02 /* enable overlap */ + +#define ATA_COUNT 2 /* (W) sector count */ + +#define ATA_SECTOR 3 /* (RW) sector # */ +#define ATA_CYL_LSB 4 /* (RW) cylinder# LSB */ +#define ATA_CYL_MSB 5 /* (RW) cylinder# MSB */ +#define ATA_DRIVE 6 /* (W) Sector/Drive/Head */ +#define ATA_D_LBA 0x40 /* use LBA addressing */ +#define ATA_D_IBM 0xa0 /* 512 byte sectors, ECC */ + +#define ATA_COMMAND 7 /* (W) command */ + +#define ATA_ERROR 8 /* (R) error */ +#define ATA_E_ILI 0x01 /* illegal length */ +#define ATA_E_NM 0x02 /* no media */ +#define ATA_E_ABORT 0x04 /* command aborted */ +#define ATA_E_MCR 0x08 /* media change request */ +#define ATA_E_IDNF 0x10 /* ID not found */ +#define ATA_E_MC 0x20 /* media changed */ +#define ATA_E_UNC 0x40 /* uncorrectable data */ +#define ATA_E_ICRC 0x80 /* UDMA crc error */ +#define ATA_E_ATAPI_SENSE_MASK 0xf0 /* ATAPI sense key mask */ + +#define ATA_IREASON 9 /* (R) interrupt reason */ +#define ATA_I_CMD 0x01 /* cmd (1) | data (0) */ +#define ATA_I_IN 0x02 /* read (1) | write (0) */ +#define ATA_I_RELEASE 0x04 /* released bus (1) */ +#define ATA_I_TAGMASK 0xf8 /* tag mask */ + +#define ATA_STATUS 10 /* (R) status */ +#define ATA_ALTSTAT 11 /* (R) alternate status */ +#define ATA_S_ERROR 0x01 /* error */ +#define ATA_S_INDEX 0x02 /* index */ +#define ATA_S_CORR 0x04 /* data corrected */ +#define ATA_S_DRQ 0x08 /* data request */ +#define ATA_S_DSC 0x10 /* drive seek completed */ +#define ATA_S_SERVICE 0x10 /* drive needs service */ +#define ATA_S_DWF 0x20 /* drive write fault */ +#define ATA_S_DMA 0x20 /* DMA ready */ +#define ATA_S_READY 0x40 /* drive ready */ +#define ATA_S_BUSY 0x80 /* busy */ + +#define ATA_CONTROL 12 /* (W) control */ +#define ATA_A_IDS 0x02 /* disable interrupts */ +#define ATA_A_RESET 0x04 /* RESET controller */ +#define ATA_A_4BIT 0x08 /* 4 head bits */ +#define ATA_A_HOB 0x80 /* High Order Byte enable */ + +/* SATA register defines */ +#define ATA_SSTATUS 13 +#define ATA_SS_DET_MASK 0x0000000f +#define ATA_SS_DET_NO_DEVICE 0x00000000 +#define ATA_SS_DET_DEV_PRESENT 0x00000001 +#define ATA_SS_DET_PHY_ONLINE 0x00000003 +#define ATA_SS_DET_PHY_OFFLINE 0x00000004 + +#define ATA_SS_SPD_MASK 0x000000f0 +#define ATA_SS_SPD_NO_SPEED 0x00000000 +#define ATA_SS_SPD_GEN1 0x00000010 +#define ATA_SS_SPD_GEN2 0x00000020 +#define ATA_SS_SPD_GEN3 0x00000040 + +#define ATA_SS_IPM_MASK 0x00000f00 +#define ATA_SS_IPM_NO_DEVICE 0x00000000 +#define ATA_SS_IPM_ACTIVE 0x00000100 +#define ATA_SS_IPM_PARTIAL 0x00000200 +#define ATA_SS_IPM_SLUMBER 0x00000600 + +#define ATA_SERROR 14 +#define ATA_SE_DATA_CORRECTED 0x00000001 +#define ATA_SE_COMM_CORRECTED 0x00000002 +#define ATA_SE_DATA_ERR 0x00000100 +#define ATA_SE_COMM_ERR 0x00000200 +#define ATA_SE_PROT_ERR 0x00000400 +#define ATA_SE_HOST_ERR 0x00000800 +#define ATA_SE_PHY_CHANGED 0x00010000 +#define ATA_SE_PHY_IERROR 0x00020000 +#define ATA_SE_COMM_WAKE 0x00040000 +#define ATA_SE_DECODE_ERR 0x00080000 +#define ATA_SE_PARITY_ERR 0x00100000 +#define ATA_SE_CRC_ERR 0x00200000 +#define ATA_SE_HANDSHAKE_ERR 0x00400000 +#define ATA_SE_LINKSEQ_ERR 0x00800000 +#define ATA_SE_TRANSPORT_ERR 0x01000000 +#define ATA_SE_UNKNOWN_FIS 0x02000000 +#define ATA_SE_EXCHANGED 0x04000000 + +#define ATA_SCONTROL 15 +#define ATA_SC_DET_MASK 
0x0000000f +#define ATA_SC_DET_IDLE 0x00000000 +#define ATA_SC_DET_RESET 0x00000001 +#define ATA_SC_DET_DISABLE 0x00000004 + +#define ATA_SC_SPD_MASK 0x000000f0 +#define ATA_SC_SPD_NO_SPEED 0x00000000 +#define ATA_SC_SPD_SPEED_GEN1 0x00000010 +#define ATA_SC_SPD_SPEED_GEN2 0x00000020 +#define ATA_SC_SPD_SPEED_GEN3 0x00000040 + +#define ATA_SC_IPM_MASK 0x00000f00 +#define ATA_SC_IPM_NONE 0x00000000 +#define ATA_SC_IPM_DIS_PARTIAL 0x00000100 +#define ATA_SC_IPM_DIS_SLUMBER 0x00000200 + +#define ATA_SACTIVE 16 + +#define AHCI_MAX_PORTS 32 +#define AHCI_MAX_SLOTS 32 + +/* SATA AHCI v1.0 register defines */ +#define AHCI_CAP 0x00 +#define AHCI_CAP_NPMASK 0x0000001f +#define AHCI_CAP_SXS 0x00000020 +#define AHCI_CAP_EMS 0x00000040 +#define AHCI_CAP_CCCS 0x00000080 +#define AHCI_CAP_NCS 0x00001F00 +#define AHCI_CAP_NCS_SHIFT 8 +#define AHCI_CAP_PSC 0x00002000 +#define AHCI_CAP_SSC 0x00004000 +#define AHCI_CAP_PMD 0x00008000 +#define AHCI_CAP_FBSS 0x00010000 +#define AHCI_CAP_SPM 0x00020000 +#define AHCI_CAP_SAM 0x00080000 +#define AHCI_CAP_ISS 0x00F00000 +#define AHCI_CAP_ISS_SHIFT 20 +#define AHCI_CAP_SCLO 0x01000000 +#define AHCI_CAP_SAL 0x02000000 +#define AHCI_CAP_SALP 0x04000000 +#define AHCI_CAP_SSS 0x08000000 +#define AHCI_CAP_SMPS 0x10000000 +#define AHCI_CAP_SSNTF 0x20000000 +#define AHCI_CAP_SNCQ 0x40000000 +#define AHCI_CAP_64BIT 0x80000000 + +#define AHCI_GHC 0x04 +#define AHCI_GHC_AE 0x80000000 +#define AHCI_GHC_MRSM 0x00000004 +#define AHCI_GHC_IE 0x00000002 +#define AHCI_GHC_HR 0x00000001 + +#define AHCI_IS 0x08 +#define AHCI_PI 0x0c +#define AHCI_VS 0x10 + +#define AHCI_CCCC 0x14 +#define AHCI_CCCC_TV_MASK 0xffff0000 +#define AHCI_CCCC_TV_SHIFT 16 +#define AHCI_CCCC_CC_MASK 0x0000ff00 +#define AHCI_CCCC_CC_SHIFT 8 +#define AHCI_CCCC_INT_MASK 0x000000f8 +#define AHCI_CCCC_INT_SHIFT 3 +#define AHCI_CCCC_EN 0x00000001 +#define AHCI_CCCP 0x18 + +#define AHCI_EM_LOC 0x1C +#define AHCI_EM_CTL 0x20 +#define AHCI_EM_MR 0x00000001 +#define AHCI_EM_TM 0x00000100 +#define AHCI_EM_RST 0x00000200 +#define AHCI_EM_LED 0x00010000 +#define AHCI_EM_SAFTE 0x00020000 +#define AHCI_EM_SES2 0x00040000 +#define AHCI_EM_SGPIO 0x00080000 +#define AHCI_EM_SMB 0x01000000 +#define AHCI_EM_XMT 0x02000000 +#define AHCI_EM_ALHD 0x04000000 +#define AHCI_EM_PM 0x08000000 + +#define AHCI_CAP2 0x24 +#define AHCI_CAP2_BOH 0x00000001 +#define AHCI_CAP2_NVMP 0x00000002 +#define AHCI_CAP2_APST 0x00000004 + +#define AHCI_OFFSET 0x100 +#define AHCI_STEP 0x80 + +#define AHCI_P_CLB 0x00 +#define AHCI_P_CLBU 0x04 +#define AHCI_P_FB 0x08 +#define AHCI_P_FBU 0x0c +#define AHCI_P_IS 0x10 +#define AHCI_P_IE 0x14 +#define AHCI_P_IX_DHR 0x00000001 +#define AHCI_P_IX_PS 0x00000002 +#define AHCI_P_IX_DS 0x00000004 +#define AHCI_P_IX_SDB 0x00000008 +#define AHCI_P_IX_UF 0x00000010 +#define AHCI_P_IX_DP 0x00000020 +#define AHCI_P_IX_PC 0x00000040 +#define AHCI_P_IX_MP 0x00000080 + +#define AHCI_P_IX_PRC 0x00400000 +#define AHCI_P_IX_IPM 0x00800000 +#define AHCI_P_IX_OF 0x01000000 +#define AHCI_P_IX_INF 0x04000000 +#define AHCI_P_IX_IF 0x08000000 +#define AHCI_P_IX_HBD 0x10000000 +#define AHCI_P_IX_HBF 0x20000000 +#define AHCI_P_IX_TFE 0x40000000 +#define AHCI_P_IX_CPD 0x80000000 + +#define AHCI_P_CMD 0x18 +#define AHCI_P_CMD_ST 0x00000001 +#define AHCI_P_CMD_SUD 0x00000002 +#define AHCI_P_CMD_POD 0x00000004 +#define AHCI_P_CMD_CLO 0x00000008 +#define AHCI_P_CMD_FRE 0x00000010 +#define AHCI_P_CMD_CCS_MASK 0x00001f00 +#define AHCI_P_CMD_CCS_SHIFT 8 +#define AHCI_P_CMD_ISS 0x00002000 +#define AHCI_P_CMD_FR 0x00004000 +#define 
AHCI_P_CMD_CR 0x00008000 +#define AHCI_P_CMD_CPS 0x00010000 +#define AHCI_P_CMD_PMA 0x00020000 +#define AHCI_P_CMD_HPCP 0x00040000 +#define AHCI_P_CMD_MPSP 0x00080000 +#define AHCI_P_CMD_CPD 0x00100000 +#define AHCI_P_CMD_ESP 0x00200000 +#define AHCI_P_CMD_FBSCP 0x00400000 +#define AHCI_P_CMD_APSTE 0x00800000 +#define AHCI_P_CMD_ATAPI 0x01000000 +#define AHCI_P_CMD_DLAE 0x02000000 +#define AHCI_P_CMD_ALPE 0x04000000 +#define AHCI_P_CMD_ASP 0x08000000 +#define AHCI_P_CMD_ICC_MASK 0xf0000000 +#define AHCI_P_CMD_NOOP 0x00000000 +#define AHCI_P_CMD_ACTIVE 0x10000000 +#define AHCI_P_CMD_PARTIAL 0x20000000 +#define AHCI_P_CMD_SLUMBER 0x60000000 + +#define AHCI_P_TFD 0x20 +#define AHCI_P_SIG 0x24 +#define AHCI_P_SSTS 0x28 +#define AHCI_P_SCTL 0x2c +#define AHCI_P_SERR 0x30 +#define AHCI_P_SACT 0x34 +#define AHCI_P_CI 0x38 +#define AHCI_P_SNTF 0x3C +#define AHCI_P_FBS 0x40 +#define AHCI_P_FBS_EN 0x00000001 +#define AHCI_P_FBS_DEC 0x00000002 +#define AHCI_P_FBS_SDE 0x00000004 +#define AHCI_P_FBS_DEV 0x00000f00 +#define AHCI_P_FBS_DEV_SHIFT 8 +#define AHCI_P_FBS_ADO 0x0000f000 +#define AHCI_P_FBS_ADO_SHIFT 12 +#define AHCI_P_FBS_DWE 0x000f0000 +#define AHCI_P_FBS_DWE_SHIFT 16 + +/* Just to be sure, if building as module. */ +#if MAXPHYS < 512 * 1024 +#undef MAXPHYS +#define MAXPHYS 512 * 1024 +#endif +/* Pessimistic prognosis on number of required S/G entries */ +#define AHCI_SG_ENTRIES (roundup(btoc(MAXPHYS) + 1, 8)) +/* Command list. 32 commands. First, 1Kbyte aligned. */ +#define AHCI_CL_OFFSET 0 +#define AHCI_CL_SIZE 32 +/* Command tables. Up to 32 commands, Each, 128byte aligned. */ +#define AHCI_CT_OFFSET (AHCI_CL_OFFSET + AHCI_CL_SIZE * AHCI_MAX_SLOTS) +#define AHCI_CT_SIZE (128 + AHCI_SG_ENTRIES * 16) +/* Total main work area. */ +#define AHCI_WORK_SIZE (AHCI_CT_OFFSET + AHCI_CT_SIZE * ch->numslots) + +#endif /* _AHCI_H_ */ diff --git a/usr/src/cmd/bhyve/amd64/Makefile b/usr/src/cmd/bhyve/amd64/Makefile new file mode 100644 index 0000000000..13cdae6663 --- /dev/null +++ b/usr/src/cmd/bhyve/amd64/Makefile @@ -0,0 +1,21 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2015 Pluribus Networks Inc. +# + +include ../Makefile.com +include ../../Makefile.cmd.64 + +CPPFLAGS += -I$(COMPAT)/freebsd/amd64 -I$(CONTRIB)/freebsd/amd64 + +install: all $(ROOTUSRSBINPROG64) diff --git a/usr/src/cmd/bhyve/atkbdc.c b/usr/src/cmd/bhyve/atkbdc.c new file mode 100644 index 0000000000..4d09d88266 --- /dev/null +++ b/usr/src/cmd/bhyve/atkbdc.c @@ -0,0 +1,576 @@ +/*- + * Copyright (c) 2014 Tycho Nightingale + * Copyright (c) 2015 Nahanni Systems Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/atkbdc.c 267611 2014-06-18 17:20:02Z neel $"); + +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "acpi.h" +#include "inout.h" +#include "pci_emul.h" +#include "pci_irq.h" +#include "pci_lpc.h" +#include "ps2kbd.h" +#include "ps2mouse.h" + +#define KBD_DATA_PORT 0x60 + +#define KBD_STS_CTL_PORT 0x64 + +#define KBDC_RESET 0xfe + +#define KBD_DEV_IRQ 1 +#define AUX_DEV_IRQ 12 + +/* controller commands */ +#define KBDC_SET_COMMAND_BYTE 0x60 +#define KBDC_GET_COMMAND_BYTE 0x20 +#define KBDC_DISABLE_AUX_PORT 0xa7 +#define KBDC_ENABLE_AUX_PORT 0xa8 +#define KBDC_TEST_AUX_PORT 0xa9 +#define KBDC_TEST_CTRL 0xaa +#define KBDC_TEST_KBD_PORT 0xab +#define KBDC_DISABLE_KBD_PORT 0xad +#define KBDC_ENABLE_KBD_PORT 0xae +#define KBDC_READ_INPORT 0xc0 +#define KBDC_READ_OUTPORT 0xd0 +#define KBDC_WRITE_OUTPORT 0xd1 +#define KBDC_WRITE_KBD_OUTBUF 0xd2 +#define KBDC_WRITE_AUX_OUTBUF 0xd3 +#define KBDC_WRITE_TO_AUX 0xd4 + +/* controller command byte (set by KBDC_SET_COMMAND_BYTE) */ +#define KBD_TRANSLATION 0x40 +#define KBD_SYS_FLAG_BIT 0x04 +#define KBD_DISABLE_KBD_PORT 0x10 +#define KBD_DISABLE_AUX_PORT 0x20 +#define KBD_ENABLE_AUX_INT 0x02 +#define KBD_ENABLE_KBD_INT 0x01 +#define KBD_KBD_CONTROL_BITS (KBD_DISABLE_KBD_PORT | KBD_ENABLE_KBD_INT) +#define KBD_AUX_CONTROL_BITS (KBD_DISABLE_AUX_PORT | KBD_ENABLE_AUX_INT) + +/* controller status bits */ +#define KBDS_KBD_BUFFER_FULL 0x01 +#define KBDS_SYS_FLAG 0x04 +#define KBDS_CTRL_FLAG 0x08 +#define KBDS_AUX_BUFFER_FULL 0x20 + +/* controller output port */ +#define KBDO_KBD_OUTFULL 0x10 +#define KBDO_AUX_OUTFULL 0x20 + +#define RAMSZ 32 + +struct kbd_dev { + bool irq_active; + int irq; + + uint8_t buffer; +}; + +struct aux_dev { + bool irq_active; + int irq; + + uint8_t buffer; +}; + +struct atkbdc_softc { + struct vmctx *ctx; + pthread_mutex_t mtx; + + struct ps2kbd_softc *ps2kbd_sc; + struct ps2mouse_softc *ps2mouse_sc; + + uint8_t status; /* status register */ + uint8_t outport; /* controller output port */ + uint8_t ram[RAMSZ]; /* byte0 = controller config */ + + uint32_t curcmd; /* current command for next byte */ + + struct kbd_dev kbd; + struct aux_dev aux; +}; + +static void +atkbdc_assert_kbd_intr(struct atkbdc_softc *sc) +{ + if (!sc->kbd.irq_active && + (sc->ram[0] & KBD_ENABLE_KBD_INT) != 0) { + sc->kbd.irq_active = true; + vm_isa_assert_irq(sc->ctx, sc->kbd.irq, sc->kbd.irq); + } +} + +static void +atkbdc_deassert_kbd_intr(struct atkbdc_softc *sc) +{ + if (sc->kbd.irq_active) { + vm_isa_deassert_irq(sc->ctx, sc->kbd.irq, sc->kbd.irq); + sc->kbd.irq_active = false; + } +} + +static void +atkbdc_assert_aux_intr(struct atkbdc_softc *sc) +{ + if 
(!sc->aux.irq_active && + (sc->ram[0] & KBD_ENABLE_AUX_INT) != 0) { + sc->aux.irq_active = true; + vm_isa_assert_irq(sc->ctx, sc->aux.irq, sc->aux.irq); + } +} + +static void +atkbdc_deassert_aux_intr(struct atkbdc_softc *sc) +{ + if (sc->aux.irq_active) { + vm_isa_deassert_irq(sc->ctx, sc->aux.irq, sc->aux.irq); + sc->aux.irq_active = false; + } +} + +static void +atkbdc_aux_queue_data(struct atkbdc_softc *sc, uint8_t val) +{ + assert(pthread_mutex_isowned_np(&sc->mtx)); + + sc->aux.buffer = val; + sc->status |= (KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL); + sc->outport |= KBDO_AUX_OUTFULL; + atkbdc_assert_aux_intr(sc); +} + +static void +atkbdc_kbd_queue_data(struct atkbdc_softc *sc, uint8_t val) +{ + assert(pthread_mutex_isowned_np(&sc->mtx)); + + sc->kbd.buffer = val; + sc->status |= KBDS_KBD_BUFFER_FULL; + sc->outport |= KBDO_KBD_OUTFULL; + atkbdc_assert_kbd_intr(sc); +} + +static void +atkbdc_aux_read(struct atkbdc_softc *sc) +{ + uint8_t val; + + assert(pthread_mutex_isowned_np(&sc->mtx)); + + if (ps2mouse_read(sc->ps2mouse_sc, &val) != -1) + atkbdc_aux_queue_data(sc, val); +} + +static void +atkbdc_kbd_read(struct atkbdc_softc *sc) +{ + const uint8_t translation[256] = { + 0xff, 0x43, 0x41, 0x3f, 0x3d, 0x3b, 0x3c, 0x58, + 0x64, 0x44, 0x42, 0x40, 0x3e, 0x0f, 0x29, 0x59, + 0x65, 0x38, 0x2a, 0x70, 0x1d, 0x10, 0x02, 0x5a, + 0x66, 0x71, 0x2c, 0x1f, 0x1e, 0x11, 0x03, 0x5b, + 0x67, 0x2e, 0x2d, 0x20, 0x12, 0x05, 0x04, 0x5c, + 0x68, 0x39, 0x2f, 0x21, 0x14, 0x13, 0x06, 0x5d, + 0x69, 0x31, 0x30, 0x23, 0x22, 0x15, 0x07, 0x5e, + 0x6a, 0x72, 0x32, 0x24, 0x16, 0x08, 0x09, 0x5f, + 0x6b, 0x33, 0x25, 0x17, 0x18, 0x0b, 0x0a, 0x60, + 0x6c, 0x34, 0x35, 0x26, 0x27, 0x19, 0x0c, 0x61, + 0x6d, 0x73, 0x28, 0x74, 0x1a, 0x0d, 0x62, 0x6e, + 0x3a, 0x36, 0x1c, 0x1b, 0x75, 0x2b, 0x63, 0x76, + 0x55, 0x56, 0x77, 0x78, 0x79, 0x7a, 0x0e, 0x7b, + 0x7c, 0x4f, 0x7d, 0x4b, 0x47, 0x7e, 0x7f, 0x6f, + 0x52, 0x53, 0x50, 0x4c, 0x4d, 0x48, 0x01, 0x45, + 0x57, 0x4e, 0x51, 0x4a, 0x37, 0x49, 0x46, 0x54, + 0x80, 0x81, 0x82, 0x41, 0x54, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff + }; + uint8_t val; + uint8_t release = 0; + + assert(pthread_mutex_isowned_np(&sc->mtx)); + + if (sc->ram[0] & KBD_TRANSLATION) { + while (ps2kbd_read(sc->ps2kbd_sc, &val) != -1) { + if (val == 0xf0) { + release = 0x80; + continue; + } else { + val = translation[val] | release; + } + + atkbdc_kbd_queue_data(sc, val); + break; + } + } else { + if (ps2kbd_read(sc->ps2kbd_sc, &val) != -1) + atkbdc_kbd_queue_data(sc, val); + } +} + +static void +atkbdc_aux_poll(struct atkbdc_softc *sc) +{ + if ((sc->outport & KBDO_AUX_OUTFULL) == 0) + atkbdc_aux_read(sc); +} + +static void +atkbdc_kbd_poll(struct atkbdc_softc *sc) +{ + assert(pthread_mutex_isowned_np(&sc->mtx)); + + if ((sc->outport & KBDO_KBD_OUTFULL) == 0) + atkbdc_kbd_read(sc); +} + +static void 
+atkbdc_poll(struct atkbdc_softc *sc)
+{
+ atkbdc_aux_poll(sc);
+ atkbdc_kbd_poll(sc);
+}
+
+static void
+atkbdc_dequeue_data(struct atkbdc_softc *sc, uint8_t *buf)
+{
+ assert(pthread_mutex_isowned_np(&sc->mtx));
+
+ if (sc->outport & KBDO_AUX_OUTFULL) {
+ *buf = sc->aux.buffer;
+ sc->status &= ~(KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL);
+ sc->outport &= ~KBDO_AUX_OUTFULL;
+ atkbdc_deassert_aux_intr(sc);
+
+ atkbdc_poll(sc);
+ return;
+ }
+
+ *buf = sc->kbd.buffer;
+ sc->status &= ~KBDS_KBD_BUFFER_FULL;
+ sc->outport &= ~KBDO_KBD_OUTFULL;
+ atkbdc_deassert_kbd_intr(sc);
+
+ atkbdc_poll(sc);
+}
+
+static int
+atkbdc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ struct atkbdc_softc *sc;
+ uint8_t buf;
+ int retval;
+
+ if (bytes != 1)
+ return (-1);
+
+ sc = arg;
+ retval = 0;
+
+ pthread_mutex_lock(&sc->mtx);
+ if (in) {
+ sc->curcmd = 0;
+ sc->status &= ~KBDS_CTRL_FLAG;
+
+ /* read device buffer; includes kbd cmd responses */
+ atkbdc_dequeue_data(sc, &buf);
+ *eax = buf;
+
+ pthread_mutex_unlock(&sc->mtx);
+ return (retval);
+ }
+
+ if (sc->status & KBDS_CTRL_FLAG) {
+ /*
+ * Command byte for the controller.
+ */
+ switch (sc->curcmd) {
+ case KBDC_SET_COMMAND_BYTE:
+ sc->ram[0] = *eax;
+ if (sc->ram[0] & KBD_SYS_FLAG_BIT)
+ sc->status |= KBDS_SYS_FLAG;
+ else
+ sc->status &= ~KBDS_SYS_FLAG;
+ if (sc->outport & KBDO_AUX_OUTFULL)
+ atkbdc_assert_aux_intr(sc);
+ else if (sc->outport & KBDO_KBD_OUTFULL)
+ atkbdc_assert_kbd_intr(sc);
+ break;
+ case KBDC_WRITE_OUTPORT:
+ sc->outport = *eax;
+ if (sc->outport & KBDO_AUX_OUTFULL)
+ sc->status |= (KBDS_AUX_BUFFER_FULL |
+ KBDS_KBD_BUFFER_FULL);
+ if (sc->outport & KBDO_KBD_OUTFULL)
+ sc->status |= KBDS_KBD_BUFFER_FULL;
+ break;
+ case KBDC_WRITE_TO_AUX:
+ ps2mouse_write(sc->ps2mouse_sc, *eax);
+ atkbdc_poll(sc);
+ break;
+ case KBDC_WRITE_KBD_OUTBUF:
+ atkbdc_kbd_queue_data(sc, *eax);
+ break;
+ case KBDC_WRITE_AUX_OUTBUF:
+ atkbdc_aux_queue_data(sc, *eax);
+ break;
+ default:
+ /* write to particular RAM byte */
+ if (sc->curcmd >= 0x61 && sc->curcmd <= 0x7f) {
+ int byten;
+
+ byten = (sc->curcmd - 0x60) & 0x1f;
+ sc->ram[byten] = *eax & 0xff;
+ }
+ break;
+ }
+
+ sc->curcmd = 0;
+ sc->status &= ~KBDS_CTRL_FLAG;
+
+ pthread_mutex_unlock(&sc->mtx);
+ return (retval);
+ }
+
+ /*
+ * Data byte for the device.
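+ * With no controller command pending (KBDS_CTRL_FLAG clear), writes
+ * to the data port are delivered to the keyboard itself, e.g. a
+ * guest sending the PS/2 "enable scanning" command (0xf4).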
+ */ + ps2kbd_write(sc->ps2kbd_sc, *eax); + atkbdc_poll(sc); + + pthread_mutex_unlock(&sc->mtx); + + return (retval); +} + + +static int +atkbdc_sts_ctl_handler(struct vmctx *ctx, int vcpu, int in, int port, + int bytes, uint32_t *eax, void *arg) +{ + struct atkbdc_softc *sc; + int error, retval; + + if (bytes != 1) + return (-1); + + sc = arg; + retval = 0; + + pthread_mutex_lock(&sc->mtx); + + if (in) { + /* read status register */ + *eax = sc->status; + pthread_mutex_unlock(&sc->mtx); + return (retval); + } + + sc->curcmd = 0; + sc->status |= KBDS_CTRL_FLAG; + + switch (*eax) { + case KBDC_GET_COMMAND_BYTE: + atkbdc_kbd_queue_data(sc, sc->ram[0]); + break; + case KBDC_TEST_CTRL: + atkbdc_kbd_queue_data(sc, 0x55); + break; + case KBDC_TEST_AUX_PORT: + case KBDC_TEST_KBD_PORT: + atkbdc_kbd_queue_data(sc, 0); + break; + case KBDC_READ_INPORT: + atkbdc_kbd_queue_data(sc, 0); + break; + case KBDC_READ_OUTPORT: + atkbdc_kbd_queue_data(sc, sc->outport); + break; + case KBDC_SET_COMMAND_BYTE: + case KBDC_WRITE_OUTPORT: + case KBDC_WRITE_KBD_OUTBUF: + case KBDC_WRITE_AUX_OUTBUF: + sc->curcmd = *eax; + break; + case KBDC_DISABLE_KBD_PORT: + sc->ram[0] |= KBD_DISABLE_KBD_PORT; + break; + case KBDC_ENABLE_KBD_PORT: + sc->ram[0] &= ~KBD_DISABLE_KBD_PORT; + atkbdc_poll(sc); + break; + case KBDC_WRITE_TO_AUX: + sc->curcmd = *eax; + break; + case KBDC_DISABLE_AUX_PORT: + sc->ram[0] |= KBD_DISABLE_AUX_PORT; + break; + case KBDC_ENABLE_AUX_PORT: + sc->ram[0] &= ~KBD_DISABLE_AUX_PORT; + break; + case KBDC_RESET: /* Pulse "reset" line */ +#ifdef __FreeBSD__ + error = vm_suspend(ctx, VM_SUSPEND_RESET); + assert(error == 0 || errno == EALREADY); +#else + exit(0); +#endif + break; + default: + if (*eax >= 0x21 && *eax <= 0x3f) { + /* read "byte N" from RAM */ + int byten; + + byten = (*eax - 0x20) & 0x1f; + atkbdc_kbd_queue_data(sc, sc->ram[byten]); + } + break; + } + + pthread_mutex_unlock(&sc->mtx); + + return (retval); +} + +void +atkbdc_event(struct atkbdc_softc *sc) +{ + pthread_mutex_lock(&sc->mtx); + atkbdc_poll(sc); + pthread_mutex_unlock(&sc->mtx); +} + +void +atkbdc_init(struct vmctx *ctx) +{ + struct inout_port iop; + struct atkbdc_softc *sc; + int error; + + sc = calloc(1, sizeof(struct atkbdc_softc)); + sc->ctx = ctx; + + pthread_mutex_init(&sc->mtx, NULL); + + bzero(&iop, sizeof(struct inout_port)); + iop.name = "atkdbc"; + iop.port = KBD_STS_CTL_PORT; + iop.size = 1; + iop.flags = IOPORT_F_INOUT; + iop.handler = atkbdc_sts_ctl_handler; + iop.arg = sc; + + error = register_inout(&iop); + assert(error == 0); + + bzero(&iop, sizeof(struct inout_port)); + iop.name = "atkdbc"; + iop.port = KBD_DATA_PORT; + iop.size = 1; + iop.flags = IOPORT_F_INOUT; + iop.handler = atkbdc_data_handler; + iop.arg = sc; + + error = register_inout(&iop); + assert(error == 0); + + pci_irq_reserve(KBD_DEV_IRQ); + sc->kbd.irq = KBD_DEV_IRQ; + + pci_irq_reserve(AUX_DEV_IRQ); + sc->aux.irq = AUX_DEV_IRQ; + + sc->ps2kbd_sc = ps2kbd_init(sc); + sc->ps2mouse_sc = ps2mouse_init(sc); +} + +#ifdef __FreeBSD__ +static void +atkbdc_dsdt(void) +{ + + dsdt_line(""); + dsdt_line("Device (KBD)"); + dsdt_line("{"); + dsdt_line(" Name (_HID, EisaId (\"PNP0303\"))"); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_indent(2); + dsdt_fixed_ioport(KBD_DATA_PORT, 1); + dsdt_fixed_ioport(KBD_STS_CTL_PORT, 1); + dsdt_fixed_irq(1); + dsdt_unindent(2); + dsdt_line(" })"); + dsdt_line("}"); + + dsdt_line(""); + dsdt_line("Device (MOU)"); + dsdt_line("{"); + dsdt_line(" Name (_HID, EisaId (\"PNP0F13\"))"); + dsdt_line(" 
Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_indent(2); + dsdt_fixed_ioport(KBD_DATA_PORT, 1); + dsdt_fixed_ioport(KBD_STS_CTL_PORT, 1); + dsdt_fixed_irq(12); + dsdt_unindent(2); + dsdt_line(" })"); + dsdt_line("}"); +} +LPC_DSDT(atkbdc_dsdt); +#endif diff --git a/usr/src/cmd/bhyve/atkbdc.h b/usr/src/cmd/bhyve/atkbdc.h new file mode 100644 index 0000000000..48b3a8b00c --- /dev/null +++ b/usr/src/cmd/bhyve/atkbdc.h @@ -0,0 +1,38 @@ +/*- + * Copyright (c) 2015 Tycho Nightingale + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _ATKBDC_H_ +#define _ATKBDC_H_ + +struct atkbdc_softc; +struct vmctx; + +void atkbdc_init(struct vmctx *ctx); +void atkbdc_event(struct atkbdc_softc *sc); + +#endif /* _ATKBDC_H_ */ diff --git a/usr/src/cmd/bhyve/bhyve_sol_glue.c b/usr/src/cmd/bhyve/bhyve_sol_glue.c new file mode 100644 index 0000000000..633faacc5f --- /dev/null +++ b/usr/src/cmd/bhyve/bhyve_sol_glue.c @@ -0,0 +1,86 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#include + +#include +#include + +/* + * Make a pre-existing termios structure into "raw" mode: character-at-a-time + * mode with no characters interpreted, 8-bit data path. 
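+ *
+ * Illustrative usage by an assumed caller (not from this file):
+ *
+ *	struct termios raw;
+ *
+ *	(void) tcgetattr(STDIN_FILENO, &raw);
+ *	cfmakeraw(&raw);
+ *	(void) tcsetattr(STDIN_FILENO, TCSANOW, &raw);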
+ */ +void +cfmakeraw(struct termios *t) +{ + t->c_iflag &= ~(IMAXBEL|IXOFF|INPCK|BRKINT|PARMRK|ISTRIP|INLCR|IGNCR|ICRNL|IXON|IGNPAR); + t->c_iflag |= IGNBRK; + t->c_oflag &= ~OPOST; + t->c_lflag &= ~(ECHO|ECHOE|ECHOK|ECHONL|ICANON|ISIG|IEXTEN|NOFLSH|TOSTOP |PENDIN); + t->c_cflag &= ~(CSIZE|PARENB); + t->c_cflag |= CS8|CREAD; + t->c_cc[VMIN] = 1; + t->c_cc[VTIME] = 0; +} + +ssize_t +preadv(int d, const struct iovec *iov, int iovcnt, off_t offset) +{ + off_t old_offset; + ssize_t n; + + old_offset = lseek(d, (off_t)0, SEEK_CUR); + if (old_offset == -1) + return (-1); + + offset = lseek(d, offset, SEEK_SET); + if (offset == -1) + return (-1); + + n = readv(d, iov, iovcnt); + if (n == -1) + return (-1); + + offset = lseek(d, old_offset, SEEK_SET); + if (offset == -1) + return (-1); + + return (n); +} + +ssize_t +pwritev(int d, const struct iovec *iov, int iovcnt, off_t offset) +{ + off_t old_offset; + ssize_t n; + + old_offset = lseek(d, (off_t)0, SEEK_CUR); + if (old_offset == -1) + return (-1); + + offset = lseek(d, offset, SEEK_SET); + if (offset == -1) + return (-1); + + n = writev(d, iov, iovcnt); + if (n == -1) + return (-1); + + offset = lseek(d, old_offset, SEEK_SET); + if (offset == -1) + return (-1); + + return (n); +} diff --git a/usr/src/cmd/bhyve/bhyvegc.c b/usr/src/cmd/bhyve/bhyvegc.c new file mode 100644 index 0000000000..7a13c4c83f --- /dev/null +++ b/usr/src/cmd/bhyve/bhyvegc.c @@ -0,0 +1,78 @@ +/*- + * Copyright (c) 2015 Tycho Nightingale + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#include +#include +#include + +#include "bhyvegc.h" + +struct bhyvegc { + struct bhyvegc_image *gc_image; +}; + +struct bhyvegc * +bhyvegc_init(int width, int height) +{ + struct bhyvegc *gc; + struct bhyvegc_image *gc_image; + + gc = calloc(1, sizeof (struct bhyvegc)); + + gc_image = calloc(1, sizeof(struct bhyvegc_image)); + gc_image->width = width; + gc_image->height = height; + gc_image->data = calloc(width * height, sizeof (uint32_t)); + + gc->gc_image = gc_image; + + return (gc); +} + +void +bhyvegc_resize(struct bhyvegc *gc, int width, int height) +{ + struct bhyvegc_image *gc_image; + + gc_image = gc->gc_image; + + gc_image->width = width; + gc_image->height = height; + gc_image->data = realloc(gc_image->data, + sizeof (uint32_t) * width * height); + memset(gc_image->data, 0, width * height * sizeof (uint32_t)); +} + +struct bhyvegc_image * +bhyvegc_get_image(struct bhyvegc *gc) +{ + return (gc->gc_image); +} diff --git a/usr/src/cmd/bhyve/bhyvegc.h b/usr/src/cmd/bhyve/bhyvegc.h new file mode 100644 index 0000000000..19648f98af --- /dev/null +++ b/usr/src/cmd/bhyve/bhyvegc.h @@ -0,0 +1,44 @@ +/*- + * Copyright (c) 2015 Tycho Nightingale + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _BHYVEGC_H_ +#define _BHYVEGC_H_ + +struct bhyvegc; + +struct bhyvegc_image { + int width; + int height; + uint32_t *data; +}; + +struct bhyvegc *bhyvegc_init(int width, int height); +void bhyvegc_resize(struct bhyvegc *gc, int width, int height); +struct bhyvegc_image *bhyvegc_get_image(struct bhyvegc *gc); + +#endif /* _BHYVEGC_H_ */ diff --git a/usr/src/cmd/bhyve/bhyverun.c b/usr/src/cmd/bhyve/bhyverun.c new file mode 100644 index 0000000000..b985a2286e --- /dev/null +++ b/usr/src/cmd/bhyve/bhyverun.c @@ -0,0 +1,820 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/bhyverun.c 281611 2015-04-16 20:11:49Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + */ + +#include +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/bhyverun.c 281611 2015-04-16 20:11:49Z neel $"); + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "bhyverun.h" +#include "acpi.h" +#include "atkbdc.h" +#include "console.h" +#include "inout.h" +#include "dbgport.h" +#include "ioapic.h" +#include "mem.h" +#ifdef __FreeBSD__ +#include "mevent.h" +#endif +#include "mptbl.h" +#include "pci_emul.h" +#include "pci_irq.h" +#include "pci_lpc.h" +#include "smbiostbl.h" +#include "xmsr.h" +#include "spinup_ap.h" +#include "rfb.h" +#include "rtc.h" +#include "vga.h" + +#define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */ + +#define MB (1024UL * 1024) +#define GB (1024UL * MB) + +typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu); + +char *vmname; + +int guest_ncpus; +char *guest_uuid_str; + +static int guest_vmexit_on_hlt, guest_vmexit_on_pause; +static int virtio_msix = 1; +static int x2apic_mode = 0; /* default is xAPIC */ + +static int strictio; +static int strictmsr = 1; + +#ifdef __FreeBSD__ +static int acpi; +#endif + +static char *progname; +static const int BSP = 0; + +#ifndef __FreeBSD__ +int bcons_wait = 0; +int bcons_connected = 0; +pthread_mutex_t bcons_wait_lock = PTHREAD_MUTEX_INITIALIZER; +pthread_cond_t bcons_wait_done = PTHREAD_COND_INITIALIZER; +#endif + +static cpuset_t cpumask; + +static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip); + +static struct vm_exit vmexit[VM_MAXCPU]; + +struct bhyvestats { + uint64_t vmexit_bogus; + uint64_t vmexit_bogus_switch; + uint64_t vmexit_hlt; + uint64_t vmexit_pause; + uint64_t vmexit_mtrap; + uint64_t vmexit_inst_emul; + uint64_t cpu_switch_rotate; + uint64_t cpu_switch_direct; + int io_reset; +} stats; + +struct mt_vmm_info { + pthread_t mt_thr; + struct vmctx *mt_ctx; + int mt_vcpu; +} mt_vmm_info[VM_MAXCPU]; + +static void +usage(int code) +{ + 
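/* + * The supported option set differs between the FreeBSD and illumos + * builds, so two variants of the usage text are maintained below. + */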
+#ifdef __FreeBSD__ + fprintf(stderr, + "Usage: %s [-aehwAHIPW] [-g ] [-s ] [-c vcpus]\n" + " %*s [-p vcpu:hostcpu] [-m mem] [-l ] \n" + " -a: local apic is in xAPIC mode (deprecated)\n" + " -A: create an ACPI table\n" + " -g: gdb port\n" + " -c: # cpus (default 1)\n" + " -C: include guest memory in core file\n" + " -p: pin 'vcpu' to 'hostcpu'\n" + " -H: vmexit from the guest on hlt\n" + " -P: vmexit from the guest on pause\n" + " -W: force virtio to use single-vector MSI\n" + " -e: exit on unhandled I/O access\n" + " -h: help\n" + " -s: PCI slot config\n" + " -l: LPC device configuration\n" + " -m: memory size in MB\n" + " -w: ignore unimplemented MSRs\n" + " -x: local apic is in x2APIC mode\n" + " -Y: disable MPtable generation\n" + " -U: uuid\n", + progname, (int)strlen(progname), ""); +#else + fprintf(stderr, + "Usage: %s [-ehwHPW] [-s ] [-c vcpus]\n" + " %*s [-p vcpu:hostcpu] [-m mem] [-l ] \n" + " -c: # cpus (default 1)\n" + " -H: vmexit from the guest on hlt\n" + " -P: vmexit from the guest on pause\n" + " -W: force virtio to use single-vector MSI\n" + " -e: exit on unhandled I/O access\n" + " -h: help\n" + " -s: PCI slot config\n" + " -l: LPC device configuration\n" + " -m: memory size in MB\n" + " -w: ignore unimplemented MSRs\n" + " -Y: disable MPtable generation\n" + " -U: uuid\n", + progname, (int)strlen(progname), ""); +#endif + + exit(code); +} + +void +vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid, + int errcode) +{ + struct vmctx *ctx; + int error, restart_instruction; + + ctx = arg; + restart_instruction = 1; + + error = vm_inject_exception(ctx, vcpu, vector, errcode_valid, errcode, + restart_instruction); + assert(error == 0); +} + +void * +paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len) +{ + + return (vm_map_gpa(ctx, gaddr, len)); +} + +int +fbsdrun_vmexit_on_pause(void) +{ + + return (guest_vmexit_on_pause); +} + +int +fbsdrun_vmexit_on_hlt(void) +{ + + return (guest_vmexit_on_hlt); +} + +int +fbsdrun_virtio_msix(void) +{ + + return (virtio_msix); +} + +static void * +fbsdrun_start_thread(void *param) +{ + char tname[MAXCOMLEN + 1]; + struct mt_vmm_info *mtp; + int vcpu; + + mtp = param; + vcpu = mtp->mt_vcpu; + + snprintf(tname, sizeof(tname), "vcpu %d", vcpu); + pthread_set_name_np(mtp->mt_thr, tname); + + vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip); + + /* not reached */ + exit(1); + return (NULL); +} + +void +fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip) +{ + int error; + + assert(fromcpu == BSP); + + /* + * The 'newcpu' must be activated in the context of 'fromcpu'. If + * vm_activate_cpu() is delayed until newcpu's pthread starts running + * then vmm.ko is out-of-sync with bhyve and this can create a race + * with vm_suspend(). 
+ */ + error = vm_activate_cpu(ctx, newcpu); + assert(error == 0); + + CPU_SET_ATOMIC(newcpu, &cpumask); + + /* + * Set up the vmexit struct to allow execution to start + * at the given RIP + */ + vmexit[newcpu].rip = rip; + vmexit[newcpu].inst_length = 0; + + mt_vmm_info[newcpu].mt_ctx = ctx; + mt_vmm_info[newcpu].mt_vcpu = newcpu; + + error = pthread_create(&mt_vmm_info[newcpu].mt_thr, NULL, + fbsdrun_start_thread, &mt_vmm_info[newcpu]); + assert(error == 0); +} + +static int +vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu, + uint32_t eax) +{ +#if BHYVE_DEBUG + /* + * put guest-driven debug here + */ +#endif + return (VMEXIT_CONTINUE); +} + +static int +vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) +{ + int error; + int bytes, port, in, out, string; + int vcpu; + + vcpu = *pvcpu; + + port = vme->u.inout.port; + bytes = vme->u.inout.bytes; + string = vme->u.inout.string; + in = vme->u.inout.in; + out = !in; + + /* Extra-special case of host notifications */ + if (out && port == GUEST_NIO_PORT) { + error = vmexit_handle_notify(ctx, vme, pvcpu, vme->u.inout.eax); + return (error); + } + + error = emulate_inout(ctx, vcpu, vme, strictio); + if (error) { + fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n", + in ? "in" : "out", + bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), + port, vmexit->rip); + return (VMEXIT_ABORT); + } else { + return (VMEXIT_CONTINUE); + } +} + +static int +vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) +{ + uint64_t val; + uint32_t eax, edx; + int error; + + val = 0; + error = emulate_rdmsr(ctx, *pvcpu, vme->u.msr.code, &val); + if (error != 0) { + fprintf(stderr, "rdmsr to register %#x on vcpu %d\n", + vme->u.msr.code, *pvcpu); + if (strictmsr) { + vm_inject_gp(ctx, *pvcpu); + return (VMEXIT_CONTINUE); + } + } + + eax = val; + error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RAX, eax); + assert(error == 0); + + edx = val >> 32; + error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RDX, edx); + assert(error == 0); + + return (VMEXIT_CONTINUE); +} + +static int +vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) +{ + int error; + + error = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code, vme->u.msr.wval); + if (error != 0) { + fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n", + vme->u.msr.code, vme->u.msr.wval, *pvcpu); + if (strictmsr) { + vm_inject_gp(ctx, *pvcpu); + return (VMEXIT_CONTINUE); + } + } + return (VMEXIT_CONTINUE); +} + +static int +vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) +{ + int newcpu; + int retval = VMEXIT_CONTINUE; + + newcpu = spinup_ap(ctx, *pvcpu, + vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip); + + return (retval); +} + +static int +vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + + fprintf(stderr, "vm exit[%d]\n", *pvcpu); + fprintf(stderr, "\treason\t\tVMX\n"); + fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip); + fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length); + fprintf(stderr, "\tstatus\t\t%d\n", vmexit->u.vmx.status); + fprintf(stderr, "\texit_reason\t%u\n", vmexit->u.vmx.exit_reason); + fprintf(stderr, "\tqualification\t0x%016lx\n", + vmexit->u.vmx.exit_qualification); + fprintf(stderr, "\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type); + fprintf(stderr, "\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error); + + return (VMEXIT_ABORT); +} + +static int +vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + + stats.vmexit_bogus++; + + return (VMEXIT_CONTINUE); +} + +static int 
+vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + + stats.vmexit_hlt++; + + /* + * Just continue execution with the next instruction. We use + * the HLT VM exit as a way to be friendly with the host + * scheduler. + */ + return (VMEXIT_CONTINUE); +} + +static int +vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + + stats.vmexit_pause++; + + return (VMEXIT_CONTINUE); +} + +static int +vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + + stats.vmexit_mtrap++; + + return (VMEXIT_CONTINUE); +} + +static int +vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + int err, i; + struct vie *vie; + + stats.vmexit_inst_emul++; + + vie = &vmexit->u.inst_emul.vie; + err = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa, + vie, &vmexit->u.inst_emul.paging); + + if (err) { + if (err == ESRCH) { + fprintf(stderr, "Unhandled memory access to 0x%lx\n", + vmexit->u.inst_emul.gpa); + } + + fprintf(stderr, "Failed to emulate instruction ["); + for (i = 0; i < vie->num_valid; i++) { + fprintf(stderr, "0x%02x%s", vie->inst[i], + i != (vie->num_valid - 1) ? " " : ""); + } + fprintf(stderr, "] at 0x%lx\n", vmexit->rip); + return (VMEXIT_ABORT); + } + + return (VMEXIT_CONTINUE); +} + +static vmexit_handler_t handler[VM_EXITCODE_MAX] = { + [VM_EXITCODE_INOUT] = vmexit_inout, + [VM_EXITCODE_VMX] = vmexit_vmx, + [VM_EXITCODE_BOGUS] = vmexit_bogus, + [VM_EXITCODE_RDMSR] = vmexit_rdmsr, + [VM_EXITCODE_WRMSR] = vmexit_wrmsr, + [VM_EXITCODE_MTRAP] = vmexit_mtrap, + [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul, + [VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap, +}; + +static void +vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip) +{ +#ifdef __FreeBSD__ + cpuset_t mask; +#endif + int error, rc, prevcpu; + enum vm_exitcode exitcode; + +#ifdef __FreeBSD__ + if (pincpu >= 0) { + CPU_ZERO(&mask); + CPU_SET(pincpu + vcpu, &mask); + error = pthread_setaffinity_np(pthread_self(), + sizeof(mask), &mask); + assert(error == 0); + } +#endif + + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, startrip); + assert(error == 0); + + while (1) { + error = vm_run(ctx, vcpu, &vmexit[vcpu]); + if (error != 0) + break; + + prevcpu = vcpu; + + exitcode = vmexit[vcpu].exitcode; + if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) { + fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n", + exitcode); + exit(1); + } + + rc = (*handler[exitcode])(ctx, &vmexit[vcpu], &vcpu); + + switch (rc) { + case VMEXIT_CONTINUE: + break; + case VMEXIT_ABORT: + abort(); + default: + exit(1); + } + } + fprintf(stderr, "vm_run error %d, errno %d\n", error, errno); +} + +static int +num_vcpus_allowed(struct vmctx *ctx) +{ + int tmp, error; + + error = vm_get_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, &tmp); + + /* + * The guest is allowed to spinup more than one processor only if the + * UNRESTRICTED_GUEST capability is available. 
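+ * (Application processors are started in real mode, which Intel + * hardware cannot execute under VT-x without the unrestricted-guest + * feature.)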
+ */ + if (error == 0) + return (VM_MAXCPU); + else + return (1); +} + +void +fbsdrun_set_capabilities(struct vmctx *ctx, int cpu) +{ + int err, tmp; + + if (fbsdrun_vmexit_on_hlt()) { + err = vm_get_capability(ctx, cpu, VM_CAP_HALT_EXIT, &tmp); + if (err < 0) { + fprintf(stderr, "VM exit on HLT not supported\n"); + exit(1); + } + vm_set_capability(ctx, cpu, VM_CAP_HALT_EXIT, 1); + if (cpu == BSP) + handler[VM_EXITCODE_HLT] = vmexit_hlt; + } + + if (fbsdrun_vmexit_on_pause()) { + /* + * pause exit support required for this mode + */ + err = vm_get_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, &tmp); + if (err < 0) { + fprintf(stderr, + "SMP mux requested, no pause support\n"); + exit(1); + } + vm_set_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, 1); + if (cpu == BSP) + handler[VM_EXITCODE_PAUSE] = vmexit_pause; + } + + if (x2apic_mode) + err = vm_set_x2apic_state(ctx, cpu, X2APIC_ENABLED); + else + err = vm_set_x2apic_state(ctx, cpu, X2APIC_DISABLED); + + if (err) { + fprintf(stderr, "Unable to set x2apic state (%d)\n", err); + exit(1); + } + +#ifdef __FreeBSD__ + vm_set_capability(ctx, cpu, VM_CAP_ENABLE_INVPCID, 1); +#endif +} + +int +main(int argc, char *argv[]) +{ + int c, error, gdb_port, rfb_port, err, bvmcons; + int max_vcpus; + struct vmctx *ctx; + uint64_t rip; + size_t memsize; + + bvmcons = 0; + progname = basename(argv[0]); + gdb_port = 0; + rfb_port = -1; + guest_ncpus = 1; + memsize = 256 * MB; + + +#ifdef __FreeBSD__ + while ((c = getopt(argc, argv, "abehwxACHIPWYp:r:g:c:s:m:l:U:")) != -1) { +#else + while ((c = getopt(argc, argv, "abehwxHIPWYr:c:s:m:l:U:")) != -1) { +#endif + switch (c) { + case 'a': + x2apic_mode = 0; + break; +#ifdef __FreeBSD__ + case 'A': + acpi = 1; + break; +#endif + case 'b': + bvmcons = 1; + break; +#ifdef __FreeBSD__ + case 'p': + pincpu = atoi(optarg); + break; +#endif + case 'r': + if (optarg[0] == ':') + rfb_port = atoi(optarg + 1) + RFB_PORT; + else + rfb_port = atoi(optarg); + break; + case 'c': + guest_ncpus = atoi(optarg); + break; +#ifdef __FreeBSD__ + case 'g': + gdb_port = atoi(optarg); + break; +#endif + case 'l': + if (lpc_device_parse(optarg) != 0) { + errx(EX_USAGE, "invalid lpc device " + "configuration '%s'", optarg); + } + break; + case 's': + if (pci_parse_slot(optarg) != 0) + exit(1); + else + break; + case 'm': + error = vm_parse_memsize(optarg, &memsize); + if (error) + errx(EX_USAGE, "invalid memsize '%s'", optarg); + break; + case 'H': + guest_vmexit_on_hlt = 1; + break; + case 'I': + /* + * The "-I" option was used to add an ioapic to the + * virtual machine. + * + * An ioapic is now provided unconditionally for each + * virtual machine and this option is now deprecated. 
+ */ + break; + case 'P': + guest_vmexit_on_pause = 1; + break; + case 'e': + strictio = 1; + break; + case 'U': + guest_uuid_str = optarg; + break; + case 'W': + virtio_msix = 0; + break; + case 'x': + x2apic_mode = 1; + break; + case 'h': + usage(0); + default: + usage(1); + } + } + argc -= optind; + argv += optind; + + if (argc != 1) + usage(1); + + vmname = argv[0]; + + ctx = vm_open(vmname); + if (ctx == NULL) { + perror("vm_open"); + exit(1); + } + + max_vcpus = num_vcpus_allowed(ctx); + if (guest_ncpus > max_vcpus) { + fprintf(stderr, "%d vCPUs requested but only %d available\n", + guest_ncpus, max_vcpus); + exit(1); + } + + fbsdrun_set_capabilities(ctx, BSP); + + err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); + if (err) { + fprintf(stderr, "Unable to setup memory (%d)\n", err); + exit(1); + } + + error = init_msr(); + if (error) { + fprintf(stderr, "init_msr error %d", error); + exit(1); + } + + init_mem(); + init_inout(); + atkbdc_init(ctx); + pci_irq_init(ctx); + ioapic_init(ctx); + + rtc_init(ctx); + + /* + * Exit if a device emulation finds an error in it's initilization + */ + if (init_pci(ctx) != 0) + exit(1); + +#ifdef __FreeBSD__ + if (gdb_port != 0) + init_dbgport(gdb_port); +#endif + + if (bvmcons) + init_bvmcons(); + + console_init(); + vga_init(); + if (rfb_port != -1) + rfb_init(rfb_port); + + error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip); + assert(error == 0); + + /* + * build the guest tables, MP etc. + */ + mptable_build(ctx, guest_ncpus); + + error = smbios_build(ctx); + assert(error == 0); + +#ifdef __FreeBSD__ + if (acpi) { + error = acpi_build(ctx, guest_ncpus); + assert(error == 0); + } + + /* + * Change the proc title to include the VM name. + */ + setproctitle("%s", vmname); +#else + /* + * If applicable, wait for bhyveconsole + */ + if (bcons_wait) { + printf("Waiting for bhyveconsole connection...\n"); + (void) pthread_mutex_lock(&bcons_wait_lock); + while (!bcons_connected) { + (void) pthread_cond_wait(&bcons_wait_done, + &bcons_wait_lock); + } + (void) pthread_mutex_unlock(&bcons_wait_lock); + } +#endif + + /* + * Add CPU 0 + */ + fbsdrun_addcpu(ctx, BSP, BSP, rip); + + /* + * Head off to the main event dispatch loop + */ +#ifdef __FreeBSD__ + mevent_dispatch(); +#else + pthread_exit(NULL); +#endif + + exit(1); +} diff --git a/usr/src/cmd/bhyve/bhyverun.h b/usr/src/cmd/bhyve/bhyverun.h new file mode 100644 index 0000000000..be89314c09 --- /dev/null +++ b/usr/src/cmd/bhyve/bhyverun.h @@ -0,0 +1,73 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/bhyverun.h 277310 2015-01-18 03:08:30Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + */ + +#ifndef _FBSDRUN_H_ +#define _FBSDRUN_H_ + +#ifndef CTASSERT /* Allow lint to override */ +#define CTASSERT(x) _CTASSERT(x, __LINE__) +#define _CTASSERT(x, y) __CTASSERT(x, y) +#define __CTASSERT(x, y) typedef char __assert ## y[(x) ? 1 : -1] +#endif + +#define VMEXIT_CONTINUE (0) +#define VMEXIT_ABORT (-1) + +struct vmctx; +extern int guest_ncpus; +extern char *guest_uuid_str; +extern char *vmname; +#ifndef __FreeBSD__ +extern int bcons_wait; +extern int bcons_connected; +extern pthread_mutex_t bcons_wait_lock; +extern pthread_cond_t bcons_wait_done; +#endif + +void *paddr_guest2host(struct vmctx *ctx, uintptr_t addr, size_t len); + +void fbsdrun_set_capabilities(struct vmctx *ctx, int cpu); +void fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip); +int fbsdrun_muxed(void); +int fbsdrun_vmexit_on_hlt(void); +int fbsdrun_vmexit_on_pause(void); +int fbsdrun_disable_x2apic(void); +int fbsdrun_virtio_msix(void); +#endif diff --git a/usr/src/cmd/bhyve/block_if.c b/usr/src/cmd/bhyve/block_if.c new file mode 100644 index 0000000000..2da946d420 --- /dev/null +++ b/usr/src/cmd/bhyve/block_if.c @@ -0,0 +1,625 @@ +/*- + * Copyright (c) 2013 Peter Grehan + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/block_if.c 274330 2014-11-09 21:08:52Z tychon $ + */ + +#include +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/block_if.c 274330 2014-11-09 21:08:52Z tychon $"); + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "bhyverun.h" +#ifdef __FreeBSD__ +#include "mevent.h" +#endif +#include "block_if.h" + +#define BLOCKIF_SIG 0xb109b109 + +#define BLOCKIF_MAXREQ 33 + +enum blockop { + BOP_READ, + BOP_WRITE, + BOP_FLUSH +}; + +enum blockstat { + BST_FREE, + BST_PEND, + BST_BUSY, + BST_DONE +}; + +struct blockif_elem { + TAILQ_ENTRY(blockif_elem) be_link; + struct blockif_req *be_req; + enum blockop be_op; + enum blockstat be_status; + pthread_t be_tid; +}; + +struct blockif_ctxt { + int bc_magic; + int bc_fd; + int bc_rdonly; + off_t bc_size; + int bc_sectsz; + pthread_t bc_btid; + pthread_mutex_t bc_mtx; + pthread_cond_t bc_cond; + int bc_closing; + + /* Request elements and free/pending/busy queues */ + TAILQ_HEAD(, blockif_elem) bc_freeq; + TAILQ_HEAD(, blockif_elem) bc_pendq; + TAILQ_HEAD(, blockif_elem) bc_busyq; + u_int bc_req_count; + struct blockif_elem bc_reqs[BLOCKIF_MAXREQ]; +}; + +static pthread_once_t blockif_once = PTHREAD_ONCE_INIT; + +struct blockif_sig_elem { + pthread_mutex_t bse_mtx; + pthread_cond_t bse_cond; + int bse_pending; + struct blockif_sig_elem *bse_next; +}; + +static struct blockif_sig_elem *blockif_bse_head; + +static int +blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq, + enum blockop op) +{ + struct blockif_elem *be; + + assert(bc->bc_req_count < BLOCKIF_MAXREQ); + + be = TAILQ_FIRST(&bc->bc_freeq); + assert(be != NULL); + assert(be->be_status == BST_FREE); + + TAILQ_REMOVE(&bc->bc_freeq, be, be_link); + be->be_status = BST_PEND; + be->be_req = breq; + be->be_op = op; + TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link); + + bc->bc_req_count++; + + return (0); +} + +static int +blockif_dequeue(struct blockif_ctxt *bc, struct blockif_elem **bep) +{ + struct blockif_elem *be; + + if (bc->bc_req_count == 0) + return (ENOENT); + + be = TAILQ_FIRST(&bc->bc_pendq); + assert(be != NULL); + assert(be->be_status == BST_PEND); + TAILQ_REMOVE(&bc->bc_pendq, be, be_link); + be->be_status = BST_BUSY; + be->be_tid = bc->bc_btid; + TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link); + + *bep = be; + + return (0); +} + +static void +blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be) +{ + assert(be->be_status == BST_DONE); + + TAILQ_REMOVE(&bc->bc_busyq, be, be_link); + be->be_tid = 0; + be->be_status = BST_FREE; + be->be_req = NULL; + TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link); + + bc->bc_req_count--; +} + +static void +blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be) +{ + struct blockif_req *br; + int err; + + br = be->be_req; + err = 0; + + switch (be->be_op) { + case BOP_READ: + if (preadv(bc->bc_fd, br->br_iov, br->br_iovcnt, + br->br_offset) < 0) + err = 
errno; + break; + case BOP_WRITE: + if (bc->bc_rdonly) + err = EROFS; + else if (pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt, + br->br_offset) < 0) + err = errno; + break; + case BOP_FLUSH: + break; + default: + err = EINVAL; + break; + } + + be->be_status = BST_DONE; + + (*br->br_callback)(br, err); +} + +static void * +blockif_thr(void *arg) +{ + struct blockif_ctxt *bc; + struct blockif_elem *be; + + bc = arg; + + for (;;) { + pthread_mutex_lock(&bc->bc_mtx); + while (!blockif_dequeue(bc, &be)) { + pthread_mutex_unlock(&bc->bc_mtx); + blockif_proc(bc, be); + pthread_mutex_lock(&bc->bc_mtx); + blockif_complete(bc, be); + } + pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx); + pthread_mutex_unlock(&bc->bc_mtx); + + /* + * Check ctxt status here to see if exit requested + */ + if (bc->bc_closing) + pthread_exit(NULL); + } + + /* Not reached */ + return (NULL); +} + +#ifdef __FreeBSD__ +static void +blockif_sigcont_handler(int signal, enum ev_type type, void *arg) +#else +static void +blockif_sigcont_handler(int signal) +#endif +{ + struct blockif_sig_elem *bse; + + for (;;) { + /* + * Process the entire list even if not intended for + * this thread. + */ + do { + bse = blockif_bse_head; + if (bse == NULL) + return; + } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, + (uintptr_t)bse, + (uintptr_t)bse->bse_next)); + + pthread_mutex_lock(&bse->bse_mtx); + bse->bse_pending = 0; + pthread_cond_signal(&bse->bse_cond); + pthread_mutex_unlock(&bse->bse_mtx); + } +} + +static void +blockif_init(void) +{ +#ifdef __FreeBSD__ + mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL); + (void) signal(SIGCONT, SIG_IGN); +#else + (void) sigset(SIGCONT, blockif_sigcont_handler); +#endif +} + +struct blockif_ctxt * +blockif_open(const char *optstr, const char *ident) +{ + char tname[MAXCOMLEN + 1]; + char *nopt, *xopts; + struct blockif_ctxt *bc; + struct stat sbuf; + off_t size; + int extra, fd, i, sectsz; + int nocache, sync, ro; + + pthread_once(&blockif_once, blockif_init); + + nocache = 0; + sync = 0; + ro = 0; + + /* + * The first element in the optstring is always a pathname. + * Optional elements follow + */ + nopt = strdup(optstr); + for (xopts = strtok(nopt, ","); + xopts != NULL; + xopts = strtok(NULL, ",")) { + if (!strcmp(xopts, "nocache")) + nocache = 1; + else if (!strcmp(xopts, "sync")) + sync = 1; + else if (!strcmp(xopts, "ro")) + ro = 1; + } + + extra = 0; + if (nocache) + extra |= O_DIRECT; + if (sync) + extra |= O_SYNC; + + fd = open(nopt, (ro ? 
O_RDONLY : O_RDWR) | extra); + if (fd < 0 && !ro) { + /* The r/w open failed; retry read-only */ + fd = open(nopt, O_RDONLY | extra); + ro = 1; + } + + if (fd < 0) { + perror("Could not open backing file"); + return (NULL); + } + + if (fstat(fd, &sbuf) < 0) { + perror("Could not stat backing file"); + close(fd); + return (NULL); + } + + /* + * Deal with raw devices + */ + size = sbuf.st_size; + sectsz = DEV_BSIZE; +#ifdef __FreeBSD__ + if (S_ISCHR(sbuf.st_mode)) { + if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 || + ioctl(fd, DIOCGSECTORSIZE, &sectsz)) { + perror("Could not fetch dev blk/sector size"); + close(fd); + return (NULL); + } + assert(size != 0); + assert(sectsz != 0); + } +#endif + + bc = calloc(1, sizeof(struct blockif_ctxt)); + if (bc == NULL) { + close(fd); + return (NULL); + } + + bc->bc_magic = BLOCKIF_SIG; + bc->bc_fd = fd; + bc->bc_rdonly = ro; + bc->bc_size = size; + bc->bc_sectsz = sectsz; + pthread_mutex_init(&bc->bc_mtx, NULL); + pthread_cond_init(&bc->bc_cond, NULL); + TAILQ_INIT(&bc->bc_freeq); + TAILQ_INIT(&bc->bc_pendq); + TAILQ_INIT(&bc->bc_busyq); + bc->bc_req_count = 0; + for (i = 0; i < BLOCKIF_MAXREQ; i++) { + bc->bc_reqs[i].be_status = BST_FREE; + TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link); + } + + pthread_create(&bc->bc_btid, NULL, blockif_thr, bc); + + snprintf(tname, sizeof(tname), "blk-%s", ident); + pthread_set_name_np(bc->bc_btid, tname); + + return (bc); +} + +static int +blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq, + enum blockop op) +{ + int err; + + err = 0; + + pthread_mutex_lock(&bc->bc_mtx); + if (bc->bc_req_count < BLOCKIF_MAXREQ) { + /* + * Enqueue and inform the block i/o thread + * that there is work available + */ + blockif_enqueue(bc, breq, op); + pthread_cond_signal(&bc->bc_cond); + } else { + /* + * Callers are not allowed to enqueue more than + * the specified blockif queue limit. Return an + * error to indicate that the queue length has been + * exceeded. + */ + err = E2BIG; + } + pthread_mutex_unlock(&bc->bc_mtx); + + return (err); +} + +int +blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + return (blockif_request(bc, breq, BOP_READ)); +} + +int +blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + return (blockif_request(bc, breq, BOP_WRITE)); +} + +int +blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + return (blockif_request(bc, breq, BOP_FLUSH)); +} + +int +blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq) +{ + struct blockif_elem *be; + + assert(bc->bc_magic == BLOCKIF_SIG); + + pthread_mutex_lock(&bc->bc_mtx); + /* + * Check pending requests. + */ + TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { + if (be->be_req == breq) + break; + } + if (be != NULL) { + /* + * Found it. + */ + TAILQ_REMOVE(&bc->bc_pendq, be, be_link); + be->be_status = BST_FREE; + be->be_req = NULL; + TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link); + bc->bc_req_count--; + pthread_mutex_unlock(&bc->bc_mtx); + + return (0); + } + + /* + * Check in-flight requests. + */ + TAILQ_FOREACH(be, &bc->bc_busyq, be_link) { + if (be->be_req == breq) + break; + } + if (be == NULL) { + /* + * Didn't find it. + */ + pthread_mutex_unlock(&bc->bc_mtx); + return (EINVAL); + } + + /* + * Interrupt the processing thread to force it to return + * prematurely via its normal callback path.
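+ * This is done by pushing a blockif_sig_elem onto the global lock-free + * list and delivering SIGCONT to the worker thread; the signal handler + * then walks the list and wakes each waiter once the blocking i/o has + * been interrupted.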
+ */ + while (be->be_status == BST_BUSY) { + struct blockif_sig_elem bse, *old_head; + + pthread_mutex_init(&bse.bse_mtx, NULL); + pthread_cond_init(&bse.bse_cond, NULL); + + bse.bse_pending = 1; + + do { + old_head = blockif_bse_head; + bse.bse_next = old_head; + } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, + (uintptr_t)old_head, + (uintptr_t)&bse)); + + pthread_kill(be->be_tid, SIGCONT); + + pthread_mutex_lock(&bse.bse_mtx); + while (bse.bse_pending) + pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx); + pthread_mutex_unlock(&bse.bse_mtx); + } + + pthread_mutex_unlock(&bc->bc_mtx); + + /* + * The processing thread has been interrupted. Since it's not + * clear if the callback has been invoked yet, return EBUSY. + */ + return (EBUSY); +} + +int +blockif_close(struct blockif_ctxt *bc) +{ + void *jval; + int err; + + err = 0; + + assert(bc->bc_magic == BLOCKIF_SIG); + + /* + * Stop the block i/o thread + */ + bc->bc_closing = 1; + pthread_cond_signal(&bc->bc_cond); + pthread_join(bc->bc_btid, &jval); + + /* XXX Cancel queued i/o's ??? */ + + /* + * Release resources + */ + bc->bc_magic = 0; + close(bc->bc_fd); + free(bc); + + return (0); +} + +/* + * Return virtual C/H/S values for a given block. Use the algorithm + * outlined in the VHD specification to calculate values. + */ +void +blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s) +{ + off_t sectors; /* total sectors of the block dev */ + off_t hcyl; /* cylinders times heads */ + uint16_t secpt; /* sectors per track */ + uint8_t heads; + + assert(bc->bc_magic == BLOCKIF_SIG); + + sectors = bc->bc_size / bc->bc_sectsz; + + /* Clamp the size to the largest possible with CHS */ + if (sectors > 65535UL*16*255) + sectors = 65535UL*16*255; + + if (sectors >= 65536UL*16*63) { + secpt = 255; + heads = 16; + hcyl = sectors / secpt; + } else { + secpt = 17; + hcyl = sectors / secpt; + heads = (hcyl + 1023) / 1024; + + if (heads < 4) + heads = 4; + + if (hcyl >= (heads * 1024) || heads > 16) { + secpt = 31; + heads = 16; + hcyl = sectors / secpt; + } + if (hcyl >= (heads * 1024)) { + secpt = 63; + heads = 16; + hcyl = sectors / secpt; + } + } + + *c = hcyl / heads; + *h = heads; + *s = secpt; +} + +/* + * Accessors + */ +off_t +blockif_size(struct blockif_ctxt *bc) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + return (bc->bc_size); +} + +int +blockif_sectsz(struct blockif_ctxt *bc) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + return (bc->bc_sectsz); +} + +int +blockif_queuesz(struct blockif_ctxt *bc) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + return (BLOCKIF_MAXREQ - 1); +} + +int +blockif_is_ro(struct blockif_ctxt *bc) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + return (bc->bc_rdonly); +} diff --git a/usr/src/cmd/bhyve/block_if.h b/usr/src/cmd/bhyve/block_if.h new file mode 100644 index 0000000000..5ef120933c --- /dev/null +++ b/usr/src/cmd/bhyve/block_if.h @@ -0,0 +1,70 @@ +/*- + * Copyright (c) 2013 Peter Grehan + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/block_if.h 268638 2014-07-15 00:25:54Z grehan $ + */ + +/* + * The block API to be used by bhyve block-device emulations. The routines + * are thread safe, with no assumptions about the context of the completion + * callback - it may occur in the caller's context, or asynchronously in + * another thread. + */ + +#ifndef _BLOCK_IF_H_ +#define _BLOCK_IF_H_ + +#include +#include + +#ifdef __FreeBSD__ +#define BLOCKIF_IOV_MAX 32 /* not practical to be IOV_MAX */ +#else +#define BLOCKIF_IOV_MAX 16 /* not practical to be IOV_MAX */ +#endif + +struct blockif_req { + struct iovec br_iov[BLOCKIF_IOV_MAX]; + int br_iovcnt; + off_t br_offset; + void (*br_callback)(struct blockif_req *req, int err); + void *br_param; +}; + +struct blockif_ctxt; +struct blockif_ctxt *blockif_open(const char *optstr, const char *ident); +off_t blockif_size(struct blockif_ctxt *bc); +void blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, + uint8_t *s); +int blockif_sectsz(struct blockif_ctxt *bc); +int blockif_queuesz(struct blockif_ctxt *bc); +int blockif_is_ro(struct blockif_ctxt *bc); +int blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq); +int blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq); +int blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq); +int blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq); +int blockif_close(struct blockif_ctxt *bc); + +#endif /* _BLOCK_IF_H_ */ diff --git a/usr/src/cmd/bhyve/console.c b/usr/src/cmd/bhyve/console.c new file mode 100644 index 0000000000..a8d07709be --- /dev/null +++ b/usr/src/cmd/bhyve/console.c @@ -0,0 +1,101 @@ +/*- + * Copyright (c) 2015 Tycho Nightingale + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#include "bhyvegc.h" +#include "console.h" + +static struct { + struct bhyvegc *gc; + + fb_render_func_t fb_render_cb; + void *fb_arg; + + kbd_event_func_t kbd_event_cb; + void *kbd_arg; + + ptr_event_func_t ptr_event_cb; + void *ptr_arg; +} console; + +void +console_init(void) +{ + console.gc = bhyvegc_init(640, 400); +} + +struct bhyvegc_image * +console_get_image(void) +{ + struct bhyvegc_image *bhyvegc_image; + + bhyvegc_image = bhyvegc_get_image(console.gc); + + return (bhyvegc_image); +} + +void +console_fb_register(fb_render_func_t render_cb, void *arg) +{ + console.fb_render_cb = render_cb; + console.fb_arg = arg; +} + +void +console_refresh(void) +{ + (*console.fb_render_cb)(console.gc, console.fb_arg); +} + +void +console_kbd_register(kbd_event_func_t event_cb, void *arg) +{ + console.kbd_event_cb = event_cb; + console.kbd_arg = arg; +} + +void +console_ptr_register(ptr_event_func_t event_cb, void *arg) +{ + console.ptr_event_cb = event_cb; + console.ptr_arg = arg; +} + +void +console_key_event(int down, uint32_t keysym) +{ + (*console.kbd_event_cb)(down, keysym, console.kbd_arg); +} + +void +console_ptr_event(uint8_t button, int x, int y) +{ + (*console.ptr_event_cb)(button, x, y, console.ptr_arg); +} diff --git a/usr/src/cmd/bhyve/console.h b/usr/src/cmd/bhyve/console.h new file mode 100644 index 0000000000..bffb7c2456 --- /dev/null +++ b/usr/src/cmd/bhyve/console.h @@ -0,0 +1,50 @@ +/*- + * Copyright (c) 2015 Tycho Nightingale + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _CONSOLE_H_ +#define _CONSOLE_H_ + +struct bhyvegc; + +typedef void (*fb_render_func_t)(struct bhyvegc *gc, void *arg); +typedef void (*kbd_event_func_t)(int down, uint32_t keysym, void *arg); +typedef void (*ptr_event_func_t)(uint8_t mask, int x, int y, void *arg); + +void console_init(void); +struct bhyvegc_image *console_get_image(void); + +void console_fb_register(fb_render_func_t render_cb, void *arg); +void console_refresh(void); + +void console_kbd_register(kbd_event_func_t event_cb, void *arg); +void console_key_event(int down, uint32_t keysym); + +void console_ptr_register(ptr_event_func_t event_cb, void *arg); +void console_ptr_event(uint8_t button, int x, int y); + +#endif /* _CONSOLE_H_ */ diff --git a/usr/src/cmd/bhyve/consport.c b/usr/src/cmd/bhyve/consport.c new file mode 100644 index 0000000000..69b6dfddf1 --- /dev/null +++ b/usr/src/cmd/bhyve/consport.c @@ -0,0 +1,155 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD: head/usr.sbin/bhyve/consport.c 264277 2014-04-08 21:02:03Z jhb $ + */ + +#include +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/consport.c 264277 2014-04-08 21:02:03Z jhb $"); + +#include +#include + +#include +#include +#include +#include +#include + +#include "inout.h" +#include "pci_lpc.h" + +#define BVM_CONSOLE_PORT 0x220 +#define BVM_CONS_SIG ('b' << 8 | 'v') + +static struct termios tio_orig, tio_new; + +static void +ttyclose(void) +{ + tcsetattr(STDIN_FILENO, TCSANOW, &tio_orig); +} + +static void +ttyopen(void) +{ +#ifdef __FreeBSD__ + tcgetattr(STDIN_FILENO, &tio_orig); + + cfmakeraw(&tio_new); + tcsetattr(STDIN_FILENO, TCSANOW, &tio_new); + + atexit(ttyclose); +#endif +} + +static bool +tty_char_available(void) +{ + fd_set rfds; + struct timeval tv; + + FD_ZERO(&rfds); + FD_SET(STDIN_FILENO, &rfds); + tv.tv_sec = 0; + tv.tv_usec = 0; + if (select(STDIN_FILENO + 1, &rfds, NULL, NULL, &tv) > 0) { + return (true); + } else { + return (false); + } +} + +static int +ttyread(void) +{ + char rb; + + if (tty_char_available()) { + read(STDIN_FILENO, &rb, 1); + return (rb & 0xff); + } else { + return (-1); + } +} + +static void +ttywrite(unsigned char wb) +{ + (void) write(STDOUT_FILENO, &wb, 1); +} + +static int +console_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + static int opened; + + if (bytes == 2 && in) { + *eax = BVM_CONS_SIG; + return (0); + } + + /* + * Guests might probe this port to look for old ISA devices + * using single-byte reads. Return 0xff for those. + */ + if (bytes == 1 && in) { + *eax = 0xff; + return (0); + } + + if (bytes != 4) + return (-1); + + if (!opened) { + ttyopen(); + opened = 1; + } + + if (in) + *eax = ttyread(); + else + ttywrite(*eax); + + return (0); +} + +SYSRES_IO(BVM_CONSOLE_PORT, 4); + +static struct inout_port consport = { + "bvmcons", + BVM_CONSOLE_PORT, + 1, + IOPORT_F_INOUT, + console_handler +}; + +void +init_bvmcons(void) +{ + + register_inout(&consport); +} diff --git a/usr/src/cmd/bhyve/dbgport.h b/usr/src/cmd/bhyve/dbgport.h new file mode 100644 index 0000000000..b95df0bd31 --- /dev/null +++ b/usr/src/cmd/bhyve/dbgport.h @@ -0,0 +1,34 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD: head/usr.sbin/bhyve/dbgport.h 256156 2013-10-08 16:36:17Z neel $ + */ + +#ifndef _DBGPORT_H_ +#define _DBGPORT_H_ + +void init_dbgport(int port); + +#endif diff --git a/usr/src/cmd/bhyve/inout.c b/usr/src/cmd/bhyve/inout.c new file mode 100644 index 0000000000..510649893a --- /dev/null +++ b/usr/src/cmd/bhyve/inout.c @@ -0,0 +1,297 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/inout.c 277310 2015-01-18 03:08:30Z neel $ + */ + +#include +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/inout.c 277310 2015-01-18 03:08:30Z neel $"); + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include +#include +#include + +#include "bhyverun.h" +#include "inout.h" + +SET_DECLARE(inout_port_set, struct inout_port); + +#define MAX_IOPORTS (1 << 16) + +#define VERIFY_IOPORT(port, size) \ + assert((port) >= 0 && (size) > 0 && ((port) + (size)) <= MAX_IOPORTS) + +static struct { + const char *name; + int flags; + inout_func_t handler; + void *arg; +} inout_handlers[MAX_IOPORTS]; + +static int +default_inout(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + if (in) { + switch (bytes) { + case 4: + *eax = 0xffffffff; + break; + case 2: + *eax = 0xffff; + break; + case 1: + *eax = 0xff; + break; + } + } + + return (0); +} + +static void +register_default_iohandler(int start, int size) +{ + struct inout_port iop; + + VERIFY_IOPORT(start, size); + + bzero(&iop, sizeof(iop)); + iop.name = "default"; + iop.port = start; + iop.size = size; + iop.flags = IOPORT_F_INOUT | IOPORT_F_DEFAULT; + iop.handler = default_inout; + + register_inout(&iop); +} + +int +emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict) +{ + int addrsize, bytes, flags, in, port, prot, rep; + uint32_t eax, val; + inout_func_t handler; + void *arg; + int error, retval; + enum vm_reg_name idxreg; + uint64_t gla, index, iterations, count; + struct vm_inout_str *vis; + struct iovec iov[2]; + + bytes = vmexit->u.inout.bytes; + in = vmexit->u.inout.in; + port = vmexit->u.inout.port; + + assert(port < MAX_IOPORTS); + assert(bytes == 1 || bytes == 2 || bytes == 4); + + handler = inout_handlers[port].handler; + + 
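/* + * In strict mode, a port that is still owned by the catch-all default + * handler counts as an unhandled access, so the caller can abort the + * guest rather than silently complete the i/o. + */ +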
if (strict && handler == default_inout) + return (-1); + + flags = inout_handlers[port].flags; + arg = inout_handlers[port].arg; + + if (in) { + if (!(flags & IOPORT_F_IN)) + return (-1); + } else { + if (!(flags & IOPORT_F_OUT)) + return (-1); + } + + retval = 0; + if (vmexit->u.inout.string) { + vis = &vmexit->u.inout_str; + rep = vis->inout.rep; + addrsize = vis->addrsize; + prot = in ? PROT_WRITE : PROT_READ; + assert(addrsize == 2 || addrsize == 4 || addrsize == 8); + + /* Index register */ + idxreg = in ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI; + index = vis->index & vie_size2mask(addrsize); + + /* Count register */ + count = vis->count & vie_size2mask(addrsize); + + /* Limit number of back-to-back in/out emulations to 16 */ + iterations = MIN(count, 16); + while (iterations > 0) { + assert(retval == 0); + if (vie_calculate_gla(vis->paging.cpu_mode, + vis->seg_name, &vis->seg_desc, index, bytes, + addrsize, prot, &gla)) { + vm_inject_gp(ctx, vcpu); + break; + } + + error = vm_copy_setup(ctx, vcpu, &vis->paging, gla, + bytes, prot, iov, nitems(iov)); + if (error == -1) { + retval = -1; /* Unrecoverable error */ + break; + } else if (error == 1) { + retval = 0; /* Resume guest to handle fault */ + break; + } + + if (vie_alignment_check(vis->paging.cpl, bytes, + vis->cr0, vis->rflags, gla)) { + vm_inject_ac(ctx, vcpu, 0); + break; + } + + val = 0; + if (!in) + vm_copyin(ctx, vcpu, iov, &val, bytes); + + retval = handler(ctx, vcpu, in, port, bytes, &val, arg); + if (retval != 0) + break; + + if (in) + vm_copyout(ctx, vcpu, &val, iov, bytes); + + /* Update index */ + if (vis->rflags & PSL_D) + index -= bytes; + else + index += bytes; + + count--; + iterations--; + } + + /* Update index register */ + error = vie_update_register(ctx, vcpu, idxreg, index, addrsize); + assert(error == 0); + + /* + * Update count register only if the instruction had a repeat + * prefix. + */ + if (rep) { + error = vie_update_register(ctx, vcpu, VM_REG_GUEST_RCX, + count, addrsize); + assert(error == 0); + } + + /* Restart the instruction if more iterations remain */ + if (retval == 0 && count != 0) { + error = vm_restart_instruction(ctx, vcpu); + assert(error == 0); + } + } else { + eax = vmexit->u.inout.eax; + val = eax & vie_size2mask(bytes); + retval = handler(ctx, vcpu, in, port, bytes, &val, arg); + if (retval == 0 && in) { + eax &= ~vie_size2mask(bytes); + eax |= val & vie_size2mask(bytes); + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, + eax); + assert(error == 0); + } + } + return (retval); +} + +void +init_inout(void) +{ + struct inout_port **iopp, *iop; + + /* + * Set up the default handler for all ports + */ + register_default_iohandler(0, MAX_IOPORTS); + + /* + * Overwrite with specified handlers + */ + SET_FOREACH(iopp, inout_port_set) { + iop = *iopp; + assert(iop->port < MAX_IOPORTS); + inout_handlers[iop->port].name = iop->name; + inout_handlers[iop->port].flags = iop->flags; + inout_handlers[iop->port].handler = iop->handler; + inout_handlers[iop->port].arg = NULL; + } +} + +int +register_inout(struct inout_port *iop) +{ + int i; + + VERIFY_IOPORT(iop->port, iop->size); + + /* + * Verify that the new registration is not overwriting an already + * allocated i/o range. 
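+ * Default registrations are exempt from this check, since they are + * how port ranges are initialized and later handed back to the + * catch-all handler.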
+ */ + if ((iop->flags & IOPORT_F_DEFAULT) == 0) { + for (i = iop->port; i < iop->port + iop->size; i++) { + if ((inout_handlers[i].flags & IOPORT_F_DEFAULT) == 0) + return (-1); + } + } + + for (i = iop->port; i < iop->port + iop->size; i++) { + inout_handlers[i].name = iop->name; + inout_handlers[i].flags = iop->flags; + inout_handlers[i].handler = iop->handler; + inout_handlers[i].arg = iop->arg; + } + + return (0); +} + +int +unregister_inout(struct inout_port *iop) +{ + + VERIFY_IOPORT(iop->port, iop->size); + assert(inout_handlers[iop->port].name == iop->name); + + register_default_iohandler(iop->port, iop->size); + + return (0); +} diff --git a/usr/src/cmd/bhyve/inout.h b/usr/src/cmd/bhyve/inout.h new file mode 100644 index 0000000000..0d4046bd61 --- /dev/null +++ b/usr/src/cmd/bhyve/inout.h @@ -0,0 +1,91 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/inout.h 269094 2014-07-25 20:18:35Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _INOUT_H_ +#define _INOUT_H_ + +#include + +struct vmctx; +struct vm_exit; + +/* + * inout emulation handlers return 0 on success and -1 on failure. + */ +typedef int (*inout_func_t)(struct vmctx *ctx, int vcpu, int in, int port, + int bytes, uint32_t *eax, void *arg); + +struct inout_port { + const char *name; + int port; + int size; + int flags; + inout_func_t handler; + void *arg; +}; +#define IOPORT_F_IN 0x1 +#define IOPORT_F_OUT 0x2 +#define IOPORT_F_INOUT (IOPORT_F_IN | IOPORT_F_OUT) + +/* + * The following flags are used internally and must not be used by + * device models. 
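+ * IOPORT_F_DEFAULT marks a port as still owned by the catch-all + * handler; register_inout() relies on it to detect collisions between + * device models.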
+ */ +#define IOPORT_F_DEFAULT 0x80000000 /* claimed by default handler */ + +#define INOUT_PORT(name, port, flags, handler) \ + static struct inout_port __CONCAT(__inout_port, __LINE__) = { \ + #name, \ + (port), \ + 1, \ + (flags), \ + (handler), \ + 0 \ + }; \ + DATA_SET(inout_port_set, __CONCAT(__inout_port, __LINE__)) + +void init_inout(void); +int emulate_inout(struct vmctx *, int vcpu, struct vm_exit *vmexit, + int strict); +int register_inout(struct inout_port *iop); +int unregister_inout(struct inout_port *iop); +void init_bvmcons(void); + +#endif /* _INOUT_H_ */ diff --git a/usr/src/cmd/bhyve/ioapic.c b/usr/src/cmd/bhyve/ioapic.c new file mode 100644 index 0000000000..86ff5c6580 --- /dev/null +++ b/usr/src/cmd/bhyve/ioapic.c @@ -0,0 +1,74 @@ +/*- + * Copyright (c) 2014 Advanced Computing Technologies LLC + * Written by: John H. Baldwin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/ioapic.c 261268 2014-01-29 14:56:48Z jhb $"); + +#include + +#include +#include + +#include "ioapic.h" + +/* + * Assign PCI INTx interrupts to I/O APIC pins in a round-robin + * fashion. Note that we have no idea what the HPET is using, but the + * HPET is also programmable whereas this is intended for hardwired + * PCI interrupts. + * + * This assumes a single I/O APIC where pins >= 16 are permitted for + * PCI devices. + */ +static int pci_pins; + +void +ioapic_init(struct vmctx *ctx) +{ + + if (vm_ioapic_pincount(ctx, &pci_pins) < 0) { + pci_pins = 0; + return; + } + + /* Ignore the first 16 pins. */ + if (pci_pins <= 16) { + pci_pins = 0; + return; + } + pci_pins -= 16; +} + +int +ioapic_pci_alloc_irq(void) +{ + static int last_pin; + + if (pci_pins == 0) + return (-1); + return (16 + (last_pin++ % pci_pins)); +} diff --git a/usr/src/cmd/bhyve/ioapic.h b/usr/src/cmd/bhyve/ioapic.h new file mode 100644 index 0000000000..789f90fea9 --- /dev/null +++ b/usr/src/cmd/bhyve/ioapic.h @@ -0,0 +1,39 @@ +/*- + * Copyright (c) 2014 Advanced Computing Technologies LLC + * Written by: John H. Baldwin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/ioapic.h 261268 2014-01-29 14:56:48Z jhb $ + */ + +#ifndef _IOAPIC_H_ +#define _IOAPIC_H_ + +/* + * Allocate a PCI IRQ from the I/O APIC. + */ +void ioapic_init(struct vmctx *ctx); +int ioapic_pci_alloc_irq(void); + +#endif diff --git a/usr/src/cmd/bhyve/mem.c b/usr/src/cmd/bhyve/mem.c new file mode 100644 index 0000000000..a153a8e960 --- /dev/null +++ b/usr/src/cmd/bhyve/mem.c @@ -0,0 +1,291 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/mem.c 269700 2014-08-08 03:49:01Z neel $ + */ + +/* + * Memory ranges are represented with an RB tree. On insertion, the range + * is checked for overlaps. On lookup, the key has the same base and limit + * so it can be searched within the range. 
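+ *
+ * Because mmio_rb_range_compare() only reports inequality when two
+ * ranges are disjoint, a key whose base and end are both 'addr'
+ * compares equal to any registered range containing 'addr', which is
+ * what lets RB_FIND() return the enclosing registration.  A device
+ * model typically registers a range along the lines of this
+ * (hypothetical values) sketch:
+ *
+ *	struct mem_range mr = {
+ *		.name = "foo-bar0",
+ *		.flags = MEM_F_RW,
+ *		.handler = foo_mmio_handler,
+ *		.base = 0xd0000000,
+ *		.size = 0x1000,
+ *	};
+ *	int error = register_mem(&mr);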
+ */ + +#include +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/mem.c 269700 2014-08-08 03:49:01Z neel $"); + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "mem.h" + +struct mmio_rb_range { + RB_ENTRY(mmio_rb_range) mr_link; /* RB tree links */ + struct mem_range mr_param; + uint64_t mr_base; + uint64_t mr_end; +}; + +struct mmio_rb_tree; +RB_PROTOTYPE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare); + +RB_HEAD(mmio_rb_tree, mmio_rb_range) mmio_rb_root, mmio_rb_fallback; + +/* + * Per-vCPU cache. Since most accesses from a vCPU will be to + * consecutive addresses in a range, it makes sense to cache the + * result of a lookup. + */ +static struct mmio_rb_range *mmio_hint[VM_MAXCPU]; + +static pthread_rwlock_t mmio_rwlock; + +static int +mmio_rb_range_compare(struct mmio_rb_range *a, struct mmio_rb_range *b) +{ + if (a->mr_end < b->mr_base) + return (-1); + else if (a->mr_base > b->mr_end) + return (1); + return (0); +} + +static int +mmio_rb_lookup(struct mmio_rb_tree *rbt, uint64_t addr, + struct mmio_rb_range **entry) +{ + struct mmio_rb_range find, *res; + + find.mr_base = find.mr_end = addr; + + res = RB_FIND(mmio_rb_tree, rbt, &find); + + if (res != NULL) { + *entry = res; + return (0); + } + + return (ENOENT); +} + +static int +mmio_rb_add(struct mmio_rb_tree *rbt, struct mmio_rb_range *new) +{ + struct mmio_rb_range *overlap; + + overlap = RB_INSERT(mmio_rb_tree, rbt, new); + + if (overlap != NULL) { +#ifdef RB_DEBUG + printf("overlap detected: new %lx:%lx, tree %lx:%lx\n", + new->mr_base, new->mr_end, + overlap->mr_base, overlap->mr_end); +#endif + + return (EEXIST); + } + + return (0); +} + +#if 0 +static void +mmio_rb_dump(struct mmio_rb_tree *rbt) +{ + struct mmio_rb_range *np; + + pthread_rwlock_rdlock(&mmio_rwlock); + RB_FOREACH(np, mmio_rb_tree, rbt) { + printf(" %lx:%lx, %s\n", np->mr_base, np->mr_end, + np->mr_param.name); + } + pthread_rwlock_unlock(&mmio_rwlock); +} +#endif + +RB_GENERATE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare); + +static int +mem_read(void *ctx, int vcpu, uint64_t gpa, uint64_t *rval, int size, void *arg) +{ + int error; + struct mem_range *mr = arg; + + error = (*mr->handler)(ctx, vcpu, MEM_F_READ, gpa, size, + rval, mr->arg1, mr->arg2); + return (error); +} + +static int +mem_write(void *ctx, int vcpu, uint64_t gpa, uint64_t wval, int size, void *arg) +{ + int error; + struct mem_range *mr = arg; + + error = (*mr->handler)(ctx, vcpu, MEM_F_WRITE, gpa, size, + &wval, mr->arg1, mr->arg2); + return (error); +} + +int +emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie, + struct vm_guest_paging *paging) + +{ + struct mmio_rb_range *entry; + int err, immutable; + + pthread_rwlock_rdlock(&mmio_rwlock); + /* + * First check the per-vCPU cache + */ + if (mmio_hint[vcpu] && + paddr >= mmio_hint[vcpu]->mr_base && + paddr <= mmio_hint[vcpu]->mr_end) { + entry = mmio_hint[vcpu]; + } else + entry = NULL; + + if (entry == NULL) { + if (mmio_rb_lookup(&mmio_rb_root, paddr, &entry) == 0) { + /* Update the per-vCPU cache */ + mmio_hint[vcpu] = entry; + } else if (mmio_rb_lookup(&mmio_rb_fallback, paddr, &entry)) { + pthread_rwlock_unlock(&mmio_rwlock); + return (ESRCH); + } + } + + assert(entry != NULL); + + /* + * An 'immutable' memory range is guaranteed to be never removed + * so there is no need to hold 'mmio_rwlock' while calling the + * handler. + * + * XXX writes to the PCIR_COMMAND register can cause register_mem() + * to be called. 
If the guest is using PCI extended config space + * to modify the PCIR_COMMAND register then register_mem() can + * deadlock on 'mmio_rwlock'. However by registering the extended + * config space window as 'immutable' the deadlock can be avoided. + */ + immutable = (entry->mr_param.flags & MEM_F_IMMUTABLE); + if (immutable) + pthread_rwlock_unlock(&mmio_rwlock); + + err = vmm_emulate_instruction(ctx, vcpu, paddr, vie, paging, + mem_read, mem_write, &entry->mr_param); + + if (!immutable) + pthread_rwlock_unlock(&mmio_rwlock); + + return (err); +} + +static int +register_mem_int(struct mmio_rb_tree *rbt, struct mem_range *memp) +{ + struct mmio_rb_range *entry, *mrp; + int err; + + err = 0; + + mrp = malloc(sizeof(struct mmio_rb_range)); + + if (mrp != NULL) { + mrp->mr_param = *memp; + mrp->mr_base = memp->base; + mrp->mr_end = memp->base + memp->size - 1; + pthread_rwlock_wrlock(&mmio_rwlock); + if (mmio_rb_lookup(rbt, memp->base, &entry) != 0) + err = mmio_rb_add(rbt, mrp); + pthread_rwlock_unlock(&mmio_rwlock); + if (err) + free(mrp); + } else + err = ENOMEM; + + return (err); +} + +int +register_mem(struct mem_range *memp) +{ + + return (register_mem_int(&mmio_rb_root, memp)); +} + +int +register_mem_fallback(struct mem_range *memp) +{ + + return (register_mem_int(&mmio_rb_fallback, memp)); +} + +int +unregister_mem(struct mem_range *memp) +{ + struct mem_range *mr; + struct mmio_rb_range *entry = NULL; + int err, i; + + pthread_rwlock_wrlock(&mmio_rwlock); + err = mmio_rb_lookup(&mmio_rb_root, memp->base, &entry); + if (err == 0) { + mr = &entry->mr_param; + assert(mr->name == memp->name); + assert(mr->base == memp->base && mr->size == memp->size); + assert((mr->flags & MEM_F_IMMUTABLE) == 0); + RB_REMOVE(mmio_rb_tree, &mmio_rb_root, entry); + + /* flush Per-vCPU cache */ + for (i=0; i < VM_MAXCPU; i++) { + if (mmio_hint[i] == entry) + mmio_hint[i] = NULL; + } + } + pthread_rwlock_unlock(&mmio_rwlock); + + if (entry) + free(entry); + + return (err); +} + +void +init_mem(void) +{ + + RB_INIT(&mmio_rb_root); + RB_INIT(&mmio_rb_fallback); + pthread_rwlock_init(&mmio_rwlock, NULL); +} diff --git a/usr/src/cmd/bhyve/mem.h b/usr/src/cmd/bhyve/mem.h new file mode 100644 index 0000000000..09cf56b72e --- /dev/null +++ b/usr/src/cmd/bhyve/mem.h @@ -0,0 +1,61 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/mem.h 269700 2014-08-08 03:49:01Z neel $ + */ + +#ifndef _MEM_H_ +#define _MEM_H_ + +#include + +struct vmctx; + +typedef int (*mem_func_t)(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, + int size, uint64_t *val, void *arg1, long arg2); + +struct mem_range { + const char *name; + int flags; + mem_func_t handler; + void *arg1; + long arg2; + uint64_t base; + uint64_t size; +}; +#define MEM_F_READ 0x1 +#define MEM_F_WRITE 0x2 +#define MEM_F_RW 0x3 +#define MEM_F_IMMUTABLE 0x4 /* mem_range cannot be unregistered */ + +void init_mem(void); +int emulate_mem(struct vmctx *, int vcpu, uint64_t paddr, struct vie *vie, + struct vm_guest_paging *paging); + +int register_mem(struct mem_range *memp); +int register_mem_fallback(struct mem_range *memp); +int unregister_mem(struct mem_range *memp); + +#endif /* _MEM_H_ */ diff --git a/usr/src/cmd/bhyve/mptbl.c b/usr/src/cmd/bhyve/mptbl.c new file mode 100644 index 0000000000..9d03765c7a --- /dev/null +++ b/usr/src/cmd/bhyve/mptbl.c @@ -0,0 +1,377 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD: head/usr.sbin/bhyve/mptbl.c 266125 2014-05-15 14:16:55Z jhb $ + */ + +#include +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/mptbl.c 266125 2014-05-15 14:16:55Z jhb $"); + +#include +#include +#include + +#include +#include + +#include "acpi.h" +#include "bhyverun.h" +#include "mptbl.h" +#include "pci_emul.h" + +#define MPTABLE_BASE 0xE0000 + +/* floating pointer length + maximum length of configuration table */ +#define MPTABLE_MAX_LENGTH (65536 + 16) + +#define LAPIC_PADDR 0xFEE00000 +#define LAPIC_VERSION 16 + +#define IOAPIC_PADDR 0xFEC00000 +#define IOAPIC_VERSION 0x11 + +#define MP_SPECREV 4 +#define MPFP_SIG "_MP_" + +/* Configuration header defines */ +#define MPCH_SIG "PCMP" +#define MPCH_OEMID "BHyVe " +#define MPCH_OEMID_LEN 8 +#define MPCH_PRODID "Hypervisor " +#define MPCH_PRODID_LEN 12 + +/* Processor entry defines */ +#define MPEP_SIG_FAMILY 6 /* XXX bhyve should supply this */ +#define MPEP_SIG_MODEL 26 +#define MPEP_SIG_STEPPING 5 +#define MPEP_SIG \ + ((MPEP_SIG_FAMILY << 8) | \ + (MPEP_SIG_MODEL << 4) | \ + (MPEP_SIG_STEPPING)) + +#define MPEP_FEATURES (0xBFEBFBFF) /* XXX Intel i7 */ + +/* Number of local intr entries */ +#define MPEII_NUM_LOCAL_IRQ 2 + +/* Bus entry defines */ +#define MPE_NUM_BUSES 2 +#define MPE_BUSNAME_LEN 6 +#define MPE_BUSNAME_ISA "ISA " +#define MPE_BUSNAME_PCI "PCI " + +static void *oem_tbl_start; +static int oem_tbl_size; + +static uint8_t +mpt_compute_checksum(void *base, size_t len) +{ + uint8_t *bytes; + uint8_t sum; + + for(bytes = base, sum = 0; len > 0; len--) { + sum += *bytes++; + } + + return (256 - sum); +} + +static void +mpt_build_mpfp(mpfps_t mpfp, vm_paddr_t gpa) +{ + + memset(mpfp, 0, sizeof(*mpfp)); + memcpy(mpfp->signature, MPFP_SIG, 4); + mpfp->pap = gpa + sizeof(*mpfp); + mpfp->length = 1; + mpfp->spec_rev = MP_SPECREV; + mpfp->checksum = mpt_compute_checksum(mpfp, sizeof(*mpfp)); +} + +static void +mpt_build_mpch(mpcth_t mpch) +{ + + memset(mpch, 0, sizeof(*mpch)); + memcpy(mpch->signature, MPCH_SIG, 4); + mpch->spec_rev = MP_SPECREV; + memcpy(mpch->oem_id, MPCH_OEMID, MPCH_OEMID_LEN); + memcpy(mpch->product_id, MPCH_PRODID, MPCH_PRODID_LEN); + mpch->apic_address = LAPIC_PADDR; +} + +static void +mpt_build_proc_entries(proc_entry_ptr mpep, int ncpu) +{ + int i; + + for (i = 0; i < ncpu; i++) { + memset(mpep, 0, sizeof(*mpep)); + mpep->type = MPCT_ENTRY_PROCESSOR; + mpep->apic_id = i; // XXX + mpep->apic_version = LAPIC_VERSION; + mpep->cpu_flags = PROCENTRY_FLAG_EN; + if (i == 0) + mpep->cpu_flags |= PROCENTRY_FLAG_BP; + mpep->cpu_signature = MPEP_SIG; + mpep->feature_flags = MPEP_FEATURES; + mpep++; + } +} + +static void +mpt_build_localint_entries(int_entry_ptr mpie) +{ + + /* Hardcode LINT0 as ExtINT on all CPUs. */ + memset(mpie, 0, sizeof(*mpie)); + mpie->type = MPCT_ENTRY_LOCAL_INT; + mpie->int_type = INTENTRY_TYPE_EXTINT; + mpie->int_flags = INTENTRY_FLAGS_POLARITY_CONFORM | + INTENTRY_FLAGS_TRIGGER_CONFORM; + mpie->dst_apic_id = 0xff; + mpie->dst_apic_int = 0; + mpie++; + + /* Hardcode LINT1 as NMI on all CPUs. 
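+	 * In the MP specification's default configurations LINT1 carries
+	 * NMI; a dst_apic_id of 0xff applies the entry to all local APICs.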
 */
+	memset(mpie, 0, sizeof(*mpie));
+	mpie->type = MPCT_ENTRY_LOCAL_INT;
+	mpie->int_type = INTENTRY_TYPE_NMI;
+	mpie->int_flags = INTENTRY_FLAGS_POLARITY_CONFORM |
+	    INTENTRY_FLAGS_TRIGGER_CONFORM;
+	mpie->dst_apic_id = 0xff;
+	mpie->dst_apic_int = 1;
+}
+
+static void
+mpt_build_bus_entries(bus_entry_ptr mpeb)
+{
+
+	memset(mpeb, 0, sizeof(*mpeb));
+	mpeb->type = MPCT_ENTRY_BUS;
+	mpeb->bus_id = 0;
+	memcpy(mpeb->bus_type, MPE_BUSNAME_PCI, MPE_BUSNAME_LEN);
+	mpeb++;
+
+	memset(mpeb, 0, sizeof(*mpeb));
+	mpeb->type = MPCT_ENTRY_BUS;
+	mpeb->bus_id = 1;
+	memcpy(mpeb->bus_type, MPE_BUSNAME_ISA, MPE_BUSNAME_LEN);
+}
+
+static void
+mpt_build_ioapic_entries(io_apic_entry_ptr mpei, int id)
+{
+
+	memset(mpei, 0, sizeof(*mpei));
+	mpei->type = MPCT_ENTRY_IOAPIC;
+	mpei->apic_id = id;
+	mpei->apic_version = IOAPIC_VERSION;
+	mpei->apic_flags = IOAPICENTRY_FLAG_EN;
+	mpei->apic_address = IOAPIC_PADDR;
+}
+
+static int
+mpt_count_ioint_entries(void)
+{
+	int bus, count;
+
+	count = 0;
+	for (bus = 0; bus <= PCI_BUSMAX; bus++)
+		count += pci_count_lintr(bus);
+
+	/*
+	 * Always include entries for the first 16 pins along with an entry
+	 * for each active PCI INTx pin.
+	 */
+	return (16 + count);
+}
+
+static void
+mpt_generate_pci_int(int bus, int slot, int pin, int pirq_pin, int ioapic_irq,
+    void *arg)
+{
+	int_entry_ptr *mpiep, mpie;
+
+	mpiep = arg;
+	mpie = *mpiep;
+	memset(mpie, 0, sizeof(*mpie));
+
+	/*
+	 * This is always after another I/O interrupt entry, so cheat
+	 * and fetch the I/O APIC ID from the prior entry.
+	 */
+	mpie->type = MPCT_ENTRY_INT;
+	mpie->int_type = INTENTRY_TYPE_INT;
+	mpie->src_bus_id = bus;
+	mpie->src_bus_irq = slot << 2 | (pin - 1);
+	mpie->dst_apic_id = mpie[-1].dst_apic_id;
+	mpie->dst_apic_int = ioapic_irq;
+
+	*mpiep = mpie + 1;
+}
+
+static void
+mpt_build_ioint_entries(int_entry_ptr mpie, int id)
+{
+	int pin, bus;
+
+	/*
+	 * The following config is taken from the kernel's mptable.c,
+	 * mptable_parse_default_config_ints(...); for now just use the
+	 * default config, and tweak it later if needed.
+	 */
+
+	/* First, generate the first 16 pins. */
+	for (pin = 0; pin < 16; pin++) {
+		memset(mpie, 0, sizeof(*mpie));
+		mpie->type = MPCT_ENTRY_INT;
+		mpie->src_bus_id = 1;
+		mpie->dst_apic_id = id;
+
+		/*
+		 * All default configs route IRQs from bus 0 to the first 16
+		 * pins of the first I/O APIC with an APIC ID of 2.
+		 */
+		mpie->dst_apic_int = pin;
+		switch (pin) {
+		case 0:
+			/* Pin 0 is an ExtINT pin. */
+			mpie->int_type = INTENTRY_TYPE_EXTINT;
+			break;
+		case 2:
+			/* IRQ 0 is routed to pin 2. */
+			mpie->int_type = INTENTRY_TYPE_INT;
+			mpie->src_bus_irq = 0;
+			break;
+		case SCI_INT:
+			/* ACPI SCI is level triggered and active-lo. */
+			mpie->int_flags = INTENTRY_FLAGS_POLARITY_ACTIVELO |
+			    INTENTRY_FLAGS_TRIGGER_LEVEL;
+			mpie->int_type = INTENTRY_TYPE_INT;
+			mpie->src_bus_irq = SCI_INT;
+			break;
+		default:
+			/* All other pins are identity mapped. */
+			mpie->int_type = INTENTRY_TYPE_INT;
+			mpie->src_bus_irq = pin;
+			break;
+		}
+		mpie++;
+	}
+
+	/* Next, generate entries for any PCI INTx interrupts.
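+	 * pci_walk_lintr() calls mpt_generate_pci_int() once per routed
+	 * INTx pin; the src_bus_irq encoding used there,
+	 * slot << 2 | (pin - 1), is the MP specification's bus IRQ format
+	 * for PCI busses.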
*/ + for (bus = 0; bus <= PCI_BUSMAX; bus++) + pci_walk_lintr(bus, mpt_generate_pci_int, &mpie); +} + +void +mptable_add_oemtbl(void *tbl, int tblsz) +{ + + oem_tbl_start = tbl; + oem_tbl_size = tblsz; +} + +int +mptable_build(struct vmctx *ctx, int ncpu) +{ + mpcth_t mpch; + bus_entry_ptr mpeb; + io_apic_entry_ptr mpei; + proc_entry_ptr mpep; + mpfps_t mpfp; + int_entry_ptr mpie; + int ioints, bus; + char *curraddr; + char *startaddr; + + startaddr = paddr_guest2host(ctx, MPTABLE_BASE, MPTABLE_MAX_LENGTH); + if (startaddr == NULL) { + fprintf(stderr, "mptable requires mapped mem\n"); + return (ENOMEM); + } + + /* + * There is no way to advertise multiple PCI hierarchies via MPtable + * so require that there is no PCI hierarchy with a non-zero bus + * number. + */ + for (bus = 1; bus <= PCI_BUSMAX; bus++) { + if (pci_bus_configured(bus)) { + fprintf(stderr, "MPtable is incompatible with " + "multiple PCI hierarchies.\r\n"); + fprintf(stderr, "MPtable generation can be disabled " + "by passing the -Y option to bhyve(8).\r\n"); + return (EINVAL); + } + } + + curraddr = startaddr; + mpfp = (mpfps_t)curraddr; + mpt_build_mpfp(mpfp, MPTABLE_BASE); + curraddr += sizeof(*mpfp); + + mpch = (mpcth_t)curraddr; + mpt_build_mpch(mpch); + curraddr += sizeof(*mpch); + + mpep = (proc_entry_ptr)curraddr; + mpt_build_proc_entries(mpep, ncpu); + curraddr += sizeof(*mpep) * ncpu; + mpch->entry_count += ncpu; + + mpeb = (bus_entry_ptr) curraddr; + mpt_build_bus_entries(mpeb); + curraddr += sizeof(*mpeb) * MPE_NUM_BUSES; + mpch->entry_count += MPE_NUM_BUSES; + + mpei = (io_apic_entry_ptr)curraddr; + mpt_build_ioapic_entries(mpei, 0); + curraddr += sizeof(*mpei); + mpch->entry_count++; + + mpie = (int_entry_ptr) curraddr; + ioints = mpt_count_ioint_entries(); + mpt_build_ioint_entries(mpie, 0); + curraddr += sizeof(*mpie) * ioints; + mpch->entry_count += ioints; + + mpie = (int_entry_ptr)curraddr; + mpt_build_localint_entries(mpie); + curraddr += sizeof(*mpie) * MPEII_NUM_LOCAL_IRQ; + mpch->entry_count += MPEII_NUM_LOCAL_IRQ; + + if (oem_tbl_start) { + mpch->oem_table_pointer = curraddr - startaddr + MPTABLE_BASE; + mpch->oem_table_size = oem_tbl_size; + memcpy(curraddr, oem_tbl_start, oem_tbl_size); + } + + mpch->base_table_length = curraddr - (char *)mpch; + mpch->checksum = mpt_compute_checksum(mpch, mpch->base_table_length); + + return (0); +} diff --git a/usr/src/cmd/bhyve/mptbl.h b/usr/src/cmd/bhyve/mptbl.h new file mode 100644 index 0000000000..d78ea6da09 --- /dev/null +++ b/usr/src/cmd/bhyve/mptbl.h @@ -0,0 +1,35 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/mptbl.h 257423 2013-10-31 05:44:45Z neel $ + */ + +#ifndef _MPTBL_H_ +#define _MPTBL_H_ + +int mptable_build(struct vmctx *ctx, int ncpu); +void mptable_add_oemtbl(void *tbl, int tblsz); + +#endif /* _MPTBL_H_ */ diff --git a/usr/src/cmd/bhyve/pci_ahci.c b/usr/src/cmd/bhyve/pci_ahci.c new file mode 100644 index 0000000000..b68c977c1f --- /dev/null +++ b/usr/src/cmd/bhyve/pci_ahci.c @@ -0,0 +1,2009 @@ +/*- + * Copyright (c) 2013 Zhixiang Yu + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD: head/usr.sbin/bhyve/pci_ahci.c 274045 2014-11-03 12:55:31Z tychon $ + */ + +#include +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_ahci.c 274045 2014-11-03 12:55:31Z tychon $"); + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bhyverun.h" +#include "pci_emul.h" +#include "ahci.h" +#include "block_if.h" + +#define MAX_PORTS 6 /* Intel ICH8 AHCI supports 6 ports */ + +#define PxSIG_ATA 0x00000101 /* ATA drive */ +#define PxSIG_ATAPI 0xeb140101 /* ATAPI drive */ + +enum sata_fis_type { + FIS_TYPE_REGH2D = 0x27, /* Register FIS - host to device */ + FIS_TYPE_REGD2H = 0x34, /* Register FIS - device to host */ + FIS_TYPE_DMAACT = 0x39, /* DMA activate FIS - device to host */ + FIS_TYPE_DMASETUP = 0x41, /* DMA setup FIS - bidirectional */ + FIS_TYPE_DATA = 0x46, /* Data FIS - bidirectional */ + FIS_TYPE_BIST = 0x58, /* BIST activate FIS - bidirectional */ + FIS_TYPE_PIOSETUP = 0x5F, /* PIO setup FIS - device to host */ + FIS_TYPE_SETDEVBITS = 0xA1, /* Set dev bits FIS - device to host */ +}; + +/* + * SCSI opcodes + */ +#define TEST_UNIT_READY 0x00 +#define REQUEST_SENSE 0x03 +#define INQUIRY 0x12 +#define START_STOP_UNIT 0x1B +#define PREVENT_ALLOW 0x1E +#define READ_CAPACITY 0x25 +#define READ_10 0x28 +#define POSITION_TO_ELEMENT 0x2B +#define READ_TOC 0x43 +#define GET_EVENT_STATUS_NOTIFICATION 0x4A +#define MODE_SENSE_10 0x5A +#define READ_12 0xA8 +#define READ_CD 0xBE + +/* + * SCSI mode page codes + */ +#define MODEPAGE_RW_ERROR_RECOVERY 0x01 +#define MODEPAGE_CD_CAPABILITIES 0x2A + +/* + * ATA commands + */ +#define ATA_SF_ENAB_SATA_SF 0x10 +#define ATA_SATA_SF_AN 0x05 +#define ATA_SF_DIS_SATA_SF 0x90 + +/* + * Debug printf + */ +#ifdef AHCI_DEBUG +static FILE *dbg; +#define DPRINTF(format, arg...) do{fprintf(dbg, format, ##arg);fflush(dbg);}while(0) +#else +#define DPRINTF(format, arg...) +#endif +#define WPRINTF(format, arg...) 
printf(format, ##arg) + +struct ahci_ioreq { + struct blockif_req io_req; + struct ahci_port *io_pr; + STAILQ_ENTRY(ahci_ioreq) io_flist; + TAILQ_ENTRY(ahci_ioreq) io_blist; + uint8_t *cfis; + uint32_t len; + uint32_t done; + int slot; + int prdtl; +}; + +struct ahci_port { + struct blockif_ctxt *bctx; + struct pci_ahci_softc *pr_sc; + uint8_t *cmd_lst; + uint8_t *rfis; + int atapi; + int reset; + int mult_sectors; + uint8_t xfermode; + uint8_t sense_key; + uint8_t asc; + uint32_t pending; + + uint32_t clb; + uint32_t clbu; + uint32_t fb; + uint32_t fbu; + uint32_t is; + uint32_t ie; + uint32_t cmd; + uint32_t unused0; + uint32_t tfd; + uint32_t sig; + uint32_t ssts; + uint32_t sctl; + uint32_t serr; + uint32_t sact; + uint32_t ci; + uint32_t sntf; + uint32_t fbs; + + /* + * i/o request info + */ + struct ahci_ioreq *ioreq; + int ioqsz; + STAILQ_HEAD(ahci_fhead, ahci_ioreq) iofhd; + TAILQ_HEAD(ahci_bhead, ahci_ioreq) iobhd; +}; + +struct ahci_cmd_hdr { + uint16_t flags; + uint16_t prdtl; + uint32_t prdbc; + uint64_t ctba; + uint32_t reserved[4]; +}; + +struct ahci_prdt_entry { + uint64_t dba; + uint32_t reserved; +#define DBCMASK 0x3fffff + uint32_t dbc; +}; + +struct pci_ahci_softc { + struct pci_devinst *asc_pi; + pthread_mutex_t mtx; + int ports; + uint32_t cap; + uint32_t ghc; + uint32_t is; + uint32_t pi; + uint32_t vs; + uint32_t ccc_ctl; + uint32_t ccc_pts; + uint32_t em_loc; + uint32_t em_ctl; + uint32_t cap2; + uint32_t bohc; + uint32_t lintr; + struct ahci_port port[MAX_PORTS]; +}; +#define ahci_ctx(sc) ((sc)->asc_pi->pi_vmctx) + +static inline void lba_to_msf(uint8_t *buf, int lba) +{ + lba += 150; + buf[0] = (lba / 75) / 60; + buf[1] = (lba / 75) % 60; + buf[2] = lba % 75; +} + +/* + * generate HBA intr depending on whether or not ports within + * the controller have an interrupt pending. 
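+ * When MSI is enabled an interrupt is generated for every new edge of
+ * the global interrupt status; with legacy INTx the pin is asserted
+ * once and stays asserted until the guest clears the outstanding
+ * status bits, at which point it is deasserted.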
+ */ +static void +ahci_generate_intr(struct pci_ahci_softc *sc) +{ + struct pci_devinst *pi; + int i; + + pi = sc->asc_pi; + + for (i = 0; i < sc->ports; i++) { + struct ahci_port *pr; + pr = &sc->port[i]; + if (pr->is & pr->ie) + sc->is |= (1 << i); + } + + DPRINTF("%s %x\n", __func__, sc->is); + + if (sc->is && (sc->ghc & AHCI_GHC_IE)) { + if (pci_msi_enabled(pi)) { + /* + * Generate an MSI interrupt on every edge + */ + pci_generate_msi(pi, 0); + } else if (!sc->lintr) { + /* + * Only generate a pin-based interrupt if one wasn't + * in progress + */ + sc->lintr = 1; + pci_lintr_assert(pi); + } + } else if (sc->lintr) { + /* + * No interrupts: deassert pin-based signal if it had + * been asserted + */ + pci_lintr_deassert(pi); + sc->lintr = 0; + } +} + +static void +ahci_write_fis(struct ahci_port *p, enum sata_fis_type ft, uint8_t *fis) +{ + int offset, len, irq; + + if (p->rfis == NULL || !(p->cmd & AHCI_P_CMD_FRE)) + return; + + switch (ft) { + case FIS_TYPE_REGD2H: + offset = 0x40; + len = 20; + irq = AHCI_P_IX_DHR; + break; + case FIS_TYPE_SETDEVBITS: + offset = 0x58; + len = 8; + irq = AHCI_P_IX_SDB; + break; + case FIS_TYPE_PIOSETUP: + offset = 0x20; + len = 20; + irq = 0; + break; + default: + WPRINTF("unsupported fis type %d\n", ft); + return; + } + memcpy(p->rfis + offset, fis, len); + if (irq) { + p->is |= irq; + ahci_generate_intr(p->pr_sc); + } +} + +static void +ahci_write_fis_piosetup(struct ahci_port *p) +{ + uint8_t fis[20]; + + memset(fis, 0, sizeof(fis)); + fis[0] = FIS_TYPE_PIOSETUP; + ahci_write_fis(p, FIS_TYPE_PIOSETUP, fis); +} + +static void +ahci_write_fis_sdb(struct ahci_port *p, int slot, uint32_t tfd) +{ + uint8_t fis[8]; + uint8_t error; + + error = (tfd >> 8) & 0xff; + memset(fis, 0, sizeof(fis)); + fis[0] = error; + fis[2] = tfd & 0x77; + *(uint32_t *)(fis + 4) = (1 << slot); + if (fis[2] & ATA_S_ERROR) + p->is |= AHCI_P_IX_TFE; + p->tfd = tfd; + ahci_write_fis(p, FIS_TYPE_SETDEVBITS, fis); +} + +static void +ahci_write_fis_d2h(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd) +{ + uint8_t fis[20]; + uint8_t error; + + error = (tfd >> 8) & 0xff; + memset(fis, 0, sizeof(fis)); + fis[0] = FIS_TYPE_REGD2H; + fis[1] = (1 << 6); + fis[2] = tfd & 0xff; + fis[3] = error; + fis[4] = cfis[4]; + fis[5] = cfis[5]; + fis[6] = cfis[6]; + fis[7] = cfis[7]; + fis[8] = cfis[8]; + fis[9] = cfis[9]; + fis[10] = cfis[10]; + fis[11] = cfis[11]; + fis[12] = cfis[12]; + fis[13] = cfis[13]; + if (fis[2] & ATA_S_ERROR) + p->is |= AHCI_P_IX_TFE; + else + p->ci &= ~(1 << slot); + p->tfd = tfd; + ahci_write_fis(p, FIS_TYPE_REGD2H, fis); +} + +static void +ahci_write_reset_fis_d2h(struct ahci_port *p) +{ + uint8_t fis[20]; + + memset(fis, 0, sizeof(fis)); + fis[0] = FIS_TYPE_REGD2H; + fis[3] = 1; + fis[4] = 1; + if (p->atapi) { + fis[5] = 0x14; + fis[6] = 0xeb; + } + fis[12] = 1; + ahci_write_fis(p, FIS_TYPE_REGD2H, fis); +} + +static void +ahci_check_stopped(struct ahci_port *p) +{ + /* + * If we are no longer processing the command list and nothing + * is in-flight, clear the running bit, the current command + * slot, the command issue and active bits. 
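+	 * This implements the PxCMD.ST/PxCMD.CR handshake from the AHCI
+	 * spec: after software clears PxCMD.ST, the controller is expected
+	 * to drain its outstanding commands and then clear PxCMD.CR.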
+ */ + if (!(p->cmd & AHCI_P_CMD_ST)) { + if (p->pending == 0) { + p->cmd &= ~(AHCI_P_CMD_CR | AHCI_P_CMD_CCS_MASK); + p->ci = 0; + p->sact = 0; + } + } +} + +static void +ahci_port_stop(struct ahci_port *p) +{ + struct ahci_ioreq *aior; + uint8_t *cfis; + int slot; + int ncq; + int error; + + assert(pthread_mutex_isowned_np(&p->pr_sc->mtx)); + + TAILQ_FOREACH(aior, &p->iobhd, io_blist) { + /* + * Try to cancel the outstanding blockif request. + */ + error = blockif_cancel(p->bctx, &aior->io_req); + if (error != 0) + continue; + + slot = aior->slot; + cfis = aior->cfis; + if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || + cfis[2] == ATA_READ_FPDMA_QUEUED) + ncq = 1; + + if (ncq) + p->sact &= ~(1 << slot); + else + p->ci &= ~(1 << slot); + + /* + * This command is now done. + */ + p->pending &= ~(1 << slot); + + /* + * Delete the blockif request from the busy list + */ + TAILQ_REMOVE(&p->iobhd, aior, io_blist); + + /* + * Move the blockif request back to the free list + */ + STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); + } + + ahci_check_stopped(p); +} + +static void +ahci_port_reset(struct ahci_port *pr) +{ + pr->sctl = 0; + pr->serr = 0; + pr->sact = 0; + pr->xfermode = ATA_UDMA6; + pr->mult_sectors = 128; + + if (!pr->bctx) { + pr->ssts = ATA_SS_DET_NO_DEVICE; + pr->sig = 0xFFFFFFFF; + pr->tfd = 0x7F; + return; + } + pr->ssts = ATA_SS_DET_PHY_ONLINE | ATA_SS_SPD_GEN2 | + ATA_SS_IPM_ACTIVE; + pr->tfd = (1 << 8) | ATA_S_DSC | ATA_S_DMA; + if (!pr->atapi) { + pr->sig = PxSIG_ATA; + pr->tfd |= ATA_S_READY; + } else + pr->sig = PxSIG_ATAPI; + ahci_write_reset_fis_d2h(pr); +} + +static void +ahci_reset(struct pci_ahci_softc *sc) +{ + int i; + + sc->ghc = AHCI_GHC_AE; + sc->is = 0; + + if (sc->lintr) { + pci_lintr_deassert(sc->asc_pi); + sc->lintr = 0; + } + + for (i = 0; i < sc->ports; i++) { + sc->port[i].ie = 0; + sc->port[i].is = 0; + ahci_port_reset(&sc->port[i]); + } +} + +static void +ata_string(uint8_t *dest, const char *src, int len) +{ + int i; + + for (i = 0; i < len; i++) { + if (*src) + dest[i ^ 1] = *src++; + else + dest[i ^ 1] = ' '; + } +} + +static void +atapi_string(uint8_t *dest, const char *src, int len) +{ + int i; + + for (i = 0; i < len; i++) { + if (*src) + dest[i] = *src++; + else + dest[i] = ' '; + } +} + +static void +ahci_handle_dma(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done, + int seek) +{ + struct ahci_ioreq *aior; + struct blockif_req *breq; + struct pci_ahci_softc *sc; + struct ahci_prdt_entry *prdt; + struct ahci_cmd_hdr *hdr; + uint64_t lba; + uint32_t len; + int i, err, iovcnt, ncq, readop; + + sc = p->pr_sc; + prdt = (struct ahci_prdt_entry *)(cfis + 0x80); + hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); + ncq = 0; + readop = 1; + + prdt += seek; + if (cfis[2] == ATA_WRITE_DMA || cfis[2] == ATA_WRITE_DMA48 || + cfis[2] == ATA_WRITE_FPDMA_QUEUED) + readop = 0; + + if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || + cfis[2] == ATA_READ_FPDMA_QUEUED) { + lba = ((uint64_t)cfis[10] << 40) | + ((uint64_t)cfis[9] << 32) | + ((uint64_t)cfis[8] << 24) | + ((uint64_t)cfis[6] << 16) | + ((uint64_t)cfis[5] << 8) | + cfis[4]; + len = cfis[11] << 8 | cfis[3]; + if (!len) + len = 65536; + ncq = 1; + } else if (cfis[2] == ATA_READ_DMA48 || cfis[2] == ATA_WRITE_DMA48) { + lba = ((uint64_t)cfis[10] << 40) | + ((uint64_t)cfis[9] << 32) | + ((uint64_t)cfis[8] << 24) | + ((uint64_t)cfis[6] << 16) | + ((uint64_t)cfis[5] << 8) | + cfis[4]; + len = cfis[13] << 8 | cfis[12]; + if (!len) + len = 65536; + } else { + lba = ((cfis[7] & 0xf) << 24) | (cfis[6] << 16) | 
+ (cfis[5] << 8) | cfis[4]; + len = cfis[12]; + if (!len) + len = 256; + } + lba *= blockif_sectsz(p->bctx); + len *= blockif_sectsz(p->bctx); + + /* + * Pull request off free list + */ + aior = STAILQ_FIRST(&p->iofhd); + assert(aior != NULL); + STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); + aior->cfis = cfis; + aior->slot = slot; + aior->len = len; + aior->done = done; + breq = &aior->io_req; + breq->br_offset = lba + done; + iovcnt = hdr->prdtl - seek; + if (iovcnt > BLOCKIF_IOV_MAX) { + aior->prdtl = iovcnt - BLOCKIF_IOV_MAX; + iovcnt = BLOCKIF_IOV_MAX; + } else + aior->prdtl = 0; + breq->br_iovcnt = iovcnt; + + /* + * Mark this command in-flight. + */ + p->pending |= 1 << slot; + + /* + * Stuff request onto busy list + */ + TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); + + /* + * Build up the iovec based on the prdt + */ + for (i = 0; i < iovcnt; i++) { + uint32_t dbcsz; + + dbcsz = (prdt->dbc & DBCMASK) + 1; + breq->br_iov[i].iov_base = paddr_guest2host(ahci_ctx(sc), + prdt->dba, dbcsz); + breq->br_iov[i].iov_len = dbcsz; + aior->done += dbcsz; + prdt++; + } + if (readop) + err = blockif_read(p->bctx, breq); + else + err = blockif_write(p->bctx, breq); + assert(err == 0); + + if (ncq) + p->ci &= ~(1 << slot); +} + +static void +ahci_handle_flush(struct ahci_port *p, int slot, uint8_t *cfis) +{ + struct ahci_ioreq *aior; + struct blockif_req *breq; + int err; + + /* + * Pull request off free list + */ + aior = STAILQ_FIRST(&p->iofhd); + assert(aior != NULL); + STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); + aior->cfis = cfis; + aior->slot = slot; + aior->len = 0; + aior->done = 0; + aior->prdtl = 0; + breq = &aior->io_req; + + /* + * Mark this command in-flight. + */ + p->pending |= 1 << slot; + + /* + * Stuff request onto busy list + */ + TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); + + err = blockif_flush(p->bctx, breq); + assert(err == 0); +} + +static inline void +write_prdt(struct ahci_port *p, int slot, uint8_t *cfis, + void *buf, int size) +{ + struct ahci_cmd_hdr *hdr; + struct ahci_prdt_entry *prdt; + void *from; + int i, len; + + hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); + len = size; + from = buf; + prdt = (struct ahci_prdt_entry *)(cfis + 0x80); + for (i = 0; i < hdr->prdtl && len; i++) { + uint8_t *ptr; + uint32_t dbcsz; + int sublen; + + dbcsz = (prdt->dbc & DBCMASK) + 1; + ptr = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba, dbcsz); + sublen = len < dbcsz ? len : dbcsz; + memcpy(ptr, from, sublen); + len -= sublen; + from += sublen; + prdt++; + } + hdr->prdbc = size - len; +} + +static void +handle_identify(struct ahci_port *p, int slot, uint8_t *cfis) +{ + struct ahci_cmd_hdr *hdr; + + hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); + if (p->atapi || hdr->prdtl == 0) { + p->tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR; + p->is |= AHCI_P_IX_TFE; + } else { + uint16_t buf[256]; + uint64_t sectors; + uint16_t cyl; + uint8_t sech, heads; + + sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx); + blockif_chs(p->bctx, &cyl, &heads, &sech); + memset(buf, 0, sizeof(buf)); + buf[0] = 0x0040; + buf[1] = cyl; + buf[3] = heads; + buf[6] = sech; + /* TODO emulate different serial? 
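+		 * The strings below populate IDENTIFY words 10-19 (serial
+		 * number), 23-26 (firmware revision) and 27-46 (model
+		 * number); ata_string() swaps each byte pair as the ATA
+		 * string format requires.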
*/ + ata_string((uint8_t *)(buf+10), "123456", 20); + ata_string((uint8_t *)(buf+23), "001", 8); + ata_string((uint8_t *)(buf+27), "BHYVE SATA DISK", 40); + buf[47] = (0x8000 | 128); + buf[48] = 0x1; + buf[49] = (1 << 8 | 1 << 9 | 1 << 11); + buf[50] = (1 << 14); + buf[53] = (1 << 1 | 1 << 2); + if (p->mult_sectors) + buf[59] = (0x100 | p->mult_sectors); + buf[60] = sectors; + buf[61] = (sectors >> 16); + buf[63] = 0x7; + if (p->xfermode & ATA_WDMA0) + buf[63] |= (1 << ((p->xfermode & 7) + 8)); + buf[64] = 0x3; + buf[65] = 100; + buf[66] = 100; + buf[67] = 100; + buf[68] = 100; + buf[75] = 31; + buf[76] = (1 << 8 | 1 << 2); + buf[80] = 0x1f0; + buf[81] = 0x28; + buf[82] = (1 << 5 | 1 << 14); + buf[83] = (1 << 10 | 1 << 12 | 1 << 13 | 1 << 14); + buf[84] = (1 << 14); + buf[85] = (1 << 5 | 1 << 14); + buf[86] = (1 << 10 | 1 << 12 | 1 << 13); + buf[87] = (1 << 14); + buf[88] = 0x7f; + if (p->xfermode & ATA_UDMA0) + buf[88] |= (1 << ((p->xfermode & 7) + 8)); + buf[93] = (1 | 1 <<14); + buf[100] = sectors; + buf[101] = (sectors >> 16); + buf[102] = (sectors >> 32); + buf[103] = (sectors >> 48); + ahci_write_fis_piosetup(p); + write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); + p->tfd = ATA_S_DSC | ATA_S_READY; + p->is |= AHCI_P_IX_DP; + p->ci &= ~(1 << slot); + } + ahci_generate_intr(p->pr_sc); +} + +static void +handle_atapi_identify(struct ahci_port *p, int slot, uint8_t *cfis) +{ + if (!p->atapi) { + p->tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR; + p->is |= AHCI_P_IX_TFE; + } else { + uint16_t buf[256]; + + memset(buf, 0, sizeof(buf)); + buf[0] = (2 << 14 | 5 << 8 | 1 << 7 | 2 << 5); + /* TODO emulate different serial? */ + ata_string((uint8_t *)(buf+10), "123456", 20); + ata_string((uint8_t *)(buf+23), "001", 8); + ata_string((uint8_t *)(buf+27), "BHYVE SATA DVD ROM", 40); + buf[49] = (1 << 9 | 1 << 8); + buf[50] = (1 << 14 | 1); + buf[53] = (1 << 2 | 1 << 1); + buf[62] = 0x3f; + buf[63] = 7; + buf[64] = 3; + buf[65] = 100; + buf[66] = 100; + buf[67] = 100; + buf[68] = 100; + buf[76] = (1 << 2 | 1 << 1); + buf[78] = (1 << 5); + buf[80] = (0x1f << 4); + buf[82] = (1 << 4); + buf[83] = (1 << 14); + buf[84] = (1 << 14); + buf[85] = (1 << 4); + buf[87] = (1 << 14); + buf[88] = (1 << 14 | 0x7f); + ahci_write_fis_piosetup(p); + write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); + p->tfd = ATA_S_DSC | ATA_S_READY; + p->is |= AHCI_P_IX_DHR; + p->ci &= ~(1 << slot); + } + ahci_generate_intr(p->pr_sc); +} + +static void +atapi_inquiry(struct ahci_port *p, int slot, uint8_t *cfis) +{ + uint8_t buf[36]; + uint8_t *acmd; + int len; + + acmd = cfis + 0x40; + + buf[0] = 0x05; + buf[1] = 0x80; + buf[2] = 0x00; + buf[3] = 0x21; + buf[4] = 31; + buf[5] = 0; + buf[6] = 0; + buf[7] = 0; + atapi_string(buf + 8, "BHYVE", 8); + atapi_string(buf + 16, "BHYVE DVD-ROM", 16); + atapi_string(buf + 32, "001", 4); + + len = sizeof(buf); + if (len > acmd[4]) + len = acmd[4]; + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + write_prdt(p, slot, cfis, buf, len); + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); +} + +static void +atapi_read_capacity(struct ahci_port *p, int slot, uint8_t *cfis) +{ + uint8_t buf[8]; + uint64_t sectors; + + sectors = blockif_size(p->bctx) / 2048; + be32enc(buf, sectors - 1); + be32enc(buf + 4, 2048); + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + write_prdt(p, slot, cfis, buf, sizeof(buf)); + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); +} + +static void +atapi_read_toc(struct ahci_port *p, int slot, uint8_t *cfis) +{ + uint8_t *acmd; + 
uint8_t format; + int len; + + acmd = cfis + 0x40; + + len = be16dec(acmd + 7); + format = acmd[9] >> 6; + switch (format) { + case 0: + { + int msf, size; + uint64_t sectors; + uint8_t start_track, buf[20], *bp; + + msf = (acmd[1] >> 1) & 1; + start_track = acmd[6]; + if (start_track > 1 && start_track != 0xaa) { + uint32_t tfd; + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x24; + tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, tfd); + return; + } + bp = buf + 2; + *bp++ = 1; + *bp++ = 1; + if (start_track <= 1) { + *bp++ = 0; + *bp++ = 0x14; + *bp++ = 1; + *bp++ = 0; + if (msf) { + *bp++ = 0; + lba_to_msf(bp, 0); + bp += 3; + } else { + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + } + } + *bp++ = 0; + *bp++ = 0x14; + *bp++ = 0xaa; + *bp++ = 0; + sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx); + sectors >>= 2; + if (msf) { + *bp++ = 0; + lba_to_msf(bp, sectors); + bp += 3; + } else { + be32enc(bp, sectors); + bp += 4; + } + size = bp - buf; + be16enc(buf, size - 2); + if (len > size) + len = size; + write_prdt(p, slot, cfis, buf, len); + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); + break; + } + case 1: + { + uint8_t buf[12]; + + memset(buf, 0, sizeof(buf)); + buf[1] = 0xa; + buf[2] = 0x1; + buf[3] = 0x1; + if (len > sizeof(buf)) + len = sizeof(buf); + write_prdt(p, slot, cfis, buf, len); + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); + break; + } + case 2: + { + int msf, size; + uint64_t sectors; + uint8_t start_track, *bp, buf[50]; + + msf = (acmd[1] >> 1) & 1; + start_track = acmd[6]; + bp = buf + 2; + *bp++ = 1; + *bp++ = 1; + + *bp++ = 1; + *bp++ = 0x14; + *bp++ = 0; + *bp++ = 0xa0; + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + *bp++ = 1; + *bp++ = 0; + *bp++ = 0; + + *bp++ = 1; + *bp++ = 0x14; + *bp++ = 0; + *bp++ = 0xa1; + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + *bp++ = 1; + *bp++ = 0; + *bp++ = 0; + + *bp++ = 1; + *bp++ = 0x14; + *bp++ = 0; + *bp++ = 0xa2; + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx); + sectors >>= 2; + if (msf) { + *bp++ = 0; + lba_to_msf(bp, sectors); + bp += 3; + } else { + be32enc(bp, sectors); + bp += 4; + } + + *bp++ = 1; + *bp++ = 0x14; + *bp++ = 0; + *bp++ = 1; + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + if (msf) { + *bp++ = 0; + lba_to_msf(bp, 0); + bp += 3; + } else { + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + } + + size = bp - buf; + be16enc(buf, size - 2); + if (len > size) + len = size; + write_prdt(p, slot, cfis, buf, len); + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); + break; + } + default: + { + uint32_t tfd; + + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x24; + tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, tfd); + break; + } + } +} + +static void +atapi_read(struct ahci_port *p, int slot, uint8_t *cfis, + uint32_t done, int seek) +{ + struct ahci_ioreq *aior; + struct ahci_cmd_hdr *hdr; + struct ahci_prdt_entry *prdt; + struct blockif_req *breq; + struct pci_ahci_softc *sc; + uint8_t *acmd; + uint64_t lba; + uint32_t len; + int i, err, iovcnt; + + sc = p->pr_sc; + acmd = cfis + 0x40; + hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * 
AHCI_CL_SIZE);
+	prdt = (struct ahci_prdt_entry *)(cfis + 0x80);
+
+	prdt += seek;
+	lba = be32dec(acmd + 2);
+	if (acmd[0] == READ_10)
+		len = be16dec(acmd + 7);
+	else
+		len = be32dec(acmd + 6);
+	if (len == 0) {
+		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
+		ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
+		/* Zero-length transfer: report completion and stop here. */
+		return;
+	}
+	lba *= 2048;
+	len *= 2048;
+
+	/*
+	 * Pull request off free list
+	 */
+	aior = STAILQ_FIRST(&p->iofhd);
+	assert(aior != NULL);
+	STAILQ_REMOVE_HEAD(&p->iofhd, io_flist);
+	aior->cfis = cfis;
+	aior->slot = slot;
+	aior->len = len;
+	aior->done = done;
+	breq = &aior->io_req;
+	breq->br_offset = lba + done;
+	iovcnt = hdr->prdtl - seek;
+	if (iovcnt > BLOCKIF_IOV_MAX) {
+		aior->prdtl = iovcnt - BLOCKIF_IOV_MAX;
+		iovcnt = BLOCKIF_IOV_MAX;
+	} else
+		aior->prdtl = 0;
+	breq->br_iovcnt = iovcnt;
+
+	/*
+	 * Mark this command in-flight.
+	 */
+	p->pending |= 1 << slot;
+
+	/*
+	 * Stuff request onto busy list
+	 */
+	TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist);
+
+	/*
+	 * Build up the iovec based on the prdt
+	 */
+	for (i = 0; i < iovcnt; i++) {
+		uint32_t dbcsz;
+
+		dbcsz = (prdt->dbc & DBCMASK) + 1;
+		breq->br_iov[i].iov_base = paddr_guest2host(ahci_ctx(sc),
+		    prdt->dba, dbcsz);
+		breq->br_iov[i].iov_len = dbcsz;
+		aior->done += dbcsz;
+		prdt++;
+	}
+	err = blockif_read(p->bctx, breq);
+	assert(err == 0);
+}
+
+static void
+atapi_request_sense(struct ahci_port *p, int slot, uint8_t *cfis)
+{
+	uint8_t buf[64];
+	uint8_t *acmd;
+	int len;
+
+	acmd = cfis + 0x40;
+	len = acmd[4];
+	if (len > sizeof(buf))
+		len = sizeof(buf);
+	memset(buf, 0, len);
+	buf[0] = 0x70 | (1 << 7);
+	buf[2] = p->sense_key;
+	buf[7] = 10;
+	buf[12] = p->asc;
+	write_prdt(p, slot, cfis, buf, len);
+	cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
+	ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
+}
+
+static void
+atapi_start_stop_unit(struct ahci_port *p, int slot, uint8_t *cfis)
+{
+	uint8_t *acmd = cfis + 0x40;
+	uint32_t tfd;
+
+	switch (acmd[4] & 3) {
+	case 0:
+	case 1:
+	case 3:
+		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
+		tfd = ATA_S_READY | ATA_S_DSC;
+		break;
+	case 2:
+		/* TODO eject media */
+		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
+		p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
+		p->asc = 0x53;
+		tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
+		break;
+	}
+	ahci_write_fis_d2h(p, slot, cfis, tfd);
+}
+
+static void
+atapi_mode_sense(struct ahci_port *p, int slot, uint8_t *cfis)
+{
+	uint8_t *acmd;
+	uint32_t tfd;
+	uint8_t pc, code;
+	int len;
+
+	acmd = cfis + 0x40;
+	len = be16dec(acmd + 7);
+	pc = acmd[2] >> 6;
+	code = acmd[2] & 0x3f;
+
+	switch (pc) {
+	case 0:
+		switch (code) {
+		case MODEPAGE_RW_ERROR_RECOVERY:
+		{
+			uint8_t buf[16];
+
+			if (len > sizeof(buf))
+				len = sizeof(buf);
+
+			memset(buf, 0, sizeof(buf));
+			be16enc(buf, 16 - 2);
+			buf[2] = 0x70;
+			buf[8] = 0x01;
+			buf[9] = 16 - 10;
+			buf[11] = 0x05;
+			write_prdt(p, slot, cfis, buf, len);
+			tfd = ATA_S_READY | ATA_S_DSC;
+			break;
+		}
+		case MODEPAGE_CD_CAPABILITIES:
+		{
+			uint8_t buf[30];
+
+			if (len > sizeof(buf))
+				len = sizeof(buf);
+
+			memset(buf, 0, sizeof(buf));
+			be16enc(buf, 30 - 2);
+			buf[2] = 0x70;
+			buf[8] = 0x2A;
+			buf[9] = 30 - 10;
+			buf[10] = 0x08;
+			buf[12] = 0x71;
+			be16enc(&buf[18], 2);
+			be16enc(&buf[20], 512);
+			write_prdt(p, slot, cfis, buf, len);
+			tfd = ATA_S_READY | ATA_S_DSC;
+			break;
+		}
+		default:
+			goto error;
+			break;
+		}
+		break;
+	case 3:
+		p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
+		p->asc = 0x39;
+		tfd = (p->sense_key << 12) | ATA_S_READY |
ATA_S_ERROR; + break; +error: + case 1: + case 2: + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x24; + tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; + break; + } + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, tfd); +} + +static void +atapi_get_event_status_notification(struct ahci_port *p, int slot, + uint8_t *cfis) +{ + uint8_t *acmd; + uint32_t tfd; + + acmd = cfis + 0x40; + + /* we don't support asynchronous operation */ + if (!(acmd[1] & 1)) { + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x24; + tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; + } else { + uint8_t buf[8]; + int len; + + len = be16dec(acmd + 7); + if (len > sizeof(buf)) + len = sizeof(buf); + + memset(buf, 0, sizeof(buf)); + be16enc(buf, 8 - 2); + buf[2] = 0x04; + buf[3] = 0x10; + buf[5] = 0x02; + write_prdt(p, slot, cfis, buf, len); + tfd = ATA_S_READY | ATA_S_DSC; + } + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, tfd); +} + +static void +handle_packet_cmd(struct ahci_port *p, int slot, uint8_t *cfis) +{ + uint8_t *acmd; + + acmd = cfis + 0x40; + +#ifdef AHCI_DEBUG + { + int i; + DPRINTF("ACMD:"); + for (i = 0; i < 16; i++) + DPRINTF("%02x ", acmd[i]); + DPRINTF("\n"); + } +#endif + + switch (acmd[0]) { + case TEST_UNIT_READY: + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); + break; + case INQUIRY: + atapi_inquiry(p, slot, cfis); + break; + case READ_CAPACITY: + atapi_read_capacity(p, slot, cfis); + break; + case PREVENT_ALLOW: + /* TODO */ + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); + break; + case READ_TOC: + atapi_read_toc(p, slot, cfis); + break; + case READ_10: + case READ_12: + atapi_read(p, slot, cfis, 0, 0); + break; + case REQUEST_SENSE: + atapi_request_sense(p, slot, cfis); + break; + case START_STOP_UNIT: + atapi_start_stop_unit(p, slot, cfis); + break; + case MODE_SENSE_10: + atapi_mode_sense(p, slot, cfis); + break; + case GET_EVENT_STATUS_NOTIFICATION: + atapi_get_event_status_notification(p, slot, cfis); + break; + default: + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x20; + ahci_write_fis_d2h(p, slot, cfis, (p->sense_key << 12) | + ATA_S_READY | ATA_S_ERROR); + break; + } +} + +static void +ahci_handle_cmd(struct ahci_port *p, int slot, uint8_t *cfis) +{ + + switch (cfis[2]) { + case ATA_ATA_IDENTIFY: + handle_identify(p, slot, cfis); + break; + case ATA_SETFEATURES: + { + switch (cfis[3]) { + case ATA_SF_ENAB_SATA_SF: + switch (cfis[12]) { + case ATA_SATA_SF_AN: + p->tfd = ATA_S_DSC | ATA_S_READY; + break; + default: + p->tfd = ATA_S_ERROR | ATA_S_READY; + p->tfd |= (ATA_ERROR_ABORT << 8); + break; + } + break; + case ATA_SF_ENAB_WCACHE: + case ATA_SF_DIS_WCACHE: + case ATA_SF_ENAB_RCACHE: + case ATA_SF_DIS_RCACHE: + p->tfd = ATA_S_DSC | ATA_S_READY; + break; + case ATA_SF_SETXFER: + { + switch (cfis[12] & 0xf8) { + case ATA_PIO: + case ATA_PIO0: + break; + case ATA_WDMA0: + case ATA_UDMA0: + p->xfermode = (cfis[12] & 0x7); + break; + } + p->tfd = ATA_S_DSC | ATA_S_READY; + break; + } + default: + p->tfd = ATA_S_ERROR | ATA_S_READY; + p->tfd |= (ATA_ERROR_ABORT << 8); + break; + } + ahci_write_fis_d2h(p, slot, cfis, p->tfd); + break; + } + case ATA_SET_MULTI: + if (cfis[12] != 0 && + (cfis[12] > 128 || (cfis[12] & (cfis[12] - 1)))) { + p->tfd = ATA_S_ERROR | ATA_S_READY; + p->tfd |= 
(ATA_ERROR_ABORT << 8);
+		} else {
+			p->mult_sectors = cfis[12];
+			p->tfd = ATA_S_DSC | ATA_S_READY;
+		}
+		p->is |= AHCI_P_IX_DP;
+		p->ci &= ~(1 << slot);
+		ahci_generate_intr(p->pr_sc);
+		break;
+	case ATA_READ_DMA:
+	case ATA_WRITE_DMA:
+	case ATA_READ_DMA48:
+	case ATA_WRITE_DMA48:
+	case ATA_READ_FPDMA_QUEUED:
+	case ATA_WRITE_FPDMA_QUEUED:
+		ahci_handle_dma(p, slot, cfis, 0, 0);
+		break;
+	case ATA_FLUSHCACHE:
+	case ATA_FLUSHCACHE48:
+		ahci_handle_flush(p, slot, cfis);
+		break;
+	case ATA_STANDBY_CMD:
+		break;
+	case ATA_NOP:
+	case ATA_STANDBY_IMMEDIATE:
+	case ATA_IDLE_IMMEDIATE:
+	case ATA_SLEEP:
+		ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
+		break;
+	case ATA_ATAPI_IDENTIFY:
+		handle_atapi_identify(p, slot, cfis);
+		break;
+	case ATA_PACKET_CMD:
+		if (!p->atapi) {
+			p->tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR;
+			p->is |= AHCI_P_IX_TFE;
+			ahci_generate_intr(p->pr_sc);
+		} else
+			handle_packet_cmd(p, slot, cfis);
+		break;
+	default:
+		WPRINTF("Unsupported cmd:%02x\n", cfis[2]);
+		p->tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR;
+		p->is |= AHCI_P_IX_TFE;
+		ahci_generate_intr(p->pr_sc);
+		break;
+	}
+}
+
+static void
+ahci_handle_slot(struct ahci_port *p, int slot)
+{
+	struct ahci_cmd_hdr *hdr;
+	struct ahci_prdt_entry *prdt;
+	struct pci_ahci_softc *sc;
+	uint8_t *cfis;
+	int cfl;
+#ifdef AHCI_DEBUG
+	int i;
+#endif
+
+	sc = p->pr_sc;
+	hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);
+	cfl = (hdr->flags & 0x1f) * 4;
+	cfis = paddr_guest2host(ahci_ctx(sc), hdr->ctba,
+	    0x80 + hdr->prdtl * sizeof(struct ahci_prdt_entry));
+	prdt = (struct ahci_prdt_entry *)(cfis + 0x80);
+
+#ifdef AHCI_DEBUG
+	DPRINTF("\ncfis:");
+	for (i = 0; i < cfl; i++) {
+		if (i % 10 == 0)
+			DPRINTF("\n");
+		DPRINTF("%02x ", cfis[i]);
+	}
+	DPRINTF("\n");
+
+	for (i = 0; i < hdr->prdtl; i++) {
+		DPRINTF("%d@%08"PRIx64"\n", prdt->dbc & 0x3fffff, prdt->dba);
+		prdt++;
+	}
+#endif
+
+	if (cfis[0] != FIS_TYPE_REGH2D) {
+		WPRINTF("Not a H2D FIS:%02x\n", cfis[0]);
+		return;
+	}
+
+	if (cfis[1] & 0x80) {
+		ahci_handle_cmd(p, slot, cfis);
+	} else {
+		if (cfis[15] & (1 << 2))
+			p->reset = 1;
+		else if (p->reset) {
+			p->reset = 0;
+			ahci_port_reset(p);
+		}
+		p->ci &= ~(1 << slot);
+	}
+}
+
+static void
+ahci_handle_port(struct ahci_port *p)
+{
+	int i;
+
+	if (!(p->cmd & AHCI_P_CMD_ST))
+		return;
+
+	/*
+	 * Search for any new commands to issue ignoring those that
+	 * are already in-flight.
+	 */
+	for (i = 0; (i < 32) && p->ci; i++) {
+		if ((p->ci & (1 << i)) && !(p->pending & (1 << i))) {
+			p->cmd &= ~AHCI_P_CMD_CCS_MASK;
+			p->cmd |= i << AHCI_P_CMD_CCS_SHIFT;
+			ahci_handle_slot(p, i);
+		}
+	}
+}
+
+/*
+ * blockif callback routine - this runs in the context of the blockif
+ * i/o thread, so the mutex needs to be acquired.
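+ * The callback acquires sc->mtx itself, so request completion must
+ * never be waited on synchronously by code already holding that lock.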
+ */ +static void +ata_ioreq_cb(struct blockif_req *br, int err) +{ + struct ahci_cmd_hdr *hdr; + struct ahci_ioreq *aior; + struct ahci_port *p; + struct pci_ahci_softc *sc; + uint32_t tfd; + uint8_t *cfis; + int pending, slot, ncq; + + DPRINTF("%s %d\n", __func__, err); + + ncq = 0; + aior = br->br_param; + p = aior->io_pr; + cfis = aior->cfis; + slot = aior->slot; + pending = aior->prdtl; + sc = p->pr_sc; + hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); + + if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || + cfis[2] == ATA_READ_FPDMA_QUEUED) + ncq = 1; + + pthread_mutex_lock(&sc->mtx); + + /* + * Delete the blockif request from the busy list + */ + TAILQ_REMOVE(&p->iobhd, aior, io_blist); + + /* + * Move the blockif request back to the free list + */ + STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); + + if (pending && !err) { + ahci_handle_dma(p, slot, cfis, aior->done, + hdr->prdtl - pending); + goto out; + } + + if (!err && aior->done == aior->len) { + tfd = ATA_S_READY | ATA_S_DSC; + if (ncq) + hdr->prdbc = 0; + else + hdr->prdbc = aior->len; + } else { + tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR; + hdr->prdbc = 0; + if (ncq) + p->serr |= (1 << slot); + } + + if (ncq) { + p->sact &= ~(1 << slot); + ahci_write_fis_sdb(p, slot, tfd); + } else + ahci_write_fis_d2h(p, slot, cfis, tfd); + + /* + * This command is now complete. + */ + p->pending &= ~(1 << slot); + + ahci_check_stopped(p); +out: + pthread_mutex_unlock(&sc->mtx); + DPRINTF("%s exit\n", __func__); +} + +static void +atapi_ioreq_cb(struct blockif_req *br, int err) +{ + struct ahci_cmd_hdr *hdr; + struct ahci_ioreq *aior; + struct ahci_port *p; + struct pci_ahci_softc *sc; + uint8_t *cfis; + uint32_t tfd; + int pending, slot; + + DPRINTF("%s %d\n", __func__, err); + + aior = br->br_param; + p = aior->io_pr; + cfis = aior->cfis; + slot = aior->slot; + pending = aior->prdtl; + sc = p->pr_sc; + hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + aior->slot * AHCI_CL_SIZE); + + pthread_mutex_lock(&sc->mtx); + + /* + * Delete the blockif request from the busy list + */ + TAILQ_REMOVE(&p->iobhd, aior, io_blist); + + /* + * Move the blockif request back to the free list + */ + STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); + + if (pending && !err) { + atapi_read(p, slot, cfis, aior->done, hdr->prdtl - pending); + goto out; + } + + if (!err && aior->done == aior->len) { + tfd = ATA_S_READY | ATA_S_DSC; + hdr->prdbc = aior->len; + } else { + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x21; + tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; + hdr->prdbc = 0; + } + + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, tfd); + + /* + * This command is now complete. 
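The completion path in ata_ioreq_cb() below splits on NCQ: queued commands retire their SACT bit and complete via a Set Device Bits FIS, while legacy commands complete via a D2H register FIS. A compact sketch of that split, where write_sdb and write_d2h stand in for the ahci_write_fis_* helpers:

#include <stdint.h>
#include <stdbool.h>

/* Illustrative model of the NCQ/legacy completion split. */
static void
complete_slot(uint32_t *sact, int slot, bool ncq, uint32_t tfd,
    void (*write_sdb)(int, uint32_t), void (*write_d2h)(int, uint32_t))
{
	if (ncq) {
		*sact &= ~(1u << slot);	/* retire the NCQ tag */
		write_sdb(slot, tfd);	/* Set Device Bits FIS */
	} else {
		write_d2h(slot, tfd);	/* D2H register FIS */
	}
}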
+ */ + p->pending &= ~(1 << slot); + + ahci_check_stopped(p); +out: + pthread_mutex_unlock(&sc->mtx); + DPRINTF("%s exit\n", __func__); +} + +static void +pci_ahci_ioreq_init(struct ahci_port *pr) +{ + struct ahci_ioreq *vr; + int i; + + pr->ioqsz = blockif_queuesz(pr->bctx); + pr->ioreq = calloc(pr->ioqsz, sizeof(struct ahci_ioreq)); + STAILQ_INIT(&pr->iofhd); + + /* + * Add all i/o request entries to the free queue + */ + for (i = 0; i < pr->ioqsz; i++) { + vr = &pr->ioreq[i]; + vr->io_pr = pr; + if (!pr->atapi) + vr->io_req.br_callback = ata_ioreq_cb; + else + vr->io_req.br_callback = atapi_ioreq_cb; + vr->io_req.br_param = vr; + STAILQ_INSERT_TAIL(&pr->iofhd, vr, io_flist); + } + + TAILQ_INIT(&pr->iobhd); +} + +static void +pci_ahci_port_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value) +{ + int port = (offset - AHCI_OFFSET) / AHCI_STEP; + offset = (offset - AHCI_OFFSET) % AHCI_STEP; + struct ahci_port *p = &sc->port[port]; + + DPRINTF("pci_ahci_port %d: write offset 0x%"PRIx64" value 0x%"PRIx64"\n", + port, offset, value); + + switch (offset) { + case AHCI_P_CLB: + p->clb = value; + break; + case AHCI_P_CLBU: + p->clbu = value; + break; + case AHCI_P_FB: + p->fb = value; + break; + case AHCI_P_FBU: + p->fbu = value; + break; + case AHCI_P_IS: + p->is &= ~value; + break; + case AHCI_P_IE: + p->ie = value & 0xFDC000FF; + ahci_generate_intr(sc); + break; + case AHCI_P_CMD: + { + p->cmd = value; + + if (!(value & AHCI_P_CMD_ST)) { + ahci_port_stop(p); + } else { + uint64_t clb; + + p->cmd |= AHCI_P_CMD_CR; + clb = (uint64_t)p->clbu << 32 | p->clb; + p->cmd_lst = paddr_guest2host(ahci_ctx(sc), clb, + AHCI_CL_SIZE * AHCI_MAX_SLOTS); + } + + if (value & AHCI_P_CMD_FRE) { + uint64_t fb; + + p->cmd |= AHCI_P_CMD_FR; + fb = (uint64_t)p->fbu << 32 | p->fb; + /* we don't support FBSCP, so rfis size is 256Bytes */ + p->rfis = paddr_guest2host(ahci_ctx(sc), fb, 256); + } else { + p->cmd &= ~AHCI_P_CMD_FR; + } + + if (value & AHCI_P_CMD_CLO) { + p->tfd = 0; + p->cmd &= ~AHCI_P_CMD_CLO; + } + + ahci_handle_port(p); + break; + } + case AHCI_P_TFD: + case AHCI_P_SIG: + case AHCI_P_SSTS: + WPRINTF("pci_ahci_port: read only registers 0x%"PRIx64"\n", offset); + break; + case AHCI_P_SCTL: + if (!(p->cmd & AHCI_P_CMD_ST)) { + if (value & ATA_SC_DET_RESET) + ahci_port_reset(p); + p->sctl = value; + } + break; + case AHCI_P_SERR: + p->serr &= ~value; + break; + case AHCI_P_SACT: + p->sact |= value; + break; + case AHCI_P_CI: + p->ci |= value; + ahci_handle_port(p); + break; + case AHCI_P_SNTF: + case AHCI_P_FBS: + default: + break; + } +} + +static void +pci_ahci_host_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value) +{ + DPRINTF("pci_ahci_host: write offset 0x%"PRIx64" value 0x%"PRIx64"\n", + offset, value); + + switch (offset) { + case AHCI_CAP: + case AHCI_PI: + case AHCI_VS: + case AHCI_CAP2: + DPRINTF("pci_ahci_host: read only registers 0x%"PRIx64"\n", offset); + break; + case AHCI_GHC: + if (value & AHCI_GHC_HR) + ahci_reset(sc); + else if (value & AHCI_GHC_IE) { + sc->ghc |= AHCI_GHC_IE; + ahci_generate_intr(sc); + } + break; + case AHCI_IS: + sc->is &= ~value; + ahci_generate_intr(sc); + break; + default: + break; + } +} + +static void +pci_ahci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value) +{ + struct pci_ahci_softc *sc = pi->pi_arg; + + assert(baridx == 5); + assert(size == 4); + + pthread_mutex_lock(&sc->mtx); + + if (offset < AHCI_OFFSET) + pci_ahci_host_write(sc, offset, value); + else if (offset 
< AHCI_OFFSET + sc->ports * AHCI_STEP) + pci_ahci_port_write(sc, offset, value); + else + WPRINTF("pci_ahci: unknown i/o write offset 0x%"PRIx64"\n", offset); + + pthread_mutex_unlock(&sc->mtx); +} + +static uint64_t +pci_ahci_host_read(struct pci_ahci_softc *sc, uint64_t offset) +{ + uint32_t value; + + switch (offset) { + case AHCI_CAP: + case AHCI_GHC: + case AHCI_IS: + case AHCI_PI: + case AHCI_VS: + case AHCI_CCCC: + case AHCI_CCCP: + case AHCI_EM_LOC: + case AHCI_EM_CTL: + case AHCI_CAP2: + { + uint32_t *p = &sc->cap; + p += (offset - AHCI_CAP) / sizeof(uint32_t); + value = *p; + break; + } + default: + value = 0; + break; + } + DPRINTF("pci_ahci_host: read offset 0x%"PRIx64" value 0x%x\n", + offset, value); + + return (value); +} + +static uint64_t +pci_ahci_port_read(struct pci_ahci_softc *sc, uint64_t offset) +{ + uint32_t value; + int port = (offset - AHCI_OFFSET) / AHCI_STEP; + offset = (offset - AHCI_OFFSET) % AHCI_STEP; + + switch (offset) { + case AHCI_P_CLB: + case AHCI_P_CLBU: + case AHCI_P_FB: + case AHCI_P_FBU: + case AHCI_P_IS: + case AHCI_P_IE: + case AHCI_P_CMD: + case AHCI_P_TFD: + case AHCI_P_SIG: + case AHCI_P_SSTS: + case AHCI_P_SCTL: + case AHCI_P_SERR: + case AHCI_P_SACT: + case AHCI_P_CI: + case AHCI_P_SNTF: + case AHCI_P_FBS: + { + uint32_t *p= &sc->port[port].clb; + p += (offset - AHCI_P_CLB) / sizeof(uint32_t); + value = *p; + break; + } + default: + value = 0; + break; + } + + DPRINTF("pci_ahci_port %d: read offset 0x%"PRIx64" value 0x%x\n", + port, offset, value); + + return value; +} + +static uint64_t +pci_ahci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size) +{ + struct pci_ahci_softc *sc = pi->pi_arg; + uint32_t value; + + assert(baridx == 5); + assert(size == 4); + + pthread_mutex_lock(&sc->mtx); + + if (offset < AHCI_OFFSET) + value = pci_ahci_host_read(sc, offset); + else if (offset < AHCI_OFFSET + sc->ports * AHCI_STEP) + value = pci_ahci_port_read(sc, offset); + else { + value = 0; + WPRINTF("pci_ahci: unknown i/o read offset 0x%"PRIx64"\n", offset); + } + + pthread_mutex_unlock(&sc->mtx); + + return (value); +} + +static int +pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts, int atapi) +{ + char bident[sizeof("XX:X:X")]; + struct blockif_ctxt *bctxt; + struct pci_ahci_softc *sc; + int ret, slots; + + ret = 0; + + if (opts == NULL) { + fprintf(stderr, "pci_ahci: backing device required\n"); + return (1); + } + +#ifdef AHCI_DEBUG + dbg = fopen("/tmp/log", "w+"); +#endif + + sc = calloc(1, sizeof(struct pci_ahci_softc)); + pi->pi_arg = sc; + sc->asc_pi = pi; + sc->ports = MAX_PORTS; + + /* + * Only use port 0 for a backing device. All other ports will be + * marked as unused + */ + sc->port[0].atapi = atapi; + + /* + * Attempt to open the backing image. Use the PCI + * slot/func for the identifier string. 
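The host and port read helpers above depend on the softc storing its register words contiguously, in MMIO-offset order, so a register read reduces to pointer arithmetic from the first field. A self-contained sketch of the idiom; the field names here are stand-ins, not the patch's actual softc layout:

#include <stdint.h>

struct regs_sketch {
	uint32_t cap;	/* offset 0x00 */
	uint32_t ghc;	/* offset 0x04 */
	uint32_t is;	/* offset 0x08 */
	uint32_t pi;	/* offset 0x0c */
};

/* Same trick as pci_ahci_host_read(): index off the first register. */
static uint32_t
reg_read(struct regs_sketch *r, uint64_t offset)
{
	return (*(&r->cap + offset / sizeof (uint32_t)));
}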
+ */ + snprintf(bident, sizeof(bident), "%d:%d", pi->pi_slot, pi->pi_func); + bctxt = blockif_open(opts, bident); + if (bctxt == NULL) { + ret = 1; + goto open_fail; + } + sc->port[0].bctx = bctxt; + sc->port[0].pr_sc = sc; + + /* + * Allocate blockif request structures and add them + * to the free list + */ + pci_ahci_ioreq_init(&sc->port[0]); + + pthread_mutex_init(&sc->mtx, NULL); + + /* Intel ICH8 AHCI */ + slots = sc->port[0].ioqsz; + if (slots > 32) + slots = 32; + --slots; + sc->cap = AHCI_CAP_64BIT | AHCI_CAP_SNCQ | AHCI_CAP_SSNTF | + AHCI_CAP_SMPS | AHCI_CAP_SSS | AHCI_CAP_SALP | + AHCI_CAP_SAL | AHCI_CAP_SCLO | (0x3 << AHCI_CAP_ISS_SHIFT)| + AHCI_CAP_PMD | AHCI_CAP_SSC | AHCI_CAP_PSC | + (slots << AHCI_CAP_NCS_SHIFT) | AHCI_CAP_SXS | (sc->ports - 1); + + /* Only port 0 implemented */ + sc->pi = 1; + sc->vs = 0x10300; + sc->cap2 = AHCI_CAP2_APST; + ahci_reset(sc); + + pci_set_cfgdata16(pi, PCIR_DEVICE, 0x2821); + pci_set_cfgdata16(pi, PCIR_VENDOR, 0x8086); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); + pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_SATA); + pci_set_cfgdata8(pi, PCIR_PROGIF, PCIP_STORAGE_SATA_AHCI_1_0); + pci_emul_add_msicap(pi, 1); + pci_emul_alloc_bar(pi, 5, PCIBAR_MEM32, + AHCI_OFFSET + sc->ports * AHCI_STEP); + + pci_lintr_request(pi); + +open_fail: + if (ret) { + blockif_close(sc->port[0].bctx); + free(sc); + } + + return (ret); +} + +static int +pci_ahci_hd_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + + return (pci_ahci_init(ctx, pi, opts, 0)); +} + +static int +pci_ahci_atapi_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + + return (pci_ahci_init(ctx, pi, opts, 1)); +} + +/* + * Use separate emulation names to distinguish drive and atapi devices + */ +struct pci_devemu pci_de_ahci_hd = { + .pe_emu = "ahci-hd", + .pe_init = pci_ahci_hd_init, + .pe_barwrite = pci_ahci_write, + .pe_barread = pci_ahci_read +}; +PCI_EMUL_SET(pci_de_ahci_hd); + +struct pci_devemu pci_de_ahci_cd = { + .pe_emu = "ahci-cd", + .pe_init = pci_ahci_atapi_init, + .pe_barwrite = pci_ahci_write, + .pe_barread = pci_ahci_read +}; +PCI_EMUL_SET(pci_de_ahci_cd); diff --git a/usr/src/cmd/bhyve/pci_emul.c b/usr/src/cmd/bhyve/pci_emul.c new file mode 100644 index 0000000000..3b4ca805cc --- /dev/null +++ b/usr/src/cmd/bhyve/pci_emul.c @@ -0,0 +1,2103 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/pci_emul.c 269700 2014-08-08 03:49:01Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + */ + +#include +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_emul.c 269700 2014-08-08 03:49:01Z neel $"); + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "acpi.h" +#include "bhyverun.h" +#include "inout.h" +#include "ioapic.h" +#include "mem.h" +#include "pci_emul.h" +#include "pci_irq.h" +#include "pci_lpc.h" + +#define CONF1_ADDR_PORT 0x0cf8 +#define CONF1_DATA_PORT 0x0cfc + +#define CONF1_ENABLE 0x80000000ul + +#define CFGWRITE(pi,off,val,b) \ +do { \ + if ((b) == 1) { \ + pci_set_cfgdata8((pi),(off),(val)); \ + } else if ((b) == 2) { \ + pci_set_cfgdata16((pi),(off),(val)); \ + } else { \ + pci_set_cfgdata32((pi),(off),(val)); \ + } \ +} while (0) + +#define MAXBUSES (PCI_BUSMAX + 1) +#define MAXSLOTS (PCI_SLOTMAX + 1) +#define MAXFUNCS (PCI_FUNCMAX + 1) + +struct funcinfo { + char *fi_name; + char *fi_param; + struct pci_devinst *fi_devi; +}; + +struct intxinfo { + int ii_count; + int ii_pirq_pin; + int ii_ioapic_irq; +}; + +struct slotinfo { + struct intxinfo si_intpins[4]; + struct funcinfo si_funcs[MAXFUNCS]; +}; + +struct businfo { + uint16_t iobase, iolimit; /* I/O window */ + uint32_t membase32, memlimit32; /* mmio window below 4GB */ + uint64_t membase64, memlimit64; /* mmio window above 4GB */ + struct slotinfo slotinfo[MAXSLOTS]; +}; + +static struct businfo *pci_businfo[MAXBUSES]; + +SET_DECLARE(pci_devemu_set, struct pci_devemu); + +static uint64_t pci_emul_iobase; +static uint64_t pci_emul_membase32; +static uint64_t pci_emul_membase64; + +#define PCI_EMUL_IOBASE 0x2000 +#define PCI_EMUL_IOLIMIT 0x10000 + +#define PCI_EMUL_ECFG_BASE 0xE0000000 /* 3.5GB */ +#define PCI_EMUL_ECFG_SIZE (MAXBUSES * 1024 * 1024) /* 1MB per bus */ +SYSRES_MEM(PCI_EMUL_ECFG_BASE, PCI_EMUL_ECFG_SIZE); + +#define PCI_EMUL_MEMLIMIT32 PCI_EMUL_ECFG_BASE + +#define PCI_EMUL_MEMBASE64 0xD000000000UL +#define PCI_EMUL_MEMLIMIT64 0xFD00000000UL + +static struct pci_devemu *pci_emul_finddev(char *name); +static void pci_lintr_route(struct pci_devinst *pi); +static void pci_lintr_update(struct pci_devinst *pi); +static void pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, + int func, int coff, int bytes, uint32_t *val); + +/* + * I/O access + */ + +/* + * Slot options are in the form: + * + * ::,[,] + * [:],[,] + * + * slot is 0..31 + * func is 0..7 + * emul is a string describing the type of PCI device e.g. 
virtio-net + * config is an optional string, depending on the device, that can be + * used for configuration. + * Examples are: + * 1,virtio-net,tap0 + * 3:0,dummy + */ +static void +pci_parse_slot_usage(char *aopt) +{ + + fprintf(stderr, "Invalid PCI slot info field \"%s\"\n", aopt); +} + +int +pci_parse_slot(char *opt) +{ + struct businfo *bi; + struct slotinfo *si; + char *emul, *config, *str, *cp; + int error, bnum, snum, fnum; + + error = -1; + str = strdup(opt); + + emul = config = NULL; + if ((cp = strchr(str, ',')) != NULL) { + *cp = '\0'; + emul = cp + 1; + if ((cp = strchr(emul, ',')) != NULL) { + *cp = '\0'; + config = cp + 1; + } + } else { + pci_parse_slot_usage(opt); + goto done; + } + + /* :: */ + if (sscanf(str, "%d:%d:%d", &bnum, &snum, &fnum) != 3) { + bnum = 0; + /* : */ + if (sscanf(str, "%d:%d", &snum, &fnum) != 2) { + fnum = 0; + /* */ + if (sscanf(str, "%d", &snum) != 1) { + snum = -1; + } + } + } + + if (bnum < 0 || bnum >= MAXBUSES || snum < 0 || snum >= MAXSLOTS || + fnum < 0 || fnum >= MAXFUNCS) { + pci_parse_slot_usage(opt); + goto done; + } + + if (pci_businfo[bnum] == NULL) + pci_businfo[bnum] = calloc(1, sizeof(struct businfo)); + + bi = pci_businfo[bnum]; + si = &bi->slotinfo[snum]; + + if (si->si_funcs[fnum].fi_name != NULL) { + fprintf(stderr, "pci slot %d:%d already occupied!\n", + snum, fnum); + goto done; + } + + if (pci_emul_finddev(emul) == NULL) { + fprintf(stderr, "pci slot %d:%d: unknown device \"%s\"\n", + snum, fnum, emul); + goto done; + } + + error = 0; + si->si_funcs[fnum].fi_name = emul; + si->si_funcs[fnum].fi_param = config; + +done: + if (error) + free(str); + + return (error); +} + +static int +pci_valid_pba_offset(struct pci_devinst *pi, uint64_t offset) +{ + + if (offset < pi->pi_msix.pba_offset) + return (0); + + if (offset >= pi->pi_msix.pba_offset + pi->pi_msix.pba_size) { + return (0); + } + + return (1); +} + +int +pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size, + uint64_t value) +{ + int msix_entry_offset; + int tab_index; + char *dest; + + /* support only 4 or 8 byte writes */ + if (size != 4 && size != 8) + return (-1); + + /* + * Return if table index is beyond what device supports + */ + tab_index = offset / MSIX_TABLE_ENTRY_SIZE; + if (tab_index >= pi->pi_msix.table_count) + return (-1); + + msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; + + /* support only aligned writes */ + if ((msix_entry_offset % size) != 0) + return (-1); + + dest = (char *)(pi->pi_msix.table + tab_index); + dest += msix_entry_offset; + + if (size == 4) + *((uint32_t *)dest) = value; + else + *((uint64_t *)dest) = value; + + return (0); +} + +uint64_t +pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size) +{ + char *dest; + int msix_entry_offset; + int tab_index; + uint64_t retval = ~0; + + /* + * The PCI standard only allows 4 and 8 byte accesses to the MSI-X + * table but we also allow 1 byte access to accomodate reads from + * ddb. 
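A distilled version of the checks pci_emul_msix_twrite() applies before touching the table; the constant mirrors MSIX_TABLE_ENTRY_SIZE and everything else is illustrative:

#include <stdint.h>

#define SKETCH_ENTRY_SIZE	16	/* mirrors MSIX_TABLE_ENTRY_SIZE */

/*
 * Accept only naturally aligned 4- or 8-byte accesses that land
 * inside a valid MSI-X table entry.
 */
static int
msix_access_ok(uint64_t offset, int size, int table_count)
{
	int entry_off = (int)(offset % SKETCH_ENTRY_SIZE);

	if (size != 4 && size != 8)
		return (0);
	if (offset / SKETCH_ENTRY_SIZE >= (uint64_t)table_count)
		return (0);
	if ((entry_off % size) != 0)
		return (0);
	return (1);
}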
+ */ + if (size != 1 && size != 4 && size != 8) + return (retval); + + msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; + + /* support only aligned reads */ + if ((msix_entry_offset % size) != 0) { + return (retval); + } + + tab_index = offset / MSIX_TABLE_ENTRY_SIZE; + + if (tab_index < pi->pi_msix.table_count) { + /* valid MSI-X Table access */ + dest = (char *)(pi->pi_msix.table + tab_index); + dest += msix_entry_offset; + + if (size == 1) + retval = *((uint8_t *)dest); + else if (size == 4) + retval = *((uint32_t *)dest); + else + retval = *((uint64_t *)dest); + } else if (pci_valid_pba_offset(pi, offset)) { + /* return 0 for PBA access */ + retval = 0; + } + + return (retval); +} + +int +pci_msix_table_bar(struct pci_devinst *pi) +{ + + if (pi->pi_msix.table != NULL) + return (pi->pi_msix.table_bar); + else + return (-1); +} + +int +pci_msix_pba_bar(struct pci_devinst *pi) +{ + + if (pi->pi_msix.table != NULL) + return (pi->pi_msix.pba_bar); + else + return (-1); +} + +static int +pci_emul_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + struct pci_devinst *pdi = arg; + struct pci_devemu *pe = pdi->pi_d; + uint64_t offset; + int i; + + for (i = 0; i <= PCI_BARMAX; i++) { + if (pdi->pi_bar[i].type == PCIBAR_IO && + port >= pdi->pi_bar[i].addr && + port + bytes <= pdi->pi_bar[i].addr + pdi->pi_bar[i].size) { + offset = port - pdi->pi_bar[i].addr; + if (in) + *eax = (*pe->pe_barread)(ctx, vcpu, pdi, i, + offset, bytes); + else + (*pe->pe_barwrite)(ctx, vcpu, pdi, i, offset, + bytes, *eax); + return (0); + } + } + return (-1); +} + +static int +pci_emul_mem_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, + int size, uint64_t *val, void *arg1, long arg2) +{ + struct pci_devinst *pdi = arg1; + struct pci_devemu *pe = pdi->pi_d; + uint64_t offset; + int bidx = (int) arg2; + + assert(bidx <= PCI_BARMAX); + assert(pdi->pi_bar[bidx].type == PCIBAR_MEM32 || + pdi->pi_bar[bidx].type == PCIBAR_MEM64); + assert(addr >= pdi->pi_bar[bidx].addr && + addr + size <= pdi->pi_bar[bidx].addr + pdi->pi_bar[bidx].size); + + offset = addr - pdi->pi_bar[bidx].addr; + + if (dir == MEM_F_WRITE) { + if (size == 8) { + (*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset, + 4, *val & 0xffffffff); + (*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset + 4, + 4, *val >> 32); + } else { + (*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset, + size, *val); + } + } else { + if (size == 8) { + *val = (*pe->pe_barread)(ctx, vcpu, pdi, bidx, + offset, 4); + *val |= (*pe->pe_barread)(ctx, vcpu, pdi, bidx, + offset + 4, 4) << 32; + } else { + *val = (*pe->pe_barread)(ctx, vcpu, pdi, bidx, + offset, size); + } + } + + return (0); +} + + +static int +pci_emul_alloc_resource(uint64_t *baseptr, uint64_t limit, uint64_t size, + uint64_t *addr) +{ + uint64_t base; + + assert((size & (size - 1)) == 0); /* must be a power of 2 */ + + base = roundup2(*baseptr, size); + + if (base + size <= limit) { + *addr = base; + *baseptr = base + size; + return (0); + } else + return (-1); +} + +int +pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type, + uint64_t size) +{ + + return (pci_emul_alloc_pbar(pdi, idx, 0, type, size)); +} + +/* + * Register (or unregister) the MMIO or I/O region associated with the BAR + * register 'idx' of an emulated pci device. 
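pci_emul_alloc_resource() above works because BAR sizes are forced to powers of two, so natural alignment is a single round-up of the moving base pointer. An equivalent sketch:

#include <stdint.h>

/* 'size' must be a power of two, as the allocator asserts. */
static int
carve(uint64_t *baseptr, uint64_t limit, uint64_t size, uint64_t *addr)
{
	uint64_t base = (*baseptr + size - 1) & ~(size - 1);	/* roundup2 */

	if (base + size > limit)
		return (-1);
	*addr = base;
	*baseptr = base + size;
	return (0);
}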
+ */ +static void +modify_bar_registration(struct pci_devinst *pi, int idx, int registration) +{ + int error; + struct inout_port iop; + struct mem_range mr; + + switch (pi->pi_bar[idx].type) { + case PCIBAR_IO: + bzero(&iop, sizeof(struct inout_port)); + iop.name = pi->pi_name; + iop.port = pi->pi_bar[idx].addr; + iop.size = pi->pi_bar[idx].size; + if (registration) { + iop.flags = IOPORT_F_INOUT; + iop.handler = pci_emul_io_handler; + iop.arg = pi; + error = register_inout(&iop); + } else + error = unregister_inout(&iop); + break; + case PCIBAR_MEM32: + case PCIBAR_MEM64: + bzero(&mr, sizeof(struct mem_range)); + mr.name = pi->pi_name; + mr.base = pi->pi_bar[idx].addr; + mr.size = pi->pi_bar[idx].size; + if (registration) { + mr.flags = MEM_F_RW; + mr.handler = pci_emul_mem_handler; + mr.arg1 = pi; + mr.arg2 = idx; + error = register_mem(&mr); + } else + error = unregister_mem(&mr); + break; + default: + error = EINVAL; + break; + } + assert(error == 0); +} + +static void +unregister_bar(struct pci_devinst *pi, int idx) +{ + + modify_bar_registration(pi, idx, 0); +} + +static void +register_bar(struct pci_devinst *pi, int idx) +{ + + modify_bar_registration(pi, idx, 1); +} + +/* Are we decoding i/o port accesses for the emulated pci device? */ +static int +porten(struct pci_devinst *pi) +{ + uint16_t cmd; + + cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); + + return (cmd & PCIM_CMD_PORTEN); +} + +/* Are we decoding memory accesses for the emulated pci device? */ +static int +memen(struct pci_devinst *pi) +{ + uint16_t cmd; + + cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); + + return (cmd & PCIM_CMD_MEMEN); +} + +/* + * Update the MMIO or I/O address that is decoded by the BAR register. + * + * If the pci device has enabled the address space decoding then intercept + * the address range decoded by the BAR register. + */ +static void +update_bar_address(struct pci_devinst *pi, uint64_t addr, int idx, int type) +{ + int decode; + + if (pi->pi_bar[idx].type == PCIBAR_IO) + decode = porten(pi); + else + decode = memen(pi); + + if (decode) + unregister_bar(pi, idx); + + switch (type) { + case PCIBAR_IO: + case PCIBAR_MEM32: + pi->pi_bar[idx].addr = addr; + break; + case PCIBAR_MEM64: + pi->pi_bar[idx].addr &= ~0xffffffffUL; + pi->pi_bar[idx].addr |= addr; + break; + case PCIBAR_MEMHI64: + pi->pi_bar[idx].addr &= 0xffffffff; + pi->pi_bar[idx].addr |= addr; + break; + default: + assert(0); + } + + if (decode) + register_bar(pi, idx); +} + +int +pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, uint64_t hostbase, + enum pcibar_type type, uint64_t size) +{ + int error; + uint64_t *baseptr, limit, addr, mask, lobits, bar; + + assert(idx >= 0 && idx <= PCI_BARMAX); + + if ((size & (size - 1)) != 0) + size = 1UL << flsl(size); /* round up to a power of 2 */ + + /* Enforce minimum BAR sizes required by the PCI standard */ + if (type == PCIBAR_IO) { + if (size < 4) + size = 4; + } else { + if (size < 16) + size = 16; + } + + switch (type) { + case PCIBAR_NONE: + baseptr = NULL; + addr = mask = lobits = 0; + break; + case PCIBAR_IO: + baseptr = &pci_emul_iobase; + limit = PCI_EMUL_IOLIMIT; + mask = PCIM_BAR_IO_BASE; + lobits = PCIM_BAR_IO_SPACE; + break; + case PCIBAR_MEM64: + /* + * XXX + * Some drivers do not work well if the 64-bit BAR is allocated + * above 4GB. Allow for this by allocating small requests under + * 4GB unless then allocation size is larger than some arbitrary + * number (32MB currently). 
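For the PCIBAR_MEM64 case handled below, the base ends up split across two config dwords: the low dword carries the standard type and prefetch flags, the high dword is simply bits 63:32. A sketch using the standard flag values, with stand-in macro names:

#include <stdint.h>

/* stand-ins for PCIM_BAR_MEM_SPACE / _MEM_64 / _MEM_PREFETCH */
#define SK_MEM_SPACE	0x0
#define SK_MEM_64	0x4
#define SK_MEM_PREFETCH	0x8

/* Split a 64-bit memory BAR base across the two config dwords. */
static void
encode_bar64(uint64_t addr, uint32_t *lo, uint32_t *hi)
{
	*lo = (uint32_t)addr | SK_MEM_SPACE | SK_MEM_64 | SK_MEM_PREFETCH;
	*hi = (uint32_t)(addr >> 32);
}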
+ */ + if (size > 32 * 1024 * 1024) { + /* + * XXX special case for device requiring peer-peer DMA + */ + if (size == 0x100000000UL) + baseptr = &hostbase; + else + baseptr = &pci_emul_membase64; + limit = PCI_EMUL_MEMLIMIT64; + mask = PCIM_BAR_MEM_BASE; + lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 | + PCIM_BAR_MEM_PREFETCH; + break; + } else { + baseptr = &pci_emul_membase32; + limit = PCI_EMUL_MEMLIMIT32; + mask = PCIM_BAR_MEM_BASE; + lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64; + } + break; + case PCIBAR_MEM32: + baseptr = &pci_emul_membase32; + limit = PCI_EMUL_MEMLIMIT32; + mask = PCIM_BAR_MEM_BASE; + lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32; + break; + default: + printf("pci_emul_alloc_base: invalid bar type %d\n", type); + assert(0); + } + + if (baseptr != NULL) { + error = pci_emul_alloc_resource(baseptr, limit, size, &addr); + if (error != 0) + return (error); + } + + pdi->pi_bar[idx].type = type; + pdi->pi_bar[idx].addr = addr; + pdi->pi_bar[idx].size = size; + + /* Initialize the BAR register in config space */ + bar = (addr & mask) | lobits; + pci_set_cfgdata32(pdi, PCIR_BAR(idx), bar); + + if (type == PCIBAR_MEM64) { + assert(idx + 1 <= PCI_BARMAX); + pdi->pi_bar[idx + 1].type = PCIBAR_MEMHI64; + pci_set_cfgdata32(pdi, PCIR_BAR(idx + 1), bar >> 32); + } + + register_bar(pdi, idx); + + return (0); +} + +#define CAP_START_OFFSET 0x40 +static int +pci_emul_add_capability(struct pci_devinst *pi, u_char *capdata, int caplen) +{ + int i, capoff, reallen; + uint16_t sts; + + assert(caplen > 0); + + reallen = roundup2(caplen, 4); /* dword aligned */ + + sts = pci_get_cfgdata16(pi, PCIR_STATUS); + if ((sts & PCIM_STATUS_CAPPRESENT) == 0) + capoff = CAP_START_OFFSET; + else + capoff = pi->pi_capend + 1; + + /* Check if we have enough space */ + if (capoff + reallen > PCI_REGMAX + 1) + return (-1); + + /* Set the previous capability pointer */ + if ((sts & PCIM_STATUS_CAPPRESENT) == 0) { + pci_set_cfgdata8(pi, PCIR_CAP_PTR, capoff); + pci_set_cfgdata16(pi, PCIR_STATUS, sts|PCIM_STATUS_CAPPRESENT); + } else + pci_set_cfgdata8(pi, pi->pi_prevcap + 1, capoff); + + /* Copy the capability */ + for (i = 0; i < caplen; i++) + pci_set_cfgdata8(pi, capoff + i, capdata[i]); + + /* Set the next capability pointer */ + pci_set_cfgdata8(pi, capoff + 1, 0); + + pi->pi_prevcap = capoff; + pi->pi_capend = capoff + reallen - 1; + return (0); +} + +static struct pci_devemu * +pci_emul_finddev(char *name) +{ + struct pci_devemu **pdpp, *pdp; + + SET_FOREACH(pdpp, pci_devemu_set) { + pdp = *pdpp; + if (!strcmp(pdp->pe_emu, name)) { + return (pdp); + } + } + + return (NULL); +} + +static int +pci_emul_init(struct vmctx *ctx, struct pci_devemu *pde, int bus, int slot, + int func, struct funcinfo *fi) +{ + struct pci_devinst *pdi; + int err; + + pdi = calloc(1, sizeof(struct pci_devinst)); + + pdi->pi_vmctx = ctx; + pdi->pi_bus = bus; + pdi->pi_slot = slot; + pdi->pi_func = func; + pthread_mutex_init(&pdi->pi_lintr.lock, NULL); + pdi->pi_lintr.pin = 0; + pdi->pi_lintr.state = IDLE; + pdi->pi_lintr.pirq_pin = 0; + pdi->pi_lintr.ioapic_irq = 0; + pdi->pi_d = pde; + snprintf(pdi->pi_name, PI_NAMESZ, "%s-pci-%d", pde->pe_emu, slot); + + /* Disable legacy interrupts */ + pci_set_cfgdata8(pdi, PCIR_INTLINE, 255); + pci_set_cfgdata8(pdi, PCIR_INTPIN, 0); + + pci_set_cfgdata8(pdi, PCIR_COMMAND, + PCIM_CMD_PORTEN | PCIM_CMD_MEMEN | PCIM_CMD_BUSMASTEREN); + + err = (*pde->pe_init)(ctx, pdi, fi->fi_param); + if (err == 0) + fi->fi_devi = pdi; + else + free(pdi); + + return (err); +} + +void 
+pci_populate_msicap(struct msicap *msicap, int msgnum, int nextptr) +{ + int mmc; + + CTASSERT(sizeof(struct msicap) == 14); + + /* Number of msi messages must be a power of 2 between 1 and 32 */ + assert((msgnum & (msgnum - 1)) == 0 && msgnum >= 1 && msgnum <= 32); + mmc = ffs(msgnum) - 1; + + bzero(msicap, sizeof(struct msicap)); + msicap->capid = PCIY_MSI; + msicap->nextptr = nextptr; + msicap->msgctrl = PCIM_MSICTRL_64BIT | (mmc << 1); +} + +int +pci_emul_add_msicap(struct pci_devinst *pi, int msgnum) +{ + struct msicap msicap; + + pci_populate_msicap(&msicap, msgnum, 0); + + return (pci_emul_add_capability(pi, (u_char *)&msicap, sizeof(msicap))); +} + +static void +pci_populate_msixcap(struct msixcap *msixcap, int msgnum, int barnum, + uint32_t msix_tab_size) +{ + CTASSERT(sizeof(struct msixcap) == 12); + + assert(msix_tab_size % 4096 == 0); + + bzero(msixcap, sizeof(struct msixcap)); + msixcap->capid = PCIY_MSIX; + + /* + * Message Control Register, all fields set to + * zero except for the Table Size. + * Note: Table size N is encoded as N-1 + */ + msixcap->msgctrl = msgnum - 1; + + /* + * MSI-X BAR setup: + * - MSI-X table start at offset 0 + * - PBA table starts at a 4K aligned offset after the MSI-X table + */ + msixcap->table_info = barnum & PCIM_MSIX_BIR_MASK; + msixcap->pba_info = msix_tab_size | (barnum & PCIM_MSIX_BIR_MASK); +} + +static void +pci_msix_table_init(struct pci_devinst *pi, int table_entries) +{ + int i, table_size; + + assert(table_entries > 0); + assert(table_entries <= MAX_MSIX_TABLE_ENTRIES); + + table_size = table_entries * MSIX_TABLE_ENTRY_SIZE; + pi->pi_msix.table = calloc(1, table_size); + + /* set mask bit of vector control register */ + for (i = 0; i < table_entries; i++) + pi->pi_msix.table[i].vector_control |= PCIM_MSIX_VCTRL_MASK; +} + +int +pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum) +{ + uint32_t tab_size; + struct msixcap msixcap; + + assert(msgnum >= 1 && msgnum <= MAX_MSIX_TABLE_ENTRIES); + assert(barnum >= 0 && barnum <= PCIR_MAX_BAR_0); + + tab_size = msgnum * MSIX_TABLE_ENTRY_SIZE; + + /* Align table size to nearest 4K */ + tab_size = roundup2(tab_size, 4096); + + pi->pi_msix.table_bar = barnum; + pi->pi_msix.pba_bar = barnum; + pi->pi_msix.table_offset = 0; + pi->pi_msix.table_count = msgnum; + pi->pi_msix.pba_offset = tab_size; + pi->pi_msix.pba_size = PBA_SIZE(msgnum); + + pci_msix_table_init(pi, msgnum); + + pci_populate_msixcap(&msixcap, msgnum, barnum, tab_size); + + /* allocate memory for MSI-X Table and PBA */ + pci_emul_alloc_bar(pi, barnum, PCIBAR_MEM32, + tab_size + pi->pi_msix.pba_size); + + return (pci_emul_add_capability(pi, (u_char *)&msixcap, + sizeof(msixcap))); +} + +void +msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, + int bytes, uint32_t val) +{ + uint16_t msgctrl, rwmask; + int off, table_bar; + + off = offset - capoff; + table_bar = pi->pi_msix.table_bar; + /* Message Control Register */ + if (off == 2 && bytes == 2) { + rwmask = PCIM_MSIXCTRL_MSIX_ENABLE | PCIM_MSIXCTRL_FUNCTION_MASK; + msgctrl = pci_get_cfgdata16(pi, offset); + msgctrl &= ~rwmask; + msgctrl |= val & rwmask; + val = msgctrl; + + pi->pi_msix.enabled = val & PCIM_MSIXCTRL_MSIX_ENABLE; + pi->pi_msix.function_mask = val & PCIM_MSIXCTRL_FUNCTION_MASK; + pci_lintr_update(pi); + } + + CFGWRITE(pi, offset, val, bytes); +} + +void +msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, + int bytes, uint32_t val) +{ + uint16_t msgctrl, rwmask, msgdata, mme; + uint32_t addrlo; + + /* + * If guest is writing to 
the message control register make sure + * we do not overwrite read-only fields. + */ + if ((offset - capoff) == 2 && bytes == 2) { + rwmask = PCIM_MSICTRL_MME_MASK | PCIM_MSICTRL_MSI_ENABLE; + msgctrl = pci_get_cfgdata16(pi, offset); + msgctrl &= ~rwmask; + msgctrl |= val & rwmask; + val = msgctrl; + + addrlo = pci_get_cfgdata32(pi, capoff + 4); + if (msgctrl & PCIM_MSICTRL_64BIT) + msgdata = pci_get_cfgdata16(pi, capoff + 12); + else + msgdata = pci_get_cfgdata16(pi, capoff + 8); + + mme = msgctrl & PCIM_MSICTRL_MME_MASK; + pi->pi_msi.enabled = msgctrl & PCIM_MSICTRL_MSI_ENABLE ? 1 : 0; + if (pi->pi_msi.enabled) { + pi->pi_msi.addr = addrlo; + pi->pi_msi.msg_data = msgdata; + pi->pi_msi.maxmsgnum = 1 << (mme >> 4); + } else { + pi->pi_msi.maxmsgnum = 0; + } + pci_lintr_update(pi); + } + + CFGWRITE(pi, offset, val, bytes); +} + +void +pciecap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, + int bytes, uint32_t val) +{ + + /* XXX don't write to the readonly parts */ + CFGWRITE(pi, offset, val, bytes); +} + +#define PCIECAP_VERSION 0x2 +int +pci_emul_add_pciecap(struct pci_devinst *pi, int type) +{ + int err; + struct pciecap pciecap; + + CTASSERT(sizeof(struct pciecap) == 60); + + if (type != PCIEM_TYPE_ROOT_PORT) + return (-1); + + bzero(&pciecap, sizeof(pciecap)); + + pciecap.capid = PCIY_EXPRESS; + pciecap.pcie_capabilities = PCIECAP_VERSION | PCIEM_TYPE_ROOT_PORT; + pciecap.link_capabilities = 0x411; /* gen1, x1 */ + pciecap.link_status = 0x11; /* gen1, x1 */ + + err = pci_emul_add_capability(pi, (u_char *)&pciecap, sizeof(pciecap)); + return (err); +} + +/* + * This function assumes that 'coff' is in the capabilities region of the + * config space. + */ +static void +pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, uint32_t val) +{ + int capid; + uint8_t capoff, nextoff; + + /* Do not allow un-aligned writes */ + if ((offset & (bytes - 1)) != 0) + return; + + /* Find the capability that we want to update */ + capoff = CAP_START_OFFSET; + while (1) { + nextoff = pci_get_cfgdata8(pi, capoff + 1); + if (nextoff == 0) + break; + if (offset >= capoff && offset < nextoff) + break; + + capoff = nextoff; + } + assert(offset >= capoff); + + /* + * Capability ID and Next Capability Pointer are readonly. + * However, some o/s's do 4-byte writes that include these. + * For this case, trim the write back to 2 bytes and adjust + * the data. + */ + if (offset == capoff || offset == capoff + 1) { + if (offset == capoff && bytes == 4) { + bytes = 2; + offset += 2; + val >>= 16; + } else + return; + } + + capid = pci_get_cfgdata8(pi, capoff); + switch (capid) { + case PCIY_MSI: + msicap_cfgwrite(pi, capoff, offset, bytes, val); + break; + case PCIY_MSIX: + msixcap_cfgwrite(pi, capoff, offset, bytes, val); + break; + case PCIY_EXPRESS: + pciecap_cfgwrite(pi, capoff, offset, bytes, val); + break; + default: + break; + } +} + +static int +pci_emul_iscap(struct pci_devinst *pi, int offset) +{ + uint16_t sts; + + sts = pci_get_cfgdata16(pi, PCIR_STATUS); + if ((sts & PCIM_STATUS_CAPPRESENT) != 0) { + if (offset >= CAP_START_OFFSET && offset <= pi->pi_capend) + return (1); + } + return (0); +} + +static int +pci_emul_fallback_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, + int size, uint64_t *val, void *arg1, long arg2) +{ + /* + * Ignore writes; return 0xff's for reads. The mem read code + * will take care of truncating to the correct size. 
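The MSI control-register decode above boils down to two fields: the enable bit, and Multiple Message Enable, which encodes log2 of the vector count in bits 6:4. A sketch, with the masks mirroring PCIM_MSICTRL_MSI_ENABLE and the MME field:

#include <stdint.h>

static int
msi_maxmsgnum(uint16_t msgctrl)
{
	if ((msgctrl & 0x0001) == 0)
		return (0);				/* MSI disabled */
	return (1 << ((msgctrl & 0x0070) >> 4));	/* 1..32 vectors */
}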
+ */ + if (dir == MEM_F_READ) { + *val = 0xffffffffffffffff; + } + + return (0); +} + +static int +pci_emul_ecfg_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, + int bytes, uint64_t *val, void *arg1, long arg2) +{ + int bus, slot, func, coff, in; + + coff = addr & 0xfff; + func = (addr >> 12) & 0x7; + slot = (addr >> 15) & 0x1f; + bus = (addr >> 20) & 0xff; + in = (dir == MEM_F_READ); + if (in) + *val = ~0UL; + pci_cfgrw(ctx, vcpu, in, bus, slot, func, coff, bytes, (uint32_t *)val); + return (0); +} + +uint64_t +pci_ecfg_base(void) +{ + + return (PCI_EMUL_ECFG_BASE); +} + +#define BUSIO_ROUNDUP 32 +#define BUSMEM_ROUNDUP (1024 * 1024) + +int +init_pci(struct vmctx *ctx) +{ + struct mem_range mr; + struct pci_devemu *pde; + struct businfo *bi; + struct slotinfo *si; + struct funcinfo *fi; + size_t lowmem; + int bus, slot, func; + int error; + + pci_emul_iobase = PCI_EMUL_IOBASE; + pci_emul_membase32 = vm_get_lowmem_limit(ctx); + pci_emul_membase64 = PCI_EMUL_MEMBASE64; + + for (bus = 0; bus < MAXBUSES; bus++) { + if ((bi = pci_businfo[bus]) == NULL) + continue; + /* + * Keep track of the i/o and memory resources allocated to + * this bus. + */ + bi->iobase = pci_emul_iobase; + bi->membase32 = pci_emul_membase32; + bi->membase64 = pci_emul_membase64; + + for (slot = 0; slot < MAXSLOTS; slot++) { + si = &bi->slotinfo[slot]; + for (func = 0; func < MAXFUNCS; func++) { + fi = &si->si_funcs[func]; + if (fi->fi_name == NULL) + continue; + pde = pci_emul_finddev(fi->fi_name); + assert(pde != NULL); + error = pci_emul_init(ctx, pde, bus, slot, + func, fi); + if (error) + return (error); + } + } + + /* + * Add some slop to the I/O and memory resources decoded by + * this bus to give a guest some flexibility if it wants to + * reprogram the BARs. + */ + pci_emul_iobase += BUSIO_ROUNDUP; + pci_emul_iobase = roundup2(pci_emul_iobase, BUSIO_ROUNDUP); + bi->iolimit = pci_emul_iobase; + + pci_emul_membase32 += BUSMEM_ROUNDUP; + pci_emul_membase32 = roundup2(pci_emul_membase32, + BUSMEM_ROUNDUP); + bi->memlimit32 = pci_emul_membase32; + + pci_emul_membase64 += BUSMEM_ROUNDUP; + pci_emul_membase64 = roundup2(pci_emul_membase64, + BUSMEM_ROUNDUP); + bi->memlimit64 = pci_emul_membase64; + } + + /* + * PCI backends are initialized before routing INTx interrupts + * so that LPC devices are able to reserve ISA IRQs before + * routing PIRQ pins. + */ + for (bus = 0; bus < MAXBUSES; bus++) { + if ((bi = pci_businfo[bus]) == NULL) + continue; + + for (slot = 0; slot < MAXSLOTS; slot++) { + si = &bi->slotinfo[slot]; + for (func = 0; func < MAXFUNCS; func++) { + fi = &si->si_funcs[func]; + if (fi->fi_devi == NULL) + continue; + pci_lintr_route(fi->fi_devi); + } + } + } + lpc_pirq_routed(); + + /* + * The guest physical memory map looks like the following: + * [0, lowmem) guest system memory + * [lowmem, lowmem_limit) memory hole (may be absent) + * [lowmem_limit, 0xE0000000) PCI hole (32-bit BAR allocation) + * [0xE0000000, 0xF0000000) PCI extended config window + * [0xF0000000, 4GB) LAPIC, IOAPIC, HPET, firmware + * [4GB, 4GB + highmem) + */ + + /* + * Accesses to memory addresses that are not allocated to system + * memory or PCI devices return 0xff's. 
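The ECFG handler above slices a memory-mapped config address into bus/slot/func/register. Composing the address in the other direction can help when reasoning about the window; a sketch, where the 0xE0000000 example matches PCI_EMUL_ECFG_BASE:

#include <stdint.h>

static uint64_t
ecfg_addr(uint64_t base, int bus, int slot, int func, int reg)
{
	return (base | ((uint64_t)bus << 20) | ((uint64_t)slot << 15) |
	    ((uint64_t)func << 12) | (uint64_t)(reg & 0xfff));
}
/* e.g. ecfg_addr(0xE0000000, 0, 3, 0, 0x10) addresses BAR0 of 0:3:0 */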
+ */ + lowmem = vm_get_lowmem_size(ctx); + bzero(&mr, sizeof(struct mem_range)); + mr.name = "PCI hole"; + mr.flags = MEM_F_RW | MEM_F_IMMUTABLE; + mr.base = lowmem; + mr.size = (4ULL * 1024 * 1024 * 1024) - lowmem; + mr.handler = pci_emul_fallback_handler; + error = register_mem_fallback(&mr); + assert(error == 0); + + /* PCI extended config space */ + bzero(&mr, sizeof(struct mem_range)); + mr.name = "PCI ECFG"; + mr.flags = MEM_F_RW | MEM_F_IMMUTABLE; + mr.base = PCI_EMUL_ECFG_BASE; + mr.size = PCI_EMUL_ECFG_SIZE; + mr.handler = pci_emul_ecfg_handler; + error = register_mem(&mr); + assert(error == 0); + + return (0); +} + +#ifdef __FreeBSD__ +static void +pci_apic_prt_entry(int bus, int slot, int pin, int pirq_pin, int ioapic_irq, + void *arg) +{ + + dsdt_line(" Package ()"); + dsdt_line(" {"); + dsdt_line(" 0x%X,", slot << 16 | 0xffff); + dsdt_line(" 0x%02X,", pin - 1); + dsdt_line(" Zero,"); + dsdt_line(" 0x%X", ioapic_irq); + dsdt_line(" },"); +} + +static void +pci_pirq_prt_entry(int bus, int slot, int pin, int pirq_pin, int ioapic_irq, + void *arg) +{ + char *name; + + name = lpc_pirq_name(pirq_pin); + if (name == NULL) + return; + dsdt_line(" Package ()"); + dsdt_line(" {"); + dsdt_line(" 0x%X,", slot << 16 | 0xffff); + dsdt_line(" 0x%02X,", pin - 1); + dsdt_line(" %s,", name); + dsdt_line(" 0x00"); + dsdt_line(" },"); + free(name); +} + +/* + * A bhyve virtual machine has a flat PCI hierarchy with a root port + * corresponding to each PCI bus. + */ +static void +pci_bus_write_dsdt(int bus) +{ + struct businfo *bi; + struct slotinfo *si; + struct pci_devinst *pi; + int count, func, slot; + + /* + * If there are no devices on this 'bus' then just return. + */ + if ((bi = pci_businfo[bus]) == NULL) { + /* + * Bus 0 is special because it decodes the I/O ports used + * for PCI config space access even if there are no devices + * on it. 
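For concreteness, with slot 3, pin INTA# and I/O APIC IRQ 16, the pci_apic_prt_entry() format strings above would emit ASL of roughly this shape (the _ADR value wildcards the function with 0xFFFF, the pin field is zero-based, and indentation is approximated):

    Package ()
    {
        0x3FFFF,
        0x00,
        Zero,
        0x10
    },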
+ */ + if (bus != 0) + return; + } + + dsdt_line(" Device (PC%02X)", bus); + dsdt_line(" {"); + dsdt_line(" Name (_HID, EisaId (\"PNP0A03\"))"); + dsdt_line(" Name (_ADR, Zero)"); + + dsdt_line(" Method (_BBN, 0, NotSerialized)"); + dsdt_line(" {"); + dsdt_line(" Return (0x%08X)", bus); + dsdt_line(" }"); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_line(" WordBusNumber (ResourceProducer, MinFixed, " + "MaxFixed, PosDecode,"); + dsdt_line(" 0x0000, // Granularity"); + dsdt_line(" 0x%04X, // Range Minimum", bus); + dsdt_line(" 0x%04X, // Range Maximum", bus); + dsdt_line(" 0x0000, // Translation Offset"); + dsdt_line(" 0x0001, // Length"); + dsdt_line(" ,, )"); + + if (bus == 0) { + dsdt_indent(3); + dsdt_fixed_ioport(0xCF8, 8); + dsdt_unindent(3); + + dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " + "PosDecode, EntireRange,"); + dsdt_line(" 0x0000, // Granularity"); + dsdt_line(" 0x0000, // Range Minimum"); + dsdt_line(" 0x0CF7, // Range Maximum"); + dsdt_line(" 0x0000, // Translation Offset"); + dsdt_line(" 0x0CF8, // Length"); + dsdt_line(" ,, , TypeStatic)"); + + dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " + "PosDecode, EntireRange,"); + dsdt_line(" 0x0000, // Granularity"); + dsdt_line(" 0x0D00, // Range Minimum"); + dsdt_line(" 0x%04X, // Range Maximum", + PCI_EMUL_IOBASE - 1); + dsdt_line(" 0x0000, // Translation Offset"); + dsdt_line(" 0x%04X, // Length", + PCI_EMUL_IOBASE - 0x0D00); + dsdt_line(" ,, , TypeStatic)"); + + if (bi == NULL) { + dsdt_line(" })"); + goto done; + } + } + assert(bi != NULL); + + /* i/o window */ + dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " + "PosDecode, EntireRange,"); + dsdt_line(" 0x0000, // Granularity"); + dsdt_line(" 0x%04X, // Range Minimum", bi->iobase); + dsdt_line(" 0x%04X, // Range Maximum", + bi->iolimit - 1); + dsdt_line(" 0x0000, // Translation Offset"); + dsdt_line(" 0x%04X, // Length", + bi->iolimit - bi->iobase); + dsdt_line(" ,, , TypeStatic)"); + + /* mmio window (32-bit) */ + dsdt_line(" DWordMemory (ResourceProducer, PosDecode, " + "MinFixed, MaxFixed, NonCacheable, ReadWrite,"); + dsdt_line(" 0x00000000, // Granularity"); + dsdt_line(" 0x%08X, // Range Minimum\n", bi->membase32); + dsdt_line(" 0x%08X, // Range Maximum\n", + bi->memlimit32 - 1); + dsdt_line(" 0x00000000, // Translation Offset"); + dsdt_line(" 0x%08X, // Length\n", + bi->memlimit32 - bi->membase32); + dsdt_line(" ,, , AddressRangeMemory, TypeStatic)"); + + /* mmio window (64-bit) */ + dsdt_line(" QWordMemory (ResourceProducer, PosDecode, " + "MinFixed, MaxFixed, NonCacheable, ReadWrite,"); + dsdt_line(" 0x0000000000000000, // Granularity"); + dsdt_line(" 0x%016lX, // Range Minimum\n", bi->membase64); + dsdt_line(" 0x%016lX, // Range Maximum\n", + bi->memlimit64 - 1); + dsdt_line(" 0x0000000000000000, // Translation Offset"); + dsdt_line(" 0x%016lX, // Length\n", + bi->memlimit64 - bi->membase64); + dsdt_line(" ,, , AddressRangeMemory, TypeStatic)"); + dsdt_line(" })"); + + count = pci_count_lintr(bus); + if (count != 0) { + dsdt_indent(2); + dsdt_line("Name (PPRT, Package ()"); + dsdt_line("{"); + pci_walk_lintr(bus, pci_pirq_prt_entry, NULL); + dsdt_line("})"); + dsdt_line("Name (APRT, Package ()"); + dsdt_line("{"); + pci_walk_lintr(bus, pci_apic_prt_entry, NULL); + dsdt_line("})"); + dsdt_line("Method (_PRT, 0, NotSerialized)"); + dsdt_line("{"); + dsdt_line(" If (PICM)"); + dsdt_line(" {"); + dsdt_line(" Return (APRT)"); + dsdt_line(" }"); + dsdt_line(" Else"); + dsdt_line(" {"); + dsdt_line(" 
Return (PPRT)"); + dsdt_line(" }"); + dsdt_line("}"); + dsdt_unindent(2); + } + + dsdt_indent(2); + for (slot = 0; slot < MAXSLOTS; slot++) { + si = &bi->slotinfo[slot]; + for (func = 0; func < MAXFUNCS; func++) { + pi = si->si_funcs[func].fi_devi; + if (pi != NULL && pi->pi_d->pe_write_dsdt != NULL) + pi->pi_d->pe_write_dsdt(pi); + } + } + dsdt_unindent(2); +done: + dsdt_line(" }"); +} + +void +pci_write_dsdt(void) +{ + int bus; + + dsdt_indent(1); + dsdt_line("Name (PICM, 0x00)"); + dsdt_line("Method (_PIC, 1, NotSerialized)"); + dsdt_line("{"); + dsdt_line(" Store (Arg0, PICM)"); + dsdt_line("}"); + dsdt_line(""); + dsdt_line("Scope (_SB)"); + dsdt_line("{"); + for (bus = 0; bus < MAXBUSES; bus++) + pci_bus_write_dsdt(bus); + dsdt_line("}"); + dsdt_unindent(1); +} +#endif + +int +pci_bus_configured(int bus) +{ + assert(bus >= 0 && bus < MAXBUSES); + return (pci_businfo[bus] != NULL); +} + +int +pci_msi_enabled(struct pci_devinst *pi) +{ + return (pi->pi_msi.enabled); +} + +int +pci_msi_maxmsgnum(struct pci_devinst *pi) +{ + if (pi->pi_msi.enabled) + return (pi->pi_msi.maxmsgnum); + else + return (0); +} + +int +pci_msix_enabled(struct pci_devinst *pi) +{ + + return (pi->pi_msix.enabled && !pi->pi_msi.enabled); +} + +void +pci_generate_msix(struct pci_devinst *pi, int index) +{ + struct msix_table_entry *mte; + + if (!pci_msix_enabled(pi)) + return; + + if (pi->pi_msix.function_mask) + return; + + if (index >= pi->pi_msix.table_count) + return; + + mte = &pi->pi_msix.table[index]; + if ((mte->vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { + /* XXX Set PBA bit if interrupt is disabled */ + vm_lapic_msi(pi->pi_vmctx, mte->addr, mte->msg_data); + } +} + +void +pci_generate_msi(struct pci_devinst *pi, int index) +{ + + if (pci_msi_enabled(pi) && index < pci_msi_maxmsgnum(pi)) { + vm_lapic_msi(pi->pi_vmctx, pi->pi_msi.addr, + pi->pi_msi.msg_data + index); + } +} + +static bool +pci_lintr_permitted(struct pci_devinst *pi) +{ + uint16_t cmd; + + cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); + return (!(pi->pi_msi.enabled || pi->pi_msix.enabled || + (cmd & PCIM_CMD_INTxDIS))); +} + +void +pci_lintr_request(struct pci_devinst *pi) +{ + struct businfo *bi; + struct slotinfo *si; + int bestpin, bestcount, pin; + + bi = pci_businfo[pi->pi_bus]; + assert(bi != NULL); + + /* + * Just allocate a pin from our slot. The pin will be + * assigned IRQs later when interrupts are routed. + */ + si = &bi->slotinfo[pi->pi_slot]; + bestpin = 0; + bestcount = si->si_intpins[0].ii_count; + for (pin = 1; pin < 4; pin++) { + if (si->si_intpins[pin].ii_count < bestcount) { + bestpin = pin; + bestcount = si->si_intpins[pin].ii_count; + } + } + + si->si_intpins[bestpin].ii_count++; + pi->pi_lintr.pin = bestpin + 1; + pci_set_cfgdata8(pi, PCIR_INTPIN, bestpin + 1); +} + +static void +pci_lintr_route(struct pci_devinst *pi) +{ + struct businfo *bi; + struct intxinfo *ii; + + if (pi->pi_lintr.pin == 0) + return; + + bi = pci_businfo[pi->pi_bus]; + assert(bi != NULL); + ii = &bi->slotinfo[pi->pi_slot].si_intpins[pi->pi_lintr.pin - 1]; + + /* + * Attempt to allocate an I/O APIC pin for this intpin if one + * is not yet assigned. + */ + if (ii->ii_ioapic_irq == 0) + ii->ii_ioapic_irq = ioapic_pci_alloc_irq(); + assert(ii->ii_ioapic_irq > 0); + + /* + * Attempt to allocate a PIRQ pin for this intpin if one is + * not yet assigned. 
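pci_lintr_permitted() earlier in this hunk captures the PCI rule that legacy INTx is usable only when message-signaled interrupts are off and the command register's interrupt-disable bit is clear. A standalone sketch, with the mask mirroring PCIM_CMD_INTxDIS:

#include <stdint.h>
#include <stdbool.h>

#define SK_INTXDIS	0x0400	/* mirrors PCIM_CMD_INTxDIS */

static bool
intx_permitted(bool msi_en, bool msix_en, uint16_t cmd)
{
	return (!(msi_en || msix_en || (cmd & SK_INTXDIS) != 0));
}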
+ */
+	if (ii->ii_pirq_pin == 0)
+		ii->ii_pirq_pin = pirq_alloc_pin(pi->pi_vmctx);
+	assert(ii->ii_pirq_pin > 0);
+
+	pi->pi_lintr.ioapic_irq = ii->ii_ioapic_irq;
+	pi->pi_lintr.pirq_pin = ii->ii_pirq_pin;
+	pci_set_cfgdata8(pi, PCIR_INTLINE, pirq_irq(ii->ii_pirq_pin));
+}
+
+void
+pci_lintr_assert(struct pci_devinst *pi)
+{
+
+	assert(pi->pi_lintr.pin > 0);
+
+	pthread_mutex_lock(&pi->pi_lintr.lock);
+	if (pi->pi_lintr.state == IDLE) {
+		if (pci_lintr_permitted(pi)) {
+			pi->pi_lintr.state = ASSERTED;
+			pci_irq_assert(pi);
+		} else
+			pi->pi_lintr.state = PENDING;
+	}
+	pthread_mutex_unlock(&pi->pi_lintr.lock);
+}
+
+void
+pci_lintr_deassert(struct pci_devinst *pi)
+{
+
+	assert(pi->pi_lintr.pin > 0);
+
+	pthread_mutex_lock(&pi->pi_lintr.lock);
+	if (pi->pi_lintr.state == ASSERTED) {
+		pi->pi_lintr.state = IDLE;
+		pci_irq_deassert(pi);
+	} else if (pi->pi_lintr.state == PENDING)
+		pi->pi_lintr.state = IDLE;
+	pthread_mutex_unlock(&pi->pi_lintr.lock);
+}
+
+static void
+pci_lintr_update(struct pci_devinst *pi)
+{
+
+	pthread_mutex_lock(&pi->pi_lintr.lock);
+	if (pi->pi_lintr.state == ASSERTED && !pci_lintr_permitted(pi)) {
+		pci_irq_deassert(pi);
+		pi->pi_lintr.state = PENDING;
+	} else if (pi->pi_lintr.state == PENDING && pci_lintr_permitted(pi)) {
+		pi->pi_lintr.state = ASSERTED;
+		pci_irq_assert(pi);
+	}
+	pthread_mutex_unlock(&pi->pi_lintr.lock);
+}
+
+int
+pci_count_lintr(int bus)
+{
+	int count, slot, pin;
+	struct slotinfo *slotinfo;
+
+	count = 0;
+	if (pci_businfo[bus] != NULL) {
+		for (slot = 0; slot < MAXSLOTS; slot++) {
+			slotinfo = &pci_businfo[bus]->slotinfo[slot];
+			for (pin = 0; pin < 4; pin++) {
+				if (slotinfo->si_intpins[pin].ii_count != 0)
+					count++;
+			}
+		}
+	}
+	return (count);
+}
+
+void
+pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg)
+{
+	struct businfo *bi;
+	struct slotinfo *si;
+	struct intxinfo *ii;
+	int slot, pin;
+
+	if ((bi = pci_businfo[bus]) == NULL)
+		return;
+
+	for (slot = 0; slot < MAXSLOTS; slot++) {
+		si = &bi->slotinfo[slot];
+		for (pin = 0; pin < 4; pin++) {
+			ii = &si->si_intpins[pin];
+			if (ii->ii_count != 0)
+				cb(bus, slot, pin + 1, ii->ii_pirq_pin,
+				    ii->ii_ioapic_irq, arg);
+		}
+	}
+}
+
+/*
+ * Return 1 if the emulated device in 'slot' is a multi-function device.
+ * Return 0 otherwise.
+ */
+static int
+pci_emul_is_mfdev(int bus, int slot)
+{
+	struct businfo *bi;
+	struct slotinfo *si;
+	int f, numfuncs;
+
+	numfuncs = 0;
+	if ((bi = pci_businfo[bus]) != NULL) {
+		si = &bi->slotinfo[slot];
+		for (f = 0; f < MAXFUNCS; f++) {
+			if (si->si_funcs[f].fi_devi != NULL) {
+				numfuncs++;
+			}
+		}
+	}
+	return (numfuncs > 1);
+}
+
+/*
+ * Ensure that the PCIM_MFDEV bit is properly set (or unset) depending on
+ * whether or not a multi-function device is being emulated in the pci 'slot'.
+ */
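The fixup implemented next hinges on config-space layout: PCIR_HDRTYPE sits at offset 0x0e, so a dword read starting at 0x0c sees the multi-function bit shifted up by 16. A sketch of the core, with the mask mirroring PCIM_MFDEV:

#include <stdint.h>

#define SK_MFDEV	0x80	/* mirrors PCIM_MFDEV, bit 7 of PCIR_HDRTYPE */

static uint32_t
fixup_mfdev(uint32_t rv, int bytes, int mfdev)
{
	uint32_t bit = (bytes == 4) ? ((uint32_t)SK_MFDEV << 16) : SK_MFDEV;

	rv &= ~bit;
	if (mfdev)
		rv |= bit;
	return (rv);
}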
+static void
+pci_emul_hdrtype_fixup(int bus, int slot, int off, int bytes, uint32_t *rv)
+{
+	int mfdev;
+
+	if (off <= PCIR_HDRTYPE && off + bytes > PCIR_HDRTYPE) {
+		mfdev = pci_emul_is_mfdev(bus, slot);
+		switch (bytes) {
+		case 1:
+		case 2:
+			*rv &= ~PCIM_MFDEV;
+			if (mfdev) {
+				*rv |= PCIM_MFDEV;
+			}
+			break;
+		case 4:
+			*rv &= ~(PCIM_MFDEV << 16);
+			if (mfdev) {
+				*rv |= (PCIM_MFDEV << 16);
+			}
+			break;
+		}
+	}
+}
+
+static uint32_t
+bits_changed(uint32_t old, uint32_t new, uint32_t mask)
+{
+
+	return ((old ^ new) & mask);
+}
+
+static void
+pci_emul_cmdwrite(struct pci_devinst *pi, uint32_t new, int bytes)
+{
+	int i;
+	uint16_t old;
+
+	/*
+	 * The command register is at an offset of 4 bytes and thus the
+	 * guest could write 1, 2 or 4 bytes starting at this offset.
+	 */
+
+	old = pci_get_cfgdata16(pi, PCIR_COMMAND);	/* stash old value */
+	CFGWRITE(pi, PCIR_COMMAND, new, bytes);		/* update config */
+	new = pci_get_cfgdata16(pi, PCIR_COMMAND);	/* get updated value */
+
+	/*
+	 * If the MMIO or I/O address space decoding has changed then
+	 * register/unregister all BARs that decode that address space.
+	 */
+	for (i = 0; i <= PCI_BARMAX; i++) {
+		switch (pi->pi_bar[i].type) {
+		case PCIBAR_NONE:
+		case PCIBAR_MEMHI64:
+			break;
+		case PCIBAR_IO:
+			/* I/O address space decoding changed? */
+			if (bits_changed(old, new, PCIM_CMD_PORTEN)) {
+				if (porten(pi))
+					register_bar(pi, i);
+				else
+					unregister_bar(pi, i);
+			}
+			break;
+		case PCIBAR_MEM32:
+		case PCIBAR_MEM64:
+			/* MMIO address space decoding changed? */
+			if (bits_changed(old, new, PCIM_CMD_MEMEN)) {
+				if (memen(pi))
+					register_bar(pi, i);
+				else
+					unregister_bar(pi, i);
+			}
+			break;
+		default:
+			assert(0);
+		}
+	}
+
+	/*
+	 * If INTx has been unmasked and is pending, assert the
+	 * interrupt.
+	 */
+	pci_lintr_update(pi);
+}
+
+static void
+pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func,
+    int coff, int bytes, uint32_t *eax)
+{
+	struct businfo *bi;
+	struct slotinfo *si;
+	struct pci_devinst *pi;
+	struct pci_devemu *pe;
+	int idx, needcfg;
+	uint64_t addr, bar, mask;
+
+	if ((bi = pci_businfo[bus]) != NULL) {
+		si = &bi->slotinfo[slot];
+		pi = si->si_funcs[func].fi_devi;
+	} else
+		pi = NULL;
+
+	/*
+	 * Just return if there is no device at this slot:func or if the
+	 * guest is doing an un-aligned access.
+	 */
+	if (pi == NULL || (bytes != 1 && bytes != 2 && bytes != 4) ||
+	    (coff & (bytes - 1)) != 0) {
+		if (in)
+			*eax = 0xffffffff;
+		return;
+	}
+
+	/*
+	 * Ignore all writes beyond the standard config space and return all
+	 * ones on reads.
+	 */
+	if (coff >= PCI_REGMAX + 1) {
+		if (in) {
+			*eax = 0xffffffff;
+			/*
+			 * Extended capabilities begin at offset 256 in config
+			 * space. Absence of extended capabilities is signaled
+			 * with all 0s in the extended capability header at
+			 * offset 256.
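The BAR write handling further down in pci_cfgrw() implements the standard sizing handshake: the guest writes all-ones and reads back the size mask plus the type bits, which is why the code computes mask as ~(size - 1). A sketch:

#include <stdint.h>

static uint32_t
bar_sizing_readback(uint64_t size, uint32_t lobits)
{
	uint32_t mask = (uint32_t)~(size - 1);

	return (mask | lobits);
}
/* a 4 KB MEM32 BAR reads back 0xfffff000 plus its type bits */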
+ */ + if (coff <= PCI_REGMAX + 4) + *eax = 0x00000000; + } + return; + } + + pe = pi->pi_d; + + /* + * Config read + */ + if (in) { + /* Let the device emulation override the default handler */ + if (pe->pe_cfgread != NULL) { + needcfg = pe->pe_cfgread(ctx, vcpu, pi, coff, bytes, + eax); + } else { + needcfg = 1; + } + + if (needcfg) { + if (bytes == 1) + *eax = pci_get_cfgdata8(pi, coff); + else if (bytes == 2) + *eax = pci_get_cfgdata16(pi, coff); + else + *eax = pci_get_cfgdata32(pi, coff); + } + + pci_emul_hdrtype_fixup(bus, slot, coff, bytes, eax); + } else { + /* Let the device emulation override the default handler */ + if (pe->pe_cfgwrite != NULL && + (*pe->pe_cfgwrite)(ctx, vcpu, pi, coff, bytes, *eax) == 0) + return; + + /* + * Special handling for write to BAR registers + */ + if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) { + /* + * Ignore writes to BAR registers that are not + * 4-byte aligned. + */ + if (bytes != 4 || (coff & 0x3) != 0) + return; + idx = (coff - PCIR_BAR(0)) / 4; + mask = ~(pi->pi_bar[idx].size - 1); + switch (pi->pi_bar[idx].type) { + case PCIBAR_NONE: + pi->pi_bar[idx].addr = bar = 0; + break; + case PCIBAR_IO: + addr = *eax & mask; + addr &= 0xffff; + bar = addr | PCIM_BAR_IO_SPACE; + /* + * Register the new BAR value for interception + */ + if (addr != pi->pi_bar[idx].addr) { + update_bar_address(pi, addr, idx, + PCIBAR_IO); + } + break; + case PCIBAR_MEM32: + addr = bar = *eax & mask; + bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32; + if (addr != pi->pi_bar[idx].addr) { + update_bar_address(pi, addr, idx, + PCIBAR_MEM32); + } + break; + case PCIBAR_MEM64: + addr = bar = *eax & mask; + bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 | + PCIM_BAR_MEM_PREFETCH; + if (addr != (uint32_t)pi->pi_bar[idx].addr) { + update_bar_address(pi, addr, idx, + PCIBAR_MEM64); + } + break; + case PCIBAR_MEMHI64: + mask = ~(pi->pi_bar[idx - 1].size - 1); + addr = ((uint64_t)*eax << 32) & mask; + bar = addr >> 32; + if (bar != pi->pi_bar[idx - 1].addr >> 32) { + update_bar_address(pi, addr, idx - 1, + PCIBAR_MEMHI64); + } + break; + default: + assert(0); + } + pci_set_cfgdata32(pi, coff, bar); + + } else if (pci_emul_iscap(pi, coff)) { + pci_emul_capwrite(pi, coff, bytes, *eax); + } else if (coff == PCIR_COMMAND) { + pci_emul_cmdwrite(pi, *eax, bytes); + } else { + CFGWRITE(pi, coff, *eax, bytes); + } + } +} + +static int cfgenable, cfgbus, cfgslot, cfgfunc, cfgoff; + +static int +pci_emul_cfgaddr(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + uint32_t x; + + if (bytes != 4) { + if (in) + *eax = (bytes == 2) ? 
0xffff : 0xff; + return (0); + } + + if (in) { + x = (cfgbus << 16) | (cfgslot << 11) | (cfgfunc << 8) | cfgoff; + if (cfgenable) + x |= CONF1_ENABLE; + *eax = x; + } else { + x = *eax; + cfgenable = (x & CONF1_ENABLE) == CONF1_ENABLE; + cfgoff = x & PCI_REGMAX; + cfgfunc = (x >> 8) & PCI_FUNCMAX; + cfgslot = (x >> 11) & PCI_SLOTMAX; + cfgbus = (x >> 16) & PCI_BUSMAX; + } + + return (0); +} +INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_INOUT, pci_emul_cfgaddr); + +static int +pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + int coff; + + assert(bytes == 1 || bytes == 2 || bytes == 4); + + coff = cfgoff + (port - CONF1_DATA_PORT); + if (cfgenable) { + pci_cfgrw(ctx, vcpu, in, cfgbus, cfgslot, cfgfunc, coff, bytes, + eax); + } else { + /* Ignore accesses to cfgdata if not enabled by cfgaddr */ + if (in) + *eax = 0xffffffff; + } + return (0); +} + +INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+0, IOPORT_F_INOUT, pci_emul_cfgdata); +INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+1, IOPORT_F_INOUT, pci_emul_cfgdata); +INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+2, IOPORT_F_INOUT, pci_emul_cfgdata); +INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+3, IOPORT_F_INOUT, pci_emul_cfgdata); + +#define PCI_EMUL_TEST +#ifdef PCI_EMUL_TEST +/* + * Define a dummy test device + */ +#define DIOSZ 8 +#define DMEMSZ 4096 +struct pci_emul_dsoftc { + uint8_t ioregs[DIOSZ]; + uint8_t memregs[DMEMSZ]; +}; + +#define PCI_EMUL_MSI_MSGS 4 +#define PCI_EMUL_MSIX_MSGS 16 + +static int +pci_emul_dinit(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + int error; + struct pci_emul_dsoftc *sc; + + sc = calloc(1, sizeof(struct pci_emul_dsoftc)); + + pi->pi_arg = sc; + + pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0001); + pci_set_cfgdata16(pi, PCIR_VENDOR, 0x10DD); + pci_set_cfgdata8(pi, PCIR_CLASS, 0x02); + + error = pci_emul_add_msicap(pi, PCI_EMUL_MSI_MSGS); + assert(error == 0); + + error = pci_emul_alloc_bar(pi, 0, PCIBAR_IO, DIOSZ); + assert(error == 0); + + error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, DMEMSZ); + assert(error == 0); + + return (0); +} + +static void +pci_emul_diow(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size, uint64_t value) +{ + int i; + struct pci_emul_dsoftc *sc = pi->pi_arg; + + if (baridx == 0) { + if (offset + size > DIOSZ) { + printf("diow: iow too large, offset %ld size %d\n", + offset, size); + return; + } + + if (size == 1) { + sc->ioregs[offset] = value & 0xff; + } else if (size == 2) { + *(uint16_t *)&sc->ioregs[offset] = value & 0xffff; + } else if (size == 4) { + *(uint32_t *)&sc->ioregs[offset] = value; + } else { + printf("diow: iow unknown size %d\n", size); + } + + /* + * Special magic value to generate an interrupt + */ + if (offset == 4 && size == 4 && pci_msi_enabled(pi)) + pci_generate_msi(pi, value % pci_msi_maxmsgnum(pi)); + + if (value == 0xabcdef) { + for (i = 0; i < pci_msi_maxmsgnum(pi); i++) + pci_generate_msi(pi, i); + } + } + + if (baridx == 1) { + if (offset + size > DMEMSZ) { + printf("diow: memw too large, offset %ld size %d\n", + offset, size); + return; + } + + if (size == 1) { + sc->memregs[offset] = value; + } else if (size == 2) { + *(uint16_t *)&sc->memregs[offset] = value; + } else if (size == 4) { + *(uint32_t *)&sc->memregs[offset] = value; + } else if (size == 8) { + *(uint64_t *)&sc->memregs[offset] = value; + } else { + printf("diow: memw unknown size %d\n", size); + } + + /* + * magic interrupt ?? 
+ */ + } + + if (baridx > 1) { + printf("diow: unknown bar idx %d\n", baridx); + } +} + +static uint64_t +pci_emul_dior(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size) +{ + struct pci_emul_dsoftc *sc = pi->pi_arg; + uint32_t value; + + if (baridx == 0) { + if (offset + size > DIOSZ) { + printf("dior: ior too large, offset %ld size %d\n", + offset, size); + return (0); + } + + if (size == 1) { + value = sc->ioregs[offset]; + } else if (size == 2) { + value = *(uint16_t *) &sc->ioregs[offset]; + } else if (size == 4) { + value = *(uint32_t *) &sc->ioregs[offset]; + } else { + printf("dior: ior unknown size %d\n", size); + } + } + + if (baridx == 1) { + if (offset + size > DMEMSZ) { + printf("dior: memr too large, offset %ld size %d\n", + offset, size); + return (0); + } + + if (size == 1) { + value = sc->memregs[offset]; + } else if (size == 2) { + value = *(uint16_t *) &sc->memregs[offset]; + } else if (size == 4) { + value = *(uint32_t *) &sc->memregs[offset]; + } else if (size == 8) { + value = *(uint64_t *) &sc->memregs[offset]; + } else { + printf("dior: ior unknown size %d\n", size); + } + } + + + if (baridx > 1) { + printf("dior: unknown bar idx %d\n", baridx); + return (0); + } + + return (value); +} + +struct pci_devemu pci_dummy = { + .pe_emu = "dummy", + .pe_init = pci_emul_dinit, + .pe_barwrite = pci_emul_diow, + .pe_barread = pci_emul_dior +}; +PCI_EMUL_SET(pci_dummy); + +#endif /* PCI_EMUL_TEST */ diff --git a/usr/src/cmd/bhyve/pci_emul.h b/usr/src/cmd/bhyve/pci_emul.h new file mode 100644 index 0000000000..6af01c4c3c --- /dev/null +++ b/usr/src/cmd/bhyve/pci_emul.h @@ -0,0 +1,283 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD: head/usr.sbin/bhyve/pci_emul.h 269700 2014-08-08 03:49:01Z neel $ + */ + +#ifndef _PCI_EMUL_H_ +#define _PCI_EMUL_H_ + +#include +#include +#include +#include + +#include + +#include + +#define PCI_BARMAX PCIR_MAX_BAR_0 /* BAR registers in a Type 0 header */ + +struct vmctx; +struct pci_devinst; +struct memory_region; + +struct pci_devemu { + char *pe_emu; /* Name of device emulation */ + + /* instance creation */ + int (*pe_init)(struct vmctx *, struct pci_devinst *, + char *opts); + + /* ACPI DSDT enumeration */ + void (*pe_write_dsdt)(struct pci_devinst *); + + /* config space read/write callbacks */ + int (*pe_cfgwrite)(struct vmctx *ctx, int vcpu, + struct pci_devinst *pi, int offset, + int bytes, uint32_t val); + int (*pe_cfgread)(struct vmctx *ctx, int vcpu, + struct pci_devinst *pi, int offset, + int bytes, uint32_t *retval); + + /* BAR read/write callbacks */ + void (*pe_barwrite)(struct vmctx *ctx, int vcpu, + struct pci_devinst *pi, int baridx, + uint64_t offset, int size, uint64_t value); + uint64_t (*pe_barread)(struct vmctx *ctx, int vcpu, + struct pci_devinst *pi, int baridx, + uint64_t offset, int size); +}; +#define PCI_EMUL_SET(x) DATA_SET(pci_devemu_set, x); + +enum pcibar_type { + PCIBAR_NONE, + PCIBAR_IO, + PCIBAR_MEM32, + PCIBAR_MEM64, + PCIBAR_MEMHI64 +}; + +struct pcibar { + enum pcibar_type type; /* io or memory */ + uint64_t size; + uint64_t addr; +}; + +#define PI_NAMESZ 40 + +struct msix_table_entry { + uint64_t addr; + uint32_t msg_data; + uint32_t vector_control; +} __packed; + +/* + * In case the structure is modified to hold extra information, use a define + * for the size that should be emulated. + */ +#define MSIX_TABLE_ENTRY_SIZE 16 +#define MAX_MSIX_TABLE_ENTRIES 2048 +#define PBA_SIZE(msgnum) (roundup2((msgnum), 64) / 8) + +enum lintr_stat { + IDLE, + ASSERTED, + PENDING +}; + +struct pci_devinst { + struct pci_devemu *pi_d; + struct vmctx *pi_vmctx; + uint8_t pi_bus, pi_slot, pi_func; + char pi_name[PI_NAMESZ]; + int pi_bar_getsize; + int pi_prevcap; + int pi_capend; + + struct { + int8_t pin; + enum lintr_stat state; + int pirq_pin; + int ioapic_irq; + pthread_mutex_t lock; + } pi_lintr; + + struct { + int enabled; + uint64_t addr; + uint64_t msg_data; + int maxmsgnum; + } pi_msi; + + struct { + int enabled; + int table_bar; + int pba_bar; + uint32_t table_offset; + int table_count; + uint32_t pba_offset; + int pba_size; + int function_mask; + struct msix_table_entry *table; /* allocated at runtime */ + } pi_msix; + + void *pi_arg; /* devemu-private data */ + + u_char pi_cfgdata[PCI_REGMAX + 1]; + struct pcibar pi_bar[PCI_BARMAX + 1]; +}; + +struct msicap { + uint8_t capid; + uint8_t nextptr; + uint16_t msgctrl; + uint32_t addrlo; + uint32_t addrhi; + uint16_t msgdata; +} __packed; + +struct msixcap { + uint8_t capid; + uint8_t nextptr; + uint16_t msgctrl; + uint32_t table_info; /* bar index and offset within it */ + uint32_t pba_info; /* bar index and offset within it */ +} __packed; + +struct pciecap { + uint8_t capid; + uint8_t nextptr; + uint16_t pcie_capabilities; + + uint32_t dev_capabilities; /* all devices */ + uint16_t dev_control; + uint16_t dev_status; + + uint32_t link_capabilities; /* devices with links */ + uint16_t link_control; + uint16_t link_status; + + uint32_t slot_capabilities; /* ports with slots */ + uint16_t slot_control; + uint16_t slot_status; + + uint16_t root_control; /* root ports */ + uint16_t root_capabilities; + uint32_t root_status; + + uint32_t dev_capabilities2; /* all devices */ + uint16_t 
dev_control2; + uint16_t dev_status2; + + uint32_t link_capabilities2; /* devices with links */ + uint16_t link_control2; + uint16_t link_status2; + + uint32_t slot_capabilities2; /* ports with slots */ + uint16_t slot_control2; + uint16_t slot_status2; +} __packed; + +typedef void (*pci_lintr_cb)(int b, int s, int pin, int pirq_pin, + int ioapic_irq, void *arg); + +int init_pci(struct vmctx *ctx); +void msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, + int bytes, uint32_t val); +void msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, + int bytes, uint32_t val); +void pci_callback(void); +int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, + enum pcibar_type type, uint64_t size); +int pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, + uint64_t hostbase, enum pcibar_type type, uint64_t size); +int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum); +int pci_emul_add_pciecap(struct pci_devinst *pi, int pcie_device_type); +void pci_generate_msi(struct pci_devinst *pi, int msgnum); +void pci_generate_msix(struct pci_devinst *pi, int msgnum); +void pci_lintr_assert(struct pci_devinst *pi); +void pci_lintr_deassert(struct pci_devinst *pi); +void pci_lintr_request(struct pci_devinst *pi); +int pci_msi_enabled(struct pci_devinst *pi); +int pci_msix_enabled(struct pci_devinst *pi); +int pci_msix_table_bar(struct pci_devinst *pi); +int pci_msix_pba_bar(struct pci_devinst *pi); +int pci_msi_msgnum(struct pci_devinst *pi); +int pci_parse_slot(char *opt); +void pci_populate_msicap(struct msicap *cap, int msgs, int nextptr); +int pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum); +int pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size, + uint64_t value); +uint64_t pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size); +int pci_count_lintr(int bus); +void pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg); +void pci_write_dsdt(void); +uint64_t pci_ecfg_base(void); +int pci_bus_configured(int bus); + +static __inline void +pci_set_cfgdata8(struct pci_devinst *pi, int offset, uint8_t val) +{ + assert(offset <= PCI_REGMAX); + *(uint8_t *)(pi->pi_cfgdata + offset) = val; +} + +static __inline void +pci_set_cfgdata16(struct pci_devinst *pi, int offset, uint16_t val) +{ + assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0); + *(uint16_t *)(pi->pi_cfgdata + offset) = val; +} + +static __inline void +pci_set_cfgdata32(struct pci_devinst *pi, int offset, uint32_t val) +{ + assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0); + *(uint32_t *)(pi->pi_cfgdata + offset) = val; +} + +static __inline uint8_t +pci_get_cfgdata8(struct pci_devinst *pi, int offset) +{ + assert(offset <= PCI_REGMAX); + return (*(uint8_t *)(pi->pi_cfgdata + offset)); +} + +static __inline uint16_t +pci_get_cfgdata16(struct pci_devinst *pi, int offset) +{ + assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0); + return (*(uint16_t *)(pi->pi_cfgdata + offset)); +} + +static __inline uint32_t +pci_get_cfgdata32(struct pci_devinst *pi, int offset) +{ + assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0); + return (*(uint32_t *)(pi->pi_cfgdata + offset)); +} + +#endif /* _PCI_EMUL_H_ */ diff --git a/usr/src/cmd/bhyve/pci_hostbridge.c b/usr/src/cmd/bhyve/pci_hostbridge.c new file mode 100644 index 0000000000..08956d082e --- /dev/null +++ b/usr/src/cmd/bhyve/pci_hostbridge.c @@ -0,0 +1,70 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/pci_hostbridge.c 283264 2015-05-21 20:11:52Z tychon $ + */ + +#include +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_hostbridge.c 283264 2015-05-21 20:11:52Z tychon $"); + +#include "pci_emul.h" + +static int +pci_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + + /* config space */ + pci_set_cfgdata16(pi, PCIR_VENDOR, 0x1275); /* NetApp */ + pci_set_cfgdata16(pi, PCIR_DEVICE, 0x1275); /* NetApp */ + pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_NORMAL); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE); + pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_HOST); + + pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_PORT); + + return (0); +} + +static int +pci_amd_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + (void) pci_hostbridge_init(ctx, pi, opts); + pci_set_cfgdata16(pi, PCIR_VENDOR, 0x1022); /* AMD */ + pci_set_cfgdata16(pi, PCIR_DEVICE, 0x7432); /* made up */ + + return (0); +} + +struct pci_devemu pci_de_amd_hostbridge = { + .pe_emu = "amd_hostbridge", + .pe_init = pci_amd_hostbridge_init, +}; +PCI_EMUL_SET(pci_de_amd_hostbridge); + +struct pci_devemu pci_de_hostbridge = { + .pe_emu = "hostbridge", + .pe_init = pci_hostbridge_init, +}; +PCI_EMUL_SET(pci_de_hostbridge); diff --git a/usr/src/cmd/bhyve/pci_irq.c b/usr/src/cmd/bhyve/pci_irq.c new file mode 100644 index 0000000000..97ee330c65 --- /dev/null +++ b/usr/src/cmd/bhyve/pci_irq.c @@ -0,0 +1,351 @@ +/*- + * Copyright (c) 2014 Advanced Computing Technologies LLC + * Written by: John H. Baldwin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + + +#include +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_irq.c 266125 2014-05-15 14:16:55Z jhb $"); + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "acpi.h" +#include "inout.h" +#include "pci_emul.h" +#include "pci_irq.h" +#include "pci_lpc.h" + +/* + * Implement an 8 pin PCI interrupt router compatible with the router + * present on Intel's ICH10 chip. + */ + +/* Fields in each PIRQ register. */ +#define PIRQ_DIS 0x80 +#define PIRQ_IRQ 0x0f + +/* Only IRQs 3-7, 9-12, and 14-15 are permitted. */ +#define PERMITTED_IRQS 0xdef8 +#define IRQ_PERMITTED(irq) (((1U << (irq)) & PERMITTED_IRQS) != 0) + +/* IRQ count to disable an IRQ. */ +#define IRQ_DISABLED 0xff + +static struct pirq { + uint8_t reg; + int use_count; + int active_count; + pthread_mutex_t lock; +} pirqs[8]; + +static u_char irq_counts[16]; +static int pirq_cold = 1; + +/* + * Returns true if this pin is enabled with a valid IRQ. Setting the + * register to a reserved IRQ causes interrupts to not be asserted as + * if the pin was disabled. 
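+ * For example, routing a pin to IRQ 2 (the PIC cascade, absent from
+ * PERMITTED_IRQS) leaves the pin silent even though PIRQ_DIS is clear.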
+ */
+static bool
+pirq_valid_irq(int reg)
+{
+
+	if (reg & PIRQ_DIS)
+		return (false);
+	return (IRQ_PERMITTED(reg & PIRQ_IRQ));
+}
+
+uint8_t
+pirq_read(int pin)
+{
+
+	assert(pin > 0 && pin <= nitems(pirqs));
+	return (pirqs[pin - 1].reg);
+}
+
+void
+pirq_write(struct vmctx *ctx, int pin, uint8_t val)
+{
+	struct pirq *pirq;
+
+	assert(pin > 0 && pin <= nitems(pirqs));
+	pirq = &pirqs[pin - 1];
+	pthread_mutex_lock(&pirq->lock);
+	if (pirq->reg != (val & (PIRQ_DIS | PIRQ_IRQ))) {
+		if (pirq->active_count != 0 && pirq_valid_irq(pirq->reg))
+			vm_isa_deassert_irq(ctx, pirq->reg & PIRQ_IRQ, -1);
+		pirq->reg = val & (PIRQ_DIS | PIRQ_IRQ);
+		if (pirq->active_count != 0 && pirq_valid_irq(pirq->reg))
+			vm_isa_assert_irq(ctx, pirq->reg & PIRQ_IRQ, -1);
+	}
+	pthread_mutex_unlock(&pirq->lock);
+}
+
+void
+pci_irq_reserve(int irq)
+{
+
+	assert(irq < nitems(irq_counts));
+	assert(pirq_cold);
+	assert(irq_counts[irq] == 0 || irq_counts[irq] == IRQ_DISABLED);
+	irq_counts[irq] = IRQ_DISABLED;
+}
+
+void
+pci_irq_use(int irq)
+{
+
+	assert(irq < nitems(irq_counts));
+	assert(pirq_cold);
+	if (irq_counts[irq] != IRQ_DISABLED)
+		irq_counts[irq]++;
+}
+
+void
+pci_irq_init(struct vmctx *ctx)
+{
+	int i;
+
+	for (i = 0; i < nitems(pirqs); i++) {
+		pirqs[i].reg = PIRQ_DIS;
+		pirqs[i].use_count = 0;
+		pirqs[i].active_count = 0;
+		pthread_mutex_init(&pirqs[i].lock, NULL);
+	}
+	for (i = 0; i < nitems(irq_counts); i++) {
+		if (IRQ_PERMITTED(i))
+			irq_counts[i] = 0;
+		else
+			irq_counts[i] = IRQ_DISABLED;
+	}
+}
+
+void
+pci_irq_assert(struct pci_devinst *pi)
+{
+	struct pirq *pirq;
+
+	if (pi->pi_lintr.pirq_pin > 0) {
+		assert(pi->pi_lintr.pirq_pin <= nitems(pirqs));
+		pirq = &pirqs[pi->pi_lintr.pirq_pin - 1];
+		pthread_mutex_lock(&pirq->lock);
+		pirq->active_count++;
+		if (pirq->active_count == 1 && pirq_valid_irq(pirq->reg)) {
+			vm_isa_assert_irq(pi->pi_vmctx, pirq->reg & PIRQ_IRQ,
+			    pi->pi_lintr.ioapic_irq);
+			pthread_mutex_unlock(&pirq->lock);
+			return;
+		}
+		pthread_mutex_unlock(&pirq->lock);
+	}
+	vm_ioapic_assert_irq(pi->pi_vmctx, pi->pi_lintr.ioapic_irq);
+}
+
+void
+pci_irq_deassert(struct pci_devinst *pi)
+{
+	struct pirq *pirq;
+
+	if (pi->pi_lintr.pirq_pin > 0) {
+		assert(pi->pi_lintr.pirq_pin <= nitems(pirqs));
+		pirq = &pirqs[pi->pi_lintr.pirq_pin - 1];
+		pthread_mutex_lock(&pirq->lock);
+		pirq->active_count--;
+		if (pirq->active_count == 0 && pirq_valid_irq(pirq->reg)) {
+			vm_isa_deassert_irq(pi->pi_vmctx, pirq->reg & PIRQ_IRQ,
+			    pi->pi_lintr.ioapic_irq);
+			pthread_mutex_unlock(&pirq->lock);
+			return;
+		}
+		pthread_mutex_unlock(&pirq->lock);
+	}
+	vm_ioapic_deassert_irq(pi->pi_vmctx, pi->pi_lintr.ioapic_irq);
+}
+
+int
+pirq_alloc_pin(struct vmctx *ctx)
+{
+	int best_count, best_irq, best_pin, irq, pin;
+
+	pirq_cold = 0;
+
+	/* First, find the least-used PIRQ pin. */
+	best_pin = 0;
+	best_count = pirqs[0].use_count;
+	for (pin = 1; pin < nitems(pirqs); pin++) {
+		if (pirqs[pin].use_count < best_count) {
+			best_pin = pin;
+			best_count = pirqs[pin].use_count;
+		}
+	}
+	pirqs[best_pin].use_count++;
+
+	/* Second, route this pin to an IRQ. */
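+	/*
+	 * The least-used permitted IRQ is chosen below, keeping
+	 * level-triggered sharing spread evenly across the router's pins.
+	 */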
+	if (pirqs[best_pin].reg == PIRQ_DIS) {
+		best_irq = -1;
+		best_count = 0;
+		for (irq = 0; irq < nitems(irq_counts); irq++) {
+			if (irq_counts[irq] == IRQ_DISABLED)
+				continue;
+			if (best_irq == -1 || irq_counts[irq] < best_count) {
+				best_irq = irq;
+				best_count = irq_counts[irq];
+			}
+		}
+		assert(best_irq >= 0);
+		irq_counts[best_irq]++;
+		pirqs[best_pin].reg = best_irq;
+		vm_isa_set_irq_trigger(ctx, best_irq, LEVEL_TRIGGER);
+	}
+
+	return (best_pin + 1);
+}
+
+int
+pirq_irq(int pin)
+{
+
+	if (pin == -1)
+		return (255);
+	assert(pin > 0 && pin <= nitems(pirqs));
+	return (pirqs[pin - 1].reg & PIRQ_IRQ);
+}
+
+/* XXX: Generate $PIR table. */
+
+#ifdef __FreeBSD__
+static void
+pirq_dsdt(void)
+{
+	char *irq_prs, *old;
+	int irq, pin;
+
+	irq_prs = NULL;
+	for (irq = 0; irq < nitems(irq_counts); irq++) {
+		if (!IRQ_PERMITTED(irq))
+			continue;
+		if (irq_prs == NULL)
+			asprintf(&irq_prs, "%d", irq);
+		else {
+			old = irq_prs;
+			asprintf(&irq_prs, "%s,%d", old, irq);
+			free(old);
+		}
+	}
+
+	/*
+	 * A helper method to validate a link register's value.  This
+	 * duplicates pirq_valid_irq().
+	 */
+	dsdt_line("");
+	dsdt_line("Method (PIRV, 1, NotSerialized)");
+	dsdt_line("{");
+	dsdt_line("  If (And (Arg0, 0x%02X))", PIRQ_DIS);
+	dsdt_line("  {");
+	dsdt_line("    Return (0x00)");
+	dsdt_line("  }");
+	dsdt_line("  And (Arg0, 0x%02X, Local0)", PIRQ_IRQ);
+	dsdt_line("  If (LLess (Local0, 0x03))");
+	dsdt_line("  {");
+	dsdt_line("    Return (0x00)");
+	dsdt_line("  }");
+	dsdt_line("  If (LEqual (Local0, 0x08))");
+	dsdt_line("  {");
+	dsdt_line("    Return (0x00)");
+	dsdt_line("  }");
+	dsdt_line("  If (LEqual (Local0, 0x0D))");
+	dsdt_line("  {");
+	dsdt_line("    Return (0x00)");
+	dsdt_line("  }");
+	dsdt_line("  Return (0x01)");
+	dsdt_line("}");
+
+	for (pin = 0; pin < nitems(pirqs); pin++) {
+		dsdt_line("");
+		dsdt_line("Device (LNK%c)", 'A' + pin);
+		dsdt_line("{");
+		dsdt_line("  Name (_HID, EisaId (\"PNP0C0F\"))");
+		dsdt_line("  Name (_UID, 0x%02X)", pin + 1);
+		dsdt_line("  Method (_STA, 0, NotSerialized)");
+		dsdt_line("  {");
+		dsdt_line("    If (PIRV (PIR%c))", 'A' + pin);
+		dsdt_line("    {");
+		dsdt_line("      Return (0x0B)");
+		dsdt_line("    }");
+		dsdt_line("    Else");
+		dsdt_line("    {");
+		dsdt_line("      Return (0x09)");
+		dsdt_line("    }");
+		dsdt_line("  }");
+		dsdt_line("  Name (_PRS, ResourceTemplate ()");
+		dsdt_line("  {");
+		dsdt_line("    IRQ (Level, ActiveLow, Shared, )");
+		dsdt_line("      {%s}", irq_prs);
+		dsdt_line("  })");
+		dsdt_line("  Name (CB%02X, ResourceTemplate ()", pin + 1);
+		dsdt_line("  {");
+		dsdt_line("    IRQ (Level, ActiveLow, Shared, )");
+		dsdt_line("      {}");
+		dsdt_line("  })");
+		dsdt_line("  CreateWordField (CB%02X, 0x01, CIR%c)",
+		    pin + 1, 'A' + pin);
+		dsdt_line("  Method (_CRS, 0, NotSerialized)");
+		dsdt_line("  {");
+		dsdt_line("    And (PIR%c, 0x%02X, Local0)", 'A' + pin,
+		    PIRQ_DIS | PIRQ_IRQ);
+		dsdt_line("    If (PIRV (Local0))");
+		dsdt_line("    {");
+		dsdt_line("      ShiftLeft (0x01, Local0, CIR%c)", 'A' + pin);
+		dsdt_line("    }");
+		dsdt_line("    Else");
+		dsdt_line("    {");
+		dsdt_line("      Store (0x00, CIR%c)", 'A' + pin);
+		dsdt_line("    }");
+		dsdt_line("    Return (CB%02X)", pin + 1);
+		dsdt_line("  }");
+		dsdt_line("  Method (_DIS, 0, NotSerialized)");
+		dsdt_line("  {");
+		dsdt_line("    Store (0x80, PIR%c)", 'A' + pin);
+		dsdt_line("  }");
+		dsdt_line("  Method (_SRS, 1, NotSerialized)");
+		dsdt_line("  {");
+		dsdt_line("    CreateWordField (Arg0, 0x01, SIR%c)", 'A' + pin);
+		dsdt_line("    FindSetRightBit (SIR%c, Local0)", 'A' + pin);
+		dsdt_line("    Store (Decrement (Local0), PIR%c)", 'A' + pin);
+		dsdt_line("  }");
+		
dsdt_line("}"); + } + free(irq_prs); +} +LPC_DSDT(pirq_dsdt); +#endif diff --git a/usr/src/cmd/bhyve/pci_irq.h b/usr/src/cmd/bhyve/pci_irq.h new file mode 100644 index 0000000000..483f12b61e --- /dev/null +++ b/usr/src/cmd/bhyve/pci_irq.h @@ -0,0 +1,45 @@ +/*- + * Copyright (c) 2014 Advanced Computing Technologies LLC + * Written by: John H. Baldwin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/pci_irq.h 266125 2014-05-15 14:16:55Z jhb $ + */ + +#ifndef __PCI_IRQ_H__ +#define __PCI_IRQ_H__ + +struct pci_devinst; + +void pci_irq_assert(struct pci_devinst *pi); +void pci_irq_deassert(struct pci_devinst *pi); +void pci_irq_init(struct vmctx *ctx); +void pci_irq_reserve(int irq); +void pci_irq_use(int irq); +int pirq_alloc_pin(struct vmctx *ctx); +int pirq_irq(int pin); +uint8_t pirq_read(int pin); +void pirq_write(struct vmctx *ctx, int pin, uint8_t val); + +#endif diff --git a/usr/src/cmd/bhyve/pci_lpc.c b/usr/src/cmd/bhyve/pci_lpc.c new file mode 100644 index 0000000000..8c060150dc --- /dev/null +++ b/usr/src/cmd/bhyve/pci_lpc.c @@ -0,0 +1,433 @@ +/*- + * Copyright (c) 2013 Neel Natu + * Copyright (c) 2013 Tycho Nightingale + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/usr.sbin/bhyve/pci_lpc.c 266933 2014-05-31 23:37:34Z neel $
+ */
+
+#include
+__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_lpc.c 266933 2014-05-31 23:37:34Z neel $");
+
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+
+#include "acpi.h"
+#include "inout.h"
+#include "pci_emul.h"
+#include "pci_irq.h"
+#include "pci_lpc.h"
+#include "uart_emul.h"
+
+#define	IO_ICU1		0x20
+#define	IO_ICU2		0xA0
+
+SET_DECLARE(lpc_dsdt_set, struct lpc_dsdt);
+SET_DECLARE(lpc_sysres_set, struct lpc_sysres);
+
+#define	ELCR_PORT	0x4d0
+SYSRES_IO(ELCR_PORT, 2);
+
+#define	IO_TIMER1_PORT	0x40
+
+#define	NMISC_PORT	0x61
+SYSRES_IO(NMISC_PORT, 1);
+
+static struct pci_devinst *lpc_bridge;
+
+#define	LPC_UART_NUM	2
+static struct lpc_uart_softc {
+	struct uart_softc *uart_softc;
+	const char *opts;
+	int	iobase;
+	int	irq;
+	int	enabled;
+} lpc_uart_softc[LPC_UART_NUM];
+
+static const char *lpc_uart_names[LPC_UART_NUM] = { "COM1", "COM2" };
+
+/*
+ * LPC device configuration is in the following form:
+ * <lpc_device_name>[,<options>]
+ * For e.g. "com1,stdio"
+ */
+int
+lpc_device_parse(const char *opts)
+{
+	int unit, error;
+	char *str, *cpy, *lpcdev;
+
+	error = -1;
+	str = cpy = strdup(opts);
+	lpcdev = strsep(&str, ",");
+	if (lpcdev != NULL) {
+		for (unit = 0; unit < LPC_UART_NUM; unit++) {
+			if (strcasecmp(lpcdev, lpc_uart_names[unit]) == 0) {
+				lpc_uart_softc[unit].opts = str;
+				error = 0;
+				goto done;
+			}
+		}
+	}
+
+done:
+	if (error)
+		free(cpy);
+
+	return (error);
+}
+
+static void
+lpc_uart_intr_assert(void *arg)
+{
+	struct lpc_uart_softc *sc = arg;
+
+	assert(sc->irq >= 0);
+
+	vm_isa_pulse_irq(lpc_bridge->pi_vmctx, sc->irq, sc->irq);
+}
+
+static void
+lpc_uart_intr_deassert(void *arg)
+{
+	/*
+	 * The COM devices on the LPC bus generate edge triggered interrupts,
+	 * so nothing more to do here.
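+	 * The assert side uses vm_isa_pulse_irq(), which models that edge
+	 * as a momentary assert immediately followed by a deassert.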
+ */ +} + +static int +lpc_uart_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + int offset; + struct lpc_uart_softc *sc = arg; + + offset = port - sc->iobase; + + switch (bytes) { + case 1: + if (in) + *eax = uart_read(sc->uart_softc, offset); + else + uart_write(sc->uart_softc, offset, *eax); + break; + case 2: + if (in) { + *eax = uart_read(sc->uart_softc, offset); + *eax |= uart_read(sc->uart_softc, offset + 1) << 8; + } else { + uart_write(sc->uart_softc, offset, *eax); + uart_write(sc->uart_softc, offset + 1, *eax >> 8); + } + break; + default: + return (-1); + } + + return (0); +} + +static int +lpc_init(void) +{ + struct lpc_uart_softc *sc; + struct inout_port iop; + const char *name; + int unit, error; + + /* COM1 and COM2 */ + for (unit = 0; unit < LPC_UART_NUM; unit++) { + sc = &lpc_uart_softc[unit]; + name = lpc_uart_names[unit]; + + if (uart_legacy_alloc(unit, &sc->iobase, &sc->irq) != 0) { + fprintf(stderr, "Unable to allocate resources for " + "LPC device %s\n", name); + return (-1); + } + pci_irq_reserve(sc->irq); + + sc->uart_softc = uart_init(lpc_uart_intr_assert, + lpc_uart_intr_deassert, sc); + + if (uart_set_backend(sc->uart_softc, sc->opts) != 0) { + fprintf(stderr, "Unable to initialize backend '%s' " + "for LPC device %s\n", sc->opts, name); + return (-1); + } + + bzero(&iop, sizeof(struct inout_port)); + iop.name = name; + iop.port = sc->iobase; + iop.size = UART_IO_BAR_SIZE; + iop.flags = IOPORT_F_INOUT; + iop.handler = lpc_uart_io_handler; + iop.arg = sc; + + error = register_inout(&iop); + assert(error == 0); + sc->enabled = 1; + } + + return (0); +} + +#ifdef __FreeBSD__ +static void +pci_lpc_write_dsdt(struct pci_devinst *pi) +{ + struct lpc_dsdt **ldpp, *ldp; + + dsdt_line(""); + dsdt_line("Device (ISA)"); + dsdt_line("{"); + dsdt_line(" Name (_ADR, 0x%04X%04X)", pi->pi_slot, pi->pi_func); + dsdt_line(" OperationRegion (LPCR, PCI_Config, 0x00, 0x100)"); + dsdt_line(" Field (LPCR, AnyAcc, NoLock, Preserve)"); + dsdt_line(" {"); + dsdt_line(" Offset (0x60),"); + dsdt_line(" PIRA, 8,"); + dsdt_line(" PIRB, 8,"); + dsdt_line(" PIRC, 8,"); + dsdt_line(" PIRD, 8,"); + dsdt_line(" Offset (0x68),"); + dsdt_line(" PIRE, 8,"); + dsdt_line(" PIRF, 8,"); + dsdt_line(" PIRG, 8,"); + dsdt_line(" PIRH, 8"); + dsdt_line(" }"); + dsdt_line(""); + + dsdt_indent(1); + SET_FOREACH(ldpp, lpc_dsdt_set) { + ldp = *ldpp; + ldp->handler(); + } + + dsdt_line(""); + dsdt_line("Device (PIC)"); + dsdt_line("{"); + dsdt_line(" Name (_HID, EisaId (\"PNP0000\"))"); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_indent(2); + dsdt_fixed_ioport(IO_ICU1, 2); + dsdt_fixed_ioport(IO_ICU2, 2); + dsdt_fixed_irq(2); + dsdt_unindent(2); + dsdt_line(" })"); + dsdt_line("}"); + + dsdt_line(""); + dsdt_line("Device (TIMR)"); + dsdt_line("{"); + dsdt_line(" Name (_HID, EisaId (\"PNP0100\"))"); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_indent(2); + dsdt_fixed_ioport(IO_TIMER1_PORT, 4); + dsdt_fixed_irq(0); + dsdt_unindent(2); + dsdt_line(" })"); + dsdt_line("}"); + dsdt_unindent(1); + + dsdt_line("}"); +} + +static void +pci_lpc_sysres_dsdt(void) +{ + struct lpc_sysres **lspp, *lsp; + + dsdt_line(""); + dsdt_line("Device (SIO)"); + dsdt_line("{"); + dsdt_line(" Name (_HID, EisaId (\"PNP0C02\"))"); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + + dsdt_indent(2); + SET_FOREACH(lspp, lpc_sysres_set) { + lsp = *lspp; + switch (lsp->type) { + case LPC_SYSRES_IO: + 
dsdt_fixed_ioport(lsp->base, lsp->length); + break; + case LPC_SYSRES_MEM: + dsdt_fixed_mem32(lsp->base, lsp->length); + break; + } + } + dsdt_unindent(2); + + dsdt_line(" })"); + dsdt_line("}"); +} +LPC_DSDT(pci_lpc_sysres_dsdt); + +static void +pci_lpc_uart_dsdt(void) +{ + struct lpc_uart_softc *sc; + int unit; + + for (unit = 0; unit < LPC_UART_NUM; unit++) { + sc = &lpc_uart_softc[unit]; + if (!sc->enabled) + continue; + dsdt_line(""); + dsdt_line("Device (%s)", lpc_uart_names[unit]); + dsdt_line("{"); + dsdt_line(" Name (_HID, EisaId (\"PNP0501\"))"); + dsdt_line(" Name (_UID, %d)", unit + 1); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_indent(2); + dsdt_fixed_ioport(sc->iobase, UART_IO_BAR_SIZE); + dsdt_fixed_irq(sc->irq); + dsdt_unindent(2); + dsdt_line(" })"); + dsdt_line("}"); + } +} +LPC_DSDT(pci_lpc_uart_dsdt); +#endif + +static int +pci_lpc_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int coff, int bytes, uint32_t val) +{ + int pirq_pin; + + if (bytes == 1) { + pirq_pin = 0; + if (coff >= 0x60 && coff <= 0x63) + pirq_pin = coff - 0x60 + 1; + if (coff >= 0x68 && coff <= 0x6b) + pirq_pin = coff - 0x68 + 5; + if (pirq_pin != 0) { + pirq_write(ctx, pirq_pin, val); + pci_set_cfgdata8(pi, coff, pirq_read(pirq_pin)); + return (0); + } + } + return (-1); +} + +static void +pci_lpc_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value) +{ +} + +static uint64_t +pci_lpc_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size) +{ + return (0); +} + +#define LPC_DEV 0x7000 +#define LPC_VENDOR 0x8086 + +static int +pci_lpc_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + + /* + * Do not allow more than one LPC bridge to be configured. + */ + if (lpc_bridge != NULL) { + fprintf(stderr, "Only one LPC bridge is allowed.\n"); + return (-1); + } + + /* + * Enforce that the LPC can only be configured on bus 0. This + * simplifies the ACPI DSDT because it can provide a decode for + * all legacy i/o ports behind bus 0. + */ + if (pi->pi_bus != 0) { + fprintf(stderr, "LPC bridge can be present only on bus 0.\n"); + return (-1); + } + + if (lpc_init() != 0) + return (-1); + + /* initialize config space */ + pci_set_cfgdata16(pi, PCIR_DEVICE, LPC_DEV); + pci_set_cfgdata16(pi, PCIR_VENDOR, LPC_VENDOR); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE); + pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_ISA); + + lpc_bridge = pi; + + return (0); +} + +char * +lpc_pirq_name(int pin) +{ + char *name; + + if (lpc_bridge == NULL) + return (NULL); + asprintf(&name, "\\_SB.PC00.ISA.LNK%c,", 'A' + pin - 1); + return (name); +} + +void +lpc_pirq_routed(void) +{ + int pin; + + if (lpc_bridge == NULL) + return; + + for (pin = 0; pin < 4; pin++) + pci_set_cfgdata8(lpc_bridge, 0x60 + pin, pirq_read(pin + 1)); + for (pin = 0; pin < 4; pin++) + pci_set_cfgdata8(lpc_bridge, 0x68 + pin, pirq_read(pin + 5)); +} + +struct pci_devemu pci_de_lpc = { + .pe_emu = "lpc", + .pe_init = pci_lpc_init, +#ifdef __FreeBSD__ + .pe_write_dsdt = pci_lpc_write_dsdt, +#endif + .pe_cfgwrite = pci_lpc_cfgwrite, + .pe_barwrite = pci_lpc_write, + .pe_barread = pci_lpc_read +}; +PCI_EMUL_SET(pci_de_lpc); diff --git a/usr/src/cmd/bhyve/pci_lpc.h b/usr/src/cmd/bhyve/pci_lpc.h new file mode 100644 index 0000000000..4f725b1dd3 --- /dev/null +++ b/usr/src/cmd/bhyve/pci_lpc.h @@ -0,0 +1,72 @@ +/*- + * Copyright (c) 2013 Neel Natu + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/pci_lpc.h 266125 2014-05-15 14:16:55Z jhb $ + */ + +#ifndef _LPC_H_ +#define _LPC_H_ + +#include + +typedef void (*lpc_write_dsdt_t)(void); + +struct lpc_dsdt { + lpc_write_dsdt_t handler; +}; + +#define LPC_DSDT(handler) \ + static struct lpc_dsdt __CONCAT(__lpc_dsdt, __LINE__) = { \ + (handler), \ + }; \ + DATA_SET(lpc_dsdt_set, __CONCAT(__lpc_dsdt, __LINE__)) + +enum lpc_sysres_type { + LPC_SYSRES_IO, + LPC_SYSRES_MEM +}; + +struct lpc_sysres { + enum lpc_sysres_type type; + uint32_t base; + uint32_t length; +}; + +#define LPC_SYSRES(type, base, length) \ + static struct lpc_sysres __CONCAT(__lpc_sysres, __LINE__) = { \ + (type), \ + (base), \ + (length) \ + }; \ + DATA_SET(lpc_sysres_set, __CONCAT(__lpc_sysres, __LINE__)) + +#define SYSRES_IO(base, length) LPC_SYSRES(LPC_SYSRES_IO, base, length) +#define SYSRES_MEM(base, length) LPC_SYSRES(LPC_SYSRES_MEM, base, length) + +int lpc_device_parse(const char *opt); +char *lpc_pirq_name(int pin); +void lpc_pirq_routed(void); + +#endif diff --git a/usr/src/cmd/bhyve/pci_virtio_block.c b/usr/src/cmd/bhyve/pci_virtio_block.c new file mode 100644 index 0000000000..65e2d9c57d --- /dev/null +++ b/usr/src/cmd/bhyve/pci_virtio_block.c @@ -0,0 +1,392 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/pci_virtio_block.c 266935 2014-06-01 02:47:09Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + */ + +#include +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_virtio_block.c 266935 2014-06-01 02:47:09Z neel $"); + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bhyverun.h" +#include "pci_emul.h" +#include "virtio.h" + +#define VTBLK_RINGSZ 64 + +#ifdef __FreeBSD__ +#define VTBLK_MAXSEGS 32 +#else +#define VTBLK_MAXSEGS 16 +#endif + +#define VTBLK_S_OK 0 +#define VTBLK_S_IOERR 1 +#define VTBLK_S_UNSUPP 2 + +#define VTBLK_BLK_ID_BYTES 20 + +/* Capability bits */ +#define VTBLK_F_SEG_MAX (1 << 2) /* Maximum request segments */ +#define VTBLK_F_BLK_SIZE (1 << 6) /* cfg block size valid */ + +/* + * Host capabilities + */ +#define VTBLK_S_HOSTCAPS \ + ( VTBLK_F_SEG_MAX | \ + VTBLK_F_BLK_SIZE | \ + VIRTIO_RING_F_INDIRECT_DESC ) /* indirect descriptors */ + +/* + * Config space "registers" + */ +struct vtblk_config { + uint64_t vbc_capacity; + uint32_t vbc_size_max; + uint32_t vbc_seg_max; + uint16_t vbc_geom_c; + uint8_t vbc_geom_h; + uint8_t vbc_geom_s; + uint32_t vbc_blk_size; + uint32_t vbc_sectors_max; +} __packed; + +/* + * Fixed-size block header + */ +struct virtio_blk_hdr { +#define VBH_OP_READ 0 +#define VBH_OP_WRITE 1 +#define VBH_OP_IDENT 8 +#define VBH_FLAG_BARRIER 0x80000000 /* OR'ed into vbh_type */ + uint32_t vbh_type; + uint32_t vbh_ioprio; + uint64_t vbh_sector; +} __packed; + +/* + * Debug printf + */ +static int pci_vtblk_debug; +#define DPRINTF(params) if (pci_vtblk_debug) printf params +#define WPRINTF(params) printf params + +/* + * Per-device softc + */ +struct pci_vtblk_softc { + struct virtio_softc vbsc_vs; + pthread_mutex_t vsc_mtx; + struct vqueue_info vbsc_vq; + int vbsc_fd; + struct vtblk_config vbsc_cfg; + char vbsc_ident[VTBLK_BLK_ID_BYTES]; +}; + +static void pci_vtblk_reset(void *); +static void pci_vtblk_notify(void *, struct vqueue_info *); +static int pci_vtblk_cfgread(void *, int, int, uint32_t *); +static int pci_vtblk_cfgwrite(void *, int, int, uint32_t); + +static struct virtio_consts vtblk_vi_consts = { + "vtblk", /* our name */ + 1, /* we support 1 virtqueue */ + sizeof(struct vtblk_config), /* config reg size */ + pci_vtblk_reset, /* reset */ + pci_vtblk_notify, /* device-wide qnotify */ + pci_vtblk_cfgread, /* read PCI config */ + pci_vtblk_cfgwrite, /* write PCI config */ + VTBLK_S_HOSTCAPS, /* our capabilities */ +}; + +static void 
+pci_vtblk_reset(void *vsc)
+{
+	struct pci_vtblk_softc *sc = vsc;
+
+	DPRINTF(("vtblk: device reset requested !\n"));
+	vi_reset_dev(&sc->vbsc_vs);
+}
+
+static void
+pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq)
+{
+	struct virtio_blk_hdr *vbh;
+	uint8_t *status;
+	int i, n;
+	int err;
+	int iolen;
+	int writeop, type;
+	off_t offset;
+	struct iovec iov[VTBLK_MAXSEGS + 2];
+	uint16_t flags[VTBLK_MAXSEGS + 2];
+
+	n = vq_getchain(vq, iov, VTBLK_MAXSEGS + 2, flags);
+
+	/*
+	 * The first descriptor will be the read-only fixed header,
+	 * and the last is for status (hence +2 above and below).
+	 * The remaining iov's are the actual data I/O vectors.
+	 *
+	 * XXX - note - this fails on crash dump, which does a
+	 * VIRTIO_BLK_T_FLUSH with a zero transfer length
+	 */
+	assert(n >= 2 && n <= VTBLK_MAXSEGS + 2);
+
+	assert((flags[0] & VRING_DESC_F_WRITE) == 0);
+	assert(iov[0].iov_len == sizeof(struct virtio_blk_hdr));
+	vbh = (struct virtio_blk_hdr *)iov[0].iov_base;
+
+	status = iov[--n].iov_base;
+	assert(iov[n].iov_len == 1);
+	assert(flags[n] & VRING_DESC_F_WRITE);
+
+	/*
+	 * XXX
+	 * The guest should not be setting the BARRIER flag because
+	 * we don't advertise the capability.
+	 */
+	type = vbh->vbh_type & ~VBH_FLAG_BARRIER;
+	writeop = (type == VBH_OP_WRITE);
+
+	offset = vbh->vbh_sector * DEV_BSIZE;
+
+	iolen = 0;
+	for (i = 1; i < n; i++) {
+		/*
+		 * - write op implies read-only descriptor,
+		 * - read/ident op implies write-only descriptor,
+		 * therefore test the inverse of the descriptor bit
+		 * to the op.
+		 */
+		assert(((flags[i] & VRING_DESC_F_WRITE) == 0) == writeop);
+		iolen += iov[i].iov_len;
+	}
+
+	DPRINTF(("virtio-block: %s op, %d bytes, %d segs, offset %ld\n\r",
+	    writeop ? "write" : "read/ident", iolen, i - 1, offset));
+
+	switch (type) {
+	case VBH_OP_WRITE:
+		err = pwritev(sc->vbsc_fd, iov + 1, i - 1, offset);
+		break;
+	case VBH_OP_READ:
+		err = preadv(sc->vbsc_fd, iov + 1, i - 1, offset);
+		break;
+	case VBH_OP_IDENT:
+		/* Assume a single buffer */
+		strlcpy(iov[1].iov_base, sc->vbsc_ident,
+		    MIN(iov[1].iov_len, sizeof(sc->vbsc_ident)));
+		err = 0;
+		break;
+	default:
+		err = -ENOSYS;
+		break;
+	}
+
+	/* convert errno into a virtio block error return */
+	if (err < 0) {
+		if (err == -ENOSYS)
+			*status = VTBLK_S_UNSUPP;
+		else
+			*status = VTBLK_S_IOERR;
+	} else
+		*status = VTBLK_S_OK;
+
+	/*
+	 * Return the descriptor back to the host.
+	 * We wrote 1 byte (our status) to host.
+	 */
+	vq_relchain(vq, 1);
+}
+
+static void
+pci_vtblk_notify(void *vsc, struct vqueue_info *vq)
+{
+	struct pci_vtblk_softc *sc = vsc;
+
+	vq_startchains(vq);
+	while (vq_has_descs(vq))
+		pci_vtblk_proc(sc, vq);
+	vq_endchains(vq, 1);	/* Generate interrupt if appropriate. 
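+	 * A non-zero final argument lets vq_endchains() deliver the
+	 * interrupt unless the guest has suppressed notifications.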
*/ +} + +static int +pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + struct stat sbuf; + MD5_CTX mdctx; + u_char digest[16]; + struct pci_vtblk_softc *sc; + off_t size; + int fd; + int sectsz; + + if (opts == NULL) { + printf("virtio-block: backing device required\n"); + return (1); + } + + /* + * The supplied backing file has to exist + */ + fd = open(opts, O_RDWR); + if (fd < 0) { + perror("Could not open backing file"); + return (1); + } + + if (fstat(fd, &sbuf) < 0) { + perror("Could not stat backing file"); + close(fd); + return (1); + } + + /* + * Deal with raw devices + */ + size = sbuf.st_size; + sectsz = DEV_BSIZE; +#ifdef __FreeBSD__ + if (S_ISCHR(sbuf.st_mode)) { + if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 || + ioctl(fd, DIOCGSECTORSIZE, §sz)) { + perror("Could not fetch dev blk/sector size"); + close(fd); + return (1); + } + assert(size != 0); + assert(sectsz != 0); + } +#endif + + sc = calloc(1, sizeof(struct pci_vtblk_softc)); + + /* record fd of storage device/file */ + sc->vbsc_fd = fd; + + pthread_mutex_init(&sc->vsc_mtx, NULL); + + /* init virtio softc and virtqueues */ + vi_softc_linkup(&sc->vbsc_vs, &vtblk_vi_consts, sc, pi, &sc->vbsc_vq); + sc->vbsc_vs.vs_mtx = &sc->vsc_mtx; + + sc->vbsc_vq.vq_qsize = VTBLK_RINGSZ; + /* sc->vbsc_vq.vq_notify = we have no per-queue notify */ + + /* + * Create an identifier for the backing file. Use parts of the + * md5 sum of the filename + */ + MD5Init(&mdctx); + MD5Update(&mdctx, opts, strlen(opts)); + MD5Final(digest, &mdctx); + sprintf(sc->vbsc_ident, "BHYVE-%02X%02X-%02X%02X-%02X%02X", + digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]); + + /* setup virtio block config space */ + sc->vbsc_cfg.vbc_capacity = size / DEV_BSIZE; /* 512-byte units */ + sc->vbsc_cfg.vbc_seg_max = VTBLK_MAXSEGS; + sc->vbsc_cfg.vbc_blk_size = sectsz; + sc->vbsc_cfg.vbc_size_max = 0; /* not negotiated */ + sc->vbsc_cfg.vbc_geom_c = 0; /* no geometry */ + sc->vbsc_cfg.vbc_geom_h = 0; + sc->vbsc_cfg.vbc_geom_s = 0; + sc->vbsc_cfg.vbc_sectors_max = 0; + + /* + * Should we move some of this into virtio.c? Could + * have the device, class, and subdev_0 as fields in + * the virtio constants structure. + */ + pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_BLOCK); + pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); + pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_BLOCK); + + if (vi_intr_init(&sc->vbsc_vs, 1, fbsdrun_virtio_msix())) + return (1); + vi_set_io_bar(&sc->vbsc_vs, 0); + return (0); +} + +static int +pci_vtblk_cfgwrite(void *vsc, int offset, int size, uint32_t value) +{ + + DPRINTF(("vtblk: write to readonly reg %d\n\r", offset)); + return (1); +} + +static int +pci_vtblk_cfgread(void *vsc, int offset, int size, uint32_t *retval) +{ + struct pci_vtblk_softc *sc = vsc; + void *ptr; + + /* our caller has already verified offset and size */ + ptr = (uint8_t *)&sc->vbsc_cfg + offset; + memcpy(retval, ptr, size); + return (0); +} + +struct pci_devemu pci_de_vblk = { + .pe_emu = "virtio-blk", + .pe_init = pci_vtblk_init, + .pe_barwrite = vi_pci_write, + .pe_barread = vi_pci_read +}; +PCI_EMUL_SET(pci_de_vblk); diff --git a/usr/src/cmd/bhyve/pci_virtio_net.c b/usr/src/cmd/bhyve/pci_virtio_net.c new file mode 100644 index 0000000000..e58bdd0115 --- /dev/null +++ b/usr/src/cmd/bhyve/pci_virtio_net.c @@ -0,0 +1,870 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/pci_virtio_net.c 253440 2013-07-17 23:37:33Z grehan $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2013 Pluribus Networks Inc. + */ + +#include +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_virtio_net.c 253440 2013-07-17 23:37:33Z grehan $"); + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifndef __FreeBSD__ +#include +#include +#endif + +#include "bhyverun.h" +#include "pci_emul.h" +#ifdef __FreeBSD__ +#include "mevent.h" +#endif +#include "virtio.h" + +#define VTNET_RINGSZ 1024 + +#define VTNET_MAXSEGS 32 + +/* + * Host capabilities. Note that we only offer a few of these. 
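+ * (namely VIRTIO_NET_F_MAC, VIRTIO_NET_F_MRG_RXBUF and VIRTIO_NET_F_STATUS,
+ * plus VIRTIO_F_NOTIFY_ON_EMPTY; see VTNET_S_HOSTCAPS below)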
+ */
+#define	VIRTIO_NET_F_CSUM	(1 << 0) /* host handles partial cksum */
+#define	VIRTIO_NET_F_GUEST_CSUM	(1 << 1) /* guest handles partial cksum */
+#define	VIRTIO_NET_F_MAC	(1 << 5) /* host supplies MAC */
+#define	VIRTIO_NET_F_GSO_DEPREC	(1 << 6) /* deprecated: host handles GSO */
+#define	VIRTIO_NET_F_GUEST_TSO4	(1 << 7) /* guest can rcv TSOv4 */
+#define	VIRTIO_NET_F_GUEST_TSO6	(1 << 8) /* guest can rcv TSOv6 */
+#define	VIRTIO_NET_F_GUEST_ECN	(1 << 9) /* guest can rcv TSO with ECN */
+#define	VIRTIO_NET_F_GUEST_UFO	(1 << 10) /* guest can rcv UFO */
+#define	VIRTIO_NET_F_HOST_TSO4	(1 << 11) /* host can rcv TSOv4 */
+#define	VIRTIO_NET_F_HOST_TSO6	(1 << 12) /* host can rcv TSOv6 */
+#define	VIRTIO_NET_F_HOST_ECN	(1 << 13) /* host can rcv TSO with ECN */
+#define	VIRTIO_NET_F_HOST_UFO	(1 << 14) /* host can rcv UFO */
+#define	VIRTIO_NET_F_MRG_RXBUF	(1 << 15) /* host can merge RX buffers */
+#define	VIRTIO_NET_F_STATUS	(1 << 16) /* config status field available */
+#define	VIRTIO_NET_F_CTRL_VQ	(1 << 17) /* control channel available */
+#define	VIRTIO_NET_F_CTRL_RX	(1 << 18) /* control channel RX mode support */
+#define	VIRTIO_NET_F_CTRL_VLAN	(1 << 19) /* control channel VLAN filtering */
+#define	VIRTIO_NET_F_GUEST_ANNOUNCE \
+			(1 << 21) /* guest can send gratuitous pkts */
+
+#define VTNET_S_HOSTCAPS      \
+  ( VIRTIO_NET_F_MAC | VIRTIO_NET_F_MRG_RXBUF | VIRTIO_NET_F_STATUS | \
+    VIRTIO_F_NOTIFY_ON_EMPTY)
+
+/*
+ * PCI config-space "registers"
+ */
+struct virtio_net_config {
+	uint8_t  mac[6];
+	uint16_t status;
+} __packed;
+
+/*
+ * Queue definitions.
+ */
+#define VTNET_RXQ	0
+#define VTNET_TXQ	1
+#define VTNET_CTLQ	2	/* NB: not yet supported */
+
+#define VTNET_MAXQ	3
+
+/*
+ * Fixed network header size
+ */
+struct virtio_net_rxhdr {
+	uint8_t		vrh_flags;
+	uint8_t		vrh_gso_type;
+	uint16_t	vrh_hdr_len;
+	uint16_t	vrh_gso_size;
+	uint16_t	vrh_csum_start;
+	uint16_t	vrh_csum_offset;
+	uint16_t	vrh_bufs;
+} __packed;
+
+/*
+ * Debug printf
+ */
+static int pci_vtnet_debug;
+#define DPRINTF(params) if (pci_vtnet_debug) printf params
+#define WPRINTF(params) printf params
+
+/*
+ * Per-device softc
+ */
+struct pci_vtnet_softc {
+	struct virtio_softc vsc_vs;
+	struct vqueue_info vsc_queues[VTNET_MAXQ - 1];
+	pthread_mutex_t vsc_mtx;
+	struct mevent	*vsc_mevp;
+
+#ifdef __FreeBSD__
+	int		vsc_tapfd;
+#else
+	dlpi_handle_t	vsc_dhp;
+	int		vsc_dlpifd;
+#endif
+	int		vsc_rx_ready;
+	volatile int	resetting;	/* set and checked outside lock */
+
+	uint32_t	vsc_features;
+	struct virtio_net_config vsc_config;
+
+	pthread_mutex_t	rx_mtx;
+	int		rx_in_progress;
+
+	pthread_t 	tx_tid;
+	pthread_mutex_t	tx_mtx;
+	pthread_cond_t	tx_cond;
+	int		tx_in_progress;
+};
+
+static void pci_vtnet_reset(void *);
+/* static void pci_vtnet_notify(void *, struct vqueue_info *); */
+static int pci_vtnet_cfgread(void *, int, int, uint32_t *);
+static int pci_vtnet_cfgwrite(void *, int, int, uint32_t);
+
+static struct virtio_consts vtnet_vi_consts = {
+	"vtnet",		/* our name */
+	VTNET_MAXQ - 1,		/* we currently support 2 virtqueues */
+	sizeof(struct virtio_net_config), /* config reg size */
+	pci_vtnet_reset,	/* reset */
+	NULL,			/* device-wide qnotify -- not used */
+	pci_vtnet_cfgread,	/* read PCI config */
+	pci_vtnet_cfgwrite,	/* write PCI config */
+	VTNET_S_HOSTCAPS,	/* our capabilities */
+};
+
+/*
+ * If the transmit thread is active then stall until it is done.
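+ * (a simple poll: tx_mtx is dropped, the thread sleeps 10 ms, and
+ * tx_in_progress is rechecked)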
+ */ +static void +pci_vtnet_txwait(struct pci_vtnet_softc *sc) +{ + + pthread_mutex_lock(&sc->tx_mtx); + while (sc->tx_in_progress) { + pthread_mutex_unlock(&sc->tx_mtx); + usleep(10000); + pthread_mutex_lock(&sc->tx_mtx); + } + pthread_mutex_unlock(&sc->tx_mtx); +} + +/* + * If the receive thread is active then stall until it is done. + */ +static void +pci_vtnet_rxwait(struct pci_vtnet_softc *sc) +{ + + pthread_mutex_lock(&sc->rx_mtx); + while (sc->rx_in_progress) { + pthread_mutex_unlock(&sc->rx_mtx); + usleep(10000); + pthread_mutex_lock(&sc->rx_mtx); + } + pthread_mutex_unlock(&sc->rx_mtx); +} + +static void +pci_vtnet_reset(void *vsc) +{ + struct pci_vtnet_softc *sc = vsc; + + DPRINTF(("vtnet: device reset requested !\n")); + + sc->resetting = 1; + + /* + * Wait for the transmit and receive threads to finish their + * processing. + */ + pci_vtnet_txwait(sc); + pci_vtnet_rxwait(sc); + + sc->vsc_rx_ready = 0; + + /* now reset rings, MSI-X vectors, and negotiated capabilities */ + vi_reset_dev(&sc->vsc_vs); + + sc->resetting = 0; +} + +/* + * Called to send a buffer chain out to the tap device + */ +#ifdef __FreeBSD__ +static void +pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt, + int len) +{ + static char pad[60]; /* all zero bytes */ + + if (sc->vsc_tapfd == -1) + return; + + /* + * If the length is < 60, pad out to that and add the + * extra zero'd segment to the iov. It is guaranteed that + * there is always an extra iov available by the caller. + */ + if (len < 60) { + iov[iovcnt].iov_base = pad; + iov[iovcnt].iov_len = 60 - len; + iovcnt++; + } + (void) writev(sc->vsc_tapfd, iov, iovcnt); +} +#else +static void +pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt, + int len) +{ + int i; + + for (i = 0; i < iovcnt; i++) { + (void) dlpi_send(sc->vsc_dhp, NULL, NULL, + iov[i].iov_base, iov[i].iov_len, NULL); + } +} +#endif + +#ifdef __FreeBSD__ +/* + * Called when there is read activity on the tap file descriptor. + * Each buffer posted by the guest is assumed to be able to contain + * an entire ethernet frame + rx header. + * MP note: the dummybuf is only used for discarding frames, so there + * is no need for it to be per-vtnet or locked. + */ +static uint8_t dummybuf[2048]; +#endif + +static void +pci_vtnet_tap_rx(struct pci_vtnet_softc *sc) +{ + struct vqueue_info *vq; + struct virtio_net_rxhdr *vrx; + uint8_t *buf; +#ifdef __FreeBSD__ + int len; +#endif + struct iovec iov[VTNET_MAXSEGS]; +#ifndef __FreeBSD__ + size_t len; + int ret; +#endif + int total_len = 0; + + /* + * Should never be called without a valid tap fd + */ +#ifdef __FreeBSD__ + assert(sc->vsc_tapfd != -1); +#else + assert(sc->vsc_dlpifd != -1); +#endif + + /* + * But, will be called when the rx ring hasn't yet + * been set up or the guest is resetting the device. + */ + if (!sc->vsc_rx_ready || sc->resetting) { +#ifdef __FreeBSD__ + /* + * Drop the packet and try later. + */ + (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf)); +#endif + return; + } + + /* + * Check for available rx buffers + */ + vq = &sc->vsc_queues[VTNET_RXQ]; + vq_startchains(vq); + if (!vq_has_descs(vq)) { + /* + * Drop the packet and try later. Interrupt on + * empty, if that's negotiated. 
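+		 * (VIRTIO_F_NOTIFY_ON_EMPTY is offered in VTNET_S_HOSTCAPS
+		 * above, so a guest may well have negotiated it)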
+		 */
+#ifdef	__FreeBSD__
+		(void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
+#endif
+		vq_endchains(vq, 1);
+		return;
+	}
+
+	do {
+		/*
+		 * Get descriptor chain
+		 */
+		if (sc->vsc_vs.vs_negotiated_caps & VIRTIO_NET_F_MRG_RXBUF) {
+			assert(vq_getchain(vq, iov, 1, NULL) == 1);
+
+			/*
+			 * Get a pointer to the rx header, and use the
+			 * data immediately following it for the packet buffer.
+			 */
+			vrx = (struct virtio_net_rxhdr *)iov[0].iov_base;
+			buf = (uint8_t *)(vrx + 1);
+			total_len = iov[0].iov_len;
+#ifdef	__FreeBSD__
+			len = read(sc->vsc_tapfd, buf,
+			    iov[0].iov_len - sizeof(struct virtio_net_rxhdr));
+
+			if (len < 0 && errno == EWOULDBLOCK) {
+				/*
+				 * No more packets, but still some avail ring
+				 * entries.  Interrupt if needed/appropriate.
+				 */
+				vq_endchains(vq, 0);
+				return;
+			}
+#else
+			len = iov[0].iov_len - sizeof(struct virtio_net_rxhdr);
+			ret = dlpi_recv(sc->vsc_dhp, NULL, NULL, buf,
+			    &len, 0, NULL);
+			if (ret != DLPI_SUCCESS) {
+				/*
+				 * No more packets, but still some avail ring
+				 * entries.  Interrupt if needed/appropriate.
+				 */
+				vq_endchains(vq, 0);
+				return;
+			}
+#endif
+		} else {
+			int i;
+			int num_segs;
+			num_segs = vq_getchain(vq, iov,
+			    VTNET_MAXSEGS, NULL);
+			vrx = (struct virtio_net_rxhdr *)iov[0].iov_base;
+			total_len = iov[0].iov_len;
+			for (i = 1; i < num_segs; i++) {
+				buf = (uint8_t *)iov[i].iov_base;
+				total_len += iov[i].iov_len;
+#ifdef	__FreeBSD__
+				len = read(sc->vsc_tapfd, buf, iov[i].iov_len);
+				if (len < 0 && errno == EWOULDBLOCK) {
+					/*
+					 * No more packets,
+					 * but still some avail ring entries.
+					 * Interrupt if needed/appropriate.
+					 */
+					break;
+				}
+#else
+				len = iov[i].iov_len;
+				ret = dlpi_recv(sc->vsc_dhp, NULL, NULL, buf,
+				    &len, 0, NULL);
+				if (ret != DLPI_SUCCESS) {
+					/*
+					 * No more packets,
+					 * but still some avail ring entries.
+					 * Interrupt if needed/appropriate.
+					 */
+					total_len = 0;
+					break;
+				}
+#endif
+			}
+			if (total_len == 0) {
+				vq_endchains(vq, 0);
+				return;
+			}
+		}
+
+		/*
+		 * The only valid field in the rx packet header is the
+		 * number of buffers, which is always 1 without TSO
+		 * support.
+		 */
+		memset(vrx, 0, sizeof(struct virtio_net_rxhdr));
+		vrx->vrh_bufs = 1;
+
+		/*
+		 * Release this chain and handle more chains.
+		 */
+		vq_relchain(vq, total_len);
+	} while (vq_has_descs(vq));
+
+	/* Interrupt if needed, including for NOTIFY_ON_EMPTY. 
+	 */
+	vq_endchains(vq, 1);
+}
+
+#ifdef __FreeBSD__
+static void
+pci_vtnet_tap_callback(int fd, enum ev_type type, void *param)
+{
+	struct pci_vtnet_softc *sc = param;
+
+	pthread_mutex_lock(&sc->rx_mtx);
+	sc->rx_in_progress = 1;
+	pci_vtnet_tap_rx(sc);
+	sc->rx_in_progress = 0;
+	pthread_mutex_unlock(&sc->rx_mtx);
+}
+#else
+static void *
+pci_vtnet_poll_thread(void *param)
+{
+	struct pci_vtnet_softc *sc = param;
+	pollfd_t pollset;
+
+	pollset.fd = sc->vsc_dlpifd;
+	pollset.events = POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND;
+
+	for (;;) {
+		if (poll(&pollset, 1, -1) < 0) {
+			if (errno == EINTR)
+				continue;
+			fprintf(stderr,
+			    "pci_vtnet_poll_thread poll() error %d\n",
+			    errno);
+			continue;
+		}
+		pthread_mutex_lock(&sc->vsc_mtx);
+		pci_vtnet_tap_rx(sc);
+		pthread_mutex_unlock(&sc->vsc_mtx);
+	}
+}
+#endif
+
+static void
+pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq)
+{
+	struct pci_vtnet_softc *sc = vsc;
+
+	/*
+	 * A qnotify means that the rx process can now begin
+	 */
+	if (sc->vsc_rx_ready == 0) {
+		sc->vsc_rx_ready = 1;
+	}
+}
+
+static void
+pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq)
+{
+	struct iovec iov[VTNET_MAXSEGS + 1];
+	int i, n;
+	int plen, tlen;
+
+	/*
+	 * Obtain chain of descriptors. The first one is
+	 * really the header descriptor, so we need to sum
+	 * up two lengths: packet length and transfer length.
+	 */
+	n = vq_getchain(vq, iov, VTNET_MAXSEGS, NULL);
+	assert(n >= 1 && n <= VTNET_MAXSEGS);
+	plen = 0;
+	tlen = iov[0].iov_len;
+	for (i = 1; i < n; i++) {
+		plen += iov[i].iov_len;
+		tlen += iov[i].iov_len;
+	}
+
+	DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, n));
+	pci_vtnet_tap_tx(sc, &iov[1], n - 1, plen);
+
+	/* chain is processed, release it and set tlen */
+	vq_relchain(vq, tlen);
+}
+
+static void
+pci_vtnet_ping_txq(void *vsc, struct vqueue_info *vq)
+{
+	struct pci_vtnet_softc *sc = vsc;
+
+	/*
+	 * Any ring entries to process?
+	 */
+	if (!vq_has_descs(vq))
+		return;
+
+	/* Signal the tx thread for processing */
+	pthread_mutex_lock(&sc->tx_mtx);
+	if (sc->tx_in_progress == 0)
+		pthread_cond_signal(&sc->tx_cond);
+	pthread_mutex_unlock(&sc->tx_mtx);
+}
+
+/*
+ * Thread which will handle processing of TX desc
+ */
+static void *
+pci_vtnet_tx_thread(void *param)
+{
+	struct pci_vtnet_softc *sc = param;
+	struct vqueue_info *vq;
+	int have_work, error;
+
+	vq = &sc->vsc_queues[VTNET_TXQ];
+
+	/*
+	 * Wait until the tx queue pointers have been initialised and
+	 * the first tx has been signaled.
+	 */
+	pthread_mutex_lock(&sc->tx_mtx);
+	error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx);
+	assert(error == 0);
+
+	for (;;) {
+		/* note - tx mutex is locked here */
+		do {
+			if (sc->resetting)
+				have_work = 0;
+			else
+				have_work = vq_has_descs(vq);
+
+			if (!have_work) {
+				sc->tx_in_progress = 0;
+				error = pthread_cond_wait(&sc->tx_cond,
+				    &sc->tx_mtx);
+				assert(error == 0);
+			}
+		} while (!have_work);
+		sc->tx_in_progress = 1;
+		pthread_mutex_unlock(&sc->tx_mtx);
+
+		vq_startchains(vq);
+		do {
+			/*
+			 * Run through entries, placing them into
+			 * iovecs and sending when an end-of-packet
+			 * is found
+			 */
+			pci_vtnet_proctx(sc, vq);
+		} while (vq_has_descs(vq));
+
+		/*
+		 * Generate an interrupt if needed.
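+		 *
+		 * Note (a sketch, not part of this change): the handoff
+		 * between pci_vtnet_ping_txq() and this thread is the
+		 * standard predicate-under-mutex pattern, reduced to its
+		 * core with hypothetical names:
+		 *
+		 *	pthread_mutex_lock(&m);		// worker
+		 *	while (!have_work)
+		 *		pthread_cond_wait(&cv, &m);
+		 *	...
+		 *	pthread_cond_signal(&cv);	// notifier, &m held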
+		 */
+		vq_endchains(vq, 1);
+
+		pthread_mutex_lock(&sc->tx_mtx);
+	}
+}
+
+#ifdef notyet
+static void
+pci_vtnet_ping_ctlq(void *vsc, struct vqueue_info *vq)
+{
+
+	DPRINTF(("vtnet: control qnotify!\n\r"));
+}
+#endif
+
+#ifdef __FreeBSD__
+static int
+pci_vtnet_parsemac(char *mac_str, uint8_t *mac_addr)
+{
+	struct ether_addr *ea;
+	char *tmpstr;
+	char zero_addr[ETHER_ADDR_LEN] = { 0, 0, 0, 0, 0, 0 };
+
+	tmpstr = strsep(&mac_str, "=");
+
+	if ((mac_str != NULL) && (!strcmp(tmpstr, "mac"))) {
+		ea = ether_aton(mac_str);
+
+		if (ea == NULL || ETHER_IS_MULTICAST(ea->octet) ||
+		    memcmp(ea->octet, zero_addr, ETHER_ADDR_LEN) == 0) {
+			fprintf(stderr, "Invalid MAC %s\n", mac_str);
+			return (EINVAL);
+		} else
+			memcpy(mac_addr, ea->octet, ETHER_ADDR_LEN);
+	}
+
+	return (0);
+}
+#endif
+
+
+static int
+pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+#ifdef __FreeBSD__
+	MD5_CTX mdctx;
+	unsigned char digest[16];
+#else
+	uchar_t physaddr[DLPI_PHYSADDR_MAX];
+	size_t physaddrlen = DLPI_PHYSADDR_MAX;
+	int error;
+#endif
+	char nstr[80];
+	char tname[MAXCOMLEN + 1];
+	struct pci_vtnet_softc *sc;
+	const char *env_msi;
+	char *devname;
+	char *vtopts;
+	int mac_provided;
+	int use_msix;
+
+	sc = malloc(sizeof(struct pci_vtnet_softc));
+	memset(sc, 0, sizeof(struct pci_vtnet_softc));
+
+	pthread_mutex_init(&sc->vsc_mtx, NULL);
+
+	vi_softc_linkup(&sc->vsc_vs, &vtnet_vi_consts, sc, pi, sc->vsc_queues);
+	sc->vsc_queues[VTNET_RXQ].vq_qsize = VTNET_RINGSZ;
+	sc->vsc_queues[VTNET_RXQ].vq_notify = pci_vtnet_ping_rxq;
+	sc->vsc_queues[VTNET_TXQ].vq_qsize = VTNET_RINGSZ;
+	sc->vsc_queues[VTNET_TXQ].vq_notify = pci_vtnet_ping_txq;
+#ifdef notyet
+	sc->vsc_queues[VTNET_CTLQ].vq_qsize = VTNET_RINGSZ;
+	sc->vsc_queues[VTNET_CTLQ].vq_notify = pci_vtnet_ping_ctlq;
+#endif
+
+	/*
+	 * Use MSI if set by user
+	 */
+	use_msix = 1;
+	if ((env_msi = getenv("BHYVE_USE_MSI")) != NULL) {
+		if (strcasecmp(env_msi, "yes") == 0)
+			use_msix = 0;
+	}
+
+	/*
+	 * Attempt to open the tap device and read the MAC address
+	 * if specified
+	 */
+	mac_provided = 0;
+#ifdef __FreeBSD__
+	sc->vsc_tapfd = -1;
+#endif
+	if (opts != NULL) {
+		char tbuf[80];
+		int err;
+
+		devname = vtopts = strdup(opts);
+		(void) strsep(&vtopts, ",");
+
+#ifdef __FreeBSD__
+		if (vtopts != NULL) {
+			err = pci_vtnet_parsemac(vtopts, sc->vsc_config.mac);
+			if (err != 0) {
+				free(devname);
+				return (err);
+			}
+			mac_provided = 1;
+		}
+#endif
+
+		strcpy(tbuf, "/dev/");
+		strlcat(tbuf, devname, sizeof(tbuf));
+
+		free(devname);
+
+#ifdef __FreeBSD__
+		sc->vsc_tapfd = open(tbuf, O_RDWR);
+		if (sc->vsc_tapfd == -1) {
+			WPRINTF(("open of tap device %s failed\n", tbuf));
+		} else {
+			/*
+			 * Set non-blocking and register for read
+			 * notifications with the event loop
+			 */
+			int opt = 1;
+			if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) {
+				WPRINTF(("tap device O_NONBLOCK failed\n"));
+				close(sc->vsc_tapfd);
+				sc->vsc_tapfd = -1;
+			}
+
+			sc->vsc_mevp = mevent_add(sc->vsc_tapfd,
+			    EVF_READ,
+			    pci_vtnet_tap_callback,
+			    sc);
+			if (sc->vsc_mevp == NULL) {
+				WPRINTF(("Could not register event\n"));
+				close(sc->vsc_tapfd);
+				sc->vsc_tapfd = -1;
+			}
+		}
+#else
+		if (dlpi_open(opts, &sc->vsc_dhp, DLPI_RAW) != DLPI_SUCCESS) {
+			WPRINTF(("open of vnic device %s failed\n", opts));
+		}
+
+		if (dlpi_get_physaddr(sc->vsc_dhp, DL_CURR_PHYS_ADDR,
+		    physaddr, &physaddrlen) != DLPI_SUCCESS) {
+			WPRINTF(("read MAC address of vnic device %s failed\n",
+			    opts));
+		}
+		if (physaddrlen != ETHERADDRL) {
+			WPRINTF(("bad MAC address len %zu on vnic device "
+			    "%s\n", physaddrlen, opts));
+		}
+		memcpy(sc->vsc_config.mac, physaddr, ETHERADDRL);
+
+		if (dlpi_bind(sc->vsc_dhp, DLPI_ANY_SAP, NULL) !=
+		    DLPI_SUCCESS) {
+			WPRINTF(("bind of vnic device %s failed\n", opts));
+		}
+
+		if (dlpi_promiscon(sc->vsc_dhp, DL_PROMISC_PHYS) !=
+		    DLPI_SUCCESS) {
+			WPRINTF(("enable promiscuous mode (physical) of "
+			    "vnic device %s failed\n", opts));
+		}
+		if (dlpi_promiscon(sc->vsc_dhp, DL_PROMISC_SAP) !=
+		    DLPI_SUCCESS) {
+			WPRINTF(("enable promiscuous mode (SAP) of vnic "
+			    "device %s failed\n", opts));
+		}
+
+		sc->vsc_dlpifd = dlpi_fd(sc->vsc_dhp);
+
+		if (fcntl(sc->vsc_dlpifd, F_SETFL, O_NONBLOCK) < 0) {
+			WPRINTF(("enable O_NONBLOCK of vnic device %s "
+			    "failed\n", opts));
+			dlpi_close(sc->vsc_dhp);
+			sc->vsc_dlpifd = -1;
+		}
+
+		error = pthread_create(NULL, NULL, pci_vtnet_poll_thread, sc);
+		assert(error == 0);
+#endif
+	}
+
+#ifdef __FreeBSD__
+	/*
+	 * The default MAC address is the standard NetApp OUI of 00-a0-98,
+	 * followed by an MD5 of the PCI slot/func number and dev name
+	 */
+	if (!mac_provided) {
+		snprintf(nstr, sizeof(nstr), "%d-%d-%s", pi->pi_slot,
+		    pi->pi_func, vmname);
+
+		MD5Init(&mdctx);
+		MD5Update(&mdctx, nstr, strlen(nstr));
+		MD5Final(digest, &mdctx);
+
+		sc->vsc_config.mac[0] = 0x00;
+		sc->vsc_config.mac[1] = 0xa0;
+		sc->vsc_config.mac[2] = 0x98;
+		sc->vsc_config.mac[3] = digest[0];
+		sc->vsc_config.mac[4] = digest[1];
+		sc->vsc_config.mac[5] = digest[2];
+	}
+#endif
+
+	/* initialize config space */
+	pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET);
+	pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
+	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
+	pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET);
+
+	/* link always up */
+	sc->vsc_config.status = 1;
+
+	/* use BAR 1 to map MSI-X table and PBA, if we're using MSI-X */
+	if (vi_intr_init(&sc->vsc_vs, 1, use_msix))
+		return (1);
+
+	/* use BAR 0 to map config regs in IO space */
+	vi_set_io_bar(&sc->vsc_vs, 0);
+
+	sc->resetting = 0;
+
+	sc->rx_in_progress = 0;
+	pthread_mutex_init(&sc->rx_mtx, NULL);
+
+	/*
+	 * Initialize tx semaphore & spawn TX processing thread.
+	 * As of now, only one thread for TX desc processing is
+	 * spawned.
+	 */
+	sc->tx_in_progress = 0;
+	pthread_mutex_init(&sc->tx_mtx, NULL);
+	pthread_cond_init(&sc->tx_cond, NULL);
+	pthread_create(&sc->tx_tid, NULL, pci_vtnet_tx_thread, (void *)sc);
+	snprintf(tname, sizeof(tname), "%s vtnet%d tx", vmname, pi->pi_slot);
+	pthread_set_name_np(sc->tx_tid, tname);
+
+	return (0);
+}
+
+static int
+pci_vtnet_cfgwrite(void *vsc, int offset, int size, uint32_t value)
+{
+	struct pci_vtnet_softc *sc = vsc;
+	void *ptr;
+
+	if (offset < 6) {
+		assert(offset + size <= 6);
+		/*
+		 * The driver is allowed to change the MAC address
+		 */
+		ptr = &sc->vsc_config.mac[offset];
+		memcpy(ptr, &value, size);
+	} else {
+		DPRINTF(("vtnet: write to readonly reg %d\n\r", offset));
+		return (1);
+	}
+	return (0);
+}
+
+static int
+pci_vtnet_cfgread(void *vsc, int offset, int size, uint32_t *retval)
+{
+	struct pci_vtnet_softc *sc = vsc;
+	void *ptr;
+
+	ptr = (uint8_t *)&sc->vsc_config + offset;
+	memcpy(retval, ptr, size);
+	return (0);
+}
+
+struct pci_devemu pci_de_vnet = {
+	.pe_emu =	"virtio-net",
+	.pe_init =	pci_vtnet_init,
+	.pe_barwrite =	vi_pci_write,
+	.pe_barread =	vi_pci_read
+};
+PCI_EMUL_SET(pci_de_vnet);
diff --git a/usr/src/cmd/bhyve/pci_virtio_viona.c b/usr/src/cmd/bhyve/pci_virtio_viona.c
new file mode 100644
index 0000000000..f4d5d528be
--- /dev/null
+++ b/usr/src/cmd/bhyve/pci_virtio_viona.c
@@ -0,0 +1,706 @@
+/*
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + */ + +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "bhyverun.h" +#include "pci_emul.h" +#include "virtio.h" + +#define VIONA_RINGSZ 1024 + +/* + * PCI config-space register offsets + */ +#define VIONA_R_CFG0 24 +#define VIONA_R_CFG1 25 +#define VIONA_R_CFG2 26 +#define VIONA_R_CFG3 27 +#define VIONA_R_CFG4 28 +#define VIONA_R_CFG5 29 +#define VIONA_R_CFG6 30 +#define VIONA_R_CFG7 31 +#define VIONA_R_MAX 31 + +#define VIONA_REGSZ VIONA_R_MAX+1 + +/* + * Host capabilities + */ +#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */ +#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */ +#define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */ + +#define VIONA_S_HOSTCAPS \ + (VIRTIO_NET_F_MAC | \ + VIRTIO_NET_F_MRG_RXBUF | \ + VIRTIO_NET_F_STATUS) + +/* + * Queue definitions. + */ +#define VIONA_RXQ 0 +#define VIONA_TXQ 1 +#define VIONA_CTLQ 2 + +#define VIONA_MAXQ 3 + +/* + * Debug printf + */ +static int pci_viona_debug; +#define DPRINTF(params) if (pci_viona_debug) printf params +#define WPRINTF(params) printf params + +/* + * Per-device softc + */ +struct pci_viona_softc { + struct pci_devinst *vsc_pi; + pthread_mutex_t vsc_mtx; + + int vsc_curq; + int vsc_status; + int vsc_isr; + + datalink_id_t vsc_linkid; + char vsc_linkname[MAXLINKNAMELEN]; + int vsc_vnafd; + + uint32_t vsc_features; + uint8_t vsc_macaddr[6]; + + uint64_t vsc_pfn[VIONA_MAXQ]; + uint16_t vsc_msix_table_idx[VIONA_MAXQ]; + /* + * Flag to see if host is already sending data out. 
+ * If it is, there is no need to take the tx lock and signal the tx
+ * thread again for new data.
+ */
+	boolean_t	vsc_tx_kick_lock_held;
+
+	pthread_t	tx_tid;
+	pthread_mutex_t	tx_mtx;
+	pthread_cond_t	tx_cond;
+};
+#define	viona_ctx(sc)	((sc)->vsc_pi->pi_vmctx)
+
+/*
+ * Return the size of IO BAR that maps virtio header and device specific
+ * region. The size would vary depending on whether MSI-X is enabled or
+ * not.
+ */
+static uint64_t
+pci_viona_iosize(struct pci_devinst *pi)
+{
+	if (pci_msix_enabled(pi))
+		return (VIONA_REGSZ);
+	else
+		return (VIONA_REGSZ - (VTCFG_R_CFG1 - VTCFG_R_MSIX));
+}
+
+static uint16_t
+pci_viona_qsize(int qnum)
+{
+	/* XXX no ctl queue currently */
+	if (qnum == VIONA_CTLQ) {
+		return (0);
+	}
+
+	/* XXX fixed currently. Maybe different for tx/rx/ctl */
+	return (VIONA_RINGSZ);
+}
+
+static void
+pci_viona_ring_reset(struct pci_viona_softc *sc, int ring)
+{
+	int error;
+
+	assert(ring < VIONA_MAXQ);
+
+	switch (ring) {
+	case VIONA_RXQ:
+		error = ioctl(sc->vsc_vnafd, VNA_IOC_RX_RING_RESET);
+		if (error != 0) {
+			WPRINTF(("ioctl viona rx ring reset failed %d\n",
+			    error));
+		} else {
+			sc->vsc_pfn[VIONA_RXQ] = 0;
+		}
+		break;
+	case VIONA_TXQ:
+		error = ioctl(sc->vsc_vnafd, VNA_IOC_TX_RING_RESET);
+		if (error != 0) {
+			WPRINTF(("ioctl viona tx ring reset failed %d\n",
+			    error));
+		} else {
+			sc->vsc_pfn[VIONA_TXQ] = 0;
+		}
+		break;
+	case VIONA_CTLQ:
+	default:
+		break;
+	}
+}
+
+static void
+pci_viona_update_status(struct pci_viona_softc *sc, uint32_t value)
+{
+
+	if (value == 0) {
+		DPRINTF(("viona: device reset requested!\n"));
+		pci_viona_ring_reset(sc, VIONA_RXQ);
+		pci_viona_ring_reset(sc, VIONA_TXQ);
+	}
+
+	sc->vsc_status = value;
+}
+
+static void *
+pci_viona_poll_thread(void *param)
+{
+	struct pci_viona_softc *sc = param;
+	pollfd_t pollset;
+	int error;
+
+	pollset.fd = sc->vsc_vnafd;
+	pollset.events = POLLIN | POLLOUT;
+
+	for (;;) {
+		if (poll(&pollset, 1, -1) < 0) {
+			if (errno == EINTR || errno == EAGAIN) {
+				continue;
+			} else {
+				WPRINTF(("pci_viona_poll_thread poll() "
+				    "error %d\n", errno));
+				break;
+			}
+		}
+		if (pollset.revents & POLLIN) {
+			pci_generate_msix(sc->vsc_pi,
+			    sc->vsc_msix_table_idx[VIONA_RXQ]);
+			error = ioctl(sc->vsc_vnafd, VNA_IOC_RX_INTR_CLR);
+			if (error != 0) {
+				WPRINTF(("ioctl viona rx intr clear failed"
+				    " %d\n", error));
+			}
+		}
+
+		if (pollset.revents & POLLOUT) {
+			pci_generate_msix(sc->vsc_pi,
+			    sc->vsc_msix_table_idx[VIONA_TXQ]);
+			error = ioctl(sc->vsc_vnafd, VNA_IOC_TX_INTR_CLR);
+			if (error != 0) {
+				WPRINTF(("ioctl viona tx intr clear failed"
+				    " %d\n", error));
+			}
+		}
+	}
+
+	pthread_exit(NULL);
+}
+
+static void
+pci_viona_ping_rxq(struct pci_viona_softc *sc)
+{
+	int error;
+
+	error = ioctl(sc->vsc_vnafd, VNA_IOC_RX_RING_KICK);
+	if (error != 0) {
+		WPRINTF(("ioctl viona rx ring kick failed %d\n", error));
+	}
+}
+
+static void *
+pci_viona_tx_thread(void *param)
+{
+	struct pci_viona_softc *sc = (struct pci_viona_softc *)param;
+	int error;
+
+	pthread_mutex_lock(&sc->tx_mtx);
+	for (;;) {
+		error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx);
+		assert(error == 0);
+		sc->vsc_tx_kick_lock_held = B_TRUE;
+		error = ioctl(sc->vsc_vnafd, VNA_IOC_TX_RING_KICK);
+		if (error != 0) {
+			WPRINTF(("ioctl viona tx ring kick failed %d\n",
+			    error));
+		}
+		sc->vsc_tx_kick_lock_held = B_FALSE;
+	}
+	pthread_mutex_unlock(&sc->tx_mtx);
+}
+
+static void
+pci_viona_ping_txq(struct pci_viona_softc *sc)
+{
+	/* Signal the tx thread for processing */
+	if (sc->vsc_tx_kick_lock_held)
+		return;
+
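+	/*
+	 * Note (interpretation, not part of this change): the flag above is
+	 * read without tx_mtx as a fast-path hint.  A stale read appears
+	 * benign: at worst it costs one redundant signal, or defers the
+	 * ring kick to the next queue notify.
+	 */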
+	pthread_mutex_lock(&sc->tx_mtx);
+	pthread_cond_signal(&sc->tx_cond);
+	pthread_mutex_unlock(&sc->tx_mtx);
+}
+
+static void
+pci_viona_ping_ctlq(struct pci_viona_softc *sc)
+{
+	DPRINTF(("viona: control qnotify!\n\r"));
+}
+
+static void
+pci_viona_ring_init(struct pci_viona_softc *sc, uint64_t pfn)
+{
+	int			qnum = sc->vsc_curq;
+	vioc_ring_init_t	vna_ri;
+	int			error;
+
+	assert(qnum < VIONA_MAXQ);
+
+	sc->vsc_pfn[qnum] = (pfn << VRING_PFN);
+
+	vna_ri.ri_qsize = pci_viona_qsize(qnum);
+	vna_ri.ri_qaddr = (pfn << VRING_PFN);
+
+	switch (qnum) {
+	case VIONA_RXQ:
+		error = ioctl(sc->vsc_vnafd, VNA_IOC_RX_RING_INIT, &vna_ri);
+		if (error != 0) {
+			WPRINTF(("ioctl viona rx ring init failed %d\n",
+			    error));
+		}
+		break;
+	case VIONA_TXQ:
+		error = ioctl(sc->vsc_vnafd, VNA_IOC_TX_RING_INIT, &vna_ri);
+		if (error != 0) {
+			WPRINTF(("ioctl viona tx ring init failed %d\n",
+			    error));
+		}
+		break;
+	case VIONA_CTLQ:
+	default:
+		break;
+	}
+}
+
+static int
+pci_viona_viona_init(struct vmctx *ctx, struct pci_viona_softc *sc)
+{
+	vioc_create_t	vna_create;
+	int		error;
+
+	sc->vsc_vnafd = open("/devices/pseudo/viona@0:ctl", O_RDWR | O_EXCL);
+	if (sc->vsc_vnafd == -1) {
+		WPRINTF(("open viona ctl failed\n"));
+		return (-1);
+	}
+
+	vna_create.c_linkid = sc->vsc_linkid;
+	strlcpy(vna_create.c_vmname, vmname,
+	    sizeof (vna_create.c_vmname));
+	vm_get_memory_seg(ctx, 1 * (1024 * 1024UL), &vna_create.c_lomem_size,
+	    NULL);
+	vm_get_memory_seg(ctx, 4 * (1024 * 1024 * 1024UL),
+	    &vna_create.c_himem_size, NULL);
+	error = ioctl(sc->vsc_vnafd, VNA_IOC_CREATE, &vna_create);
+	if (error != 0) {
+		WPRINTF(("ioctl viona create failed %d\n", error));
+		return (-1);
+	}
+
+	return (0);
+}
+
+static int
+pci_viona_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+	dladm_handle_t		handle;
+	dladm_status_t		status;
+	dladm_vnic_attr_t	attr;
+	char			errmsg[DLADM_STRSIZE];
+	int error;
+	struct pci_viona_softc *sc;
+	int i;
+
+	if (opts == NULL) {
+		printf("virtio-viona: vnic required\n");
+		return (1);
+	}
+
+	sc = malloc(sizeof (struct pci_viona_softc));
+	memset(sc, 0, sizeof (struct pci_viona_softc));
+
+	pi->pi_arg = sc;
+	sc->vsc_pi = pi;
+
+	pthread_mutex_init(&sc->vsc_mtx, NULL);
+
+	strlcpy(sc->vsc_linkname, opts, MAXLINKNAMELEN);
+
+	if ((status = dladm_open(&handle)) != DLADM_STATUS_OK) {
+		WPRINTF(("could not open /dev/dld\n"));
+		free(sc);
+		return (1);
+	}
+
+	if ((status = dladm_name2info(handle, sc->vsc_linkname,
+	    &sc->vsc_linkid, NULL, NULL, NULL)) != DLADM_STATUS_OK) {
+		WPRINTF(("dladm_name2info() for %s failed: %s\n", opts,
+		    dladm_status2str(status, errmsg)));
+		dladm_close(handle);
+		free(sc);
+		return (1);
+	}
+
+	if ((status = dladm_vnic_info(handle, sc->vsc_linkid, &attr,
+	    DLADM_OPT_ACTIVE)) != DLADM_STATUS_OK) {
+		WPRINTF(("dladm_vnic_info() for %s failed: %s\n", opts,
+		    dladm_status2str(status, errmsg)));
+		dladm_close(handle);
+		free(sc);
+		return (1);
+	}
+
+	sc->vsc_tx_kick_lock_held = B_FALSE;
+	memcpy(sc->vsc_macaddr, attr.va_mac_addr, ETHERADDRL);
+
+	dladm_close(handle);
+
+	error = pci_viona_viona_init(ctx, sc);
+	if (error != 0) {
+		free(sc);
+		return (1);
+	}
+
+	error = pthread_create(NULL, NULL, pci_viona_poll_thread, sc);
+	assert(error == 0);
+
+	/* initialize config space */
+	pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET);
+	pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
+	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
+	pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET);
+
+	/* MSI-X support */
+	for (i = 0; i < VIONA_MAXQ; i++)
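+		/*
+		 * Note (not part of this change): VIRTIO_MSI_NO_VECTOR is
+		 * 0xffff per the virtio PCI spec; it marks a queue as having
+		 * no MSI-X vector assigned until the guest programs one via
+		 * VTCFG_R_QVEC.
+		 */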
sc->vsc_msix_table_idx[i] = VIRTIO_MSI_NO_VECTOR; + + /* + * BAR 1 used to map MSI-X table and PBA + */ + if (pci_emul_add_msixcap(pi, VIONA_MAXQ, 1)) { + free(sc); + return (1); + } + + pci_emul_alloc_bar(pi, 0, PCIBAR_IO, VIONA_REGSZ); + + /* + * Initialize tx semaphore & spawn TX processing thread + * As of now, only one thread for TX desc processing is + * spawned. + */ + pthread_mutex_init(&sc->tx_mtx, NULL); + pthread_cond_init(&sc->tx_cond, NULL); + pthread_create(&sc->tx_tid, NULL, pci_viona_tx_thread, (void *)sc); + + return (0); +} + +/* + * Function pointer array to handle queue notifications + */ +static void (*pci_viona_qnotify[VIONA_MAXQ])(struct pci_viona_softc *) = { + pci_viona_ping_rxq, + pci_viona_ping_txq, + pci_viona_ping_ctlq +}; + +static uint64_t +viona_adjust_offset(struct pci_devinst *pi, uint64_t offset) +{ + /* + * Device specific offsets used by guest would change based on + * whether MSI-X capability is enabled or not + */ + if (!pci_msix_enabled(pi)) { + if (offset >= VTCFG_R_MSIX) + return (offset + (VTCFG_R_CFG1 - VTCFG_R_MSIX)); + } + + return (offset); +} + +static void +pci_viona_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value) +{ + struct pci_viona_softc *sc = pi->pi_arg; + void *ptr; + int err = 0; + + if (baridx == pci_msix_table_bar(pi) || + baridx == pci_msix_pba_bar(pi)) { + pci_emul_msix_twrite(pi, offset, size, value); + return; + } + + assert(baridx == 0); + + if (offset + size > pci_viona_iosize(pi)) { + DPRINTF(("viona_write: 2big, offset %ld size %d\n", + offset, size)); + return; + } + + pthread_mutex_lock(&sc->vsc_mtx); + + offset = viona_adjust_offset(pi, offset); + + switch (offset) { + case VTCFG_R_GUESTCAP: + assert(size == 4); + err = ioctl(sc->vsc_vnafd, VNA_IOC_SET_FEATURES, &value); + if (err != 0) + WPRINTF(("ioctl feature negotiation returned" + " err = %d\n", err)); + break; + case VTCFG_R_PFN: + assert(size == 4); + pci_viona_ring_init(sc, value); + break; + case VTCFG_R_QSEL: + assert(size == 2); + assert(value < VIONA_MAXQ); + sc->vsc_curq = value; + break; + case VTCFG_R_QNOTIFY: + assert(size == 2); + assert(value < VIONA_MAXQ); + (*pci_viona_qnotify[value])(sc); + break; + case VTCFG_R_STATUS: + assert(size == 1); + pci_viona_update_status(sc, value); + break; + case VTCFG_R_CFGVEC: + assert(size == 2); + sc->vsc_msix_table_idx[VIONA_CTLQ] = value; + break; + case VTCFG_R_QVEC: + assert(size == 2); + assert(sc->vsc_curq != VIONA_CTLQ); + sc->vsc_msix_table_idx[sc->vsc_curq] = value; + break; + case VIONA_R_CFG0: + case VIONA_R_CFG1: + case VIONA_R_CFG2: + case VIONA_R_CFG3: + case VIONA_R_CFG4: + case VIONA_R_CFG5: + assert((size + offset) <= (VIONA_R_CFG5 + 1)); + ptr = &sc->vsc_macaddr[offset - VIONA_R_CFG0]; + /* + * The driver is allowed to change the MAC address + */ + sc->vsc_macaddr[offset - VIONA_R_CFG0] = value; + if (size == 1) { + *(uint8_t *)ptr = value; + } else if (size == 2) { + *(uint16_t *)ptr = value; + } else { + *(uint32_t *)ptr = value; + } + break; + case VTCFG_R_HOSTCAP: + case VTCFG_R_QNUM: + case VTCFG_R_ISR: + case VIONA_R_CFG6: + case VIONA_R_CFG7: + DPRINTF(("viona: write to readonly reg %ld\n\r", offset)); + break; + default: + DPRINTF(("viona: unknown i/o write offset %ld\n\r", offset)); + value = 0; + break; + } + + pthread_mutex_unlock(&sc->vsc_mtx); +} + +uint64_t +pci_viona_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size) +{ + struct pci_viona_softc *sc = pi->pi_arg; + void 
*ptr; + uint64_t value; + int err = 0; + + if (baridx == pci_msix_table_bar(pi) || + baridx == pci_msix_pba_bar(pi)) { + return (pci_emul_msix_tread(pi, offset, size)); + } + + assert(baridx == 0); + + if (offset + size > pci_viona_iosize(pi)) { + DPRINTF(("viona_read: 2big, offset %ld size %d\n", + offset, size)); + return (0); + } + + pthread_mutex_lock(&sc->vsc_mtx); + + offset = viona_adjust_offset(pi, offset); + + switch (offset) { + case VTCFG_R_HOSTCAP: + assert(size == 4); + err = ioctl(sc->vsc_vnafd, VNA_IOC_GET_FEATURES, &value); + if (err != 0) + WPRINTF(("ioctl get host features returned" + " err = %d\n", err)); + break; + case VTCFG_R_GUESTCAP: + assert(size == 4); + value = sc->vsc_features; /* XXX never read ? */ + break; + case VTCFG_R_PFN: + assert(size == 4); + value = sc->vsc_pfn[sc->vsc_curq] >> VRING_PFN; + break; + case VTCFG_R_QNUM: + assert(size == 2); + value = pci_viona_qsize(sc->vsc_curq); + break; + case VTCFG_R_QSEL: + assert(size == 2); + value = sc->vsc_curq; /* XXX never read ? */ + break; + case VTCFG_R_QNOTIFY: + assert(size == 2); + value = sc->vsc_curq; /* XXX never read ? */ + break; + case VTCFG_R_STATUS: + assert(size == 1); + value = sc->vsc_status; + break; + case VTCFG_R_ISR: + assert(size == 1); + value = sc->vsc_isr; + sc->vsc_isr = 0; /* a read clears this flag */ + break; + case VTCFG_R_CFGVEC: + assert(size == 2); + value = sc->vsc_msix_table_idx[VIONA_CTLQ]; + break; + case VTCFG_R_QVEC: + assert(size == 2); + assert(sc->vsc_curq != VIONA_CTLQ); + value = sc->vsc_msix_table_idx[sc->vsc_curq]; + break; + case VIONA_R_CFG0: + case VIONA_R_CFG1: + case VIONA_R_CFG2: + case VIONA_R_CFG3: + case VIONA_R_CFG4: + case VIONA_R_CFG5: + assert((size + offset) <= (VIONA_R_CFG5 + 1)); + ptr = &sc->vsc_macaddr[offset - VIONA_R_CFG0]; + if (size == 1) { + value = *(uint8_t *)ptr; + } else if (size == 2) { + value = *(uint16_t *)ptr; + } else { + value = *(uint32_t *)ptr; + } + break; + case VIONA_R_CFG6: + assert(size != 4); + value = 0x01; /* XXX link always up */ + break; + case VIONA_R_CFG7: + assert(size == 1); + value = 0; /* XXX link status in LSB */ + break; + default: + DPRINTF(("viona: unknown i/o read offset %ld\n\r", offset)); + value = 0; + break; + } + + pthread_mutex_unlock(&sc->vsc_mtx); + + return (value); +} + +struct pci_devemu pci_de_viona = { + .pe_emu = "virtio-net-viona", + .pe_init = pci_viona_init, + .pe_barwrite = pci_viona_write, + .pe_barread = pci_viona_read +}; +PCI_EMUL_SET(pci_de_viona); diff --git a/usr/src/cmd/bhyve/pm.c b/usr/src/cmd/bhyve/pm.c new file mode 100644 index 0000000000..70c4f1fae8 --- /dev/null +++ b/usr/src/cmd/bhyve/pm.c @@ -0,0 +1,333 @@ +/*- + * Copyright (c) 2013 Advanced Computing Technologies LLC + * Written by: John H. Baldwin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pm.c 266125 2014-05-15 14:16:55Z jhb $");
+
+#include
+#include
+
+#include
+#include
+#ifndef __FreeBSD__
+#include
+#endif
+#include
+#include
+
+#include "acpi.h"
+#include "inout.h"
+#ifdef __FreeBSD__
+#include "mevent.h"
+#endif
+#include "pci_irq.h"
+#include "pci_lpc.h"
+
+static pthread_mutex_t pm_lock = PTHREAD_MUTEX_INITIALIZER;
+#ifdef __FreeBSD__
+static struct mevent *power_button;
+static sig_t old_power_handler;
+#endif
+
+/*
+ * Reset Control register at I/O port 0xcf9.  Bit 2 forces a system
+ * reset when it transitions from 0 to 1.  Bit 1 selects the type of
+ * reset to attempt: 0 selects a "soft" reset, and 1 selects a "hard"
+ * reset.
+ */
+static int
+reset_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+    uint32_t *eax, void *arg)
+{
+	static uint8_t	reset_control;
+
+	if (bytes != 1)
+		return (-1);
+	if (in)
+		*eax = reset_control;
+	else {
+		reset_control = *eax;
+
+		/* Treat hard and soft resets the same. */
+		if (reset_control & 0x4) {
+#ifdef __FreeBSD__
+			int error;
+
+			error = vm_suspend(ctx, VM_SUSPEND_RESET);
+			assert(error == 0 || errno == EALREADY);
+#else
+			exit(0);
+#endif
+		}
+	}
+	return (0);
+}
+INOUT_PORT(reset_reg, 0xCF9, IOPORT_F_INOUT, reset_handler);
+
+/*
+ * ACPI's SCI is a level-triggered interrupt.
+ */
+static int sci_active;
+
+static void
+sci_assert(struct vmctx *ctx)
+{
+
+	if (sci_active)
+		return;
+	vm_isa_assert_irq(ctx, SCI_INT, SCI_INT);
+	sci_active = 1;
+}
+
+static void
+sci_deassert(struct vmctx *ctx)
+{
+
+	if (!sci_active)
+		return;
+	vm_isa_deassert_irq(ctx, SCI_INT, SCI_INT);
+	sci_active = 0;
+}
+
+/*
+ * Power Management 1 Event Registers
+ *
+ * The only power management event supported is a power button upon
+ * receiving SIGTERM.
+ */
+static uint16_t pm1_enable, pm1_status;
+
+#define	PM1_TMR_STS	0x0001
+#define	PM1_BM_STS	0x0010
+#define	PM1_GBL_STS	0x0020
+#define	PM1_PWRBTN_STS	0x0100
+#define	PM1_SLPBTN_STS	0x0200
+#define	PM1_RTC_STS	0x0400
+#define	PM1_WAK_STS	0x8000
+
+#define	PM1_TMR_EN	0x0001
+#define	PM1_GBL_EN	0x0020
+#define	PM1_PWRBTN_EN	0x0100
+#define	PM1_SLPBTN_EN	0x0200
+#define	PM1_RTC_EN	0x0400
+
+static void
+sci_update(struct vmctx *ctx)
+{
+	int need_sci;
+
+	/*
+	 * See if the SCI should be active or not.
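+	 *
+	 * Note (a sketch, not part of this change): ACPI places each
+	 * event's enable bit at the same position as its status bit, so
+	 * the chain of tests below is equivalent to a single mask test:
+	 *
+	 *	need_sci = (pm1_enable & pm1_status &
+	 *	    (PM1_TMR_EN | PM1_GBL_EN | PM1_PWRBTN_EN |
+	 *	    PM1_SLPBTN_EN | PM1_RTC_EN)) != 0;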
+	 */
+	need_sci = 0;
+	if ((pm1_enable & PM1_TMR_EN) && (pm1_status & PM1_TMR_STS))
+		need_sci = 1;
+	if ((pm1_enable & PM1_GBL_EN) && (pm1_status & PM1_GBL_STS))
+		need_sci = 1;
+	if ((pm1_enable & PM1_PWRBTN_EN) && (pm1_status & PM1_PWRBTN_STS))
+		need_sci = 1;
+	if ((pm1_enable & PM1_SLPBTN_EN) && (pm1_status & PM1_SLPBTN_STS))
+		need_sci = 1;
+	if ((pm1_enable & PM1_RTC_EN) && (pm1_status & PM1_RTC_STS))
+		need_sci = 1;
+	if (need_sci)
+		sci_assert(ctx);
+	else
+		sci_deassert(ctx);
+}
+
+static int
+pm1_status_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+    uint32_t *eax, void *arg)
+{
+
+	if (bytes != 2)
+		return (-1);
+
+	pthread_mutex_lock(&pm_lock);
+	if (in)
+		*eax = pm1_status;
+	else {
+		/*
+		 * Writes are only permitted to clear certain bits by
+		 * writing 1 to those flags.
+		 */
+		pm1_status &= ~(*eax & (PM1_WAK_STS | PM1_RTC_STS |
+		    PM1_SLPBTN_STS | PM1_PWRBTN_STS | PM1_BM_STS));
+		sci_update(ctx);
+	}
+	pthread_mutex_unlock(&pm_lock);
+	return (0);
+}
+
+static int
+pm1_enable_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+    uint32_t *eax, void *arg)
+{
+
+	if (bytes != 2)
+		return (-1);
+
+	pthread_mutex_lock(&pm_lock);
+	if (in)
+		*eax = pm1_enable;
+	else {
+		/*
+		 * Only permit certain bits to be set.  We never use
+		 * the global lock, but ACPI-CA whines profusely if it
+		 * can't set GBL_EN.
+		 */
+		pm1_enable = *eax & (PM1_PWRBTN_EN | PM1_GBL_EN);
+		sci_update(ctx);
+	}
+	pthread_mutex_unlock(&pm_lock);
+	return (0);
+}
+INOUT_PORT(pm1_status, PM1A_EVT_ADDR, IOPORT_F_INOUT, pm1_status_handler);
+INOUT_PORT(pm1_enable, PM1A_EVT_ADDR + 2, IOPORT_F_INOUT, pm1_enable_handler);
+
+#ifdef __FreeBSD__
+static void
+power_button_handler(int signal, enum ev_type type, void *arg)
+{
+	struct vmctx *ctx;
+
+	ctx = arg;
+	pthread_mutex_lock(&pm_lock);
+	if (!(pm1_status & PM1_PWRBTN_STS)) {
+		pm1_status |= PM1_PWRBTN_STS;
+		sci_update(ctx);
+	}
+	pthread_mutex_unlock(&pm_lock);
+}
+#endif
+
+/*
+ * Power Management 1 Control Register
+ *
+ * This is mostly unimplemented except that we wish to handle writes that
+ * set SLP_EN to handle S5 (soft power off).
+ */
+static uint16_t pm1_control;
+
+#define	PM1_SCI_EN	0x0001
+#define	PM1_SLP_TYP	0x1c00
+#define	PM1_SLP_EN	0x2000
+#define	PM1_ALWAYS_ZERO	0xc003
+
+static int
+pm1_control_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+    uint32_t *eax, void *arg)
+{
+
+	if (bytes != 2)
+		return (-1);
+	if (in)
+		*eax = pm1_control;
+	else {
+		/*
+		 * Various bits are write-only or reserved, so force them
+		 * to zero in pm1_control.  Always preserve SCI_EN as OSPM
+		 * can never change it.
+		 */
+		pm1_control = (pm1_control & PM1_SCI_EN) |
+		    (*eax & ~(PM1_SLP_EN | PM1_ALWAYS_ZERO));
+
+		/*
+		 * If SLP_EN is set, check for S5.  Bhyve's _S5_ method
+		 * says that '5' should be stored in SLP_TYP for S5.
+		 */
+		if (*eax & PM1_SLP_EN) {
+			if ((pm1_control & PM1_SLP_TYP) >> 10 == 5) {
+#ifdef __FreeBSD__
+				int error;
+
+				error = vm_suspend(ctx, VM_SUSPEND_POWEROFF);
+				assert(error == 0 || errno == EALREADY);
+#else
+				exit(0);
+#endif
+			}
+		}
+	}
+	return (0);
+}
+INOUT_PORT(pm1_control, PM1A_CNT_ADDR, IOPORT_F_INOUT, pm1_control_handler);
+#ifdef __FreeBSD__
+SYSRES_IO(PM1A_EVT_ADDR, 8);
+#endif
+
+/*
+ * ACPI SMI Command Register
+ *
+ * This write-only register is used to enable and disable ACPI.
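+ *
+ * Note (a sketch, not part of this change): from the guest OSPM's side
+ * the usual enable handshake against this register would look like the
+ * following, using the macros defined for this file:
+ *
+ *	outb(SMI_CMD, BHYVE_ACPI_ENABLE);
+ *	while ((inw(PM1A_CNT_ADDR) & PM1_SCI_EN) == 0)
+ *		;	// spin until SCI_EN reads back as set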
+ */ +static int +smi_cmd_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + + assert(!in); + if (bytes != 1) + return (-1); + + pthread_mutex_lock(&pm_lock); + switch (*eax) { + case BHYVE_ACPI_ENABLE: + pm1_control |= PM1_SCI_EN; +#ifdef __FreeBSD__ + if (power_button == NULL) { + power_button = mevent_add(SIGTERM, EVF_SIGNAL, + power_button_handler, ctx); + old_power_handler = signal(SIGTERM, SIG_IGN); + } +#endif + break; + case BHYVE_ACPI_DISABLE: + pm1_control &= ~PM1_SCI_EN; +#ifdef __FreeBSD__ + if (power_button != NULL) { + mevent_delete(power_button); + power_button = NULL; + signal(SIGTERM, old_power_handler); + } +#endif + break; + } + pthread_mutex_unlock(&pm_lock); + return (0); +} +INOUT_PORT(smi_cmd, SMI_CMD, IOPORT_F_OUT, smi_cmd_handler); +#ifdef __FreeBSD__ +SYSRES_IO(SMI_CMD, 1); +#endif + +void +sci_init(struct vmctx *ctx) +{ + + /* + * Mark ACPI's SCI as level trigger and bump its use count + * in the PIRQ router. + */ + pci_irq_use(SCI_INT); + vm_isa_set_irq_trigger(ctx, SCI_INT, LEVEL_TRIGGER); +} diff --git a/usr/src/cmd/bhyve/pmtmr.c b/usr/src/cmd/bhyve/pmtmr.c new file mode 100644 index 0000000000..92ab24be57 --- /dev/null +++ b/usr/src/cmd/bhyve/pmtmr.c @@ -0,0 +1,212 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/pmtmr.c 259998 2013-12-28 04:01:05Z jhb $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. 
+ */ + +#include +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pmtmr.c 259998 2013-12-28 04:01:05Z jhb $"); + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#ifndef __FreeBSD__ +#include +#endif + +#include "acpi.h" +#include "inout.h" + +/* + * The ACPI Power Management timer is a free-running 24- or 32-bit + * timer with a frequency of 3.579545MHz + * + * This implementation will be 32-bits + */ + +#define PMTMR_FREQ 3579545 /* 3.579545MHz */ + +static pthread_mutex_t pmtmr_mtx; +static pthread_once_t pmtmr_once = PTHREAD_ONCE_INIT; + +static uint64_t pmtmr_old; + +static uint64_t pmtmr_tscf; +static uint64_t pmtmr_tsc_old; + +#ifdef __FreeBSD__ +static clockid_t clockid = CLOCK_UPTIME_FAST; +static struct timespec pmtmr_uptime_old; + +#define timespecsub(vvp, uvp) \ + do { \ + (vvp)->tv_sec -= (uvp)->tv_sec; \ + (vvp)->tv_nsec -= (uvp)->tv_nsec; \ + if ((vvp)->tv_nsec < 0) { \ + (vvp)->tv_sec--; \ + (vvp)->tv_nsec += 1000000000; \ + } \ + } while (0) + +static uint64_t +timespec_to_pmtmr(const struct timespec *tsnew, const struct timespec *tsold) +{ + struct timespec tsdiff; + int64_t nsecs; + + tsdiff = *tsnew; + timespecsub(&tsdiff, tsold); + nsecs = tsdiff.tv_sec * 1000000000 + tsdiff.tv_nsec; + assert(nsecs >= 0); + + return (nsecs * PMTMR_FREQ / 1000000000 + pmtmr_old); +} +#endif + +static uint64_t +tsc_to_pmtmr(uint64_t tsc_new, uint64_t tsc_old) +{ + + return ((tsc_new - tsc_old) * PMTMR_FREQ / pmtmr_tscf + pmtmr_old); +} + +static void +pmtmr_init(void) +{ +#ifdef __FreeBSD__ + size_t len; + int smp_tsc, err; + struct timespec tsnew, tsold = { 0 }; + + len = sizeof(smp_tsc); + err = sysctlbyname("kern.timecounter.smp_tsc", &smp_tsc, &len, NULL, 0); + assert(err == 0); + + if (smp_tsc) { + len = sizeof(pmtmr_tscf); + err = sysctlbyname("machdep.tsc_freq", &pmtmr_tscf, &len, + NULL, 0); + assert(err == 0); + + pmtmr_tsc_old = rdtsc(); + pmtmr_old = tsc_to_pmtmr(pmtmr_tsc_old, 0); + } else { + if (getenv("BHYVE_PMTMR_PRECISE") != NULL) + clockid = CLOCK_UPTIME; + + err = clock_gettime(clockid, &tsnew); + assert(err == 0); + + pmtmr_uptime_old = tsnew; + pmtmr_old = timespec_to_pmtmr(&tsnew, &tsold); + } +#else + kstat_ctl_t *kstat_ctl; + kstat_t *kstat; + kstat_named_t *kstat_cpu_freq; + + kstat_ctl = kstat_open(); + kstat = kstat_lookup(kstat_ctl, "cpu_info", 0, NULL); + kstat_read(kstat_ctl, kstat, NULL); + kstat_cpu_freq = kstat_data_lookup(kstat, "current_clock_Hz"); + pmtmr_tscf = kstat_cpu_freq->value.ul; + kstat_close(kstat_ctl); + + pmtmr_tsc_old = rdtsc(); + pmtmr_old = tsc_to_pmtmr(pmtmr_tsc_old, 0); +#endif + pthread_mutex_init(&pmtmr_mtx, NULL); +} + +static uint32_t +pmtmr_val(void) +{ + struct timespec tsnew; + uint64_t pmtmr_tsc_new; + uint64_t pmtmr_new; + int error; + + pthread_once(&pmtmr_once, pmtmr_init); + + pthread_mutex_lock(&pmtmr_mtx); + +#ifdef __FreeBSD__ + if (pmtmr_tscf) { + pmtmr_tsc_new = rdtsc(); + pmtmr_new = tsc_to_pmtmr(pmtmr_tsc_new, pmtmr_tsc_old); + pmtmr_tsc_old = pmtmr_tsc_new; + } else { + error = clock_gettime(clockid, &tsnew); + assert(error == 0); + + pmtmr_new = timespec_to_pmtmr(&tsnew, &pmtmr_uptime_old); + pmtmr_uptime_old = tsnew; + } +#else + pmtmr_tsc_new = rdtsc(); + pmtmr_new = tsc_to_pmtmr(pmtmr_tsc_new, pmtmr_tsc_old); + pmtmr_tsc_old = pmtmr_tsc_new; +#endif + pmtmr_old = pmtmr_new; + + pthread_mutex_unlock(&pmtmr_mtx); + + return (pmtmr_new); +} + +static int +pmtmr_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + assert(in == 1); + + 
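+	/*
+	 * Note (worked example, not part of this change): tsc_to_pmtmr()
+	 * rescales a TSC delta into 3.579545 MHz ticks.  With a
+	 * hypothetical 2.0 GHz TSC, a delta of 2000000000 cycles (one
+	 * second) gives 2000000000 * 3579545 / 2000000000 = 3579545 ticks;
+	 * the uint32_t return value of pmtmr_val() then wraps modulo 2^32,
+	 * as a 32-bit ACPI PM timer should.
+	 */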
if (bytes != 4) + return (-1); + + *eax = pmtmr_val(); + + return (0); +} + +INOUT_PORT(pmtmr, IO_PMTMR, IOPORT_F_IN, pmtmr_handler); diff --git a/usr/src/cmd/bhyve/post.c b/usr/src/cmd/bhyve/post.c new file mode 100644 index 0000000000..dcb481aac4 --- /dev/null +++ b/usr/src/cmd/bhyve/post.c @@ -0,0 +1,53 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/post.c 260206 2014-01-02 21:26:59Z jhb $ + */ + +#include +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/post.c 260206 2014-01-02 21:26:59Z jhb $"); + +#include + +#include + +#include "inout.h" +#include "pci_lpc.h" + +static int +post_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + assert(in == 1); + + if (bytes != 1) + return (-1); + + *eax = 0xff; /* return some garbage */ + return (0); +} + +INOUT_PORT(post, 0x84, IOPORT_F_IN, post_data_handler); +SYSRES_IO(0x84, 1); diff --git a/usr/src/cmd/bhyve/ps2kbd.c b/usr/src/cmd/bhyve/ps2kbd.c new file mode 100644 index 0000000000..22e566ac21 --- /dev/null +++ b/usr/src/cmd/bhyve/ps2kbd.c @@ -0,0 +1,418 @@ +/*- + * Copyright (c) 2015 Tycho Nightingale + * Copyright (c) 2015 Nahanni Systems Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "atkbdc.h" +#include "console.h" + +/* keyboard device commands */ +#define PS2KC_RESET_DEV 0xff +#define PS2KC_DISABLE 0xf5 +#define PS2KC_ENABLE 0xf4 +#define PS2KC_SET_TYPEMATIC 0xf3 +#define PS2KC_SEND_DEV_ID 0xf2 +#define PS2KC_SET_SCANCODE_SET 0xf0 +#define PS2KC_ECHO 0xee +#define PS2KC_SET_LEDS 0xed + +#define PS2KC_BAT_SUCCESS 0xaa +#define PS2KC_ACK 0xfa + +#define PS2KBD_FIFOSZ 16 + +struct fifo { + uint8_t buf[PS2KBD_FIFOSZ]; + int rindex; /* index to read from */ + int windex; /* index to write to */ + int num; /* number of bytes in the fifo */ + int size; /* size of the fifo */ +}; + +struct ps2kbd_softc { + struct atkbdc_softc *atkbdc_sc; + pthread_mutex_t mtx; + + bool enabled; + struct fifo fifo; + + uint8_t curcmd; /* current command for next byte */ +}; + +static void +fifo_init(struct ps2kbd_softc *sc) +{ + struct fifo *fifo; + + fifo = &sc->fifo; + fifo->size = sizeof(((struct fifo *)0)->buf); +} + +static void +fifo_reset(struct ps2kbd_softc *sc) +{ + struct fifo *fifo; + + fifo = &sc->fifo; + bzero(fifo, sizeof(struct fifo)); + fifo->size = sizeof(((struct fifo *)0)->buf); +} + +static int +fifo_available(struct ps2kbd_softc *sc) +{ + struct fifo *fifo; + + fifo = &sc->fifo; + return (fifo->num < fifo->size); +} + +static void +fifo_put(struct ps2kbd_softc *sc, uint8_t val) +{ + struct fifo *fifo; + + fifo = &sc->fifo; + if (fifo->num < fifo->size) { + fifo->buf[fifo->windex] = val; + fifo->windex = (fifo->windex + 1) % fifo->size; + fifo->num++; + } +} + +static int +fifo_get(struct ps2kbd_softc *sc, uint8_t *val) +{ + struct fifo *fifo; + + fifo = &sc->fifo; + if (fifo->num > 0) { + *val = fifo->buf[fifo->rindex]; + fifo->rindex = (fifo->rindex + 1) % fifo->size; + fifo->num--; + return (0); + } + + return (-1); +} + +int +ps2kbd_read(struct ps2kbd_softc *sc, uint8_t *val) +{ + int retval; + + pthread_mutex_lock(&sc->mtx); + retval = fifo_get(sc, val); + pthread_mutex_unlock(&sc->mtx); + + return (retval); +} + +void +ps2kbd_write(struct ps2kbd_softc *sc, uint8_t val) +{ + pthread_mutex_lock(&sc->mtx); + if (sc->curcmd) { + switch (sc->curcmd) { + case PS2KC_SET_TYPEMATIC: + fifo_put(sc, PS2KC_ACK); + break; + case PS2KC_SET_SCANCODE_SET: + fifo_put(sc, PS2KC_ACK); + break; + case PS2KC_SET_LEDS: + fifo_put(sc, PS2KC_ACK); + break; + default: + fprintf(stderr, "Unhandled ps2 keyboard current " + "command byte 0x%02x\n", val); + break; + } + sc->curcmd = 0; + } else { + switch (val) { + case PS2KC_RESET_DEV: + fifo_reset(sc); + fifo_put(sc, PS2KC_ACK); + fifo_put(sc, PS2KC_BAT_SUCCESS); + break; + case PS2KC_DISABLE: + sc->enabled = false; + fifo_put(sc, PS2KC_ACK); + break; + case PS2KC_ENABLE: + sc->enabled = true; + fifo_reset(sc); + fifo_put(sc, PS2KC_ACK); + break; + case PS2KC_SET_TYPEMATIC: + sc->curcmd = val; + fifo_put(sc, PS2KC_ACK); + break; + case PS2KC_SEND_DEV_ID: + fifo_put(sc, PS2KC_ACK); + 
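+		/*
+		 * Note (not part of this change): 0xab, 0x83 is the
+		 * identify response of a standard MF2 PS/2 keyboard.
+		 */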
fifo_put(sc, 0xab); + fifo_put(sc, 0x83); + break; + case PS2KC_SET_SCANCODE_SET: + sc->curcmd = val; + fifo_put(sc, PS2KC_ACK); + break; + case PS2KC_ECHO: + fifo_put(sc, PS2KC_ECHO); + break; + case PS2KC_SET_LEDS: + sc->curcmd = val; + fifo_put(sc, PS2KC_ACK); + break; + default: + fprintf(stderr, "Unhandled ps2 keyboard command " + "0x%02x\n", val); + break; + } + } + pthread_mutex_unlock(&sc->mtx); +} + +/* + * Translate keysym to type 2 scancode and insert into keyboard buffer. + */ +static void +ps2kbd_keysym_queue(struct ps2kbd_softc *sc, + int down, uint32_t keysym) +{ + /* ASCII to type 2 scancode lookup table */ + const uint8_t translation[128] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x29, 0x16, 0x52, 0x26, 0x25, 0x2e, 0x3d, 0x52, + 0x46, 0x45, 0x3e, 0x55, 0x41, 0x4e, 0x49, 0x4a, + 0x45, 0x16, 0x1e, 0x26, 0x25, 0x2e, 0x36, 0x3d, + 0x3e, 0x46, 0x4c, 0x4c, 0x41, 0x55, 0x49, 0x4a, + 0x1e, 0x1c, 0x32, 0x21, 0x23, 0x24, 0x2b, 0x34, + 0x33, 0x43, 0x3b, 0x42, 0x4b, 0x3a, 0x31, 0x44, + 0x4d, 0x15, 0x2d, 0x1b, 0x2c, 0x3c, 0x2a, 0x1d, + 0x22, 0x35, 0x1a, 0x54, 0x5d, 0x5b, 0x36, 0x4e, + 0x0e, 0x1c, 0x32, 0x21, 0x23, 0x24, 0x2b, 0x34, + 0x33, 0x43, 0x3b, 0x42, 0x4b, 0x3a, 0x31, 0x44, + 0x4d, 0x15, 0x2d, 0x1b, 0x2c, 0x3c, 0x2a, 0x1d, + 0x22, 0x35, 0x1a, 0x54, 0x5d, 0x5b, 0x0e, 0x00, + }; + + assert(pthread_mutex_isowned_np(&sc->mtx)); + + switch (keysym) { + case 0x0 ... 0x7f: + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, translation[keysym]); + break; + case 0xff08: /* Back space */ + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x66); + break; + case 0xff09: /* Tab */ + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x0d); + break; + case 0xff0d: /* Return */ + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x5a); + break; + case 0xff1b: /* Escape */ + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x76); + break; + case 0xff51: /* Left arrow */ + fifo_put(sc, 0xe0); + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x6b); + break; + case 0xff52: /* Up arrow */ + fifo_put(sc, 0xe0); + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x75); + break; + case 0xff53: /* Right arrow */ + fifo_put(sc, 0xe0); + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x74); + break; + case 0xff54: /* Down arrow */ + fifo_put(sc, 0xe0); + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x72); + break; + case 0xffbe: /* F1 */ + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x05); + break; + case 0xffbf: /* F2 */ + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x06); + break; + case 0xffc0: /* F3 */ + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x04); + break; + case 0xffc1: /* F4 */ + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x0c); + break; + case 0xffc2: /* F5 */ + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x03); + break; + case 0xffc3: /* F6 */ + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x0b); + break; + case 0xffc4: /* F7 */ + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x83); + break; + case 0xffc5: /* F8 */ + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x0a); + break; + case 0xffc6: /* F9 */ + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x01); + break; + case 0xffc7: /* F10 */ + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x09); + break; + case 0xffc8: /* F11 */ + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x78); + break; + case 0xffc9: /* F12 */ + if (!down) + fifo_put(sc, 0xf0); + 
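+		/*
+		 * Note (not part of this change): in scan code set 2 a key
+		 * release is the make code prefixed with 0xf0, which is why
+		 * each case above queues 0xf0 first when !down.
+		 */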
fifo_put(sc, 0x07); + break; + case 0xffe1: /* Left shift */ + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x12); + break; + case 0xffe2: /* Right shift */ + /* XXX */ + break; + case 0xffe3: /* Left control */ + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x14); + break; + case 0xffe4: /* Right control */ + /* XXX */ + break; + case 0xffe7: /* Left meta */ + /* XXX */ + break; + case 0xffe8: /* Right meta */ + /* XXX */ + break; + case 0xffe9: /* Left alt */ + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, 0x11); + break; + case 0xffea: /* Right alt */ + /* XXX */ + break; + default: + fprintf(stderr, "Unhandled ps2 keyboard keysym 0x%x\n", + keysym); + break; + } +} + +static void +ps2kbd_event(int down, uint32_t keysym, void *arg) +{ + struct ps2kbd_softc *sc = arg; + + pthread_mutex_lock(&sc->mtx); + if (!sc->enabled) { + pthread_mutex_unlock(&sc->mtx); + return; + } + + ps2kbd_keysym_queue(sc, down, keysym); + pthread_mutex_unlock(&sc->mtx); + + atkbdc_event(sc->atkbdc_sc); +} + +struct ps2kbd_softc * +ps2kbd_init(struct atkbdc_softc *atkbdc_sc) +{ + struct ps2kbd_softc *sc; + + sc = calloc(1, sizeof (struct ps2kbd_softc)); + pthread_mutex_init(&sc->mtx, NULL); + fifo_init(sc); + sc->atkbdc_sc = atkbdc_sc; + + console_kbd_register(ps2kbd_event, sc); + + return (sc); +} diff --git a/usr/src/cmd/bhyve/ps2kbd.h b/usr/src/cmd/bhyve/ps2kbd.h new file mode 100644 index 0000000000..34c31b1ea8 --- /dev/null +++ b/usr/src/cmd/bhyve/ps2kbd.h @@ -0,0 +1,39 @@ +/*- + * Copyright (c) 2015 Tycho Nightingale + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _PS2KBD_H_ +#define _PS2KBD_H_ + +struct atkbdc_softc; + +struct ps2kbd_softc *ps2kbd_init(struct atkbdc_softc *sc); + +int ps2kbd_read(struct ps2kbd_softc *sc, uint8_t *val); +void ps2kbd_write(struct ps2kbd_softc *sc, uint8_t val); + +#endif /* _PS2KBD_H_ */ diff --git a/usr/src/cmd/bhyve/ps2mouse.c b/usr/src/cmd/bhyve/ps2mouse.c new file mode 100644 index 0000000000..e96fbbf411 --- /dev/null +++ b/usr/src/cmd/bhyve/ps2mouse.c @@ -0,0 +1,371 @@ +/*- + * Copyright (c) 2015 Tycho Nightingale + * Copyright (c) 2015 Nahanni Systems Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "atkbdc.h" +#include "console.h" + +/* mouse device commands */ +#define PS2MC_RESET_DEV 0xff +#define PS2MC_SET_DEFAULTS 0xf6 +#define PS2MC_DISABLE 0xf5 +#define PS2MC_ENABLE 0xf4 +#define PS2MC_SET_SAMPLING_RATE 0xf3 +#define PS2MC_SEND_DEV_ID 0xf2 +#define PS2MC_SET_REMOTE_MODE 0xf0 +#define PS2MC_SEND_DEV_DATA 0xeb +#define PS2MC_SET_STREAM_MODE 0xea +#define PS2MC_SEND_DEV_STATUS 0xe9 +#define PS2MC_SET_RESOLUTION 0xe8 +#define PS2MC_SET_SCALING1 0xe7 +#define PS2MC_SET_SCALING2 0xe6 + +#define PS2MC_BAT_SUCCESS 0xaa +#define PS2MC_ACK 0xfa + +/* mouse device id */ +#define PS2MOUSE_DEV_ID 0x0 + +/* mouse status bits */ +#define PS2M_STS_REMOTE_MODE 0x40 +#define PS2M_STS_ENABLE_DEV 0x20 +#define PS2M_STS_SCALING_21 0x10 +#define PS2M_STS_MID_BUTTON 0x04 +#define PS2M_STS_RIGHT_BUTTON 0x02 +#define PS2M_STS_LEFT_BUTTON 0x01 + +#define PS2MOUSE_FIFOSZ 16 + +struct fifo { + uint8_t buf[PS2MOUSE_FIFOSZ]; + int rindex; /* index to read from */ + int windex; /* index to write to */ + int num; /* number of bytes in the fifo */ + int size; /* size of the fifo */ +}; + +struct ps2mouse_softc { + struct atkbdc_softc *atkbdc_sc; + pthread_mutex_t mtx; + + uint8_t status; + uint8_t resolution; + uint8_t sampling_rate; + struct fifo fifo; + + uint8_t curcmd; /* current command for next byte */ + + int cur_x, cur_y; + int delta_x, delta_y; +}; + +static void +fifo_init(struct ps2mouse_softc *sc) +{ + struct fifo *fifo; + + fifo = &sc->fifo; + fifo->size = sizeof(((struct fifo *)0)->buf); +} + +static void +fifo_reset(struct ps2mouse_softc *sc) +{ + struct fifo *fifo; + + fifo = &sc->fifo; + bzero(fifo, sizeof(struct fifo)); + fifo->size = sizeof(((struct fifo *)0)->buf); +} + +static void +fifo_put(struct ps2mouse_softc *sc, uint8_t val) +{ + struct fifo *fifo; + + fifo = &sc->fifo; + if (fifo->num < fifo->size) { + fifo->buf[fifo->windex] = val; + fifo->windex = (fifo->windex + 1) % fifo->size; + fifo->num++; + } +} + +static int +fifo_get(struct ps2mouse_softc *sc, uint8_t *val) +{ + struct fifo *fifo; + + fifo = &sc->fifo; + if (fifo->num > 0) { + *val = fifo->buf[fifo->rindex]; + fifo->rindex = 
(fifo->rindex + 1) % fifo->size; + fifo->num--; + return (0); + } + + return (-1); +} + +static void +movement_reset(struct ps2mouse_softc *sc) +{ + assert(pthread_mutex_isowned_np(&sc->mtx)); + + sc->delta_x = 0; + sc->delta_y = 0; +} + +static void +movement_update(struct ps2mouse_softc *sc, int x, int y) +{ + sc->delta_x += x - sc->cur_x; + sc->delta_y += sc->cur_y - y; + sc->cur_x = x; + sc->cur_y = y; +} + +static void +movement_get(struct ps2mouse_softc *sc) +{ + uint8_t val0, val1, val2; + + assert(pthread_mutex_isowned_np(&sc->mtx)); + + val0 = sc->status & (PS2M_STS_LEFT_BUTTON | + PS2M_STS_RIGHT_BUTTON | PS2M_STS_MID_BUTTON); + + if (sc->delta_x >= 0) { + if (sc->delta_x > 255) { + val0 |= (1 << 6); + val1 = 255; + } else + val1 = sc->delta_x; + } else { + val0 |= (1 << 4); + if (sc->delta_x < -255) { + val0 |= (1 << 6); + val1 = 255; + } else + val1 = sc->delta_x; + } + sc->delta_x = 0; + + if (sc->delta_y >= 0) { + if (sc->delta_y > 255) { + val0 |= (1 << 7); + val2 = 255; + } else + val2 = sc->delta_y; + } else { + val0 |= (1 << 5); + if (sc->delta_y < -255) { + val0 |= (1 << 7); + val2 = 255; + } else + val2 = sc->delta_y; + } + sc->delta_y = 0; + + fifo_put(sc, val0); + fifo_put(sc, val1); + fifo_put(sc, val2); +} + +static void +ps2mouse_reset(struct ps2mouse_softc *sc) +{ + assert(pthread_mutex_isowned_np(&sc->mtx)); + fifo_reset(sc); + movement_reset(sc); + sc->status = 0x8; + sc->resolution = 4; + sc->sampling_rate = 100; + + sc->cur_x = 0; + sc->cur_y = 0; + sc->delta_x = 0; + sc->delta_y = 0; +} + +int +ps2mouse_read(struct ps2mouse_softc *sc, uint8_t *val) +{ + int retval; + + pthread_mutex_lock(&sc->mtx); + retval = fifo_get(sc, val); + pthread_mutex_unlock(&sc->mtx); + + return (retval); +} + +void +ps2mouse_write(struct ps2mouse_softc *sc, uint8_t val) +{ + pthread_mutex_lock(&sc->mtx); + if (sc->curcmd) { + switch (sc->curcmd) { + case PS2MC_SET_SAMPLING_RATE: + sc->sampling_rate = val; + fifo_put(sc, PS2MC_ACK); + break; + case PS2MC_SET_RESOLUTION: + sc->resolution = val; + fifo_put(sc, PS2MC_ACK); + break; + default: + fprintf(stderr, "Unhandled ps2 mouse current " + "command byte 0x%02x\n", val); + break; + } + sc->curcmd = 0; + } else { + switch (val) { + case PS2MC_RESET_DEV: + ps2mouse_reset(sc); + fifo_put(sc, PS2MC_ACK); + fifo_put(sc, PS2MC_BAT_SUCCESS); + fifo_put(sc, PS2MOUSE_DEV_ID); + break; + case PS2MC_SET_DEFAULTS: + ps2mouse_reset(sc); + fifo_put(sc, PS2MC_ACK); + break; + case PS2MC_DISABLE: + fifo_reset(sc); + sc->status &= ~PS2M_STS_ENABLE_DEV; + fifo_put(sc, PS2MC_ACK); + break; + case PS2MC_ENABLE: + fifo_reset(sc); + sc->status |= PS2M_STS_ENABLE_DEV; + fifo_put(sc, PS2MC_ACK); + break; + case PS2MC_SET_SAMPLING_RATE: + sc->curcmd = val; + fifo_put(sc, PS2MC_ACK); + break; + case PS2MC_SEND_DEV_ID: + fifo_put(sc, PS2MC_ACK); + fifo_put(sc, PS2MOUSE_DEV_ID); + break; + case PS2MC_SET_REMOTE_MODE: + sc->status |= PS2M_STS_REMOTE_MODE; + fifo_put(sc, PS2MC_ACK); + break; + case PS2MC_SEND_DEV_DATA: + fifo_put(sc, PS2MC_ACK); + movement_get(sc); + break; + case PS2MC_SET_STREAM_MODE: + sc->status &= ~PS2M_STS_REMOTE_MODE; + fifo_put(sc, PS2MC_ACK); + break; + case PS2MC_SEND_DEV_STATUS: + fifo_put(sc, PS2MC_ACK); + fifo_put(sc, sc->status); + fifo_put(sc, sc->resolution); + fifo_put(sc, sc->sampling_rate); + break; + case PS2MC_SET_RESOLUTION: + sc->curcmd = val; + fifo_put(sc, PS2MC_ACK); + break; + case PS2MC_SET_SCALING1: + case PS2MC_SET_SCALING2: + fifo_put(sc, PS2MC_ACK); + break; + default: + fprintf(stderr, "Unhandled ps2 mouse command 
" + "0x%02x\n", val); + break; + } + } + pthread_mutex_unlock(&sc->mtx); +} + +static void +ps2mouse_event(uint8_t button, int x, int y, void *arg) +{ + struct ps2mouse_softc *sc = arg; + + pthread_mutex_lock(&sc->mtx); + movement_update(sc, x, y); + + sc->status &= ~(PS2M_STS_LEFT_BUTTON | + PS2M_STS_RIGHT_BUTTON | PS2M_STS_MID_BUTTON); + if (button & (1 << 0)) + sc->status |= PS2M_STS_LEFT_BUTTON; + if (button & (1 << 1)) + sc->status |= PS2M_STS_MID_BUTTON; + if (button & (1 << 2)) + sc->status |= PS2M_STS_RIGHT_BUTTON; + + if ((sc->status & PS2M_STS_ENABLE_DEV) == 0) { + /* no data reporting */ + pthread_mutex_unlock(&sc->mtx); + return; + } + + movement_get(sc); + pthread_mutex_unlock(&sc->mtx); + + atkbdc_event(sc->atkbdc_sc); +} + +struct ps2mouse_softc * +ps2mouse_init(struct atkbdc_softc *atkbdc_sc) +{ + struct ps2mouse_softc *sc; + + sc = calloc(1, sizeof (struct ps2mouse_softc)); + pthread_mutex_init(&sc->mtx, NULL); + fifo_init(sc); + sc->atkbdc_sc = atkbdc_sc; + + pthread_mutex_lock(&sc->mtx); + ps2mouse_reset(sc); + pthread_mutex_unlock(&sc->mtx); + + console_ptr_register(ps2mouse_event, sc); + + return (sc); +} + diff --git a/usr/src/cmd/bhyve/ps2mouse.h b/usr/src/cmd/bhyve/ps2mouse.h new file mode 100644 index 0000000000..1a78934b98 --- /dev/null +++ b/usr/src/cmd/bhyve/ps2mouse.h @@ -0,0 +1,39 @@ +/*- + * Copyright (c) 2015 Tycho Nightingale + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _PS2MOUSE_H_ +#define _PS2MOUSE_H_ + +struct atkbdc_softc; + +struct ps2mouse_softc *ps2mouse_init(struct atkbdc_softc *sc); + +int ps2mouse_read(struct ps2mouse_softc *sc, uint8_t *val); +void ps2mouse_write(struct ps2mouse_softc *sc, uint8_t val); + +#endif /* _PS2MOUSE_H_ */ diff --git a/usr/src/cmd/bhyve/rfb.c b/usr/src/cmd/bhyve/rfb.c new file mode 100644 index 0000000000..0846316378 --- /dev/null +++ b/usr/src/cmd/bhyve/rfb.c @@ -0,0 +1,420 @@ +/*- + * Copyright (c) 2015 Tycho Nightingale + * Copyright (c) 2015 Nahanni Systems Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bhyvegc.h" +#include "console.h" +#include "rfb.h" + +struct rfb_softc { + int sfd; + pthread_t tid; + + int width, height; + + bool enc_raw_ok; + bool enc_resize_ok; +}; + +struct rfb_pixfmt { + uint8_t bpp; + uint8_t depth; + uint8_t bigendian; + uint8_t truecolor; + uint16_t red_max; + uint16_t green_max; + uint16_t blue_max; + uint8_t red_shift; + uint8_t green_shift; + uint8_t blue_shift; + uint8_t pad[3]; +}; + +struct rfb_srvr_info { + uint16_t width; + uint16_t height; + struct rfb_pixfmt pixfmt; + uint32_t namelen; +}; + +struct rfb_pixfmt_msg { + uint8_t type; + uint8_t pad[3]; + struct rfb_pixfmt pixfmt; +}; + +#define RFB_ENCODING_RAW 0 +#define RFB_ENCODING_RESIZE -223 + +struct rfb_enc_msg { + uint8_t type; + uint8_t pad; + uint16_t numencs; +}; + +struct rfb_updt_msg { + uint8_t type; + uint8_t incremental; + uint16_t x; + uint16_t y; + uint16_t width; + uint16_t height; +}; + +struct rfb_key_msg { + uint8_t type; + uint8_t down; + uint16_t pad; + uint32_t code; +}; + +struct rfb_ptr_msg { + uint8_t type; + uint8_t button; + uint16_t x; + uint16_t y; +}; + +struct rfb_srvr_updt_msg { + uint8_t type; + uint8_t pad; + uint16_t numrects; +}; + +struct rfb_srvr_rect_hdr { + uint16_t x; + uint16_t y; + uint16_t width; + uint16_t height; + uint32_t encoding; +}; + +static void +rfb_send_server_init_msg(int cfd) +{ + struct bhyvegc_image *gc_image; + struct rfb_srvr_info sinfo; + int len; + + gc_image = console_get_image(); + + sinfo.width = ntohs(gc_image->width); + sinfo.height = ntohs(gc_image->height); + sinfo.pixfmt.bpp = 32; + sinfo.pixfmt.depth = 32; + sinfo.pixfmt.bigendian = 0; + sinfo.pixfmt.truecolor = 1; + sinfo.pixfmt.red_max = ntohs(255); + sinfo.pixfmt.green_max = ntohs(255); + sinfo.pixfmt.blue_max = ntohs(255); + sinfo.pixfmt.red_shift = 16; + sinfo.pixfmt.green_shift = 8; + sinfo.pixfmt.blue_shift = 0; + sinfo.namelen = ntohl(strlen("bhyve")); + len = write(cfd, &sinfo, sizeof(sinfo)); + len = write(cfd, "bhyve", strlen("bhyve")); +} + +static void +rfb_send_resize_update_msg(struct rfb_softc *rc, int cfd) +{ + struct rfb_srvr_updt_msg supdt_msg; + struct rfb_srvr_rect_hdr srect_hdr; + + /* Number of rectangles: 1 */ + supdt_msg.type = 0; + supdt_msg.pad = 0; + supdt_msg.numrects = ntohs(1); + write(cfd, &supdt_msg, 
sizeof(struct rfb_srvr_updt_msg)); + + /* Rectangle header */ + srect_hdr.x = ntohs(0); + srect_hdr.y = ntohs(0); + srect_hdr.width = ntohs(rc->width); + srect_hdr.height = ntohs(rc->height); + srect_hdr.encoding = ntohl(RFB_ENCODING_RESIZE); + write(cfd, &srect_hdr, sizeof(struct rfb_srvr_rect_hdr)); +} + +static void +rfb_recv_set_pixfmt_msg(struct rfb_softc *rc, int cfd) +{ + struct rfb_pixfmt_msg pixfmt_msg; + int len; + + len = read(cfd, ((void *)&pixfmt_msg) + 1, sizeof(pixfmt_msg) - 1); +} + + +static void +rfb_recv_set_encodings_msg(struct rfb_softc *rc, int cfd) +{ + struct rfb_enc_msg enc_msg; + int len, i; + uint32_t encoding; + + assert((sizeof(enc_msg) - 1) == 3); + len = read(cfd, ((void *)&enc_msg) + 1, sizeof(enc_msg) - 1); + + for (i = 0; i < ntohs(enc_msg.numencs); i++) { + len = read(cfd, &encoding, sizeof(encoding)); + switch (ntohl(encoding)) { + case RFB_ENCODING_RAW: + rc->enc_raw_ok = true; + break; + case RFB_ENCODING_RESIZE: + rc->enc_resize_ok = true; + break; + } + } +} + +static void +rfb_resize_update(struct rfb_softc *rc, int fd) +{ + struct rfb_srvr_updt_msg supdt_msg; + struct rfb_srvr_rect_hdr srect_hdr; + + /* Number of rectangles: 1 */ + supdt_msg.type = 0; + supdt_msg.pad = 0; + supdt_msg.numrects = ntohs(1); + write(fd, &supdt_msg, sizeof (struct rfb_srvr_updt_msg)); + + /* Rectangle header */ + srect_hdr.x = ntohs(0); + srect_hdr.y = ntohs(0); + srect_hdr.width = ntohs(rc->width); + srect_hdr.height = ntohs(rc->height); + srect_hdr.encoding = ntohl(RFB_ENCODING_RESIZE); + write(fd, &srect_hdr, sizeof (struct rfb_srvr_rect_hdr)); +} + +static void +rfb_recv_update_msg(struct rfb_softc *rc, int cfd) +{ + struct rfb_updt_msg updt_msg; + struct rfb_srvr_updt_msg supdt_msg; + struct rfb_srvr_rect_hdr srect_hdr; + struct bhyvegc_image *gc_image; + int len; + + len = read(cfd, ((void *)&updt_msg) + 1 , sizeof(updt_msg) - 1); + + console_refresh(); + gc_image = console_get_image(); + + if (rc->width != gc_image->width || rc->height != gc_image->height) { + rc->width = gc_image->width; + rc->height = gc_image->height; + rfb_send_resize_update_msg(rc, cfd); + } + + /* + * Send the whole thing + */ + /* Number of rectangles: 1 */ + supdt_msg.type = 0; + supdt_msg.pad = 0; + supdt_msg.numrects = ntohs(1); + write(cfd, &supdt_msg, sizeof(struct rfb_srvr_updt_msg)); + + /* Rectangle header */ + srect_hdr.x = ntohs(0); + srect_hdr.y = ntohs(0); + srect_hdr.width = ntohs(gc_image->width); + srect_hdr.height = ntohs(gc_image->height); + srect_hdr.encoding = ntohl(0); /* raw */ + write(cfd, &srect_hdr, sizeof(struct rfb_srvr_rect_hdr)); + + write(cfd, gc_image->data, gc_image->width * gc_image->height * + sizeof(uint32_t)); +} + +static void +rfb_recv_key_msg(struct rfb_softc *rc, int cfd) +{ + struct rfb_key_msg key_msg; + int len; + + len = read(cfd, ((void *)&key_msg) + 1, sizeof(key_msg) - 1); + + console_key_event(key_msg.down, ntohl(key_msg.code)); +} + +static void +rfb_recv_ptr_msg(struct rfb_softc *rc, int cfd) +{ + struct rfb_ptr_msg ptr_msg; + int len; + + len = read(cfd, ((void *)&ptr_msg) + 1, sizeof(ptr_msg) - 1); + + console_ptr_event(ptr_msg.button, ntohs(ptr_msg.x), ntohs(ptr_msg.y)); +} + +void +rfb_handle(struct rfb_softc *rc, int cfd) +{ + const char *vbuf = "RFB 003.008\n"; + unsigned char buf[80]; + int len; + uint32_t sres; + + /* 1a. Send server version */ + printf("server vers write: (%s), %d bytes\n", vbuf, (int) strlen(vbuf)); + write(cfd, vbuf, strlen(vbuf)); + + /* 1b. Read client version */ + len = read(cfd, buf, sizeof(buf)); + + /* 2a. 
Send security type 'none' */ + buf[0] = 1; + buf[1] = 1; /* none */ + write(cfd, buf, 2); + + /* 2b. Read agreed security type */ + len = read(cfd, buf, 1); + + /* 2c. Write back a status of 0 */ + sres = 0; + write(cfd, &sres, 4); + + /* 3a. Read client shared-flag byte */ + len = read(cfd, buf, 1); + + /* 4a. Write server-init info */ + rfb_send_server_init_msg(cfd); + + /* Now read in client requests. 1st byte identifies type */ + for (;;) { + len = read(cfd, buf, 1); + if (len <= 0) { + printf("exiting\n"); + break; + } + + switch (buf[0]) { + case 0: + rfb_recv_set_pixfmt_msg(rc, cfd); + break; + case 2: + rfb_recv_set_encodings_msg(rc, cfd); + break; + case 3: + rfb_recv_update_msg(rc, cfd); + break; + case 4: + rfb_recv_key_msg(rc, cfd); + break; + case 5: + rfb_recv_ptr_msg(rc, cfd); + break; + default: + printf("unknown client code!\n"); + exit(1); + } + } +} + +static void * +rfb_thr(void *arg) +{ + struct rfb_softc *rc; + sigset_t set; + + int cfd; + + rc = arg; + + sigemptyset(&set); + sigaddset(&set, SIGPIPE); + if (pthread_sigmask(SIG_BLOCK, &set, NULL) != 0) { + perror("pthread_sigmask"); + return (NULL); + } + + for (;;) { + cfd = accept(rc->sfd, NULL, NULL); + rfb_handle(rc, cfd); + } + + /* NOTREACHED */ + return (NULL); +} + +int +rfb_init(int port) +{ + struct rfb_softc *rc; + struct sockaddr_in sin; + int on = 1; + + rc = calloc(1, sizeof(struct rfb_softc)); + + rc->sfd = socket(AF_INET, SOCK_STREAM, 0); + if (rc->sfd < 0) { + perror("socket"); + return (-1); + } + + setsockopt(rc->sfd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)); + +#ifdef __FreeBSD__ + sin.sin_len = sizeof(sin); +#endif + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(port); + if (bind(rc->sfd, (struct sockaddr *)&sin, sizeof(sin)) < 0) { + perror("bind"); + return (-1); + } + + if (listen(rc->sfd, 1) < 0) { + perror("listen"); + return (-1); + } + + pthread_create(&rc->tid, NULL, rfb_thr, rc); + + return (0); +} diff --git a/usr/src/cmd/bhyve/rfb.h b/usr/src/cmd/bhyve/rfb.h new file mode 100644 index 0000000000..5504c333ab --- /dev/null +++ b/usr/src/cmd/bhyve/rfb.h @@ -0,0 +1,36 @@ +/*- + * Copyright (c) 2015 Tycho Nightingale + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
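
[Editor's note -- not part of the patch] rfb_handle() above implements the RFB 3.8 handshake and then dispatches on the first byte of each client message: 0 SetPixelFormat, 2 SetEncodings, 3 FramebufferUpdateRequest, 4 KeyEvent, 5 PointerEvent. For illustration, a client speaking to this server would read and write in the matching order; rfb_client_handshake() is hypothetical, and error handling plus the trailing ServerInit read are elided:

#include <stdint.h>
#include <unistd.h>

static int
rfb_client_handshake(int fd)
{
        char vers[12];
        uint8_t sec[2], agreed = 1, shared = 1;
        uint32_t result;

        (void) read(fd, vers, sizeof (vers));      /* 1a. "RFB 003.008\n" */
        (void) write(fd, "RFB 003.008\n", 12);     /* 1b. echo the version */
        (void) read(fd, sec, sizeof (sec));        /* 2a. one type: 1 (none) */
        (void) write(fd, &agreed, 1);              /* 2b. agree to 'none' */
        (void) read(fd, &result, sizeof (result)); /* 2c. 0 == success */
        (void) write(fd, &shared, 1);              /* 3a. shared-flag byte */
        /* 4a. ServerInit (width, height, pixfmt, name) follows */
        return (result == 0 ? 0 : -1);
}
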
+ * + * $FreeBSD$ + */ + +#ifndef _RFB_H_ +#define _RFB_H_ + +#define RFB_PORT 5900 + +int rfb_init(int port); + +#endif /* _RFB_H_ */ diff --git a/usr/src/cmd/bhyve/rtc.c b/usr/src/cmd/bhyve/rtc.c new file mode 100644 index 0000000000..5ab78e060f --- /dev/null +++ b/usr/src/cmd/bhyve/rtc.c @@ -0,0 +1,380 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/rtc.c 260206 2014-01-02 21:26:59Z jhb $ + */ + +#include +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/rtc.c 260206 2014-01-02 21:26:59Z jhb $"); + +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include "acpi.h" +#include "inout.h" +#include "pci_lpc.h" +#include "rtc.h" + +#define IO_RTC 0x70 + +#define RTC_SEC 0x00 /* seconds */ +#define RTC_SEC_ALARM 0x01 +#define RTC_MIN 0x02 +#define RTC_MIN_ALARM 0x03 +#define RTC_HRS 0x04 +#define RTC_HRS_ALARM 0x05 +#define RTC_WDAY 0x06 +#define RTC_DAY 0x07 +#define RTC_MONTH 0x08 +#define RTC_YEAR 0x09 +#define RTC_CENTURY 0x32 /* current century */ + +#define RTC_STATUSA 0xA +#define RTCSA_TUP 0x80 /* time update, don't look now */ + +#define RTC_STATUSB 0xB +#define RTCSB_DST 0x01 +#define RTCSB_24HR 0x02 +#define RTCSB_BIN 0x04 /* 0 = BCD, 1 = Binary */ +#define RTCSB_PINTR 0x40 /* 1 = enable periodic clock interrupt */ +#define RTCSB_HALT 0x80 /* stop clock updates */ + +#define RTC_INTR 0x0c /* status register C (R) interrupt source */ + +#define RTC_STATUSD 0x0d /* status register D (R) Lost Power */ +#define RTCSD_PWR 0x80 /* clock power OK */ + +#define RTC_NVRAM_START 0x0e +#define RTC_NVRAM_END 0x7f +#define RTC_NVRAM_SZ (128 - RTC_NVRAM_START) +#define nvoff(x) ((x) - RTC_NVRAM_START) + +#define RTC_DIAG 0x0e +#define RTC_RSTCODE 0x0f +#define RTC_EQUIPMENT 0x14 +#define RTC_LMEM_LSB 0x34 +#define RTC_LMEM_MSB 0x35 +#define RTC_HMEM_LSB 0x5b +#define RTC_HMEM_SB 0x5c +#define RTC_HMEM_MSB 0x5d + +#define m_64KB (64*1024) +#define m_16MB (16*1024*1024) +#define m_4GB (4ULL*1024*1024*1024) + +static int addr; + +static uint8_t rtc_nvram[RTC_NVRAM_SZ]; + +/* XXX initialize these to default values as they would be from BIOS */ +static uint8_t status_a, status_b; + +static struct { + uint8_t hours; + uint8_t mins; + uint8_t secs; +} 
rtc_alarm; + +static u_char const bin2bcd_data[] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99 +}; +#define bin2bcd(bin) (bin2bcd_data[bin]) + +#define rtcout(val) ((status_b & RTCSB_BIN) ? (val) : bin2bcd((val))) + +static void +timevalfix(struct timeval *t1) +{ + + if (t1->tv_usec < 0) { + t1->tv_sec--; + t1->tv_usec += 1000000; + } + if (t1->tv_usec >= 1000000) { + t1->tv_sec++; + t1->tv_usec -= 1000000; + } +} + +static void +timevalsub(struct timeval *t1, const struct timeval *t2) +{ + + t1->tv_sec -= t2->tv_sec; + t1->tv_usec -= t2->tv_usec; + timevalfix(t1); +} + +static int +rtc_addr_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + if (bytes != 1) + return (-1); + + if (in) { + /* straight read of this register will return 0xFF */ + *eax = 0xff; + return (0); + } + + switch (*eax & 0x7f) { + case RTC_SEC: + case RTC_SEC_ALARM: + case RTC_MIN: + case RTC_MIN_ALARM: + case RTC_HRS: + case RTC_HRS_ALARM: + case RTC_WDAY: + case RTC_DAY: + case RTC_MONTH: + case RTC_YEAR: + case RTC_STATUSA: + case RTC_STATUSB: + case RTC_INTR: + case RTC_STATUSD: + case RTC_NVRAM_START ... RTC_NVRAM_END: + break; + default: + return (-1); + } + + addr = *eax & 0x7f; + return (0); +} + +static int +rtc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + int hour; + time_t t; + struct timeval cur, delta; + + static struct timeval last; + static struct tm tm; + + if (bytes != 1) + return (-1); + + gettimeofday(&cur, NULL); + + /* + * Increment the cached time only once per second so we can guarantee + * that the guest has at least one second to read the hour:min:sec + * separately and still get a coherent view of the time. + */ + delta = cur; + timevalsub(&delta, &last); + if (delta.tv_sec >= 1 && (status_b & RTCSB_HALT) == 0) { + t = cur.tv_sec; + localtime_r(&t, &tm); + last = cur; + } + + if (in) { + switch (addr) { + case RTC_SEC_ALARM: + *eax = rtc_alarm.secs; + break; + case RTC_MIN_ALARM: + *eax = rtc_alarm.mins; + break; + case RTC_HRS_ALARM: + *eax = rtc_alarm.hours; + break; + case RTC_SEC: + *eax = rtcout(tm.tm_sec); + return (0); + case RTC_MIN: + *eax = rtcout(tm.tm_min); + return (0); + case RTC_HRS: + if (status_b & RTCSB_24HR) + hour = tm.tm_hour; + else + hour = (tm.tm_hour % 12) + 1; + + *eax = rtcout(hour); + + /* + * If we are representing time in the 12-hour format + * then set the MSB to indicate PM. 
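
 * Editor's note (appended, not in the original comment): a worked
 * example of this encoding.  In BCD 12-hour mode, tm_hour == 13 gives
 * hour = (13 % 12) + 1 == 2, so the guest reads bin2bcd(2) | 0x80 ==
 * 0x82: BCD hour 2 with the PM bit in the MSB.  tm_hour == 0 reads as
 * 0x01 with the PM bit clear; note the '+ 1' mapping reports midnight
 * as hour 1, whereas the MC146818 convention reads it as 12.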
+ */ + if ((status_b & RTCSB_24HR) == 0 && tm.tm_hour >= 12) + *eax |= 0x80; + + return (0); + case RTC_WDAY: + *eax = rtcout(tm.tm_wday + 1); + return (0); + case RTC_DAY: + *eax = rtcout(tm.tm_mday); + return (0); + case RTC_MONTH: + *eax = rtcout(tm.tm_mon + 1); + return (0); + case RTC_YEAR: + *eax = rtcout(tm.tm_year % 100); + return (0); + case RTC_STATUSA: + *eax = status_a; + return (0); + case RTC_STATUSB: + *eax = status_b; + return (0); + case RTC_INTR: + *eax = 0; + return (0); + case RTC_STATUSD: + *eax = RTCSD_PWR; + return (0); + case RTC_NVRAM_START ... RTC_NVRAM_END: + *eax = rtc_nvram[addr - RTC_NVRAM_START]; + return (0); + default: + return (-1); + } + } + + switch (addr) { + case RTC_STATUSA: + status_a = *eax & ~RTCSA_TUP; + break; + case RTC_STATUSB: + /* XXX not implemented yet XXX */ + if (*eax & RTCSB_PINTR) + return (-1); + status_b = *eax; + break; + case RTC_STATUSD: + /* ignore write */ + break; + case RTC_SEC_ALARM: + rtc_alarm.secs = *eax; + break; + case RTC_MIN_ALARM: + rtc_alarm.mins = *eax; + break; + case RTC_HRS_ALARM: + rtc_alarm.hours = *eax; + break; + case RTC_SEC: + case RTC_MIN: + case RTC_HRS: + case RTC_WDAY: + case RTC_DAY: + case RTC_MONTH: + case RTC_YEAR: + /* + * Ignore writes to the time of day registers + */ + break; + case RTC_NVRAM_START ... RTC_NVRAM_END: + rtc_nvram[addr - RTC_NVRAM_START] = *eax; + break; + default: + return (-1); + } + return (0); +} + +void +rtc_init(struct vmctx *ctx) +{ + struct timeval cur; + struct tm tm; + size_t himem; + size_t lomem; + int err; + + err = gettimeofday(&cur, NULL); + assert(err == 0); + (void) localtime_r(&cur.tv_sec, &tm); + + memset(rtc_nvram, 0, sizeof(rtc_nvram)); + + rtc_nvram[nvoff(RTC_CENTURY)] = bin2bcd((tm.tm_year + 1900) / 100); + + /* XXX init diag/reset code/equipment/checksum ? */ + + /* + * Report guest memory size in nvram cells as required by UEFI. + * Little-endian encoding. + * 0x34/0x35 - 64KB chunks above 16MB, below 4GB + * 0x5b/0x5c/0x5d - 64KB chunks above 4GB + */ + lomem = (vm_get_lowmem_size(ctx) - m_16MB) / m_64KB; + rtc_nvram[nvoff(RTC_LMEM_LSB)] = lomem; + rtc_nvram[nvoff(RTC_LMEM_MSB)] = lomem >> 8; + + himem = vm_get_highmem_size(ctx) / m_64KB; + rtc_nvram[nvoff(RTC_HMEM_LSB)] = himem; + rtc_nvram[nvoff(RTC_HMEM_SB)] = himem >> 8; + rtc_nvram[nvoff(RTC_HMEM_MSB)] = himem >> 16; +} + +INOUT_PORT(rtc, IO_RTC, IOPORT_F_INOUT, rtc_addr_handler); +INOUT_PORT(rtc, IO_RTC + 1, IOPORT_F_INOUT, rtc_data_handler); + +#ifdef __FreeBSD__ +static void +rtc_dsdt(void) +{ + + dsdt_line(""); + dsdt_line("Device (RTC)"); + dsdt_line("{"); + dsdt_line(" Name (_HID, EisaId (\"PNP0B00\"))"); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_indent(2); + dsdt_fixed_ioport(IO_RTC, 2); + dsdt_fixed_irq(8); + dsdt_unindent(2); + dsdt_line(" })"); + dsdt_line("}"); +} +LPC_DSDT(rtc_dsdt); +#endif + +SYSRES_IO(0x72, 6); diff --git a/usr/src/cmd/bhyve/rtc.h b/usr/src/cmd/bhyve/rtc.h new file mode 100644 index 0000000000..6406d24c37 --- /dev/null +++ b/usr/src/cmd/bhyve/rtc.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2013 Peter Grehan + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/rtc.h 253181 2013-07-11 03:54:35Z grehan $ + */ + +#ifndef _RTC_H_ +#define _RTC_H_ + +void rtc_init(struct vmctx *ctx); + +#endif /* _RTC_H_ */ diff --git a/usr/src/cmd/bhyve/smbiostbl.c b/usr/src/cmd/bhyve/smbiostbl.c new file mode 100644 index 0000000000..7ba0f0dfa0 --- /dev/null +++ b/usr/src/cmd/bhyve/smbiostbl.c @@ -0,0 +1,827 @@ +/*- + * Copyright (c) 2014 Tycho Nightingale + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
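
 * Editor's note (appended annotation, not license text): a worked
 * example of the rtc_init() nvram memory encoding above.  For a guest
 * with 2 GB below the 4 GB boundary and 4 GB above it:
 *
 *      lomem = (2 GB - 16 MB) / 64 KB = 32512 = 0x7f00
 *              nvram[0x34] = 0x00, nvram[0x35] = 0x7f
 *      himem = 4 GB / 64 KB = 65536 = 0x010000
 *              nvram[0x5b] = 0x00, nvram[0x5c] = 0x00, nvram[0x5d] = 0x01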
+ */ + +#include +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/smbiostbl.c 272007 2014-09-23 01:17:22Z grehan $"); + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "bhyverun.h" +#include "smbiostbl.h" + +#define MB (1024*1024) +#define GB (1024ULL*1024*1024) + +#define SMBIOS_BASE 0xF1000 + +/* BHYVE_ACPI_BASE - SMBIOS_BASE) */ +#define SMBIOS_MAX_LENGTH (0xF2400 - 0xF1000) + +#define SMBIOS_TYPE_BIOS 0 +#define SMBIOS_TYPE_SYSTEM 1 +#define SMBIOS_TYPE_CHASSIS 3 +#define SMBIOS_TYPE_PROCESSOR 4 +#define SMBIOS_TYPE_MEMARRAY 16 +#define SMBIOS_TYPE_MEMDEVICE 17 +#define SMBIOS_TYPE_MEMARRAYMAP 19 +#define SMBIOS_TYPE_BOOT 32 +#define SMBIOS_TYPE_EOT 127 + +struct smbios_structure { + uint8_t type; + uint8_t length; + uint16_t handle; +} __packed; + +typedef int (*initializer_func_t)(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size); + +struct smbios_template_entry { + struct smbios_structure *entry; + const char **strings; + initializer_func_t initializer; +}; + +/* + * SMBIOS Structure Table Entry Point + */ +#define SMBIOS_ENTRY_EANCHOR "_SM_" +#define SMBIOS_ENTRY_EANCHORLEN 4 +#define SMBIOS_ENTRY_IANCHOR "_DMI_" +#define SMBIOS_ENTRY_IANCHORLEN 5 + +struct smbios_entry_point { + char eanchor[4]; /* anchor tag */ + uint8_t echecksum; /* checksum of entry point structure */ + uint8_t eplen; /* length in bytes of entry point */ + uint8_t major; /* major version of the SMBIOS spec */ + uint8_t minor; /* minor version of the SMBIOS spec */ + uint16_t maxssize; /* maximum size in bytes of a struct */ + uint8_t revision; /* entry point structure revision */ + uint8_t format[5]; /* entry point rev-specific data */ + char ianchor[5]; /* intermediate anchor tag */ + uint8_t ichecksum; /* intermediate checksum */ + uint16_t stlen; /* len in bytes of structure table */ + uint32_t staddr; /* physical addr of structure table */ + uint16_t stnum; /* number of structure table entries */ + uint8_t bcdrev; /* BCD value representing DMI ver */ +} __packed; + +/* + * BIOS Information + */ +#define SMBIOS_FL_ISA 0x00000010 /* ISA is supported */ +#define SMBIOS_FL_PCI 0x00000080 /* PCI is supported */ +#define SMBIOS_FL_SHADOW 0x00001000 /* BIOS shadowing is allowed */ +#define SMBIOS_FL_CDBOOT 0x00008000 /* Boot from CD is supported */ +#define SMBIOS_FL_SELBOOT 0x00010000 /* Selectable Boot supported */ +#define SMBIOS_FL_EDD 0x00080000 /* EDD Spec is supported */ + +#define SMBIOS_XB1_FL_ACPI 0x00000001 /* ACPI is supported */ + +#define SMBIOS_XB2_FL_BBS 0x00000001 /* BIOS Boot Specification */ +#define SMBIOS_XB2_FL_VM 0x00000010 /* Virtual Machine */ + +struct smbios_table_type0 { + struct smbios_structure header; + uint8_t vendor; /* vendor string */ + uint8_t version; /* version string */ + uint16_t segment; /* address segment location */ + uint8_t rel_date; /* release date */ + uint8_t size; /* rom size */ + uint64_t cflags; /* characteristics */ + uint8_t xc_bytes[2]; /* characteristics ext bytes */ + uint8_t sb_major_rel; /* system bios version */ + uint8_t sb_minor_rele; + uint8_t ecfw_major_rel; /* embedded ctrl fw version */ + uint8_t ecfw_minor_rel; +} __packed; + +/* + * System Information + */ +#define SMBIOS_WAKEUP_SWITCH 0x06 /* power switch */ + +struct smbios_table_type1 { + struct smbios_structure header; + uint8_t manufacturer; /* manufacturer string */ + uint8_t product; /* product name string */ + uint8_t version; /* version string 
*/ + uint8_t serial; /* serial number string */ + uint8_t uuid[16]; /* uuid byte array */ + uint8_t wakeup; /* wake-up event */ + uint8_t sku; /* sku number string */ + uint8_t family; /* family name string */ +} __packed; + +/* + * System Enclosure or Chassis + */ +#define SMBIOS_CHT_UNKNOWN 0x02 /* unknown */ + +#define SMBIOS_CHST_SAFE 0x03 /* safe */ + +#define SMBIOS_CHSC_NONE 0x03 /* none */ + +struct smbios_table_type3 { + struct smbios_structure header; + uint8_t manufacturer; /* manufacturer string */ + uint8_t type; /* type */ + uint8_t version; /* version string */ + uint8_t serial; /* serial number string */ + uint8_t asset; /* asset tag string */ + uint8_t bustate; /* boot-up state */ + uint8_t psstate; /* power supply state */ + uint8_t tstate; /* thermal state */ + uint8_t security; /* security status */ + uint8_t uheight; /* height in 'u's */ + uint8_t cords; /* number of power cords */ + uint8_t elems; /* number of element records */ + uint8_t elemlen; /* length of records */ + uint8_t sku; /* sku number string */ +} __packed; + +/* + * Processor Information + */ +#define SMBIOS_PRT_CENTRAL 0x03 /* central processor */ + +#define SMBIOS_PRF_OTHER 0x01 /* other */ + +#define SMBIOS_PRS_PRESENT 0x40 /* socket is populated */ +#define SMBIOS_PRS_ENABLED 0x1 /* enabled */ + +#define SMBIOS_PRU_NONE 0x06 /* none */ + +#define SMBIOS_PFL_64B 0x04 /* 64-bit capable */ + +struct smbios_table_type4 { + struct smbios_structure header; + uint8_t socket; /* socket designation string */ + uint8_t type; /* processor type */ + uint8_t family; /* processor family */ + uint8_t manufacturer; /* manufacturer string */ + uint64_t cpuid; /* processor cpuid */ + uint8_t version; /* version string */ + uint8_t voltage; /* voltage */ + uint16_t clkspeed; /* ext clock speed in mhz */ + uint16_t maxspeed; /* maximum speed in mhz */ + uint16_t curspeed; /* current speed in mhz */ + uint8_t status; /* status */ + uint8_t upgrade; /* upgrade */ + uint16_t l1handle; /* l1 cache handle */ + uint16_t l2handle; /* l2 cache handle */ + uint16_t l3handle; /* l3 cache handle */ + uint8_t serial; /* serial number string */ + uint8_t asset; /* asset tag string */ + uint8_t part; /* part number string */ + uint8_t cores; /* cores per socket */ + uint8_t ecores; /* enabled cores */ + uint8_t threads; /* threads per socket */ + uint16_t cflags; /* processor characteristics */ + uint16_t family2; /* processor family 2 */ +} __packed; + +/* + * Physical Memory Array + */ +#define SMBIOS_MAL_SYSMB 0x03 /* system board or motherboard */ + +#define SMBIOS_MAU_SYSTEM 0x03 /* system memory */ + +#define SMBIOS_MAE_NONE 0x03 /* none */ + +struct smbios_table_type16 { + struct smbios_structure header; + uint8_t location; /* physical device location */ + uint8_t use; /* device functional purpose */ + uint8_t ecc; /* err detect/correct method */ + uint32_t size; /* max mem capacity in kb */ + uint16_t errhand; /* handle of error (if any) */ + uint16_t ndevs; /* num of slots or sockets */ + uint64_t xsize; /* max mem capacity in bytes */ +} __packed; + +/* + * Memory Device + */ +#define SMBIOS_MDFF_UNKNOWN 0x02 /* unknown */ + +#define SMBIOS_MDT_UNKNOWN 0x02 /* unknown */ + +#define SMBIOS_MDF_UNKNOWN 0x0004 /* unknown */ + +struct smbios_table_type17 { + struct smbios_structure header; + uint16_t arrayhand; /* handle of physl mem array */ + uint16_t errhand; /* handle of mem error data */ + uint16_t twidth; /* total width in bits */ + uint16_t dwidth; /* data width in bits */ + uint16_t size; /* size in bytes */ + uint8_t 
form; /* form factor */ + uint8_t set; /* set */ + uint8_t dloc; /* device locator string */ + uint8_t bloc; /* phys bank locator string */ + uint8_t type; /* memory type */ + uint16_t flags; /* memory characteristics */ + uint16_t maxspeed; /* maximum speed in mhz */ + uint8_t manufacturer; /* manufacturer string */ + uint8_t serial; /* serial number string */ + uint8_t asset; /* asset tag string */ + uint8_t part; /* part number string */ + uint8_t attributes; /* attributes */ + uint32_t xsize; /* extended size in mbs */ + uint16_t curspeed; /* current speed in mhz */ + uint16_t minvoltage; /* minimum voltage */ + uint16_t maxvoltage; /* maximum voltage */ + uint16_t curvoltage; /* configured voltage */ +} __packed; + +/* + * Memory Array Mapped Address + */ +struct smbios_table_type19 { + struct smbios_structure header; + uint32_t saddr; /* start phys addr in kb */ + uint32_t eaddr; /* end phys addr in kb */ + uint16_t arrayhand; /* physical mem array handle */ + uint8_t width; /* num of dev in row */ + uint64_t xsaddr; /* start phys addr in bytes */ + uint64_t xeaddr; /* end phys addr in bytes */ +} __packed; + +/* + * System Boot Information + */ +#define SMBIOS_BOOT_NORMAL 0 /* no errors detected */ + +struct smbios_table_type32 { + struct smbios_structure header; + uint8_t reserved[6]; + uint8_t status; /* boot status */ +} __packed; + +/* + * End-of-Table + */ +struct smbios_table_type127 { + struct smbios_structure header; +} __packed; + +struct smbios_table_type0 smbios_type0_template = { + { SMBIOS_TYPE_BIOS, sizeof (struct smbios_table_type0), 0 }, + 1, /* bios vendor string */ + 2, /* bios version string */ + 0xF000, /* bios address segment location */ + 3, /* bios release date */ + 0x0, /* bios size (64k * (n + 1) is the size in bytes) */ + SMBIOS_FL_ISA | SMBIOS_FL_PCI | SMBIOS_FL_SHADOW | + SMBIOS_FL_CDBOOT | SMBIOS_FL_EDD, + { SMBIOS_XB1_FL_ACPI, SMBIOS_XB2_FL_BBS | SMBIOS_XB2_FL_VM }, + 0x0, /* bios major release */ + 0x0, /* bios minor release */ + 0xff, /* embedded controller firmware major release */ + 0xff /* embedded controller firmware minor release */ +}; + +const char *smbios_type0_strings[] = { + "BHYVE", /* vendor string */ + __TIME__, /* bios version string */ + __DATE__, /* bios release date string */ + NULL +}; + +struct smbios_table_type1 smbios_type1_template = { + { SMBIOS_TYPE_SYSTEM, sizeof (struct smbios_table_type1), 0 }, + 1, /* manufacturer string */ + 2, /* product string */ + 3, /* version string */ + 4, /* serial number string */ + { 0 }, + SMBIOS_WAKEUP_SWITCH, + 5, /* sku string */ + 6 /* family string */ +}; + +static int smbios_type1_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size); + +const char *smbios_type1_strings[] = { + " ", /* manufacturer string */ + "BHYVE", /* product name string */ + "1.0", /* version string */ + "None", /* serial number string */ + "None", /* sku string */ + " ", /* family name string */ + NULL +}; + +struct smbios_table_type3 smbios_type3_template = { + { SMBIOS_TYPE_CHASSIS, sizeof (struct smbios_table_type3), 0 }, + 1, /* manufacturer string */ + SMBIOS_CHT_UNKNOWN, + 2, /* version string */ + 3, /* serial number string */ + 4, /* asset tag string */ + SMBIOS_CHST_SAFE, + SMBIOS_CHST_SAFE, + SMBIOS_CHST_SAFE, + SMBIOS_CHSC_NONE, + 0, /* height in 'u's (0=enclosure height unspecified) */ + 0, /* number of power cords (0=number unspecified) */ + 0, /* number of contained element records */ + 0, /* length of records 
*/ + 5 /* sku number string */ +}; + +const char *smbios_type3_strings[] = { + " ", /* manufacturer string */ + "1.0", /* version string */ + "None", /* serial number string */ + "None", /* asset tag string */ + "None", /* sku number string */ + NULL +}; + +struct smbios_table_type4 smbios_type4_template = { + { SMBIOS_TYPE_PROCESSOR, sizeof (struct smbios_table_type4), 0 }, + 1, /* socket designation string */ + SMBIOS_PRT_CENTRAL, + SMBIOS_PRF_OTHER, + 2, /* manufacturer string */ + 0, /* cpuid */ + 3, /* version string */ + 0, /* voltage */ + 0, /* external clock frequency in mhz (0=unknown) */ + 0, /* maximum frequency in mhz (0=unknown) */ + 0, /* current frequency in mhz (0=unknown) */ + SMBIOS_PRS_PRESENT | SMBIOS_PRS_ENABLED, + SMBIOS_PRU_NONE, + -1, /* l1 cache handle */ + -1, /* l2 cache handle */ + -1, /* l3 cache handle */ + 4, /* serial number string */ + 5, /* asset tag string */ + 6, /* part number string */ + 0, /* cores per socket (0=unknown) */ + 0, /* enabled cores per socket (0=unknown) */ + 0, /* threads per socket (0=unknown) */ + SMBIOS_PFL_64B, + SMBIOS_PRF_OTHER +}; + +const char *smbios_type4_strings[] = { + " ", /* socket designation string */ + " ", /* manufacturer string */ + " ", /* version string */ + "None", /* serial number string */ + "None", /* asset tag string */ + "None", /* part number string */ + NULL +}; + +static int smbios_type4_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size); + +struct smbios_table_type16 smbios_type16_template = { + { SMBIOS_TYPE_MEMARRAY, sizeof (struct smbios_table_type16), 0 }, + SMBIOS_MAL_SYSMB, + SMBIOS_MAU_SYSTEM, + SMBIOS_MAE_NONE, + 0x80000000, /* max mem capacity in kb (0x80000000=use extended) */ + -1, /* handle of error (if any) */ + 0, /* number of slots or sockets (TBD) */ + 0 /* extended maximum memory capacity in bytes (TBD) */ +}; + +static int smbios_type16_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size); + +struct smbios_table_type17 smbios_type17_template = { + { SMBIOS_TYPE_MEMDEVICE, sizeof (struct smbios_table_type17), 0 }, + -1, /* handle of physical memory array */ + -1, /* handle of memory error data */ + 64, /* total width in bits including ecc */ + 64, /* data width in bits */ + 0x7fff, /* size in bytes (0x7fff=use extended)*/ + SMBIOS_MDFF_UNKNOWN, + 0, /* set (0x00=none, 0xff=unknown) */ + 1, /* device locator string */ + 2, /* physical bank locator string */ + SMBIOS_MDT_UNKNOWN, + SMBIOS_MDF_UNKNOWN, + 0, /* maximum memory speed in mhz (0=unknown) */ + 3, /* manufacturer string */ + 4, /* serial number string */ + 5, /* asset tag string */ + 6, /* part number string */ + 0, /* attributes (0=unknown rank information) */ + 0, /* extended size in mb (TBD) */ + 0, /* current speed in mhz (0=unknown) */ + 0, /* minimum voltage in mv (0=unknown) */ + 0, /* maximum voltage in mv (0=unknown) */ + 0 /* configured voltage in mv (0=unknown) */ +}; + +const char *smbios_type17_strings[] = { + " ", /* device locator string */ + " ", /* physical bank locator string */ + " ", /* manufacturer string */ + "None", /* serial number string */ + "None", /* asset tag string */ + "None", /* part number string */ + NULL +}; + +static int smbios_type17_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size); + +struct 
smbios_table_type19 smbios_type19_template = { + { SMBIOS_TYPE_MEMARRAYMAP, sizeof (struct smbios_table_type19), 0 }, + 0xffffffff, /* starting phys addr in kb (0xffffffff=use ext) */ + 0xffffffff, /* ending phys addr in kb (0xffffffff=use ext) */ + -1, /* physical memory array handle */ + 1, /* number of devices that form a row */ + 0, /* extended starting phys addr in bytes (TDB) */ + 0 /* extended ending phys addr in bytes (TDB) */ +}; + +static int smbios_type19_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size); + +struct smbios_table_type32 smbios_type32_template = { + { SMBIOS_TYPE_BOOT, sizeof (struct smbios_table_type32), 0 }, + { 0, 0, 0, 0, 0, 0 }, + SMBIOS_BOOT_NORMAL +}; + +struct smbios_table_type127 smbios_type127_template = { + { SMBIOS_TYPE_EOT, sizeof (struct smbios_table_type127), 0 } +}; + +static int smbios_generic_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size); + +static struct smbios_template_entry smbios_template[] = { + { (struct smbios_structure *)&smbios_type0_template, + smbios_type0_strings, + smbios_generic_initializer }, + { (struct smbios_structure *)&smbios_type1_template, + smbios_type1_strings, + smbios_type1_initializer }, + { (struct smbios_structure *)&smbios_type3_template, + smbios_type3_strings, + smbios_generic_initializer }, + { (struct smbios_structure *)&smbios_type4_template, + smbios_type4_strings, + smbios_type4_initializer }, + { (struct smbios_structure *)&smbios_type16_template, + NULL, + smbios_type16_initializer }, + { (struct smbios_structure *)&smbios_type17_template, + smbios_type17_strings, + smbios_type17_initializer }, + { (struct smbios_structure *)&smbios_type19_template, + NULL, + smbios_type19_initializer }, + { (struct smbios_structure *)&smbios_type32_template, + NULL, + smbios_generic_initializer }, + { (struct smbios_structure *)&smbios_type127_template, + NULL, + smbios_generic_initializer }, + { NULL,NULL, NULL } +}; + +static uint64_t guest_lomem, guest_himem; +static uint16_t type16_handle; + +static int +smbios_generic_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size) +{ + struct smbios_structure *entry; + + memcpy(curaddr, template_entry, template_entry->length); + entry = (struct smbios_structure *)curaddr; + entry->handle = *n + 1; + curaddr += entry->length; + if (template_strings != NULL) { + int i; + + for (i = 0; template_strings[i] != NULL; i++) { + const char *string; + int len; + + string = template_strings[i]; + len = strlen(string) + 1; + memcpy(curaddr, string, len); + curaddr += len; + } + *curaddr = '\0'; + curaddr++; + } else { + /* Minimum string section is double nul */ + *curaddr = '\0'; + curaddr++; + *curaddr = '\0'; + curaddr++; + } + (*n)++; + *endaddr = curaddr; + + return (0); +} + +static int +smbios_type1_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size) +{ + struct smbios_table_type1 *type1; + + smbios_generic_initializer(template_entry, template_strings, + curaddr, endaddr, n, size); + type1 = (struct smbios_table_type1 *)curaddr; + + if (guest_uuid_str != NULL) { + uuid_t uuid; + uint32_t status; + + uuid_from_string(guest_uuid_str, &uuid, &status); + if (status != uuid_s_ok) + return (-1); + + 
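		/*
		 * Editor's note: SMBIOS (2.6 and later) stores the first
		 * three UUID fields little-endian, so the parsed uuid_t is
		 * encoded here rather than copied byte-for-byte.
		 */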
uuid_enc_le(&type1->uuid, &uuid); + } else { + MD5_CTX mdctx; + u_char digest[16]; + char hostname[MAXHOSTNAMELEN]; + + /* + * Universally unique and yet reproducible are an + * oxymoron, however reproducible is desirable in + * this case. + */ + if (gethostname(hostname, sizeof(hostname))) + return (-1); + + MD5Init(&mdctx); + MD5Update(&mdctx, vmname, strlen(vmname)); + MD5Update(&mdctx, hostname, sizeof(hostname)); + MD5Final(digest, &mdctx); + + /* + * Set the variant and version number. + */ + digest[6] &= 0x0F; + digest[6] |= 0x30; /* version 3 */ + digest[8] &= 0x3F; + digest[8] |= 0x80; + + memcpy(&type1->uuid, digest, sizeof (digest)); + } + + return (0); +} + +static int +smbios_type4_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size) +{ + int i; + + for (i = 0; i < guest_ncpus; i++) { + struct smbios_table_type4 *type4; + char *p; + int nstrings, len; + + smbios_generic_initializer(template_entry, template_strings, + curaddr, endaddr, n, size); + type4 = (struct smbios_table_type4 *)curaddr; + p = curaddr + sizeof (struct smbios_table_type4); + nstrings = 0; + while (p < *endaddr - 1) { + if (*p++ == '\0') + nstrings++; + } + len = sprintf(*endaddr - 1, "CPU #%d", i) + 1; + *endaddr += len - 1; + *(*endaddr) = '\0'; + (*endaddr)++; + type4->socket = nstrings + 1; + curaddr = *endaddr; + } + + return (0); +} + +static int +smbios_type16_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size) +{ + struct smbios_table_type16 *type16; + + type16_handle = *n; + smbios_generic_initializer(template_entry, template_strings, + curaddr, endaddr, n, size); + type16 = (struct smbios_table_type16 *)curaddr; + type16->xsize = guest_lomem + guest_himem; + type16->ndevs = guest_himem > 0 ? 
2 : 1; + + return (0); +} + +static int +smbios_type17_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size) +{ + struct smbios_table_type17 *type17; + + smbios_generic_initializer(template_entry, template_strings, + curaddr, endaddr, n, size); + type17 = (struct smbios_table_type17 *)curaddr; + type17->arrayhand = type16_handle; + type17->xsize = guest_lomem; + + if (guest_himem > 0) { + curaddr = *endaddr; + smbios_generic_initializer(template_entry, template_strings, + curaddr, endaddr, n, size); + type17 = (struct smbios_table_type17 *)curaddr; + type17->arrayhand = type16_handle; + type17->xsize = guest_himem; + } + + return (0); +} + +static int +smbios_type19_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size) +{ + struct smbios_table_type19 *type19; + + smbios_generic_initializer(template_entry, template_strings, + curaddr, endaddr, n, size); + type19 = (struct smbios_table_type19 *)curaddr; + type19->arrayhand = type16_handle; + type19->xsaddr = 0; + type19->xeaddr = guest_lomem; + + if (guest_himem > 0) { + curaddr = *endaddr; + smbios_generic_initializer(template_entry, template_strings, + curaddr, endaddr, n, size); + type19 = (struct smbios_table_type19 *)curaddr; + type19->arrayhand = type16_handle; + type19->xsaddr = 4*GB; + type19->xeaddr = guest_himem; + } + + return (0); +} + +static void +smbios_ep_initializer(struct smbios_entry_point *smbios_ep, uint32_t staddr) +{ + memset(smbios_ep, 0, sizeof(*smbios_ep)); + memcpy(smbios_ep->eanchor, SMBIOS_ENTRY_EANCHOR, + SMBIOS_ENTRY_EANCHORLEN); + smbios_ep->eplen = 0x1F; + assert(sizeof (struct smbios_entry_point) == smbios_ep->eplen); + smbios_ep->major = 2; + smbios_ep->minor = 6; + smbios_ep->revision = 0; + memcpy(smbios_ep->ianchor, SMBIOS_ENTRY_IANCHOR, + SMBIOS_ENTRY_IANCHORLEN); + smbios_ep->staddr = staddr; + smbios_ep->bcdrev = 0x24; +} + +static void +smbios_ep_finalizer(struct smbios_entry_point *smbios_ep, uint16_t len, + uint16_t num, uint16_t maxssize) +{ + uint8_t checksum; + int i; + + smbios_ep->maxssize = maxssize; + smbios_ep->stlen = len; + smbios_ep->stnum = num; + + checksum = 0; + for (i = 0x10; i < 0x1f; i++) { + checksum -= ((uint8_t *)smbios_ep)[i]; + } + smbios_ep->ichecksum = checksum; + + checksum = 0; + for (i = 0; i < 0x1f; i++) { + checksum -= ((uint8_t *)smbios_ep)[i]; + } + smbios_ep->echecksum = checksum; +} + +int +smbios_build(struct vmctx *ctx) +{ + struct smbios_entry_point *smbios_ep; + uint16_t n; + uint16_t maxssize; + char *curaddr, *startaddr, *ststartaddr; + int i; + int err; + + guest_lomem = vm_get_lowmem_size(ctx); + guest_himem = vm_get_highmem_size(ctx); + + startaddr = paddr_guest2host(ctx, SMBIOS_BASE, SMBIOS_MAX_LENGTH); + if (startaddr == NULL) { + fprintf(stderr, "smbios table requires mapped mem\n"); + return (ENOMEM); + } + + curaddr = startaddr; + + smbios_ep = (struct smbios_entry_point *)curaddr; + smbios_ep_initializer(smbios_ep, SMBIOS_BASE + + sizeof(struct smbios_entry_point)); + curaddr += sizeof(struct smbios_entry_point); + ststartaddr = curaddr; + + n = 0; + maxssize = 0; + for (i = 0; smbios_template[i].entry != NULL; i++) { + struct smbios_structure *entry; + const char **strings; + initializer_func_t initializer; + char *endaddr; + uint16_t size; + + entry = smbios_template[i].entry; + strings = smbios_template[i].strings; + initializer = smbios_template[i].initializer; 
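		/*
		 * Editor's note: each initializer copies its template into
		 * guest memory at curaddr, appends its string-set (or the
		 * mandatory double NUL when it has no strings), advances
		 * the running handle count in n, and reports the end of
		 * what it wrote back through endaddr.
		 */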
+ + err = (*initializer)(entry, strings, curaddr, &endaddr, + &n, &size); + if (err != 0) + return (err); + + if (size > maxssize) + maxssize = size; + + curaddr = endaddr; + } + + assert(curaddr - startaddr < SMBIOS_MAX_LENGTH); + smbios_ep_finalizer(smbios_ep, curaddr - ststartaddr, n, maxssize); + + return (0); +} diff --git a/usr/src/cmd/bhyve/smbiostbl.h b/usr/src/cmd/bhyve/smbiostbl.h new file mode 100644 index 0000000000..fd7f86be80 --- /dev/null +++ b/usr/src/cmd/bhyve/smbiostbl.h @@ -0,0 +1,36 @@ +/*- + * Copyright (c) 2014 Tycho Nightingale + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/smbiostbl.h 262744 2014-03-04 17:12:06Z tychon $ + */ + +#ifndef _SMBIOSTBL_H_ +#define _SMBIOSTBL_H_ + +struct vmctx; + +int smbios_build(struct vmctx *ctx); + +#endif /* _SMBIOSTBL_H_ */ diff --git a/usr/src/cmd/bhyve/spinup_ap.c b/usr/src/cmd/bhyve/spinup_ap.c new file mode 100644 index 0000000000..e1dd562d3f --- /dev/null +++ b/usr/src/cmd/bhyve/spinup_ap.c @@ -0,0 +1,104 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
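
[Editor's note -- not part of the patch] smbios_ep_finalizer() above chooses echecksum and ichecksum so that each covered byte range sums to zero modulo 256, which is how consumers validate the entry point. A self-check sketch under that assumption; verify_smbios_checksums() is hypothetical:

#include <assert.h>
#include <stdint.h>

static void
verify_smbios_checksums(const uint8_t *ep)
{
        uint8_t sum;
        int i;

        /* The full 0x1f-byte "_SM_" entry point sums to zero. */
        for (sum = 0, i = 0; i < 0x1f; i++)
                sum += ep[i];
        assert(sum == 0);

        /* The "_DMI_" region (bytes 0x10..0x1e) also sums to zero. */
        for (sum = 0, i = 0x10; i < 0x1f; i++)
                sum += ep[i];
        assert(sum == 0);
}
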
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/spinup_ap.c 263432 2014-03-20 18:15:37Z neel $ + */ + +#include +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/spinup_ap.c 263432 2014-03-20 18:15:37Z neel $"); + +#include +#include + +#include +#include + +#include +#include +#include + +#include "bhyverun.h" +#include "spinup_ap.h" + +static void +spinup_ap_realmode(struct vmctx *ctx, int newcpu, uint64_t *rip) +{ + int vector, error; + uint16_t cs; + uint64_t desc_base; + uint32_t desc_limit, desc_access; + + vector = *rip >> PAGE_SHIFT; + *rip = 0; + + /* + * Update the %cs and %rip of the guest so that it starts + * executing real mode code at 'vector << 12'. + */ + error = vm_set_register(ctx, newcpu, VM_REG_GUEST_RIP, *rip); + assert(error == 0); + + error = vm_get_desc(ctx, newcpu, VM_REG_GUEST_CS, &desc_base, + &desc_limit, &desc_access); + assert(error == 0); + + desc_base = vector << PAGE_SHIFT; + error = vm_set_desc(ctx, newcpu, VM_REG_GUEST_CS, + desc_base, desc_limit, desc_access); + assert(error == 0); + + cs = (vector << PAGE_SHIFT) >> 4; + error = vm_set_register(ctx, newcpu, VM_REG_GUEST_CS, cs); + assert(error == 0); +} + +int +spinup_ap(struct vmctx *ctx, int vcpu, int newcpu, uint64_t rip) +{ + int error; + + assert(newcpu != 0); + assert(newcpu < guest_ncpus); + + error = vcpu_reset(ctx, newcpu); + assert(error == 0); + + fbsdrun_set_capabilities(ctx, newcpu); + + /* + * Enable the 'unrestricted guest' mode for 'newcpu'. + * + * Set up the processor state in power-on 16-bit mode, with the CS:IP + * init'd to the specified low-mem 4K page. + */ + error = vm_set_capability(ctx, newcpu, VM_CAP_UNRESTRICTED_GUEST, 1); + assert(error == 0); + + spinup_ap_realmode(ctx, newcpu, &rip); + + fbsdrun_addcpu(ctx, vcpu, newcpu, rip); + + return (newcpu); +} diff --git a/usr/src/cmd/bhyve/spinup_ap.h b/usr/src/cmd/bhyve/spinup_ap.h new file mode 100644 index 0000000000..090de091ba --- /dev/null +++ b/usr/src/cmd/bhyve/spinup_ap.h @@ -0,0 +1,34 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
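
[Editor's note -- not part of the patch] spinup_ap_realmode() above derives the AP's real-mode start point from the SIPI vector: the vector selects a low-memory 4 KB page, the hidden CS base becomes vector << 12, the visible selector becomes that base divided by 16, and %rip is zeroed. A standalone sketch of the arithmetic; sipi_to_realmode() is illustrative only:

#include <stdint.h>

static void
sipi_to_realmode(uint64_t rip, uint16_t *cs, uint64_t *cs_base)
{
        int vector = rip >> 12;                 /* PAGE_SHIFT == 12 */

        *cs_base = (uint64_t)vector << 12;      /* hidden CS segment base */
        *cs = (uint16_t)((vector << 12) >> 4);  /* selector == base / 16 */
        /* %rip is reset to 0, so the AP begins at CS base + 0 */
}

With rip == 0x9000 this yields vector 9, CS base 0x9000, and selector 0x0900, i.e. the AP starts executing at 0900:0000.
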
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/spinup_ap.h 240912 2012-09-25 02:33:25Z neel $ + */ + +#ifndef _SPINUP_AP_H_ +#define _SPINUP_AP_H_ + +int spinup_ap(struct vmctx *ctx, int vcpu, int newcpu, uint64_t rip); + +#endif diff --git a/usr/src/cmd/bhyve/uart_emul.c b/usr/src/cmd/bhyve/uart_emul.c new file mode 100644 index 0000000000..a8b5d40356 --- /dev/null +++ b/usr/src/cmd/bhyve/uart_emul.c @@ -0,0 +1,1042 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * Copyright (c) 2013 Neel Natu + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/uart_emul.c 257293 2013-10-29 00:18:11Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. 
+ */ + +#include +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/uart_emul.c 257293 2013-10-29 00:18:11Z neel $"); + +#include +#include + +#ifndef __FreeBSD__ +#include +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#ifndef __FreeBSD__ +#include +#include +#include +#endif + +#ifndef __FreeBSD__ +#include + +#include "bhyverun.h" +#endif +#ifdef __FreeBSD__ +#include "mevent.h" +#endif +#include "uart_emul.h" + +#define COM1_BASE 0x3F8 +#define COM1_IRQ 4 +#define COM2_BASE 0x2F8 +#define COM2_IRQ 3 + +#define DEFAULT_RCLK 1843200 +#define DEFAULT_BAUD 9600 + +#define FCR_RX_MASK 0xC0 + +#define MCR_OUT1 0x04 +#define MCR_OUT2 0x08 + +#define MSR_DELTA_MASK 0x0f + +#ifndef REG_SCR +#define REG_SCR com_scr +#endif + +#define FIFOSZ 16 + +static bool uart_stdio; /* stdio in use for i/o */ +#ifndef __FreeBSD__ +static bool uart_bcons; /* bhyveconsole in use for i/o */ +#endif + +static struct { + int baseaddr; + int irq; + bool inuse; +} uart_lres[] = { + { COM1_BASE, COM1_IRQ, false}, + { COM2_BASE, COM2_IRQ, false}, +}; + +#define UART_NLDEVS (sizeof(uart_lres) / sizeof(uart_lres[0])) + +struct fifo { + uint8_t buf[FIFOSZ]; + int rindex; /* index to read from */ + int windex; /* index to write to */ + int num; /* number of characters in the fifo */ + int size; /* size of the fifo */ +}; + +struct uart_softc { + pthread_mutex_t mtx; /* protects all softc elements */ + uint8_t data; /* Data register (R/W) */ + uint8_t ier; /* Interrupt enable register (R/W) */ + uint8_t lcr; /* Line control register (R/W) */ + uint8_t mcr; /* Modem control register (R/W) */ + uint8_t lsr; /* Line status register (R/W) */ + uint8_t msr; /* Modem status register (R/W) */ + uint8_t fcr; /* FIFO control register (W) */ + uint8_t scr; /* Scratch register (R/W) */ + + uint8_t dll; /* Baudrate divisor latch LSB */ + uint8_t dlh; /* Baudrate divisor latch MSB */ + + struct fifo rxfifo; + + bool opened; + bool stdio; +#ifndef __FreeBSD__ + bool bcons; + struct { + pid_t clipid; + int clifd; /* console client unix domain socket */ + int servfd; /* console server unix domain socket */ + } usc_bcons; +#endif + + bool thre_int_pending; /* THRE interrupt pending */ + + void *arg; + uart_intr_func_t intr_assert; + uart_intr_func_t intr_deassert; +}; + +#ifdef __FreeBSD__ +static void uart_drain(int fd, enum ev_type ev, void *arg); +#else +static void uart_tty_drain(struct uart_softc *sc); +static int uart_bcons_drain(struct uart_softc *sc); +#endif + +static struct termios tio_orig, tio_new; /* I/O Terminals */ + +static void +ttyclose(void) +{ + + tcsetattr(STDIN_FILENO, TCSANOW, &tio_orig); +} + +static void +ttyopen(void) +{ + + tcgetattr(STDIN_FILENO, &tio_orig); + + tio_new = tio_orig; + cfmakeraw(&tio_new); + tcsetattr(STDIN_FILENO, TCSANOW, &tio_new); + + atexit(ttyclose); +} + +static bool +tty_char_available(void) +{ + fd_set rfds; + struct timeval tv; + + FD_ZERO(&rfds); + FD_SET(STDIN_FILENO, &rfds); + tv.tv_sec = 0; + tv.tv_usec = 0; + if (select(STDIN_FILENO + 1, &rfds, NULL, NULL, &tv) > 0 ) { + return (true); + } else { + return (false); + } +} + +static int +ttyread(void) +{ + char rb; + + if (tty_char_available()) { + read(STDIN_FILENO, &rb, 1); + return (rb & 0xff); + } else { + return (-1); + } +} + +static void +ttywrite(unsigned char wb) +{ + + (void)write(STDIN_FILENO, &wb, 1); +} + +#ifndef __FreeBSD__ +static void +bconswrite(struct uart_softc *sc, unsigned char wb) +{ + (void) write(sc->usc_bcons.clifd, &wb, 1); +} +#endif + +static void 
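+/*
+ * The receive FIFO below is a plain ring buffer: windex chases rindex
+ * modulo 'size', with 'num' tracking occupancy so that the full and
+ * empty states are distinguished without sacrificing a slot. A sketch
+ * of the intended use, modeled on the drain routines later in this
+ * file (illustrative only, not code from the original source):
+ *
+ *	int ch;
+ *
+ *	fifo_reset(&sc->rxfifo, FIFOSZ);
+ *	while (fifo_available(&sc->rxfifo) && (ch = ttyread()) != -1)
+ *		(void) fifo_putchar(&sc->rxfifo, ch);
+ *	while (fifo_numchars(&sc->rxfifo) > 0)
+ *		(void) fifo_getchar(&sc->rxfifo);
+ */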
+fifo_reset(struct fifo *fifo, int size)
+{
+	bzero(fifo, sizeof(struct fifo));
+	fifo->size = size;
+}
+
+static int
+fifo_putchar(struct fifo *fifo, uint8_t ch)
+{
+
+	if (fifo->num < fifo->size) {
+		fifo->buf[fifo->windex] = ch;
+		fifo->windex = (fifo->windex + 1) % fifo->size;
+		fifo->num++;
+		return (0);
+	} else
+		return (-1);
+}
+
+static int
+fifo_getchar(struct fifo *fifo)
+{
+	int c;
+
+	if (fifo->num > 0) {
+		c = fifo->buf[fifo->rindex];
+		fifo->rindex = (fifo->rindex + 1) % fifo->size;
+		fifo->num--;
+		return (c);
+	} else
+		return (-1);
+}
+
+static int
+fifo_numchars(struct fifo *fifo)
+{
+
+	return (fifo->num);
+}
+
+static int
+fifo_available(struct fifo *fifo)
+{
+
+	return (fifo->num < fifo->size);
+}
+
+static void
+uart_opentty(struct uart_softc *sc)
+{
+#ifdef	__FreeBSD__
+	struct mevent *mev;
+#endif
+
+	assert(!sc->opened && sc->stdio);
+
+	ttyopen();
+#ifdef	__FreeBSD__
+	mev = mevent_add(STDIN_FILENO, EVF_READ, uart_drain, sc);
+	assert(mev != NULL);
+#endif
+}
+
+/*
+ * The IIR returns a prioritized interrupt reason:
+ * - receive data available
+ * - transmit holding register empty
+ * - modem status change
+ *
+ * Return an interrupt reason if one is available.
+ */
+static int
+uart_intr_reason(struct uart_softc *sc)
+{
+
+	if ((sc->lsr & LSR_OE) != 0 && (sc->ier & IER_ERLS) != 0)
+		return (IIR_RLS);
+	else if (fifo_numchars(&sc->rxfifo) > 0 && (sc->ier & IER_ERXRDY) != 0)
+		return (IIR_RXTOUT);
+	else if (sc->thre_int_pending && (sc->ier & IER_ETXRDY) != 0)
+		return (IIR_TXRDY);
+	else if ((sc->msr & MSR_DELTA_MASK) != 0 && (sc->ier & IER_EMSC) != 0)
+		return (IIR_MLSC);
+	else
+		return (IIR_NOPEND);
+}
+
+static void
+uart_reset(struct uart_softc *sc)
+{
+	uint16_t divisor;
+
+	divisor = DEFAULT_RCLK / DEFAULT_BAUD / 16;
+	sc->dll = divisor;
+	sc->dlh = divisor >> 8;
+
+	fifo_reset(&sc->rxfifo, 1);	/* no fifo until enabled by software */
+}
+
+/*
+ * Toggle the COM port's intr pin depending on whether or not we have an
+ * interrupt condition to report to the processor.
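+ *
+ * The pin is level-triggered: every register access recomputes the
+ * highest-priority pending reason via uart_intr_reason() and asserts
+ * or deasserts the line to match. A caller therefore only updates the
+ * softc state and then invokes this routine with the lock held, e.g.
+ * (illustrative sequence, mirroring the REG_DATA write path below):
+ *
+ *	sc->thre_int_pending = true;
+ *	uart_toggle_intr(sc);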
+ */ +static void +uart_toggle_intr(struct uart_softc *sc) +{ + uint8_t intr_reason; + + intr_reason = uart_intr_reason(sc); + + if (intr_reason == IIR_NOPEND) + (*sc->intr_deassert)(sc->arg); + else + (*sc->intr_assert)(sc->arg); +} + +#ifdef __FreeBSD__ +static void +uart_drain(int fd, enum ev_type ev, void *arg) +{ + struct uart_softc *sc; + int ch; + + sc = arg; + + assert(fd == STDIN_FILENO); + assert(ev == EVF_READ); + + /* + * This routine is called in the context of the mevent thread + * to take out the softc lock to protect against concurrent + * access from a vCPU i/o exit + */ + pthread_mutex_lock(&sc->mtx); + + if ((sc->mcr & MCR_LOOPBACK) != 0) { + (void) ttyread(); + } else { + while (fifo_available(&sc->rxfifo) && + ((ch = ttyread()) != -1)) { + fifo_putchar(&sc->rxfifo, ch); + } + uart_toggle_intr(sc); + } + + pthread_mutex_unlock(&sc->mtx); +} +#else +static void +uart_tty_drain(struct uart_softc *sc) +{ + int ch; + + /* + * Take the softc lock to protect against concurrent + * access from a vCPU i/o exit + */ + pthread_mutex_lock(&sc->mtx); + + if ((sc->mcr & MCR_LOOPBACK) != 0) { + (void) ttyread(); + } else { + while (fifo_available(&sc->rxfifo) && + ((ch = ttyread()) != -1)) { + fifo_putchar(&sc->rxfifo, ch); + } + uart_toggle_intr(sc); + } + + pthread_mutex_unlock(&sc->mtx); +} + +static int +uart_bcons_drain(struct uart_softc *sc) +{ + char ch; + int nbytes; + int ret = 0; + + /* + * Take the softc lock to protect against concurrent + * access from a vCPU i/o exit + */ + pthread_mutex_lock(&sc->mtx); + + if ((sc->mcr & MCR_LOOPBACK) != 0) { + (void) read(sc->usc_bcons.clifd, &ch, 1); + } else { + for (;;) { + nbytes = read(sc->usc_bcons.clifd, &ch, 1); + if (nbytes == 0) { + ret = 1; + break; + } + if (nbytes == -1 && + errno != EINTR && errno != EAGAIN) { + ret = -1; + break; + } + if (nbytes == -1) { + break; + } + + if (fifo_available(&sc->rxfifo)) { + fifo_putchar(&sc->rxfifo, ch); + } + } + uart_toggle_intr(sc); + } + + pthread_mutex_unlock(&sc->mtx); + + return (ret); +} +#endif + +void +uart_write(struct uart_softc *sc, int offset, uint8_t value) +{ + int fifosz; + uint8_t msr; + + pthread_mutex_lock(&sc->mtx); + + /* Open terminal */ + if (!sc->opened && sc->stdio) { + uart_opentty(sc); + sc->opened = true; + } + + /* + * Take care of the special case DLAB accesses first + */ + if ((sc->lcr & LCR_DLAB) != 0) { + if (offset == REG_DLL) { + sc->dll = value; + goto done; + } + + if (offset == REG_DLH) { + sc->dlh = value; + goto done; + } + } + + switch (offset) { + case REG_DATA: + if (sc->mcr & MCR_LOOPBACK) { + if (fifo_putchar(&sc->rxfifo, value) != 0) + sc->lsr |= LSR_OE; + } else if (sc->stdio) { + ttywrite(value); +#ifndef __FreeBSD__ + } else if (sc->bcons) { + bconswrite(sc, value); +#endif + } /* else drop on floor */ + sc->thre_int_pending = true; + break; + case REG_IER: + /* + * Apply mask so that bits 4-7 are 0 + * Also enables bits 0-3 only if they're 1 + */ + sc->ier = value & 0x0F; + break; + case REG_FCR: + /* + * When moving from FIFO and 16450 mode and vice versa, + * the FIFO contents are reset. + */ + if ((sc->fcr & FCR_ENABLE) ^ (value & FCR_ENABLE)) { + fifosz = (value & FCR_ENABLE) ? FIFOSZ : 1; + fifo_reset(&sc->rxfifo, fifosz); + } + + /* + * The FCR_ENABLE bit must be '1' for the programming + * of other FCR bits to be effective. 
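+ *
+ * Illustrative guest accesses (an assumed I/O sequence, not code from
+ * this file):
+ *
+ *	outb(COM1_BASE + REG_FCR, FCR_ENABLE | FCR_RCV_RST);
+ *		-- 16-entry RX FIFO, flushed
+ *	outb(COM1_BASE + REG_FCR, 0);
+ *		-- back to 16450 mode, single-byte buffer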
+ */ + if ((value & FCR_ENABLE) == 0) { + sc->fcr = 0; + } else { + if ((value & FCR_RCV_RST) != 0) + fifo_reset(&sc->rxfifo, FIFOSZ); + + sc->fcr = value & + (FCR_ENABLE | FCR_DMA | FCR_RX_MASK); + } + break; + case REG_LCR: + sc->lcr = value; + break; + case REG_MCR: + /* Apply mask so that bits 5-7 are 0 */ + sc->mcr = value & 0x1F; + + msr = 0; + if (sc->mcr & MCR_LOOPBACK) { + /* + * In the loopback mode certain bits from the + * MCR are reflected back into MSR + */ + if (sc->mcr & MCR_RTS) + msr |= MSR_CTS; + if (sc->mcr & MCR_DTR) + msr |= MSR_DSR; + if (sc->mcr & MCR_OUT1) + msr |= MSR_RI; + if (sc->mcr & MCR_OUT2) + msr |= MSR_DCD; + } + + /* + * Detect if there has been any change between the + * previous and the new value of MSR. If there is + * then assert the appropriate MSR delta bit. + */ + if ((msr & MSR_CTS) ^ (sc->msr & MSR_CTS)) + sc->msr |= MSR_DCTS; + if ((msr & MSR_DSR) ^ (sc->msr & MSR_DSR)) + sc->msr |= MSR_DDSR; + if ((msr & MSR_DCD) ^ (sc->msr & MSR_DCD)) + sc->msr |= MSR_DDCD; + if ((sc->msr & MSR_RI) != 0 && (msr & MSR_RI) == 0) + sc->msr |= MSR_TERI; + + /* + * Update the value of MSR while retaining the delta + * bits. + */ + sc->msr &= MSR_DELTA_MASK; + sc->msr |= msr; + break; + case REG_LSR: + /* + * Line status register is not meant to be written to + * during normal operation. + */ + break; + case REG_MSR: + /* + * As far as I can tell MSR is a read-only register. + */ + break; + case REG_SCR: + sc->scr = value; + break; + default: + break; + } + +done: + uart_toggle_intr(sc); + pthread_mutex_unlock(&sc->mtx); +} + +uint8_t +uart_read(struct uart_softc *sc, int offset) +{ + uint8_t iir, intr_reason, reg; + + pthread_mutex_lock(&sc->mtx); + + /* Open terminal */ + if (!sc->opened && sc->stdio) { + uart_opentty(sc); + sc->opened = true; + } + + /* + * Take care of the special case DLAB accesses first + */ + if ((sc->lcr & LCR_DLAB) != 0) { + if (offset == REG_DLL) { + reg = sc->dll; + goto done; + } + + if (offset == REG_DLH) { + reg = sc->dlh; + goto done; + } + } + + switch (offset) { + case REG_DATA: + reg = fifo_getchar(&sc->rxfifo); + break; + case REG_IER: + reg = sc->ier; + break; + case REG_IIR: + iir = (sc->fcr & FCR_ENABLE) ? 
IIR_FIFO_MASK : 0; + + intr_reason = uart_intr_reason(sc); + + /* + * Deal with side effects of reading the IIR register + */ + if (intr_reason == IIR_TXRDY) + sc->thre_int_pending = false; + + iir |= intr_reason; + + reg = iir; + break; + case REG_LCR: + reg = sc->lcr; + break; + case REG_MCR: + reg = sc->mcr; + break; + case REG_LSR: + /* Transmitter is always ready for more data */ + sc->lsr |= LSR_TEMT | LSR_THRE; + + /* Check for new receive data */ + if (fifo_numchars(&sc->rxfifo) > 0) + sc->lsr |= LSR_RXRDY; + else + sc->lsr &= ~LSR_RXRDY; + + reg = sc->lsr; + + /* The LSR_OE bit is cleared on LSR read */ + sc->lsr &= ~LSR_OE; + break; + case REG_MSR: + /* + * MSR delta bits are cleared on read + */ + reg = sc->msr; + sc->msr &= ~MSR_DELTA_MASK; + break; + case REG_SCR: + reg = sc->scr; + break; + default: + reg = 0xFF; + break; + } + +done: + uart_toggle_intr(sc); + pthread_mutex_unlock(&sc->mtx); + + return (reg); +} + +#ifndef __FreeBSD__ +static void * +uart_tty_thread(void *param) +{ + struct uart_softc *sc = param; + pollfd_t pollset; + + pollset.fd = STDIN_FILENO; + pollset.events = POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND; + + for (;;) { + if (poll(&pollset, 1, -1) < 0) { + if (errno != EINTR) { + perror("poll failed"); + break; + } + continue; + } + uart_tty_drain(sc); + } + + return (NULL); +} + +/* + * Read the "ident" string from the client's descriptor; this routine also + * tolerates being called with pid=NULL, for times when you want to "eat" + * the ident string from a client without saving it. + */ +static int +get_client_ident(int clifd, pid_t *pid) +{ + char buf[BUFSIZ], *bufp; + size_t buflen = sizeof (buf); + char c = '\0'; + int i = 0, r; + + /* "eat up the ident string" case, for simplicity */ + if (pid == NULL) { + while (read(clifd, &c, 1) == 1) { + if (c == '\n') + return (0); + } + } + + bzero(buf, sizeof (buf)); + while ((buflen > 1) && (r = read(clifd, &c, 1)) == 1) { + buflen--; + if (c == '\n') + break; + + buf[i] = c; + i++; + } + if (r == -1) + return (-1); + + /* + * We've filled the buffer, but still haven't seen \n. Keep eating + * until we find it; we don't expect this to happen, but this is + * defensive. 
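+ *
+ * For reference, the expected handshake is a single text line of the
+ * form
+ *
+ *	IDENT <pid>\n
+ *
+ * which the server answers with "OK\n" on acceptance, or with the pid
+ * of the currently attached client as a NAK (see
+ * uart_bcons_accept_client() and uart_bcons_reject_client() below).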
+ */ + if (c != '\n') { + while ((r = read(clifd, &c, sizeof (c))) > 0) + if (c == '\n') + break; + } + + /* + * Parse buffer for message of the form: IDENT + */ + bufp = buf; + if (strncmp(bufp, "IDENT ", 6) != 0) + return (-1); + bufp += 6; + errno = 0; + *pid = strtoll(bufp, &bufp, 10); + if (errno != 0) + return (-1); + + return (0); +} + +static int +uart_bcons_accept_client(struct uart_softc *sc) +{ + int connfd; + struct sockaddr_un cliaddr; + socklen_t clilen; + pid_t pid; + + clilen = sizeof (cliaddr); + connfd = accept(sc->usc_bcons.servfd, + (struct sockaddr *)&cliaddr, &clilen); + if (connfd == -1) + return (-1); + if (get_client_ident(connfd, &pid) == -1) { + (void) shutdown(connfd, SHUT_RDWR); + (void) close(connfd); + return (-1); + } + + if (fcntl(connfd, F_SETFL, O_NONBLOCK) < 0) { + (void) shutdown(connfd, SHUT_RDWR); + (void) close(connfd); + return (-1); + } + (void) write(connfd, "OK\n", 3); + + sc->usc_bcons.clipid = pid; + sc->usc_bcons.clifd = connfd; + + printf("Connection from process ID %lu.\n", pid); + + return (0); +} + +static void +uart_bcons_reject_client(struct uart_softc *sc) +{ + int connfd; + struct sockaddr_un cliaddr; + socklen_t clilen; + char nak[MAXPATHLEN]; + + clilen = sizeof (cliaddr); + connfd = accept(sc->usc_bcons.servfd, + (struct sockaddr *)&cliaddr, &clilen); + + /* + * After hear its ident string, tell client to get lost. + */ + if (get_client_ident(connfd, NULL) == 0) { + (void) snprintf(nak, sizeof (nak), "%lu\n", + sc->usc_bcons.clipid); + (void) write(connfd, nak, strlen(nak)); + } + (void) shutdown(connfd, SHUT_RDWR); + (void) close(connfd); +} + +static int +uart_bcons_client_event(struct uart_softc *sc) +{ + int res; + + res = uart_bcons_drain(sc); + if (res < 0) + return (-1); + + if (res > 0) { + fprintf(stderr, "Closing connection with bhyve console\n"); + (void) shutdown(sc->usc_bcons.clifd, SHUT_RDWR); + (void) close(sc->usc_bcons.clifd); + sc->usc_bcons.clifd = -1; + } + + return (0); +} + +static void +uart_bcons_server_event(struct uart_softc *sc) +{ + int clifd; + + if (sc->usc_bcons.clifd != -1) { + /* we're already handling a client */ + uart_bcons_reject_client(sc); + return; + } + + if (uart_bcons_accept_client(sc) == 0) { + pthread_mutex_lock(&bcons_wait_lock); + bcons_connected = B_TRUE; + pthread_cond_signal(&bcons_wait_done); + pthread_mutex_unlock(&bcons_wait_lock); + } +} + +static void * +uart_bcons_thread(void *param) +{ + struct uart_softc *sc = param; + struct pollfd pollfds[2]; + int res; + + /* read from client and write to vm */ + pollfds[0].events = POLLIN | POLLRDNORM | POLLRDBAND | + POLLPRI | POLLERR | POLLHUP; + + /* the server socket; watch for events (new connections) */ + pollfds[1].events = pollfds[0].events; + + for (;;) { + pollfds[0].fd = sc->usc_bcons.clifd; + pollfds[1].fd = sc->usc_bcons.servfd; + pollfds[0].revents = pollfds[1].revents = 0; + + res = poll(pollfds, + sizeof (pollfds) / sizeof (struct pollfd), -1); + + if (res == -1 && errno != EINTR) { + perror("poll failed"); + /* we are hosed, close connection */ + break; + } + + /* event from client side */ + if (pollfds[0].revents) { + if (pollfds[0].revents & + (POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI)) { + if (uart_bcons_client_event(sc) < 0) + break; + } else { + break; + } + } + + /* event from server socket */ + if (pollfds[1].revents) { + if (pollfds[1].revents & (POLLIN | POLLRDNORM)) { + uart_bcons_server_event(sc); + } else { + break; + } + } + } + + if (sc->usc_bcons.clifd != -1) { + fprintf(stderr, "Closing connection with 
bhyve console\n");
+		(void) shutdown(sc->usc_bcons.clifd, SHUT_RDWR);
+		(void) close(sc->usc_bcons.clifd);
+		sc->usc_bcons.clifd = -1;
+	}
+
+	return (NULL);
+}
+
+static int
+init_bcons_sock(void)
+{
+	int servfd;
+	struct sockaddr_un servaddr;
+
+	if (mkdir(BHYVE_TMPDIR, S_IRWXU) < 0 && errno != EEXIST) {
+		fprintf(stderr, "bhyve console setup: "
+		    "could not mkdir %s: %s\n", BHYVE_TMPDIR,
+		    strerror(errno));
+		return (-1);
+	}
+
+	bzero(&servaddr, sizeof (servaddr));
+	servaddr.sun_family = AF_UNIX;
+	(void) snprintf(servaddr.sun_path, sizeof (servaddr.sun_path),
+	    BHYVE_CONS_SOCKPATH, vmname);
+
+	if ((servfd = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) {
+		fprintf(stderr, "bhyve console setup: "
+		    "could not create socket\n");
+		return (-1);
+	}
+	(void) unlink(servaddr.sun_path);
+
+	if (bind(servfd, (struct sockaddr *)&servaddr,
+	    sizeof (servaddr)) == -1) {
+		fprintf(stderr, "bhyve console setup: "
+		    "could not bind to socket\n");
+		goto out;
+	}
+
+	if (listen(servfd, 4) == -1) {
+		fprintf(stderr, "bhyve console setup: "
+		    "could not listen on socket\n");
+		goto out;
+	}
+	return (servfd);
+
+out:
+	(void) unlink(servaddr.sun_path);
+	(void) close(servfd);
+	return (-1);
+}
+#endif
+
+int
+uart_legacy_alloc(int which, int *baseaddr, int *irq)
+{
+
+	if (which < 0 || which >= UART_NLDEVS || uart_lres[which].inuse)
+		return (-1);
+
+	uart_lres[which].inuse = true;
+	*baseaddr = uart_lres[which].baseaddr;
+	*irq = uart_lres[which].irq;
+
+	return (0);
+}
+
+struct uart_softc *
+uart_init(uart_intr_func_t intr_assert, uart_intr_func_t intr_deassert,
+    void *arg)
+{
+	struct uart_softc *sc;
+
+	sc = malloc(sizeof(struct uart_softc));
+	bzero(sc, sizeof(struct uart_softc));
+
+	sc->arg = arg;
+	sc->intr_assert = intr_assert;
+	sc->intr_deassert = intr_deassert;
+
+	pthread_mutex_init(&sc->mtx, NULL);
+
+	uart_reset(sc);
+
+	return (sc);
+}
+
+int
+uart_set_backend(struct uart_softc *sc, const char *opts)
+{
+#ifndef	__FreeBSD__
+	int error;
+#endif
+	/*
+	 * XXX one stdio backend supported at this time.
+	 */
+	if (opts == NULL)
+		return (0);
+
+#ifdef	__FreeBSD__
+	if (strcmp("stdio", opts) == 0 && !uart_stdio) {
+		sc->stdio = true;
+		uart_stdio = true;
+		return (0);
+#else
+	if (strcmp("stdio", opts) == 0 && !uart_stdio && !uart_bcons) {
+		sc->stdio = true;
+		uart_stdio = true;
+
+		error = pthread_create(NULL, NULL, uart_tty_thread, sc);
+		assert(error == 0);
+
+		return (0);
+	} else if (strstr(opts, "bcons") != 0 && !uart_stdio && !uart_bcons) {
+		sc->bcons = true;
+		uart_bcons = true;
+
+		if (strstr(opts, "bcons,wait") != 0) {
+			bcons_wait = true;
+		}
+
+		sc->usc_bcons.clifd = -1;
+		if ((sc->usc_bcons.servfd = init_bcons_sock()) == -1) {
+			fprintf(stderr, "bhyve console setup: "
+			    "socket initialization failed\n");
+			return (-1);
+		}
+		error = pthread_create(NULL, NULL, uart_bcons_thread, sc);
+		assert(error == 0);
+
+		return (0);
+#endif
+	} else
+		return (-1);
+}
diff --git a/usr/src/cmd/bhyve/uart_emul.h b/usr/src/cmd/bhyve/uart_emul.h
new file mode 100644
index 0000000000..ecff957991
--- /dev/null
+++ b/usr/src/cmd/bhyve/uart_emul.h
@@ -0,0 +1,45 @@
+/*-
+ * Copyright (c) 2013 Neel Natu
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/uart_emul.h 257293 2013-10-29 00:18:11Z neel $ + */ + +#ifndef _UART_EMUL_H_ +#define _UART_EMUL_H_ + + +#define UART_IO_BAR_SIZE 8 + +struct uart_softc; + +typedef void (*uart_intr_func_t)(void *arg); +struct uart_softc *uart_init(uart_intr_func_t intr_assert, + uart_intr_func_t intr_deassert, void *arg); + +int uart_legacy_alloc(int unit, int *ioaddr, int *irq); +uint8_t uart_read(struct uart_softc *sc, int offset); +void uart_write(struct uart_softc *sc, int offset, uint8_t value); +int uart_set_backend(struct uart_softc *sc, const char *opt); +#endif diff --git a/usr/src/cmd/bhyve/vga.c b/usr/src/cmd/bhyve/vga.c new file mode 100644 index 0000000000..4330741042 --- /dev/null +++ b/usr/src/cmd/bhyve/vga.c @@ -0,0 +1,1289 @@ +/*- + * Copyright (c) 2015 Tycho Nightingale + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include "bhyvegc.h" +#include "console.h" +#include "inout.h" +#include "mem.h" +#include "vga.h" + +#define KB (1024UL) +#define MB (1024 * 1024UL) + +struct vga_softc { + struct mem_range mr; + + struct bhyvegc *gc; + int gc_width; + int gc_height; + struct bhyvegc_image *gc_image; + + uint8_t *vga_ram; + + /* + * General registers + */ + uint8_t vga_misc; + uint8_t vga_sts1; + + /* + * Sequencer + */ + struct { + int seq_index; + uint8_t seq_reset; + uint8_t seq_clock_mode; + int seq_cm_dots; + uint8_t seq_map_mask; + uint8_t seq_cmap_sel; + int seq_cmap_pri_off; + int seq_cmap_sec_off; + uint8_t seq_mm; + } vga_seq; + + /* + * CRT Controller + */ + struct { + int crtc_index; + uint8_t crtc_mode_ctrl; + uint8_t crtc_horiz_total; + uint8_t crtc_horiz_disp_end; + uint8_t crtc_start_horiz_blank; + uint8_t crtc_end_horiz_blank; + uint8_t crtc_start_horiz_retrace; + uint8_t crtc_end_horiz_retrace; + uint8_t crtc_vert_total; + uint8_t crtc_overflow; + uint8_t crtc_present_row_scan; + uint8_t crtc_max_scan_line; + uint8_t crtc_cursor_start; + uint8_t crtc_cursor_on; + uint8_t crtc_cursor_end; + uint8_t crtc_start_addr_high; + uint8_t crtc_start_addr_low; + uint16_t crtc_start_addr; + uint8_t crtc_cursor_loc_low; + uint8_t crtc_cursor_loc_high; + uint16_t crtc_cursor_loc; + uint8_t crtc_vert_retrace_start; + uint8_t crtc_vert_retrace_end; + uint8_t crtc_vert_disp_end; + uint8_t crtc_offset; + uint8_t crtc_underline_loc; + uint8_t crtc_start_vert_blank; + uint8_t crtc_end_vert_blank; + uint8_t crtc_line_compare; + } vga_crtc; + + /* + * Graphics Controller + */ + struct { + int gc_index; + uint8_t gc_set_reset; + uint8_t gc_enb_set_reset; + uint8_t gc_color_compare; + uint8_t gc_rotate; + uint8_t gc_op; + uint8_t gc_read_map_sel; + uint8_t gc_mode; + bool gc_mode_c4; /* chain 4 */ + bool gc_mode_oe; /* odd/even */ + uint8_t gc_mode_rm; /* read mode */ + uint8_t gc_mode_wm; /* write mode */ + uint8_t gc_misc; + uint8_t gc_misc_gm; /* graphics mode */ + uint8_t gc_misc_mm; /* memory map */ + uint8_t gc_color_dont_care; + uint8_t gc_bit_mask; + uint8_t gc_latch0; + uint8_t gc_latch1; + uint8_t gc_latch2; + uint8_t gc_latch3; + } vga_gc; + + /* + * Attribute Controller + */ + struct { + int atc_flipflop; + int atc_index; + uint8_t atc_palette[16]; + uint8_t atc_mode; + uint8_t atc_overscan_color; + uint8_t atc_color_plane_enb; + uint8_t atc_horiz_pixel_panning; + uint8_t atc_color_select; + uint8_t atc_color_select_45; + uint8_t atc_color_select_67; + } vga_atc; + + /* + * DAC + */ + struct { + uint8_t dac_state; + int dac_rd_index; + int dac_rd_subindex; + int dac_wr_index; + int dac_wr_subindex; + uint8_t dac_palette[3 * 256]; + uint32_t dac_palette_rgb[256]; + } vga_dac; +}; + +static bool +vga_in_reset(struct vga_softc *sc) +{ + return (((sc->vga_seq.seq_clock_mode & SEQ_CM_SO) != 0) || + ((sc->vga_seq.seq_reset & SEQ_RESET_ASYNC) == 0) || + ((sc->vga_seq.seq_reset & SEQ_RESET_SYNC) == 0) || + ((sc->vga_crtc.crtc_mode_ctrl & CRTC_MC_TE) == 0)); +} + +static void +vga_check_size(struct bhyvegc *gc, struct vga_softc *sc) +{ + int old_width, old_height; + + if (vga_in_reset(sc)) + return; + + old_width = sc->gc_width; + old_height = sc->gc_height; + + /* + * Horizontal Display End: For text modes this is the number + * of characters. For graphics modes this is the number of + * pixels per scanlines divided by the number of pixels per + * character clock. 
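+ *
+ * Vertical Display End is a 10-bit value whose bits 8 and 9 are kept
+ * in the overflow register and reassembled below. As a worked example
+ * (assumed register values for a conventional 80x25 text mode):
+ * crtc_horiz_disp_end = 79 with 9-dot characters gives
+ * (79 + 1) * 9 = 720 pixels across, and a vertical display end of 399
+ * gives 399 + 1 = 400 scan lines.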
+ */ + sc->gc_width = (sc->vga_crtc.crtc_horiz_disp_end + 1) * + sc->vga_seq.seq_cm_dots; + + sc->gc_height = (sc->vga_crtc.crtc_vert_disp_end | + (((sc->vga_crtc.crtc_overflow & CRTC_OF_VDE8) >> CRTC_OF_VDE8_SHIFT) << 8) | + (((sc->vga_crtc.crtc_overflow & CRTC_OF_VDE9) >> CRTC_OF_VDE9_SHIFT) << 9)) + 1; + + if (old_width != sc->gc_width || old_height != sc->gc_height) + bhyvegc_resize(gc, sc->gc_width, sc->gc_height); +} + +static uint32_t +vga_get_pixel(struct vga_softc *sc, int x, int y) +{ + int offset; + int bit; + uint8_t data; + uint8_t idx; + + offset = (y * sc->gc_width / 8) + (x / 8); + bit = 7 - (x % 8); + + data = (((sc->vga_ram[offset + 0 * 64*KB] >> bit) & 0x1) << 0) | + (((sc->vga_ram[offset + 1 * 64*KB] >> bit) & 0x1) << 1) | + (((sc->vga_ram[offset + 2 * 64*KB] >> bit) & 0x1) << 2) | + (((sc->vga_ram[offset + 3 * 64*KB] >> bit) & 0x1) << 3); + + data &= sc->vga_atc.atc_color_plane_enb; + + if (sc->vga_atc.atc_mode & ATC_MC_IPS) { + idx = sc->vga_atc.atc_palette[data] & 0x0f; + idx |= sc->vga_atc.atc_color_select_45; + } else { + idx = sc->vga_atc.atc_palette[data]; + } + idx |= sc->vga_atc.atc_color_select_67; + + return (sc->vga_dac.dac_palette_rgb[idx]); +} + +static void +vga_render_graphics(struct vga_softc *sc) +{ + int x, y; + + for (y = 0; y < sc->gc_height; y++) { + for (x = 0; x < sc->gc_width; x++) { + int offset; + + offset = y * sc->gc_width + x; + sc->gc_image->data[offset] = vga_get_pixel(sc, x, y); + } + } +} + +static uint32_t +vga_get_text_pixel(struct vga_softc *sc, int x, int y) +{ + int dots, offset, bit, font_offset; + uint8_t ch, attr, font; + uint8_t idx; + + dots = sc->vga_seq.seq_cm_dots; + + offset = 2 * sc->vga_crtc.crtc_start_addr; + offset += (y / 16 * sc->gc_width / dots) * 2 + (x / dots) * 2; + + bit = 7 - (x % dots); + + ch = sc->vga_ram[offset + 0 * 64*KB]; + attr = sc->vga_ram[offset + 1 * 64*KB]; + + if (sc->vga_crtc.crtc_cursor_on && + (offset == (sc->vga_crtc.crtc_cursor_loc * 2)) && + ((y % 16) >= (sc->vga_crtc.crtc_cursor_start & CRTC_CS_CS)) && + ((y % 16) <= (sc->vga_crtc.crtc_cursor_end & CRTC_CE_CE))) { + idx = sc->vga_atc.atc_palette[attr & 0xf]; + return (sc->vga_dac.dac_palette_rgb[idx]); + } + + if ((sc->vga_seq.seq_mm & SEQ_MM_EM) && + sc->vga_seq.seq_cmap_pri_off != sc->vga_seq.seq_cmap_sec_off) { + if (attr & 0x8) + font_offset = sc->vga_seq.seq_cmap_pri_off + + (ch << 5) + y % 16; + else + font_offset = sc->vga_seq.seq_cmap_sec_off + + (ch << 5) + y % 16; + attr &= ~0x8; + } else { + font_offset = (ch << 5) + y % 16; + } + + font = sc->vga_ram[font_offset + 2 * 64*KB]; + + if ((bit > 0) && (font & (1 << bit))) + idx = sc->vga_atc.atc_palette[attr & 0xf]; + else + idx = sc->vga_atc.atc_palette[attr >> 4]; + + return (sc->vga_dac.dac_palette_rgb[idx]); +} + +static void +vga_render_text(struct vga_softc *sc) +{ + int x, y; + + for (y = 0; y < sc->gc_height; y++) { + for (x = 0; x < sc->gc_width; x++) { + int offset; + + offset = y * sc->gc_width + x; + sc->gc_image->data[offset] = vga_get_text_pixel(sc, x, y); + } + } +} + +static void +vga_render(struct bhyvegc *gc, void *arg) +{ + struct vga_softc *sc = arg; + + vga_check_size(gc, sc); + + if (vga_in_reset(sc)) { + memset(sc->gc_image->data, 0, + sc->gc_image->width * sc->gc_image->height * + sizeof (uint32_t)); + return; + } + + if (sc->vga_gc.gc_misc_gm && (sc->vga_atc.atc_mode & ATC_MC_GA)) + vga_render_graphics(sc); + else + vga_render_text(sc); +} + +static uint64_t +vga_mem_rd_handler(struct vmctx *ctx, uint64_t addr, void *arg1) +{ + struct vga_softc *sc = arg1; + 
uint8_t map_sel; + int offset; + + offset = addr; + switch (sc->vga_gc.gc_misc_mm) { + case 0x0: + /* + * extended mode: base 0xa0000 size 128k + */ + offset -=0xa0000; + offset &= (128 * KB - 1); + break; + case 0x1: + /* + * EGA/VGA mode: base 0xa0000 size 64k + */ + offset -=0xa0000; + offset &= (64 * KB - 1); + break; + case 0x2: + /* + * monochrome text mode: base 0xb0000 size 32kb + */ + assert(0); + case 0x3: + /* + * color text mode and CGA: base 0xb8000 size 32kb + */ + offset -=0xb8000; + offset &= (32 * KB - 1); + break; + } + + /* Fill latches. */ + sc->vga_gc.gc_latch0 = sc->vga_ram[offset + 0*64*KB]; + sc->vga_gc.gc_latch1 = sc->vga_ram[offset + 1*64*KB]; + sc->vga_gc.gc_latch2 = sc->vga_ram[offset + 2*64*KB]; + sc->vga_gc.gc_latch3 = sc->vga_ram[offset + 3*64*KB]; + + if (sc->vga_gc.gc_mode_rm) { + /* read mode 1 */ + assert(0); + } + + map_sel = sc->vga_gc.gc_read_map_sel; + if (sc->vga_gc.gc_mode_oe) { + map_sel |= (offset & 1); + offset &= ~1; + } + + /* read mode 0: return the byte from the selected plane. */ + offset += map_sel * 64*KB; + + return (sc->vga_ram[offset]); +} + +static void +vga_mem_wr_handler(struct vmctx *ctx, uint64_t addr, uint8_t val, void *arg1) +{ + struct vga_softc *sc = arg1; + uint8_t c0, c1, c2, c3; + uint8_t m0, m1, m2, m3; + uint8_t set_reset; + uint8_t enb_set_reset; + uint8_t mask; + int offset; + + offset = addr; + switch (sc->vga_gc.gc_misc_mm) { + case 0x0: + /* + * extended mode: base 0xa0000 size 128kb + */ + offset -=0xa0000; + offset &= (128 * KB - 1); + break; + case 0x1: + /* + * EGA/VGA mode: base 0xa0000 size 64kb + */ + offset -=0xa0000; + offset &= (64 * KB - 1); + break; + case 0x2: + /* + * monochrome text mode: base 0xb0000 size 32kb + */ + assert(0); + case 0x3: + /* + * color text mode and CGA: base 0xb8000 size 32kb + */ + offset -=0xb8000; + offset &= (32 * KB - 1); + break; + } + + set_reset = sc->vga_gc.gc_set_reset; + enb_set_reset = sc->vga_gc.gc_enb_set_reset; + + c0 = sc->vga_gc.gc_latch0; + c1 = sc->vga_gc.gc_latch1; + c2 = sc->vga_gc.gc_latch2; + c3 = sc->vga_gc.gc_latch3; + + switch (sc->vga_gc.gc_mode_wm) { + case 0: + /* write mode 0 */ + mask = sc->vga_gc.gc_bit_mask; + + val = (val >> sc->vga_gc.gc_rotate) | + (val << (8 - sc->vga_gc.gc_rotate)); + + switch (sc->vga_gc.gc_op) { + case 0x00: /* replace */ + m0 = (set_reset & 1) ? mask : 0x00; + m1 = (set_reset & 2) ? mask : 0x00; + m2 = (set_reset & 4) ? mask : 0x00; + m3 = (set_reset & 8) ? mask : 0x00; + + c0 = (enb_set_reset & 1) ? (c0 & ~mask) : (val & mask); + c1 = (enb_set_reset & 2) ? (c1 & ~mask) : (val & mask); + c2 = (enb_set_reset & 4) ? (c2 & ~mask) : (val & mask); + c3 = (enb_set_reset & 8) ? (c3 & ~mask) : (val & mask); + + c0 |= m0; + c1 |= m1; + c2 |= m2; + c3 |= m3; + break; + case 0x08: /* AND */ + m0 = set_reset & 1 ? 0xff : ~mask; + m1 = set_reset & 2 ? 0xff : ~mask; + m2 = set_reset & 4 ? 0xff : ~mask; + m3 = set_reset & 8 ? 0xff : ~mask; + + c0 = enb_set_reset & 1 ? c0 & m0 : val & m0; + c1 = enb_set_reset & 2 ? c1 & m1 : val & m1; + c2 = enb_set_reset & 4 ? c2 & m2 : val & m2; + c3 = enb_set_reset & 8 ? c3 & m3 : val & m3; + break; + case 0x10: /* OR */ + m0 = set_reset & 1 ? mask : 0x00; + m1 = set_reset & 2 ? mask : 0x00; + m2 = set_reset & 4 ? mask : 0x00; + m3 = set_reset & 8 ? mask : 0x00; + + c0 = enb_set_reset & 1 ? c0 | m0 : val | m0; + c1 = enb_set_reset & 2 ? c1 | m1 : val | m1; + c2 = enb_set_reset & 4 ? c2 | m2 : val | m2; + c3 = enb_set_reset & 8 ? c3 | m3 : val | m3; + break; + case 0x18: /* XOR */ + m0 = set_reset & 1 ? 
mask : 0x00; + m1 = set_reset & 2 ? mask : 0x00; + m2 = set_reset & 4 ? mask : 0x00; + m3 = set_reset & 8 ? mask : 0x00; + + c0 = enb_set_reset & 1 ? c0 ^ m0 : val ^ m0; + c1 = enb_set_reset & 2 ? c1 ^ m1 : val ^ m1; + c2 = enb_set_reset & 4 ? c2 ^ m2 : val ^ m2; + c3 = enb_set_reset & 8 ? c3 ^ m3 : val ^ m3; + break; + } + break; + case 1: + /* write mode 1 */ + break; + case 2: + /* write mode 2 */ + mask = sc->vga_gc.gc_bit_mask; + + switch (sc->vga_gc.gc_op) { + case 0x00: /* replace */ + m0 = (val & 1 ? 0xff : 0x00) & mask; + m1 = (val & 2 ? 0xff : 0x00) & mask; + m2 = (val & 4 ? 0xff : 0x00) & mask; + m3 = (val & 8 ? 0xff : 0x00) & mask; + + c0 &= ~mask; + c1 &= ~mask; + c2 &= ~mask; + c3 &= ~mask; + + c0 |= m0; + c1 |= m1; + c2 |= m2; + c3 |= m3; + break; + case 0x08: /* AND */ + m0 = (val & 1 ? 0xff : 0x00) | ~mask; + m1 = (val & 2 ? 0xff : 0x00) | ~mask; + m2 = (val & 4 ? 0xff : 0x00) | ~mask; + m3 = (val & 8 ? 0xff : 0x00) | ~mask; + + c0 &= m0; + c1 &= m1; + c2 &= m2; + c3 &= m3; + break; + case 0x10: /* OR */ + m0 = (val & 1 ? 0xff : 0x00) & mask; + m1 = (val & 2 ? 0xff : 0x00) & mask; + m2 = (val & 4 ? 0xff : 0x00) & mask; + m3 = (val & 8 ? 0xff : 0x00) & mask; + + c0 |= m0; + c1 |= m1; + c2 |= m2; + c3 |= m3; + break; + case 0x18: /* XOR */ + m0 = (val & 1 ? 0xff : 0x00) & mask; + m1 = (val & 2 ? 0xff : 0x00) & mask; + m2 = (val & 4 ? 0xff : 0x00) & mask; + m3 = (val & 8 ? 0xff : 0x00) & mask; + + c0 ^= m0; + c1 ^= m1; + c2 ^= m2; + c3 ^= m3; + break; + } + break; + case 3: + /* write mode 3 */ + mask = sc->vga_gc.gc_bit_mask & val; + + val = (val >> sc->vga_gc.gc_rotate) | + (val << (8 - sc->vga_gc.gc_rotate)); + + switch (sc->vga_gc.gc_op) { + case 0x00: /* replace */ + m0 = (set_reset & 1 ? 0xff : 0x00) & mask; + m1 = (set_reset & 2 ? 0xff : 0x00) & mask; + m2 = (set_reset & 4 ? 0xff : 0x00) & mask; + m3 = (set_reset & 8 ? 0xff : 0x00) & mask; + + c0 &= ~mask; + c1 &= ~mask; + c2 &= ~mask; + c3 &= ~mask; + + c0 |= m0; + c1 |= m1; + c2 |= m2; + c3 |= m3; + break; + case 0x08: /* AND */ + m0 = (set_reset & 1 ? 0xff : 0x00) | ~mask; + m1 = (set_reset & 2 ? 0xff : 0x00) | ~mask; + m2 = (set_reset & 4 ? 0xff : 0x00) | ~mask; + m3 = (set_reset & 8 ? 0xff : 0x00) | ~mask; + + c0 &= m0; + c1 &= m1; + c2 &= m2; + c3 &= m3; + break; + case 0x10: /* OR */ + m0 = (set_reset & 1 ? 0xff : 0x00) & mask; + m1 = (set_reset & 2 ? 0xff : 0x00) & mask; + m2 = (set_reset & 4 ? 0xff : 0x00) & mask; + m3 = (set_reset & 8 ? 0xff : 0x00) & mask; + + c0 |= m0; + c1 |= m1; + c2 |= m2; + c3 |= m3; + break; + case 0x18: /* XOR */ + m0 = (set_reset & 1 ? 0xff : 0x00) & mask; + m1 = (set_reset & 2 ? 0xff : 0x00) & mask; + m2 = (set_reset & 4 ? 0xff : 0x00) & mask; + m3 = (set_reset & 8 ? 
0xff : 0x00) & mask; + + c0 ^= m0; + c1 ^= m1; + c2 ^= m2; + c3 ^= m3; + break; + } + break; + } + + if (sc->vga_gc.gc_mode_oe) { + if (offset & 1) { + offset &= ~1; + if (sc->vga_seq.seq_map_mask & 2) + sc->vga_ram[offset + 1*64*KB] = c1; + if (sc->vga_seq.seq_map_mask & 8) + sc->vga_ram[offset + 3*64*KB] = c3; + } else { + if (sc->vga_seq.seq_map_mask & 1) + sc->vga_ram[offset + 0*64*KB] = c0; + if (sc->vga_seq.seq_map_mask & 4) + sc->vga_ram[offset + 2*64*KB] = c2; + } + } else { + if (sc->vga_seq.seq_map_mask & 1) + sc->vga_ram[offset + 0*64*KB] = c0; + if (sc->vga_seq.seq_map_mask & 2) + sc->vga_ram[offset + 1*64*KB] = c1; + if (sc->vga_seq.seq_map_mask & 4) + sc->vga_ram[offset + 2*64*KB] = c2; + if (sc->vga_seq.seq_map_mask & 8) + sc->vga_ram[offset + 3*64*KB] = c3; + } +} + +static int +vga_mem_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, + int size, uint64_t *val, void *arg1, long arg2) +{ + if (dir == MEM_F_WRITE) { + switch (size) { + case 1: + vga_mem_wr_handler(ctx, addr, *val, arg1); + break; + case 2: + vga_mem_wr_handler(ctx, addr, *val, arg1); + vga_mem_wr_handler(ctx, addr + 1, *val >> 8, arg1); + break; + case 4: + vga_mem_wr_handler(ctx, addr, *val, arg1); + vga_mem_wr_handler(ctx, addr + 1, *val >> 8, arg1); + vga_mem_wr_handler(ctx, addr + 2, *val >> 16, arg1); + vga_mem_wr_handler(ctx, addr + 3, *val >> 24, arg1); + break; + case 8: + vga_mem_wr_handler(ctx, addr, *val, arg1); + vga_mem_wr_handler(ctx, addr + 1, *val >> 8, arg1); + vga_mem_wr_handler(ctx, addr + 2, *val >> 16, arg1); + vga_mem_wr_handler(ctx, addr + 3, *val >> 24, arg1); + vga_mem_wr_handler(ctx, addr + 4, *val >> 32, arg1); + vga_mem_wr_handler(ctx, addr + 5, *val >> 40, arg1); + vga_mem_wr_handler(ctx, addr + 6, *val >> 48, arg1); + vga_mem_wr_handler(ctx, addr + 7, *val >> 56, arg1); + break; + } + } else { + switch (size) { + case 1: + *val = vga_mem_rd_handler(ctx, addr, arg1); + break; + case 2: + *val = vga_mem_rd_handler(ctx, addr, arg1); + *val |= vga_mem_rd_handler(ctx, addr + 1, arg1) << 8; + break; + case 4: + *val = vga_mem_rd_handler(ctx, addr, arg1); + *val |= vga_mem_rd_handler(ctx, addr + 1, arg1) << 8; + *val |= vga_mem_rd_handler(ctx, addr + 2, arg1) << 16; + *val |= vga_mem_rd_handler(ctx, addr + 3, arg1) << 24; + break; + case 8: + *val = vga_mem_rd_handler(ctx, addr, arg1); + *val |= vga_mem_rd_handler(ctx, addr + 1, arg1) << 8; + *val |= vga_mem_rd_handler(ctx, addr + 2, arg1) << 16; + *val |= vga_mem_rd_handler(ctx, addr + 3, arg1) << 24; + *val |= vga_mem_rd_handler(ctx, addr + 4, arg1) << 32; + *val |= vga_mem_rd_handler(ctx, addr + 5, arg1) << 40; + *val |= vga_mem_rd_handler(ctx, addr + 6, arg1) << 48; + *val |= vga_mem_rd_handler(ctx, addr + 7, arg1) << 56; + break; + } + } + + return (0); +} + +static int +vga_port_in_handler(struct vmctx *ctx, int in, int port, int bytes, + uint8_t *val, void *arg) +{ + struct vga_softc *sc = arg; + + switch (port) { + case CRTC_IDX_MONO_PORT: + case CRTC_IDX_COLOR_PORT: + *val = sc->vga_crtc.crtc_index; + break; + case CRTC_DATA_MONO_PORT: + case CRTC_DATA_COLOR_PORT: + switch (sc->vga_crtc.crtc_index) { + case CRTC_HORIZ_TOTAL: + *val = sc->vga_crtc.crtc_horiz_total; + break; + case CRTC_HORIZ_DISP_END: + *val = sc->vga_crtc.crtc_horiz_disp_end; + break; + case CRTC_START_HORIZ_BLANK: + *val = sc->vga_crtc.crtc_start_horiz_blank; + break; + case CRTC_END_HORIZ_BLANK: + *val = sc->vga_crtc.crtc_end_horiz_blank; + break; + case CRTC_START_HORIZ_RETRACE: + *val = sc->vga_crtc.crtc_start_horiz_retrace; + break; + case 
CRTC_END_HORIZ_RETRACE: + *val = sc->vga_crtc.crtc_end_horiz_retrace; + break; + case CRTC_VERT_TOTAL: + *val = sc->vga_crtc.crtc_vert_total; + break; + case CRTC_OVERFLOW: + *val = sc->vga_crtc.crtc_overflow; + break; + case CRTC_PRESET_ROW_SCAN: + *val = sc->vga_crtc.crtc_present_row_scan; + break; + case CRTC_MAX_SCAN_LINE: + *val = sc->vga_crtc.crtc_max_scan_line; + break; + case CRTC_CURSOR_START: + *val = sc->vga_crtc.crtc_cursor_start; + break; + case CRTC_CURSOR_END: + *val = sc->vga_crtc.crtc_cursor_end; + break; + case CRTC_START_ADDR_HIGH: + *val = sc->vga_crtc.crtc_start_addr_high; + break; + case CRTC_START_ADDR_LOW: + *val = sc->vga_crtc.crtc_start_addr_low; + break; + case CRTC_CURSOR_LOC_HIGH: + *val = sc->vga_crtc.crtc_cursor_loc_high; + break; + case CRTC_CURSOR_LOC_LOW: + *val = sc->vga_crtc.crtc_cursor_loc_low; + break; + case CRTC_VERT_RETRACE_START: + *val = sc->vga_crtc.crtc_vert_retrace_start; + break; + case CRTC_VERT_RETRACE_END: + *val = sc->vga_crtc.crtc_vert_retrace_end; + break; + case CRTC_VERT_DISP_END: + *val = sc->vga_crtc.crtc_vert_disp_end; + break; + case CRTC_OFFSET: + *val = sc->vga_crtc.crtc_offset; + break; + case CRTC_UNDERLINE_LOC: + *val = sc->vga_crtc.crtc_underline_loc; + break; + case CRTC_START_VERT_BLANK: + *val = sc->vga_crtc.crtc_start_vert_blank; + break; + case CRTC_END_VERT_BLANK: + *val = sc->vga_crtc.crtc_end_vert_blank; + break; + case CRTC_MODE_CONTROL: + *val = sc->vga_crtc.crtc_mode_ctrl; + break; + case CRTC_LINE_COMPARE: + *val = sc->vga_crtc.crtc_line_compare; + break; + default: + //printf("XXX VGA CRTC: inb 0x%04x at index %d\n", port, sc->vga_crtc.crtc_index); + assert(0); + break; + } + break; + case ATC_IDX_PORT: + *val = sc->vga_atc.atc_index; + break; + case ATC_DATA_PORT: + switch (sc->vga_atc.atc_index) { + case ATC_PALETTE0 ... 
ATC_PALETTE15:
+			*val = sc->vga_atc.atc_palette[sc->vga_atc.atc_index];
+			break;
+		case ATC_MODE_CONTROL:
+			*val = sc->vga_atc.atc_mode;
+			break;
+		case ATC_OVERSCAN_COLOR:
+			*val = sc->vga_atc.atc_overscan_color;
+			break;
+		case ATC_COLOR_PLANE_ENABLE:
+			*val = sc->vga_atc.atc_color_plane_enb;
+			break;
+		case ATC_HORIZ_PIXEL_PANNING:
+			*val = sc->vga_atc.atc_horiz_pixel_panning;
+			break;
+		case ATC_COLOR_SELECT:
+			*val = sc->vga_atc.atc_color_select;
+			break;
+		default:
+			//printf("XXX VGA ATC inb 0x%04x at index %d\n", port, sc->vga_atc.atc_index);
+			assert(0);
+			break;
+		}
+		break;
+	case SEQ_IDX_PORT:
+		*val = sc->vga_seq.seq_index;
+		break;
+	case SEQ_DATA_PORT:
+		switch (sc->vga_seq.seq_index) {
+		case SEQ_RESET:
+			*val = sc->vga_seq.seq_reset;
+			break;
+		case SEQ_CLOCKING_MODE:
+			*val = sc->vga_seq.seq_clock_mode;
+			break;
+		case SEQ_MAP_MASK:
+			*val = sc->vga_seq.seq_map_mask;
+			break;
+		case SEQ_CHAR_MAP_SELECT:
+			*val = sc->vga_seq.seq_cmap_sel;
+			break;
+		case SEQ_MEMORY_MODE:
+			*val = sc->vga_seq.seq_mm;
+			break;
+		default:
+			//printf("XXX VGA SEQ: inb 0x%04x at index %d\n", port, sc->vga_seq.seq_index);
+			assert(0);
+			break;
+		}
+		break;
+	case DAC_DATA_PORT:
+		*val = sc->vga_dac.dac_palette[3 * sc->vga_dac.dac_rd_index +
+		    sc->vga_dac.dac_rd_subindex];
+		sc->vga_dac.dac_rd_subindex++;
+		if (sc->vga_dac.dac_rd_subindex == 3) {
+			sc->vga_dac.dac_rd_index++;
+			sc->vga_dac.dac_rd_subindex = 0;
+		}
+		break;
+	case GC_IDX_PORT:
+		*val = sc->vga_gc.gc_index;
+		break;
+	case GC_DATA_PORT:
+		switch (sc->vga_gc.gc_index) {
+		case GC_SET_RESET:
+			*val = sc->vga_gc.gc_set_reset;
+			break;
+		case GC_ENABLE_SET_RESET:
+			*val = sc->vga_gc.gc_enb_set_reset;
+			break;
+		case GC_COLOR_COMPARE:
+			*val = sc->vga_gc.gc_color_compare;
+			break;
+		case GC_DATA_ROTATE:
+			*val = sc->vga_gc.gc_rotate;
+			break;
+		case GC_READ_MAP_SELECT:
+			*val = sc->vga_gc.gc_read_map_sel;
+			break;
+		case GC_MODE:
+			*val = sc->vga_gc.gc_mode;
+			break;
+		case GC_MISCELLANEOUS:
+			*val = sc->vga_gc.gc_misc;
+			break;
+		case GC_COLOR_DONT_CARE:
+			*val = sc->vga_gc.gc_color_dont_care;
+			break;
+		case GC_BIT_MASK:
+			*val = sc->vga_gc.gc_bit_mask;
+			break;
+		default:
+			//printf("XXX VGA GC: inb 0x%04x at index %d\n", port, sc->vga_gc.gc_index);
+			assert(0);
+			break;
+		}
+		break;
+	case GEN_MISC_OUTPUT_PORT:
+		*val = sc->vga_misc;
+		break;
+	case GEN_INPUT_STS0_PORT:
+		assert(0);
+		break;
+	case GEN_INPUT_STS1_MONO_PORT:
+	case GEN_INPUT_STS1_COLOR_PORT:
+		sc->vga_atc.atc_flipflop = 0;
+		sc->vga_sts1 ^= (GEN_IS1_VR | GEN_IS1_DE);
+		*val = sc->vga_sts1;
+		break;
+	case GEN_FEATURE_CTRL_PORT:
+		assert(0);
+		break;
+	default:
+		printf("XXX vga_port_in_handler() unhandled port 0x%x\n", port);
+		assert(0);
+		return (-1);
+	}
+
+	return (0);
+}
+
+static int
+vga_port_out_handler(struct vmctx *ctx, int in, int port, int bytes,
+    uint8_t val, void *arg)
+{
+	struct vga_softc *sc = arg;
+
+	switch (port) {
+	case CRTC_IDX_MONO_PORT:
+	case CRTC_IDX_COLOR_PORT:
+		sc->vga_crtc.crtc_index = val;
+		break;
+	case CRTC_DATA_MONO_PORT:
+	case CRTC_DATA_COLOR_PORT:
+		switch (sc->vga_crtc.crtc_index) {
+		case CRTC_HORIZ_TOTAL:
+			sc->vga_crtc.crtc_horiz_total = val;
+			break;
+		case CRTC_HORIZ_DISP_END:
+			sc->vga_crtc.crtc_horiz_disp_end = val;
+			break;
+		case CRTC_START_HORIZ_BLANK:
+			sc->vga_crtc.crtc_start_horiz_blank = val;
+			break;
+		case CRTC_END_HORIZ_BLANK:
+			sc->vga_crtc.crtc_end_horiz_blank = val;
+			break;
+		case CRTC_START_HORIZ_RETRACE:
+			sc->vga_crtc.crtc_start_horiz_retrace = val;
+			break;
+		case CRTC_END_HORIZ_RETRACE:
+
sc->vga_crtc.crtc_end_horiz_retrace = val; + break; + case CRTC_VERT_TOTAL: + sc->vga_crtc.crtc_vert_total = val; + break; + case CRTC_OVERFLOW: + sc->vga_crtc.crtc_overflow = val; + break; + case CRTC_PRESET_ROW_SCAN: + sc->vga_crtc.crtc_present_row_scan = val; + break; + case CRTC_MAX_SCAN_LINE: + sc->vga_crtc.crtc_max_scan_line = val; + break; + case CRTC_CURSOR_START: + sc->vga_crtc.crtc_cursor_start = val; + sc->vga_crtc.crtc_cursor_on = (val & CRTC_CS_CO) == 0; + break; + case CRTC_CURSOR_END: + sc->vga_crtc.crtc_cursor_end = val; + break; + case CRTC_START_ADDR_HIGH: + sc->vga_crtc.crtc_start_addr_high = val; + sc->vga_crtc.crtc_start_addr &= 0x00ff; + sc->vga_crtc.crtc_start_addr |= (val << 8); + break; + case CRTC_START_ADDR_LOW: + sc->vga_crtc.crtc_start_addr_low = val; + sc->vga_crtc.crtc_start_addr &= 0xff00; + sc->vga_crtc.crtc_start_addr |= (val & 0xff); + break; + case CRTC_CURSOR_LOC_HIGH: + sc->vga_crtc.crtc_cursor_loc_high = val; + sc->vga_crtc.crtc_cursor_loc &= 0x00ff; + sc->vga_crtc.crtc_cursor_loc |= (val << 8); + break; + case CRTC_CURSOR_LOC_LOW: + sc->vga_crtc.crtc_cursor_loc_low = val; + sc->vga_crtc.crtc_cursor_loc &= 0xff00; + sc->vga_crtc.crtc_cursor_loc |= (val & 0xff); + break; + case CRTC_VERT_RETRACE_START: + sc->vga_crtc.crtc_vert_retrace_start = val; + break; + case CRTC_VERT_RETRACE_END: + sc->vga_crtc.crtc_vert_retrace_end = val; + break; + case CRTC_VERT_DISP_END: + sc->vga_crtc.crtc_vert_disp_end = val; + break; + case CRTC_OFFSET: + sc->vga_crtc.crtc_offset = val; + break; + case CRTC_UNDERLINE_LOC: + sc->vga_crtc.crtc_underline_loc = val; + break; + case CRTC_START_VERT_BLANK: + sc->vga_crtc.crtc_start_vert_blank = val; + break; + case CRTC_END_VERT_BLANK: + sc->vga_crtc.crtc_end_vert_blank = val; + break; + case CRTC_MODE_CONTROL: + sc->vga_crtc.crtc_mode_ctrl = val; + break; + case CRTC_LINE_COMPARE: + sc->vga_crtc.crtc_line_compare = val; + break; + default: + //printf("XXX VGA CRTC: outb 0x%04x, 0x%02x at index %d\n", port, val, sc->vga_crtc.crtc_index); + assert(0); + break; + } + break; + case ATC_IDX_PORT: + if (sc->vga_atc.atc_flipflop == 0) { + if (sc->vga_atc.atc_index & 0x20) + assert(0); + sc->vga_atc.atc_index = val & ATC_IDX_MASK; + } else { + switch (sc->vga_atc.atc_index) { + case ATC_PALETTE0 ... ATC_PALETTE15: + sc->vga_atc.atc_palette[sc->vga_atc.atc_index] = val & 0x3f; + break; + case ATC_MODE_CONTROL: + sc->vga_atc.atc_mode = val; + break; + case ATC_OVERSCAN_COLOR: + sc->vga_atc.atc_overscan_color = val; + break; + case ATC_COLOR_PLANE_ENABLE: + sc->vga_atc.atc_color_plane_enb = val; + break; + case ATC_HORIZ_PIXEL_PANNING: + sc->vga_atc.atc_horiz_pixel_panning = val; + break; + case ATC_COLOR_SELECT: + sc->vga_atc.atc_color_select = val; + sc->vga_atc.atc_color_select_45 = + (val & ATC_CS_C45) << 4; + sc->vga_atc.atc_color_select_67 = + (val & ATC_CS_C67) << 6; + break; + default: + //printf("XXX VGA ATC: outb 0x%04x, 0x%02x at index %d\n", port, val, sc->vga_atc.atc_index); + assert(0); + break; + } + } + sc->vga_atc.atc_flipflop ^= 1; + break; + case ATC_DATA_PORT: + break; + case SEQ_IDX_PORT: + sc->vga_seq.seq_index = val & 0x1f; + break; + case SEQ_DATA_PORT: + switch (sc->vga_seq.seq_index) { + case SEQ_RESET: + sc->vga_seq.seq_reset = val; + break; + case SEQ_CLOCKING_MODE: + sc->vga_seq.seq_clock_mode = val; + sc->vga_seq.seq_cm_dots = (val & SEQ_CM_89) ? 
8 : 9; + break; + case SEQ_MAP_MASK: + sc->vga_seq.seq_map_mask = val; + break; + case SEQ_CHAR_MAP_SELECT: + sc->vga_seq.seq_cmap_sel = val; + + sc->vga_seq.seq_cmap_pri_off = ((((val & SEQ_CMS_SA) >> SEQ_CMS_SA_SHIFT) * 2) + ((val & SEQ_CMS_SAH) >> SEQ_CMS_SAH_SHIFT)) * 8 * KB; + sc->vga_seq.seq_cmap_sec_off = ((((val & SEQ_CMS_SB) >> SEQ_CMS_SB_SHIFT) * 2) + ((val & SEQ_CMS_SBH) >> SEQ_CMS_SBH_SHIFT)) * 8 * KB; + break; + case SEQ_MEMORY_MODE: + sc->vga_seq.seq_mm = val; + assert((sc->vga_seq.seq_mm & SEQ_MM_C4) == 0); + break; + default: + //printf("XXX VGA SEQ: outb 0x%04x, 0x%02x at index %d\n", port, val, sc->vga_seq.seq_index); + assert(0); + break; + } + break; + case DAC_MASK: + break; + case DAC_IDX_RD_PORT: + sc->vga_dac.dac_rd_index = val; + sc->vga_dac.dac_rd_subindex = 0; + break; + case DAC_IDX_WR_PORT: + sc->vga_dac.dac_wr_index = val; + sc->vga_dac.dac_wr_subindex = 0; + break; + case DAC_DATA_PORT: + sc->vga_dac.dac_palette[3 * sc->vga_dac.dac_wr_index + + sc->vga_dac.dac_wr_subindex] = val; + sc->vga_dac.dac_wr_subindex++; + if (sc->vga_dac.dac_wr_subindex == 3) { + sc->vga_dac.dac_palette_rgb[sc->vga_dac.dac_wr_index] = + ((((sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 0] << 2) | + ((sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 0] & 0x1) << 1) | + (sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 0] & 0x1)) << 16) | + (((sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 1] << 2) | + ((sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 1] & 0x1) << 1) | + (sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 1] & 0x1)) << 8) | + (((sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 2] << 2) | + ((sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 2] & 0x1) << 1) | + (sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 2] & 0x1)) << 0)); + + sc->vga_dac.dac_wr_index++; + sc->vga_dac.dac_wr_subindex = 0; + } + break; + case GC_IDX_PORT: + sc->vga_gc.gc_index = val; + break; + case GC_DATA_PORT: + switch (sc->vga_gc.gc_index) { + case GC_SET_RESET: + sc->vga_gc.gc_set_reset = val; + break; + case GC_ENABLE_SET_RESET: + sc->vga_gc.gc_enb_set_reset = val; + break; + case GC_COLOR_COMPARE: + sc->vga_gc.gc_color_compare = val; + break; + case GC_DATA_ROTATE: + sc->vga_gc.gc_rotate = val; + sc->vga_gc.gc_op = (val >> 3) & 0x3; + break; + case GC_READ_MAP_SELECT: + sc->vga_gc.gc_read_map_sel = val; + break; + case GC_MODE: + sc->vga_gc.gc_mode = val; + sc->vga_gc.gc_mode_c4 = (val & GC_MODE_C4) != 0; + assert(!sc->vga_gc.gc_mode_c4); + sc->vga_gc.gc_mode_oe = (val & GC_MODE_OE) != 0; + sc->vga_gc.gc_mode_rm = (val >> 3) & 0x1; + sc->vga_gc.gc_mode_wm = val & 0x3; + break; + case GC_MISCELLANEOUS: + sc->vga_gc.gc_misc = val; + sc->vga_gc.gc_misc_gm = val & GC_MISC_GM; + sc->vga_gc.gc_misc_mm = (val & GC_MISC_MM) >> + GC_MISC_MM_SHIFT; + break; + case GC_COLOR_DONT_CARE: + sc->vga_gc.gc_color_dont_care = val; + break; + case GC_BIT_MASK: + sc->vga_gc.gc_bit_mask = val; + break; + default: + //printf("XXX VGA GC: outb 0x%04x, 0x%02x at index %d\n", port, val, sc->vga_gc.gc_index); + assert(0); + break; + } + break; + case GEN_INPUT_STS0_PORT: + /* write to Miscellaneous Output Register */ + sc->vga_misc = val; + break; + case GEN_INPUT_STS1_MONO_PORT: + case GEN_INPUT_STS1_COLOR_PORT: + /* write to Feature Control Register */ + break; + default: + printf("XXX vga_port_out_handler() unhandled port 0x%x\n", port); + //assert(0); + return (-1); + } + return (0); +} + +static int +vga_port_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, 
+ uint32_t *eax, void *arg) +{ + uint8_t val; + int error; + + switch (bytes) { + case 1: + if (in) { + *eax &= ~0xff; + error = vga_port_in_handler(ctx, in, port, 1, + &val, arg); + if (!error) { + *eax |= val & 0xff; + } + } else { + val = *eax & 0xff; + error = vga_port_out_handler(ctx, in, port, 1, + val, arg); + } + break; + case 2: + if (in) { + *eax &= ~0xffff; + error = vga_port_in_handler(ctx, in, port, 1, + &val, arg); + if (!error) { + *eax |= val & 0xff; + } + error = vga_port_in_handler(ctx, in, port + 1, 1, + &val, arg); + if (!error) { + *eax |= (val & 0xff) << 8; + } + } else { + val = *eax & 0xff; + error = vga_port_out_handler(ctx, in, port, 1, + val, arg); + val = (*eax >> 8) & 0xff; + error =vga_port_out_handler(ctx, in, port + 1, 1, + val, arg); + } + break; + default: + assert(0); + return (-1); + } + + return (error); +} + +int +vga_init(void) +{ + struct inout_port iop; + struct vga_softc *sc; + int port, error; + + sc = calloc(1, sizeof(struct vga_softc)); + + bzero(&iop, sizeof(struct inout_port)); + iop.name = "VGA"; + for (port = VGA_IOPORT_START; port <= VGA_IOPORT_END; port++) { + iop.port = port; + iop.size = 1; + iop.flags = IOPORT_F_INOUT; + iop.handler = vga_port_handler; + iop.arg = sc; + + error = register_inout(&iop); + assert(error == 0); + } + + sc->mr.name = "VGA memory"; + sc->mr.flags = MEM_F_RW; + sc->mr.base = 640 * KB; + sc->mr.size = 128 * KB; + sc->mr.handler = vga_mem_handler; + sc->mr.arg1 = sc; + error = register_mem_fallback(&sc->mr); + assert(error == 0); + + sc->vga_ram = malloc(256 * KB); + memset(sc->vga_ram, 0, 256 * KB); + + sc->gc_image = console_get_image(); + console_fb_register(vga_render, sc); + + return (0); +} diff --git a/usr/src/cmd/bhyve/vga.h b/usr/src/cmd/bhyve/vga.h new file mode 100644 index 0000000000..14637b12b3 --- /dev/null +++ b/usr/src/cmd/bhyve/vga.h @@ -0,0 +1,160 @@ +/*- + * Copyright (c) 2015 Tycho Nightingale + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _VGA_H_ +#define _VGA_H_ + +#define VGA_IOPORT_START 0x3c0 +#define VGA_IOPORT_END 0x3df + +/* General registers */ +#define GEN_INPUT_STS0_PORT 0x3c2 +#define GEN_FEATURE_CTRL_PORT 0x3ca +#define GEN_MISC_OUTPUT_PORT 0x3cc +#define GEN_INPUT_STS1_MONO_PORT 0x3ba +#define GEN_INPUT_STS1_COLOR_PORT 0x3da +#define GEN_IS1_VR 0x08 /* Vertical retrace */ +#define GEN_IS1_DE 0x01 /* Display enable not */ + +/* Attribute controller registers. */ +#define ATC_IDX_PORT 0x3c0 +#define ATC_DATA_PORT 0x3c1 + +#define ATC_IDX_MASK 0x1f +#define ATC_PALETTE0 0 +#define ATC_PALETTE15 15 +#define ATC_MODE_CONTROL 16 +#define ATC_MC_IPS 0x80 /* Internal palette size */ +#define ATC_MC_GA 0x01 /* Graphics/alphanumeric */ +#define ATC_OVERSCAN_COLOR 17 +#define ATC_COLOR_PLANE_ENABLE 18 +#define ATC_HORIZ_PIXEL_PANNING 19 +#define ATC_COLOR_SELECT 20 +#define ATC_CS_C67 0x0c /* Color select bits 6+7 */ +#define ATC_CS_C45 0x03 /* Color select bits 4+5 */ + +/* Sequencer registers. */ +#define SEQ_IDX_PORT 0x3c4 +#define SEQ_DATA_PORT 0x3c5 + +#define SEQ_RESET 0 +#define SEQ_RESET_ASYNC 0x1 +#define SEQ_RESET_SYNC 0x2 +#define SEQ_CLOCKING_MODE 1 +#define SEQ_CM_SO 0x20 /* Screen off */ +#define SEQ_CM_89 0x01 /* 8/9 dot clock */ +#define SEQ_MAP_MASK 2 +#define SEQ_CHAR_MAP_SELECT 3 +#define SEQ_CMS_SAH 0x20 /* Char map A bit 2 */ +#define SEQ_CMS_SAH_SHIFT 5 +#define SEQ_CMS_SA 0x0c /* Char map A bits 0+1 */ +#define SEQ_CMS_SA_SHIFT 2 +#define SEQ_CMS_SBH 0x10 /* Char map B bit 2 */ +#define SEQ_CMS_SBH_SHIFT 4 +#define SEQ_CMS_SB 0x03 /* Char map B bits 0+1 */ +#define SEQ_CMS_SB_SHIFT 0 +#define SEQ_MEMORY_MODE 4 +#define SEQ_MM_C4 0x08 /* Chain 4 */ +#define SEQ_MM_OE 0x04 /* Odd/even */ +#define SEQ_MM_EM 0x02 /* Extended memory */ + +/* Graphics controller registers. */ +#define GC_IDX_PORT 0x3ce +#define GC_DATA_PORT 0x3cf + +#define GC_SET_RESET 0 +#define GC_ENABLE_SET_RESET 1 +#define GC_COLOR_COMPARE 2 +#define GC_DATA_ROTATE 3 +#define GC_READ_MAP_SELECT 4 +#define GC_MODE 5 +#define GC_MODE_OE 0x10 /* Odd/even */ +#define GC_MODE_C4 0x04 /* Chain 4 */ + +#define GC_MISCELLANEOUS 6 +#define GC_MISC_GM 0x01 /* Graphics/alphanumeric */ +#define GC_MISC_MM 0x0c /* memory map */ +#define GC_MISC_MM_SHIFT 2 +#define GC_COLOR_DONT_CARE 7 +#define GC_BIT_MASK 8 + +/* CRT controller registers. 
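+ *
+ * Several of the CRTC timing values are 9 or 10 bits wide; their
+ * high-order bits live in the OVERFLOW register. As an illustration
+ * (a sketch, not code from this change), the 10-bit vertical retrace
+ * start would be reassembled from the defines below as:
+ *
+ *	vrs = crtc[CRTC_VERT_RETRACE_START] |
+ *	    (((crtc[CRTC_OVERFLOW] & CRTC_OF_VRS8) >> CRTC_OF_VRS8_SHIFT) << 8) |
+ *	    (((crtc[CRTC_OVERFLOW] & CRTC_OF_VRS9) >> CRTC_OF_VRS9_SHIFT) << 9);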
*/ +#define CRTC_IDX_MONO_PORT 0x3b4 +#define CRTC_DATA_MONO_PORT 0x3b5 +#define CRTC_IDX_COLOR_PORT 0x3d4 +#define CRTC_DATA_COLOR_PORT 0x3d5 + +#define CRTC_HORIZ_TOTAL 0 +#define CRTC_HORIZ_DISP_END 1 +#define CRTC_START_HORIZ_BLANK 2 +#define CRTC_END_HORIZ_BLANK 3 +#define CRTC_START_HORIZ_RETRACE 4 +#define CRTC_END_HORIZ_RETRACE 5 +#define CRTC_VERT_TOTAL 6 +#define CRTC_OVERFLOW 7 +#define CRTC_OF_VRS9 0x80 /* VRS bit 9 */ +#define CRTC_OF_VRS9_SHIFT 7 +#define CRTC_OF_VDE9 0x40 /* VDE bit 9 */ +#define CRTC_OF_VDE9_SHIFT 6 +#define CRTC_OF_VRS8 0x04 /* VRS bit 8 */ +#define CRTC_OF_VRS8_SHIFT 2 +#define CRTC_OF_VDE8 0x02 /* VDE bit 8 */ +#define CRTC_OF_VDE8_SHIFT 1 +#define CRTC_PRESET_ROW_SCAN 8 +#define CRTC_MAX_SCAN_LINE 9 +#define CRTC_MSL_MSL 0x1f +#define CRTC_CURSOR_START 10 +#define CRTC_CS_CO 0x20 /* Cursor off */ +#define CRTC_CS_CS 0x1f /* Cursor start */ +#define CRTC_CURSOR_END 11 +#define CRTC_CE_CE 0x1f /* Cursor end */ +#define CRTC_START_ADDR_HIGH 12 +#define CRTC_START_ADDR_LOW 13 +#define CRTC_CURSOR_LOC_HIGH 14 +#define CRTC_CURSOR_LOC_LOW 15 +#define CRTC_VERT_RETRACE_START 16 +#define CRTC_VERT_RETRACE_END 17 +#define CRTC_VRE_MASK 0xf +#define CRTC_VERT_DISP_END 18 +#define CRTC_OFFSET 19 +#define CRTC_UNDERLINE_LOC 20 +#define CRTC_START_VERT_BLANK 21 +#define CRTC_END_VERT_BLANK 22 +#define CRTC_MODE_CONTROL 23 +#define CRTC_MC_TE 0x80 /* Timing enable */ +#define CRTC_LINE_COMPARE 24 + +/* DAC registers */ +#define DAC_MASK 0x3c6 +#define DAC_IDX_RD_PORT 0x3c7 +#define DAC_IDX_WR_PORT 0x3c8 +#define DAC_DATA_PORT 0x3c9 + +int vga_init(void); + +#endif /* _VGA_H_ */ diff --git a/usr/src/cmd/bhyve/virtio.c b/usr/src/cmd/bhyve/virtio.c new file mode 100644 index 0000000000..c3b11dc439 --- /dev/null +++ b/usr/src/cmd/bhyve/virtio.c @@ -0,0 +1,755 @@ +/*- + * Copyright (c) 2013 Chris Torek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/virtio.c 270326 2014-08-22 13:01:22Z tychon $"); + +#include +#include + +#include +#include +#include +#include + +#include "bhyverun.h" +#include "pci_emul.h" +#include "virtio.h" + +/* + * Functions for dealing with generalized "virtual devices" as + * defined by + */ + +/* + * In case we decide to relax the "virtio softc comes at the + * front of virtio-based device softc" constraint, let's use + * this to convert. + */ +#define DEV_SOFTC(vs) ((void *)(vs)) + +/* + * Link a virtio_softc to its constants, the device softc, and + * the PCI emulation. + */ +void +vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc, + void *dev_softc, struct pci_devinst *pi, + struct vqueue_info *queues) +{ + int i; + + /* vs and dev_softc addresses must match */ + assert((void *)vs == dev_softc); + vs->vs_vc = vc; + vs->vs_pi = pi; + pi->pi_arg = vs; + + vs->vs_queues = queues; + for (i = 0; i < vc->vc_nvq; i++) { + queues[i].vq_vs = vs; + queues[i].vq_num = i; + } +} + +/* + * Reset device (device-wide). This erases all queues, i.e., + * all the queues become invalid (though we don't wipe out the + * internal pointers, we just clear the VQ_ALLOC flag). + * + * It resets negotiated features to "none". + * + * If MSI-X is enabled, this also resets all the vectors to NO_VECTOR. + */ +void +vi_reset_dev(struct virtio_softc *vs) +{ + struct vqueue_info *vq; + int i, nvq; + + if (vs->vs_mtx) + assert(pthread_mutex_isowned_np(vs->vs_mtx)); + + nvq = vs->vs_vc->vc_nvq; + for (vq = vs->vs_queues, i = 0; i < nvq; vq++, i++) { + vq->vq_flags = 0; + vq->vq_last_avail = 0; + vq->vq_pfn = 0; + vq->vq_msix_idx = VIRTIO_MSI_NO_VECTOR; + } + vs->vs_negotiated_caps = 0; + vs->vs_curq = 0; + /* vs->vs_status = 0; -- redundant */ + if (vs->vs_isr) + pci_lintr_deassert(vs->vs_pi); + vs->vs_isr = 0; + vs->vs_msix_cfg_idx = VIRTIO_MSI_NO_VECTOR; +} + +/* + * Set I/O BAR (usually 0) to map PCI config registers. + */ +void +vi_set_io_bar(struct virtio_softc *vs, int barnum) +{ + size_t size; + + /* + * ??? should we use CFG0 if MSI-X is disabled? + * Existing code did not... + */ + size = VTCFG_R_CFG1 + vs->vs_vc->vc_cfgsize; + pci_emul_alloc_bar(vs->vs_pi, barnum, PCIBAR_IO, size); +} + +/* + * Initialize MSI-X vector capabilities if we're to use MSI-X, + * or MSI capabilities if not. + * + * We assume we want one MSI-X vector per queue, here, plus one + * for the config vec. + */ +int +vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix) +{ + int nvec; + + if (use_msix) { + vs->vs_flags |= VIRTIO_USE_MSIX; + VS_LOCK(vs); + vi_reset_dev(vs); /* set all vectors to NO_VECTOR */ + VS_UNLOCK(vs); + nvec = vs->vs_vc->vc_nvq + 1; + if (pci_emul_add_msixcap(vs->vs_pi, nvec, barnum)) + return (1); + } else + vs->vs_flags &= ~VIRTIO_USE_MSIX; + /* Only 1 MSI vector for bhyve */ + pci_emul_add_msicap(vs->vs_pi, 1); + return (0); +} + +/* + * Initialize the currently-selected virtio queue (vs->vs_curq). + * The guest just gave us a page frame number, from which we can + * calculate the addresses of the queue. + */ +void +vi_vq_init(struct virtio_softc *vs, uint32_t pfn) +{ + struct vqueue_info *vq; + uint64_t phys; + size_t size; + char *base; + + vq = &vs->vs_queues[vs->vs_curq]; + vq->vq_pfn = pfn; + phys = (uint64_t)pfn << VRING_PFN; + size = vring_size(vq->vq_qsize); + base = paddr_guest2host(vs->vs_pi->pi_vmctx, phys, size); + + /* First page(s) are descriptors... 
*/ + vq->vq_desc = (struct virtio_desc *)base; + base += vq->vq_qsize * sizeof(struct virtio_desc); + + /* ... immediately followed by "avail" ring (entirely uint16_t's) */ + vq->vq_avail = (struct vring_avail *)base; + base += (2 + vq->vq_qsize + 1) * sizeof(uint16_t); + + /* Then it's rounded up to the next page... */ + base = (char *)roundup2((uintptr_t)base, VRING_ALIGN); + + /* ... and the last page(s) are the used ring. */ + vq->vq_used = (struct vring_used *)base; + + /* Mark queue as allocated, and start at 0 when we use it. */ + vq->vq_flags = VQ_ALLOC; + vq->vq_last_avail = 0; +} + +/* + * Helper inline for vq_getchain(): record the i'th "real" + * descriptor. + */ +static inline void +_vq_record(int i, volatile struct virtio_desc *vd, struct vmctx *ctx, + struct iovec *iov, int n_iov, uint16_t *flags) { + + if (i >= n_iov) + return; + iov[i].iov_base = paddr_guest2host(ctx, vd->vd_addr, vd->vd_len); + iov[i].iov_len = vd->vd_len; + if (flags != NULL) + flags[i] = vd->vd_flags; +} +#define VQ_MAX_DESCRIPTORS 512 /* see below */ + +/* + * Examine the chain of descriptors starting at the "next one" to + * make sure that they describe a sensible request. If so, return + * the number of "real" descriptors that would be needed/used in + * acting on this request. This may be smaller than the number of + * available descriptors, e.g., if there are two available but + * they are two separate requests, this just returns 1. Or, it + * may be larger: if there are indirect descriptors involved, + * there may only be one descriptor available but it may be an + * indirect pointing to eight more. We return 8 in this case, + * i.e., we do not count the indirect descriptors, only the "real" + * ones. + * + * Basically, this vets the vd_flags and vd_next field of each + * descriptor and tells you how many are involved. Since some may + * be indirect, this also needs the vmctx (in the pci_devinst + * at vs->vs_pi) so that it can find indirect descriptors. + * + * As we process each descriptor, we copy and adjust it (guest to + * host address wise, also using the vmtctx) into the given iov[] + * array (of the given size). If the array overflows, we stop + * placing values into the array but keep processing descriptors, + * up to VQ_MAX_DESCRIPTORS, before giving up and returning -1. + * So you, the caller, must not assume that iov[] is as big as the + * return value (you can process the same thing twice to allocate + * a larger iov array if needed, or supply a zero length to find + * out how much space is needed). + * + * If you want to verify the WRITE flag on each descriptor, pass a + * non-NULL "flags" pointer to an array of "uint16_t" of the same size + * as n_iov and we'll copy each vd_flags field after unwinding any + * indirects. + * + * If some descriptor(s) are invalid, this prints a diagnostic message + * and returns -1. If no descriptors are ready now it simply returns 0. + * + * You are assumed to have done a vq_ring_ready() if needed (note + * that vq_has_descs() does one). 
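+ *
+ * A typical consumer pairs this with the other vq_* helpers; in
+ * sketch form (the array sizes and the "bytes_written" count are
+ * illustrative only, not part of this interface):
+ *
+ *	struct iovec iov[8];
+ *	uint16_t flags[8];
+ *	int n;
+ *
+ *	vq_startchains(vq);
+ *	while (vq_has_descs(vq)) {
+ *		n = vq_getchain(vq, iov, 8, flags);
+ *		if (n <= 0)
+ *			break;
+ *		... consume iov[0 .. n-1] ...
+ *		vq_relchain(vq, bytes_written);
+ *	}
+ *	vq_endchains(vq, 1);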
+ */ +int +vq_getchain(struct vqueue_info *vq, + struct iovec *iov, int n_iov, uint16_t *flags) +{ + int i; + u_int ndesc, n_indir; + u_int idx, head, next; + volatile struct virtio_desc *vdir, *vindir, *vp; + struct vmctx *ctx; + struct virtio_softc *vs; + const char *name; + + vs = vq->vq_vs; + name = vs->vs_vc->vc_name; + + /* + * Note: it's the responsibility of the guest not to + * update vq->vq_avail->va_idx until all of the descriptors + * the guest has written are valid (including all their + * vd_next fields and vd_flags). + * + * Compute (last_avail - va_idx) in integers mod 2**16. This is + * the number of descriptors the device has made available + * since the last time we updated vq->vq_last_avail. + * + * We just need to do the subtraction as an unsigned int, + * then trim off excess bits. + */ + idx = vq->vq_last_avail; + ndesc = (uint16_t)((u_int)vq->vq_avail->va_idx - idx); + if (ndesc == 0) + return (0); + if (ndesc > vq->vq_qsize) { + /* XXX need better way to diagnose issues */ + fprintf(stderr, + "%s: ndesc (%u) out of range, driver confused?\r\n", + name, (u_int)ndesc); + return (-1); + } + + /* + * Now count/parse "involved" descriptors starting from + * the head of the chain. + * + * To prevent loops, we could be more complicated and + * check whether we're re-visiting a previously visited + * index, but we just abort if the count gets excessive. + */ + ctx = vs->vs_pi->pi_vmctx; + head = vq->vq_avail->va_ring[idx & (vq->vq_qsize - 1)]; + next = head; + for (i = 0; i < VQ_MAX_DESCRIPTORS; next = vdir->vd_next) { + if (next >= vq->vq_qsize) { + fprintf(stderr, + "%s: descriptor index %u out of range, " + "driver confused?\r\n", + name, next); + return (-1); + } + vdir = &vq->vq_desc[next]; + if ((vdir->vd_flags & VRING_DESC_F_INDIRECT) == 0) { + _vq_record(i, vdir, ctx, iov, n_iov, flags); + i++; + } else if ((vs->vs_negotiated_caps & + VIRTIO_RING_F_INDIRECT_DESC) == 0) { + fprintf(stderr, + "%s: descriptor has forbidden INDIRECT flag, " + "driver confused?\r\n", + name); + return (-1); + } else { + n_indir = vdir->vd_len / 16; + if ((vdir->vd_len & 0xf) || n_indir == 0) { + fprintf(stderr, + "%s: invalid indir len 0x%x, " + "driver confused?\r\n", + name, (u_int)vdir->vd_len); + return (-1); + } + vindir = paddr_guest2host(ctx, + vdir->vd_addr, vdir->vd_len); + /* + * Indirects start at the 0th, then follow + * their own embedded "next"s until those run + * out. Each one's indirect flag must be off + * (we don't really have to check, could just + * ignore errors...). + */ + next = 0; + for (;;) { + vp = &vindir[next]; + if (vp->vd_flags & VRING_DESC_F_INDIRECT) { + fprintf(stderr, + "%s: indirect desc has INDIR flag," + " driver confused?\r\n", + name); + return (-1); + } + _vq_record(i, vp, ctx, iov, n_iov, flags); + if (++i > VQ_MAX_DESCRIPTORS) + goto loopy; + if ((vp->vd_flags & VRING_DESC_F_NEXT) == 0) + break; + next = vp->vd_next; + if (next >= n_indir) { + fprintf(stderr, + "%s: invalid next %u > %u, " + "driver confused?\r\n", + name, (u_int)next, n_indir); + return (-1); + } + } + } + if ((vdir->vd_flags & VRING_DESC_F_NEXT) == 0) + return (i); + } +loopy: + fprintf(stderr, + "%s: descriptor loop? count > %d - driver confused?\r\n", + name, i); + return (-1); +} + +/* + * Return the currently-first request chain to the guest, setting + * its I/O length to the provided value. + * + * (This chain is the one you handled when you called vq_getchain() + * and used its positive return value.) 
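+ *
+ * E.g. (number invented for illustration): a device that copied a
+ * 64-byte frame into the buffers of the chain it just processed
+ * reports that with
+ *
+ *	vq_relchain(vq, 64);
+ *
+ * Note that iolen counts bytes the device *wrote*, not bytes read.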
+ */ +void +vq_relchain(struct vqueue_info *vq, uint32_t iolen) +{ + uint16_t head, uidx, mask; + volatile struct vring_used *vuh; + volatile struct virtio_used *vue; + + /* + * Notes: + * - mask is N-1 where N is a power of 2 so computes x % N + * - vuh points to the "used" data shared with guest + * - vue points to the "used" ring entry we want to update + * - head is the same value we compute in vq_iovecs(). + * + * (I apologize for the two fields named vu_idx; the + * virtio spec calls the one that vue points to, "id"...) + */ + mask = vq->vq_qsize - 1; + vuh = vq->vq_used; + head = vq->vq_avail->va_ring[vq->vq_last_avail++ & mask]; + + uidx = vuh->vu_idx; + vue = &vuh->vu_ring[uidx++ & mask]; + vue->vu_idx = head; /* ie, vue->id = head */ + vue->vu_tlen = iolen; + vuh->vu_idx = uidx; +} + +/* + * Driver has finished processing "available" chains and calling + * vq_relchain on each one. If driver used all the available + * chains, used_all should be set. + * + * If the "used" index moved we may need to inform the guest, i.e., + * deliver an interrupt. Even if the used index did NOT move we + * may need to deliver an interrupt, if the avail ring is empty and + * we are supposed to interrupt on empty. + * + * Note that used_all_avail is provided by the caller because it's + * a snapshot of the ring state when he decided to finish interrupt + * processing -- it's possible that descriptors became available after + * that point. (It's also typically a constant 1/True as well.) + */ +void +vq_endchains(struct vqueue_info *vq, int used_all_avail) +{ + struct virtio_softc *vs; + uint16_t event_idx, new_idx, old_idx; + int intr; + + /* + * Interrupt generation: if we're using EVENT_IDX, + * interrupt if we've crossed the event threshold. + * Otherwise interrupt is generated if we added "used" entries, + * but suppressed by VRING_AVAIL_F_NO_INTERRUPT. + * + * In any case, though, if NOTIFY_ON_EMPTY is set and the + * entire avail was processed, we need to interrupt always. + */ + vs = vq->vq_vs; + new_idx = vq->vq_used->vu_idx; + old_idx = vq->vq_save_used; + if (used_all_avail && + (vs->vs_negotiated_caps & VIRTIO_F_NOTIFY_ON_EMPTY)) + intr = 1; + else if (vs->vs_negotiated_caps & VIRTIO_RING_F_EVENT_IDX) { + event_idx = VQ_USED_EVENT_IDX(vq); + /* + * This calculation is per docs and the kernel + * (see src/sys/dev/virtio/virtio_ring.h). 
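+ *
+ * A worked example (numbers invented for illustration): with
+ * old_idx = 65533, new_idx = 3 and event_idx = 65535, the left
+ * side is (uint16_t)(3 - 65535 - 1) = 3 and the right side is
+ * (uint16_t)(3 - 65533) = 6, so 3 < 6 and the interrupt is sent:
+ * the used index crossed the guest's event threshold even though
+ * both counters wrapped through zero.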
+ */ + intr = (uint16_t)(new_idx - event_idx - 1) < + (uint16_t)(new_idx - old_idx); + } else { + intr = new_idx != old_idx && + !(vq->vq_avail->va_flags & VRING_AVAIL_F_NO_INTERRUPT); + } + if (intr) + vq_interrupt(vs, vq); +} + +/* Note: these are in sorted order to make for a fast search */ +static struct config_reg { + uint16_t cr_offset; /* register offset */ + uint8_t cr_size; /* size (bytes) */ + uint8_t cr_ro; /* true => reg is read only */ + const char *cr_name; /* name of reg */ +} config_regs[] = { + { VTCFG_R_HOSTCAP, 4, 1, "HOSTCAP" }, + { VTCFG_R_GUESTCAP, 4, 0, "GUESTCAP" }, + { VTCFG_R_PFN, 4, 0, "PFN" }, + { VTCFG_R_QNUM, 2, 1, "QNUM" }, + { VTCFG_R_QSEL, 2, 0, "QSEL" }, + { VTCFG_R_QNOTIFY, 2, 0, "QNOTIFY" }, + { VTCFG_R_STATUS, 1, 0, "STATUS" }, + { VTCFG_R_ISR, 1, 0, "ISR" }, + { VTCFG_R_CFGVEC, 2, 0, "CFGVEC" }, + { VTCFG_R_QVEC, 2, 0, "QVEC" }, +}; + +static inline struct config_reg * +vi_find_cr(int offset) { + u_int hi, lo, mid; + struct config_reg *cr; + + lo = 0; + hi = sizeof(config_regs) / sizeof(*config_regs) - 1; + while (hi >= lo) { + mid = (hi + lo) >> 1; + cr = &config_regs[mid]; + if (cr->cr_offset == offset) + return (cr); + if (cr->cr_offset < offset) + lo = mid + 1; + else + hi = mid - 1; + } + return (NULL); +} + +/* + * Handle pci config space reads. + * If it's to the MSI-X info, do that. + * If it's part of the virtio standard stuff, do that. + * Otherwise dispatch to the actual driver. + */ +uint64_t +vi_pci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size) +{ + struct virtio_softc *vs = pi->pi_arg; + struct virtio_consts *vc; + struct config_reg *cr; + uint64_t virtio_config_size, max; + const char *name; + uint32_t newoff; + uint32_t value; + int error; + + if (vs->vs_flags & VIRTIO_USE_MSIX) { + if (baridx == pci_msix_table_bar(pi) || + baridx == pci_msix_pba_bar(pi)) { + return (pci_emul_msix_tread(pi, offset, size)); + } + } + + /* XXX probably should do something better than just assert() */ + assert(baridx == 0); + + if (vs->vs_mtx) + pthread_mutex_lock(vs->vs_mtx); + + vc = vs->vs_vc; + name = vc->vc_name; + value = size == 1 ? 0xff : size == 2 ? 0xffff : 0xffffffff; + + if (size != 1 && size != 2 && size != 4) + goto bad; + + if (pci_msix_enabled(pi)) + virtio_config_size = VTCFG_R_CFG1; + else + virtio_config_size = VTCFG_R_CFG0; + + if (offset >= virtio_config_size) { + /* + * Subtract off the standard size (including MSI-X + * registers if enabled) and dispatch to underlying driver. + * If that fails, fall into general code. + */ + newoff = offset - virtio_config_size; + max = vc->vc_cfgsize ? vc->vc_cfgsize : 0x100000000; + if (newoff + size > max) + goto bad; + error = (*vc->vc_cfgread)(DEV_SOFTC(vs), newoff, size, &value); + if (!error) + goto done; + } + +bad: + cr = vi_find_cr(offset); + if (cr == NULL || cr->cr_size != size) { + if (cr != NULL) { + /* offset must be OK, so size must be bad */ + fprintf(stderr, + "%s: read from %s: bad size %d\r\n", + name, cr->cr_name, size); + } else { + fprintf(stderr, + "%s: read from bad offset/size %jd/%d\r\n", + name, (uintmax_t)offset, size); + } + goto done; + } + + switch (offset) { + case VTCFG_R_HOSTCAP: + value = vc->vc_hv_caps; + break; + case VTCFG_R_GUESTCAP: + value = vs->vs_negotiated_caps; + break; + case VTCFG_R_PFN: + if (vs->vs_curq < vc->vc_nvq) + value = vs->vs_queues[vs->vs_curq].vq_pfn; + break; + case VTCFG_R_QNUM: + value = vs->vs_curq < vc->vc_nvq ? 
+ vs->vs_queues[vs->vs_curq].vq_qsize : 0; + break; + case VTCFG_R_QSEL: + value = vs->vs_curq; + break; + case VTCFG_R_QNOTIFY: + value = 0; /* XXX */ + break; + case VTCFG_R_STATUS: + value = vs->vs_status; + break; + case VTCFG_R_ISR: + value = vs->vs_isr; + vs->vs_isr = 0; /* a read clears this flag */ + if (value) + pci_lintr_deassert(pi); + break; + case VTCFG_R_CFGVEC: + value = vs->vs_msix_cfg_idx; + break; + case VTCFG_R_QVEC: + value = vs->vs_curq < vc->vc_nvq ? + vs->vs_queues[vs->vs_curq].vq_msix_idx : + VIRTIO_MSI_NO_VECTOR; + break; + } +done: + if (vs->vs_mtx) + pthread_mutex_unlock(vs->vs_mtx); + return (value); +} + +/* + * Handle pci config space writes. + * If it's to the MSI-X info, do that. + * If it's part of the virtio standard stuff, do that. + * Otherwise dispatch to the actual driver. + */ +void +vi_pci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value) +{ + struct virtio_softc *vs = pi->pi_arg; + struct vqueue_info *vq; + struct virtio_consts *vc; + struct config_reg *cr; + uint64_t virtio_config_size, max; + const char *name; + uint32_t newoff; + int error; + + if (vs->vs_flags & VIRTIO_USE_MSIX) { + if (baridx == pci_msix_table_bar(pi) || + baridx == pci_msix_pba_bar(pi)) { + pci_emul_msix_twrite(pi, offset, size, value); + return; + } + } + + /* XXX probably should do something better than just assert() */ + assert(baridx == 0); + + if (vs->vs_mtx) + pthread_mutex_lock(vs->vs_mtx); + + vc = vs->vs_vc; + name = vc->vc_name; + + if (size != 1 && size != 2 && size != 4) + goto bad; + + if (pci_msix_enabled(pi)) + virtio_config_size = VTCFG_R_CFG1; + else + virtio_config_size = VTCFG_R_CFG0; + + if (offset >= virtio_config_size) { + /* + * Subtract off the standard size (including MSI-X + * registers if enabled) and dispatch to underlying driver. + */ + newoff = offset - virtio_config_size; + max = vc->vc_cfgsize ? vc->vc_cfgsize : 0x100000000; + if (newoff + size > max) + goto bad; + error = (*vc->vc_cfgwrite)(DEV_SOFTC(vs), newoff, size, value); + if (!error) + goto done; + } + +bad: + cr = vi_find_cr(offset); + if (cr == NULL || cr->cr_size != size || cr->cr_ro) { + if (cr != NULL) { + /* offset must be OK, wrong size and/or reg is R/O */ + if (cr->cr_size != size) + fprintf(stderr, + "%s: write to %s: bad size %d\r\n", + name, cr->cr_name, size); + if (cr->cr_ro) + fprintf(stderr, + "%s: write to read-only reg %s\r\n", + name, cr->cr_name); + } else { + fprintf(stderr, + "%s: write to bad offset/size %jd/%d\r\n", + name, (uintmax_t)offset, size); + } + goto done; + } + + switch (offset) { + case VTCFG_R_GUESTCAP: + vs->vs_negotiated_caps = value & vc->vc_hv_caps; + break; + case VTCFG_R_PFN: + if (vs->vs_curq >= vc->vc_nvq) + goto bad_qindex; + vi_vq_init(vs, value); + break; + case VTCFG_R_QSEL: + /* + * Note that the guest is allowed to select an + * invalid queue; we just need to return a QNUM + * of 0 while the bad queue is selected. 
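+ *
+ * E.g. a guest may probe for queues simply by writing QSEL = 0,
+ * 1, 2, ... and reading QNUM after each write until it reads back
+ * 0; no error is signalled for the out-of-range selections.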
+ */ + vs->vs_curq = value; + break; + case VTCFG_R_QNOTIFY: + if (value >= vc->vc_nvq) { + fprintf(stderr, "%s: queue %d notify out of range\r\n", + name, (int)value); + goto done; + } + vq = &vs->vs_queues[value]; + if (vq->vq_notify) + (*vq->vq_notify)(DEV_SOFTC(vs), vq); + else if (vc->vc_qnotify) + (*vc->vc_qnotify)(DEV_SOFTC(vs), vq); + else + fprintf(stderr, + "%s: qnotify queue %d: missing vq/vc notify\r\n", + name, (int)value); + break; + case VTCFG_R_STATUS: + vs->vs_status = value; + if (value == 0) + (*vc->vc_reset)(DEV_SOFTC(vs)); + break; + case VTCFG_R_CFGVEC: + vs->vs_msix_cfg_idx = value; + break; + case VTCFG_R_QVEC: + if (vs->vs_curq >= vc->vc_nvq) + goto bad_qindex; + vq = &vs->vs_queues[vs->vs_curq]; + vq->vq_msix_idx = value; + break; + } + goto done; + +bad_qindex: + fprintf(stderr, + "%s: write config reg %s: curq %d >= max %d\r\n", + name, cr->cr_name, vs->vs_curq, vc->vc_nvq); +done: + if (vs->vs_mtx) + pthread_mutex_unlock(vs->vs_mtx); +} diff --git a/usr/src/cmd/bhyve/virtio.h b/usr/src/cmd/bhyve/virtio.h new file mode 100644 index 0000000000..1a2ebe8118 --- /dev/null +++ b/usr/src/cmd/bhyve/virtio.h @@ -0,0 +1,475 @@ +/*- + * Copyright (c) 2013 Chris Torek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/virtio.h 268276 2014-07-05 02:38:53Z grehan $ + */ + +#ifndef _VIRTIO_H_ +#define _VIRTIO_H_ + +/* + * These are derived from several virtio specifications. + * + * Some useful links: + * https://github.com/rustyrussell/virtio-spec + * http://people.redhat.com/pbonzini/virtio-spec.pdf + */ + +/* + * A virtual device has zero or more "virtual queues" (virtqueue). 
+ * Each virtqueue uses at least two 4096-byte pages, laid out thus:
+ *
+ *	+-----------------------------------------------+
+ *	|    "desc":  <N> descriptors, 16 bytes each    |
+ *	|   -----------------------------------------   |
+ *	|   "avail":   2 uint16; <N> uint16; 1 uint16   |
+ *	|   -----------------------------------------   |
+ *	|              pad to 4k boundary               |
+ *	+-----------------------------------------------+
+ *	|   "used": 2 x uint16; <N> elems; 1 uint16     |
+ *	|   -----------------------------------------   |
+ *	|              pad to 4k boundary               |
+ *	+-----------------------------------------------+
+ *
+ * The number <N> that appears here is always a power of two and is
+ * limited to no more than 32768 (as it must fit in a 16-bit field).
+ * If <N> is sufficiently large, the above will occupy more than
+ * two pages. In any case, all pages must be physically contiguous
+ * within the guest's physical address space.
+ *
+ * The <N> 16-byte "desc" descriptors consist of a 64-bit guest
+ * physical address <addr>, a 32-bit length <len>, a 16-bit
+ * <flags>, and a 16-bit <next> field (all in guest byte order).
+ *
+ * There are three flags that may be set in <flags>:
+ *	NEXT	descriptor is chained, so use its "next" field
+ *	WRITE	descriptor is for host to write into guest RAM
+ *		(else host is to read from guest RAM)
+ *	INDIRECT	descriptor address field is (guest physical)
+ *		address of a linear array of descriptors
+ *
+ * Unless INDIRECT is set, <len> is the number of bytes that may
+ * be read/written from guest physical address <addr>. If
+ * INDIRECT is set, WRITE is ignored and <len> provides the length
+ * of the indirect descriptors (and must be a multiple of
+ * 16). Note that NEXT may still be set in the main descriptor
+ * pointing to the indirect, and should be set in each indirect
+ * descriptor that uses the next descriptor (these should generally
+ * be numbered sequentially). However, INDIRECT must not be set
+ * in the indirect descriptors. Upon reaching an indirect descriptor
+ * without a NEXT bit, control returns to the direct descriptors.
+ *
+ * Except inside an indirect, each <next> value must be in the
+ * range [0 .. N) (i.e., the half-open interval). (Inside an
+ * indirect, each <next> must be in the range [0 .. <len>/16).)
+ *
+ * The "avail" data structures reside in the same pages as the
+ * "desc" structures since both together are used by the device to
+ * pass information to the hypervisor's virtual driver. These
+ * begin with a 16-bit <flags> field and 16-bit index <idx>, then
+ * have <N> 16-bit <ring> values, followed by one final 16-bit
+ * field <used_event>. The <ring> entries are simply indices
+ * into the descriptor ring (and thus must meet the same
+ * constraints as each <next> value). However, <idx> is counted
+ * up from 0 (initially) and simply wraps around after 65535; it
+ * is taken mod <N> to find the next available entry.
+ *
+ * The "used" ring occupies a separate page or pages, and contains
+ * values written from the virtual driver back to the guest OS.
+ * This begins with a 16-bit <flags> and 16-bit <idx>, then there
+ * are <N> "vring_used" elements, followed by a 16-bit <avail_event>.
+ * The "vring_used" elements consist of a 32-bit <id> and a
+ * 32-bit <len> (vu_tlen below). The <id> is simply the index of
+ * the head of a descriptor chain the guest made available
+ * earlier, and the <len> is the number of bytes actually written,
+ * e.g., in the case of a network driver that provided a large
+ * receive buffer but received only a small amount of data.
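+ *
+ * As a concrete (illustrative) example: if the guest posts the
+ * chain headed by descriptor 5 as a receive buffer and the device
+ * writes 64 bytes into it, the device stores { vu_idx = 5,
+ * vu_tlen = 64 } into vu_ring[<idx> mod <N>] and then increments
+ * the used ring's <idx>; that is exactly what vq_relchain() below
+ * (in virtio.c) does.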
+ *
+ * The two event fields, <used_event> and <avail_event>, in the
+ * avail and used rings (respectively -- note the reversal!), are
+ * always provided, but are used only if the virtual device
+ * negotiates the VIRTIO_RING_F_EVENT_IDX feature during feature
+ * negotiation. Similarly, both rings provide a flag --
+ * VRING_AVAIL_F_NO_INTERRUPT and VRING_USED_F_NO_NOTIFY -- in
+ * their <flags> field, indicating that the guest does not need an
+ * interrupt, or that the hypervisor driver does not need a
+ * notify, when descriptors are added to the corresponding ring.
+ * (These are provided only for interrupt optimization and need
+ * not be implemented.)
+ */
+#define VRING_ALIGN	4096
+
+#define VRING_DESC_F_NEXT	(1 << 0)
+#define VRING_DESC_F_WRITE	(1 << 1)
+#define VRING_DESC_F_INDIRECT	(1 << 2)
+
+struct virtio_desc {			/* AKA vring_desc */
+	uint64_t	vd_addr;	/* guest physical address */
+	uint32_t	vd_len;		/* length of scatter/gather seg */
+	uint16_t	vd_flags;	/* VRING_DESC_F_* */
+	uint16_t	vd_next;	/* next desc if F_NEXT */
+} __packed;
+
+struct virtio_used {			/* AKA vring_used_elem */
+	uint32_t	vu_idx;		/* head of used descriptor chain */
+	uint32_t	vu_tlen;	/* length written-to */
+} __packed;
+
+#define VRING_AVAIL_F_NO_INTERRUPT	1
+
+struct vring_avail {
+	uint16_t	va_flags;	/* VRING_AVAIL_F_* */
+	uint16_t	va_idx;		/* counts to 65535, then cycles */
+	uint16_t	va_ring[];	/* size N, reported in QNUM value */
+/*	uint16_t	va_used_event;	-- after N ring entries */
+} __packed;
+
+#define VRING_USED_F_NO_NOTIFY	1
+struct vring_used {
+	uint16_t	vu_flags;	/* VRING_USED_F_* */
+	uint16_t	vu_idx;		/* counts to 65535, then cycles */
+	struct virtio_used	vu_ring[];	/* size N */
+/*	uint16_t	vu_avail_event;	-- after N ring entries */
+} __packed;
+
+/*
+ * The address of any given virtual queue is determined by a single
+ * Page Frame Number register. The guest writes the PFN into the
+ * PCI config space. However, a device that has two or more
+ * virtqueues can have a different PFN, and size, for each queue.
+ * The number of queues is determinable via the PCI config space
+ * VTCFG_R_QSEL register. Writes to QSEL select the queue: 0 means
+ * queue #0, 1 means queue #1, etc. Once a queue is selected, the
+ * remaining PFN and QNUM registers refer to that queue.
+ *
+ * QNUM is a read-only register containing a nonzero power of two
+ * that indicates the (hypervisor's) queue size. Or, if reading it
+ * produces zero, the hypervisor does not have a corresponding
+ * queue. (The number of possible queues depends on the virtual
+ * device. The block device has just one; the network device
+ * provides either two -- 0 = receive, 1 = transmit -- or three,
+ * with 2 = control.)
+ *
+ * PFN is a read/write register giving the physical page address of
+ * the virtqueue in guest memory (the guest must allocate enough space
+ * based on the hypervisor's provided QNUM).
+ *
+ * QNOTIFY is effectively write-only: when the guest writes a queue
+ * number to the register, the hypervisor should scan the specified
+ * virtqueue. (Reading QNOTIFY currently always gets 0).
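+ *
+ * Putting the above together, guest-side setup of one queue is
+ * (a sketch of the register protocol, not code from this change;
+ * VRING_PFN and vring_size() are defined below):
+ *
+ *	write n to QSEL;
+ *	qsz = read QNUM;	(zero means "no such queue")
+ *	allocate vring_size(qsz) bytes, 4 KiB aligned and
+ *	    physically contiguous;
+ *	write (guest physical address >> VRING_PFN) to PFN;
+ *
+ * For qsz = 256, vring_size() works out to 16*256 = 4096 bytes of
+ * descriptors plus (3 + 256)*2 = 518 bytes of "avail", rounded up
+ * to 8192, plus 6 + 8*256 = 2054 bytes of "used", rounded up to
+ * 4096: 12288 bytes in all.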
+ */ + +/* + * PFN register shift amount + */ +#define VRING_PFN 12 + +/* + * Virtio device types + * + * XXX Should really be merged with defines + */ +#define VIRTIO_TYPE_NET 1 +#define VIRTIO_TYPE_BLOCK 2 +#define VIRTIO_TYPE_CONSOLE 3 +#define VIRTIO_TYPE_ENTROPY 4 +#define VIRTIO_TYPE_BALLOON 5 +#define VIRTIO_TYPE_IOMEMORY 6 +#define VIRTIO_TYPE_RPMSG 7 +#define VIRTIO_TYPE_SCSI 8 +#define VIRTIO_TYPE_9P 9 + +/* experimental IDs start at 65535 and work down */ + +/* + * PCI vendor/device IDs + */ +#define VIRTIO_VENDOR 0x1AF4 +#define VIRTIO_DEV_NET 0x1000 +#define VIRTIO_DEV_BLOCK 0x1001 +#define VIRTIO_DEV_RANDOM 0x1002 + +/* + * PCI config space constants. + * + * If MSI-X is enabled, the ISR register is generally not used, + * and the configuration vector and queue vector appear at offsets + * 20 and 22 with the remaining configuration registers at 24. + * If MSI-X is not enabled, those two registers disappear and + * the remaining configuration registers start at offset 20. + */ +#define VTCFG_R_HOSTCAP 0 +#define VTCFG_R_GUESTCAP 4 +#define VTCFG_R_PFN 8 +#define VTCFG_R_QNUM 12 +#define VTCFG_R_QSEL 14 +#define VTCFG_R_QNOTIFY 16 +#define VTCFG_R_STATUS 18 +#define VTCFG_R_ISR 19 +#define VTCFG_R_CFGVEC 20 +#define VTCFG_R_QVEC 22 +#define VTCFG_R_CFG0 20 /* No MSI-X */ +#define VTCFG_R_CFG1 24 /* With MSI-X */ +#define VTCFG_R_MSIX 20 + +/* + * Bits in VTCFG_R_STATUS. Guests need not actually set any of these, + * but a guest writing 0 to this register means "please reset". + */ +#define VTCFG_STATUS_ACK 0x01 /* guest OS has acknowledged dev */ +#define VTCFG_STATUS_DRIVER 0x02 /* guest OS driver is loaded */ +#define VTCFG_STATUS_DRIVER_OK 0x04 /* guest OS driver ready */ +#define VTCFG_STATUS_FAILED 0x80 /* guest has given up on this dev */ + +/* + * Bits in VTCFG_R_ISR. These apply only if not using MSI-X. + * + * (We don't [yet?] ever use CONF_CHANGED.) + */ +#define VTCFG_ISR_QUEUES 0x01 /* re-scan queues */ +#define VTCFG_ISR_CONF_CHANGED 0x80 /* configuration changed */ + +#define VIRTIO_MSI_NO_VECTOR 0xFFFF + +/* + * Feature flags. + * Note: bits 0 through 23 are reserved to each device type. + */ +#define VIRTIO_F_NOTIFY_ON_EMPTY (1 << 24) +#define VIRTIO_RING_F_INDIRECT_DESC (1 << 28) +#define VIRTIO_RING_F_EVENT_IDX (1 << 29) + +/* From section 2.3, "Virtqueue Configuration", of the virtio specification */ +static inline size_t +vring_size(u_int qsz) +{ + size_t size; + + /* constant 3 below = va_flags, va_idx, va_used_event */ + size = sizeof(struct virtio_desc) * qsz + sizeof(uint16_t) * (3 + qsz); + size = roundup2(size, VRING_ALIGN); + + /* constant 3 below = vu_flags, vu_idx, vu_avail_event */ + size += sizeof(uint16_t) * 3 + sizeof(struct virtio_used) * qsz; + size = roundup2(size, VRING_ALIGN); + + return (size); +} + +struct vmctx; +struct pci_devinst; +struct vqueue_info; + +/* + * A virtual device, with some number (possibly 0) of virtual + * queues and some size (possibly 0) of configuration-space + * registers private to the device. The virtio_softc should come + * at the front of each "derived class", so that a pointer to the + * virtio_softc is also a pointer to the more specific, derived- + * from-virtio driver's softc. + * + * Note: inside each hypervisor virtio driver, changes to these + * data structures must be locked against other threads, if any. + * Except for PCI config space register read/write, we assume each + * driver does the required locking, but we need a pointer to the + * lock (if there is one) for PCI config space read/write ops. 
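+ *
+ * A derived device softc therefore looks like this (hypothetical
+ * "vtfoo" names, shown only for illustration):
+ *
+ *	struct pci_vtfoo_softc {
+ *		struct virtio_softc vsc_vs;	(must come first)
+ *		pthread_mutex_t vsc_mtx;
+ *		... device-specific state ...
+ *	};
+ *
+ * so that converting between the device softc and its virtio_softc
+ * is a no-op pointer cast, as DEV_SOFTC() in virtio.c assumes.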
+ * + * When the guest reads or writes the device's config space, the + * generic layer checks for operations on the special registers + * described above. If the offset of the register(s) being read + * or written is past the CFG area (CFG0 or CFG1), the request is + * passed on to the virtual device, after subtracting off the + * generic-layer size. (So, drivers can just use the offset as + * an offset into "struct config", for instance.) + * + * (The virtio layer also makes sure that the read or write is to/ + * from a "good" config offset, hence vc_cfgsize, and on BAR #0. + * However, the driver must verify the read or write size and offset + * and that no one is writing a readonly register.) + * + * The BROKED flag ("this thing done gone and broked") is for future + * use. + */ +#define VIRTIO_USE_MSIX 0x01 +#define VIRTIO_EVENT_IDX 0x02 /* use the event-index values */ +#define VIRTIO_BROKED 0x08 /* ??? */ + +struct virtio_softc { + struct virtio_consts *vs_vc; /* constants (see below) */ + int vs_flags; /* VIRTIO_* flags from above */ + pthread_mutex_t *vs_mtx; /* POSIX mutex, if any */ + struct pci_devinst *vs_pi; /* PCI device instance */ + uint32_t vs_negotiated_caps; /* negotiated capabilities */ + struct vqueue_info *vs_queues; /* one per vc_nvq */ + int vs_curq; /* current queue */ + uint8_t vs_status; /* value from last status write */ + uint8_t vs_isr; /* ISR flags, if not MSI-X */ + uint16_t vs_msix_cfg_idx; /* MSI-X vector for config event */ +}; + +#define VS_LOCK(vs) \ +do { \ + if (vs->vs_mtx) \ + pthread_mutex_lock(vs->vs_mtx); \ +} while (0) + +#define VS_UNLOCK(vs) \ +do { \ + if (vs->vs_mtx) \ + pthread_mutex_unlock(vs->vs_mtx); \ +} while (0) + +struct virtio_consts { + const char *vc_name; /* name of driver (for diagnostics) */ + int vc_nvq; /* number of virtual queues */ + size_t vc_cfgsize; /* size of dev-specific config regs */ + void (*vc_reset)(void *); /* called on virtual device reset */ + void (*vc_qnotify)(void *, struct vqueue_info *); + /* called on QNOTIFY if no VQ notify */ + int (*vc_cfgread)(void *, int, int, uint32_t *); + /* called to read config regs */ + int (*vc_cfgwrite)(void *, int, int, uint32_t); + /* called to write config regs */ + uint64_t vc_hv_caps; /* hypervisor-provided capabilities */ +}; + +/* + * Data structure allocated (statically) per virtual queue. + * + * Drivers may change vq_qsize after a reset. When the guest OS + * requests a device reset, the hypervisor first calls + * vs->vs_vc->vc_reset(); then the data structure below is + * reinitialized (for each virtqueue: vs->vs_vc->vc_nvq). + * + * The remaining fields should only be fussed-with by the generic + * code. + * + * Note: the addresses of vq_desc, vq_avail, and vq_used are all + * computable from each other, but it's a lot simpler if we just + * keep a pointer to each one. The event indices are similarly + * (but more easily) computable, and this time we'll compute them: + * they're just XX_ring[N]. + */ +#define VQ_ALLOC 0x01 /* set once we have a pfn */ +#define VQ_BROKED 0x02 /* ??? 
*/ +struct vqueue_info { + uint16_t vq_qsize; /* size of this queue (a power of 2) */ + void (*vq_notify)(void *, struct vqueue_info *); + /* called instead of vc_notify, if not NULL */ + + struct virtio_softc *vq_vs; /* backpointer to softc */ + uint16_t vq_num; /* we're the num'th queue in the softc */ + + uint16_t vq_flags; /* flags (see above) */ + uint16_t vq_last_avail; /* a recent value of vq_avail->va_idx */ + uint16_t vq_save_used; /* saved vq_used->vu_idx; see vq_endchains */ + uint16_t vq_msix_idx; /* MSI-X index, or VIRTIO_MSI_NO_VECTOR */ + + uint32_t vq_pfn; /* PFN of virt queue (not shifted!) */ + + volatile struct virtio_desc *vq_desc; /* descriptor array */ + volatile struct vring_avail *vq_avail; /* the "avail" ring */ + volatile struct vring_used *vq_used; /* the "used" ring */ + +}; +/* as noted above, these are sort of backwards, name-wise */ +#define VQ_AVAIL_EVENT_IDX(vq) \ + (*(volatile uint16_t *)&(vq)->vq_used->vu_ring[(vq)->vq_qsize]) +#define VQ_USED_EVENT_IDX(vq) \ + ((vq)->vq_avail->va_ring[(vq)->vq_qsize]) + +/* + * Is this ring ready for I/O? + */ +static inline int +vq_ring_ready(struct vqueue_info *vq) +{ + + return (vq->vq_flags & VQ_ALLOC); +} + +/* + * Are there "available" descriptors? (This does not count + * how many, just returns True if there are some.) + */ +static inline int +vq_has_descs(struct vqueue_info *vq) +{ + + return (vq_ring_ready(vq) && vq->vq_last_avail != + vq->vq_avail->va_idx); +} + +/* + * Called by virtio driver as it starts processing chains. Each + * completed chain (obtained from vq_getchain()) is released by + * calling vq_relchain(), then when all are done, vq_endchains() + * can tell if / how-many chains were processed and know whether + * and how to generate an interrupt. + */ +static inline void +vq_startchains(struct vqueue_info *vq) +{ + + vq->vq_save_used = vq->vq_used->vu_idx; +} + +/* + * Deliver an interrupt to guest on the given virtual queue + * (if possible, or a generic MSI interrupt if not using MSI-X). + */ +static inline void +vq_interrupt(struct virtio_softc *vs, struct vqueue_info *vq) +{ + + if (pci_msix_enabled(vs->vs_pi)) + pci_generate_msix(vs->vs_pi, vq->vq_msix_idx); + else { + VS_LOCK(vs); + vs->vs_isr |= VTCFG_ISR_QUEUES; + pci_generate_msi(vs->vs_pi, 0); + pci_lintr_assert(vs->vs_pi); + VS_UNLOCK(vs); + } +} + +struct iovec; +void vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc, + void *dev_softc, struct pci_devinst *pi, + struct vqueue_info *queues); +int vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix); +void vi_reset_dev(struct virtio_softc *); +void vi_set_io_bar(struct virtio_softc *, int); + +int vq_getchain(struct vqueue_info *vq, + struct iovec *iov, int n_iov, uint16_t *flags); +void vq_relchain(struct vqueue_info *vq, uint32_t iolen); +void vq_endchains(struct vqueue_info *vq, int used_all_avail); + +uint64_t vi_pci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size); +void vi_pci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value); +#endif /* _VIRTIO_H_ */ diff --git a/usr/src/cmd/bhyve/xmsr.c b/usr/src/cmd/bhyve/xmsr.c new file mode 100644 index 0000000000..0c097251e0 --- /dev/null +++ b/usr/src/cmd/bhyve/xmsr.c @@ -0,0 +1,237 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyve/xmsr.c 279227 2015-02-24 05:15:40Z neel $ + */ + +#include +__FBSDID("$FreeBSD: head/usr.sbin/bhyve/xmsr.c 279227 2015-02-24 05:15:40Z neel $"); + +#include + +#include +#include +#include + +#include + +#include +#include +#include + +#include "xmsr.h" + +static int cpu_vendor_intel, cpu_vendor_amd; + +int +emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t num, uint64_t val) +{ + + if (cpu_vendor_intel) { + switch (num) { +#ifndef __FreeBSD__ + case MSR_PERFCTR0: + case MSR_PERFCTR1: + case MSR_EVNTSEL0: + case MSR_EVNTSEL1: + return (0); +#endif + case 0xd04: /* Sandy Bridge uncore PMCs */ + case 0xc24: + return (0); + case MSR_BIOS_UPDT_TRIG: + return (0); + case MSR_BIOS_SIGN: + return (0); + default: + break; + } + } else if (cpu_vendor_amd) { + switch (num) { + case MSR_HWCR: + /* + * Ignore writes to hardware configuration MSR. + */ + return (0); + + case MSR_NB_CFG1: + case MSR_IC_CFG: + return (0); /* Ignore writes */ + + case MSR_PERFEVSEL0: + case MSR_PERFEVSEL1: + case MSR_PERFEVSEL2: + case MSR_PERFEVSEL3: + /* Ignore writes to the PerfEvtSel MSRs */ + return (0); + + case MSR_K7_PERFCTR0: + case MSR_K7_PERFCTR1: + case MSR_K7_PERFCTR2: + case MSR_K7_PERFCTR3: + /* Ignore writes to the PerfCtr MSRs */ + return (0); + + case MSR_P_STATE_CONTROL: + /* Ignore write to change the P-state */ + return (0); + + default: + break; + } + } + return (-1); +} + +int +emulate_rdmsr(struct vmctx *ctx, int vcpu, uint32_t num, uint64_t *val) +{ + int error = 0; + + if (cpu_vendor_intel) { + switch (num) { + case MSR_BIOS_SIGN: + case MSR_IA32_PLATFORM_ID: + case MSR_PKG_ENERGY_STATUS: + case MSR_PP0_ENERGY_STATUS: + case MSR_PP1_ENERGY_STATUS: + case MSR_DRAM_ENERGY_STATUS: + *val = 0; + break; + case MSR_RAPL_POWER_UNIT: + /* + * Use the default value documented in section + * "RAPL Interfaces" in Intel SDM vol3. + */ + *val = 0x000a1003; + break; + default: + error = -1; + break; + } + } else if (cpu_vendor_amd) { + switch (num) { + case MSR_BIOS_SIGN: + *val = 0; + break; + case MSR_HWCR: + /* + * Bios and Kernel Developer's Guides for AMD Families + * 12H, 14H, 15H and 16H. 
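+ *
+ * The value handed back below is the documented reset value
+ * (0x01000010) with bit 9 (the MONITOR/MWAIT disable bit, as the
+ * code notes) ORed in, steering guests away from MONITOR/MWAIT
+ * sequences.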
+ */ + *val = 0x01000010; /* Reset value */ + *val |= 1 << 9; /* MONITOR/MWAIT disable */ + break; + + case MSR_NB_CFG1: + case MSR_IC_CFG: + /* + * The reset value is processor family dependent so + * just return 0. + */ + *val = 0; + break; + + case MSR_PERFEVSEL0: + case MSR_PERFEVSEL1: + case MSR_PERFEVSEL2: + case MSR_PERFEVSEL3: + /* + * PerfEvtSel MSRs are not properly virtualized so just + * return zero. + */ + *val = 0; + break; + + case MSR_K7_PERFCTR0: + case MSR_K7_PERFCTR1: + case MSR_K7_PERFCTR2: + case MSR_K7_PERFCTR3: + /* + * PerfCtr MSRs are not properly virtualized so just + * return zero. + */ + *val = 0; + break; + + case MSR_SMM_ADDR: + case MSR_SMM_MASK: + /* + * Return the reset value defined in the AMD Bios and + * Kernel Developer's Guide. + */ + *val = 0; + break; + + case MSR_P_STATE_LIMIT: + case MSR_P_STATE_CONTROL: + case MSR_P_STATE_STATUS: + case MSR_P_STATE_CONFIG(0): /* P0 configuration */ + *val = 0; + break; + + /* + * OpenBSD guests test bit 0 of this MSR to detect if the + * workaround for erratum 721 is already applied. + * http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf + */ + case 0xC0011029: + *val = 1; + break; + + default: + error = -1; + break; + } + } else { + error = -1; + } + return (error); +} + +int +init_msr(void) +{ + int error; + u_int regs[4]; + char cpu_vendor[13]; + + do_cpuid(0, regs); + ((u_int *)&cpu_vendor)[0] = regs[1]; + ((u_int *)&cpu_vendor)[1] = regs[3]; + ((u_int *)&cpu_vendor)[2] = regs[2]; + cpu_vendor[12] = '\0'; + + error = 0; + if (strcmp(cpu_vendor, "AuthenticAMD") == 0) { + cpu_vendor_amd = 1; + } else if (strcmp(cpu_vendor, "GenuineIntel") == 0) { + cpu_vendor_intel = 1; + } else { + fprintf(stderr, "Unknown cpu vendor \"%s\"\n", cpu_vendor); + error = -1; + } + return (error); +} diff --git a/usr/src/cmd/bhyve/xmsr.h b/usr/src/cmd/bhyve/xmsr.h new file mode 100644 index 0000000000..ac3c147442 --- /dev/null +++ b/usr/src/cmd/bhyve/xmsr.h @@ -0,0 +1,36 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD: head/usr.sbin/bhyve/xmsr.h 271888 2014-09-20 02:35:21Z neel $ + */ + +#ifndef _XMSR_H_ +#define _XMSR_H_ + +int init_msr(void); +int emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t val); +int emulate_rdmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t *val); + +#endif diff --git a/usr/src/cmd/bhyveconsole/Makefile b/usr/src/cmd/bhyveconsole/Makefile new file mode 100644 index 0000000000..11d34e6599 --- /dev/null +++ b/usr/src/cmd/bhyveconsole/Makefile @@ -0,0 +1,41 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2013 Pluribus Networks Inc. +# + +include ../Makefile.cmd + +SUBDIRS= $(MACH) + +all := TARGET = all +install := TARGET = install +clean := TARGET = clean +clobber := TARGET = clobber +lint := TARGET = lint + +.KEEP_STATE: + +all: $(SUBDIRS) + +clean clobber lint: $(SUBDIRS) + +install: $(SUBDIRS) + -$(RM) $(ROOTUSRSBINPROG) + -$(LN) $(ISAEXEC) $(ROOTUSRSBINPROG) + +$(SUBDIRS): FRC + @cd $@; pwd; $(MAKE) CW_NO_SHADOW=true __GNUC= $(TARGET) + +FRC: + +include ../Makefile.targ diff --git a/usr/src/cmd/bhyveconsole/bhyveconsole.c b/usr/src/cmd/bhyveconsole/bhyveconsole.c new file mode 100644 index 0000000000..7f237a72f6 --- /dev/null +++ b/usr/src/cmd/bhyveconsole/bhyveconsole.c @@ -0,0 +1,360 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static int masterfd; +static struct termios save_termios; +static int save_fd; + +static int nocmdchar = 0; +static char cmdchar = '~'; + +static const char *pname; + +#define BCONS_BUFSIZ 8192 + +static void +usage(void) +{ + (void) fprintf(stderr, "usage: %s vmname\n", pname); + exit(2); +} + +static void +bcons_error(const char *fmt, ...) +{ + va_list alist; + + (void) fprintf(stderr, "%s: ", pname); + va_start(alist, fmt); + (void) vfprintf(stderr, fmt, alist); + va_end(alist); + (void) fprintf(stderr, "\n"); +} + +static void +bcons_perror(const char *str) +{ + const char *estr; + + if ((estr = strerror(errno)) != NULL) + (void) fprintf(stderr, "%s: %s: %s\n", pname, str, estr); + else + (void) fprintf(stderr, "%s: %s: errno %d\n", pname, str, errno); +} + +/* + * Create the unix domain socket and call bhyve; handshake + * with it to determine whether it will allow us to connect. 
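+ *
+ * The exchange is line oriented; e.g. (the PID here is invented
+ * for illustration):
+ *
+ *	client: IDENT 1234\n
+ *	server: OK\n
+ *
+ * On refusal the server instead answers with the process ID of
+ * the client that already holds the console.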
+ */ +static int +get_console(const char *vmname) +{ + int sockfd = -1; + struct sockaddr_un servaddr; + char clientid[MAXPATHLEN]; + char handshake[MAXPATHLEN], c; + int msglen; + int i = 0, err = 0; + + if ((sockfd = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) { + bcons_perror("could not create socket"); + return (-1); + } + + bzero(&servaddr, sizeof (servaddr)); + servaddr.sun_family = AF_UNIX; + (void) snprintf(servaddr.sun_path, sizeof (servaddr.sun_path), + BHYVE_CONS_SOCKPATH, vmname); + + if (connect(sockfd, (struct sockaddr *)&servaddr, + sizeof (servaddr)) == -1) { + bcons_perror("Could not connect to console server"); + goto bad; + } + masterfd = sockfd; + + msglen = snprintf(clientid, sizeof (clientid), "IDENT %lu\n", + getpid()); + assert(msglen > 0 && msglen < sizeof (clientid)); + + if (write(masterfd, clientid, msglen) != msglen) { + bcons_error("protocol error"); + goto bad; + } + + /* + * Take care not to accumulate more than our fill, and leave room for + * the NUL at the end. + */ + while ((err = read(masterfd, &c, 1)) == 1) { + if (i >= (sizeof (handshake) - 1)) + break; + if (c == '\n') + break; + handshake[i] = c; + i++; + } + handshake[i] = '\0'; + + /* + * If something went wrong during the handshake we bail; perhaps + * the server died off. + */ + if (err == -1) { + bcons_perror("Could not connect to console server"); + goto bad; + } + + if (strncmp(handshake, "OK", sizeof (handshake)) == 0) + return (0); + + bcons_error("Console is already in use by process ID %s.", + handshake); +bad: + (void) close(sockfd); + masterfd = -1; + return (-1); +} + +/* + * Place terminal into raw mode. + */ +static int +set_tty_rawmode(int fd) +{ + struct termios term; + if (tcgetattr(fd, &term) < 0) { + bcons_perror("failed to get user terminal settings"); + return (-1); + } + + /* Stash for later, so we can revert back to previous mode */ + save_termios = term; + save_fd = fd; + + /* disable 8->7 bit strip, start/stop, enable any char to restart */ + term.c_iflag &= ~(ISTRIP|IXON|IXANY); + /* disable NL->CR, CR->NL, ignore CR, UPPER->lower */ + term.c_iflag &= ~(INLCR|ICRNL|IGNCR|IUCLC); + /* disable output post-processing */ + term.c_oflag &= ~OPOST; + /* disable canonical mode, signal chars, echo & extended functions */ + term.c_lflag &= ~(ICANON|ISIG|ECHO|IEXTEN); + + term.c_cc[VMIN] = 1; /* byte-at-a-time */ + term.c_cc[VTIME] = 0; + + if (tcsetattr(STDIN_FILENO, TCSAFLUSH, &term)) { + bcons_perror("failed to set user terminal to raw mode"); + return (-1); + } + + return (0); +} + +/* + * reset terminal settings for global environment + */ +static void +reset_tty(void) +{ + (void) tcsetattr(save_fd, TCSADRAIN, &save_termios); +} + +/* + * process_user_input watches the input stream for the escape sequence for + * 'quit' (by default, tilde-period). Because we might be fed just one + * keystroke at a time, state associated with the user input (are we at the + * beginning of the line? are we locally echoing the next character?) is + * maintained by beginning_of_line and local_echo across calls to the routine. + * + * This routine returns -1 when the 'quit' escape sequence has been issued, + * or an error is encountered and 0 otherwise. 
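+ *
+ * For example, with the default command character, typing "~." at
+ * the start of a line disconnects the console, while the same two
+ * characters typed mid-line are simply forwarded to the guest.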
+ */ +static int +process_user_input(int out_fd, int in_fd) +{ + static boolean_t beginning_of_line = B_TRUE; + static boolean_t local_echo = B_FALSE; + char ibuf[BCONS_BUFSIZ]; + int nbytes; + char *buf = ibuf; + char c; + + nbytes = read(in_fd, ibuf, sizeof (ibuf)); + if (nbytes == -1 && errno != EINTR) + return (-1); + + if (nbytes == -1) /* The read was interrupted. */ + return (0); + + for (c = *buf; nbytes > 0; c = *buf, --nbytes) { + buf++; + if (beginning_of_line && !nocmdchar) { + beginning_of_line = B_FALSE; + if (c == cmdchar) { + local_echo = B_TRUE; + continue; + } + } else if (local_echo) { + local_echo = B_FALSE; + if (c == '.') { + (void) write(STDOUT_FILENO, &cmdchar, 1); + (void) write(STDOUT_FILENO, &c, 1); + return (-1); + } + } + + (void) write(out_fd, &c, 1); + + beginning_of_line = (c == '\r' || c == '\n'); + } + + return (0); +} + +static int +process_output(int in_fd, int out_fd) +{ + int wrote = 0; + int cc; + char ibuf[BCONS_BUFSIZ]; + + cc = read(in_fd, ibuf, sizeof (ibuf)); + if (cc == -1 && errno != EINTR) + return (-1); + if (cc == 0) /* EOF */ + return (-1); + if (cc == -1) /* The read was interrupted. */ + return (0); + + do { + int len; + + len = write(out_fd, ibuf + wrote, cc - wrote); + if (len == -1 && errno != EINTR) + return (-1); + if (len != -1) + wrote += len; + } while (wrote < cc); + + return (0); +} + +/* + * This is the main I/O loop. + */ +static void +doio(void) +{ + struct pollfd pollfds[2]; + int res; + + /* read from vm and write to stdout */ + pollfds[0].fd = masterfd; + pollfds[0].events = POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI; + + /* read from stdin and write to vm */ + pollfds[1].fd = STDIN_FILENO; + pollfds[1].events = pollfds[0].events; + + for (;;) { + pollfds[0].revents = pollfds[1].revents = 0; + + res = poll(pollfds, + sizeof (pollfds) / sizeof (struct pollfd), -1); + + if (res == -1 && errno != EINTR) { + bcons_perror("poll failed"); + /* we are hosed, close connection */ + break; + } + + /* event from master side stdout */ + if (pollfds[0].revents) { + if (pollfds[0].revents & + (POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI)) { + if (process_output(masterfd, STDOUT_FILENO) + != 0) + break; + } else { + break; + } + } + + /* event from user stdin side */ + if (pollfds[1].revents) { + if (pollfds[1].revents & + (POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI)) { + if (process_user_input(masterfd, STDIN_FILENO) + != 0) + break; + } else { + break; + } + } + } +} + +int +main(int argc, char **argv) +{ + char *vmname; + + pname = basename(argv[0]); + + if (argc == 2) { + vmname = argv[1]; + } else { + usage(); + } + + /* + * Make contact with bhyve + */ + if (get_console(vmname) == -1) + return (1); + + (void) printf("[Connected to vm '%s' console]\n", vmname); + + if (set_tty_rawmode(STDIN_FILENO) == -1) { + reset_tty(); + bcons_perror("failed to set stdin pty to raw mode"); + return (1); + } + + /* + * Run the I/O loop until we get disconnected. + */ + doio(); + reset_tty(); + (void) printf("\n[Connection to vm '%s' console closed]\n", vmname); + + return (0); +} diff --git a/usr/src/cmd/bhyveconsole/i386/Makefile b/usr/src/cmd/bhyveconsole/i386/Makefile new file mode 100644 index 0000000000..c4f317a9fa --- /dev/null +++ b/usr/src/cmd/bhyveconsole/i386/Makefile @@ -0,0 +1,43 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. 
+# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2013 Pluribus Networks Inc. +# + +PROG= bhyveconsole + +OBJS= bhyveconsole.o + +SRCS= $(OBJS:%.o=../%.c) + +include ../../Makefile.cmd + +CFLAGS += $(CCVERBOSE) +LDLIBS += -lsocket + +.KEEP_STATE: + +%.o: ../%.c + $(COMPILE.c) $< + +all: $(PROG) + +$(PROG): $(OBJS) + $(LINK.c) $(OBJS) -o $@ $(LDLIBS) + $(POST_PROCESS) + +install: all $(ROOTUSRSBINPROG32) + +clean: + $(RM) $(OBJS) + +include ../../Makefile.targ diff --git a/usr/src/cmd/bhyvectl/Makefile b/usr/src/cmd/bhyvectl/Makefile new file mode 100644 index 0000000000..fe98204056 --- /dev/null +++ b/usr/src/cmd/bhyvectl/Makefile @@ -0,0 +1,41 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2013 Pluribus Networks Inc. +# + +PROG = bhyvectl + +include ../Makefile.cmd + +$(BUILD64)SUBDIRS += $(MACH64) + +all := TARGET = all +install := TARGET = install +clean := TARGET = clean +clobber := TARGET = clobber +lint := TARGET = lint + +.KEEP_STATE: + +all clean clobber lint: $(SUBDIRS) + +install: $(SUBDIRS) + -$(RM) $(ROOTUSRSBINPROG) + -$(LN) $(ISAEXEC) $(ROOTUSRSBINPROG) + +$(SUBDIRS): FRC + @cd $@; pwd; $(MAKE) CW_NO_SHADOW=true __GNUC= $(TARGET) + +FRC: + +include ../Makefile.targ diff --git a/usr/src/cmd/bhyvectl/Makefile.com b/usr/src/cmd/bhyvectl/Makefile.com new file mode 100644 index 0000000000..03ca34792c --- /dev/null +++ b/usr/src/cmd/bhyvectl/Makefile.com @@ -0,0 +1,48 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2013 Pluribus Networks Inc. +# + +PROG= bhyvectl + +SRCS = bhyvectl.c +OBJS = $(SRCS:.c=.o) + +include ../../Makefile.cmd + +.KEEP_STATE: + +CFLAGS += $(CCVERBOSE) +CPPFLAGS = -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd $(CPPFLAGS.master) \ + -I$(ROOT)/usr/platform/i86pc/include \ + -I$(SRC)/uts/i86pc/io/vmm +LDLIBS += -lvmmapi + +all: $(PROG) + +$(PROG): $(OBJS) + $(LINK.c) -o $@ $(OBJS) $(LDFLAGS) $(LDLIBS) + $(POST_PROCESS) + +install: all $(ROOTUSRSBINPROG) + +clean: + $(RM) $(OBJS) + +lint: lint_SRCS + +include ../../Makefile.targ + +%.o: ../%.c + $(COMPILE.c) -I$(SRC)/common $< + $(POST_PROCESS_O) diff --git a/usr/src/cmd/bhyvectl/amd64/Makefile b/usr/src/cmd/bhyvectl/amd64/Makefile new file mode 100644 index 0000000000..b602c50d05 --- /dev/null +++ b/usr/src/cmd/bhyvectl/amd64/Makefile @@ -0,0 +1,21 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. 
A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2013 Pluribus Networks Inc. +# + +include ../Makefile.com +include ../../Makefile.cmd.64 + +CPPFLAGS += -I$(COMPAT)/freebsd/amd64 -I$(CONTRIB)/freebsd/amd64 + +install: all $(ROOTUSRSBINPROG64) diff --git a/usr/src/cmd/bhyvectl/bhyvectl.c b/usr/src/cmd/bhyvectl/bhyvectl.c new file mode 100644 index 0000000000..07d0a83df5 --- /dev/null +++ b/usr/src/cmd/bhyvectl/bhyvectl.c @@ -0,0 +1,1523 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/usr.sbin/bhyvectl/bhyvectl.c 273375 2014-10-21 07:10:43Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Pluribus Networks Inc. 
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/usr.sbin/bhyvectl/bhyvectl.c 273375 2014-10-21 07:10:43Z neel $");
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/mman.h>
+#include <sys/time.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <libgen.h>
+#include <libutil.h>
+#include <fcntl.h>
+#include <string.h>
+#include <getopt.h>
+#include <assert.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include "intel/vmcs.h"
+
+#define MB (1UL << 20)
+#define GB (1UL << 30)
+
+#define REQ_ARG required_argument
+#define NO_ARG no_argument
+#define OPT_ARG optional_argument
+
+static const char *progname;
+
+static void
+usage(void)
+{
+
+ (void)fprintf(stderr,
+ "Usage: %s --vm=<vmname>\n"
+ " [--cpu=<vcpu_number>]\n"
+ " [--create]\n"
+ " [--destroy]\n"
+ " [--get-all]\n"
+ " [--get-stats]\n"
+ " [--set-desc-ds]\n"
+ " [--get-desc-ds]\n"
+ " [--set-desc-es]\n"
+ " [--get-desc-es]\n"
+ " [--set-desc-gs]\n"
+ " [--get-desc-gs]\n"
+ " [--set-desc-fs]\n"
+ " [--get-desc-fs]\n"
+ " [--set-desc-cs]\n"
+ " [--get-desc-cs]\n"
+ " [--set-desc-ss]\n"
+ " [--get-desc-ss]\n"
+ " [--set-desc-tr]\n"
+ " [--get-desc-tr]\n"
+ " [--set-desc-ldtr]\n"
+ " [--get-desc-ldtr]\n"
+ " [--set-desc-gdtr]\n"
+ " [--get-desc-gdtr]\n"
+ " [--set-desc-idtr]\n"
+ " [--get-desc-idtr]\n"
+ " [--run]\n"
+ " [--capname=<capname>]\n"
+ " [--getcap]\n"
+ " [--setcap=<0|1>]\n"
+ " [--desc-base=<BASE>]\n"
+ " [--desc-limit=<LIMIT>]\n"
+ " [--desc-access=<ACCESS>]\n"
+ " [--set-cr0=<CR0>]\n"
+ " [--get-cr0]\n"
+ " [--set-cr3=<CR3>]\n"
+ " [--get-cr3]\n"
+ " [--set-cr4=<CR4>]\n"
+ " [--get-cr4]\n"
+ " [--set-dr7=<DR7>]\n"
+ " [--get-dr7]\n"
+ " [--set-rsp=<RSP>]\n"
+ " [--get-rsp]\n"
+ " [--set-rip=<RIP>]\n"
+ " [--get-rip]\n"
+ " [--get-rax]\n"
+ " [--set-rax=<RAX>]\n"
+ " [--get-rbx]\n"
+ " [--get-rcx]\n"
+ " [--get-rdx]\n"
+ " [--get-rsi]\n"
+ " [--get-rdi]\n"
+ " [--get-rbp]\n"
+ " [--get-r8]\n"
+ " [--get-r9]\n"
+ " [--get-r10]\n"
+ " [--get-r11]\n"
+ " [--get-r12]\n"
+ " [--get-r13]\n"
+ " [--get-r14]\n"
+ " [--get-r15]\n"
+ " [--set-rflags=<RFLAGS>]\n"
+ " [--get-rflags]\n"
+ " [--set-cs]\n"
+ " [--get-cs]\n"
+ " [--set-ds]\n"
+ " [--get-ds]\n"
+ " [--set-es]\n"
+ " [--get-es]\n"
+ " [--set-fs]\n"
+ " [--get-fs]\n"
+ " [--set-gs]\n"
+ " [--get-gs]\n"
+ " [--set-ss]\n"
+ " [--get-ss]\n"
+ " [--get-tr]\n"
+ " [--get-ldtr]\n"
+ " [--get-vmcs-pinbased-ctls]\n"
+ " [--get-vmcs-procbased-ctls]\n"
+ " [--get-vmcs-procbased-ctls2]\n"
+ " [--get-vmcs-entry-interruption-info]\n"
+ " [--set-vmcs-entry-interruption-info=<info>]\n"
+ " [--get-vmcs-eptp]\n"
+ " [--get-vmcs-guest-physical-address]\n"
+ " [--get-vmcs-guest-linear-address]\n"
+ " [--set-vmcs-exception-bitmap]\n"
+ " [--get-vmcs-exception-bitmap]\n"
+ " [--get-vmcs-io-bitmap-address]\n"
+ " [--get-vmcs-tsc-offset]\n"
+ " [--get-vmcs-guest-pat]\n"
+ " [--get-vmcs-host-pat]\n"
+ " [--get-vmcs-host-cr0]\n"
+ " [--get-vmcs-host-cr3]\n"
+ " [--get-vmcs-host-cr4]\n"
+ " [--get-vmcs-host-rip]\n"
+ " [--get-vmcs-host-rsp]\n"
+ " [--get-vmcs-cr0-mask]\n"
+ " [--get-vmcs-cr0-shadow]\n"
+ " [--get-vmcs-cr4-mask]\n"
+ " [--get-vmcs-cr4-shadow]\n"
+ " [--get-vmcs-cr3-targets]\n"
+ " [--get-vmcs-apic-access-address]\n"
+ " [--get-vmcs-virtual-apic-address]\n"
+ " [--get-vmcs-tpr-threshold]\n"
+ " [--get-vmcs-msr-bitmap]\n"
+ " [--get-vmcs-msr-bitmap-address]\n"
+ " [--get-vmcs-vpid]\n"
+ " [--get-vmcs-ple-gap]\n"
+ " [--get-vmcs-ple-window]\n"
+ " [--get-vmcs-instruction-error]\n"
+ " [--get-vmcs-exit-ctls]\n"
+ " [--get-vmcs-entry-ctls]\n"
+ " [--get-vmcs-guest-sysenter]\n"
+ " [--get-vmcs-link]\n"
+ " [--get-vmcs-exit-reason]\n"
+ " [--get-vmcs-exit-qualification]\n"
+ " [--get-vmcs-exit-interruption-info]\n"
+ " [--get-vmcs-exit-interruption-error]\n"
+ " [--get-vmcs-interruptibility]\n"
+ " 
[--set-x2apic-state=<state>]\n"
+ " [--get-x2apic-state]\n"
+ " [--unassign-pptdev=<bus/slot/func>]\n"
+ " [--set-mem=<memory in units of MB>]\n"
+ " [--get-lowmem]\n"
+ " [--get-highmem]\n",
+ progname);
+ exit(1);
+}
+
+static int get_stats, getcap, setcap, capval;
+static const char *capname;
+static int create, destroy, get_lowmem, get_highmem;
+static uint64_t memsize;
+static int set_cr0, get_cr0, set_cr3, get_cr3, set_cr4, get_cr4;
+static int set_efer, get_efer;
+static int set_dr7, get_dr7;
+static int set_rsp, get_rsp, set_rip, get_rip, set_rflags, get_rflags;
+static int set_rax, get_rax;
+static int get_rbx, get_rcx, get_rdx, get_rsi, get_rdi, get_rbp;
+static int get_r8, get_r9, get_r10, get_r11, get_r12, get_r13, get_r14, get_r15;
+static int set_desc_ds, get_desc_ds;
+static int set_desc_es, get_desc_es;
+static int set_desc_fs, get_desc_fs;
+static int set_desc_gs, get_desc_gs;
+static int set_desc_cs, get_desc_cs;
+static int set_desc_ss, get_desc_ss;
+static int set_desc_gdtr, get_desc_gdtr;
+static int set_desc_idtr, get_desc_idtr;
+static int set_desc_tr, get_desc_tr;
+static int set_desc_ldtr, get_desc_ldtr;
+static int set_cs, set_ds, set_es, set_fs, set_gs, set_ss, set_tr, set_ldtr;
+static int get_cs, get_ds, get_es, get_fs, get_gs, get_ss, get_tr, get_ldtr;
+static int set_x2apic_state, get_x2apic_state;
+enum x2apic_state x2apic_state;
+static int unassign_pptdev, bus, slot, func;
+static int run;
+
+/*
+ * VMCS-specific fields
+ */
+static int get_pinbased_ctls, get_procbased_ctls, get_procbased_ctls2;
+static int get_eptp, get_io_bitmap, get_tsc_offset;
+static int get_vmcs_entry_interruption_info, set_vmcs_entry_interruption_info;
+static int get_vmcs_interruptibility;
+uint32_t vmcs_entry_interruption_info;
+static int get_vmcs_gpa, get_vmcs_gla;
+static int get_exception_bitmap, set_exception_bitmap, exception_bitmap;
+static int get_cr0_mask, get_cr0_shadow;
+static int get_cr4_mask, get_cr4_shadow;
+static int get_cr3_targets;
+static int get_apic_access_addr, get_virtual_apic_addr, get_tpr_threshold;
+static int get_msr_bitmap, get_msr_bitmap_address;
+static int get_vpid, get_ple_gap, get_ple_window;
+static int get_inst_err, get_exit_ctls, get_entry_ctls;
+static int get_host_cr0, get_host_cr3, get_host_cr4;
+static int get_host_rip, get_host_rsp;
+static int get_guest_pat, get_host_pat;
+static int get_guest_sysenter, get_vmcs_link;
+static int get_vmcs_exit_reason, get_vmcs_exit_qualification;
+static int get_vmcs_exit_interruption_info, get_vmcs_exit_interruption_error;
+
+static uint64_t desc_base;
+static uint32_t desc_limit, desc_access;
+
+static int get_all;
+
+static void
+dump_vm_run_exitcode(struct vm_exit *vmexit, int vcpu)
+{
+ printf("vm exit[%d]\n", vcpu);
+ printf("\trip\t\t0x%016lx\n", vmexit->rip);
+ printf("\tinst_length\t%d\n", vmexit->inst_length);
+ switch (vmexit->exitcode) {
+ case VM_EXITCODE_INOUT:
+ printf("\treason\t\tINOUT\n");
+ printf("\tdirection\t%s\n", vmexit->u.inout.in ? "IN" : "OUT");
+ printf("\tbytes\t\t%d\n", vmexit->u.inout.bytes);
+ printf("\tflags\t\t%s%s\n",
+ vmexit->u.inout.string ? "STRING " : "",
+ vmexit->u.inout.rep ? 
"REP " : ""); + printf("\tport\t\t0x%04x\n", vmexit->u.inout.port); + printf("\teax\t\t0x%08x\n", vmexit->u.inout.eax); + break; + case VM_EXITCODE_VMX: + printf("\treason\t\tVMX\n"); + printf("\tstatus\t\t%d\n", vmexit->u.vmx.status); + printf("\texit_reason\t0x%08x (%u)\n", + vmexit->u.vmx.exit_reason, vmexit->u.vmx.exit_reason); + printf("\tqualification\t0x%016lx\n", + vmexit->u.vmx.exit_qualification); + printf("\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type); + printf("\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error); + break; + default: + printf("*** unknown vm run exitcode %d\n", vmexit->exitcode); + break; + } +} + +static int +dump_vmcs_msr_bitmap(int vcpu, u_long addr) +{ + int error, fd, byte, bit, readable, writeable; + u_int msr; + const char *bitmap; + + error = -1; + bitmap = MAP_FAILED; + + fd = open("/dev/mem", O_RDONLY, 0); + if (fd < 0) + goto done; + + bitmap = mmap(NULL, PAGE_SIZE, PROT_READ, 0, fd, addr); + if (bitmap == MAP_FAILED) + goto done; + + for (msr = 0; msr < 0x2000; msr++) { + byte = msr / 8; + bit = msr & 0x7; + + /* Look at MSRs in the range 0x00000000 to 0x00001FFF */ + readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; + writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1; + if (readable || writeable) { + printf("msr 0x%08x[%d]\t\t%c%c\n", msr, vcpu, + readable ? 'R' : '-', + writeable ? 'W' : '-'); + } + + /* Look at MSRs in the range 0xC0000000 to 0xC0001FFF */ + byte += 1024; + readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; + writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1; + if (readable || writeable) { + printf("msr 0x%08x[%d]\t\t%c%c\n", + 0xc0000000 + msr, vcpu, + readable ? 'R' : '-', + writeable ? 'W' : '-'); + } + } + + error = 0; +done: + if (bitmap != MAP_FAILED) + munmap((void *)bitmap, PAGE_SIZE); + if (fd >= 0) + close(fd); + return (error); +} + +static int +vm_get_vmcs_field(struct vmctx *ctx, int vcpu, int field, uint64_t *ret_val) +{ + + return (vm_get_register(ctx, vcpu, VMCS_IDENT(field), ret_val)); +} + +static int +vm_set_vmcs_field(struct vmctx *ctx, int vcpu, int field, uint64_t val) +{ + + return (vm_set_register(ctx, vcpu, VMCS_IDENT(field), val)); +} + +enum { + VMNAME = 1000, /* avoid collision with return values from getopt */ + VCPU, + SET_MEM, + SET_EFER, + SET_CR0, + SET_CR3, + SET_CR4, + SET_DR7, + SET_RSP, + SET_RIP, + SET_RAX, + SET_RFLAGS, + DESC_BASE, + DESC_LIMIT, + DESC_ACCESS, + SET_CS, + SET_DS, + SET_ES, + SET_FS, + SET_GS, + SET_SS, + SET_TR, + SET_LDTR, + SET_X2APIC_STATE, + SET_VMCS_EXCEPTION_BITMAP, + SET_VMCS_ENTRY_INTERRUPTION_INFO, + SET_CAP, + CAPNAME, + UNASSIGN_PPTDEV, +}; + +int +main(int argc, char *argv[]) +{ + char *vmname; + int error, ch, vcpu; + vm_paddr_t gpa; + size_t len; + struct vm_exit vmexit; + uint64_t ctl, eptp, bm, addr, u64; + struct vmctx *ctx; + int wired; + + uint64_t cr0, cr3, cr4, dr7, rsp, rip, rflags, efer, pat; + uint64_t rax, rbx, rcx, rdx, rsi, rdi, rbp; + uint64_t r8, r9, r10, r11, r12, r13, r14, r15; + uint64_t cs, ds, es, fs, gs, ss, tr, ldtr; + + struct option opts[] = { + { "vm", REQ_ARG, 0, VMNAME }, + { "cpu", REQ_ARG, 0, VCPU }, + { "set-mem", REQ_ARG, 0, SET_MEM }, + { "set-efer", REQ_ARG, 0, SET_EFER }, + { "set-cr0", REQ_ARG, 0, SET_CR0 }, + { "set-cr3", REQ_ARG, 0, SET_CR3 }, + { "set-cr4", REQ_ARG, 0, SET_CR4 }, + { "set-dr7", REQ_ARG, 0, SET_DR7 }, + { "set-rsp", REQ_ARG, 0, SET_RSP }, + { "set-rip", REQ_ARG, 0, SET_RIP }, + { "set-rax", REQ_ARG, 0, SET_RAX }, + { "set-rflags", REQ_ARG, 0, SET_RFLAGS }, + { "desc-base", REQ_ARG, 0, DESC_BASE }, + { 
"desc-limit", REQ_ARG, 0, DESC_LIMIT }, + { "desc-access",REQ_ARG, 0, DESC_ACCESS }, + { "set-cs", REQ_ARG, 0, SET_CS }, + { "set-ds", REQ_ARG, 0, SET_DS }, + { "set-es", REQ_ARG, 0, SET_ES }, + { "set-fs", REQ_ARG, 0, SET_FS }, + { "set-gs", REQ_ARG, 0, SET_GS }, + { "set-ss", REQ_ARG, 0, SET_SS }, + { "set-tr", REQ_ARG, 0, SET_TR }, + { "set-ldtr", REQ_ARG, 0, SET_LDTR }, + { "set-x2apic-state",REQ_ARG, 0, SET_X2APIC_STATE }, + { "set-vmcs-exception-bitmap", + REQ_ARG, 0, SET_VMCS_EXCEPTION_BITMAP }, + { "set-vmcs-entry-interruption-info", + REQ_ARG, 0, SET_VMCS_ENTRY_INTERRUPTION_INFO }, + { "capname", REQ_ARG, 0, CAPNAME }, + { "unassign-pptdev", REQ_ARG, 0, UNASSIGN_PPTDEV }, + { "setcap", REQ_ARG, 0, SET_CAP }, + { "getcap", NO_ARG, &getcap, 1 }, + { "get-stats", NO_ARG, &get_stats, 1 }, + { "get-desc-ds",NO_ARG, &get_desc_ds, 1 }, + { "set-desc-ds",NO_ARG, &set_desc_ds, 1 }, + { "get-desc-es",NO_ARG, &get_desc_es, 1 }, + { "set-desc-es",NO_ARG, &set_desc_es, 1 }, + { "get-desc-ss",NO_ARG, &get_desc_ss, 1 }, + { "set-desc-ss",NO_ARG, &set_desc_ss, 1 }, + { "get-desc-cs",NO_ARG, &get_desc_cs, 1 }, + { "set-desc-cs",NO_ARG, &set_desc_cs, 1 }, + { "get-desc-fs",NO_ARG, &get_desc_fs, 1 }, + { "set-desc-fs",NO_ARG, &set_desc_fs, 1 }, + { "get-desc-gs",NO_ARG, &get_desc_gs, 1 }, + { "set-desc-gs",NO_ARG, &set_desc_gs, 1 }, + { "get-desc-tr",NO_ARG, &get_desc_tr, 1 }, + { "set-desc-tr",NO_ARG, &set_desc_tr, 1 }, + { "set-desc-ldtr", NO_ARG, &set_desc_ldtr, 1 }, + { "get-desc-ldtr", NO_ARG, &get_desc_ldtr, 1 }, + { "set-desc-gdtr", NO_ARG, &set_desc_gdtr, 1 }, + { "get-desc-gdtr", NO_ARG, &get_desc_gdtr, 1 }, + { "set-desc-idtr", NO_ARG, &set_desc_idtr, 1 }, + { "get-desc-idtr", NO_ARG, &get_desc_idtr, 1 }, + { "get-lowmem", NO_ARG, &get_lowmem, 1 }, + { "get-highmem",NO_ARG, &get_highmem, 1 }, + { "get-efer", NO_ARG, &get_efer, 1 }, + { "get-cr0", NO_ARG, &get_cr0, 1 }, + { "get-cr3", NO_ARG, &get_cr3, 1 }, + { "get-cr4", NO_ARG, &get_cr4, 1 }, + { "get-dr7", NO_ARG, &get_dr7, 1 }, + { "get-rsp", NO_ARG, &get_rsp, 1 }, + { "get-rip", NO_ARG, &get_rip, 1 }, + { "get-rax", NO_ARG, &get_rax, 1 }, + { "get-rbx", NO_ARG, &get_rbx, 1 }, + { "get-rcx", NO_ARG, &get_rcx, 1 }, + { "get-rdx", NO_ARG, &get_rdx, 1 }, + { "get-rsi", NO_ARG, &get_rsi, 1 }, + { "get-rdi", NO_ARG, &get_rdi, 1 }, + { "get-rbp", NO_ARG, &get_rbp, 1 }, + { "get-r8", NO_ARG, &get_r8, 1 }, + { "get-r9", NO_ARG, &get_r9, 1 }, + { "get-r10", NO_ARG, &get_r10, 1 }, + { "get-r11", NO_ARG, &get_r11, 1 }, + { "get-r12", NO_ARG, &get_r12, 1 }, + { "get-r13", NO_ARG, &get_r13, 1 }, + { "get-r14", NO_ARG, &get_r14, 1 }, + { "get-r15", NO_ARG, &get_r15, 1 }, + { "get-rflags", NO_ARG, &get_rflags, 1 }, + { "get-cs", NO_ARG, &get_cs, 1 }, + { "get-ds", NO_ARG, &get_ds, 1 }, + { "get-es", NO_ARG, &get_es, 1 }, + { "get-fs", NO_ARG, &get_fs, 1 }, + { "get-gs", NO_ARG, &get_gs, 1 }, + { "get-ss", NO_ARG, &get_ss, 1 }, + { "get-tr", NO_ARG, &get_tr, 1 }, + { "get-ldtr", NO_ARG, &get_ldtr, 1 }, + { "get-vmcs-pinbased-ctls", + NO_ARG, &get_pinbased_ctls, 1 }, + { "get-vmcs-procbased-ctls", + NO_ARG, &get_procbased_ctls, 1 }, + { "get-vmcs-procbased-ctls2", + NO_ARG, &get_procbased_ctls2, 1 }, + { "get-vmcs-guest-linear-address", + NO_ARG, &get_vmcs_gla, 1 }, + { "get-vmcs-guest-physical-address", + NO_ARG, &get_vmcs_gpa, 1 }, + { "get-vmcs-entry-interruption-info", + NO_ARG, &get_vmcs_entry_interruption_info, 1}, + { "get-vmcs-eptp", NO_ARG, &get_eptp, 1 }, + { "get-vmcs-exception-bitmap", + NO_ARG, &get_exception_bitmap, 1 }, + { 
"get-vmcs-io-bitmap-address", + NO_ARG, &get_io_bitmap, 1 }, + { "get-vmcs-tsc-offset", NO_ARG,&get_tsc_offset, 1 }, + { "get-vmcs-cr0-mask", NO_ARG, &get_cr0_mask, 1 }, + { "get-vmcs-cr0-shadow", NO_ARG,&get_cr0_shadow, 1 }, + { "get-vmcs-cr4-mask", NO_ARG, &get_cr4_mask, 1 }, + { "get-vmcs-cr4-shadow", NO_ARG,&get_cr4_shadow, 1 }, + { "get-vmcs-cr3-targets", NO_ARG, &get_cr3_targets, 1}, + { "get-vmcs-apic-access-address", + NO_ARG, &get_apic_access_addr, 1}, + { "get-vmcs-virtual-apic-address", + NO_ARG, &get_virtual_apic_addr, 1}, + { "get-vmcs-tpr-threshold", + NO_ARG, &get_tpr_threshold, 1 }, + { "get-vmcs-msr-bitmap", + NO_ARG, &get_msr_bitmap, 1 }, + { "get-vmcs-msr-bitmap-address", + NO_ARG, &get_msr_bitmap_address, 1 }, + { "get-vmcs-vpid", NO_ARG, &get_vpid, 1 }, + { "get-vmcs-ple-gap", NO_ARG, &get_ple_gap, 1 }, + { "get-vmcs-ple-window", NO_ARG,&get_ple_window,1 }, + { "get-vmcs-instruction-error", + NO_ARG, &get_inst_err, 1 }, + { "get-vmcs-exit-ctls", NO_ARG, &get_exit_ctls, 1 }, + { "get-vmcs-entry-ctls", + NO_ARG, &get_entry_ctls, 1 }, + { "get-vmcs-guest-pat", NO_ARG, &get_guest_pat, 1 }, + { "get-vmcs-host-pat", NO_ARG, &get_host_pat, 1 }, + { "get-vmcs-host-cr0", + NO_ARG, &get_host_cr0, 1 }, + { "get-vmcs-host-cr3", + NO_ARG, &get_host_cr3, 1 }, + { "get-vmcs-host-cr4", + NO_ARG, &get_host_cr4, 1 }, + { "get-vmcs-host-rip", + NO_ARG, &get_host_rip, 1 }, + { "get-vmcs-host-rsp", + NO_ARG, &get_host_rsp, 1 }, + { "get-vmcs-guest-sysenter", + NO_ARG, &get_guest_sysenter, 1 }, + { "get-vmcs-link", NO_ARG, &get_vmcs_link, 1 }, + { "get-vmcs-exit-reason", + NO_ARG, &get_vmcs_exit_reason, 1 }, + { "get-vmcs-exit-qualification", + NO_ARG, &get_vmcs_exit_qualification, 1 }, + { "get-vmcs-exit-interruption-info", + NO_ARG, &get_vmcs_exit_interruption_info, 1}, + { "get-vmcs-exit-interruption-error", + NO_ARG, &get_vmcs_exit_interruption_error, 1}, + { "get-vmcs-interruptibility", + NO_ARG, &get_vmcs_interruptibility, 1 }, + { "get-x2apic-state",NO_ARG, &get_x2apic_state, 1 }, + { "get-all", NO_ARG, &get_all, 1 }, + { "run", NO_ARG, &run, 1 }, + { "create", NO_ARG, &create, 1 }, + { "destroy", NO_ARG, &destroy, 1 }, + { NULL, 0, NULL, 0 } + }; + + vcpu = 0; + progname = basename(argv[0]); + + while ((ch = getopt_long(argc, argv, "", opts, NULL)) != -1) { + switch (ch) { + case 0: + break; + case VMNAME: + vmname = optarg; + break; + case VCPU: + vcpu = atoi(optarg); + break; + case SET_MEM: + memsize = atoi(optarg) * MB; + memsize = roundup(memsize, 2 * MB); + break; + case SET_EFER: + efer = strtoul(optarg, NULL, 0); + set_efer = 1; + break; + case SET_CR0: + cr0 = strtoul(optarg, NULL, 0); + set_cr0 = 1; + break; + case SET_CR3: + cr3 = strtoul(optarg, NULL, 0); + set_cr3 = 1; + break; + case SET_CR4: + cr4 = strtoul(optarg, NULL, 0); + set_cr4 = 1; + break; + case SET_DR7: + dr7 = strtoul(optarg, NULL, 0); + set_dr7 = 1; + break; + case SET_RSP: + rsp = strtoul(optarg, NULL, 0); + set_rsp = 1; + break; + case SET_RIP: + rip = strtoul(optarg, NULL, 0); + set_rip = 1; + break; + case SET_RAX: + rax = strtoul(optarg, NULL, 0); + set_rax = 1; + break; + case SET_RFLAGS: + rflags = strtoul(optarg, NULL, 0); + set_rflags = 1; + break; + case DESC_BASE: + desc_base = strtoul(optarg, NULL, 0); + break; + case DESC_LIMIT: + desc_limit = strtoul(optarg, NULL, 0); + break; + case DESC_ACCESS: + desc_access = strtoul(optarg, NULL, 0); + break; + case SET_CS: + cs = strtoul(optarg, NULL, 0); + set_cs = 1; + break; + case SET_DS: + ds = strtoul(optarg, NULL, 0); + set_ds = 1; + break; + 
case SET_ES: + es = strtoul(optarg, NULL, 0); + set_es = 1; + break; + case SET_FS: + fs = strtoul(optarg, NULL, 0); + set_fs = 1; + break; + case SET_GS: + gs = strtoul(optarg, NULL, 0); + set_gs = 1; + break; + case SET_SS: + ss = strtoul(optarg, NULL, 0); + set_ss = 1; + break; + case SET_TR: + tr = strtoul(optarg, NULL, 0); + set_tr = 1; + break; + case SET_LDTR: + ldtr = strtoul(optarg, NULL, 0); + set_ldtr = 1; + break; + case SET_X2APIC_STATE: + x2apic_state = strtol(optarg, NULL, 0); + set_x2apic_state = 1; + break; + case SET_VMCS_EXCEPTION_BITMAP: + exception_bitmap = strtoul(optarg, NULL, 0); + set_exception_bitmap = 1; + break; + case SET_VMCS_ENTRY_INTERRUPTION_INFO: + vmcs_entry_interruption_info = strtoul(optarg, NULL, 0); + set_vmcs_entry_interruption_info = 1; + break; + case SET_CAP: + capval = strtoul(optarg, NULL, 0); + setcap = 1; + break; + case CAPNAME: + capname = optarg; + break; + case UNASSIGN_PPTDEV: + unassign_pptdev = 1; + if (sscanf(optarg, "%d/%d/%d", &bus, &slot, &func) != 3) + usage(); + break; + default: + usage(); + } + } + argc -= optind; + argv += optind; + + if (vmname == NULL) + usage(); + + error = 0; + + if (!error && create) + error = vm_create(vmname); + + if (!error) { + ctx = vm_open(vmname); + if (ctx == NULL) + error = -1; + } + + if (!error && memsize) + error = vm_setup_memory(ctx, memsize, VM_MMAP_NONE); + + if (!error && set_efer) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_EFER, efer); + + if (!error && set_cr0) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR0, cr0); + + if (!error && set_cr3) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR3, cr3); + + if (!error && set_cr4) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR4, cr4); + + if (!error && set_dr7) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR7, dr7); + + if (!error && set_rsp) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RSP, rsp); + + if (!error && set_rip) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, rip); + + if (!error && set_rax) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, rax); + + if (!error && set_rflags) { + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RFLAGS, + rflags); + } + + if (!error && set_desc_ds) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_DS, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_es) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_ES, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_ss) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_SS, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_cs) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_CS, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_fs) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_FS, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_gs) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GS, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_tr) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_TR, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_ldtr) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_LDTR, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_gdtr) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GDTR, + desc_base, desc_limit, 0); + } + + if (!error && set_desc_idtr) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_IDTR, + desc_base, desc_limit, 0); + } + + if (!error && set_cs) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CS, cs); + + if (!error && set_ds) + 
error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DS, ds); + + if (!error && set_es) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_ES, es); + + if (!error && set_fs) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_FS, fs); + + if (!error && set_gs) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_GS, gs); + + if (!error && set_ss) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_SS, ss); + + if (!error && set_tr) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_TR, tr); + + if (!error && set_ldtr) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_LDTR, ldtr); + + if (!error && set_x2apic_state) + error = vm_set_x2apic_state(ctx, vcpu, x2apic_state); + +#ifdef __FreeBSD__ + if (!error && unassign_pptdev) + error = vm_unassign_pptdev(ctx, bus, slot, func); +#endif + + if (!error && set_exception_bitmap) { + error = vm_set_vmcs_field(ctx, vcpu, VMCS_EXCEPTION_BITMAP, + exception_bitmap); + } + + if (!error && set_vmcs_entry_interruption_info) { + error = vm_set_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO, + vmcs_entry_interruption_info); + } + + if (!error && (get_lowmem || get_all)) { + gpa = 0; + error = vm_get_memory_seg(ctx, gpa, &len, &wired); + if (error == 0) + printf("lowmem\t\t0x%016lx/%ld%s\n", gpa, len, + wired ? " wired" : ""); + } + + if (!error && (get_highmem || get_all)) { + gpa = 4 * GB; + error = vm_get_memory_seg(ctx, gpa, &len, &wired); + if (error == 0) + printf("highmem\t\t0x%016lx/%ld%s\n", gpa, len, + wired ? " wired" : ""); + } + + if (!error && (get_efer || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_EFER, &efer); + if (error == 0) + printf("efer[%d]\t\t0x%016lx\n", vcpu, efer); + } + + if (!error && (get_cr0 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR0, &cr0); + if (error == 0) + printf("cr0[%d]\t\t0x%016lx\n", vcpu, cr0); + } + + if (!error && (get_cr3 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR3, &cr3); + if (error == 0) + printf("cr3[%d]\t\t0x%016lx\n", vcpu, cr3); + } + + if (!error && (get_cr4 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR4, &cr4); + if (error == 0) + printf("cr4[%d]\t\t0x%016lx\n", vcpu, cr4); + } + + if (!error && (get_dr7 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR7, &dr7); + if (error == 0) + printf("dr7[%d]\t\t0x%016lx\n", vcpu, dr7); + } + + if (!error && (get_rsp || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RSP, &rsp); + if (error == 0) + printf("rsp[%d]\t\t0x%016lx\n", vcpu, rsp); + } + + if (!error && (get_rip || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip); + if (error == 0) + printf("rip[%d]\t\t0x%016lx\n", vcpu, rip); + } + + if (!error && (get_rax || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RAX, &rax); + if (error == 0) + printf("rax[%d]\t\t0x%016lx\n", vcpu, rax); + } + + if (!error && (get_rbx || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RBX, &rbx); + if (error == 0) + printf("rbx[%d]\t\t0x%016lx\n", vcpu, rbx); + } + + if (!error && (get_rcx || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RCX, &rcx); + if (error == 0) + printf("rcx[%d]\t\t0x%016lx\n", vcpu, rcx); + } + + if (!error && (get_rdx || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RDX, &rdx); + if (error == 0) + printf("rdx[%d]\t\t0x%016lx\n", vcpu, rdx); + } + + if (!error && (get_rsi || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RSI, &rsi); + if (error == 0) + printf("rsi[%d]\t\t0x%016lx\n", vcpu, rsi); + } + + 
if (!error && (get_rdi || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RDI, &rdi); + if (error == 0) + printf("rdi[%d]\t\t0x%016lx\n", vcpu, rdi); + } + + if (!error && (get_rbp || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RBP, &rbp); + if (error == 0) + printf("rbp[%d]\t\t0x%016lx\n", vcpu, rbp); + } + + if (!error && (get_r8 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R8, &r8); + if (error == 0) + printf("r8[%d]\t\t0x%016lx\n", vcpu, r8); + } + + if (!error && (get_r9 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R9, &r9); + if (error == 0) + printf("r9[%d]\t\t0x%016lx\n", vcpu, r9); + } + + if (!error && (get_r10 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R10, &r10); + if (error == 0) + printf("r10[%d]\t\t0x%016lx\n", vcpu, r10); + } + + if (!error && (get_r11 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R11, &r11); + if (error == 0) + printf("r11[%d]\t\t0x%016lx\n", vcpu, r11); + } + + if (!error && (get_r12 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R12, &r12); + if (error == 0) + printf("r12[%d]\t\t0x%016lx\n", vcpu, r12); + } + + if (!error && (get_r13 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R13, &r13); + if (error == 0) + printf("r13[%d]\t\t0x%016lx\n", vcpu, r13); + } + + if (!error && (get_r14 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R14, &r14); + if (error == 0) + printf("r14[%d]\t\t0x%016lx\n", vcpu, r14); + } + + if (!error && (get_r15 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R15, &r15); + if (error == 0) + printf("r15[%d]\t\t0x%016lx\n", vcpu, r15); + } + + if (!error && (get_rflags || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RFLAGS, + &rflags); + if (error == 0) + printf("rflags[%d]\t0x%016lx\n", vcpu, rflags); + } + +#ifdef __FreeBSD__ + if (!error && (get_stats || get_all)) { + int i, num_stats; + uint64_t *stats; + struct timeval tv; + const char *desc; + + stats = vm_get_stats(ctx, vcpu, &tv, &num_stats); + if (stats != NULL) { + printf("vcpu%d\n", vcpu); + for (i = 0; i < num_stats; i++) { + desc = vm_get_stat_desc(ctx, i); + printf("%-40s\t%ld\n", desc, stats[i]); + } + } + } +#endif + + if (!error && (get_desc_ds || get_all)) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_DS, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("ds desc[%d]\t0x%016lx/0x%08x/0x%08x\n", + vcpu, desc_base, desc_limit, desc_access); + } + } + + if (!error && (get_desc_es || get_all)) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_ES, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("es desc[%d]\t0x%016lx/0x%08x/0x%08x\n", + vcpu, desc_base, desc_limit, desc_access); + } + } + + if (!error && (get_desc_fs || get_all)) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_FS, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("fs desc[%d]\t0x%016lx/0x%08x/0x%08x\n", + vcpu, desc_base, desc_limit, desc_access); + } + } + + if (!error && (get_desc_gs || get_all)) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_GS, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("gs desc[%d]\t0x%016lx/0x%08x/0x%08x\n", + vcpu, desc_base, desc_limit, desc_access); + } + } + + if (!error && (get_desc_ss || get_all)) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("ss desc[%d]\t0x%016lx/0x%08x/0x%08x\n", + vcpu, desc_base, 
desc_limit, desc_access); + } + } + + if (!error && (get_desc_cs || get_all)) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_CS, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("cs desc[%d]\t0x%016lx/0x%08x/0x%08x\n", + vcpu, desc_base, desc_limit, desc_access); + } + } + + if (!error && (get_desc_tr || get_all)) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("tr desc[%d]\t0x%016lx/0x%08x/0x%08x\n", + vcpu, desc_base, desc_limit, desc_access); + } + } + + if (!error && (get_desc_ldtr || get_all)) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_LDTR, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("ldtr desc[%d]\t0x%016lx/0x%08x/0x%08x\n", + vcpu, desc_base, desc_limit, desc_access); + } + } + + if (!error && (get_desc_gdtr || get_all)) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_GDTR, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("gdtr[%d]\t\t0x%016lx/0x%08x\n", + vcpu, desc_base, desc_limit); + } + } + + if (!error && (get_desc_idtr || get_all)) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_IDTR, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("idtr[%d]\t\t0x%016lx/0x%08x\n", + vcpu, desc_base, desc_limit); + } + } + + if (!error && (get_cs || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CS, &cs); + if (error == 0) + printf("cs[%d]\t\t0x%04lx\n", vcpu, cs); + } + + if (!error && (get_ds || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DS, &ds); + if (error == 0) + printf("ds[%d]\t\t0x%04lx\n", vcpu, ds); + } + + if (!error && (get_es || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_ES, &es); + if (error == 0) + printf("es[%d]\t\t0x%04lx\n", vcpu, es); + } + + if (!error && (get_fs || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_FS, &fs); + if (error == 0) + printf("fs[%d]\t\t0x%04lx\n", vcpu, fs); + } + + if (!error && (get_gs || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_GS, &gs); + if (error == 0) + printf("gs[%d]\t\t0x%04lx\n", vcpu, gs); + } + + if (!error && (get_ss || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_SS, &ss); + if (error == 0) + printf("ss[%d]\t\t0x%04lx\n", vcpu, ss); + } + + if (!error && (get_tr || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_TR, &tr); + if (error == 0) + printf("tr[%d]\t\t0x%04lx\n", vcpu, tr); + } + + if (!error && (get_ldtr || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_LDTR, &ldtr); + if (error == 0) + printf("ldtr[%d]\t\t0x%04lx\n", vcpu, ldtr); + } + + if (!error && (get_x2apic_state || get_all)) { + error = vm_get_x2apic_state(ctx, vcpu, &x2apic_state); + if (error == 0) + printf("x2apic_state[%d]\t%d\n", vcpu, x2apic_state); + } + + if (!error && (get_pinbased_ctls || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_PIN_BASED_CTLS, &ctl); + if (error == 0) + printf("pinbased_ctls[%d]\t0x%08lx\n", vcpu, ctl); + } + + if (!error && (get_procbased_ctls || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_PRI_PROC_BASED_CTLS, &ctl); + if (error == 0) + printf("procbased_ctls[%d]\t0x%08lx\n", vcpu, ctl); + } + + if (!error && (get_procbased_ctls2 || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_SEC_PROC_BASED_CTLS, &ctl); + if (error == 0) + printf("procbased_ctls2[%d]\t0x%08lx\n", vcpu, ctl); + } + + if (!error && (get_vmcs_gla || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_LINEAR_ADDRESS, 
&u64); + if (error == 0) + printf("gla[%d]\t\t0x%016lx\n", vcpu, u64); + } + + if (!error && (get_vmcs_gpa || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_PHYSICAL_ADDRESS, &u64); + if (error == 0) + printf("gpa[%d]\t\t0x%016lx\n", vcpu, u64); + } + + if (!error && (get_vmcs_entry_interruption_info || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO,&u64); + if (error == 0) { + printf("entry_interruption_info[%d]\t0x%08lx\n", + vcpu, u64); + } + } + + if (!error && (get_eptp || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_EPTP, &eptp); + if (error == 0) + printf("eptp[%d]\t\t0x%016lx\n", vcpu, eptp); + } + + if (!error && (get_exception_bitmap || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXCEPTION_BITMAP, + &bm); + if (error == 0) + printf("exception_bitmap[%d]\t0x%08lx\n", vcpu, bm); + } + + if (!error && (get_io_bitmap || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_A, &bm); + if (error == 0) + printf("io_bitmap_a[%d]\t0x%08lx\n", vcpu, bm); + error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_B, &bm); + if (error == 0) + printf("io_bitmap_b[%d]\t0x%08lx\n", vcpu, bm); + } + + if (!error && (get_tsc_offset || get_all)) { + uint64_t tscoff; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_TSC_OFFSET, &tscoff); + if (error == 0) + printf("tsc_offset[%d]\t0x%016lx\n", vcpu, tscoff); + } + + if (!error && (get_cr0_mask || get_all)) { + uint64_t cr0mask; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR0_MASK, &cr0mask); + if (error == 0) + printf("cr0_mask[%d]\t\t0x%016lx\n", vcpu, cr0mask); + } + + if (!error && (get_cr0_shadow || get_all)) { + uint64_t cr0shadow; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR0_SHADOW, + &cr0shadow); + if (error == 0) + printf("cr0_shadow[%d]\t\t0x%016lx\n", vcpu, cr0shadow); + } + + if (!error && (get_cr4_mask || get_all)) { + uint64_t cr4mask; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR4_MASK, &cr4mask); + if (error == 0) + printf("cr4_mask[%d]\t\t0x%016lx\n", vcpu, cr4mask); + } + + if (!error && (get_cr4_shadow || get_all)) { + uint64_t cr4shadow; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR4_SHADOW, + &cr4shadow); + if (error == 0) + printf("cr4_shadow[%d]\t\t0x%016lx\n", vcpu, cr4shadow); + } + + if (!error && (get_cr3_targets || get_all)) { + uint64_t target_count, target_addr; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET_COUNT, + &target_count); + if (error == 0) { + printf("cr3_target_count[%d]\t0x%08lx\n", + vcpu, target_count); + } + + error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET0, + &target_addr); + if (error == 0) { + printf("cr3_target0[%d]\t\t0x%016lx\n", + vcpu, target_addr); + } + + error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET1, + &target_addr); + if (error == 0) { + printf("cr3_target1[%d]\t\t0x%016lx\n", + vcpu, target_addr); + } + + error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET2, + &target_addr); + if (error == 0) { + printf("cr3_target2[%d]\t\t0x%016lx\n", + vcpu, target_addr); + } + + error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET3, + &target_addr); + if (error == 0) { + printf("cr3_target3[%d]\t\t0x%016lx\n", + vcpu, target_addr); + } + } + + if (!error && (get_apic_access_addr || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_APIC_ACCESS, &addr); + if (error == 0) + printf("apic_access_addr[%d]\t0x%016lx\n", vcpu, addr); + } + + if (!error && (get_virtual_apic_addr || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_VIRTUAL_APIC, &addr); + if (error == 0) + 
printf("virtual_apic_addr[%d]\t0x%016lx\n", vcpu, addr); + } + + if (!error && (get_tpr_threshold || get_all)) { + uint64_t threshold; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_TPR_THRESHOLD, + &threshold); + if (error == 0) + printf("tpr_threshold[%d]\t0x%08lx\n", vcpu, threshold); + } + + if (!error && (get_msr_bitmap_address || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, &addr); + if (error == 0) + printf("msr_bitmap[%d]\t\t0x%016lx\n", vcpu, addr); + } + + if (!error && (get_msr_bitmap || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, &addr); + if (error == 0) + error = dump_vmcs_msr_bitmap(vcpu, addr); + } + + if (!error && (get_vpid || get_all)) { + uint64_t vpid; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_VPID, &vpid); + if (error == 0) + printf("vpid[%d]\t\t0x%04lx\n", vcpu, vpid); + } + + if (!error && (get_ple_window || get_all)) { + uint64_t window; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_PLE_WINDOW, &window); + if (error == 0) + printf("ple_window[%d]\t\t0x%08lx\n", vcpu, window); + } + + if (!error && (get_ple_gap || get_all)) { + uint64_t gap; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_PLE_GAP, &gap); + if (error == 0) + printf("ple_gap[%d]\t\t0x%08lx\n", vcpu, gap); + } + + if (!error && (get_inst_err || get_all)) { + uint64_t insterr; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_INSTRUCTION_ERROR, + &insterr); + if (error == 0) { + printf("instruction_error[%d]\t0x%08lx\n", + vcpu, insterr); + } + } + + if (!error && (get_exit_ctls || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_CTLS, &ctl); + if (error == 0) + printf("exit_ctls[%d]\t\t0x%08lx\n", vcpu, ctl); + } + + if (!error && (get_entry_ctls || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_CTLS, &ctl); + if (error == 0) + printf("entry_ctls[%d]\t\t0x%08lx\n", vcpu, ctl); + } + + if (!error && (get_host_pat || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_IA32_PAT, &pat); + if (error == 0) + printf("host_pat[%d]\t\t0x%016lx\n", vcpu, pat); + } + + if (!error && (get_guest_pat || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_GUEST_IA32_PAT, &pat); + if (error == 0) + printf("guest_pat[%d]\t\t0x%016lx\n", vcpu, pat); + } + + if (!error && (get_host_cr0 || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR0, &cr0); + if (error == 0) + printf("host_cr0[%d]\t\t0x%016lx\n", vcpu, cr0); + } + + if (!error && (get_host_cr3 || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR3, &cr3); + if (error == 0) + printf("host_cr3[%d]\t\t0x%016lx\n", vcpu, cr3); + } + + if (!error && (get_host_cr4 || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR4, &cr4); + if (error == 0) + printf("host_cr4[%d]\t\t0x%016lx\n", vcpu, cr4); + } + + if (!error && (get_host_rip || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_RIP, &rip); + if (error == 0) + printf("host_rip[%d]\t\t0x%016lx\n", vcpu, rip); + } + + if (!error && (get_host_rsp || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_RSP, &rsp); + if (error == 0) + printf("host_rsp[%d]\t\t0x%016lx\n", vcpu, rsp); + } + + if (!error && (get_guest_sysenter || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_IA32_SYSENTER_CS, &cs); + if (error == 0) + printf("guest_sysenter_cs[%d]\t0x%08lx\n", vcpu, cs); + + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_IA32_SYSENTER_ESP, &rsp); + if (error == 0) + printf("guest_sysenter_sp[%d]\t0x%016lx\n", vcpu, rsp); + error = vm_get_vmcs_field(ctx, vcpu, + 
VMCS_GUEST_IA32_SYSENTER_EIP, &rip); + if (error == 0) + printf("guest_sysenter_ip[%d]\t0x%016lx\n", vcpu, rip); + } + + if (!error && (get_vmcs_link || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_LINK_POINTER, &addr); + if (error == 0) + printf("vmcs_pointer[%d]\t0x%016lx\n", vcpu, addr); + } + + if (!error && (get_vmcs_exit_reason || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_REASON, &u64); + if (error == 0) + printf("vmcs_exit_reason[%d]\t0x%016lx\n", vcpu, u64); + } + + if (!error && (get_vmcs_exit_qualification || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_QUALIFICATION, + &u64); + if (error == 0) + printf("vmcs_exit_qualification[%d]\t0x%016lx\n", + vcpu, u64); + } + + if (!error && (get_vmcs_exit_interruption_info || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_INTR_INFO, &u64); + if (error == 0) { + printf("vmcs_exit_interruption_info[%d]\t0x%08lx\n", + vcpu, u64); + } + } + + if (!error && (get_vmcs_exit_interruption_error || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_INTR_ERRCODE, + &u64); + if (error == 0) { + printf("vmcs_exit_interruption_error[%d]\t0x%08lx\n", + vcpu, u64); + } + } + + if (!error && (get_vmcs_interruptibility || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_INTERRUPTIBILITY, &u64); + if (error == 0) { + printf("vmcs_guest_interruptibility[%d]\t0x%08lx\n", + vcpu, u64); + } + } + + if (!error && setcap) { + int captype; + captype = vm_capability_name2type(capname); + error = vm_set_capability(ctx, vcpu, captype, capval); + if (error != 0 && errno == ENOENT) + printf("Capability \"%s\" is not available\n", capname); + } + + if (!error && (getcap || get_all)) { + int captype, val, getcaptype; + + if (getcap && capname) + getcaptype = vm_capability_name2type(capname); + else + getcaptype = -1; + + for (captype = 0; captype < VM_CAP_MAX; captype++) { + if (getcaptype >= 0 && captype != getcaptype) + continue; + error = vm_get_capability(ctx, vcpu, captype, &val); + if (error == 0) { + printf("Capability \"%s\" is %s on vcpu %d\n", + vm_capability_type2name(captype), + val ? "set" : "not set", vcpu); + } else if (errno == ENOENT) { + error = 0; + printf("Capability \"%s\" is not available\n", + vm_capability_type2name(captype)); + } else { + break; + } + } + } + + if (!error && run) { + error = vm_run(ctx, vcpu, &vmexit); + if (error == 0) + dump_vm_run_exitcode(&vmexit, vcpu); + else + printf("vm_run error %d\n", error); + } + + if (error) + printf("errno = %d\n", errno); + + if (!error && destroy) + error = vm_destroy(ctx); + + exit(error); +} diff --git a/usr/src/cmd/bhyveload-uefi/Makefile b/usr/src/cmd/bhyveload-uefi/Makefile new file mode 100644 index 0000000000..bbcbacf32f --- /dev/null +++ b/usr/src/cmd/bhyveload-uefi/Makefile @@ -0,0 +1,41 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2013 Pluribus Networks Inc. 
+# + +PROG = bhyveload-uefi + +include ../Makefile.cmd + +$(BUILD64)SUBDIRS += $(MACH64) + +all := TARGET = all +install := TARGET = install +clean := TARGET = clean +clobber := TARGET = clobber +lint := TARGET = lint + +.KEEP_STATE: + +all clean clobber lint: $(SUBDIRS) + +install: $(SUBDIRS) + -$(RM) $(ROOTUSRSBINPROG) + -$(LN) $(ISAEXEC) $(ROOTUSRSBINPROG) + +$(SUBDIRS): FRC + @cd $@; pwd; $(MAKE) CW_NO_SHADOW=true __GNUC= $(TARGET) + +FRC: + +include ../Makefile.targ diff --git a/usr/src/cmd/bhyveload-uefi/Makefile.com b/usr/src/cmd/bhyveload-uefi/Makefile.com new file mode 100644 index 0000000000..7865cca8d8 --- /dev/null +++ b/usr/src/cmd/bhyveload-uefi/Makefile.com @@ -0,0 +1,52 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2013 Pluribus Networks Inc. +# + +PROG= bhyveload-uefi + +SRCS = ../bhyveload-uefi.c expand_number.c +OBJS = bhyveload-uefi.o expand_number.o + +include ../../Makefile.cmd + +.KEEP_STATE: + +CFLAGS += $(CCVERBOSE) +CPPFLAGS = -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd $(CPPFLAGS.master) \ + -I$(ROOT)/usr/platform/i86pc/include +LDLIBS += -lvmmapi + +all: $(PROG) + +$(PROG): $(OBJS) + $(LINK.c) -o $@ $(OBJS) $(LDFLAGS) $(LDLIBS) + $(POST_PROCESS) + +install: all $(ROOTUSRSBINPROG) + +clean: + $(RM) $(OBJS) + +lint: lint_SRCS + +include ../../Makefile.targ + +%.o: ../%.c + $(COMPILE.c) $< + $(POST_PROCESS_O) + +%.o: $(CONTRIB)/freebsd/lib/libutil/%.c + $(COMPILE.c) $< + $(POST_PROCESS_O) + diff --git a/usr/src/cmd/bhyveload-uefi/amd64/Makefile b/usr/src/cmd/bhyveload-uefi/amd64/Makefile new file mode 100644 index 0000000000..b602c50d05 --- /dev/null +++ b/usr/src/cmd/bhyveload-uefi/amd64/Makefile @@ -0,0 +1,21 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2013 Pluribus Networks Inc. +# + +include ../Makefile.com +include ../../Makefile.cmd.64 + +CPPFLAGS += -I$(COMPAT)/freebsd/amd64 -I$(CONTRIB)/freebsd/amd64 + +install: all $(ROOTUSRSBINPROG64) diff --git a/usr/src/cmd/bhyveload-uefi/bhyveload-uefi.c b/usr/src/cmd/bhyveload-uefi/bhyveload-uefi.c new file mode 100644 index 0000000000..62a7ca5d0f --- /dev/null +++ b/usr/src/cmd/bhyveload-uefi/bhyveload-uefi.c @@ -0,0 +1,190 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. 
+ */
+
+#include <sys/types.h>
+
+#include <machine/vmm.h>
+
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sysexits.h>
+#include <unistd.h>
+
+#include <vmmapi.h>
+
+#define KB (1024UL)
+#define MB (1024 * 1024UL)
+#define GB (1024 * 1024 * 1024UL)
+
+#define UEFI_ROM_ADDR 0xFFE00000
+#define UEFI_ROM_SIZE (2 * MB)
+/*
+ * N.B. the UEFI code zeros the first page in memory, so use the second.
+ */
+#define BHYVE_HOB_ADDR 0x00002000
+#define BHYVE_BO_HOB_ADDR 0x00002080
+
+#define UEFI_ROM_PATH "/usr/share/bhyve/uefi-rom.bin"
+
+struct platform_info {
+ uint32_t ncpus;
+};
+
+/*
+ * Boot order code:
+ * 0 - EFI_CD_HD
+ * 1 - EFI_CD
+ * 2 - EFI_HD_CD
+ * 3 - EFI_HD
+ * 4 - EFI_NET
+ * 5 - EFI_NET_CD_HD
+ * 6 - EFI_HD_HD_CD
+ * 7 - LEGACY_CD_HD
+ * 8 - LEGACY_CD
+ * 9 - LEGACY_HD_CD
+ * 10 - LEGACY_HD
+ * 11 - EFI_SHELL
+ */
+
+struct bootorder_info {
+ uint32_t guestbootorder;
+};
+
+static char *vmname, *progname;
+static struct vmctx *ctx;
+
+static void
+usage(void)
+{
+ printf("usage: %s "
+ "[-c vcpus] [-m mem-size] [-b bootorder] <vmname>"
+ "\n", progname);
+ exit(1);
+}
+
+int
+main(int argc, char** argv)
+{
+ int opt, error, fd;
+ int guest_ncpus;
+ int guest_bootorder = 0;
+ uint64_t mem_size;
+ char *membase, *rombase;
+ struct platform_info *pi;
+ struct bootorder_info *bi;
+
+ progname = argv[0];
+
+ guest_ncpus = 1;
+ mem_size = 256 * MB;
+
+ while ((opt = getopt(argc, argv, "c:m:b:")) != -1) {
+ switch (opt) {
+ case 'c':
+ guest_ncpus = atoi(optarg);
+ break;
+ case 'm':
+ error = vm_parse_memsize(optarg, &mem_size);
+ if (error != 0 || mem_size == 0)
+ errx(EX_USAGE, "Invalid memsize '%s'", optarg);
+ break;
+ case 'b':
+ guest_bootorder = atoi(optarg);
+ if (guest_bootorder < 0 || guest_bootorder > 11) {
+ errx(EX_USAGE, "Invalid bootoption: %d\n"
+ "\tBoot order code:\n"
+ "\t0 - EFI_CD_HD\n"
+ "\t1 - EFI_CD\n"
+ "\t2 - EFI_HD_CD\n"
+ "\t3 - EFI_HD\n"
+ "\t4 - EFI_NET\n"
+ "\t5 - EFI_NET_CD_HD\n"
+ "\t6 - EFI_HD_HD_CD\n"
+ "\t7 - LEGACY_CD_HD\n"
+ "\t8 - LEGACY_CD\n"
+ "\t9 - LEGACY_HD_CD\n"
+ "\t10 - LEGACY_HD\n"
+ "\t11 - EFI_SHELL\n", guest_bootorder);
+ exit(1);
+ }
+ break;
+ case '?':
+ usage();
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if (argc != 1)
+ usage();
+
+ vmname = argv[0];
+ error = vm_create(vmname);
+ if (error != 0 && errno != EEXIST) {
+ perror("vm_create");
+ exit(1);
+ }
+
+ ctx = vm_open(vmname);
+ if (ctx == NULL) {
+ perror("vm_open");
+ exit(1);
+ }
+
+ error = vm_set_capability(ctx, 0, VM_CAP_UNRESTRICTED_GUEST, 1);
+ if (error) {
+ perror("vm_set_capability(VM_CAP_UNRESTRICTED_GUEST)");
+ }
+
+ error = vm_setup_memory(ctx, mem_size, VM_MMAP_ALL);
+ if (error) {
+ perror("vm_setup_memory");
+ exit(1);
+ }
+ membase = vm_map_gpa(ctx, 0, 8 * KB);
+
+ error = vm_setup_rom(ctx, UEFI_ROM_ADDR, UEFI_ROM_SIZE);
+ if (error) {
+ perror("vm_setup_rom");
+ exit(1);
+ }
+ rombase = vm_map_gpa(ctx, UEFI_ROM_ADDR, UEFI_ROM_SIZE);
+
+ fd = open(UEFI_ROM_PATH, O_RDONLY);
+ if (fd == -1) {
+ perror("open");
+ exit(1);
+ }
+ if (read(fd, rombase, UEFI_ROM_SIZE) != UEFI_ROM_SIZE) {
+ perror("read");
+ close(fd);
+ exit(1);
+ }
+ close(fd);
+
+ pi = (struct platform_info *)(membase + BHYVE_HOB_ADDR);
+ pi->ncpus = guest_ncpus;
+ bi = (struct bootorder_info *)(membase + BHYVE_BO_HOB_ADDR);
+ bi->guestbootorder = guest_bootorder;
+
+ error = vcpu_reset(ctx, 0);
+ if (error) {
+ perror("vcpu_reset");
+ exit(1);
+ }
+
+ return (0);
+}
diff --git a/usr/src/cmd/bhyveload-uefi/i386/Makefile b/usr/src/cmd/bhyveload-uefi/i386/Makefile
new file mode 100644
index 0000000000..f5b7bb6915
--- /dev/null
+++ b/usr/src/cmd/bhyveload-uefi/i386/Makefile
@@ -0,0 +1,18 @@
+#
+# This file and its 
contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2013 Pluribus Networks Inc. +# + +include ../Makefile.com + +install: all $(ROOTUSRSBINPROG32) diff --git a/usr/src/cmd/mdb/intel/amd64/vmm/Makefile b/usr/src/cmd/mdb/intel/amd64/vmm/Makefile new file mode 100644 index 0000000000..bf9219b435 --- /dev/null +++ b/usr/src/cmd/mdb/intel/amd64/vmm/Makefile @@ -0,0 +1,20 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2014 Pluribus Networks Inc. +# + +MAKEVARS = CW_NO_SHADOW=true __GNUC= + +include $(SRC)/Makefile.master +$(BUILD64)SUBDIRS += $(MACH64) +include ../../../Makefile.subdirs diff --git a/usr/src/cmd/mdb/intel/amd64/vmm/amd64/Makefile b/usr/src/cmd/mdb/intel/amd64/vmm/amd64/Makefile new file mode 100644 index 0000000000..49ca0c5eb3 --- /dev/null +++ b/usr/src/cmd/mdb/intel/amd64/vmm/amd64/Makefile @@ -0,0 +1,32 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2013 Pluribus Networks Inc. +# + +MODULE = vmm.so +MDBTGT = kvm + +MODSRCS = vmm.c + +include ../../../../../Makefile.cmd +include ../../../../../Makefile.cmd.64 +include ../../../Makefile.amd64 +include ../../../../Makefile.module + +CPPFLAGS = -D_KERNEL -D_MACHDEP +CPPFLAGS += -I$(COMPAT)/freebsd -I$(COMPAT)/freebsd/amd64 +CPPFLAGS += -I$(CONTRIB)/freebsd -I$(CONTRIB)/freebsd/amd64 +CPPFLAGS += -I$(SRC)/uts/common -I$(SRC)/uts/i86pc +CPPFLAGS += -I$(SRC)/cmd/mdb/common + +CPPFLAGS += -_cc=-xdryrun diff --git a/usr/src/cmd/mdb/intel/amd64/vmm/vmm.c b/usr/src/cmd/mdb/intel/amd64/vmm/vmm.c new file mode 100644 index 0000000000..9e29d8662a --- /dev/null +++ b/usr/src/cmd/mdb/intel/amd64/vmm/vmm.c @@ -0,0 +1,238 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#include + +#include +#include +#include +#include +#include + +/* + * VMM trace debug walker/dcmd code + */ + +/* + * Initialize the vmm_trace_dmsg_t walker by either using the given starting + * address, or reading the value of the kernel's vmm_debug_rbuf pointer. 
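+ * (Illustrative mdb invocations: "::walk vmm_dmsg" starts from
+ * vmm_debug_rbuf, while "<addr>::walk vmm_dmsg" walks the ring buffer at
+ * an explicitly supplied address.)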
+ * We also allocate a vmm_trace_dmsg_t for storage, and save this using the + * walk_data pointer. + */ +static int +vmm_dmsg_walk_i(mdb_walk_state_t *wsp) +{ + uintptr_t rbuf_addr; + vmm_trace_rbuf_t rbuf; + + if (wsp->walk_addr == NULL) { + if (mdb_readvar(&rbuf_addr, "vmm_debug_rbuf") == -1) { + mdb_warn("failed to read 'vmm_debug_rbuf'"); + return (WALK_ERR); + } + + if (mdb_vread(&rbuf, sizeof (vmm_trace_rbuf_t), rbuf_addr) + == -1) { + mdb_warn("failed to read vmm_trace_rbuf_t at %p", + rbuf_addr); + return (WALK_ERR); + } + + wsp->walk_addr = (uintptr_t)(vmm_trace_dmsg_t *)rbuf.dmsgh; + } + + /* + * Save ptr to head of ring buffer to prevent looping. + */ + wsp->walk_arg = (void *)wsp->walk_addr; + wsp->walk_data = mdb_alloc(sizeof (vmm_trace_dmsg_t), UM_SLEEP); + return (WALK_NEXT); +} + +/* + * At each step, read a vmm_trace_dmsg_t into our private storage, and then + * invoke the callback function. We terminate when we reach a NULL next + * pointer. + */ +static int +vmm_dmsg_walk_s(mdb_walk_state_t *wsp) +{ + int status; + + if (wsp->walk_addr == NULL) + return (WALK_DONE); + + if (mdb_vread(wsp->walk_data, sizeof (vmm_trace_dmsg_t), + wsp->walk_addr) == -1) { + mdb_warn("failed to read vmm_trace_dmsg_t at %p", + wsp->walk_addr); + return (WALK_ERR); + } + + status = wsp->walk_callback(wsp->walk_addr, wsp->walk_data, + wsp->walk_cbdata); + + wsp->walk_addr = + (uintptr_t)(((vmm_trace_dmsg_t *)wsp->walk_data)->next); + + /* + * If we've looped then we're done. + */ + if (wsp->walk_addr == (uintptr_t)wsp->walk_arg) + wsp->walk_addr = NULL; + + return (status); +} + +/* + * The walker's fini function is invoked at the end of each walk. Since we + * dynamically allocated a vmm_trace_dmsg_t in vmm_dmsg_walk_i, we must + * free it now. + */ +static void +vmm_dmsg_walk_f(mdb_walk_state_t *wsp) +{ + mdb_free(wsp->walk_data, sizeof (vmm_trace_dmsg_t)); +} + +/* + * This routine is used by the vmm_dmsg_dump dcmd to dump content of + * VMM trace ring buffer. + */ +int +vmm_dmsg_dump(vmm_trace_dmsg_t *addr, int print_pathname, uint_t *printed) +{ + vmm_trace_dmsg_t dmsg, *dmsgh = addr; + char pathname[MAXPATHLEN]; + char merge[1024]; + + while (addr != NULL) { + if (mdb_vread(&dmsg, sizeof (dmsg), (uintptr_t)addr) != + sizeof (dmsg)) { + mdb_warn("failed to read message pointer in kernel"); + return (DCMD_ERR); + } + + (void) mdb_snprintf(merge, sizeof (merge), + "[%Y:%03d:%03d:%03d] : %s", + dmsg.timestamp.tv_sec, + (int)dmsg.timestamp.tv_nsec/1000000, + (int)(dmsg.timestamp.tv_nsec/1000)%1000, + (int)dmsg.timestamp.tv_nsec%1000, + dmsg.buf); + + mdb_printf("%s", merge); + + if (printed != NULL) { + (*printed)++; + } + + if (((addr = dmsg.next) == NULL) || (dmsg.next == dmsgh)) { + break; + } + } + + return (DCMD_OK); +} + +/* + * 1. Process flag passed to vmm_dmsg_dump dcmd. + * 2. Obtain VMM trace ring buffer pointer. + * 3. Pass VMM trace ring buffer pointer to vmm_dmsg_dump() + * to dump content of VMM trace ring buffer. + */ +int +vmm_rbuf_dump(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) +{ + vmm_trace_rbuf_t rbuf; + uint_t printed = 0; /* have we printed anything? */ + int print_pathname = FALSE; + int rval = DCMD_OK; + + if (argc > 1) { + return (DCMD_USAGE); + } + + if (mdb_getopts(argc, argv, + 'a', MDB_OPT_SETBITS, TRUE, &print_pathname) != argc) { + return (DCMD_USAGE); + } + + /* + * If ring buffer address not provided try to obtain + * it using vmm_debug_rbuf global. 
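+ *
+ * (Editorial sketch, not part of the original change: another dcmd
+ * could consume the same data through the walker registered below,
+ * e.g.
+ *
+ *	static int
+ *	count_dmsg(uintptr_t addr, const void *data, void *cbdata)
+ *	{
+ *		(*(uint_t *)cbdata)++;
+ *		return (WALK_NEXT);
+ *	}
+ *
+ *	uint_t n = 0;
+ *	(void) mdb_pwalk("vmm_dmsg", count_dmsg, &n, 0);
+ *
+ * where mdb_pwalk() drives the init/step/fini functions defined above;
+ * count_dmsg and n are hypothetical names.)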
+ */ + if ((addr == NULL) || !(flags & DCMD_ADDRSPEC)) { + if (mdb_readvar(&addr, "vmm_debug_rbuf") == -1) { + mdb_warn("Failed to read 'vmm_debug_rbuf'."); + return (DCMD_ERR); + } + } + + if (mdb_vread(&rbuf, sizeof (rbuf), addr) != sizeof (rbuf)) { + mdb_warn("Failed to read ring buffer in kernel."); + return (DCMD_ERR); + } + + if (rbuf.dmsgh == NULL) { + mdb_printf("The vmm trace ring buffer is empty.\n"); + return (DCMD_OK); + } + + rval = vmm_dmsg_dump((vmm_trace_dmsg_t *)rbuf.dmsgh, + print_pathname, &printed); + + if (rval != DCMD_OK) { + return (rval); + } + + if (printed == 0) { + mdb_warn("Failed to read vmm trace ring buffer."); + return (DCMD_ERR); + } + + return (rval); +} + +/* + * MDB module linkage information: + * + * We declare a list of structures describing our dcmds, a list of structures + * describing our walkers, and a function named _mdb_init to return a pointer + * to our module information. + */ + +static const mdb_dcmd_t dcmds[] = { + { "vmm_dmsg_dump", "[-a]", "Dump vmm trace debug messages", + vmm_rbuf_dump }, + { NULL } +}; + +static const mdb_walker_t walkers[] = { + { "vmm_dmsg", + "walk ring buffer containing vmm trace debug messages", + vmm_dmsg_walk_i, vmm_dmsg_walk_s, vmm_dmsg_walk_f }, + { NULL } +}; + +static const mdb_modinfo_t modinfo = { + MDB_API_VERSION, dcmds, walkers +}; + +const mdb_modinfo_t * +_mdb_init(void) +{ + return (&modinfo); +} diff --git a/usr/src/compat/freebsd/amd64/machine/asmacros.h b/usr/src/compat/freebsd/amd64/machine/asmacros.h new file mode 100644 index 0000000000..fcf35a7b78 --- /dev/null +++ b/usr/src/compat/freebsd/amd64/machine/asmacros.h @@ -0,0 +1,28 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_ASMACROS_H_ +#define _COMPAT_FREEBSD_AMD64_MACHINE_ASMACROS_H_ + +#define ENTRY(x) \ + .text; .p2align 4,0x90; \ + .globl x; \ + .type x, @function; \ +x: + +#define END(x) \ + .size x, [.-x] + +#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_ASMACROS_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/atomic.h b/usr/src/compat/freebsd/amd64/machine/atomic.h new file mode 100644 index 0000000000..5b78143d21 --- /dev/null +++ b/usr/src/compat/freebsd/amd64/machine/atomic.h @@ -0,0 +1,244 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. 
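+ *
+ * (Editorial note: the load_acq/store_rel routines below follow the
+ * usual FreeBSD/x86 pattern -- an acquire load is implemented as a
+ * locked cmpxchg, which is a full barrier, while a release store relies
+ * on x86 TSO ordering and so needs only a compiler barrier before the
+ * plain store.)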
+ */ + +#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_ATOMIC_H_ +#define _COMPAT_FREEBSD_AMD64_MACHINE_ATOMIC_H_ + +static __inline u_char +atomic_load_acq_char(volatile u_char *p) +{ + u_char res; + + __asm volatile("lock ; " "cmpxchgb %b0,%1" + : "=a" (res), "=m" (*p) + : "m" (*p) : "memory", "cc"); + return (res); +} + +static __inline u_short +atomic_load_acq_short(volatile u_short *p) +{ + u_short res; + + __asm volatile("lock ; " "cmpxchgw %w0,%1" + : "=a" (res), "=m" (*p) + : "m" (*p) + : "memory", "cc"); + return (res); +} + +static __inline u_int +atomic_load_acq_int(volatile u_int *p) +{ + u_int res; + + __asm volatile("lock ; " "cmpxchgl %0,%1" + : "=a" (res), "=m" (*p) + : "m" (*p) + : "memory", "cc"); + return (res); +} + +static __inline u_long +atomic_load_acq_long(volatile u_long *p) +{ + u_long res; + + __asm volatile("lock ; " "cmpxchgq %0,%1" + : "=a" (res), "=m" (*p) + : "m" (*p) + : "memory", "cc"); + return (res); +} + +static __inline void +atomic_store_rel_char(volatile u_char *p, u_char v) +{ + __asm volatile("" : : : "memory"); + *p = v; +} + +static __inline void +atomic_store_rel_short(volatile u_short *p, u_short v) +{ + __asm volatile("" : : : "memory"); + *p = v; +} + +static __inline void +atomic_store_rel_int(volatile u_int *p, u_int v) +{ + __asm volatile("" : : : "memory"); + *p = v; +} + +static __inline void +atomic_store_rel_long(volatile u_long *p, u_long v) +{ + __asm volatile("" : : : "memory"); + *p = v; +} + +/* + * Atomic compare and set. + * + * if (*dst == expect) *dst = src (all 32 bit words) + * + * Returns 0 on failure, non-zero on success + */ +static __inline int +atomic_cmpset_int(volatile u_int *dst, u_int expect, u_int src) +{ + u_char res; + + __asm __volatile( + " lock ; " + " cmpxchgl %3,%1 ; " + " sete %0 ; " + "# atomic_cmpset_int" + : "=q" (res), /* 0 */ + "+m" (*dst), /* 1 */ + "+a" (expect) /* 2 */ + : "r" (src) /* 3 */ + : "memory", "cc"); + return (res); +} + +static __inline int +atomic_cmpset_long(volatile u_long *dst, u_long expect, u_long src) +{ + u_char res; + + __asm __volatile( + " lock ; " + " cmpxchgq %3,%1 ; " + " sete %0 ; " + "# atomic_cmpset_long" + : "=q" (res), /* 0 */ + "+m" (*dst), /* 1 */ + "+a" (expect) /* 2 */ + : "r" (src) /* 3 */ + : "memory", "cc"); + return (res); +} + +/* + * Atomically add the value of v to the integer pointed to by p and return + * the previous value of *p. 
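+ *
+ * For example (editorial sketch; "counter" is a hypothetical variable):
+ *
+ *	static volatile u_int counter;
+ *	u_int prev = atomic_fetchadd_int(&counter, 1);
+ *
+ * after which exactly one of any set of concurrent callers observes
+ * prev == 0.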
+ */ +static __inline u_int +atomic_fetchadd_int(volatile u_int *p, u_int v) +{ + + __asm __volatile( + " lock ; " + " xaddl %0, %1 ; " + "# atomic_fetchadd_int" + : "+r" (v), /* 0 (result) */ + "=m" (*p) /* 1 */ + : "m" (*p) /* 2 */ + : "cc"); + return (v); +} + +static __inline void +atomic_set_int(volatile u_int *p, u_int v) +{ + __asm volatile( + "lock ; " "orl %1,%0" + : "=m" (*p) + : "ir" (v), "m" (*p) + : "cc"); +} + +static __inline void +atomic_clear_int(volatile u_int *p, u_int v) +{ + __asm volatile( + "lock ; " "andl %1,%0" + : "=m" (*p) + : "ir" (~v), "m" (*p) + : "cc"); +} + +static __inline void +atomic_subtract_int(volatile u_int *p, u_int v) +{ + __asm volatile( + "lock ; " "subl %1,%0" + : "=m" (*p) + : "ir" (v), "m" (*p) + : "cc"); +} + +static __inline void +atomic_set_long(volatile u_long *p, u_long v) +{ + __asm volatile( + "lock ; " "orq %1,%0" + : "+m" (*p) + : "ir" (v) + : "cc"); +} + +static __inline void +atomic_clear_long(volatile u_long *p, u_long v) +{ + __asm volatile("lock ; " "andq %1,%0" + : "+m" (*p) + : "ir" (~v) + : "cc"); +} + +static __inline u_int +atomic_swap_int(volatile u_int *p, u_int v) +{ + + __asm __volatile( + " xchgl %1,%0 ; " + "# atomic_swap_int" + : "+r" (v), /* 0 */ + "+m" (*p)); /* 1 */ + return (v); +} + +static __inline u_long +atomic_swap_long(volatile u_long *p, u_long v) +{ + + __asm __volatile( + " xchgq %1,%0 ; " + "# atomic_swap_long" + : "+r" (v), /* 0 */ + "+m" (*p)); /* 1 */ + return (v); +} + +#define atomic_readandclear_int(p) atomic_swap_int(p, 0) +#define atomic_readandclear_long(p) atomic_swap_long(p, 0) + +/* Operations on 32-bit double words. */ +#define atomic_load_acq_32 atomic_load_acq_int +#define atomic_store_rel_32 atomic_store_rel_int +#define atomic_cmpset_32 atomic_cmpset_int + +/* Operations on 64-bit quad words. */ +#define atomic_cmpset_64 atomic_cmpset_long +#define atomic_readandclear_64 atomic_readandclear_long + +/* Operations on pointers. */ +#define atomic_cmpset_ptr atomic_cmpset_long + +#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_ATOMIC_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/clock.h b/usr/src/compat/freebsd/amd64/machine/clock.h new file mode 100644 index 0000000000..f50b42a126 --- /dev/null +++ b/usr/src/compat/freebsd/amd64/machine/clock.h @@ -0,0 +1,23 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_CLOCK_H_ +#define _COMPAT_FREEBSD_AMD64_MACHINE_CLOCK_H_ + +extern uint64_t cpu_freq_hz; + +#define tsc_freq cpu_freq_hz + +#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_CLOCK_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/cpufunc.h b/usr/src/compat/freebsd/amd64/machine/cpufunc.h new file mode 100644 index 0000000000..cf485e947c --- /dev/null +++ b/usr/src/compat/freebsd/amd64/machine/cpufunc.h @@ -0,0 +1,165 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. 
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_CPUFUNC_H_ +#define _COMPAT_FREEBSD_AMD64_MACHINE_CPUFUNC_H_ + +static __inline u_long +bsfq(u_long mask) +{ + u_long result; + + __asm __volatile("bsfq %1,%0" : "=r" (result) : "rm" (mask)); + return (result); +} + +static __inline u_int +bsrl(u_int mask) +{ + u_int result; + + __asm __volatile("bsrl %1,%0" : "=r" (result) : "rm" (mask)); + return (result); +} + +static __inline u_long +bsrq(u_long mask) +{ + u_long result; + + __asm __volatile("bsrq %1,%0" : "=r" (result) : "rm" (mask)); + return (result); +} + +static __inline void +clts(void) +{ + __asm __volatile("clts"); +} + +static __inline void +do_cpuid(u_int ax, u_int *p) +{ + __asm __volatile("cpuid" + : "=a" (p[0]), "=b" (p[1]), "=c" (p[2]), "=d" (p[3]) + : "0" (ax)); +} + +static __inline void +cpuid_count(u_int ax, u_int cx, u_int *p) +{ + __asm __volatile("cpuid" + : "=a" (p[0]), "=b" (p[1]), "=c" (p[2]), "=d" (p[3]) + : "0" (ax), "c" (cx)); +} + +static __inline void +enable_intr(void) +{ + __asm __volatile("sti"); +} + +static __inline int +ffsl(long mask) +{ + return (mask == 0 ? mask : (int)bsfq((u_long)mask) + 1); +} + +static __inline int +fls(int mask) +{ + return (mask == 0 ? mask : (int)bsrl((u_int)mask) + 1); +} + +static __inline int +flsl(long mask) +{ + return (mask == 0 ? mask : (int)bsrq((u_long)mask) + 1); +} + +static __inline int +flsll(long long mask) +{ + return (flsl((long)mask)); +} + +static __inline uint64_t +rdmsr(u_int msr) +{ + uint32_t low, high; + + __asm __volatile("rdmsr" : "=a" (low), "=d" (high) : "c" (msr)); + return (low | ((uint64_t)high << 32)); +} + +static __inline uint64_t +rdtsc(void) +{ + uint32_t low, high; + + __asm __volatile("rdtsc" : "=a" (low), "=d" (high)); + return (low | ((uint64_t)high << 32)); +} + +static __inline void +wrmsr(u_int msr, uint64_t newval) +{ + uint32_t low, high; + + low = newval; + high = newval >> 32; + __asm __volatile("wrmsr" : : "a" (low), "d" (high), "c" (msr)); +} + +static __inline void +load_cr0(u_long data) +{ + __asm __volatile("movq %0,%%cr0" : : "r" (data)); +} + +static __inline u_long +rcr0(void) +{ + u_long data; + + __asm __volatile("movq %%cr0,%0" : "=r" (data)); + return (data); +} + +static __inline u_long +rcr3(void) +{ + u_long data; + + __asm __volatile("movq %%cr3,%0" : "=r" (data)); + return (data); +} + +static __inline void +load_cr4(u_long data) +{ + __asm __volatile("movq %0,%%cr4" : : "r" (data)); +} + +static __inline u_long +rcr4(void) +{ + u_long data; + + __asm __volatile("movq %%cr4,%0" : "=r" (data)); + return (data); +} + +#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_CPUFUNC_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/fpu.h b/usr/src/compat/freebsd/amd64/machine/fpu.h new file mode 100644 index 0000000000..48e686780c --- /dev/null +++ b/usr/src/compat/freebsd/amd64/machine/fpu.h @@ -0,0 +1,29 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. 
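+ *
+ * (Editorial note on the cpufunc.h wrappers above: the bit-scan helpers
+ * return 1-based positions, with 0 meaning "no bit set", e.g.
+ *
+ *	ffsl(0x18) == 4		lowest set bit is bit 3
+ *	flsl(0x18) == 5		highest set bit is bit 4
+ *	ffsl(0) == 0 and flsl(0) == 0
+ * )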
+ */
+
+/*
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_FPU_H_
+#define _COMPAT_FREEBSD_AMD64_MACHINE_FPU_H_
+
+#define XSAVE_AREA_ALIGN 64
+
+void fpuexit(kthread_t *td);
+void fpurestore(void *);
+void fpusave(void *);
+
+struct savefpu *fpu_save_area_alloc(void);
+void fpu_save_area_free(struct savefpu *fsa);
+void fpu_save_area_reset(struct savefpu *fsa);
+
+#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_FPU_H_ */
diff --git a/usr/src/compat/freebsd/amd64/machine/md_var.h b/usr/src/compat/freebsd/amd64/machine/md_var.h
new file mode 100644
index 0000000000..60fdd566e5
--- /dev/null
+++ b/usr/src/compat/freebsd/amd64/machine/md_var.h
@@ -0,0 +1,24 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_MD_VAR_H_
+#define _COMPAT_FREEBSD_AMD64_MACHINE_MD_VAR_H_
+
+extern u_int cpu_high; /* Highest arg to CPUID */
+extern u_int cpu_exthigh; /* Highest arg to extended CPUID */
+extern u_int cpu_id; /* Stepping ID */
+extern char cpu_vendor[]; /* CPU Origin code */
+
+#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_MD_VAR_H_ */
diff --git a/usr/src/compat/freebsd/amd64/machine/param.h b/usr/src/compat/freebsd/amd64/machine/param.h
new file mode 100644
index 0000000000..eaca5ab8d7
--- /dev/null
+++ b/usr/src/compat/freebsd/amd64/machine/param.h
@@ -0,0 +1,39 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_PARAM_H_
+#define _COMPAT_FREEBSD_AMD64_MACHINE_PARAM_H_
+
+#ifdef _KERNEL
+#define MAXCPU NCPU
+#endif /* _KERNEL */
+
+#define PAGE_SHIFT 12 /* LOG2(PAGE_SIZE) */
+#define PAGE_SIZE (1 << PAGE_SHIFT)
+
+#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_PCB_H_ */
diff --git a/usr/src/compat/freebsd/amd64/machine/pmap.h b/usr/src/compat/freebsd/amd64/machine/pmap.h
new file mode 100644
index 0000000000..d0303bdd56
--- /dev/null
+++ b/usr/src/compat/freebsd/amd64/machine/pmap.h
@@ -0,0 +1,44 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2014 Pluribus Networks Inc.
+ */ + +#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_PMAP_H_ +#define _COMPAT_FREEBSD_AMD64_MACHINE_PMAP_H_ + + /* ---- Intel Nomenclature ---- */ +#define PG_V 0x001 /* P Valid */ +#define PG_RW 0x002 /* R/W Read/Write */ +#define PG_U 0x004 /* U/S User/Supervisor */ +#define PG_A 0x020 /* A Accessed */ +#define PG_M 0x040 /* D Dirty */ +#define PG_PS 0x080 /* PS Page size (0=4k,1=2M) */ + +/* + * Page Protection Exception bits + */ +#define PGEX_P 0x01 /* Protection violation vs. not present */ +#define PGEX_W 0x02 /* during a Write cycle */ +#define PGEX_U 0x04 /* access from User mode (UPL) */ +#define PGEX_RSV 0x08 /* reserved PTE field is non-zero */ +#define PGEX_I 0x10 /* during an instruction fetch */ + +typedef u_int64_t pd_entry_t; +typedef u_int64_t pt_entry_t; +typedef u_int64_t pdp_entry_t; +typedef u_int64_t pml4_entry_t; + +#define vtophys(va) pmap_kextract(((vm_offset_t) (va))) +vm_paddr_t pmap_kextract(vm_offset_t va); + +#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_PMAP_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/segments.h b/usr/src/compat/freebsd/amd64/machine/segments.h new file mode 100644 index 0000000000..d0655f4a0e --- /dev/null +++ b/usr/src/compat/freebsd/amd64/machine/segments.h @@ -0,0 +1,21 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_SEGMENTS_H_ +#define _COMPAT_FREEBSD_AMD64_MACHINE_SEGMENTS_H_ + +#include + +#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_SEGMENTS_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/smp.h b/usr/src/compat/freebsd/amd64/machine/smp.h new file mode 100644 index 0000000000..ef719b9684 --- /dev/null +++ b/usr/src/compat/freebsd/amd64/machine/smp.h @@ -0,0 +1,19 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_SMP_H_ +#define _COMPAT_FREEBSD_AMD64_MACHINE_SMP_H_ + +#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_SMP_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/vmm.h b/usr/src/compat/freebsd/amd64/machine/vmm.h new file mode 100644 index 0000000000..79c3ec959e --- /dev/null +++ b/usr/src/compat/freebsd/amd64/machine/vmm.h @@ -0,0 +1,21 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. 
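+ *
+ * (Editorial note on the pmap.h definitions above: a page-fault error
+ * code is decoded against the PGEX_* bits; for a hypothetical
+ * "errcode", (errcode & PGEX_W) != 0 means the fault was taken on a
+ * write, and a clear PGEX_P bit means the page was simply not present
+ * rather than a protection violation.)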
+ */ + +#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_VMM_H_ +#define _COMPAT_FREEBSD_AMD64_MACHINE_VMM_H_ + +#include + +#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_VMM_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/vmm_dev.h b/usr/src/compat/freebsd/amd64/machine/vmm_dev.h new file mode 100644 index 0000000000..fe9cb6c705 --- /dev/null +++ b/usr/src/compat/freebsd/amd64/machine/vmm_dev.h @@ -0,0 +1,21 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_VMM_DEV_H_ +#define _COMPAT_FREEBSD_AMD64_MACHINE_VMM_DEV_H_ + +#include + +#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_VMM_DEV_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/vmm_instruction_emul.h b/usr/src/compat/freebsd/amd64/machine/vmm_instruction_emul.h new file mode 100644 index 0000000000..02c3f391c7 --- /dev/null +++ b/usr/src/compat/freebsd/amd64/machine/vmm_instruction_emul.h @@ -0,0 +1,21 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_VMM_INSTRUCTION_EMUL_H_ +#define _COMPAT_FREEBSD_AMD64_MACHINE_VMM_INSTRUCTION_EMUL_H_ + +#include + +#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_VMM_INSTRUCTION_EMUL_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/vmparam.h b/usr/src/compat/freebsd/amd64/machine/vmparam.h new file mode 100644 index 0000000000..c80c2af545 --- /dev/null +++ b/usr/src/compat/freebsd/amd64/machine/vmparam.h @@ -0,0 +1,19 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_VMPARAM_H_ +#define _COMPAT_FREEBSD_AMD64_MACHINE_VMPARAM_H_ + +#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_VMPARAM_H_ */ diff --git a/usr/src/compat/freebsd/libutil.h b/usr/src/compat/freebsd/libutil.h new file mode 100644 index 0000000000..e22ffc0551 --- /dev/null +++ b/usr/src/compat/freebsd/libutil.h @@ -0,0 +1,21 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. 
+ */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_LIBUTIL_H_ +#define _COMPAT_FREEBSD_LIBUTIL_H_ + +int expand_number(const char *_buf, uint64_t *_num); + +#endif /* _COMPAT_FREEBSD_LIBUTIL_H_ */ diff --git a/usr/src/compat/freebsd/net/ethernet.h b/usr/src/compat/freebsd/net/ethernet.h new file mode 100644 index 0000000000..a0d5a828c6 --- /dev/null +++ b/usr/src/compat/freebsd/net/ethernet.h @@ -0,0 +1,21 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_NET_ETHERNET_H_ +#define _COMPAT_FREEBSD_SYS_NET_ETHERNET_H_ + +#include + +#endif /* _COMPAT_FREEBSD_SYS_NET_ETHERNET_H_ */ diff --git a/usr/src/compat/freebsd/paths.h b/usr/src/compat/freebsd/paths.h new file mode 100644 index 0000000000..e43c963f93 --- /dev/null +++ b/usr/src/compat/freebsd/paths.h @@ -0,0 +1,21 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_PATHS_H_ +#define _COMPAT_FREEBSD_PATHS_H_ + +#define _PATH_TMP "/tmp/" + +#endif /* _COMPAT_FREEBSD_PATHS_H_ */ diff --git a/usr/src/compat/freebsd/pthread_np.h b/usr/src/compat/freebsd/pthread_np.h new file mode 100644 index 0000000000..641c58f406 --- /dev/null +++ b/usr/src/compat/freebsd/pthread_np.h @@ -0,0 +1,28 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_PTHREAD_NP_H_ +#define _COMPAT_FREEBSD_PTHREAD_NP_H_ + +#include +#include + +#include + +#define pthread_set_name_np(thread, name) + +#define pthread_mutex_isowned_np(x) _mutex_held(x) + +#endif /* _COMPAT_FREEBSD_PTHREAD_NP_H_ */ diff --git a/usr/src/compat/freebsd/string.h b/usr/src/compat/freebsd/string.h new file mode 100644 index 0000000000..7e0f5c7ddc --- /dev/null +++ b/usr/src/compat/freebsd/string.h @@ -0,0 +1,26 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. 
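+ *
+ * (Editorial note on the libutil.h shim above: assuming the usual
+ * FreeBSD semantics of power-of-two suffixes for expand_number(), a
+ * sketch of its use would be
+ *
+ *	uint64_t v;
+ *	if (expand_number("2m", &v) == 0)
+ *		assert(v == 2 * 1024 * 1024);
+ *
+ * with 0 returned on success.)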
+ */ + +#ifndef _COMPAT_FREEBSD_STRING_H_ +#define _COMPAT_FREEBSD_STRING_H_ + +/* + * This is quite a hack; blame bcopy/bcmp/bzero and memcpy/memcmp/memset. + */ +#include + +#include_next + +#endif /* _COMPAT_FREEBSD_STRING_H_ */ diff --git a/usr/src/compat/freebsd/strings.h b/usr/src/compat/freebsd/strings.h new file mode 100644 index 0000000000..fa3539fb96 --- /dev/null +++ b/usr/src/compat/freebsd/strings.h @@ -0,0 +1,23 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_STRINGS_H_ +#define _COMPAT_FREEBSD_STRINGS_H_ + +#include + +#include_next + +#endif /* _COMPAT_FREEBSD_STRINGS_H_ */ diff --git a/usr/src/compat/freebsd/sys/_iovec.h b/usr/src/compat/freebsd/sys/_iovec.h new file mode 100644 index 0000000000..b755ae7e21 --- /dev/null +++ b/usr/src/compat/freebsd/sys/_iovec.h @@ -0,0 +1,24 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS__IOVEC_H_ +#define _COMPAT_FREEBSD_SYS__IOVEC_H_ + +struct iovec { + void *iov_base; /* Base address. */ + size_t iov_len; /* Length. */ +}; + +#endif /* _COMPAT_FREEBSD_SYS__IOVEC_H_ */ diff --git a/usr/src/compat/freebsd/sys/_pthreadtypes.h b/usr/src/compat/freebsd/sys/_pthreadtypes.h new file mode 100644 index 0000000000..d746da3712 --- /dev/null +++ b/usr/src/compat/freebsd/sys/_pthreadtypes.h @@ -0,0 +1,19 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS__PTHREADTYPES_H_ +#define _COMPAT_FREEBSD_SYS__PTHREADTYPES_H_ + +#endif /* _COMPAT_FREEBSD_SYS__PTHREADTYPES_H_ */ diff --git a/usr/src/compat/freebsd/sys/_types.h b/usr/src/compat/freebsd/sys/_types.h new file mode 100644 index 0000000000..62c327d216 --- /dev/null +++ b/usr/src/compat/freebsd/sys/_types.h @@ -0,0 +1,22 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. 
+ */ + +#ifndef _COMPAT_FREEBSD_SYS__TYPES_H_ +#define _COMPAT_FREEBSD_SYS__TYPES_H_ + +#include +#include + +#endif /* _COMPAT_FREEBSD_SYS__TYPES_H_ */ diff --git a/usr/src/compat/freebsd/sys/callout.h b/usr/src/compat/freebsd/sys/callout.h new file mode 100644 index 0000000000..17b6e31507 --- /dev/null +++ b/usr/src/compat/freebsd/sys/callout.h @@ -0,0 +1,70 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_CALLOUT_H_ +#define _COMPAT_FREEBSD_SYS_CALLOUT_H_ + +#include + +struct callout { + cyclic_id_t c_cyc_id; + int c_flags; + void (*c_func)(void *); + void *c_arg; + +}; + +#define CALLOUT_ACTIVE 0x0002 /* callout is currently active */ +#define CALLOUT_PENDING 0x0004 /* callout is waiting for timeout */ + +#define C_ABSOLUTE 0x0200 /* event time is absolute. */ + +#define callout_active(c) ((c)->c_flags & CALLOUT_ACTIVE) +#define callout_deactivate(c) ((c)->c_flags &= ~CALLOUT_ACTIVE) +#define callout_pending(c) ((c)->c_flags & CALLOUT_PENDING) + +void vmm_glue_callout_init(struct callout *c, int mpsafe); +int vmm_glue_callout_reset_sbt(struct callout *c, sbintime_t sbt, + sbintime_t pr, void (*func)(void *), void *arg, int flags); +int vmm_glue_callout_stop(struct callout *c); +int vmm_glue_callout_drain(struct callout *c); + +static __inline void +callout_init(struct callout *c, int mpsafe) +{ + vmm_glue_callout_init(c, mpsafe); +} + +static __inline int +callout_stop(struct callout *c) +{ + return (vmm_glue_callout_stop(c)); +} + +static __inline int +callout_drain(struct callout *c) +{ + return (vmm_glue_callout_drain(c)); +} + +static __inline int +callout_reset_sbt(struct callout *c, sbintime_t sbt, sbintime_t pr, + void (*func)(void *), void *arg, int flags) +{ + return (vmm_glue_callout_reset_sbt(c, sbt, pr, func, arg, flags)); +} + + +#endif /* _COMPAT_FREEBSD_SYS_CALLOUT_H_ */ diff --git a/usr/src/compat/freebsd/sys/cdefs.h b/usr/src/compat/freebsd/sys/cdefs.h new file mode 100644 index 0000000000..974e323dbe --- /dev/null +++ b/usr/src/compat/freebsd/sys/cdefs.h @@ -0,0 +1,58 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_CDEFS_H_ +#define _COMPAT_FREEBSD_SYS_CDEFS_H_ + +#define __FBSDID(s) + +#ifdef __GNUC__ +#define inline __inline + +#define __GNUCLIKE___SECTION 1 + +#define __dead2 __attribute__((__noreturn__)) +#define __unused __attribute__((__unused__)) +#define __used __attribute__((__used__)) +#define __packed __attribute__((__packed__)) +#define __aligned(x) __attribute__((__aligned__(x))) +#define __section(x) __attribute__((__section__(x))) +#endif + +/* + * The __CONCAT macro is used to concatenate parts of symbol names, e.g. 
+ * with "#define OLD(foo) __CONCAT(old,foo)", OLD(foo) produces oldfoo. + * The __CONCAT macro is a bit tricky to use if it must work in non-ANSI + * mode -- there must be no spaces between its arguments, and for nested + * __CONCAT's, all the __CONCAT's must be at the left. __CONCAT can also + * concatenate double-quoted strings produced by the __STRING macro, but + * this only works with ANSI C. + * + * __XSTRING is like __STRING, but it expands any macros in its argument + * first. It is only available with ANSI C. + */ +#if defined(__STDC__) || defined(__cplusplus) +#define __P(protos) protos /* full-blown ANSI C */ +#define __CONCAT1(x,y) x ## y +#define __CONCAT(x,y) __CONCAT1(x,y) +#define __STRING(x) #x /* stringify without expanding x */ +#define __XSTRING(x) __STRING(x) /* expand x, then stringify */ +#else /* !(__STDC__ || __cplusplus) */ +#define __P(protos) () /* traditional C preprocessor */ +#define __CONCAT(x,y) x/**/y +#define __STRING(x) "x" +#endif /* !(__STDC__ || __cplusplus) */ + +#endif /* _COMPAT_FREEBSD_SYS_CDEFS_H_ */ diff --git a/usr/src/compat/freebsd/sys/cpuset.h b/usr/src/compat/freebsd/sys/cpuset.h new file mode 100644 index 0000000000..8527624b5e --- /dev/null +++ b/usr/src/compat/freebsd/sys/cpuset.h @@ -0,0 +1,44 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_CPUSET_H_ +#define _COMPAT_FREEBSD_SYS_CPUSET_H_ + +#define NOCPU -1 + +#ifdef _KERNEL +#define CPU_SET(cpu, set) CPUSET_ADD(*(set), cpu) +#define CPU_SETOF(cpu, set) CPUSET_ONLY(*(set), cpu) +#define CPU_ZERO(set) CPUSET_ZERO(*(set)) +#define CPU_CLR(cpu, set) CPUSET_DEL(*(set), cpu) +#define CPU_FFS(set) cpusetobj_ffs(set) +#define CPU_ISSET(cpu, set) CPU_IN_SET(*(set), cpu) +#define CPU_CMP(set1, set2) CPUSET_ISEQUAL(*(set1), *(set2)) +#define CPU_SET_ATOMIC(cpu, set) CPUSET_ATOMIC_ADD(*(set), cpu) + +#include + +int cpusetobj_ffs(const cpuset_t *set); +#else +#include + +typedef int cpuset_t; + +#define CPUSET(cpu) (1UL << (cpu)) + +#define CPU_SET_ATOMIC(cpu, set) atomic_set_int((set), CPUSET(cpu)) +#endif + +#endif /* _COMPAT_FREEBSD_SYS_CPUSET_H_ */ diff --git a/usr/src/compat/freebsd/sys/disk.h b/usr/src/compat/freebsd/sys/disk.h new file mode 100644 index 0000000000..c9bdc6a2d8 --- /dev/null +++ b/usr/src/compat/freebsd/sys/disk.h @@ -0,0 +1,19 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. 
+ */ + +#ifndef _COMPAT_FREEBSD_SYS_DISK_H_ +#define _COMPAT_FREEBSD_SYS_DISK_H_ + +#endif /* _COMPAT_FREEBSD_SYS_DISK_H_ */ diff --git a/usr/src/compat/freebsd/sys/endian.h b/usr/src/compat/freebsd/sys/endian.h new file mode 100644 index 0000000000..a31bff55d6 --- /dev/null +++ b/usr/src/compat/freebsd/sys/endian.h @@ -0,0 +1,125 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_ENDIAN_H_ +#define _COMPAT_FREEBSD_SYS_ENDIAN_H_ + +static __inline uint16_t +be16dec(const void *pp) +{ + uint8_t const *p = (uint8_t const *)pp; + + return ((p[0] << 8) | p[1]); +} + +static __inline uint32_t +be32dec(const void *pp) +{ + uint8_t const *p = (uint8_t const *)pp; + + return (((unsigned)p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3]); +} + +static __inline uint64_t +be64dec(const void *pp) +{ + uint8_t const *p = (uint8_t const *)pp; + + return (((uint64_t)be32dec(p) << 32) | be32dec(p + 4)); +} + +static __inline uint16_t +le16dec(const void *pp) +{ + uint8_t const *p = (uint8_t const *)pp; + + return ((p[1] << 8) | p[0]); +} + +static __inline uint32_t +le32dec(const void *pp) +{ + uint8_t const *p = (uint8_t const *)pp; + + return (((unsigned)p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0]); +} + +static __inline uint64_t +le64dec(const void *pp) +{ + uint8_t const *p = (uint8_t const *)pp; + + return (((uint64_t)le32dec(p + 4) << 32) | le32dec(p)); +} + +static __inline void +be16enc(void *pp, uint16_t u) +{ + uint8_t *p = (uint8_t *)pp; + + p[0] = (u >> 8) & 0xff; + p[1] = u & 0xff; +} + +static __inline void +be32enc(void *pp, uint32_t u) +{ + uint8_t *p = (uint8_t *)pp; + + p[0] = (u >> 24) & 0xff; + p[1] = (u >> 16) & 0xff; + p[2] = (u >> 8) & 0xff; + p[3] = u & 0xff; +} + +static __inline void +be64enc(void *pp, uint64_t u) +{ + uint8_t *p = (uint8_t *)pp; + + be32enc(p, (uint32_t)(u >> 32)); + be32enc(p + 4, (uint32_t)(u & 0xffffffffU)); +} + +static __inline void +le16enc(void *pp, uint16_t u) +{ + uint8_t *p = (uint8_t *)pp; + + p[0] = u & 0xff; + p[1] = (u >> 8) & 0xff; +} + +static __inline void +le32enc(void *pp, uint32_t u) +{ + uint8_t *p = (uint8_t *)pp; + + p[0] = u & 0xff; + p[1] = (u >> 8) & 0xff; + p[2] = (u >> 16) & 0xff; + p[3] = (u >> 24) & 0xff; +} + +static __inline void +le64enc(void *pp, uint64_t u) +{ + uint8_t *p = (uint8_t *)pp; + + le32enc(p, (uint32_t)(u & 0xffffffffU)); + le32enc(p + 4, (uint32_t)(u >> 32)); +} + +#endif /* _COMPAT_FREEBSD_SYS_ENDIAN_H_ */ diff --git a/usr/src/compat/freebsd/sys/errno.h b/usr/src/compat/freebsd/sys/errno.h new file mode 100644 index 0000000000..bd37f43065 --- /dev/null +++ b/usr/src/compat/freebsd/sys/errno.h @@ -0,0 +1,27 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. 
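+ *
+ * (Editorial note on the endian.h routines above, with a worked
+ * example:
+ *
+ *	uint8_t b[4] = { 0x12, 0x34, 0x56, 0x78 };
+ *	be32dec(b) == 0x12345678
+ *	le32dec(b) == 0x78563412
+ *
+ * and be32enc()/le32enc() are the byte-for-byte inverses.)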
+ */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_ERRNO_H_ +#define _COMPAT_FREEBSD_SYS_ERRNO_H_ + +#ifndef _KERNEL +extern int *___errno(); + +#define errno (*(___errno())) +#endif + +#include_next + +#endif /* _COMPAT_FREEBSD_SYS_ERRNO_H_ */ diff --git a/usr/src/compat/freebsd/sys/fcntl.h b/usr/src/compat/freebsd/sys/fcntl.h new file mode 100644 index 0000000000..062a3b84ac --- /dev/null +++ b/usr/src/compat/freebsd/sys/fcntl.h @@ -0,0 +1,23 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_FCNTL_H_ +#define _COMPAT_FREEBSD_SYS_FCNTL_H_ + +#define O_DIRECT 0x0 + +#include_next + +#endif /* _COMPAT_FREEBSD_SYS_FCNTL_H_ */ diff --git a/usr/src/compat/freebsd/sys/ioctl.h b/usr/src/compat/freebsd/sys/ioctl.h new file mode 100644 index 0000000000..e223e1e4c7 --- /dev/null +++ b/usr/src/compat/freebsd/sys/ioctl.h @@ -0,0 +1,22 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_IOCTL_H_ +#define _COMPAT_FREEBSD_SYS_IOCTL_H_ + +#include +#include_next + +#endif /* _COMPAT_FREEBSD_SYS_IOCTL_H_ */ diff --git a/usr/src/compat/freebsd/sys/kernel.h b/usr/src/compat/freebsd/sys/kernel.h new file mode 100644 index 0000000000..b1c07674e4 --- /dev/null +++ b/usr/src/compat/freebsd/sys/kernel.h @@ -0,0 +1,25 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_KERNEL_H_ +#define _COMPAT_FREEBSD_SYS_KERNEL_H_ + +#define SYSINIT(uniquifier, subsystem, order, func, ident) + +#include + +#define ticks ddi_get_lbolt() + +#endif /* _COMPAT_FREEBSD_SYS_KERNEL_H_ */ diff --git a/usr/src/compat/freebsd/sys/ktr.h b/usr/src/compat/freebsd/sys/ktr.h new file mode 100644 index 0000000000..96c499ef18 --- /dev/null +++ b/usr/src/compat/freebsd/sys/ktr.h @@ -0,0 +1,27 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. 
+ */ + +#ifndef _COMPAT_FREEBSD_SYS_KTR_H_ +#define _COMPAT_FREEBSD_SYS_KTR_H_ + +#define CTR0(m, format) +#define CTR1(m, format, p1) +#define CTR2(m, format, p1, p2) +#define CTR3(m, format, p1, p2, p3) +#define CTR4(m, format, p1, p2, p3, p4) +#define CTR5(m, format, p1, p2, p3, p4, p5) +#define CTR6(m, d, p1, p2, p3, p4, p5, p6) + +#endif /* _COMPAT_FREEBSD_SYS_KTR_H_ */ diff --git a/usr/src/compat/freebsd/sys/libkern.h b/usr/src/compat/freebsd/sys/libkern.h new file mode 100644 index 0000000000..94675a0d66 --- /dev/null +++ b/usr/src/compat/freebsd/sys/libkern.h @@ -0,0 +1,25 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_LIBKERN_H_ +#define _COMPAT_FREEBSD_SYS_LIBKERN_H_ + +#include + +#ifndef min +static __inline u_int min(u_int a, u_int b) { return (a < b ? a : b); } +#endif + +#endif /* _COMPAT_FREEBSD_SYS_LIBKERN_H_ */ diff --git a/usr/src/compat/freebsd/sys/limits.h b/usr/src/compat/freebsd/sys/limits.h new file mode 100644 index 0000000000..99ae0f4d64 --- /dev/null +++ b/usr/src/compat/freebsd/sys/limits.h @@ -0,0 +1,19 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_LIMITS_H_ +#define _COMPAT_FREEBSD_SYS_LIMITS_H_ + +#endif /* _COMPAT_FREEBSD_SYS_LIMITS_H_ */ diff --git a/usr/src/compat/freebsd/sys/malloc.h b/usr/src/compat/freebsd/sys/malloc.h new file mode 100644 index 0000000000..579df44533 --- /dev/null +++ b/usr/src/compat/freebsd/sys/malloc.h @@ -0,0 +1,44 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_MALLOC_H_ +#define _COMPAT_FREEBSD_SYS_MALLOC_H_ + +/* + * flags to malloc. + */ +#define M_NOWAIT 0x0001 /* do not block */ +#define M_WAITOK 0x0002 /* ok to block */ +#define M_ZERO 0x0100 /* bzero the allocation */ + +struct malloc_type { + const char *ks_shortdesc; /* Printable type name. 
*/ +}; + +#ifdef _KERNEL +#define MALLOC_DEFINE(type, shortdesc, longdesc) \ + struct malloc_type type[1] = { \ + { shortdesc } \ + } + +#define MALLOC_DECLARE(type) \ + extern struct malloc_type type[1] + +void free(void *addr, struct malloc_type *type); +void *malloc(unsigned long size, struct malloc_type *type, int flags); +void *old_malloc(unsigned long size, struct malloc_type *type , int flags); +#endif /* _KERNEL */ + +#endif /* _COMPAT_FREEBSD_SYS_MALLOC_H_ */ diff --git a/usr/src/compat/freebsd/sys/module.h b/usr/src/compat/freebsd/sys/module.h new file mode 100644 index 0000000000..87b73e3fa3 --- /dev/null +++ b/usr/src/compat/freebsd/sys/module.h @@ -0,0 +1,19 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_MODULE_H_ +#define _COMPAT_FREEBSD_SYS_MODULE_H_ + +#endif /* _COMPAT_FREEBSD_SYS_MODULE_H_ */ diff --git a/usr/src/compat/freebsd/sys/mutex.h b/usr/src/compat/freebsd/sys/mutex.h new file mode 100644 index 0000000000..b99884b652 --- /dev/null +++ b/usr/src/compat/freebsd/sys/mutex.h @@ -0,0 +1,81 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_MUTEX_H_ +#define _COMPAT_FREEBSD_SYS_MUTEX_H_ + +#ifdef _KERNEL + +#include + +#define MTX_DEF 0x00000000 +#define MTX_SPIN 0x00000001 + +struct mtx; + +void mtx_init(struct mtx *, char *name, const char *type_name, int opts); +void mtx_destroy(struct mtx *); + +int mtx_sleep(void *chan, struct mtx *mtx, int priority, const char *wmesg, + int timo); + +#endif /* KERNEL */ +#include_next +#ifdef _KERNEL + +struct mtx { + kmutex_type_t t; + kmutex_t m; +}; + +static __inline void mtx_lock(struct mtx *mtx) +{ + mutex_enter(&mtx->m); +} + +static __inline void mtx_unlock(struct mtx *mtx) +{ + mutex_exit(&mtx->m); +} + +static __inline void mtx_lock_spin(struct mtx *mtx) +{ + mutex_enter(&mtx->m); +} + +static __inline void mtx_unlock_spin(struct mtx *mtx) +{ + mutex_exit(&mtx->m); +} + +static __inline int mtx_owned(struct mtx *mtx) +{ + return (mutex_owned(&mtx->m)); +} + +#define MA_OWNED 0 + +static __inline void mtx_assert(struct mtx *mtx, int what) +{ + switch (what) { + case MA_OWNED: + ASSERT(mutex_owned(&mtx->m)); + break; + } +} + +#endif /* _KERNEL */ + +#endif /* _COMPAT_FREEBSD_SYS_MUTEX_H_ */ diff --git a/usr/src/compat/freebsd/sys/param.h b/usr/src/compat/freebsd/sys/param.h new file mode 100644 index 0000000000..f09e9183f6 --- /dev/null +++ b/usr/src/compat/freebsd/sys/param.h @@ -0,0 +1,48 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. 
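+ *
+ * (Editorial note on the malloc.h shim above: a caller defines a tag
+ * and allocates against it; M_EXAMPLE and len are hypothetical names:
+ *
+ *	MALLOC_DEFINE(M_EXAMPLE, "example", "example allocations");
+ *	void *p = malloc(len, M_EXAMPLE, M_WAITOK | M_ZERO);
+ *	free(p, M_EXAMPLE);
+ * )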
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_PARAM_H_ +#define _COMPAT_FREEBSD_SYS_PARAM_H_ + +#ifndef _KERNEL +#define MAXCOMLEN 16 +#endif +#define MAXHOSTNAMELEN 256 + +#ifdef _KERNEL +#include + +#ifndef FALSE +#define FALSE 0 +#endif +#ifndef TRUE +#define TRUE 1 +#endif +#endif + +#include + +#define nitems(x) (sizeof((x)) / sizeof((x)[0])) +#define rounddown(x,y) (((x)/(y))*(y)) +#define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) /* to any y */ +#define roundup2(x,y) (((x)+((y)-1))&(~((y)-1))) /* if y is powers of two */ + +/* Macros for min/max. */ +#define MIN(a,b) (((a)<(b))?(a):(b)) +#define MAX(a,b) (((a)>(b))?(a):(b)) + +#include_next + +#endif /* _COMPAT_FREEBSD_SYS_PARAM_H_ */ diff --git a/usr/src/compat/freebsd/sys/pcpu.h b/usr/src/compat/freebsd/sys/pcpu.h new file mode 100644 index 0000000000..f29c9c5018 --- /dev/null +++ b/usr/src/compat/freebsd/sys/pcpu.h @@ -0,0 +1,21 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_PCPU_H_ +#define _COMPAT_FREEBSD_SYS_PCPU_H_ + +#define curcpu (CPU->cpu_id) + +#endif /* _COMPAT_FREEBSD_SYS_PCPU_H_ */ diff --git a/usr/src/compat/freebsd/sys/sched.h b/usr/src/compat/freebsd/sys/sched.h new file mode 100644 index 0000000000..b426ee757e --- /dev/null +++ b/usr/src/compat/freebsd/sys/sched.h @@ -0,0 +1,19 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_SCHED_H_ +#define _COMPAT_FREEBSD_SYS_SCHED_H_ + +#endif /* _COMPAT_FREEBSD_SYS_SCHED_H_ */ diff --git a/usr/src/compat/freebsd/sys/select.h b/usr/src/compat/freebsd/sys/select.h new file mode 100644 index 0000000000..fcb40c23b1 --- /dev/null +++ b/usr/src/compat/freebsd/sys/select.h @@ -0,0 +1,23 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. 
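+ *
+ * (Editorial note on the sys/param.h macros above, worked examples:
+ *
+ *	roundup2(0x1234, 0x1000) == 0x2000	y must be a power of two
+ *	rounddown(7, 3) == 6
+ *	nitems(x) yields the element count of array x
+ * )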
+ */
+
+#ifndef _COMPAT_FREEBSD_SYS_SELECT_H_
+#define _COMPAT_FREEBSD_SYS_SELECT_H_
+
+void *memset(void *s, int c, size_t n);
+
+#include_next
+
+#endif /* _COMPAT_FREEBSD_SYS_SELECT_H_ */
diff --git a/usr/src/compat/freebsd/sys/smp.h b/usr/src/compat/freebsd/sys/smp.h
new file mode 100644
index 0000000000..46183e8677
--- /dev/null
+++ b/usr/src/compat/freebsd/sys/smp.h
@@ -0,0 +1,28 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_SYS_SMP_H_
+#define _COMPAT_FREEBSD_SYS_SMP_H_
+
+#include
+
+void smp_rendezvous(void (*)(void *),
+    void (*)(void *),
+    void (*)(void *),
+    void *arg);
+
+void ipi_cpu(int cpu, u_int ipi);
+
+#endif /* _COMPAT_FREEBSD_SYS_SMP_H_ */
diff --git a/usr/src/compat/freebsd/sys/sysctl.h b/usr/src/compat/freebsd/sys/sysctl.h
new file mode 100644
index 0000000000..9f6a695e34
--- /dev/null
+++ b/usr/src/compat/freebsd/sys/sysctl.h
@@ -0,0 +1,27 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_SYS_SYSCTL_H_
+#define _COMPAT_FREEBSD_SYS_SYSCTL_H_
+
+#define SYSCTL_DECL(name)
+
+#define SYSCTL_NODE(parent, nbr, name, access, handler, descr)
+
+#define SYSCTL_INT(parent, nbr, name, access, ptr, val, descr)
+#define SYSCTL_UINT(parent, nbr, name, access, ptr, val, descr)
+#define SYSCTL_ULONG(parent, nbr, name, access, ptr, val, descr)
+
+#endif /* _COMPAT_FREEBSD_SYS_SYSCTL_H_ */
diff --git a/usr/src/compat/freebsd/sys/systm.h b/usr/src/compat/freebsd/sys/systm.h
new file mode 100644
index 0000000000..e25acc0e4a
--- /dev/null
+++ b/usr/src/compat/freebsd/sys/systm.h
@@ -0,0 +1,53 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_SYS_SYSTM_H_
+#define _COMPAT_FREEBSD_SYS_SYSTM_H_
+
+#include
+#include
+#include
+#include
+
+struct mtx;
+
+#define KASSERT(exp,msg) do { \
+	if (!(exp)) \
+		panic msg; \
+} while (0)
+
+#define CTASSERT(x) _CTASSERT(x, __LINE__)
+#define _CTASSERT(x,y) __CTASSERT(x,y)
+#define __CTASSERT(x,y) typedef char __assert ## y[(x) ?
1 : -1] + +void critical_enter(void); +void critical_exit(void); + +int msleep_spin(void *chan, struct mtx *mutex, const char *wmesg, + int ticks); +void wakeup(void *chan); +void wakeup_one(void *chan); + +struct unrhdr *new_unrhdr(int low, int high, struct mtx *mutex); +void delete_unrhdr(struct unrhdr *uh); +int alloc_unr(struct unrhdr *uh); +void free_unr(struct unrhdr *uh, u_int item); + +#include + +#include_next +#include + +#endif /* _COMPAT_FREEBSD_SYS_SYSTM_H_ */ diff --git a/usr/src/compat/freebsd/sys/time.h b/usr/src/compat/freebsd/sys/time.h new file mode 100644 index 0000000000..f8f9da5cdf --- /dev/null +++ b/usr/src/compat/freebsd/sys/time.h @@ -0,0 +1,104 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_TIME_H_ +#define _COMPAT_FREEBSD_SYS_TIME_H_ + +#include_next + +#define tc_precexp 0 + +struct bintime { + ulong_t sec; /* seconds */ + uint64_t frac; /* 64 bit fraction of a second */ +}; + +#define BT2FREQ(bt) \ + (((uint64_t)0x8000000000000000 + ((bt)->frac >> 2)) / \ + ((bt)->frac >> 1)) + +#define FREQ2BT(freq, bt) \ +{ \ + (bt)->sec = 0; \ + (bt)->frac = ((uint64_t)0x8000000000000000 / (freq)) << 1; \ +} + +static __inline void +binuptime(struct bintime *bt) +{ + hrtime_t now = gethrtime(); + + bt->sec = now / 1000000000; + /* 18446744073 = int(2^64 / 1000000000) = 1ns in 64-bit fractions */ + bt->frac = (now % 1000000000) * (uint64_t)18446744073LL; +} + +#define bintime_cmp(a, b, cmp) \ + (((a)->sec == (b)->sec) ? \ + ((a)->frac cmp (b)->frac) : \ + ((a)->sec cmp (b)->sec)) + +#define SBT_1US (1000) + +static __inline void +bintime_add(struct bintime *bt, const struct bintime *bt2) +{ + uint64_t u; + + u = bt->frac; + bt->frac += bt2->frac; + if (u > bt->frac) + bt->sec++; + bt->sec += bt2->sec; +} + +static __inline void +bintime_sub(struct bintime *bt, const struct bintime *bt2) +{ + uint64_t u; + + u = bt->frac; + bt->frac -= bt2->frac; + if (u < bt->frac) + bt->sec--; + bt->sec -= bt2->sec; +} + +static __inline void +bintime_mul(struct bintime *bt, u_int x) +{ + uint64_t p1, p2; + + p1 = (bt->frac & 0xffffffffull) * x; + p2 = (bt->frac >> 32) * x + (p1 >> 32); + bt->sec *= x; + bt->sec += (p2 >> 32); + bt->frac = (p2 << 32) | (p1 & 0xffffffffull); +} + +static __inline sbintime_t +bttosbt(const struct bintime bt) +{ + return ((bt.sec * 1000000000) + + (((uint64_t)1000000000 * (uint32_t)(bt.frac >> 32)) >> 32)); +} + +static __inline sbintime_t +sbinuptime(void) +{ + return (gethrtime()); +} + +#endif /* _COMPAT_FREEBSD_SYS_TIME_H_ */ diff --git a/usr/src/compat/freebsd/sys/types.h b/usr/src/compat/freebsd/sys/types.h new file mode 100644 index 0000000000..6fc8179f2e --- /dev/null +++ b/usr/src/compat/freebsd/sys/types.h @@ -0,0 +1,74 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
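/*
 * Illustrative sketch (editor's addition, not part of the original change):
 * in struct bintime above, 'frac' counts units of 2^-64 seconds, and
 * 18446744073 is floor(2^64 / 10^9), i.e. one nanosecond in those units.
 * A standalone check of the conversion binuptime() performs:
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t half_sec_ns = 500000000;
	uint64_t frac = half_sec_ns * 18446744073ULL;

	/* prints a value very close to 0.5 */
	printf("%.9f\n", (double)frac / 18446744073709551616.0);
	return (0);
}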
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_TYPES_H_ +#define _COMPAT_FREEBSD_SYS_TYPES_H_ + +#include + +typedef __uint8_t u_int8_t; /* unsigned integrals (deprecated) */ +typedef __uint16_t u_int16_t; +typedef __uint32_t u_int32_t; +typedef __uint64_t u_int64_t; + +#ifndef __REGISTER_T_DEFINED +#define __REGISTER_T_DEFINED +typedef __register_t register_t; +#endif + +#ifndef __SBINTIME_T_DEFINED +#define __SBINTIME_T_DEFINED +typedef __int64_t sbintime_t; +#endif + +#ifndef __VM_MEMATTR_T_DEFINED +#define __VM_MEMATTR_T_DEFINED +typedef char vm_memattr_t; +#endif + +#ifndef __VM_OFFSET_T_DEFINED +#define __VM_OFFSET_T_DEFINED +typedef __vm_offset_t vm_offset_t; +#endif + +#ifndef __VM_OOFFSET_T_DEFINED +#define __VM_OOFFSET_T_DEFINED +typedef __vm_ooffset_t vm_ooffset_t; +#endif + +#ifndef __VM_PADDR_T_DEFINED +#define __VM_PADDR_T_DEFINED +typedef __vm_paddr_t vm_paddr_t; +#endif + +#ifndef __VM_MEMATTR_T_DEFINED +#define __VM_MEMATTR_T_DEFINED +typedef char vm_memattr_t; +#endif + +#ifndef __bool_true_false_are_defined +#define __bool_true_false_are_defined 1 +#define false 0 +#define true 1 +typedef _Bool bool; +#endif + +#if defined(_KERNEL) && !defined(offsetof) +#define offsetof(s, m) ((size_t)(&(((s *)0)->m))) +#endif + +#include_next + +#endif /* _COMPAT_FREEBSD_SYS_TYPES_H_ */ diff --git a/usr/src/compat/freebsd/sys/uio.h b/usr/src/compat/freebsd/sys/uio.h new file mode 100644 index 0000000000..05c6f2a028 --- /dev/null +++ b/usr/src/compat/freebsd/sys/uio.h @@ -0,0 +1,26 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_UIO_H_ +#define _COMPAT_FREEBSD_SYS_UIO_H_ + +#include_next + +#ifndef _KERNEL +ssize_t preadv(int, const struct iovec *, int, off_t); +ssize_t pwritev(int, const struct iovec *, int, off_t); +#endif + +#endif /* _COMPAT_FREEBSD_SYS_UIO_H_ */ diff --git a/usr/src/compat/freebsd/termios.h b/usr/src/compat/freebsd/termios.h new file mode 100644 index 0000000000..feaa705358 --- /dev/null +++ b/usr/src/compat/freebsd/termios.h @@ -0,0 +1,23 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. 
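/*
 * Illustrative sketch (editor's addition, not part of the original change):
 * the kernel offsetof() above is the classic null-pointer formulation;
 * combined with CTASSERT from the compat sys/systm.h it can pin down a
 * structure layout at compile time. vd_example is a hypothetical struct.
 */
struct vd_example {
	uint64_t	vd_addr;
	uint32_t	vd_len;
	uint16_t	vd_flags;
	uint16_t	vd_next;
};
CTASSERT(offsetof(struct vd_example, vd_len) == 8);
CTASSERT(sizeof (struct vd_example) == 16);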
+ */ + +#ifndef _COMPAT_FREEBSD_TERMIOS_H_ +#define _COMPAT_FREEBSD_TERMIOS_H_ + +#include_next + +void cfmakeraw(struct termios *); + +#endif /* _COMPAT_FREEBSD_TERMIOS_H_ */ diff --git a/usr/src/compat/freebsd/uuid.h b/usr/src/compat/freebsd/uuid.h new file mode 100644 index 0000000000..72ef2c7787 --- /dev/null +++ b/usr/src/compat/freebsd/uuid.h @@ -0,0 +1,55 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_UUID_H_ +#define _COMPAT_FREEBSD_UUID_H_ + +#include +#include + +/* Status codes returned by the functions. */ +#define uuid_s_ok 0 +#define uuid_s_bad_version 1 +#define uuid_s_invalid_string_uuid 2 + +static __inline void +uuid_from_string(char *str, uuid_t *uuidp, uint32_t *status) +{ + if (uuid_parse(str, *uuidp) == 0) { + *status = uuid_s_ok; + } else { + *status = uuid_s_invalid_string_uuid; + } +} + +static __inline void +uuid_enc_le(void *buf, uuid_t *uuidp) +{ + uchar_t *p; + int i; + + p = buf; + be32enc(p, ((struct uuid *)uuidp)->time_low); + be16enc(p + 4, ((struct uuid *)uuidp)->time_mid); + be16enc(p + 6, ((struct uuid *)uuidp)->time_hi_and_version); + p[8] = ((struct uuid *)uuidp)->clock_seq_hi_and_reserved; + p[9] = ((struct uuid *)uuidp)->clock_seq_low; + + for (i = 0; i < 6; i++) + p[10 + i] = ((struct uuid *)uuidp)->node_addr[i]; + +} + +#endif /* _COMPAT_FREEBSD_UUID_H_ */ diff --git a/usr/src/compat/freebsd/vm/pmap.h b/usr/src/compat/freebsd/vm/pmap.h new file mode 100644 index 0000000000..5958c4b101 --- /dev/null +++ b/usr/src/compat/freebsd/vm/pmap.h @@ -0,0 +1,21 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_VM_PMAP_H_ +#define _COMPAT_FREEBSD_VM_PMAP_H_ + +#include + +#endif /* _COMPAT_FREEBSD_VM_PMAP_H_ */ diff --git a/usr/src/compat/freebsd/vm/vm.h b/usr/src/compat/freebsd/vm/vm.h new file mode 100644 index 0000000000..7da22099b6 --- /dev/null +++ b/usr/src/compat/freebsd/vm/vm.h @@ -0,0 +1,39 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. 
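/*
 * Illustrative sketch (editor's addition, not part of the original change):
 * typical use of the uuid_from_string() wrapper above, which maps illumos
 * uuid_parse() onto the FreeBSD-style status-code interface.
 */
static int
parse_guest_uuid(const char *str, uuid_t *uuidp)
{
	uint32_t status;

	uuid_from_string((char *)str, uuidp, &status);
	return (status == uuid_s_ok ? 0 : -1);
}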
+ */ + +#ifndef _FREEBSD_VM_VM_H_ +#define _FREEBSD_VM_VM_H_ + +#include + +typedef u_char vm_prot_t; + +#define VM_PROT_NONE ((vm_prot_t) 0x00) +#define VM_PROT_READ ((vm_prot_t) 0x01) +#define VM_PROT_WRITE ((vm_prot_t) 0x02) +#define VM_PROT_EXECUTE ((vm_prot_t) 0x04) + +#define VM_PROT_ALL (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE) +#define VM_PROT_RW (VM_PROT_READ|VM_PROT_WRITE) + +/* + * contains a troublesome preprocessor define for BYTE. + * Do this ugly workaround to avoid it. + */ +#define _SYS_PROMIF_H +#include +#undef _SYS_PROMIF_H + +#endif /* _FREEBSD_VM_VM_H_ */ diff --git a/usr/src/compat/freebsd/x86/_types.h b/usr/src/compat/freebsd/x86/_types.h new file mode 100644 index 0000000000..a07fc017ad --- /dev/null +++ b/usr/src/compat/freebsd/x86/_types.h @@ -0,0 +1,49 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _FREEBSD_X86__TYPES_H_ +#define _FREEBSD_X86__TYPES_H_ + +/* + * Basic types upon which most other types are built. + */ +typedef signed char __int8_t; +typedef unsigned char __uint8_t; +typedef short __int16_t; +typedef unsigned short __uint16_t; +typedef int __int32_t; +typedef unsigned int __uint32_t; +#ifdef _LP64 +typedef long __int64_t; +typedef unsigned long __uint64_t; +#else +typedef long long __int64_t; +typedef unsigned long long __uint64_t; +#endif + +/* + * Standard type definitions. + */ +#ifdef _LP64 +typedef __int64_t __register_t; +typedef __uint64_t __vm_offset_t; +typedef __uint64_t __vm_paddr_t; +typedef __int64_t __vm_ooffset_t; +#else +typedef __int32_t __register_t; +typedef __uint32_t __vm_paddr_t; +#endif + +#endif /* _FREEBSD_X86__TYPES_H_ */ diff --git a/usr/src/compat/freebsd/x86/segments.h b/usr/src/compat/freebsd/x86/segments.h new file mode 100644 index 0000000000..bc6ba976b8 --- /dev/null +++ b/usr/src/compat/freebsd/x86/segments.h @@ -0,0 +1,28 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Pluribus Networks Inc. + */ + +#ifndef _COMPAT_FREEBSD_X86_SEGMENTS_H_ +#define _COMPAT_FREEBSD_X86_SEGMENTS_H_ + +/* + * Entries in the Interrupt Descriptor Table (IDT) + */ +#define IDT_BP 3 /* #BP: Breakpoint */ +#define IDT_UD 6 /* #UD: Undefined/Invalid Opcode */ +#define IDT_SS 12 /* #SS: Stack Segment Fault */ +#define IDT_GP 13 /* #GP: General Protection Fault */ +#define IDT_AC 17 /* #AC: Alignment Check */ + +#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_SEGMENTS_H_ */ diff --git a/usr/src/head/bhyve.h b/usr/src/head/bhyve.h new file mode 100644 index 0000000000..8c79ca1ccc --- /dev/null +++ b/usr/src/head/bhyve.h @@ -0,0 +1,25 @@ +/* + * COPYRIGHT 2013 Pluribus Networks Inc. + * + * All rights reserved. 
This copyright notice is Copyright Management + * Information under 17 USC 1202 and is included to protect this work and + * deter copyright infringement. Removal or alteration of this Copyright + * Management Information without the express written permission from + * Pluribus Networks Inc is prohibited, and any such unauthorized removal + * or alteration will be a violation of federal law. + */ +#ifndef _BHYVE_H +#define _BHYVE_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define BHYVE_TMPDIR "/var/run/bhyve" +#define BHYVE_CONS_SOCKPATH BHYVE_TMPDIR "/%s.console_sock" + +#ifdef __cplusplus +} +#endif + +#endif /* _BHYVE_H */ diff --git a/usr/src/lib/libvmmapi/Makefile b/usr/src/lib/libvmmapi/Makefile new file mode 100644 index 0000000000..60621fcb75 --- /dev/null +++ b/usr/src/lib/libvmmapi/Makefile @@ -0,0 +1,49 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2013 Pluribus Networks Inc. +# + +include ../Makefile.lib + +HDRS = vmmapi.h + +HDRDIR = common + +$(BUILD64)SUBDIRS += $(MACH64) + +all:= TARGET= all +install:= TARGET= install +clean:= TARGET= clean +clobber:= TARGET= clobber +lint:= TARGET= lint +_msg:= TARGET= _msg + +.KEEP_STATE: + +all install clean clobber lint: $(SUBDIRS) + +# install rule for install_h target + +install_h: $(ROOTHDRS) + +check: $(CHECKHDRS) + +_msg: $(MSGSUBDIRS) + +$(SUBDIRS): FRC + cd $@; pwd; $(MAKE) CW_NO_SHADOW=true __GNUC= $(TARGET) + +FRC: + +include ../Makefile.targ +include ../../Makefile.msg.targ diff --git a/usr/src/lib/libvmmapi/Makefile.com b/usr/src/lib/libvmmapi/Makefile.com new file mode 100644 index 0000000000..e41a82f9a2 --- /dev/null +++ b/usr/src/lib/libvmmapi/Makefile.com @@ -0,0 +1,53 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2013 Pluribus Networks Inc. 
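/*
 * Illustrative sketch (editor's addition, not part of the original change):
 * BHYVE_CONS_SOCKPATH in bhyve.h above is a printf-style template which a
 * consumer expands with the VM name.
 */
#include <stdio.h>

static void
cons_sock_path(const char *vmname, char *buf, size_t buflen)
{
	/* e.g. "guest0" -> "/var/run/bhyve/guest0.console_sock" */
	(void) snprintf(buf, buflen, BHYVE_CONS_SOCKPATH, vmname);
}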
+# + +LIBRARY = libvmmapi.a +VERS = .1 + +OBJECTS = vmmapi.o expand_number.o + +# include library definitions +include ../../Makefile.lib + +# install this library in the root filesystem +include ../../Makefile.rootfs + +SRCDIR = ../common + +LIBS = $(DYNLIB) $(LINTLIB) + +CPPFLAGS = -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd \ + $(CPPFLAGS.master) -I$(SRC)/uts/i86pc + +$(LINTLIB) := SRCS = $(SRCDIR)/$(LINTSRC) + +LDLIBS += -lc + +.KEEP_STATE: + +all: $(LIBS) + +lint: lintcheck + +pics/%.o: $(CONTRIB)/freebsd/lib/libutil/%.c + $(COMPILE.c) -o $@ $< + $(POST_PROCESS_O) + +pics/%.o: ../common/%.c + $(COMPILE.c) -o $@ $< + $(POST_PROCESS_O) + +# include library targets +include ../../Makefile.targ diff --git a/usr/src/lib/libvmmapi/amd64/Makefile b/usr/src/lib/libvmmapi/amd64/Makefile new file mode 100644 index 0000000000..b5cac1ffce --- /dev/null +++ b/usr/src/lib/libvmmapi/amd64/Makefile @@ -0,0 +1,21 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2013 Pluribus Networks Inc. +# + +include ../Makefile.com +include ../../Makefile.lib.64 + +CPPFLAGS += -I$(COMPAT)/freebsd/amd64 -I$(CONTRIB)/freebsd/amd64 + +install: all $(ROOTLIBS64) $(ROOTLINKS64) $(ROOTLINT64) diff --git a/usr/src/lib/libvmmapi/common/llib-lvmmapi b/usr/src/lib/libvmmapi/common/llib-lvmmapi new file mode 100644 index 0000000000..221ed3a23e --- /dev/null +++ b/usr/src/lib/libvmmapi/common/llib-lvmmapi @@ -0,0 +1,2 @@ +/* LINTLIBRARY */ +/* PROTOLIB1 */ diff --git a/usr/src/lib/libvmmapi/common/mapfile-vers b/usr/src/lib/libvmmapi/common/mapfile-vers new file mode 100644 index 0000000000..7a8443a2b8 --- /dev/null +++ b/usr/src/lib/libvmmapi/common/mapfile-vers @@ -0,0 +1,77 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2013 Pluribus Networks Inc. +# + +# +# MAPFILE HEADER START +# +# WARNING: STOP NOW. DO NOT MODIFY THIS FILE. +# Object versioning must comply with the rules detailed in +# +# usr/src/lib/README.mapfiles +# +# You should not be making modifications here until you've read the most current +# copy of that file. If you need help, contact a gatekeeper for guidance. 
+# +# MAPFILE HEADER END +# + +SUNWprivate_1.0 { + global: + vcpu_reset; + vm_activate_cpu; + vm_apicid2vcpu; + vm_capability_name2type; + vm_capability_type2name; + vm_copy_setup; + vm_copy_teardown; + vm_copyin; + vm_copyout; + vm_create; + vm_destroy; + vm_get_capability; + vm_get_desc; + vm_get_highmem_size; + vm_get_lowmem_limit; + vm_get_lowmem_size; + vm_get_memory_seg; + vm_get_register; + vm_get_seg_desc; + vm_get_x2apic_state; + vm_gla2gpa; + vm_inject_exception; + vm_isa_assert_irq; + vm_isa_deassert_irq; + vm_isa_pulse_irq; + vm_isa_set_irq_trigger; + vm_ioapic_assert_irq; + vm_ioapic_deassert_irq; + vm_ioapic_pincount; + vm_ioapic_pulse_irq; + vm_lapic_irq; + vm_lapic_msi; + vm_map_gpa; + vm_open; + vm_parse_memsize; + vm_restart_instruction; + vm_run; + vm_set_capability; + vm_set_desc; + vm_set_register; + vm_set_x2apic_state; + vm_setup_memory; + vm_setup_rom; + local: + *; +}; diff --git a/usr/src/lib/libvmmapi/common/vmmapi.c b/usr/src/lib/libvmmapi/common/vmmapi.c new file mode 100644 index 0000000000..bbab3961a9 --- /dev/null +++ b/usr/src/lib/libvmmapi/common/vmmapi.c @@ -0,0 +1,1257 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/lib/libvmmapi/vmmapi.c 280929 2015-04-01 00:15:31Z tychon $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. 
+ */ + +#include +__FBSDID("$FreeBSD: head/lib/libvmmapi/vmmapi.c 280929 2015-04-01 00:15:31Z tychon $"); + +#include +#include +#include +#include +#include +#include + +#include + +#ifndef __FreeBSD__ +#include +#endif +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#ifndef __FreeBSD__ +#include +#endif + +#include "vmmapi.h" + +#define KB (1024UL) +#define MB (1024 * 1024UL) +#define GB (1024 * 1024 * 1024UL) + +struct vmctx { + int fd; + uint32_t lowmem_limit; + enum vm_mmap_style vms; + char *lowermem_addr; + char *biosmem_addr; + size_t lowmem; + char *lowmem_addr; + size_t highmem; + char *highmem_addr; + uint64_t rombase; + uint64_t romlimit; + char *rom_addr; + char *name; +}; + +#ifdef __FreeBSD__ +#define CREATE(x) sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x))) +#define DESTROY(x) sysctlbyname("hw.vmm.destroy", NULL, NULL, (x), strlen((x))) +#else +#define CREATE(x) vmm_vm_create(x) +#define DESTROY(x) vmm_vm_destroy(x) +#endif + +static int +vm_device_open(const char *name) +{ + int fd, len; + char *vmfile; + +#ifdef __FreeBSD__ + len = strlen("/dev/vmm/") + strlen(name) + 1; +#else + len = strlen("/devices/pseudo/vmm@0:") + strlen(name) + 1; +#endif + vmfile = malloc(len); + assert(vmfile != NULL); +#ifdef __FreeBSD__ + snprintf(vmfile, len, "/dev/vmm/%s", name); +#else + snprintf(vmfile, len, "/devices/pseudo/vmm@0:%s", name); +#endif + + /* Open the device file */ + fd = open(vmfile, O_RDWR, 0); + + free(vmfile); + return (fd); +} + +#ifndef __FreeBSD__ +static int +vmm_vm_create(const char *name) +{ + const char vmm_ctl[] = "/devices/pseudo/vmm@0:ctl"; + struct vmm_ioctl vi; + int err = 0; + int ctl_fd; + + (void) strlcpy(vi.vmm_name, name, sizeof (vi.vmm_name) - 1); + + ctl_fd = open(vmm_ctl, O_EXCL | O_RDWR); + if (ctl_fd == -1) { + err = errno; + if ((errno == EPERM) || (errno == EACCES)) { + fprintf(stderr, "you do not have permission to " + "perform that operation.\n"); + } else { + fprintf(stderr, "open: %s: %s\n", vmm_ctl, + strerror(errno)); + } + return (err); + } + if (ioctl(ctl_fd, VMM_CREATE_VM, &vi) == -1) { + err = errno; + fprintf(stderr, "couldn't create vm \"%s\"", name); + } + close (ctl_fd); + + return (err); +} +#endif + +int +vm_create(const char *name) +{ + + return (CREATE((char *)name)); +} + +struct vmctx * +vm_open(const char *name) +{ + struct vmctx *vm; + + vm = malloc(sizeof(struct vmctx) + strlen(name) + 1); + assert(vm != NULL); + + vm->fd = -1; + vm->lowmem_limit = 3 * GB; + vm->name = (char *)(vm + 1); + strcpy(vm->name, name); + + if ((vm->fd = vm_device_open(vm->name)) < 0) + goto err; + + return (vm); +err: + (void) vm_destroy(vm); + return (NULL); +} + +#ifndef __FreeBSD__ +static int +vmm_vm_destroy(const char *name) +{ + const char vmm_ctl[] = "/devices/pseudo/vmm@0:ctl"; + struct vmm_ioctl vi; + int ctl_fd; + int err = 0; + + (void) strlcpy(vi.vmm_name, name, sizeof (vi.vmm_name) - 1); + + ctl_fd = open(vmm_ctl, O_EXCL | O_RDWR); + if (ctl_fd == -1) { + err = errno; + if ((errno == EPERM) || (errno == EACCES)) { + fprintf(stderr, "you do not have permission to " + "perform that operation.\n"); + } else { + fprintf(stderr, "open: %s: %s\n", vmm_ctl, + strerror(errno)); + } + return (err); + } + if (ioctl(ctl_fd, VMM_DESTROY_VM, &vi) == -1) { + err = errno; + fprintf(stderr, "couldn't destroy vm \"%s\"", name); + } + close (ctl_fd); + return (err); +} +#endif + +int +vm_destroy(struct vmctx *vm) +{ + int err; + assert(vm != NULL); + + if (vm->fd >= 0) + close(vm->fd); + err = 
DESTROY(vm->name); + + free(vm); + return (err); +} + +int +vm_parse_memsize(const char *optarg, size_t *ret_memsize) +{ + char *endptr; + size_t optval; + int error; + + optval = strtoul(optarg, &endptr, 0); + if (*optarg != '\0' && *endptr == '\0') { + /* + * For the sake of backward compatibility if the memory size + * specified on the command line is less than a megabyte then + * it is interpreted as being in units of MB. + */ + if (optval < MB) + optval *= MB; + *ret_memsize = optval; + error = 0; + } else + error = expand_number(optarg, ret_memsize); + + return (error); +} + +#ifdef __FreeBSD__ +size_t +vmm_get_mem_total(void) +{ + size_t mem_total = 0; + size_t oldlen = sizeof(mem_total); + int error; + error = sysctlbyname("hw.vmm.mem_total", &mem_total, &oldlen, NULL, 0); + if (error) + return -1; + return mem_total; +} + +size_t +vmm_get_mem_free(void) +{ + size_t mem_free = 0; + size_t oldlen = sizeof(mem_free); + int error; + error = sysctlbyname("hw.vmm.mem_free", &mem_free, &oldlen, NULL, 0); + if (error) + return -1; + return mem_free; +} +#endif + +int +vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len, + int *wired) +{ + int error; + struct vm_memory_segment seg; + + bzero(&seg, sizeof(seg)); + seg.gpa = gpa; + error = ioctl(ctx->fd, VM_GET_MEMORY_SEG, &seg); + *ret_len = seg.len; + if (wired != NULL) + *wired = seg.wired; + return (error); +} + +uint32_t +vm_get_lowmem_limit(struct vmctx *ctx) +{ + + return (ctx->lowmem_limit); +} + +void +vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit) +{ + + ctx->lowmem_limit = limit; +} + +static int +setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char **addr) +{ + int error; + struct vm_memory_segment seg; + + /* + * Create and optionally map 'len' bytes of memory at guest + * physical address 'gpa' + */ + bzero(&seg, sizeof(seg)); + seg.gpa = gpa; + seg.len = len; + error = ioctl(ctx->fd, VM_MAP_MEMORY, &seg); + if (error == 0 && addr != NULL) { + *addr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, + ctx->fd, gpa); + } + return (error); +} + +int +vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms) +{ + char **addr; + int error; + + /* XXX VM_MMAP_SPARSE not implemented yet */ + assert(vms == VM_MMAP_NONE || vms == VM_MMAP_ALL); + ctx->vms = vms; + + /* + * If 'memsize' cannot fit entirely in the 'lowmem' segment then + * create another 'highmem' segment above 4GB for the remainder. + */ + if (memsize > ctx->lowmem_limit) { + ctx->lowmem = ctx->lowmem_limit; + ctx->highmem = memsize - ctx->lowmem; + } else { + ctx->lowmem = memsize; + ctx->highmem = 0; + } + + if (ctx->lowmem > 0) { + addr = (vms == VM_MMAP_ALL) ? &ctx->lowermem_addr : NULL; + error = setup_memory_segment(ctx, 0, 640*KB, addr); + if (error) + return (error); + + addr = (vms == VM_MMAP_ALL) ? &ctx->biosmem_addr : NULL; + error = setup_memory_segment(ctx, 768*KB, 256*KB, addr); + if (error) + return (error); + + addr = (vms == VM_MMAP_ALL) ? &ctx->lowmem_addr : NULL; + error = setup_memory_segment(ctx, 1*MB, ctx->lowmem - 1*MB, addr); + if (error) + return (error); + } + + if (ctx->highmem > 0) { + addr = (vms == VM_MMAP_ALL) ? 
&ctx->highmem_addr : NULL; + error = setup_memory_segment(ctx, 4*GB, ctx->highmem, addr); + if (error) + return (error); + } + + return (0); +} + +int +vm_setup_rom(struct vmctx *ctx, vm_paddr_t gpa, size_t len) +{ + ctx->rombase = gpa; + ctx->romlimit = gpa + len; + + return (setup_memory_segment(ctx, gpa, len, &ctx->rom_addr)); +} + +void * +vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len) +{ + + /* XXX VM_MMAP_SPARSE not implemented yet */ + assert(ctx->vms == VM_MMAP_ALL); + + if (gaddr + len <= 1*MB) { + if (gaddr + len <= 640*KB) + return ((void *)(ctx->lowermem_addr + gaddr)); + + if (768*KB <= gaddr && gaddr + len <= 1*MB) { + gaddr -= 768*KB; + return ((void *)(ctx->biosmem_addr + gaddr)); + } + + return (NULL); + } + + if (gaddr < ctx->lowmem && gaddr + len <= ctx->lowmem) { + gaddr -= 1*MB; + return ((void *)(ctx->lowmem_addr + gaddr)); + } + + if (ctx->rombase <= gaddr && gaddr + len <= ctx->romlimit) { + gaddr -= ctx->rombase; + return ((void *)(ctx->rom_addr + gaddr)); + } + + if (gaddr >= 4*GB) { + gaddr -= 4*GB; + if (gaddr < ctx->highmem && gaddr + len <= ctx->highmem) + return ((void *)(ctx->highmem_addr + gaddr)); + } + + return (NULL); +} + +size_t +vm_get_lowmem_size(struct vmctx *ctx) +{ + + return (ctx->lowmem); +} + +size_t +vm_get_highmem_size(struct vmctx *ctx) +{ + + return (ctx->highmem); +} + +int +vm_set_desc(struct vmctx *ctx, int vcpu, int reg, + uint64_t base, uint32_t limit, uint32_t access) +{ + int error; + struct vm_seg_desc vmsegdesc; + + bzero(&vmsegdesc, sizeof(vmsegdesc)); + vmsegdesc.cpuid = vcpu; + vmsegdesc.regnum = reg; + vmsegdesc.desc.base = base; + vmsegdesc.desc.limit = limit; + vmsegdesc.desc.access = access; + + error = ioctl(ctx->fd, VM_SET_SEGMENT_DESCRIPTOR, &vmsegdesc); + return (error); +} + +int +vm_get_desc(struct vmctx *ctx, int vcpu, int reg, + uint64_t *base, uint32_t *limit, uint32_t *access) +{ + int error; + struct vm_seg_desc vmsegdesc; + + bzero(&vmsegdesc, sizeof(vmsegdesc)); + vmsegdesc.cpuid = vcpu; + vmsegdesc.regnum = reg; + + error = ioctl(ctx->fd, VM_GET_SEGMENT_DESCRIPTOR, &vmsegdesc); + if (error == 0) { + *base = vmsegdesc.desc.base; + *limit = vmsegdesc.desc.limit; + *access = vmsegdesc.desc.access; + } + return (error); +} + +int +vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *seg_desc) +{ + int error; + + error = vm_get_desc(ctx, vcpu, reg, &seg_desc->base, &seg_desc->limit, + &seg_desc->access); + return (error); +} + +int +vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val) +{ + int error; + struct vm_register vmreg; + + bzero(&vmreg, sizeof(vmreg)); + vmreg.cpuid = vcpu; + vmreg.regnum = reg; + vmreg.regval = val; + + error = ioctl(ctx->fd, VM_SET_REGISTER, &vmreg); + return (error); +} + +int +vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *ret_val) +{ + int error; + struct vm_register vmreg; + + bzero(&vmreg, sizeof(vmreg)); + vmreg.cpuid = vcpu; + vmreg.regnum = reg; + + error = ioctl(ctx->fd, VM_GET_REGISTER, &vmreg); + *ret_val = vmreg.regval; + return (error); +} + +int +vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit) +{ + int error; + struct vm_run vmrun; + + bzero(&vmrun, sizeof(vmrun)); + vmrun.cpuid = vcpu; + + error = ioctl(ctx->fd, VM_RUN, &vmrun); + bcopy(&vmrun.vm_exit, vmexit, sizeof(struct vm_exit)); + return (error); +} + +static int +vm_inject_exception_real(struct vmctx *ctx, int vcpu, int vector, + int error_code, int error_code_valid) +{ + struct vm_exception exc; + + bzero(&exc, sizeof(exc)); + exc.cpuid = 
vcpu; + exc.vector = vector; + exc.error_code = error_code; + exc.error_code_valid = error_code_valid; + + return (ioctl(ctx->fd, VM_INJECT_EXCEPTION, &exc)); +} + +int +vm_inject_exception(struct vmctx *ctx, int vcpu, int vector, int errcode_valid, + uint32_t errcode, int restart_instruction) +{ + struct vm_exception exc; + + exc.cpuid = vcpu; + exc.vector = vector; + exc.error_code = errcode; + exc.error_code_valid = errcode_valid; + exc.restart_instruction = restart_instruction; + + return (ioctl(ctx->fd, VM_INJECT_EXCEPTION, &exc)); +} + +int +vm_apicid2vcpu(struct vmctx *ctx, int apicid) +{ + /* + * The apic id associated with the 'vcpu' has the same numerical value + * as the 'vcpu' itself. + */ + return (apicid); +} + +int +vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector) +{ + struct vm_lapic_irq vmirq; + + bzero(&vmirq, sizeof(vmirq)); + vmirq.cpuid = vcpu; + vmirq.vector = vector; + + return (ioctl(ctx->fd, VM_LAPIC_IRQ, &vmirq)); +} + +int +vm_lapic_local_irq(struct vmctx *ctx, int vcpu, int vector) +{ + struct vm_lapic_irq vmirq; + + bzero(&vmirq, sizeof(vmirq)); + vmirq.cpuid = vcpu; + vmirq.vector = vector; + + return (ioctl(ctx->fd, VM_LAPIC_LOCAL_IRQ, &vmirq)); +} + +int +vm_lapic_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg) +{ + struct vm_lapic_msi vmmsi; + + bzero(&vmmsi, sizeof(vmmsi)); + vmmsi.addr = addr; + vmmsi.msg = msg; + + return (ioctl(ctx->fd, VM_LAPIC_MSI, &vmmsi)); +} + +int +vm_ioapic_assert_irq(struct vmctx *ctx, int irq) +{ + struct vm_ioapic_irq ioapic_irq; + + bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq)); + ioapic_irq.irq = irq; + + return (ioctl(ctx->fd, VM_IOAPIC_ASSERT_IRQ, &ioapic_irq)); +} + +int +vm_ioapic_deassert_irq(struct vmctx *ctx, int irq) +{ + struct vm_ioapic_irq ioapic_irq; + + bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq)); + ioapic_irq.irq = irq; + + return (ioctl(ctx->fd, VM_IOAPIC_DEASSERT_IRQ, &ioapic_irq)); +} + +int +vm_ioapic_pulse_irq(struct vmctx *ctx, int irq) +{ + struct vm_ioapic_irq ioapic_irq; + + bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq)); + ioapic_irq.irq = irq; + + return (ioctl(ctx->fd, VM_IOAPIC_PULSE_IRQ, &ioapic_irq)); +} + +int +vm_ioapic_pincount(struct vmctx *ctx, int *pincount) +{ + + return (ioctl(ctx->fd, VM_IOAPIC_PINCOUNT, pincount)); +} + +int +vm_isa_assert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq) +{ + struct vm_isa_irq isa_irq; + + bzero(&isa_irq, sizeof(struct vm_isa_irq)); + isa_irq.atpic_irq = atpic_irq; + isa_irq.ioapic_irq = ioapic_irq; + + return (ioctl(ctx->fd, VM_ISA_ASSERT_IRQ, &isa_irq)); +} + +int +vm_isa_deassert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq) +{ + struct vm_isa_irq isa_irq; + + bzero(&isa_irq, sizeof(struct vm_isa_irq)); + isa_irq.atpic_irq = atpic_irq; + isa_irq.ioapic_irq = ioapic_irq; + + return (ioctl(ctx->fd, VM_ISA_DEASSERT_IRQ, &isa_irq)); +} + +int +vm_isa_pulse_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq) +{ + struct vm_isa_irq isa_irq; + + bzero(&isa_irq, sizeof(struct vm_isa_irq)); + isa_irq.atpic_irq = atpic_irq; + isa_irq.ioapic_irq = ioapic_irq; + + return (ioctl(ctx->fd, VM_ISA_PULSE_IRQ, &isa_irq)); +} + +int +vm_isa_set_irq_trigger(struct vmctx *ctx, int atpic_irq, + enum vm_intr_trigger trigger) +{ + struct vm_isa_irq_trigger isa_irq_trigger; + + bzero(&isa_irq_trigger, sizeof(struct vm_isa_irq_trigger)); + isa_irq_trigger.atpic_irq = atpic_irq; + isa_irq_trigger.trigger = trigger; + + return (ioctl(ctx->fd, VM_ISA_SET_IRQ_TRIGGER, &isa_irq_trigger)); +} + +int +vm_inject_nmi(struct vmctx *ctx, int vcpu) +{ + 
struct vm_nmi vmnmi; + + bzero(&vmnmi, sizeof(vmnmi)); + vmnmi.cpuid = vcpu; + + return (ioctl(ctx->fd, VM_INJECT_NMI, &vmnmi)); +} + +static struct { + const char *name; + int type; +} capstrmap[] = { + { "hlt_exit", VM_CAP_HALT_EXIT }, + { "mtrap_exit", VM_CAP_MTRAP_EXIT }, + { "pause_exit", VM_CAP_PAUSE_EXIT }, + { "unrestricted_guest", VM_CAP_UNRESTRICTED_GUEST }, + { "enable_invpcid", VM_CAP_ENABLE_INVPCID }, + { 0 } +}; + +int +vm_capability_name2type(const char *capname) +{ + int i; + + for (i = 0; capstrmap[i].name != NULL && capname != NULL; i++) { + if (strcmp(capstrmap[i].name, capname) == 0) + return (capstrmap[i].type); + } + + return (-1); +} + +const char * +vm_capability_type2name(int type) +{ + int i; + + for (i = 0; capstrmap[i].name != NULL; i++) { + if (capstrmap[i].type == type) + return (capstrmap[i].name); + } + + return (NULL); +} + +int +vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, + int *retval) +{ + int error; + struct vm_capability vmcap; + + bzero(&vmcap, sizeof(vmcap)); + vmcap.cpuid = vcpu; + vmcap.captype = cap; + + error = ioctl(ctx->fd, VM_GET_CAPABILITY, &vmcap); + *retval = vmcap.capval; + return (error); +} + +int +vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int val) +{ + struct vm_capability vmcap; + + bzero(&vmcap, sizeof(vmcap)); + vmcap.cpuid = vcpu; + vmcap.captype = cap; + vmcap.capval = val; + + return (ioctl(ctx->fd, VM_SET_CAPABILITY, &vmcap)); +} + +int +vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func) +{ + struct vm_pptdev pptdev; + + bzero(&pptdev, sizeof(pptdev)); + pptdev.bus = bus; + pptdev.slot = slot; + pptdev.func = func; + + return (ioctl(ctx->fd, VM_BIND_PPTDEV, &pptdev)); +} + +int +vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func) +{ + struct vm_pptdev pptdev; + + bzero(&pptdev, sizeof(pptdev)); + pptdev.bus = bus; + pptdev.slot = slot; + pptdev.func = func; + + return (ioctl(ctx->fd, VM_UNBIND_PPTDEV, &pptdev)); +} + +int +vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, + vm_paddr_t gpa, size_t len, vm_paddr_t hpa) +{ + struct vm_pptdev_mmio pptmmio; + + bzero(&pptmmio, sizeof(pptmmio)); + pptmmio.bus = bus; + pptmmio.slot = slot; + pptmmio.func = func; + pptmmio.gpa = gpa; + pptmmio.len = len; + pptmmio.hpa = hpa; + + return (ioctl(ctx->fd, VM_MAP_PPTDEV_MMIO, &pptmmio)); +} + +int +vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func, + uint64_t addr, uint64_t msg, int numvec) +{ + struct vm_pptdev_msi pptmsi; + + bzero(&pptmsi, sizeof(pptmsi)); + pptmsi.vcpu = vcpu; + pptmsi.bus = bus; + pptmsi.slot = slot; + pptmsi.func = func; + pptmsi.msg = msg; + pptmsi.addr = addr; + pptmsi.numvec = numvec; + + return (ioctl(ctx->fd, VM_PPTDEV_MSI, &pptmsi)); +} + +int +vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func, + int idx, uint64_t addr, uint64_t msg, uint32_t vector_control) +{ + struct vm_pptdev_msix pptmsix; + + bzero(&pptmsix, sizeof(pptmsix)); + pptmsix.vcpu = vcpu; + pptmsix.bus = bus; + pptmsix.slot = slot; + pptmsix.func = func; + pptmsix.idx = idx; + pptmsix.msg = msg; + pptmsix.addr = addr; + pptmsix.vector_control = vector_control; + + return ioctl(ctx->fd, VM_PPTDEV_MSIX, &pptmsix); +} + +#ifdef __FreeBSD__ +uint64_t * +vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv, + int *ret_entries) +{ + int error; + + static struct vm_stats vmstats; + + vmstats.cpuid = vcpu; + + error = ioctl(ctx->fd, VM_STATS, &vmstats); + if (error == 0) { + if (ret_entries) + 
*ret_entries = vmstats.num_entries; + if (ret_tv) + *ret_tv = vmstats.tv; + return (vmstats.statbuf); + } else + return (NULL); +} + +const char * +vm_get_stat_desc(struct vmctx *ctx, int index) +{ + static struct vm_stat_desc statdesc; + + statdesc.index = index; + if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0) + return (statdesc.desc); + else + return (NULL); +} +#endif + +int +vm_get_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state *state) +{ + int error; + struct vm_x2apic x2apic; + + bzero(&x2apic, sizeof(x2apic)); + x2apic.cpuid = vcpu; + + error = ioctl(ctx->fd, VM_GET_X2APIC_STATE, &x2apic); + *state = x2apic.state; + return (error); +} + +int +vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state state) +{ + int error; + struct vm_x2apic x2apic; + + bzero(&x2apic, sizeof(x2apic)); + x2apic.cpuid = vcpu; + x2apic.state = state; + + error = ioctl(ctx->fd, VM_SET_X2APIC_STATE, &x2apic); + + return (error); +} + +/* + * From Intel Vol 3a: + * Table 9-1. IA-32 Processor States Following Power-up, Reset or INIT + */ +int +vcpu_reset(struct vmctx *vmctx, int vcpu) +{ + int error; + uint64_t rflags, rip, cr0, cr4, zero, desc_base, rdx; + uint32_t desc_access, desc_limit; + uint16_t sel; + + zero = 0; + + rflags = 0x2; + error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RFLAGS, rflags); + if (error) + goto done; + + rip = 0xfff0; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RIP, rip)) != 0) + goto done; + + cr0 = CR0_NE; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR0, cr0)) != 0) + goto done; + + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR3, zero)) != 0) + goto done; + + cr4 = 0; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR4, cr4)) != 0) + goto done; + + /* + * CS: present, r/w, accessed, 16-bit, byte granularity, usable + */ + desc_base = 0xffff0000; + desc_limit = 0xffff; + desc_access = 0x0093; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_CS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + sel = 0xf000; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, sel)) != 0) + goto done; + + /* + * SS,DS,ES,FS,GS: present, r/w, accessed, 16-bit, byte granularity + */ + desc_base = 0; + desc_limit = 0xffff; + desc_access = 0x0093; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_SS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_DS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_ES, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_FS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + sel = 0; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, sel)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, sel)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, sel)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, sel)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, sel)) != 0) + goto done; + + /* General purpose registers */ + rdx = 0xf00; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RAX, zero)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBX, zero)) != 0) + goto done; + if ((error = 
vm_set_register(vmctx, vcpu, VM_REG_GUEST_RCX, zero)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDX, rdx)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSI, zero)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDI, zero)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBP, zero)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSP, zero)) != 0) + goto done; + + /* GDTR, IDTR */ + desc_base = 0; + desc_limit = 0xffff; + desc_access = 0; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GDTR, + desc_base, desc_limit, desc_access); + if (error != 0) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_IDTR, + desc_base, desc_limit, desc_access); + if (error != 0) + goto done; + + /* TR */ + desc_base = 0; + desc_limit = 0xffff; + desc_access = 0x0000008b; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_TR, 0, 0, desc_access); + if (error) + goto done; + + sel = 0; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_TR, sel)) != 0) + goto done; + + /* LDTR */ + desc_base = 0; + desc_limit = 0xffff; + desc_access = 0x00000082; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_LDTR, desc_base, + desc_limit, desc_access); + if (error) + goto done; + + sel = 0; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_LDTR, 0)) != 0) + goto done; + + /* XXX cr2, debug registers */ + + error = 0; +done: + return (error); +} + +int +vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num) +{ + int error, i; + struct vm_gpa_pte gpapte; + + bzero(&gpapte, sizeof(gpapte)); + gpapte.gpa = gpa; + + error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte); + + if (error == 0) { + *num = gpapte.ptenum; + for (i = 0; i < gpapte.ptenum; i++) + pte[i] = gpapte.pte[i]; + } + + return (error); +} + +int +vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities) +{ + int error; + struct vm_hpet_cap cap; + + bzero(&cap, sizeof(struct vm_hpet_cap)); + error = ioctl(ctx->fd, VM_GET_HPET_CAPABILITIES, &cap); + if (capabilities != NULL) + *capabilities = cap.capabilities; + return (error); +} + +static int +gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + uint64_t gla, int prot, int *fault, uint64_t *gpa) +{ + struct vm_gla2gpa gg; + int error; + + bzero(&gg, sizeof(struct vm_gla2gpa)); + gg.vcpuid = vcpu; + gg.prot = prot; + gg.gla = gla; + gg.paging = *paging; + + error = ioctl(ctx->fd, VM_GLA2GPA, &gg); + if (error == 0) { + *fault = gg.fault; + *gpa = gg.gpa; + } + return (error); +} + +int +vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa) +{ + int error, fault; + + error = gla2gpa(ctx, vcpu, paging, gla, prot, &fault, gpa); + if (fault) + error = fault; + return (error); +} + +#ifndef min +#define min(a,b) (((a) < (b)) ? 
(a) : (b)) +#endif + +int +vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt) +{ + void *va; + uint64_t gpa; + int error, fault, i, n, off; + + for (i = 0; i < iovcnt; i++) { + iov[i].iov_base = 0; + iov[i].iov_len = 0; + } + + while (len) { + assert(iovcnt > 0); + error = gla2gpa(ctx, vcpu, paging, gla, prot, &fault, &gpa); + if (error) + return (-1); + if (fault) + return (1); + + off = gpa & PAGE_MASK; + n = min(len, PAGE_SIZE - off); + + va = vm_map_gpa(ctx, gpa, n); + if (va == NULL) + return (-1); + + iov->iov_base = va; + iov->iov_len = n; + iov++; + iovcnt--; + + gla += n; + len -= n; + } + return (0); +} + +void +vm_copy_teardown(struct vmctx *ctx, int vcpu, struct iovec *iov, int iovcnt) +{ + + return; +} + +void +vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *iov, void *vp, size_t len) +{ + const char *src; + char *dst; + size_t n; + + dst = vp; + while (len) { + assert(iov->iov_len); + n = min(len, iov->iov_len); + src = iov->iov_base; + bcopy(src, dst, n); + + iov++; + dst += n; + len -= n; + } +} + +void +vm_copyout(struct vmctx *ctx, int vcpu, const void *vp, struct iovec *iov, + size_t len) +{ + const char *src; + char *dst; + size_t n; + + src = vp; + while (len) { + assert(iov->iov_len); + n = min(len, iov->iov_len); + dst = iov->iov_base; + bcopy(src, dst, n); + + iov++; + src += n; + len -= n; + } +} + +int +vm_activate_cpu(struct vmctx *ctx, int vcpu) +{ + struct vm_activate_cpu ac; + int error; + + bzero(&ac, sizeof(struct vm_activate_cpu)); + ac.vcpuid = vcpu; + error = ioctl(ctx->fd, VM_ACTIVATE_CPU, &ac); + return (error); +} + +int +vm_restart_instruction(void *arg, int vcpu) +{ + struct vmctx *ctx = arg; + + return (ioctl(ctx->fd, VM_RESTART_INSTRUCTION, &vcpu)); +} diff --git a/usr/src/lib/libvmmapi/common/vmmapi.h b/usr/src/lib/libvmmapi/common/vmmapi.h new file mode 100644 index 0000000000..d7eb67aa58 --- /dev/null +++ b/usr/src/lib/libvmmapi/common/vmmapi.h @@ -0,0 +1,159 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
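/*
 * Illustrative sketch (editor's addition, not part of the original change):
 * the usual pattern for the copy helpers defined above -- translate a
 * guest-linear range into host iovecs with vm_copy_setup(), then move the
 * bytes with vm_copyin(). The iovec count and PROT_READ are illustrative.
 */
static int
read_guest(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint64_t gla, void *buf, size_t len)
{
	struct iovec iov[8];
	int error;

	error = vm_copy_setup(ctx, vcpu, paging, gla, len, PROT_READ,
	    iov, 8);
	if (error != 0)
		return (error);	/* -1 on translation error, 1 on guest fault */
	vm_copyin(ctx, vcpu, iov, buf, len);
	vm_copy_teardown(ctx, vcpu, iov, 8);
	return (0);
}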
+ * + * $FreeBSD: head/lib/libvmmapi/vmmapi.h 280929 2015-04-01 00:15:31Z tychon $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + */ + +#ifndef _VMMAPI_H_ +#define _VMMAPI_H_ + +#include + +struct iovec; +struct vmctx; +enum x2apic_state; + +/* + * Different styles of mapping the memory assigned to a VM into the address + * space of the controlling process. + */ +enum vm_mmap_style { + VM_MMAP_NONE, /* no mapping */ + VM_MMAP_ALL, /* fully and statically mapped */ + VM_MMAP_SPARSE, /* mappings created on-demand */ +}; + +int vm_create(const char *name); +struct vmctx *vm_open(const char *name); +int vm_destroy(struct vmctx *ctx); +int vm_parse_memsize(const char *optarg, size_t *memsize); +int vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len, + int *wired); +int vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s); +int vm_setup_rom(struct vmctx *ctx, vm_paddr_t gpa, size_t len); +void *vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len); +int vm_gla2gpa(struct vmctx *, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa); +uint32_t vm_get_lowmem_limit(struct vmctx *ctx); +void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit); +size_t vm_get_lowmem_size(struct vmctx *ctx); +size_t vm_get_highmem_size(struct vmctx *ctx); +int vm_set_desc(struct vmctx *ctx, int vcpu, int reg, + uint64_t base, uint32_t limit, uint32_t access); +int vm_get_desc(struct vmctx *ctx, int vcpu, int reg, + uint64_t *base, uint32_t *limit, uint32_t *access); +int vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg, + struct seg_desc *seg_desc); +int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val); +int vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval); +int vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *ret_vmexit); +int vm_apicid2vcpu(struct vmctx *ctx, int apicid); +int vm_inject_exception(struct vmctx *ctx, int vcpu, int vector, + int errcode_valid, uint32_t errcode, int restart_instruction); +int vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector); +int vm_lapic_local_irq(struct vmctx *ctx, int vcpu, int vector); +int vm_lapic_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg); +int vm_ioapic_assert_irq(struct vmctx *ctx, int irq); +int vm_ioapic_deassert_irq(struct vmctx *ctx, int irq); +int vm_ioapic_pulse_irq(struct vmctx *ctx, int irq); +int vm_ioapic_pincount(struct vmctx *ctx, int *pincount); +int vm_isa_assert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq); +int vm_isa_deassert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq); +int vm_isa_pulse_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq); +int vm_isa_set_irq_trigger(struct vmctx *ctx, int atpic_irq, + enum vm_intr_trigger trigger); +int vm_inject_nmi(struct vmctx *ctx, int vcpu); +int vm_capability_name2type(const char *capname); +const char *vm_capability_type2name(int type); +int vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, + int *retval); +int vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, + int val); +int vm_assign_pptdev(struct vmctx *ctx, 
int bus, int slot, int func);
+int vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func);
+int vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func,
+    vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
+int vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int bus, int slot,
+    int func, uint64_t addr, uint64_t msg, int numvec);
+int vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot,
+    int func, int idx, uint64_t addr, uint64_t msg,
+    uint32_t vector_control);
+
+/*
+ * Return a pointer to the statistics buffer. Note that this is not MT-safe.
+ */
+uint64_t *vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv,
+    int *ret_entries);
+const char *vm_get_stat_desc(struct vmctx *ctx, int index);
+
+int vm_get_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state *s);
+int vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state s);
+
+int vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities);
+
+/*
+ * Translate the GLA range [gla,gla+len) into GPA segments in 'iov'.
+ * The 'iovcnt' should be big enough to accommodate all GPA segments.
+ * Returns 0 on success, 1 on a guest fault condition and -1 otherwise.
+ */
+int vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *pg,
+    uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt);
+void vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *guest_iov,
+    void *host_dst, size_t len);
+void vm_copyout(struct vmctx *ctx, int vcpu, const void *host_src,
+    struct iovec *guest_iov, size_t len);
+void vm_copy_teardown(struct vmctx *ctx, int vcpu, struct iovec *iov,
+    int iovcnt);
+
+/* Reset vcpu register state */
+int vcpu_reset(struct vmctx *ctx, int vcpu);
+
+int vm_activate_cpu(struct vmctx *ctx, int vcpu);
+
+#ifdef __FreeBSD__
+/*
+ * FreeBSD specific APIs
+ */
+int vm_setup_freebsd_registers(struct vmctx *ctx, int vcpu,
+    uint64_t rip, uint64_t cr3, uint64_t gdtbase,
+    uint64_t rsp);
+int vm_setup_freebsd_registers_i386(struct vmctx *vmctx, int vcpu,
+    uint32_t eip, uint32_t gdtbase,
+    uint32_t esp);
+void vm_setup_freebsd_gdt(uint64_t *gdtr);
+#endif
+#endif /* _VMMAPI_H_ */
diff --git a/usr/src/tools/scripts/gensetdefs.pl b/usr/src/tools/scripts/gensetdefs.pl
new file mode 100644
index 0000000000..8ca5782feb
--- /dev/null
+++ b/usr/src/tools/scripts/gensetdefs.pl
@@ -0,0 +1,31 @@
+#!/usr/bin/perl -w
+#
+# COPYRIGHT 2013 Pluribus Networks Inc.
+#
+# All rights reserved. This copyright notice is Copyright Management
+# Information under 17 USC 1202 and is included to protect this work and
+# deter copyright infringement. Removal or alteration of this Copyright
+# Management Information without the express written permission from
+# Pluribus Networks Inc is prohibited, and any such unauthorized removal
+# or alteration will be a violation of federal law.
+ +use strict; + +my @Sections = split(/\n/, `elfedit -r -e \'shdr:sh_name -osimple\' $ARGV[0] 2>&1`); + +foreach my $Section (@Sections) { + if ($Section =~ "^set_") { + print "\tfixing $Section\n"; + + chomp(my $SectionAddr = `elfedit -r -e \'shdr:sh_addr -onum $Section\' $ARGV[0] 2>&1`); + chomp(my $SectionSize = `elfedit -r -e \'shdr:sh_size -onum $Section\' $ARGV[0] 2>&1`); + my $SectionEnd = hex($SectionAddr) + hex($SectionSize); + + `elfedit -e \'sym:st_bind __start_$Section global\' $ARGV[0] 2>&1`; + `elfedit -e \'sym:st_value __start_$Section $SectionAddr\' $ARGV[0] 2>&1`; + `elfedit -e \'sym:st_shndx __start_$Section $Section\' $ARGV[0] 2>&1`; + `elfedit -e \'sym:st_bind __stop_$Section global\' $ARGV[0] 2>&1`; + `elfedit -e \'sym:st_value __stop_$Section $SectionEnd\' $ARGV[0] 2>&1`; + `elfedit -e \'sym:st_shndx __stop_$Section $Section\' $ARGV[0] 2>&1`; + } +} diff --git a/usr/src/uts/i86pc/io/viona/viona.c b/usr/src/uts/i86pc/io/viona/viona.c new file mode 100644 index 0000000000..40bdd80a6e --- /dev/null +++ b/usr/src/uts/i86pc/io/viona/viona.c @@ -0,0 +1,1404 @@ +/* + * Copyright (c) 2013 Chris Torek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#define MB (1024UL * 1024) +#define GB (1024UL * MB) + +/* + * Min. 
octets in an ethernet frame minus FCS + */ +#define MIN_BUF_SIZE 60 + +#define VIONA_NAME "Virtio Network Accelerator" + +#define VIONA_CTL_MINOR 0 +#define VIONA_CTL_NODE_NAME "ctl" + +#define VIONA_CLI_NAME "viona" + +#define VTNET_MAXSEGS 32 + +#define VRING_ALIGN 4096 + +#define VRING_DESC_F_NEXT (1 << 0) +#define VRING_DESC_F_WRITE (1 << 1) +#define VRING_DESC_F_INDIRECT (1 << 2) + +#define VRING_AVAIL_F_NO_INTERRUPT 1 + +#define VRING_USED_F_NO_NOTIFY 1 + +#define BCM_NIC_DRIVER "bnxe" +/* + * Host capabilities + */ +#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */ +#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */ +#define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */ + +#define VIONA_S_HOSTCAPS \ + (VIRTIO_NET_F_MAC | VIRTIO_NET_F_MRG_RXBUF | \ + VIRTIO_NET_F_STATUS) + +#pragma pack(1) +struct virtio_desc { + uint64_t vd_addr; + uint32_t vd_len; + uint16_t vd_flags; + uint16_t vd_next; +}; +#pragma pack() + +#pragma pack(1) +struct virtio_used { + uint32_t vu_idx; + uint32_t vu_tlen; +}; +#pragma pack() + +#pragma pack(1) +struct virtio_net_mrgrxhdr { + uint8_t vrh_flags; + uint8_t vrh_gso_type; + uint16_t vrh_hdr_len; + uint16_t vrh_gso_size; + uint16_t vrh_csum_start; + uint16_t vrh_csum_offset; + uint16_t vrh_bufs; +}; +struct virtio_net_hdr { + uint8_t vrh_flags; + uint8_t vrh_gso_type; + uint16_t vrh_hdr_len; + uint16_t vrh_gso_size; + uint16_t vrh_csum_start; + uint16_t vrh_csum_offset; +}; +#pragma pack() + +typedef struct viona_vring_hqueue { + /* Internal state */ + uint16_t hq_size; + kmutex_t hq_a_mutex; + kmutex_t hq_u_mutex; + uint16_t hq_cur_aidx; /* trails behind 'avail_idx' */ + + /* Host-context pointers to the queue */ + caddr_t hq_baseaddr; + uint16_t *hq_avail_flags; + uint16_t *hq_avail_idx; /* monotonically increasing */ + uint16_t *hq_avail_ring; + + uint16_t *hq_used_flags; + uint16_t *hq_used_idx; /* monotonically increasing */ + struct virtio_used *hq_used_ring; +} viona_vring_hqueue_t; + + +typedef struct viona_link { + datalink_id_t l_linkid; + + struct vm *l_vm; + size_t l_vm_lomemsize; + caddr_t l_vm_lomemaddr; + size_t l_vm_himemsize; + caddr_t l_vm_himemaddr; + + mac_handle_t l_mh; + mac_client_handle_t l_mch; + + kmem_cache_t *l_desb_kmc; + + pollhead_t l_pollhead; + + viona_vring_hqueue_t l_rx_vring; + uint_t l_rx_intr; + + viona_vring_hqueue_t l_tx_vring; + kcondvar_t l_tx_cv; + uint_t l_tx_intr; + kmutex_t l_tx_mutex; + int l_tx_outstanding; + uint32_t l_features; +} viona_link_t; + +typedef struct { + frtn_t d_frtn; + viona_link_t *d_link; + uint_t d_ref; + uint16_t d_cookie; + int d_len; +} viona_desb_t; + +typedef struct viona_soft_state { + viona_link_t *ss_link; +} viona_soft_state_t; + +typedef struct used_elem { + uint16_t id; + uint32_t len; +} used_elem_t; + +static void *viona_state; +static dev_info_t *viona_dip; +static id_space_t *viona_minor_ids; +/* + * copy tx mbufs from virtio ring to avoid necessitating a wait + * for packet transmission to free resources. 
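+ *
+ * With copy_tx_mblks set to B_FALSE, viona_tx() copies every segment into
+ * a freshly allocated mblk (allocb()/bcopy()), so no guest-owned buffer is
+ * left in the hands of the NIC driver; set_viona_tx_mode() below selects
+ * that mode when the bnxe driver is present. Otherwise the guest's buffers
+ * are loaned to MAC via desballoc(), and l_tx_outstanding/l_tx_cv track
+ * their return through viona_desb_free() before a ring or the link may be
+ * torn down.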
+ */ +static boolean_t copy_tx_mblks = B_TRUE; + +extern struct vm *vm_lookup_by_name(char *name); +extern uint64_t vm_gpa2hpa(struct vm *vm, uint64_t gpa, size_t len); + +static int viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); +static int viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); +static int viona_open(dev_t *devp, int flag, int otype, cred_t *credp); +static int viona_close(dev_t dev, int flag, int otype, cred_t *credp); +static int viona_ioctl(dev_t dev, int cmd, intptr_t data, int mode, + cred_t *credp, int *rval); +static int viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp); + +static int viona_ioc_create(viona_soft_state_t *ss, vioc_create_t *u_create); +static int viona_ioc_delete(viona_soft_state_t *ss); + +static int viona_vm_map(viona_link_t *link); +static caddr_t viona_gpa2kva(viona_link_t *link, uint64_t gpa); +static void viona_vm_unmap(viona_link_t *link); + +static int viona_ioc_rx_ring_init(viona_link_t *link, + vioc_ring_init_t *u_ri); +static int viona_ioc_tx_ring_init(viona_link_t *link, + vioc_ring_init_t *u_ri); +static int viona_ioc_rx_ring_reset(viona_link_t *link); +static int viona_ioc_tx_ring_reset(viona_link_t *link); +static void viona_ioc_rx_ring_kick(viona_link_t *link); +static void viona_ioc_tx_ring_kick(viona_link_t *link); +static int viona_ioc_rx_intr_clear(viona_link_t *link); +static int viona_ioc_tx_intr_clear(viona_link_t *link); + +static void viona_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t loopback); +static void viona_tx(viona_link_t *link, viona_vring_hqueue_t *hq); + +static struct cb_ops viona_cb_ops = { + viona_open, + viona_close, + nodev, + nodev, + nodev, + nodev, + nodev, + viona_ioctl, + nodev, + nodev, + nodev, + viona_chpoll, + ddi_prop_op, + 0, + D_MP | D_NEW | D_HOTPLUG, + CB_REV, + nodev, + nodev +}; + +static struct dev_ops viona_ops = { + DEVO_REV, + 0, + nodev, + nulldev, + nulldev, + viona_attach, + viona_detach, + nodev, + &viona_cb_ops, + NULL, + ddi_power, + ddi_quiesce_not_needed +}; + +static struct modldrv modldrv = { + &mod_driverops, + VIONA_NAME, + &viona_ops, +}; + +static struct modlinkage modlinkage = { + MODREV_1, &modldrv, NULL +}; + +int +_init(void) +{ + int ret; + + ret = ddi_soft_state_init(&viona_state, + sizeof (viona_soft_state_t), 0); + if (ret == 0) { + ret = mod_install(&modlinkage); + if (ret != 0) { + ddi_soft_state_fini(&viona_state); + return (ret); + } + } + + return (ret); +} + +int +_fini(void) +{ + int ret; + + ret = mod_remove(&modlinkage); + if (ret == 0) { + ddi_soft_state_fini(&viona_state); + } + + return (ret); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +static void +set_viona_tx_mode() +{ + major_t bcm_nic_major; + if ((bcm_nic_major = ddi_name_to_major(BCM_NIC_DRIVER)) + != DDI_MAJOR_T_NONE) { + if (ddi_hold_installed_driver(bcm_nic_major) != NULL) { + copy_tx_mblks = B_FALSE; + ddi_rele_driver(bcm_nic_major); + } + } +} + +static int +viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + if (cmd != DDI_ATTACH) { + return (DDI_FAILURE); + } + + viona_minor_ids = id_space_create("viona_minor_id", + VIONA_CTL_MINOR + 1, UINT16_MAX); + + if (ddi_create_minor_node(dip, VIONA_CTL_NODE_NAME, + S_IFCHR, VIONA_CTL_MINOR, DDI_PSEUDO, 0) != DDI_SUCCESS) { + return (DDI_FAILURE); + } + + viona_dip = dip; + + set_viona_tx_mode(); + ddi_report_dev(viona_dip); + + return (DDI_SUCCESS); +} + +static int +viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + if 
(cmd != DDI_DETACH) {
+		return (DDI_FAILURE);
+	}
+
+	id_space_destroy(viona_minor_ids);
+
+	ddi_remove_minor_node(viona_dip, NULL);
+
+	viona_dip = NULL;
+
+	return (DDI_SUCCESS);
+}
+
+static int
+viona_open(dev_t *devp, int flag, int otype, cred_t *credp)
+{
+	int	minor;
+
+	if (otype != OTYP_CHR) {
+		return (EINVAL);
+	}
+
+	if (drv_priv(credp) != 0) {
+		return (EPERM);
+	}
+
+	if (getminor(*devp) != VIONA_CTL_MINOR) {
+		return (ENXIO);
+	}
+
+	minor = id_alloc(viona_minor_ids);
+	if (minor == 0) {
+		/* All minors are busy */
+		return (EBUSY);
+	}
+
+	if (ddi_soft_state_zalloc(viona_state, minor) != DDI_SUCCESS) {
+		id_free(viona_minor_ids, minor);
+		return (ENOMEM);
+	}
+
+	*devp = makedevice(getmajor(*devp), minor);
+
+	return (0);
+}
+
+static int
+viona_close(dev_t dev, int flag, int otype, cred_t *credp)
+{
+	int			minor;
+	viona_soft_state_t	*ss;
+
+	if (otype != OTYP_CHR) {
+		return (EINVAL);
+	}
+
+	if (drv_priv(credp) != 0) {
+		return (EPERM);
+	}
+
+	minor = getminor(dev);
+
+	ss = ddi_get_soft_state(viona_state, minor);
+	if (ss == NULL) {
+		return (ENXIO);
+	}
+
+	viona_ioc_delete(ss);
+
+	ddi_soft_state_free(viona_state, minor);
+
+	id_free(viona_minor_ids, minor);
+
+	return (0);
+}
+
+static int
+viona_ioctl(dev_t dev, int cmd, intptr_t data, int mode,
+    cred_t *credp, int *rval)
+{
+	viona_soft_state_t	*ss;
+	int			err = 0;
+
+	ss = ddi_get_soft_state(viona_state, getminor(dev));
+	if (ss == NULL) {
+		return (ENXIO);
+	}
+
+	switch (cmd) {
+	case VNA_IOC_CREATE:
+		err = viona_ioc_create(ss, (vioc_create_t *)data);
+		break;
+	case VNA_IOC_DELETE:
+		err = viona_ioc_delete(ss);
+		break;
+	case VNA_IOC_SET_FEATURES:
+		if (ss->ss_link == NULL) {
+			return (ENOSYS);
+		}
+		ss->ss_link->l_features = *(int *)data & VIONA_S_HOSTCAPS;
+		break;
+	case VNA_IOC_GET_FEATURES:
+		if (ss->ss_link == NULL) {
+			return (ENOSYS);
+		}
+		*(int *)data = VIONA_S_HOSTCAPS;
+		break;
+	case VNA_IOC_RX_RING_INIT:
+		if (ss->ss_link == NULL) {
+			return (ENOSYS);
+		}
+		err = viona_ioc_rx_ring_init(ss->ss_link,
+		    (vioc_ring_init_t *)data);
+		break;
+	case VNA_IOC_RX_RING_RESET:
+		if (ss->ss_link == NULL) {
+			return (ENOSYS);
+		}
+		err = viona_ioc_rx_ring_reset(ss->ss_link);
+		break;
+	case VNA_IOC_RX_RING_KICK:
+		if (ss->ss_link == NULL) {
+			return (ENOSYS);
+		}
+		viona_ioc_rx_ring_kick(ss->ss_link);
+		err = 0;
+		break;
+	case VNA_IOC_TX_RING_INIT:
+		if (ss->ss_link == NULL) {
+			return (ENOSYS);
+		}
+		err = viona_ioc_tx_ring_init(ss->ss_link,
+		    (vioc_ring_init_t *)data);
+		break;
+	case VNA_IOC_TX_RING_RESET:
+		if (ss->ss_link == NULL) {
+			return (ENOSYS);
+		}
+		err = viona_ioc_tx_ring_reset(ss->ss_link);
+		break;
+	case VNA_IOC_TX_RING_KICK:
+		if (ss->ss_link == NULL) {
+			return (ENOSYS);
+		}
+		viona_ioc_tx_ring_kick(ss->ss_link);
+		err = 0;
+		break;
+	case VNA_IOC_RX_INTR_CLR:
+		if (ss->ss_link == NULL) {
+			return (ENOSYS);
+		}
+		err = viona_ioc_rx_intr_clear(ss->ss_link);
+		break;
+	case VNA_IOC_TX_INTR_CLR:
+		if (ss->ss_link == NULL) {
+			return (ENOSYS);
+		}
+		err = viona_ioc_tx_intr_clear(ss->ss_link);
+		break;
+	default:
+		err = ENOTTY;
+		break;
+	}
+
+	return (err);
+}
+
+static int
+viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
+    struct pollhead **phpp)
+{
+	viona_soft_state_t	*ss;
+
+	ss = ddi_get_soft_state(viona_state, getminor(dev));
+	if (ss == NULL || ss->ss_link == NULL) {
+		return (ENXIO);
+	}
+
+	*reventsp = 0;
+
+	if (ss->ss_link->l_rx_intr && (events & POLLIN)) {
+		*reventsp |= POLLIN;
+	}
+
+	if (ss->ss_link->l_tx_intr && (events & POLLOUT)) {
+		*reventsp |= POLLOUT;
+	}
+
+	if
(*reventsp == 0 && !anyyet) {
+		*phpp = &ss->ss_link->l_pollhead;
+	}
+
+	return (0);
+}
+
+static int
+viona_ioc_create(viona_soft_state_t *ss, vioc_create_t *u_create)
+{
+	vioc_create_t	k_create;
+	viona_link_t	*link;
+	char		cli_name[MAXNAMELEN];
+	int		err;
+
+	if (ss->ss_link != NULL) {
+		return (ENOSYS);
+	}
+	if (copyin(u_create, &k_create, sizeof (k_create)) != 0) {
+		return (EFAULT);
+	}
+
+	link = kmem_zalloc(sizeof (viona_link_t), KM_SLEEP);
+
+	link->l_linkid = k_create.c_linkid;
+	link->l_vm = vm_lookup_by_name(k_create.c_vmname);
+	if (link->l_vm == NULL) {
+		err = ENXIO;
+		goto bail;
+	}
+
+	link->l_vm_lomemsize = k_create.c_lomem_size;
+	link->l_vm_himemsize = k_create.c_himem_size;
+	err = viona_vm_map(link);
+	if (err != 0) {
+		goto bail;
+	}
+
+	err = mac_open_by_linkid(link->l_linkid, &link->l_mh);
+	if (err != 0) {
+		cmn_err(CE_WARN, "viona create mac_open_by_linkid"
+		    " returned %d\n", err);
+		goto bail;
+	}
+
+	snprintf(cli_name, sizeof (cli_name), "%s-%d",
+	    VIONA_CLI_NAME, link->l_linkid);
+	err = mac_client_open(link->l_mh, &link->l_mch, cli_name, 0);
+	if (err != 0) {
+		cmn_err(CE_WARN, "viona create mac_client_open"
+		    " returned %d\n", err);
+		goto bail;
+	}
+
+	link->l_features = VIONA_S_HOSTCAPS;
+	link->l_desb_kmc = kmem_cache_create(cli_name,
+	    sizeof (viona_desb_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+	mutex_init(&link->l_rx_vring.hq_a_mutex, NULL, MUTEX_DRIVER, NULL);
+	mutex_init(&link->l_rx_vring.hq_u_mutex, NULL, MUTEX_DRIVER, NULL);
+	mutex_init(&link->l_tx_vring.hq_a_mutex, NULL, MUTEX_DRIVER, NULL);
+	mutex_init(&link->l_tx_vring.hq_u_mutex, NULL, MUTEX_DRIVER, NULL);
+	if (copy_tx_mblks) {
+		mutex_init(&link->l_tx_mutex, NULL, MUTEX_DRIVER, NULL);
+		cv_init(&link->l_tx_cv, NULL, CV_DRIVER, NULL);
+	}
+	ss->ss_link = link;
+
+	return (0);
+
+bail:
+	if (link->l_mch != NULL) {
+		mac_client_close(link->l_mch, 0);
+	}
+	if (link->l_mh != NULL) {
+		mac_close(link->l_mh);
+	}
+
+	kmem_free(link, sizeof (viona_link_t));
+
+	return (err);
+}
+
+static int
+viona_ioc_delete(viona_soft_state_t *ss)
+{
+	viona_link_t	*link;
+
+	link = ss->ss_link;
+	if (link == NULL) {
+		return (ENOSYS);
+	}
+	if (copy_tx_mblks) {
+		mutex_enter(&link->l_tx_mutex);
+		while (link->l_tx_outstanding != 0) {
+			cv_wait(&link->l_tx_cv, &link->l_tx_mutex);
+		}
+		mutex_exit(&link->l_tx_mutex);
+	}
+	if (link->l_mch != NULL) {
+		mac_rx_clear(link->l_mch);
+		mac_client_close(link->l_mch, 0);
+	}
+	if (link->l_mh != NULL) {
+		mac_close(link->l_mh);
+	}
+
+	viona_vm_unmap(link);
+	mutex_destroy(&link->l_tx_vring.hq_a_mutex);
+	mutex_destroy(&link->l_tx_vring.hq_u_mutex);
+	mutex_destroy(&link->l_rx_vring.hq_a_mutex);
+	mutex_destroy(&link->l_rx_vring.hq_u_mutex);
+	if (copy_tx_mblks) {
+		mutex_destroy(&link->l_tx_mutex);
+		cv_destroy(&link->l_tx_cv);
+	}
+
+	kmem_cache_destroy(link->l_desb_kmc);
+
+	kmem_free(link, sizeof (viona_link_t));
+
+	ss->ss_link = NULL;
+
+	return (0);
+}
+
+static caddr_t
+viona_mapin_vm_chunk(viona_link_t *link, uint64_t gpa, size_t len)
+{
+	caddr_t	addr;
+	size_t	offset;
+	pfn_t	pfnum;
+
+	if (len == 0)
+		return (NULL);
+
+	addr = vmem_alloc(heap_arena, len, VM_SLEEP);
+	if (addr == NULL)
+		return (NULL);
+
+	for (offset = 0; offset < len; offset += PAGESIZE) {
+		pfnum = btop(vm_gpa2hpa(link->l_vm, gpa + offset, PAGESIZE));
+		ASSERT(pfnum);
+		hat_devload(kas.a_hat, addr + offset, PAGESIZE, pfnum,
+		    PROT_READ | PROT_WRITE, HAT_LOAD_LOCK);
+	}
+
+	return (addr);
+}
+
+/*
+ * Map the guest physical address space into the kernel virtual address space.
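+ *
+ * Guest memory arrives as (at most) two contiguous chunks: a low chunk
+ * starting at GPA 0 (l_vm_lomemaddr) and an optional high chunk starting
+ * at GPA 4GB (l_vm_himemaddr). viona_gpa2kva() below relies on this
+ * layout; for example, a GPA of (4GB + off) resolves to
+ * (l_vm_himemaddr + off) so long as off is below l_vm_himemsize.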
+ */ +static int +viona_vm_map(viona_link_t *link) +{ + link->l_vm_lomemaddr = viona_mapin_vm_chunk(link, + 0, link->l_vm_lomemsize); + if (link->l_vm_lomemaddr == NULL) + return (-1); + link->l_vm_himemaddr = viona_mapin_vm_chunk(link, + 4 * (1024 * 1024 * 1024UL), link->l_vm_himemsize); + if (link->l_vm_himemsize && link->l_vm_himemaddr == NULL) + return (-1); + + return (0); +} + +/* + * Translate a guest physical address into a kernel virtual address. + */ +static caddr_t +viona_gpa2kva(viona_link_t *link, uint64_t gpa) +{ + if (gpa < link->l_vm_lomemsize) + return (link->l_vm_lomemaddr + gpa); + + gpa -= (4 * GB); + if (gpa < link->l_vm_himemsize) + return (link->l_vm_himemaddr + gpa); + + return (NULL); +} + +static void +viona_vm_unmap(viona_link_t *link) +{ + if (link->l_vm_lomemaddr) { + hat_unload(kas.a_hat, link->l_vm_lomemaddr, + link->l_vm_lomemsize, HAT_UNLOAD_UNLOCK); + vmem_free(heap_arena, link->l_vm_lomemaddr, + link->l_vm_lomemsize); + } + if (link->l_vm_himemaddr) { + hat_unload(kas.a_hat, link->l_vm_himemaddr, + link->l_vm_himemsize, HAT_UNLOAD_UNLOCK); + vmem_free(heap_arena, link->l_vm_himemaddr, + link->l_vm_himemsize); + } +} + +static int +viona_ioc_ring_init_common(viona_link_t *link, viona_vring_hqueue_t *hq, + vioc_ring_init_t *u_ri) +{ + vioc_ring_init_t k_ri; + + if (copyin(u_ri, &k_ri, sizeof (k_ri)) != 0) { + return (EFAULT); + } + + hq->hq_size = k_ri.ri_qsize; + hq->hq_baseaddr = viona_gpa2kva(link, k_ri.ri_qaddr); + if (hq->hq_baseaddr == NULL) + return (EINVAL); + + hq->hq_avail_flags = (uint16_t *)(viona_gpa2kva(link, + k_ri.ri_qaddr + hq->hq_size * sizeof (struct virtio_desc))); + if (hq->hq_avail_flags == NULL) + return (EINVAL); + hq->hq_avail_idx = hq->hq_avail_flags + 1; + hq->hq_avail_ring = hq->hq_avail_flags + 2; + + hq->hq_used_flags = (uint16_t *)(viona_gpa2kva(link, + P2ROUNDUP(k_ri.ri_qaddr + + hq->hq_size * sizeof (struct virtio_desc) + 2, VRING_ALIGN))); + if (hq->hq_used_flags == NULL) + return (EINVAL); + hq->hq_used_idx = hq->hq_used_flags + 1; + hq->hq_used_ring = (struct virtio_used *)(hq->hq_used_flags + 2); + + /* + * Initialize queue indexes + */ + hq->hq_cur_aidx = 0; + + return (0); +} + +static int +viona_ioc_rx_ring_init(viona_link_t *link, vioc_ring_init_t *u_ri) +{ + viona_vring_hqueue_t *hq; + int rval; + + hq = &link->l_rx_vring; + + rval = viona_ioc_ring_init_common(link, hq, u_ri); + if (rval != 0) { + return (rval); + } + + return (0); +} + +static int +viona_ioc_tx_ring_init(viona_link_t *link, vioc_ring_init_t *u_ri) +{ + viona_vring_hqueue_t *hq; + + hq = &link->l_tx_vring; + + return (viona_ioc_ring_init_common(link, hq, u_ri)); +} + +static int +viona_ioc_ring_reset_common(viona_vring_hqueue_t *hq) +{ + /* + * Reset all soft state + */ + hq->hq_cur_aidx = 0; + + return (0); +} + +static int +viona_ioc_rx_ring_reset(viona_link_t *link) +{ + viona_vring_hqueue_t *hq; + + mac_rx_clear(link->l_mch); + + hq = &link->l_rx_vring; + + return (viona_ioc_ring_reset_common(hq)); +} + +static int +viona_ioc_tx_ring_reset(viona_link_t *link) +{ + viona_vring_hqueue_t *hq; + + hq = &link->l_tx_vring; + + return (viona_ioc_ring_reset_common(hq)); +} + +static void +viona_ioc_rx_ring_kick(viona_link_t *link) +{ + viona_vring_hqueue_t *hq = &link->l_rx_vring; + + atomic_or_16(hq->hq_used_flags, VRING_USED_F_NO_NOTIFY); + + mac_rx_set(link->l_mch, viona_rx, link); +} + +/* + * Return the number of available descriptors in the vring taking care + * of the 16-bit index wraparound. 
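+ *
+ * Both indexes increase monotonically and wrap at 2^16, so the subtraction
+ * below is effectively carried out mod 2^16. For example, if the guest's
+ * avail index has wrapped around to 2 while our cursor is still at 65534,
+ * then (uint16_t)(2 - 65534) == 4 descriptors are pending, as intended.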
+ */
+static inline int
+viona_hq_num_avail(viona_vring_hqueue_t *hq)
+{
+	uint16_t ndesc;
+
+	/*
+	 * We're just computing (a-b) mod 2^16.
+	 *
+	 * The only glitch here is that in standard C,
+	 * uint16_t promotes to (signed) int when int has
+	 * more than 16 bits (pretty much always now), so
+	 * we have to force it back to unsigned.
+	 */
+	ndesc = (unsigned)*hq->hq_avail_idx - (unsigned)hq->hq_cur_aidx;
+
+	ASSERT(ndesc <= hq->hq_size);
+
+	return (ndesc);
+}
+
+static void
+viona_ioc_tx_ring_kick(viona_link_t *link)
+{
+	viona_vring_hqueue_t	*hq = &link->l_tx_vring;
+
+	do {
+		atomic_or_16(hq->hq_used_flags, VRING_USED_F_NO_NOTIFY);
+		while (viona_hq_num_avail(hq)) {
+			viona_tx(link, hq);
+		}
+		if (copy_tx_mblks) {
+			mutex_enter(&link->l_tx_mutex);
+			if (link->l_tx_outstanding != 0) {
+				cv_wait_sig(&link->l_tx_cv, &link->l_tx_mutex);
+			}
+			mutex_exit(&link->l_tx_mutex);
+		}
+		atomic_and_16(hq->hq_used_flags, ~VRING_USED_F_NO_NOTIFY);
+	} while (viona_hq_num_avail(hq));
+}
+
+static int
+viona_ioc_rx_intr_clear(viona_link_t *link)
+{
+	link->l_rx_intr = 0;
+
+	return (0);
+}
+
+static int
+viona_ioc_tx_intr_clear(viona_link_t *link)
+{
+	link->l_tx_intr = 0;
+
+	return (0);
+}
+#define	VQ_MAX_DESCRIPTORS	512
+
+static int
+vq_popchain(viona_link_t *link, viona_vring_hqueue_t *hq, struct iovec *iov,
+    int n_iov, uint16_t *cookie)
+{
+	int i;
+	int ndesc, nindir;
+	int idx, head, next;
+	struct virtio_desc *vdir, *vindir, *vp;
+
+	idx = hq->hq_cur_aidx;
+	ndesc = (uint16_t)((unsigned)*hq->hq_avail_idx - (unsigned)idx);
+
+	if (ndesc == 0)
+		return (0);
+	if (ndesc > hq->hq_size) {
+		cmn_err(CE_NOTE, "ndesc (%d) out of range\n", ndesc);
+		return (-1);
+	}
+
+	head = hq->hq_avail_ring[idx & (hq->hq_size - 1)];
+	next = head;
+
+	for (i = 0; i < VQ_MAX_DESCRIPTORS; next = vdir->vd_next) {
+		if (next >= hq->hq_size) {
+			cmn_err(CE_NOTE, "descriptor index (%d) "
+			    "out of range\n", next);
+			return (-1);
+		}
+
+		vdir = (struct virtio_desc *)(hq->hq_baseaddr +
+		    next * sizeof (struct virtio_desc));
+		if ((vdir->vd_flags & VRING_DESC_F_INDIRECT) == 0) {
+			/* Don't write past the end of the caller's iov */
+			if (i >= n_iov)
+				return (-1);
+			iov[i].iov_base = viona_gpa2kva(link, vdir->vd_addr);
+			if (iov[i].iov_base == NULL) {
+				cmn_err(CE_NOTE, "invalid guest physical"
+				    " address 0x%"PRIx64"\n", vdir->vd_addr);
+				return (-1);
+			}
+			iov[i++].iov_len = vdir->vd_len;
+		} else {
+			nindir = vdir->vd_len / 16;
+			if ((vdir->vd_len & 0xf) || nindir == 0) {
+				cmn_err(CE_NOTE, "invalid indir len 0x%x\n",
+				    vdir->vd_len);
+				return (-1);
+			}
+			vindir = (struct virtio_desc *)
+			    viona_gpa2kva(link, vdir->vd_addr);
+			if (vindir == NULL) {
+				cmn_err(CE_NOTE, "invalid guest physical"
+				    " address 0x%"PRIx64"\n", vdir->vd_addr);
+				return (-1);
+			}
+			next = 0;
+			for (;;) {
+				vp = &vindir[next];
+				if (vp->vd_flags & VRING_DESC_F_INDIRECT) {
+					cmn_err(CE_NOTE, "indirect desc"
+					    " has INDIR flag\n");
+					return (-1);
+				}
+				if (i >= n_iov)
+					return (-1);
+				iov[i].iov_base =
+				    viona_gpa2kva(link, vp->vd_addr);
+				if (iov[i].iov_base == NULL) {
+					cmn_err(CE_NOTE, "invalid guest"
+					    " physical address 0x%"PRIx64"\n",
+					    vp->vd_addr);
+					return (-1);
+				}
+				iov[i++].iov_len = vp->vd_len;
+
+				if (i > VQ_MAX_DESCRIPTORS)
+					goto loopy;
+				if ((vp->vd_flags & VRING_DESC_F_NEXT) == 0)
+					break;
+
+				next = vp->vd_next;
+				if (next >= nindir) {
+					cmn_err(CE_NOTE, "invalid next"
+					    " %d > %d\n", next, nindir);
+					return (-1);
+				}
+			}
+		}
+		if ((vdir->vd_flags & VRING_DESC_F_NEXT) == 0) {
+			*cookie = head;
+			hq->hq_cur_aidx++;
+			return (i);
+		}
+	}
+
+loopy:
+	cmn_err(CE_NOTE, "%d > descriptor loop
count\n", i);
+
+	return (-1);
+}
+
+static void
+vq_pushchain(viona_vring_hqueue_t *hq, uint32_t len, uint16_t cookie)
+{
+	struct virtio_used	*vu;
+	int			uidx;
+
+	uidx = *hq->hq_used_idx;
+	vu = &hq->hq_used_ring[uidx++ & (hq->hq_size - 1)];
+	vu->vu_idx = cookie;
+	vu->vu_tlen = len;
+	membar_producer();
+	*hq->hq_used_idx = uidx;
+}
+
+static void
+vq_pushchain_mrgrx(viona_vring_hqueue_t *hq, int num_bufs, used_elem_t *elem)
+{
+	struct virtio_used	*vu;
+	int			uidx;
+	int			i;
+
+	uidx = *hq->hq_used_idx;
+	if (num_bufs == 1) {
+		vu = &hq->hq_used_ring[uidx++ & (hq->hq_size - 1)];
+		vu->vu_idx = elem[0].id;
+		vu->vu_tlen = elem[0].len;
+	} else {
+		for (i = 0; i < num_bufs; i++) {
+			vu = &hq->hq_used_ring[(uidx + i) & (hq->hq_size - 1)];
+			vu->vu_idx = elem[i].id;
+			vu->vu_tlen = elem[i].len;
+		}
+		uidx = uidx + num_bufs;
+	}
+	membar_producer();
+	*hq->hq_used_idx = uidx;
+}
+
+/*
+ * Copy bytes from the mblk chain 'mp' into the guest buffer described by
+ * one iov entry.
+ *
+ * copied_buf:	total number of bytes of 'mp' already copied into earlier
+ *		iov entries; used to skip that prefix of the chain and as
+ *		the read offset into the current mblk.
+ * buf:		destination pointer (iov_base, possibly advanced past the
+ *		virtio header).
+ * i:		index of this entry in the iov array, mainly used to tell
+ *		whether we are dealing with the first entry.
+ * rxhdr_size:	size of the virtio header; the merged-RX (MRGRX) variant
+ *		carries 2 additional bytes. With MRGRX the virtio header
+ *		must be part of iov[0]; without it, the header may or may
+ *		not share iov[0] with packet data.
+ *
+ * Returns the number of bytes copied into this iov entry.
+ */
+static int
+copy_in_mblk(mblk_t *mp, int copied_buf, caddr_t buf, struct iovec *iov,
+    int i, int rxhdr_size)
+{
+	int copied_chunk = 0;
+	mblk_t *ml;
+	int total_buf_len = iov->iov_len;
+	/*
+	 * iov[0] may contain the virtio header; adjust
+	 * total_buf_len accordingly.
+	 */
+	if (i == 0) {
+		total_buf_len = iov->iov_len - rxhdr_size;
+	}
+	for (ml = mp; ml != NULL; ml = ml->b_cont) {
+		size_t chunk = MBLKL(ml);
+		/*
+		 * Skip over any leading mblks (and the consumed
+		 * portion of a partially copied mblk) that were
+		 * already handled by earlier iov entries.
+		 */
+		if (copied_buf != 0) {
+			if (copied_buf < chunk) {
+				chunk -= copied_buf;
+			} else {
+				copied_buf -= chunk;
+				continue;
+			}
+		}
+		/*
+		 * Stop once this iov entry is full (for iov[0],
+		 * the virtio header already occupies part of it).
+		 */
+		if (copied_chunk == total_buf_len) {
+			break;
+		}
+		/*
+		 * Sometimes chunk is the whole mblk's length; sometimes an
+		 * mblk is divided across multiple chunks.
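+		 * Two cursors are in play: copied_buf tracks how far into
+		 * the mblk chain we are reading, while copied_chunk tracks
+		 * how far into this iov entry we are writing, so chunk must
+		 * be clamped to whichever of the source mblk or the
+		 * remaining (total_buf_len - copied_chunk) destination
+		 * bytes runs out first.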
+		 */
+		if (chunk > copied_buf) {
+			if (chunk > copied_chunk) {
+				if ((chunk + copied_chunk) > total_buf_len)
+					chunk = (size_t)total_buf_len
+					    - copied_chunk;
+			} else {
+				/* Clamp to the space left in this entry */
+				if (chunk > (total_buf_len - copied_chunk))
+					chunk = (size_t)(total_buf_len
+					    - copied_chunk);
+			}
+			bcopy(ml->b_rptr + copied_buf, buf, chunk);
+		} else {
+			if (chunk > (total_buf_len - copied_chunk)) {
+				chunk = (size_t)(total_buf_len - copied_chunk);
+			}
+			bcopy(ml->b_rptr + copied_buf, buf, chunk);
+		}
+		buf += chunk;
+		copied_chunk += chunk;
+	}
+	return (copied_chunk);
+}
+
+static void
+viona_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
+    boolean_t loopback)
+{
+	viona_link_t *link = arg;
+	viona_vring_hqueue_t *hq = &link->l_rx_vring;
+	mblk_t *mp0 = mp;
+
+	while (viona_hq_num_avail(hq)) {
+		struct iovec iov[VTNET_MAXSEGS];
+		size_t mblklen;
+		int n, i = 0;
+		uint16_t cookie;
+		struct virtio_net_hdr *vrx;
+		struct virtio_net_mrgrxhdr *vmrgrx;
+		mblk_t *ml;
+		caddr_t buf;
+		int total_len = 0;
+		int copied_buf = 0;
+		int num_bufs = 0;
+		int num_pops = 0;
+		used_elem_t uelem[VTNET_MAXSEGS];
+
+		if (mp == NULL) {
+			break;
+		}
+		mblklen = msgsize(mp);
+		if (mblklen == 0) {
+			break;
+		}
+
+		mutex_enter(&hq->hq_a_mutex);
+		n = vq_popchain(link, hq, iov, VTNET_MAXSEGS, &cookie);
+		mutex_exit(&hq->hq_a_mutex);
+		if (n <= 0) {
+			break;
+		}
+		num_pops++;
+		if (link->l_features & VIRTIO_NET_F_MRG_RXBUF) {
+			int total_n = n;
+			int mrgrxhdr_size = sizeof (struct virtio_net_mrgrxhdr);
+			/*
+			 * Get a pointer to the rx header, and use the
+			 * data immediately following it for the packet buffer.
+			 */
+			vmrgrx = (struct virtio_net_mrgrxhdr *)iov[0].iov_base;
+			if (n == 1) {
+				buf = iov[0].iov_base + mrgrxhdr_size;
+			}
+			while (mblklen > copied_buf) {
+				if (total_n == i) {
+					mutex_enter(&hq->hq_a_mutex);
+					n = vq_popchain(link, hq, &iov[i],
+					    VTNET_MAXSEGS, &cookie);
+					mutex_exit(&hq->hq_a_mutex);
+					if (n <= 0) {
+						freemsgchain(mp0);
+						return;
+					}
+					num_pops++;
+					total_n += n;
+				}
+				if (total_n > i) {
+					int copied_chunk = 0;
+					if (i != 0) {
+						buf = iov[i].iov_base;
+					}
+					copied_chunk = copy_in_mblk(mp,
+					    copied_buf, buf, &iov[i], i,
+					    mrgrxhdr_size);
+					copied_buf += copied_chunk;
+					uelem[i].id = cookie;
+					uelem[i].len = copied_chunk;
+					if (i == 0) {
+						uelem[i].len += mrgrxhdr_size;
+					}
+				}
+				num_bufs++;
+				i++;
+			}
+		} else {
+			boolean_t virt_hdr_incl_iov = B_FALSE;
+			int rxhdr_size = sizeof (struct virtio_net_hdr);
+			/* First element is header */
+			vrx = (struct virtio_net_hdr *)iov[0].iov_base;
+			if (n == 1 || iov[0].iov_len > rxhdr_size) {
+				buf = iov[0].iov_base + rxhdr_size;
+				virt_hdr_incl_iov = B_TRUE;
+				total_len += rxhdr_size;
+				if (iov[0].iov_len < rxhdr_size) {
+					/*
+					 * Buffer too small to fit the
+					 * packet; drop it.
+					 */
+					freemsgchain(mp0);
+					return;
+				}
+			} else {
+				total_len = iov[0].iov_len;
+			}
+			if (iov[0].iov_len == rxhdr_size)
+				i++;
+			while (mblklen > copied_buf) {
+				if (n > i) {
+					int copied_chunk = 0;
+					if (i != 0) {
+						buf = iov[i].iov_base;
+					}
+					/*
+					 * In the non-mrgrx case, the first
+					 * descriptor always has the header
+					 * and the rest of the descriptors
+					 * carry data. But it is not
+					 * guaranteed that the first
+					 * descriptor holds only the virtio
+					 * header; it might also carry data.
+					 */
+					if (virt_hdr_incl_iov) {
+						copied_chunk = copy_in_mblk(mp,
+						    copied_buf, buf, &iov[i],
+						    i, rxhdr_size);
+					} else {
+						copied_chunk = copy_in_mblk(mp,
+						    copied_buf, buf, &iov[i],
+						    i, 0);
+					}
+					copied_buf += copied_chunk;
+					total_len += copied_chunk;
+				} else {
+					/*
+					 * Drop the packet as it can't fit
+					 * in the buffers provided by the
+					 * guest.
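+					 *
+					 * Without VIRTIO_NET_F_MRG_RXBUF the
+					 * guest provides no way to spread one
+					 * frame across several descriptor
+					 * chains, so an oversize frame has to
+					 * be dropped; the merged-RX path above
+					 * instead pops additional chains and
+					 * reports how many were used in
+					 * vrh_bufs.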
+ */ + freemsgchain(mp0); + return; + } + i++; + } + } + /* + * The only valid field in the rx packet header is the + * number of buffers, which is always 1 without TSO + * support. + */ + if (link->l_features & VIRTIO_NET_F_MRG_RXBUF) { + memset(vmrgrx, 0, sizeof (struct virtio_net_mrgrxhdr)); + vmrgrx->vrh_bufs = num_bufs; + /* + * Make sure iov[0].iov_len >= MIN_BUF_SIZE + * otherwise guest will consider it as invalid frame. + */ + if (num_bufs == 1 && uelem[0].len < MIN_BUF_SIZE) { + uelem[0].len = MIN_BUF_SIZE; + } + /* + * Release this chain and handle more chains. + */ + mutex_enter(&hq->hq_u_mutex); + vq_pushchain_mrgrx(hq, num_pops, uelem); + mutex_exit(&hq->hq_u_mutex); + } else { + memset(vrx, 0, sizeof (struct virtio_net_hdr)); + if (total_len < MIN_BUF_SIZE) { + total_len = MIN_BUF_SIZE; + } + /* + * Release this chain and handle more chains. + */ + mutex_enter(&hq->hq_u_mutex); + vq_pushchain(hq, total_len, cookie); + mutex_exit(&hq->hq_u_mutex); + } + + mp = mp->b_next; + } + + if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) { + if (atomic_cas_uint(&link->l_rx_intr, 0, 1) == 0) { + pollwakeup(&link->l_pollhead, POLLIN); + } + } + + freemsgchain(mp0); +} + +static void +viona_desb_free(viona_desb_t *dp) +{ + viona_link_t *link; + viona_vring_hqueue_t *hq; + struct virtio_used *vu; + int uidx; + uint_t ref; + + ref = atomic_dec_uint_nv(&dp->d_ref); + if (ref != 0) + return; + + link = dp->d_link; + hq = &link->l_tx_vring; + + mutex_enter(&hq->hq_u_mutex); + vq_pushchain(hq, dp->d_len, dp->d_cookie); + mutex_exit(&hq->hq_u_mutex); + + kmem_cache_free(link->l_desb_kmc, dp); + + if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) { + if (atomic_cas_uint(&link->l_tx_intr, 0, 1) == 0) { + pollwakeup(&link->l_pollhead, POLLOUT); + } + } + if (copy_tx_mblks) { + mutex_enter(&link->l_tx_mutex); + if (--link->l_tx_outstanding == 0) { + cv_broadcast(&link->l_tx_cv); + } + mutex_exit(&link->l_tx_mutex); + } +} + +static void +viona_tx(viona_link_t *link, viona_vring_hqueue_t *hq) +{ + struct iovec iov[VTNET_MAXSEGS]; + uint16_t cookie; + int i, n; + mblk_t *mp_head, *mp_tail, *mp; + viona_desb_t *dp; + mac_client_handle_t link_mch = link->l_mch; + + mp_head = mp_tail = NULL; + + mutex_enter(&hq->hq_a_mutex); + n = vq_popchain(link, hq, iov, VTNET_MAXSEGS, &cookie); + mutex_exit(&hq->hq_a_mutex); + ASSERT(n != 0); + + dp = kmem_cache_alloc(link->l_desb_kmc, KM_SLEEP); + dp->d_frtn.free_func = viona_desb_free; + dp->d_frtn.free_arg = (void *)dp; + dp->d_link = link; + dp->d_cookie = cookie; + + dp->d_ref = 0; + dp->d_len = iov[0].iov_len; + + for (i = 1; i < n; i++) { + dp->d_ref++; + dp->d_len += iov[i].iov_len; + if (copy_tx_mblks) { + mp = desballoc((uchar_t *)iov[i].iov_base, + iov[i].iov_len, BPRI_MED, &dp->d_frtn); + ASSERT(mp); + } else { + mp = allocb(iov[i].iov_len, BPRI_MED); + ASSERT(mp); + bcopy((uchar_t *)iov[i].iov_base, mp->b_wptr, + iov[i].iov_len); + } + mp->b_wptr += iov[i].iov_len; + if (mp_head == NULL) { + ASSERT(mp_tail == NULL); + mp_head = mp; + } else { + ASSERT(mp_tail != NULL); + mp_tail->b_cont = mp; + } + mp_tail = mp; + } + if (copy_tx_mblks == B_FALSE) { + viona_desb_free(dp); + } + if (copy_tx_mblks) { + mutex_enter(&link->l_tx_mutex); + link->l_tx_outstanding++; + mutex_exit(&link->l_tx_mutex); + } + mac_tx(link_mch, mp_head, 0, MAC_DROP_ON_NO_DESC, NULL); +} diff --git a/usr/src/uts/i86pc/io/viona/viona.conf b/usr/src/uts/i86pc/io/viona/viona.conf new file mode 100644 index 0000000000..e66488531a --- /dev/null +++ 
b/usr/src/uts/i86pc/io/viona/viona.conf @@ -0,0 +1,14 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# Copyright 2013 Pluribus Networks Inc. +# + +name="viona" parent="pseudo"; diff --git a/usr/src/uts/i86pc/io/vmm/amd/amdv.c b/usr/src/uts/i86pc/io/vmm/amd/amdv.c new file mode 100644 index 0000000000..6b62daae6c --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/amdv.c @@ -0,0 +1,271 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/amd/amdv.c 245678 2013-01-20 03:42:49Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. 
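+ *
+ * Note: every vmm_ops and iommu_ops entry point in this file is an
+ * unimplemented stub which merely logs "not implemented" and fails;
+ * presumably these exist so the common VMM code has a vmm_ops_amd (and,
+ * under __FreeBSD__, an iommu_ops_amd) table to link against until real
+ * AMD SVM support is ported.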
+ */ + +#include +__FBSDID("$FreeBSD: head/sys/amd64/vmm/amd/amdv.c 245678 2013-01-20 03:42:49Z neel $"); + +#include +#include +#include +#include + +#include +#ifdef __FreeBSD__ +#include "io/iommu.h" +#endif + +static int +amdv_init(void) +{ + + printf("amdv_init: not implemented\n"); + return (ENXIO); +} + +static int +amdv_cleanup(void) +{ + + printf("amdv_cleanup: not implemented\n"); + return (ENXIO); +} + +static void * +amdv_vminit(struct vm *vm) +{ + + printf("amdv_vminit: not implemented\n"); + return (NULL); +} + +static int +amdv_vmrun(void *arg, int vcpu, register_t rip) +{ + + printf("amdv_vmrun: not implemented\n"); + return (ENXIO); +} + +static void +amdv_vmcleanup(void *arg) +{ + + printf("amdv_vmcleanup: not implemented\n"); + return; +} + +static int +amdv_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, + vm_memattr_t attr, int prot, boolean_t spok) +{ + + printf("amdv_vmmmap_set: not implemented\n"); + return (EINVAL); +} + +static vm_paddr_t +amdv_vmmmap_get(void *arg, vm_paddr_t gpa) +{ + + printf("amdv_vmmmap_get: not implemented\n"); + return (EINVAL); +} + +static int +amdv_getreg(void *arg, int vcpu, int regnum, uint64_t *retval) +{ + + printf("amdv_getreg: not implemented\n"); + return (EINVAL); +} + +static int +amdv_setreg(void *arg, int vcpu, int regnum, uint64_t val) +{ + + printf("amdv_setreg: not implemented\n"); + return (EINVAL); +} + +static int +amdv_getdesc(void *vmi, int vcpu, int num, struct seg_desc *desc) +{ + + printf("amdv_get_desc: not implemented\n"); + return (EINVAL); +} + +static int +amdv_setdesc(void *vmi, int vcpu, int num, struct seg_desc *desc) +{ + + printf("amdv_get_desc: not implemented\n"); + return (EINVAL); +} + +static int +amdv_getcap(void *arg, int vcpu, int type, int *retval) +{ + + printf("amdv_getcap: not implemented\n"); + return (EINVAL); +} + +static int +amdv_setcap(void *arg, int vcpu, int type, int val) +{ + + printf("amdv_setcap: not implemented\n"); + return (EINVAL); +} + +struct vmm_ops vmm_ops_amd = { + amdv_init, + amdv_cleanup, + amdv_vminit, + amdv_vmrun, + amdv_vmcleanup, + amdv_vmmmap_set, + amdv_vmmmap_get, + amdv_getreg, + amdv_setreg, + amdv_getdesc, + amdv_setdesc, + amdv_getcap, + amdv_setcap +}; + +static int +amd_iommu_init(void) +{ + + printf("amd_iommu_init: not implemented\n"); + return (ENXIO); +} + +static void +amd_iommu_cleanup(void) +{ + + printf("amd_iommu_cleanup: not implemented\n"); +} + +static void +amd_iommu_enable(void) +{ + + printf("amd_iommu_enable: not implemented\n"); +} + +static void +amd_iommu_disable(void) +{ + + printf("amd_iommu_disable: not implemented\n"); +} + +static void * +amd_iommu_create_domain(vm_paddr_t maxaddr) +{ + + printf("amd_iommu_create_domain: not implemented\n"); + return (NULL); +} + +static void +amd_iommu_destroy_domain(void *domain) +{ + + printf("amd_iommu_destroy_domain: not implemented\n"); +} + +static uint64_t +amd_iommu_create_mapping(void *domain, vm_paddr_t gpa, vm_paddr_t hpa, + uint64_t len) +{ + + printf("amd_iommu_create_mapping: not implemented\n"); + return (0); +} + +static uint64_t +amd_iommu_remove_mapping(void *domain, vm_paddr_t gpa, uint64_t len) +{ + + printf("amd_iommu_remove_mapping: not implemented\n"); + return (0); +} + +static void +amd_iommu_add_device(void *domain, int bus, int slot, int func) +{ + + printf("amd_iommu_add_device: not implemented\n"); +} + +static void +amd_iommu_remove_device(void *domain, int bus, int slot, int func) +{ + + printf("amd_iommu_remove_device: not implemented\n"); +} + 
+static void +amd_iommu_invalidate_tlb(void *domain) +{ + + printf("amd_iommu_invalidate_tlb: not implemented\n"); +} + +#ifdef __FreeBSD__ +struct iommu_ops iommu_ops_amd = { + amd_iommu_init, + amd_iommu_cleanup, + amd_iommu_enable, + amd_iommu_disable, + amd_iommu_create_domain, + amd_iommu_destroy_domain, + amd_iommu_create_mapping, + amd_iommu_remove_mapping, + amd_iommu_add_device, + amd_iommu_remove_device, + amd_iommu_invalidate_tlb, +}; +#endif diff --git a/usr/src/uts/i86pc/io/vmm/intel/ept.c b/usr/src/uts/i86pc/io/vmm/intel/ept.c new file mode 100644 index 0000000000..5ae9ed2f6a --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/ept.c @@ -0,0 +1,452 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/intel/ept.c 252475 2013-07-01 20:05:43Z grehan $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. 
+ */
+
+#include
+__FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/ept.c 252475 2013-07-01 20:05:43Z grehan $");
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+#include "vmx_cpufunc.h"
+#include "vmx.h"
+#include "ept.h"
+
+#define	EPT_PWL4(cap)			((cap) & (1UL << 6))
+#define	EPT_MEMORY_TYPE_WB(cap)		((cap) & (1UL << 14))
+#define	EPT_PDE_SUPERPAGE(cap)		((cap) & (1UL << 16))	/* 2MB pages */
+#define	EPT_PDPTE_SUPERPAGE(cap)	((cap) & (1UL << 17))	/* 1GB pages */
+#define	INVVPID_SUPPORTED(cap)		((cap) & (1UL << 32))
+#define	INVEPT_SUPPORTED(cap)		((cap) & (1UL << 20))
+
+#define	INVVPID_ALL_TYPES_MASK		0xF0000000000UL
+#define	INVVPID_ALL_TYPES_SUPPORTED(cap) \
+	(((cap) & INVVPID_ALL_TYPES_MASK) == INVVPID_ALL_TYPES_MASK)
+
+#define	INVEPT_ALL_TYPES_MASK		0x6000000UL
+#define	INVEPT_ALL_TYPES_SUPPORTED(cap) \
+	(((cap) & INVEPT_ALL_TYPES_MASK) == INVEPT_ALL_TYPES_MASK)
+
+#define	EPT_PG_RD			(1 << 0)
+#define	EPT_PG_WR			(1 << 1)
+#define	EPT_PG_EX			(1 << 2)
+#define	EPT_PG_MEMORY_TYPE(x)		((x) << 3)
+#define	EPT_PG_IGNORE_PAT		(1 << 6)
+#define	EPT_PG_SUPERPAGE		(1 << 7)
+
+#define	EPT_ADDR_MASK			((uint64_t)-1 << 12)
+
+MALLOC_DECLARE(M_VMX);
+
+static uint64_t page_sizes_mask;
+
+/*
+ * Set this to 1 to have the EPT tables respect the guest PAT settings
+ */
+static int ept_pat_passthru;
+
+int
+ept_init(void)
+{
+	int page_shift;
+	uint64_t cap;
+
+	cap = rdmsr(MSR_VMX_EPT_VPID_CAP);
+
+	/*
+	 * Verify that:
+	 * - page walk length is 4 steps
+	 * - extended page tables can be laid out in write-back memory
+	 * - invvpid instruction with all possible types is supported
+	 * - invept instruction with all possible types is supported
+	 */
+	if (!EPT_PWL4(cap) ||
+	    !EPT_MEMORY_TYPE_WB(cap) ||
+	    !INVVPID_SUPPORTED(cap) ||
+	    !INVVPID_ALL_TYPES_SUPPORTED(cap) ||
+	    !INVEPT_SUPPORTED(cap) ||
+	    !INVEPT_ALL_TYPES_SUPPORTED(cap))
+		return (EINVAL);
+
+	/* Set bits in 'page_sizes_mask' for each valid page size */
+	page_shift = PAGE_SHIFT;
+	page_sizes_mask = 1UL << page_shift;		/* 4KB page */
+
+	page_shift += 9;
+	if (EPT_PDE_SUPERPAGE(cap))
+		page_sizes_mask |= 1UL << page_shift;	/* 2MB superpage */
+
+	page_shift += 9;
+	if (EPT_PDPTE_SUPERPAGE(cap))
+		page_sizes_mask |= 1UL << page_shift;	/* 1GB superpage */
+
+	return (0);
+}
+
+#if 0
+static void
+ept_dump(uint64_t *ptp, int nlevels)
+{
+	int i, t, tabs;
+	uint64_t *ptpnext, ptpval;
+
+	if (--nlevels < 0)
+		return;
+
+	tabs = 3 - nlevels;
+	for (t = 0; t < tabs; t++)
+		printf("\t");
+	printf("PTP = %p\n", ptp);
+
+	for (i = 0; i < 512; i++) {
+		ptpval = ptp[i];
+
+		if (ptpval == 0)
+			continue;
+
+		for (t = 0; t < tabs; t++)
+			printf("\t");
+		printf("%3d 0x%016lx\n", i, ptpval);
+
+		if (nlevels != 0 && (ptpval & EPT_PG_SUPERPAGE) == 0) {
+			ptpnext = (uint64_t *)
+			    PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK);
+			ept_dump(ptpnext, nlevels);
+		}
+	}
+}
+#endif
+
+static size_t
+ept_create_mapping(uint64_t *ptp, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
+    vm_memattr_t attr, vm_prot_t prot, boolean_t spok)
+{
+	int spshift, ptpshift, ptpindex, nlevels;
+
+	/*
+	 * Compute the size of the mapping that we can accommodate.
+ * + * This is based on three factors: + * - super page sizes supported by the processor + * - alignment of the region starting at 'gpa' and 'hpa' + * - length of the region 'len' + */ + spshift = PAGE_SHIFT; + if (spok) + spshift += (EPT_PWLEVELS - 1) * 9; + while (spshift >= PAGE_SHIFT) { + uint64_t spsize = 1UL << spshift; + if ((page_sizes_mask & spsize) != 0 && + (gpa & (spsize - 1)) == 0 && + (hpa & (spsize - 1)) == 0 && + length >= spsize) { + break; + } + spshift -= 9; + } + + if (spshift < PAGE_SHIFT) { + panic("Invalid spshift for gpa 0x%016lx, hpa 0x%016lx, " + "length 0x%016lx, page_sizes_mask 0x%016lx", + gpa, hpa, length, page_sizes_mask); + } + + nlevels = EPT_PWLEVELS; + while (--nlevels >= 0) { + ptpshift = PAGE_SHIFT + nlevels * 9; + ptpindex = (gpa >> ptpshift) & 0x1FF; + + /* We have reached the leaf mapping */ + if (spshift >= ptpshift) + break; + + /* + * We are working on a non-leaf page table page. + * + * Create the next level page table page if necessary and point + * to it from the current page table. + */ + if (ptp[ptpindex] == 0) { +#ifdef __FreeBSD__ + void *nlp = malloc(PAGE_SIZE, M_VMX, M_WAITOK | M_ZERO); +#else + void *nlp = kmem_zalloc(PAGE_SIZE, KM_SLEEP); + ASSERT((((uintptr_t)nlp) & PAGE_MASK) == 0); +#endif + ptp[ptpindex] = vtophys(nlp); + ptp[ptpindex] |= EPT_PG_RD | EPT_PG_WR | EPT_PG_EX; + } + + /* Work our way down to the next level page table page */ +#ifdef __FreeBSD__ + ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & EPT_ADDR_MASK); +#else + ptp = (uint64_t *)hat_kpm_pfn2va(btop(ptp[ptpindex] & EPT_ADDR_MASK)); +#endif + } + + if ((gpa & ((1UL << ptpshift) - 1)) != 0) { + panic("ept_create_mapping: gpa 0x%016lx and ptpshift %d " + "mismatch\n", gpa, ptpshift); + } + + if (prot != VM_PROT_NONE) { + /* Do the mapping */ + ptp[ptpindex] = hpa; + + /* Apply the access controls */ + if (prot & VM_PROT_READ) + ptp[ptpindex] |= EPT_PG_RD; + if (prot & VM_PROT_WRITE) + ptp[ptpindex] |= EPT_PG_WR; + if (prot & VM_PROT_EXECUTE) + ptp[ptpindex] |= EPT_PG_EX; + + /* + * By default the PAT type is ignored - this appears to + * be how other hypervisors handle EPT. Allow this to be + * overridden. 
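+		 * (With EPT_PG_IGNORE_PAT set, the effective memory type is
+		 * taken solely from the EPT_PG_MEMORY_TYPE bits of the leaf
+		 * entry; with ept_pat_passthru nonzero, the guest's PAT is
+		 * instead allowed to combine with it under the usual
+		 * IA32/EPT memory-type rules.)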
+		 */
+		ptp[ptpindex] |= EPT_PG_MEMORY_TYPE(attr);
+		if (!ept_pat_passthru)
+			ptp[ptpindex] |= EPT_PG_IGNORE_PAT;
+
+		if (nlevels > 0)
+			ptp[ptpindex] |= EPT_PG_SUPERPAGE;
+	} else {
+		/* Remove the mapping */
+		ptp[ptpindex] = 0;
+	}
+
+	return (1UL << ptpshift);
+}
+
+static vm_paddr_t
+ept_lookup_mapping(uint64_t *ptp, vm_paddr_t gpa)
+{
+	int nlevels, ptpshift, ptpindex;
+	uint64_t ptpval, hpabase, pgmask;
+
+	nlevels = EPT_PWLEVELS;
+	while (--nlevels >= 0) {
+		ptpshift = PAGE_SHIFT + nlevels * 9;
+		ptpindex = (gpa >> ptpshift) & 0x1FF;
+
+		ptpval = ptp[ptpindex];
+
+		/* Cannot make progress beyond this point */
+		if ((ptpval & (EPT_PG_RD | EPT_PG_WR | EPT_PG_EX)) == 0)
+			break;
+
+		if (nlevels == 0 || (ptpval & EPT_PG_SUPERPAGE)) {
+			pgmask = (1UL << ptpshift) - 1;
+			hpabase = ptpval & ~pgmask;
+			return (hpabase | (gpa & pgmask));
+		}
+
+		/* Work our way down to the next level page table page */
+#ifdef __FreeBSD__
+		ptp = (uint64_t *)PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK);
+#else
+		ptp = (uint64_t *)hat_kpm_pfn2va(btop(ptpval & EPT_ADDR_MASK));
+#endif
+	}
+
+	return ((vm_paddr_t)-1);
+}
+
+static void
+ept_free_pt_entry(pt_entry_t pte)
+{
+	if (pte == 0)
+		return;
+
+	/* sanity check */
+	if ((pte & EPT_PG_SUPERPAGE) != 0)
+		panic("ept_free_pt_entry: pte cannot have superpage bit");
+
+	return;
+}
+
+static void
+ept_free_pd_entry(pd_entry_t pde)
+{
+	pt_entry_t	*pt;
+	int		i;
+
+	if (pde == 0)
+		return;
+
+	if ((pde & EPT_PG_SUPERPAGE) == 0) {
+#ifdef __FreeBSD__
+		pt = (pt_entry_t *)PHYS_TO_DMAP(pde & EPT_ADDR_MASK);
+		for (i = 0; i < NPTEPG; i++)
+			ept_free_pt_entry(pt[i]);
+		free(pt, M_VMX);	/* free the page table page */
+#else
+		page_t *pp;
+		pt = (pt_entry_t *)hat_kpm_pfn2va(btop(pde & EPT_ADDR_MASK));
+		for (i = 0; i < NPTEPG; i++)
+			ept_free_pt_entry(pt[i]);
+		pp = page_numtopp_nolock(btop(pde & EPT_ADDR_MASK));
+		kmem_free((void *)pp->p_offset, PAGE_SIZE);
+#endif
+	}
+}
+
+static void
+ept_free_pdp_entry(pdp_entry_t pdpe)
+{
+	pd_entry_t	*pd;
+	int		i;
+
+	if (pdpe == 0)
+		return;
+
+	if ((pdpe & EPT_PG_SUPERPAGE) == 0) {
+#ifdef __FreeBSD__
+		pd = (pd_entry_t *)PHYS_TO_DMAP(pdpe & EPT_ADDR_MASK);
+		for (i = 0; i < NPDEPG; i++)
+			ept_free_pd_entry(pd[i]);
+		free(pd, M_VMX);	/* free the page directory page */
+#else
+		page_t *pp;
+		pd = (pd_entry_t *)hat_kpm_pfn2va(btop(pdpe & EPT_ADDR_MASK));
+		for (i = 0; i < NPDEPG; i++)
+			ept_free_pd_entry(pd[i]);
+		pp = page_numtopp_nolock(btop(pdpe & EPT_ADDR_MASK));
+		kmem_free((void *)pp->p_offset, PAGE_SIZE);
+#endif
+	}
+}
+
+static void
+ept_free_pml4_entry(pml4_entry_t pml4e)
+{
+	pdp_entry_t	*pdp;
+	int		i;
+
+	if (pml4e == 0)
+		return;
+
+	if ((pml4e & EPT_PG_SUPERPAGE) == 0) {
+#ifdef __FreeBSD__
+		pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4e & EPT_ADDR_MASK);
+		for (i = 0; i < NPDPEPG; i++)
+			ept_free_pdp_entry(pdp[i]);
+		free(pdp, M_VMX);	/* free the page directory ptr page */
+#else
+		page_t *pp;
+		pdp = (pdp_entry_t *)hat_kpm_pfn2va(btop(pml4e
+		    & EPT_ADDR_MASK));
+		for (i = 0; i < NPDPEPG; i++)
+			ept_free_pdp_entry(pdp[i]);
+		pp = page_numtopp_nolock(btop(pml4e & EPT_ADDR_MASK));
+		kmem_free((void *)pp->p_offset, PAGE_SIZE);
+#endif
+	}
+}
+
+void
+ept_vmcleanup(struct vmx *vmx)
+{
+	int i;
+
+	for (i = 0; i < NPML4EPG; i++)
+		ept_free_pml4_entry(vmx->pml4ept[i]);
+}
+
+int
+ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t len,
+    vm_memattr_t attr, int prot, boolean_t spok)
+{
+	size_t n;
+	struct vmx *vmx = arg;
+
+	while (len > 0) {
+		n = ept_create_mapping(vmx->pml4ept, gpa, hpa, len, attr,
+		    prot, spok);
+		len
-= n; + gpa += n; + hpa += n; + } + + return (0); +} + +vm_paddr_t +ept_vmmmap_get(void *arg, vm_paddr_t gpa) +{ + vm_paddr_t hpa; + struct vmx *vmx; + + vmx = arg; + hpa = ept_lookup_mapping(vmx->pml4ept, gpa); + return (hpa); +} + +static void +invept_single_context(void *arg) +{ + struct invept_desc desc = *(struct invept_desc *)arg; + + invept(INVEPT_TYPE_SINGLE_CONTEXT, desc); +} + +void +ept_invalidate_mappings(u_long pml4ept) +{ + struct invept_desc invept_desc = { 0 }; + + invept_desc.eptp = EPTP(pml4ept); + + smp_rendezvous(NULL, invept_single_context, NULL, &invept_desc); +} diff --git a/usr/src/uts/i86pc/io/vmm/intel/ept.h b/usr/src/uts/i86pc/io/vmm/intel/ept.h new file mode 100644 index 0000000000..d0bcce7ec3 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/ept.h @@ -0,0 +1,43 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/intel/ept.h 245678 2013-01-20 03:42:49Z neel $ + */ + +#ifndef _EPT_H_ +#define _EPT_H_ + +struct vmx; + +#define EPT_PWLEVELS 4 /* page walk levels */ +#define EPTP(pml4) ((pml4) | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK) + +int ept_init(void); +int ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, + vm_memattr_t attr, int prot, boolean_t allow_superpage_mappings); +vm_paddr_t ept_vmmmap_get(void *arg, vm_paddr_t gpa); +void ept_invalidate_mappings(u_long ept_pml4); +void ept_vmcleanup(struct vmx *vmx); +#endif diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmcs.c b/usr/src/uts/i86pc/io/vmm/intel/vmcs.c new file mode 100644 index 0000000000..bbd2da2a34 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/vmcs.c @@ -0,0 +1,597 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/intel/vmcs.c 266550 2014-05-22 17:22:37Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifdef __FreeBSD__ +#include "opt_ddb.h" +#endif + +#include +__FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/vmcs.c 266550 2014-05-22 17:22:37Z neel $"); + +#include +#include +#include + +#include +#include + +#include +#include +#include "vmm_host.h" +#include "vmx_cpufunc.h" +#include "vmcs.h" +#include "ept.h" +#include "vmx.h" + +#ifdef DDB +#include +#endif + +static uint64_t +vmcs_fix_regval(uint32_t encoding, uint64_t val) +{ + + switch (encoding) { + case VMCS_GUEST_CR0: + val = vmx_fix_cr0(val); + break; + case VMCS_GUEST_CR4: + val = vmx_fix_cr4(val); + break; + default: + break; + } + return (val); +} + +static uint32_t +vmcs_field_encoding(int ident) +{ + switch (ident) { + case VM_REG_GUEST_CR0: + return (VMCS_GUEST_CR0); + case VM_REG_GUEST_CR3: + return (VMCS_GUEST_CR3); + case VM_REG_GUEST_CR4: + return (VMCS_GUEST_CR4); + case VM_REG_GUEST_DR7: + return (VMCS_GUEST_DR7); + case VM_REG_GUEST_RSP: + return (VMCS_GUEST_RSP); + case VM_REG_GUEST_RIP: + return (VMCS_GUEST_RIP); + case VM_REG_GUEST_RFLAGS: + return (VMCS_GUEST_RFLAGS); + case VM_REG_GUEST_ES: + return (VMCS_GUEST_ES_SELECTOR); + case VM_REG_GUEST_CS: + return (VMCS_GUEST_CS_SELECTOR); + case VM_REG_GUEST_SS: + return (VMCS_GUEST_SS_SELECTOR); + case VM_REG_GUEST_DS: + return (VMCS_GUEST_DS_SELECTOR); + case VM_REG_GUEST_FS: + return (VMCS_GUEST_FS_SELECTOR); + case VM_REG_GUEST_GS: + return (VMCS_GUEST_GS_SELECTOR); + case VM_REG_GUEST_TR: + return (VMCS_GUEST_TR_SELECTOR); + case VM_REG_GUEST_LDTR: + return (VMCS_GUEST_LDTR_SELECTOR); + case VM_REG_GUEST_EFER: + return (VMCS_GUEST_IA32_EFER); + default: + return (-1); + } + +} + +static int +vmcs_seg_desc_encoding(int seg, uint32_t *base, uint32_t *lim, uint32_t *acc) +{ + + switch (seg) { + case VM_REG_GUEST_ES: + *base = VMCS_GUEST_ES_BASE; + *lim = VMCS_GUEST_ES_LIMIT; + *acc = VMCS_GUEST_ES_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_CS: + *base = VMCS_GUEST_CS_BASE; + *lim = VMCS_GUEST_CS_LIMIT; + *acc = VMCS_GUEST_CS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_SS: + *base = VMCS_GUEST_SS_BASE; + *lim = VMCS_GUEST_SS_LIMIT; + *acc = VMCS_GUEST_SS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_DS: + *base = VMCS_GUEST_DS_BASE; + *lim = VMCS_GUEST_DS_LIMIT; 
+ *acc = VMCS_GUEST_DS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_FS: + *base = VMCS_GUEST_FS_BASE; + *lim = VMCS_GUEST_FS_LIMIT; + *acc = VMCS_GUEST_FS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_GS: + *base = VMCS_GUEST_GS_BASE; + *lim = VMCS_GUEST_GS_LIMIT; + *acc = VMCS_GUEST_GS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_TR: + *base = VMCS_GUEST_TR_BASE; + *lim = VMCS_GUEST_TR_LIMIT; + *acc = VMCS_GUEST_TR_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_LDTR: + *base = VMCS_GUEST_LDTR_BASE; + *lim = VMCS_GUEST_LDTR_LIMIT; + *acc = VMCS_GUEST_LDTR_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_IDTR: + *base = VMCS_GUEST_IDTR_BASE; + *lim = VMCS_GUEST_IDTR_LIMIT; + *acc = VMCS_INVALID_ENCODING; + break; + case VM_REG_GUEST_GDTR: + *base = VMCS_GUEST_GDTR_BASE; + *lim = VMCS_GUEST_GDTR_LIMIT; + *acc = VMCS_INVALID_ENCODING; + break; + default: + return (EINVAL); + } + + return (0); +} + +int +vmcs_getreg(struct vmcs *vmcs, int running, int ident, uint64_t *retval) +{ + int error; + uint32_t encoding; + + /* + * If we need to get at vmx-specific state in the VMCS we can bypass + * the translation of 'ident' to 'encoding' by simply setting the + * sign bit. As it so happens the upper 16 bits are reserved (i.e + * set to 0) in the encodings for the VMCS so we are free to use the + * sign bit. + */ + if (ident < 0) + encoding = ident & 0x7fffffff; + else + encoding = vmcs_field_encoding(ident); + + if (encoding == (uint32_t)-1) + return (EINVAL); + + if (!running) + VMPTRLD(vmcs); + + error = vmread(encoding, retval); + + if (!running) + VMCLEAR(vmcs); + + return (error); +} + +int +vmcs_setreg(struct vmcs *vmcs, int running, int ident, uint64_t val) +{ + int error; + uint32_t encoding; + + if (ident < 0) + encoding = ident & 0x7fffffff; + else + encoding = vmcs_field_encoding(ident); + + if (encoding == (uint32_t)-1) + return (EINVAL); + + val = vmcs_fix_regval(encoding, val); + + if (!running) + VMPTRLD(vmcs); + + error = vmwrite(encoding, val); + + if (!running) + VMCLEAR(vmcs); + + return (error); +} + +int +vmcs_setdesc(struct vmcs *vmcs, int running, int seg, struct seg_desc *desc) +{ + int error; + uint32_t base, limit, access; + + error = vmcs_seg_desc_encoding(seg, &base, &limit, &access); + if (error != 0) + panic("vmcs_setdesc: invalid segment register %d", seg); + + if (!running) + VMPTRLD(vmcs); + if ((error = vmwrite(base, desc->base)) != 0) + goto done; + + if ((error = vmwrite(limit, desc->limit)) != 0) + goto done; + + if (access != VMCS_INVALID_ENCODING) { + if ((error = vmwrite(access, desc->access)) != 0) + goto done; + } +done: + if (!running) + VMCLEAR(vmcs); + return (error); +} + +int +vmcs_getdesc(struct vmcs *vmcs, int running, int seg, struct seg_desc *desc) +{ + int error; + uint32_t base, limit, access; + uint64_t u64; + + error = vmcs_seg_desc_encoding(seg, &base, &limit, &access); + if (error != 0) + panic("vmcs_getdesc: invalid segment register %d", seg); + + if (!running) + VMPTRLD(vmcs); + if ((error = vmread(base, &u64)) != 0) + goto done; + desc->base = u64; + + if ((error = vmread(limit, &u64)) != 0) + goto done; + desc->limit = u64; + + if (access != VMCS_INVALID_ENCODING) { + if ((error = vmread(access, &u64)) != 0) + goto done; + desc->access = u64; + } +done: + if (!running) + VMCLEAR(vmcs); + return (error); +} + +int +vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count) +{ + int error; + + VMPTRLD(vmcs); + + /* + * Guest MSRs are saved in the VM-exit MSR-store area. + * Guest MSRs are loaded from the VM-entry MSR-load area. 
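+ * Each area is an array of 'struct msr_entry' records, 'g_count'
+ * entries long (the layout is defined in vmcs.h).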
+ * Both areas point to the same location in memory. + */ + if ((error = vmwrite(VMCS_EXIT_MSR_STORE, g_area)) != 0) + goto done; + if ((error = vmwrite(VMCS_EXIT_MSR_STORE_COUNT, g_count)) != 0) + goto done; + + if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD, g_area)) != 0) + goto done; + if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD_COUNT, g_count)) != 0) + goto done; + + error = 0; +done: + VMCLEAR(vmcs); + return (error); +} + +#ifndef __FreeBSD__ +int +vmcs_set_host_msr_save(struct vmcs *vmcs, u_long h_area, u_int h_count) +{ + int error; + + VMPTRLD(vmcs); + + /* + * Host MSRs are loaded from the VM-exit MSR-load area. + */ + if ((error = vmwrite(VMCS_EXIT_MSR_LOAD, h_area)) != 0) + goto done; + if ((error = vmwrite(VMCS_EXIT_MSR_LOAD_COUNT, h_count)) != 0) + goto done; + + error = 0; +done: + VMCLEAR(vmcs); + return (error); +} +#endif + +int +vmcs_set_defaults(struct vmcs *vmcs, + u_long host_rip, u_long host_rsp, u_long ept_pml4, + uint32_t pinbased_ctls, uint32_t procbased_ctls, + uint32_t procbased_ctls2, uint32_t exit_ctls, + uint32_t entry_ctls, u_long msr_bitmap, uint16_t vpid) +{ + int error, codesel, datasel, tsssel; + u_long cr0, cr4, efer; + uint64_t eptp, pat, fsbase, idtrbase; + uint32_t exc_bitmap; + + codesel = vmm_get_host_codesel(); + datasel = vmm_get_host_datasel(); + tsssel = vmm_get_host_tsssel(); + + /* + * Make sure we have a "current" VMCS to work with. + */ + VMPTRLD(vmcs); + + /* + * Load the VMX controls + */ + if ((error = vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls)) != 0) + goto done; + if ((error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls)) != 0) + goto done; + if ((error = vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2)) != 0) + goto done; + if ((error = vmwrite(VMCS_EXIT_CTLS, exit_ctls)) != 0) + goto done; + if ((error = vmwrite(VMCS_ENTRY_CTLS, entry_ctls)) != 0) + goto done; + + /* Guest state */ + + /* Initialize guest IA32_PAT MSR with the default value */ + pat = PAT_VALUE(0, PAT_WRITE_BACK) | + PAT_VALUE(1, PAT_WRITE_THROUGH) | + PAT_VALUE(2, PAT_UNCACHED) | + PAT_VALUE(3, PAT_UNCACHEABLE) | + PAT_VALUE(4, PAT_WRITE_BACK) | + PAT_VALUE(5, PAT_WRITE_THROUGH) | + PAT_VALUE(6, PAT_UNCACHED) | + PAT_VALUE(7, PAT_UNCACHEABLE); + if ((error = vmwrite(VMCS_GUEST_IA32_PAT, pat)) != 0) + goto done; + + /* Host state */ + + /* Initialize host IA32_PAT MSR */ + pat = vmm_get_host_pat(); + if ((error = vmwrite(VMCS_HOST_IA32_PAT, pat)) != 0) + goto done; + + /* Load the IA32_EFER MSR */ + efer = vmm_get_host_efer(); + if ((error = vmwrite(VMCS_HOST_IA32_EFER, efer)) != 0) + goto done; + + /* Load the control registers */ + + cr0 = vmm_get_host_cr0(); + if ((error = vmwrite(VMCS_HOST_CR0, cr0)) != 0) + goto done; + + cr4 = vmm_get_host_cr4() | CR4_VMXE; + if ((error = vmwrite(VMCS_HOST_CR4, cr4)) != 0) + goto done; + + /* Load the segment selectors */ + if ((error = vmwrite(VMCS_HOST_ES_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_CS_SELECTOR, codesel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_SS_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_DS_SELECTOR, datasel)) != 0) + goto done; + +#ifdef __FreeBSD__ + if ((error = vmwrite(VMCS_HOST_FS_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_GS_SELECTOR, datasel)) != 0) + goto done; +#else + if ((error = vmwrite(VMCS_HOST_FS_SELECTOR, vmm_get_host_fssel())) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_GS_SELECTOR, vmm_get_host_gssel())) != 0) + goto done; +#endif + + if ((error = 
vmwrite(VMCS_HOST_TR_SELECTOR, tsssel)) != 0) + goto done; + +#ifdef __FreeBSD__ + /* + * Load the Base-Address for %fs and idtr. + * + * Note that we exclude %gs, tss and gdtr here because their base + * address is pcpu specific. + */ + fsbase = vmm_get_host_fsbase(); + if ((error = vmwrite(VMCS_HOST_FS_BASE, fsbase)) != 0) + goto done; +#endif + + idtrbase = vmm_get_host_idtrbase(); + if ((error = vmwrite(VMCS_HOST_IDTR_BASE, idtrbase)) != 0) + goto done; + + /* instruction pointer */ + if ((error = vmwrite(VMCS_HOST_RIP, host_rip)) != 0) + goto done; + + /* stack pointer */ + if ((error = vmwrite(VMCS_HOST_RSP, host_rsp)) != 0) + goto done; + + /* eptp */ + eptp = EPTP(ept_pml4); + if ((error = vmwrite(VMCS_EPTP, eptp)) != 0) + goto done; + + /* vpid */ + if ((error = vmwrite(VMCS_VPID, vpid)) != 0) + goto done; + + /* msr bitmap */ + if ((error = vmwrite(VMCS_MSR_BITMAP, msr_bitmap)) != 0) + goto done; + + /* exception bitmap */ + exc_bitmap = 1 << IDT_MC; + if ((error = vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap)) != 0) + goto done; + + /* link pointer */ + if ((error = vmwrite(VMCS_LINK_POINTER, ~0)) != 0) + goto done; +done: + VMCLEAR(vmcs); + return (error); +} + +#ifdef DDB +extern int vmxon_enabled[]; + +DB_SHOW_COMMAND(vmcs, db_show_vmcs) +{ + uint64_t cur_vmcs, val; + uint32_t exit; + + if (!vmxon_enabled[curcpu]) { + db_printf("VMX not enabled\n"); + return; + } + + if (have_addr) { + db_printf("Only current VMCS supported\n"); + return; + } + + vmptrst(&cur_vmcs); + if (cur_vmcs == VMCS_INITIAL) { + db_printf("No current VM context\n"); + return; + } + db_printf("VMCS: %jx\n", cur_vmcs); + db_printf("VPID: %lu\n", vmcs_read(VMCS_VPID)); + db_printf("Activity: "); + val = vmcs_read(VMCS_GUEST_ACTIVITY); + switch (val) { + case 0: + db_printf("Active"); + break; + case 1: + db_printf("HLT"); + break; + case 2: + db_printf("Shutdown"); + break; + case 3: + db_printf("Wait for SIPI"); + break; + default: + db_printf("Unknown: %#lx", val); + } + db_printf("\n"); + exit = vmcs_read(VMCS_EXIT_REASON); + if (exit & 0x80000000) + db_printf("Entry Failure Reason: %u\n", exit & 0xffff); + else + db_printf("Exit Reason: %u\n", exit & 0xffff); + db_printf("Qualification: %#lx\n", vmcs_exit_qualification()); + db_printf("Guest Linear Address: %#lx\n", + vmcs_read(VMCS_GUEST_LINEAR_ADDRESS)); + switch (exit & 0x8000ffff) { + case EXIT_REASON_EXCEPTION: + case EXIT_REASON_EXT_INTR: + val = vmcs_read(VMCS_EXIT_INTR_INFO); + db_printf("Interrupt Type: "); + switch (val >> 8 & 0x7) { + case 0: + db_printf("external"); + break; + case 2: + db_printf("NMI"); + break; + case 3: + db_printf("HW exception"); + break; + case 4: + db_printf("SW exception"); + break; + default: + db_printf("?? %lu", val >> 8 & 0x7); + break; + } + db_printf(" Vector: %lu", val & 0xff); + if (val & 0x800) + db_printf(" Error Code: %lx", + vmcs_read(VMCS_EXIT_INTR_ERRCODE)); + db_printf("\n"); + break; + case EXIT_REASON_EPT_FAULT: + case EXIT_REASON_EPT_MISCONFIG: + db_printf("Guest Physical Address: %#lx\n", + vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS)); + break; + } + db_printf("VM-instruction error: %#lx\n", vmcs_instruction_error()); +} +#endif diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmcs.h b/usr/src/uts/i86pc/io/vmm/intel/vmcs.h new file mode 100644 index 0000000000..20e99e8184 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/vmcs.h @@ -0,0 +1,410 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/intel/vmcs.h 276098 2014-12-23 02:14:49Z neel $ + */ + +#ifndef _VMCS_H_ +#define _VMCS_H_ + +#ifdef _KERNEL +struct vmcs { + uint32_t identifier; + uint32_t abort_code; + char _impl_specific[PAGE_SIZE - sizeof(uint32_t) * 2]; +}; +CTASSERT(sizeof(struct vmcs) == PAGE_SIZE); + +/* MSR save region is composed of an array of 'struct msr_entry' */ +struct msr_entry { + uint32_t index; + uint32_t reserved; + uint64_t val; + +}; + +int vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count); +int vmcs_init(struct vmcs *vmcs); +#ifndef __FreeBSD__ +int vmcs_set_host_msr_save(struct vmcs *vmcs, u_long h_area, u_int h_count); +#endif +int vmcs_set_defaults(struct vmcs *vmcs, u_long host_rip, u_long host_rsp, + u_long ept_pml4, + uint32_t pinbased_ctls, uint32_t procbased_ctls, + uint32_t procbased_ctls2, uint32_t exit_ctls, + uint32_t entry_ctls, u_long msr_bitmap, + uint16_t vpid); +int vmcs_getreg(struct vmcs *vmcs, int running, int ident, uint64_t *rv); +int vmcs_setreg(struct vmcs *vmcs, int running, int ident, uint64_t val); +int vmcs_getdesc(struct vmcs *vmcs, int running, int ident, + struct seg_desc *desc); +int vmcs_setdesc(struct vmcs *vmcs, int running, int ident, + struct seg_desc *desc); + +/* + * Avoid header pollution caused by inline use of 'vtophys()' in vmx_cpufunc.h + */ +#ifdef _VMX_CPUFUNC_H_ +static __inline uint64_t +vmcs_read(uint32_t encoding) +{ + int error; + uint64_t val; + + error = vmread(encoding, &val); + KASSERT(error == 0, ("vmcs_read(%u) error %d", encoding, error)); + return (val); +} + +static __inline void +vmcs_write(uint32_t encoding, uint64_t val) +{ + int error; + + error = vmwrite(encoding, val); + KASSERT(error == 0, ("vmcs_write(%u) error %d", encoding, error)); +} +#endif /* _VMX_CPUFUNC_H_ */ + +#define vmexit_instruction_length() vmcs_read(VMCS_EXIT_INSTRUCTION_LENGTH) +#define vmcs_guest_rip() vmcs_read(VMCS_GUEST_RIP) +#define vmcs_instruction_error() vmcs_read(VMCS_INSTRUCTION_ERROR) +#define vmcs_exit_reason() (vmcs_read(VMCS_EXIT_REASON) & 0xffff) +#define vmcs_exit_qualification() vmcs_read(VMCS_EXIT_QUALIFICATION) +#define vmcs_guest_cr3() vmcs_read(VMCS_GUEST_CR3) +#define vmcs_gpa() vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS) +#define 
vmcs_gla() vmcs_read(VMCS_GUEST_LINEAR_ADDRESS) +#define vmcs_idt_vectoring_info() vmcs_read(VMCS_IDT_VECTORING_INFO) +#define vmcs_idt_vectoring_err() vmcs_read(VMCS_IDT_VECTORING_ERROR) + +#endif /* _KERNEL */ + +#define VMCS_INITIAL 0xffffffffffffffff + +#define VMCS_IDENT(encoding) ((encoding) | 0x80000000) +/* + * VMCS field encodings from Appendix H, Intel Architecture Manual Vol3B. + */ +#define VMCS_INVALID_ENCODING 0xffffffff + +/* 16-bit control fields */ +#define VMCS_VPID 0x00000000 +#define VMCS_PIR_VECTOR 0x00000002 + +/* 16-bit guest-state fields */ +#define VMCS_GUEST_ES_SELECTOR 0x00000800 +#define VMCS_GUEST_CS_SELECTOR 0x00000802 +#define VMCS_GUEST_SS_SELECTOR 0x00000804 +#define VMCS_GUEST_DS_SELECTOR 0x00000806 +#define VMCS_GUEST_FS_SELECTOR 0x00000808 +#define VMCS_GUEST_GS_SELECTOR 0x0000080A +#define VMCS_GUEST_LDTR_SELECTOR 0x0000080C +#define VMCS_GUEST_TR_SELECTOR 0x0000080E +#define VMCS_GUEST_INTR_STATUS 0x00000810 + +/* 16-bit host-state fields */ +#define VMCS_HOST_ES_SELECTOR 0x00000C00 +#define VMCS_HOST_CS_SELECTOR 0x00000C02 +#define VMCS_HOST_SS_SELECTOR 0x00000C04 +#define VMCS_HOST_DS_SELECTOR 0x00000C06 +#define VMCS_HOST_FS_SELECTOR 0x00000C08 +#define VMCS_HOST_GS_SELECTOR 0x00000C0A +#define VMCS_HOST_TR_SELECTOR 0x00000C0C + +/* 64-bit control fields */ +#define VMCS_IO_BITMAP_A 0x00002000 +#define VMCS_IO_BITMAP_B 0x00002002 +#define VMCS_MSR_BITMAP 0x00002004 +#define VMCS_EXIT_MSR_STORE 0x00002006 +#define VMCS_EXIT_MSR_LOAD 0x00002008 +#define VMCS_ENTRY_MSR_LOAD 0x0000200A +#define VMCS_EXECUTIVE_VMCS 0x0000200C +#define VMCS_TSC_OFFSET 0x00002010 +#define VMCS_VIRTUAL_APIC 0x00002012 +#define VMCS_APIC_ACCESS 0x00002014 +#define VMCS_PIR_DESC 0x00002016 +#define VMCS_EPTP 0x0000201A +#define VMCS_EOI_EXIT0 0x0000201C +#define VMCS_EOI_EXIT1 0x0000201E +#define VMCS_EOI_EXIT2 0x00002020 +#define VMCS_EOI_EXIT3 0x00002022 +#define VMCS_EOI_EXIT(vector) (VMCS_EOI_EXIT0 + ((vector) / 64) * 2) + +/* 64-bit read-only fields */ +#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 + +/* 64-bit guest-state fields */ +#define VMCS_LINK_POINTER 0x00002800 +#define VMCS_GUEST_IA32_DEBUGCTL 0x00002802 +#define VMCS_GUEST_IA32_PAT 0x00002804 +#define VMCS_GUEST_IA32_EFER 0x00002806 +#define VMCS_GUEST_IA32_PERF_GLOBAL_CTRL 0x00002808 +#define VMCS_GUEST_PDPTE0 0x0000280A +#define VMCS_GUEST_PDPTE1 0x0000280C +#define VMCS_GUEST_PDPTE2 0x0000280E +#define VMCS_GUEST_PDPTE3 0x00002810 + +/* 64-bit host-state fields */ +#define VMCS_HOST_IA32_PAT 0x00002C00 +#define VMCS_HOST_IA32_EFER 0x00002C02 +#define VMCS_HOST_IA32_PERF_GLOBAL_CTRL 0x00002C04 + +/* 32-bit control fields */ +#define VMCS_PIN_BASED_CTLS 0x00004000 +#define VMCS_PRI_PROC_BASED_CTLS 0x00004002 +#define VMCS_EXCEPTION_BITMAP 0x00004004 +#define VMCS_PF_ERROR_MASK 0x00004006 +#define VMCS_PF_ERROR_MATCH 0x00004008 +#define VMCS_CR3_TARGET_COUNT 0x0000400A +#define VMCS_EXIT_CTLS 0x0000400C +#define VMCS_EXIT_MSR_STORE_COUNT 0x0000400E +#define VMCS_EXIT_MSR_LOAD_COUNT 0x00004010 +#define VMCS_ENTRY_CTLS 0x00004012 +#define VMCS_ENTRY_MSR_LOAD_COUNT 0x00004014 +#define VMCS_ENTRY_INTR_INFO 0x00004016 +#define VMCS_ENTRY_EXCEPTION_ERROR 0x00004018 +#define VMCS_ENTRY_INST_LENGTH 0x0000401A +#define VMCS_TPR_THRESHOLD 0x0000401C +#define VMCS_SEC_PROC_BASED_CTLS 0x0000401E +#define VMCS_PLE_GAP 0x00004020 +#define VMCS_PLE_WINDOW 0x00004022 + +/* 32-bit read-only data fields */ +#define VMCS_INSTRUCTION_ERROR 0x00004400 +#define VMCS_EXIT_REASON 0x00004402 +#define VMCS_EXIT_INTR_INFO 0x00004404 
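+/*
+ * Note on the interruption-information fields adjacent to this point: per
+ * the Intel SDM, bits 7:0 hold the vector, bits 10:8 the type, bit 11
+ * flags a valid error code in the matching ERRCODE field, and bit 31
+ * marks the field itself as valid (see the VMCS_INTR_* definitions
+ * further down). A pending #GP, for example, would be encoded as
+ * (IDT_GP | VMCS_INTR_T_HWEXCEPTION | VMCS_INTR_DEL_ERRCODE |
+ * VMCS_INTR_VALID).
+ */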
+#define VMCS_EXIT_INTR_ERRCODE 0x00004406 +#define VMCS_IDT_VECTORING_INFO 0x00004408 +#define VMCS_IDT_VECTORING_ERROR 0x0000440A +#define VMCS_EXIT_INSTRUCTION_LENGTH 0x0000440C +#define VMCS_EXIT_INSTRUCTION_INFO 0x0000440E + +/* 32-bit guest-state fields */ +#define VMCS_GUEST_ES_LIMIT 0x00004800 +#define VMCS_GUEST_CS_LIMIT 0x00004802 +#define VMCS_GUEST_SS_LIMIT 0x00004804 +#define VMCS_GUEST_DS_LIMIT 0x00004806 +#define VMCS_GUEST_FS_LIMIT 0x00004808 +#define VMCS_GUEST_GS_LIMIT 0x0000480A +#define VMCS_GUEST_LDTR_LIMIT 0x0000480C +#define VMCS_GUEST_TR_LIMIT 0x0000480E +#define VMCS_GUEST_GDTR_LIMIT 0x00004810 +#define VMCS_GUEST_IDTR_LIMIT 0x00004812 +#define VMCS_GUEST_ES_ACCESS_RIGHTS 0x00004814 +#define VMCS_GUEST_CS_ACCESS_RIGHTS 0x00004816 +#define VMCS_GUEST_SS_ACCESS_RIGHTS 0x00004818 +#define VMCS_GUEST_DS_ACCESS_RIGHTS 0x0000481A +#define VMCS_GUEST_FS_ACCESS_RIGHTS 0x0000481C +#define VMCS_GUEST_GS_ACCESS_RIGHTS 0x0000481E +#define VMCS_GUEST_LDTR_ACCESS_RIGHTS 0x00004820 +#define VMCS_GUEST_TR_ACCESS_RIGHTS 0x00004822 +#define VMCS_GUEST_INTERRUPTIBILITY 0x00004824 +#define VMCS_GUEST_ACTIVITY 0x00004826 +#define VMCS_GUEST_SMBASE 0x00004828 +#define VMCS_GUEST_IA32_SYSENTER_CS 0x0000482A +#define VMCS_PREEMPTION_TIMER_VALUE 0x0000482E + +/* 32-bit host state fields */ +#define VMCS_HOST_IA32_SYSENTER_CS 0x00004C00 + +/* Natural Width control fields */ +#define VMCS_CR0_MASK 0x00006000 +#define VMCS_CR4_MASK 0x00006002 +#define VMCS_CR0_SHADOW 0x00006004 +#define VMCS_CR4_SHADOW 0x00006006 +#define VMCS_CR3_TARGET0 0x00006008 +#define VMCS_CR3_TARGET1 0x0000600A +#define VMCS_CR3_TARGET2 0x0000600C +#define VMCS_CR3_TARGET3 0x0000600E + +/* Natural Width read-only fields */ +#define VMCS_EXIT_QUALIFICATION 0x00006400 +#define VMCS_IO_RCX 0x00006402 +#define VMCS_IO_RSI 0x00006404 +#define VMCS_IO_RDI 0x00006406 +#define VMCS_IO_RIP 0x00006408 +#define VMCS_GUEST_LINEAR_ADDRESS 0x0000640A + +/* Natural Width guest-state fields */ +#define VMCS_GUEST_CR0 0x00006800 +#define VMCS_GUEST_CR3 0x00006802 +#define VMCS_GUEST_CR4 0x00006804 +#define VMCS_GUEST_ES_BASE 0x00006806 +#define VMCS_GUEST_CS_BASE 0x00006808 +#define VMCS_GUEST_SS_BASE 0x0000680A +#define VMCS_GUEST_DS_BASE 0x0000680C +#define VMCS_GUEST_FS_BASE 0x0000680E +#define VMCS_GUEST_GS_BASE 0x00006810 +#define VMCS_GUEST_LDTR_BASE 0x00006812 +#define VMCS_GUEST_TR_BASE 0x00006814 +#define VMCS_GUEST_GDTR_BASE 0x00006816 +#define VMCS_GUEST_IDTR_BASE 0x00006818 +#define VMCS_GUEST_DR7 0x0000681A +#define VMCS_GUEST_RSP 0x0000681C +#define VMCS_GUEST_RIP 0x0000681E +#define VMCS_GUEST_RFLAGS 0x00006820 +#define VMCS_GUEST_PENDING_DBG_EXCEPTIONS 0x00006822 +#define VMCS_GUEST_IA32_SYSENTER_ESP 0x00006824 +#define VMCS_GUEST_IA32_SYSENTER_EIP 0x00006826 + +/* Natural Width host-state fields */ +#define VMCS_HOST_CR0 0x00006C00 +#define VMCS_HOST_CR3 0x00006C02 +#define VMCS_HOST_CR4 0x00006C04 +#define VMCS_HOST_FS_BASE 0x00006C06 +#define VMCS_HOST_GS_BASE 0x00006C08 +#define VMCS_HOST_TR_BASE 0x00006C0A +#define VMCS_HOST_GDTR_BASE 0x00006C0C +#define VMCS_HOST_IDTR_BASE 0x00006C0E +#define VMCS_HOST_IA32_SYSENTER_ESP 0x00006C10 +#define VMCS_HOST_IA32_SYSENTER_EIP 0x00006C12 +#define VMCS_HOST_RSP 0x00006C14 +#define VMCS_HOST_RIP 0x00006c16 + +/* + * VM instruction error numbers + */ +#define VMRESUME_WITH_NON_LAUNCHED_VMCS 5 + +/* + * VMCS exit reasons + */ +#define EXIT_REASON_EXCEPTION 0 +#define EXIT_REASON_EXT_INTR 1 +#define EXIT_REASON_TRIPLE_FAULT 2 +#define EXIT_REASON_INIT 3 +#define 
EXIT_REASON_SIPI 4 +#define EXIT_REASON_IO_SMI 5 +#define EXIT_REASON_SMI 6 +#define EXIT_REASON_INTR_WINDOW 7 +#define EXIT_REASON_NMI_WINDOW 8 +#define EXIT_REASON_TASK_SWITCH 9 +#define EXIT_REASON_CPUID 10 +#define EXIT_REASON_GETSEC 11 +#define EXIT_REASON_HLT 12 +#define EXIT_REASON_INVD 13 +#define EXIT_REASON_INVLPG 14 +#define EXIT_REASON_RDPMC 15 +#define EXIT_REASON_RDTSC 16 +#define EXIT_REASON_RSM 17 +#define EXIT_REASON_VMCALL 18 +#define EXIT_REASON_VMCLEAR 19 +#define EXIT_REASON_VMLAUNCH 20 +#define EXIT_REASON_VMPTRLD 21 +#define EXIT_REASON_VMPTRST 22 +#define EXIT_REASON_VMREAD 23 +#define EXIT_REASON_VMRESUME 24 +#define EXIT_REASON_VMWRITE 25 +#define EXIT_REASON_VMXOFF 26 +#define EXIT_REASON_VMXON 27 +#define EXIT_REASON_CR_ACCESS 28 +#define EXIT_REASON_DR_ACCESS 29 +#define EXIT_REASON_INOUT 30 +#define EXIT_REASON_RDMSR 31 +#define EXIT_REASON_WRMSR 32 +#define EXIT_REASON_INVAL_VMCS 33 +#define EXIT_REASON_INVAL_MSR 34 +#define EXIT_REASON_MWAIT 36 +#define EXIT_REASON_MTF 37 +#define EXIT_REASON_MONITOR 39 +#define EXIT_REASON_PAUSE 40 +#define EXIT_REASON_MCE_DURING_ENTRY 41 +#define EXIT_REASON_TPR 43 +#define EXIT_REASON_APIC_ACCESS 44 +#define EXIT_REASON_VIRTUALIZED_EOI 45 +#define EXIT_REASON_GDTR_IDTR 46 +#define EXIT_REASON_LDTR_TR 47 +#define EXIT_REASON_EPT_FAULT 48 +#define EXIT_REASON_EPT_MISCONFIG 49 +#define EXIT_REASON_INVEPT 50 +#define EXIT_REASON_RDTSCP 51 +#define EXIT_REASON_VMX_PREEMPT 52 +#define EXIT_REASON_INVVPID 53 +#define EXIT_REASON_WBINVD 54 +#define EXIT_REASON_XSETBV 55 +#define EXIT_REASON_APIC_WRITE 56 + +/* + * NMI unblocking due to IRET. + * + * Applies to VM-exits due to hardware exception or EPT fault. + */ +#define EXIT_QUAL_NMIUDTI (1 << 12) +/* + * VMCS interrupt information fields + */ +#define VMCS_INTR_VALID (1U << 31) +#define VMCS_INTR_T_MASK 0x700 /* Interruption-info type */ +#define VMCS_INTR_T_HWINTR (0 << 8) +#define VMCS_INTR_T_NMI (2 << 8) +#define VMCS_INTR_T_HWEXCEPTION (3 << 8) +#define VMCS_INTR_T_SWINTR (4 << 8) +#define VMCS_INTR_T_PRIV_SWEXCEPTION (5 << 8) +#define VMCS_INTR_T_SWEXCEPTION (6 << 8) +#define VMCS_INTR_DEL_ERRCODE (1 << 11) + +/* + * VMCS IDT-Vectoring information fields + */ +#define VMCS_IDT_VEC_VALID (1U << 31) +#define VMCS_IDT_VEC_ERRCODE_VALID (1 << 11) + +/* + * VMCS Guest interruptibility field + */ +#define VMCS_INTERRUPTIBILITY_STI_BLOCKING (1 << 0) +#define VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING (1 << 1) +#define VMCS_INTERRUPTIBILITY_SMI_BLOCKING (1 << 2) +#define VMCS_INTERRUPTIBILITY_NMI_BLOCKING (1 << 3) + +/* + * Exit qualification for EXIT_REASON_INVAL_VMCS + */ +#define EXIT_QUAL_NMI_WHILE_STI_BLOCKING 3 + +/* + * Exit qualification for EPT violation + */ +#define EPT_VIOLATION_DATA_READ (1UL << 0) +#define EPT_VIOLATION_DATA_WRITE (1UL << 1) +#define EPT_VIOLATION_INST_FETCH (1UL << 2) +#define EPT_VIOLATION_GPA_READABLE (1UL << 3) +#define EPT_VIOLATION_GPA_WRITEABLE (1UL << 4) +#define EPT_VIOLATION_GPA_EXECUTABLE (1UL << 5) +#define EPT_VIOLATION_GLA_VALID (1UL << 7) +#define EPT_VIOLATION_XLAT_VALID (1UL << 8) + +/* + * Exit qualification for APIC-access VM exit + */ +#define APIC_ACCESS_OFFSET(qual) ((qual) & 0xFFF) +#define APIC_ACCESS_TYPE(qual) (((qual) >> 12) & 0xF) + +/* + * Exit qualification for APIC-write VM exit + */ +#define APIC_WRITE_OFFSET(qual) ((qual) & 0xFFF) + +#endif diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx.c b/usr/src/uts/i86pc/io/vmm/intel/vmx.c new file mode 100644 index 0000000000..7ddf4e2a46 --- /dev/null +++ 
b/usr/src/uts/i86pc/io/vmm/intel/vmx.c @@ -0,0 +1,2842 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/intel/vmx.c 284174 2015-06-09 00:14:47Z tychon $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. 
+ */ + +#include +__FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/vmx.c 284174 2015-06-09 00:14:47Z tychon $"); + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include "vmm_lapic.h" +#include "vmm_host.h" +#include "vmm_ioport.h" +#include "vmm_ktr.h" +#include "vmm_stat.h" +#include "vatpic.h" +#include "vlapic.h" +#include "vlapic_priv.h" + +#include "ept.h" +#include "vmx_cpufunc.h" +#include "vmx.h" +#include "vmx_msr.h" +#include "x86.h" +#include "vmx_controls.h" + +#define PINBASED_CTLS_ONE_SETTING \ + (PINBASED_EXTINT_EXITING | \ + PINBASED_NMI_EXITING | \ + PINBASED_VIRTUAL_NMI) +#define PINBASED_CTLS_ZERO_SETTING 0 + +#define PROCBASED_CTLS_WINDOW_SETTING \ + (PROCBASED_INT_WINDOW_EXITING | \ + PROCBASED_NMI_WINDOW_EXITING) + +#define PROCBASED_CTLS_ONE_SETTING \ + (PROCBASED_SECONDARY_CONTROLS | \ + PROCBASED_IO_EXITING | \ + PROCBASED_MSR_BITMAPS | \ + PROCBASED_CTLS_WINDOW_SETTING | \ + PROCBASED_CR8_LOAD_EXITING | \ + PROCBASED_CR8_STORE_EXITING) +#define PROCBASED_CTLS_ZERO_SETTING \ + (PROCBASED_CR3_LOAD_EXITING | \ + PROCBASED_CR3_STORE_EXITING | \ + PROCBASED_IO_BITMAPS) + +#define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT +#define PROCBASED_CTLS2_ZERO_SETTING 0 + +#define VM_EXIT_CTLS_ONE_SETTING \ + (VM_EXIT_HOST_LMA | \ + VM_EXIT_SAVE_EFER | \ + VM_EXIT_LOAD_EFER | \ + VM_EXIT_LOAD_PAT | \ + VM_EXIT_SAVE_PAT | \ + VM_EXIT_LOAD_PAT) + +#define VM_EXIT_CTLS_ZERO_SETTING VM_EXIT_SAVE_DEBUG_CONTROLS + +#define VM_ENTRY_CTLS_ONE_SETTING (VM_ENTRY_LOAD_EFER | VM_ENTRY_LOAD_PAT) + +#define VM_ENTRY_CTLS_ZERO_SETTING \ + (VM_ENTRY_LOAD_DEBUG_CONTROLS | \ + VM_ENTRY_INTO_SMM | \ + VM_ENTRY_DEACTIVATE_DUAL_MONITOR) + +#define HANDLED 1 +#define UNHANDLED 0 + +static MALLOC_DEFINE(M_VMX, "vmx", "vmx"); +static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic"); + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL); + +int vmxon_enabled[MAXCPU]; +static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); +#ifndef __FreeBSD__ +static vm_paddr_t vmxon_region_pa[MAXCPU]; +#endif + +static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2; +static uint32_t exit_ctls, entry_ctls; + +static uint64_t cr0_ones_mask, cr0_zeros_mask; +SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD, + &cr0_ones_mask, 0, NULL); +SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD, + &cr0_zeros_mask, 0, NULL); + +static uint64_t cr4_ones_mask, cr4_zeros_mask; +SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD, + &cr4_ones_mask, 0, NULL); +SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD, + &cr4_zeros_mask, 0, NULL); + +static int vmx_initialized; +SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD, + &vmx_initialized, 0, "Intel VMX initialized"); + +/* + * Optional capabilities + */ +static int cap_halt_exit; +static int cap_pause_exit; +static int cap_unrestricted_guest; +static int cap_monitor_trap; +static int cap_invpcid; + +static int virtual_interrupt_delivery; +SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD, + &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support"); + +static int posted_interrupts; +SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupts, CTLFLAG_RD, + &posted_interrupts, 0, "APICv posted interrupt support"); + +static int pirvec; +SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, 
posted_interrupt_vector, CTLFLAG_RD, + &pirvec, 0, "APICv posted interrupt vector"); + +static struct unrhdr *vpid_unr; +static u_int vpid_alloc_failed; +SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD, + &vpid_alloc_failed, 0, NULL); + +/* + * Use the last page below 4GB as the APIC access address. This address is + * occupied by the boot firmware so it is guaranteed that it will not conflict + * with a page in system memory. + */ +#define APIC_ACCESS_ADDRESS 0xFFFFF000 + +static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc); +static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval); +static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val); +static void vmx_inject_pir(struct vlapic *vlapic); + +#ifdef KTR +static const char * +exit_reason_to_str(int reason) +{ + static char reasonbuf[32]; + + switch (reason) { + case EXIT_REASON_EXCEPTION: + return "exception"; + case EXIT_REASON_EXT_INTR: + return "extint"; + case EXIT_REASON_TRIPLE_FAULT: + return "triplefault"; + case EXIT_REASON_INIT: + return "init"; + case EXIT_REASON_SIPI: + return "sipi"; + case EXIT_REASON_IO_SMI: + return "iosmi"; + case EXIT_REASON_SMI: + return "smi"; + case EXIT_REASON_INTR_WINDOW: + return "intrwindow"; + case EXIT_REASON_NMI_WINDOW: + return "nmiwindow"; + case EXIT_REASON_TASK_SWITCH: + return "taskswitch"; + case EXIT_REASON_CPUID: + return "cpuid"; + case EXIT_REASON_GETSEC: + return "getsec"; + case EXIT_REASON_HLT: + return "hlt"; + case EXIT_REASON_INVD: + return "invd"; + case EXIT_REASON_INVLPG: + return "invlpg"; + case EXIT_REASON_RDPMC: + return "rdpmc"; + case EXIT_REASON_RDTSC: + return "rdtsc"; + case EXIT_REASON_RSM: + return "rsm"; + case EXIT_REASON_VMCALL: + return "vmcall"; + case EXIT_REASON_VMCLEAR: + return "vmclear"; + case EXIT_REASON_VMLAUNCH: + return "vmlaunch"; + case EXIT_REASON_VMPTRLD: + return "vmptrld"; + case EXIT_REASON_VMPTRST: + return "vmptrst"; + case EXIT_REASON_VMREAD: + return "vmread"; + case EXIT_REASON_VMRESUME: + return "vmresume"; + case EXIT_REASON_VMWRITE: + return "vmwrite"; + case EXIT_REASON_VMXOFF: + return "vmxoff"; + case EXIT_REASON_VMXON: + return "vmxon"; + case EXIT_REASON_CR_ACCESS: + return "craccess"; + case EXIT_REASON_DR_ACCESS: + return "draccess"; + case EXIT_REASON_INOUT: + return "inout"; + case EXIT_REASON_RDMSR: + return "rdmsr"; + case EXIT_REASON_WRMSR: + return "wrmsr"; + case EXIT_REASON_INVAL_VMCS: + return "invalvmcs"; + case EXIT_REASON_INVAL_MSR: + return "invalmsr"; + case EXIT_REASON_MWAIT: + return "mwait"; + case EXIT_REASON_MTF: + return "mtf"; + case EXIT_REASON_MONITOR: + return "monitor"; + case EXIT_REASON_PAUSE: + return "pause"; + case EXIT_REASON_MCE: + return "mce"; + case EXIT_REASON_TPR: + return "tpr"; + case EXIT_REASON_APIC_ACCESS: + return "apic-access"; + case EXIT_REASON_GDTR_IDTR: + return "gdtridtr"; + case EXIT_REASON_LDTR_TR: + return "ldtrtr"; + case EXIT_REASON_EPT_FAULT: + return "eptfault"; + case EXIT_REASON_EPT_MISCONFIG: + return "eptmisconfig"; + case EXIT_REASON_INVEPT: + return "invept"; + case EXIT_REASON_RDTSCP: + return "rdtscp"; + case EXIT_REASON_VMX_PREEMPT: + return "vmxpreempt"; + case EXIT_REASON_INVVPID: + return "invvpid"; + case EXIT_REASON_WBINVD: + return "wbinvd"; + case EXIT_REASON_XSETBV: + return "xsetbv"; + case EXIT_REASON_APIC_WRITE: + return "apic-write"; + default: + snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason); + return (reasonbuf); + } +} + +#ifdef SETJMP_TRACE +static const char * 
+vmx_setjmp_rc2str(int rc) +{ + switch (rc) { + case VMX_RETURN_DIRECT: + return "direct"; + case VMX_RETURN_LONGJMP: + return "longjmp"; + case VMX_RETURN_VMRESUME: + return "vmresume"; + case VMX_RETURN_VMLAUNCH: + return "vmlaunch"; + case VMX_RETURN_AST: + return "ast"; + default: + return "unknown"; + } +} + +#define SETJMP_TRACE(vmx, vcpu, vmxctx, regname) \ + VMM_CTR1((vmx)->vm, (vcpu), "setjmp trace " #regname " 0x%016lx", \ + (vmxctx)->regname) + +static void +vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc) +{ + uint64_t host_rip, host_rsp; + + if (vmxctx != &vmx->ctx[vcpu]) + panic("vmx_setjmp_trace: invalid vmxctx %p; should be %p", + vmxctx, &vmx->ctx[vcpu]); + + VMM_CTR1((vmx)->vm, (vcpu), "vmxctx = %p", vmxctx); + VMM_CTR2((vmx)->vm, (vcpu), "setjmp return code %s(%d)", + vmx_setjmp_rc2str(rc), rc); + + host_rsp = host_rip = ~0; + vmread(VMCS_HOST_RIP, &host_rip); + vmread(VMCS_HOST_RSP, &host_rsp); + VMM_CTR2((vmx)->vm, (vcpu), "vmcs host_rip 0x%016lx, host_rsp 0x%016lx", + host_rip, host_rsp); + + SETJMP_TRACE(vmx, vcpu, vmxctx, host_r15); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_r14); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_r13); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_r12); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbp); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_rsp); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbx); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_rip); + + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdi); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rsi); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdx); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rcx); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r8); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r9); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rax); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbx); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbp); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r10); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r11); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r12); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r13); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r14); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r15); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_cr2); +} +#endif +#else +static void __inline +vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc) +{ + return; +} +#endif /* KTR */ + +static int +vmx_allow_x2apic_msrs(struct vmx *vmx) +{ + int i, error; + + error = 0; + + /* + * Allow readonly access to the following x2APIC MSRs from the guest. + */ + error += guest_msr_ro(vmx, MSR_APIC_ID); + error += guest_msr_ro(vmx, MSR_APIC_VERSION); + error += guest_msr_ro(vmx, MSR_APIC_LDR); + error += guest_msr_ro(vmx, MSR_APIC_SVR); + + for (i = 0; i < 8; i++) + error += guest_msr_ro(vmx, MSR_APIC_ISR0 + i); + + for (i = 0; i < 8; i++) + error += guest_msr_ro(vmx, MSR_APIC_TMR0 + i); + + for (i = 0; i < 8; i++) + error += guest_msr_ro(vmx, MSR_APIC_IRR0 + i); + + error += guest_msr_ro(vmx, MSR_APIC_ESR); + error += guest_msr_ro(vmx, MSR_APIC_LVT_TIMER); + error += guest_msr_ro(vmx, MSR_APIC_LVT_THERMAL); + error += guest_msr_ro(vmx, MSR_APIC_LVT_PCINT); + error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT0); + error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT1); + error += guest_msr_ro(vmx, MSR_APIC_LVT_ERROR); + error += guest_msr_ro(vmx, MSR_APIC_ICR_TIMER); + error += guest_msr_ro(vmx, MSR_APIC_DCR_TIMER); + error += guest_msr_ro(vmx, MSR_APIC_ICR); + + /* + * Allow TPR, EOI and SELF_IPI MSRs to be read and written by the guest. 
+ *
+ * These registers get special treatment described in the section
+ * "Virtualizing MSR-Based APIC Accesses".
+ */
+ error += guest_msr_rw(vmx, MSR_APIC_TPR);
+ error += guest_msr_rw(vmx, MSR_APIC_EOI);
+ error += guest_msr_rw(vmx, MSR_APIC_SELF_IPI);
+
+ return (error);
+}
+
+u_long
+vmx_fix_cr0(u_long cr0)
+{
+
+ return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
+}
+
+u_long
+vmx_fix_cr4(u_long cr4)
+{
+
+ return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
+}
+
+static void
+vpid_free(int vpid)
+{
+ if (vpid < 0 || vpid > 0xffff)
+ panic("vpid_free: invalid vpid %d", vpid);
+
+ /*
+ * VPIDs [0,VM_MAXCPU] are special and are not allocated from
+ * the unit number allocator.
+ */
+
+ if (vpid > VM_MAXCPU)
+ free_unr(vpid_unr, vpid);
+}
+
+static void
+vpid_alloc(uint16_t *vpid, int num)
+{
+ int i, x;
+
+ if (num <= 0 || num > VM_MAXCPU)
+ panic("invalid number of vpids requested: %d", num);
+
+ /*
+ * If the "enable vpid" execution control is not enabled then the
+ * VPID is required to be 0 for all vcpus.
+ */
+ if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) {
+ for (i = 0; i < num; i++)
+ vpid[i] = 0;
+ return;
+ }
+
+ /*
+ * Allocate a unique VPID for each vcpu from the unit number allocator.
+ */
+ for (i = 0; i < num; i++) {
+ x = alloc_unr(vpid_unr);
+ if (x == -1)
+ break;
+ else
+ vpid[i] = x;
+ }
+
+ if (i < num) {
+ atomic_add_int(&vpid_alloc_failed, 1);
+
+ /*
+ * If the unit number allocator does not have enough unique
+ * VPIDs then we need to allocate from the [1,VM_MAXCPU] range.
+ *
+ * These VPIDs are not guaranteed to be unique across VMs but
+ * this does not affect correctness because the combined
+ * mappings are also tagged with the EP4TA which is unique for
+ * each VM.
+ *
+ * It is still sub-optimal because the invvpid will invalidate
+ * combined mappings for a particular VPID across all EP4TAs.
+ */
+ while (i-- > 0)
+ vpid_free(vpid[i]);
+
+ for (i = 0; i < num; i++)
+ vpid[i] = i + 1;
+ }
+}
+
+static void
+vpid_init(void)
+{
+ /*
+ * VPID 0 is required when the "enable VPID" execution control is
+ * disabled.
+ *
+ * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the
+ * unit number allocator does not have sufficient unique VPIDs to
+ * satisfy the allocation.
+ *
+ * The remaining VPIDs are managed by the unit number allocator.
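+ *
+ * For illustration, with VM_MAXCPU == 16 the split is: VPID 0
+ * (VPID disabled), VPIDs 1-16 (per-vcpu overflow values, possibly
+ * shared across VMs), and VPIDs 17-65535 (globally unique, handed
+ * out by new_unrhdr() below).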
+ */ + vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL); +} + +#ifndef __FreeBSD__ +static void +msr_save_area_init(struct msr_entry *g_area, int *g_count) +{ + int cnt; + + static struct msr_entry guest_msrs[] = { + { MSR_KGSBASE, 0, 0 }, + { MSR_LSTAR, 0, 0 }, + { MSR_CSTAR, 0, 0 }, + { MSR_STAR, 0, 0 }, + { MSR_SF_MASK, 0, 0 }, + }; + + cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]); + if (cnt > GUEST_MSR_MAX_ENTRIES) + panic("guest msr save area overrun"); + bcopy(guest_msrs, g_area, sizeof(guest_msrs)); + *g_count = cnt; +} + +static void +host_msr_save_area_init(struct msr_entry *h_area, int *h_count) +{ + int i, cnt; + + static struct msr_entry host_msrs[] = { + { MSR_LSTAR, 0, 0 }, + { MSR_CSTAR, 0, 0 }, + { MSR_STAR, 0, 0 }, + { MSR_SF_MASK, 0, 0 }, + }; + + cnt = sizeof(host_msrs) / sizeof(host_msrs[0]); + if (cnt > HOST_MSR_MAX_ENTRIES) + panic("host msr save area overrun"); + for (i = 0; i < cnt; i++) { + host_msrs[i].val = rdmsr(host_msrs[i].index); + } + bcopy(host_msrs, h_area, sizeof(host_msrs)); + *h_count = cnt; +} +#endif + +static void +vmx_disable(void *arg __unused) +{ + struct invvpid_desc invvpid_desc = { 0 }; + struct invept_desc invept_desc = { 0 }; + + if (vmxon_enabled[curcpu]) { + /* + * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b. + * + * VMXON or VMXOFF are not required to invalidate any TLB + * caching structures. This prevents potential retention of + * cached information in the TLB between distinct VMX episodes. + */ + invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc); + invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc); + vmxoff(); + } + load_cr4(rcr4() & ~CR4_VMXE); +} + +static int +vmx_cleanup(void) +{ + +#ifdef __FreeBSD__ + if (pirvec != 0) + vmm_ipi_free(pirvec); +#endif + + if (vpid_unr != NULL) { + delete_unrhdr(vpid_unr); + vpid_unr = NULL; + } + + smp_rendezvous(NULL, vmx_disable, NULL, NULL); + + return (0); +} + +static void +vmx_enable(void *arg __unused) +{ + int error; + uint64_t feature_control; + + feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); + if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 || + (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) { + wrmsr(MSR_IA32_FEATURE_CONTROL, + feature_control | IA32_FEATURE_CONTROL_VMX_EN | + IA32_FEATURE_CONTROL_LOCK); + } + + load_cr4(rcr4() | CR4_VMXE); + + *(uint32_t *)vmxon_region[curcpu] = vmx_revision(); +#ifdef __FreeBSD__ + error = vmxon(vmxon_region[curcpu]); +#else + error = vmxon_pa(vmxon_region_pa[curcpu]); + ASSERT(error == 0); +#endif + if (error == 0) + vmxon_enabled[curcpu] = 1; +} + +static int +vmx_init(void) +{ +#define X86FSET_VMX 35 + extern uchar_t x86_featureset[]; + extern boolean_t is_x86_feature(void *featureset, uint_t feature); + int error; + uint64_t fixed0, fixed1, feature_control; + uint32_t tmp; +#ifndef __FreeBSD__ + int i; +#endif + + /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */ +#ifdef __FreeBSD__ + if (!(cpu_feature2 & CPUID2_VMX)) { + printf("vmx_init: processor does not support VMX operation\n"); + return (ENXIO); + } +#else + if (!is_x86_feature(x86_featureset, X86FSET_VMX)) { + cmn_err(CE_WARN, "vmx_init: processor does not support VMX operation\n"); + } +#endif + + /* + * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits + * are set (bits 0 and 2 respectively). 
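+ *
+ * If the MSR is locked with the VMX enable bit clear then the BIOS
+ * has disabled VMX and there is nothing to be done. If it is
+ * unlocked, vmx_enable() above will set the enable bit and lock the
+ * MSR itself.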
+ */ + feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); + if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 1 && + (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) { + printf("vmx_init: VMX operation disabled by BIOS\n"); + return (ENXIO); + } + + /* Check support for primary processor-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, + PROCBASED_CTLS_ONE_SETTING, + PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls); + if (error) { + printf("vmx_init: processor does not support desired primary " + "processor-based controls\n"); + return (error); + } + + /* Clear the processor-based ctl bits that are set on demand */ + procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING; + + /* Check support for secondary processor-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, + MSR_VMX_PROCBASED_CTLS2, + PROCBASED_CTLS2_ONE_SETTING, + PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2); + if (error) { + printf("vmx_init: processor does not support desired secondary " + "processor-based controls\n"); + return (error); + } + + /* Check support for VPID */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, + PROCBASED2_ENABLE_VPID, 0, &tmp); + if (error == 0) + procbased_ctls2 |= PROCBASED2_ENABLE_VPID; + + /* Check support for pin-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, + MSR_VMX_TRUE_PINBASED_CTLS, + PINBASED_CTLS_ONE_SETTING, + PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls); + if (error) { + printf("vmx_init: processor does not support desired " + "pin-based controls\n"); + return (error); + } + + /* Check support for VM-exit controls */ + error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS, + VM_EXIT_CTLS_ONE_SETTING, + VM_EXIT_CTLS_ZERO_SETTING, + &exit_ctls); + if (error) { + printf("vmx_init: processor does not support desired " + "exit controls\n"); + return (error); + } + + /* Check support for VM-entry controls */ + error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS, + VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING, + &entry_ctls); + if (error) { + printf("vmx_init: processor does not support desired " + "entry controls\n"); + return (error); + } + + /* + * Check support for optional features by testing them + * as individual bits + */ + cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, + PROCBASED_HLT_EXITING, 0, + &tmp) == 0); + + cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_PROCBASED_CTLS, + PROCBASED_MTF, 0, + &tmp) == 0); + + cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, + PROCBASED_PAUSE_EXITING, 0, + &tmp) == 0); + + cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, + MSR_VMX_PROCBASED_CTLS2, + PROCBASED2_UNRESTRICTED_GUEST, 0, + &tmp) == 0); + + /* Initialize EPT */ + error = ept_init(); + if (error) { + printf("vmx_init: ept initialization failed (%d)\n", error); + return (error); + } + + /* + * Stash the cr0 and cr4 bits that must be fixed to 0 or 1 + */ + fixed0 = rdmsr(MSR_VMX_CR0_FIXED0); + fixed1 = rdmsr(MSR_VMX_CR0_FIXED1); + cr0_ones_mask = fixed0 & fixed1; + cr0_zeros_mask = ~fixed0 & ~fixed1; + + /* + * CR0_PE and CR0_PG can be set to zero in VMX non-root operation + * if unrestricted guest execution is allowed. + */ + if (cap_unrestricted_guest) + cr0_ones_mask &= ~(CR0_PG | CR0_PE); + + /* + * Do not allow the guest to set CR0_NW or CR0_CD. 
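+ *
+ * Letting the guest disable caching would affect the whole machine,
+ * so both bits are forced to zero by the mask below rather than
+ * exposed to the guest.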
+ */
+ cr0_zeros_mask |= (CR0_NW | CR0_CD);
+
+ fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
+ fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
+ cr4_ones_mask = fixed0 & fixed1;
+ cr4_zeros_mask = ~fixed0 & ~fixed1;
+
+#ifndef __FreeBSD__
+ for (i = 0; i < MAXCPU; i++) {
+ vmxon_region_pa[i] = vtophys(&vmxon_region[i]);
+ }
+#endif
+
+ vpid_init();
+
+ vmx_msr_init();
+
+ /* enable VMX operation */
+ smp_rendezvous(NULL, vmx_enable, NULL, NULL);
+
+ vmx_initialized = 1;
+
+ return (0);
+}
+
+static int
+vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial)
+{
+ int error, mask_ident, shadow_ident;
+ uint64_t mask_value;
+
+ if (which != 0 && which != 4)
+ panic("vmx_setup_cr_shadow: unknown cr%d", which);
+
+ if (which == 0) {
+ mask_ident = VMCS_CR0_MASK;
+ mask_value = cr0_ones_mask | cr0_zeros_mask;
+ shadow_ident = VMCS_CR0_SHADOW;
+ } else {
+ mask_ident = VMCS_CR4_MASK;
+ mask_value = cr4_ones_mask | cr4_zeros_mask;
+ shadow_ident = VMCS_CR4_SHADOW;
+ }
+
+ error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value);
+ if (error)
+ return (error);
+
+ error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial);
+ if (error)
+ return (error);
+
+ return (0);
+}
+#define vmx_setup_cr0_shadow(vmcs,init) vmx_setup_cr_shadow(0, (vmcs), (init))
+#define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init))
+
+static void *
+vmx_vminit(struct vm *vm)
+{
+ uint16_t vpid[VM_MAXCPU];
+ int i, error, guest_msr_count;
+#ifndef __FreeBSD__
+ int host_msr_count;
+#endif
+ struct vmx *vmx;
+ struct vmcs *vmcs;
+
+ vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
+ if ((uintptr_t)vmx & PAGE_MASK) {
+ panic("malloc of struct vmx not aligned on %d byte boundary",
+ PAGE_SIZE);
+ }
+ vmx->vm = vm;
+
+ /*
+ * Clean up EPTP-tagged guest physical and combined mappings
+ *
+ * VMX transitions are not required to invalidate any guest physical
+ * mappings. So, it may be possible for stale guest physical mappings
+ * to be present in the processor TLBs.
+ *
+ * Combined mappings for this EP4TA are also invalidated for all VPIDs.
+ */
+ ept_invalidate_mappings(vtophys(vmx->pml4ept));
+
+ msr_bitmap_initialize(vmx->msr_bitmap);
+
+ /*
+ * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
+ * The guest FSBASE and GSBASE are saved and restored during
+ * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
+ * always restored from the vmcs host state area on vm-exit.
+ *
+ * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in
+ * how they are saved/restored so can be directly accessed by the
+ * guest.
+ *
+ * MSR_EFER is saved and restored in the guest VMCS area on a
+ * VM exit and entry respectively. It is also restored from the
+ * host VMCS area on a VM exit.
+ *
+ * MSR_PAT is saved and restored in the guest VMCS area on a VM exit
+ * and entry respectively. It is also restored from the host VMCS
+ * area on a VM exit.
+ *
+ * The TSC MSR is exposed read-only. Writes are disallowed as
+ * that will impact the host TSC. If the guest does a write,
+ * the "use TSC offsetting" execution control is enabled and the
+ * difference between the host TSC and the guest TSC is written
+ * into the TSC offset in the VMCS.
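+ *
+ * In effect a guest write is converted into an offset: RDTSC in the
+ * guest then returns the host TSC plus the value written to
+ * VMCS_TSC_OFFSET (the vmwrite is done by vmx_set_tsc_offset()
+ * below).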
+ */ + if (guest_msr_rw(vmx, MSR_GSBASE) || + guest_msr_rw(vmx, MSR_FSBASE) || + guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) || + guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) || + guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) || + guest_msr_rw(vmx, MSR_EFER) || + guest_msr_rw(vmx, MSR_PAT) || + guest_msr_ro(vmx, MSR_TSC)) + panic("vmx_vminit: error setting guest msr access"); + + vpid_alloc(vpid, VM_MAXCPU); + + for (i = 0; i < VM_MAXCPU; i++) { + vmcs = &vmx->vmcs[i]; + vmcs->identifier = vmx_revision(); + error = vmclear(vmcs); + if (error != 0) { + panic("vmx_vminit: vmclear error %d on vcpu %d\n", + error, i); + } + + vmx_msr_guest_init(vmx, i); + + error = vmcs_set_defaults(vmcs, + (u_long)vmx_longjmp, + (u_long)&vmx->ctx[i], + vtophys(vmx->pml4ept), + pinbased_ctls, + procbased_ctls, + procbased_ctls2, + exit_ctls, entry_ctls, + vtophys(vmx->msr_bitmap), + vpid[i]); + + if (error != 0) + panic("vmx_vminit: vmcs_set_defaults error %d", error); + + vmx->cap[i].set = 0; + vmx->cap[i].proc_ctls = procbased_ctls; + + vmx->state[i].lastcpu = -1; + vmx->state[i].vpid = vpid[i]; + +#ifndef __FreeBSD__ + msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count); + + error = vmcs_set_msr_save(vmcs, vtophys(vmx->guest_msrs[i]), + guest_msr_count); + if (error != 0) + panic("vmcs_set_msr_save error %d", error); + + host_msr_save_area_init(vmx->host_msrs[i], &host_msr_count); + + error = vmcs_set_host_msr_save(&vmx->vmcs[i], + vtophys(vmx->host_msrs[i]), + host_msr_count); + if (error != 0) + panic("vmcs_set_msr_save error %d", error); +#endif + + /* + * Set up the CR0/4 shadows, and init the read shadow + * to the power-on register value from the Intel Sys Arch. + * CR0 - 0x60000010 + * CR4 - 0 + */ + error = vmx_setup_cr0_shadow(vmcs, 0x60000010); + if (error != 0) + panic("vmx_setup_cr0_shadow %d", error); + + error = vmx_setup_cr4_shadow(vmcs, 0); + if (error != 0) + panic("vmx_setup_cr4_shadow %d", error); + } + + return (vmx); +} + +static int +vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx) +{ + int handled, func; + + func = vmxctx->guest_rax; + + handled = x86_emulate_cpuid(vm, vcpu, + (uint32_t*)(&vmxctx->guest_rax), + (uint32_t*)(&vmxctx->guest_rbx), + (uint32_t*)(&vmxctx->guest_rcx), + (uint32_t*)(&vmxctx->guest_rdx)); + return (handled); +} + +static __inline void +vmx_run_trace(struct vmx *vmx, int vcpu) +{ +#ifdef KTR + VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip()); +#endif +} + +static __inline void +vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason, + int handled) +{ +#ifdef KTR + VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx", + handled ? 
"handled" : "unhandled", + exit_reason_to_str(exit_reason), rip); +#endif +} + +static __inline void +vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip) +{ +#ifdef KTR + VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip); +#endif +} + +static void +vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu) +{ + struct vmxstate *vmxstate; + struct invvpid_desc invvpid_desc = { 0 }; +#ifndef __FreeBSD__ + desctbr_t idtr, gdtr; +#endif + + vmxstate = &vmx->state[vcpu]; + vmcs_write(VMCS_HOST_FS_BASE, vmm_get_host_fsbase()); + if (vmxstate->lastcpu == curcpu) + return; + + vmxstate->lastcpu = curcpu; + + vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); + + vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); + vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); + vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); + +#ifndef __FreeBSD__ + vmcs_write(VMCS_HOST_IA32_SYSENTER_CS, rdmsr(MSR_SYSENTER_CS_MSR)); + vmcs_write(VMCS_HOST_IA32_SYSENTER_ESP, rdmsr(MSR_SYSENTER_ESP_MSR)); + vmcs_write(VMCS_HOST_IA32_SYSENTER_EIP, rdmsr(MSR_SYSENTER_EIP_MSR)); +#endif + + /* + * If we are using VPIDs then invalidate all mappings tagged with 'vpid' + * + * We do this because this vcpu was executing on a different host + * cpu when it last ran. We do not track whether it invalidated + * mappings associated with its 'vpid' during that run. So we must + * assume that the mappings associated with 'vpid' on 'curcpu' are + * stale and invalidate them. + * + * Note that we incur this penalty only when the scheduler chooses to + * move the thread associated with this vcpu between host cpus. + * + * Note also that this will invalidate mappings tagged with 'vpid' + * for "all" EP4TAs. + */ + if (vmxstate->vpid != 0) { + invvpid_desc.vpid = vmxstate->vpid; + invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); + } +} + +static void +vm_exit_update_rip(struct vm_exit *vmexit) +{ + int error; + + error = vmwrite(VMCS_GUEST_RIP, vmexit->rip + vmexit->inst_length); + if (error) + panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error); +} + +/* + * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. 
+ */ +CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0); + +static void __inline +vmx_set_int_window_exiting(struct vmx *vmx, int vcpu) +{ + + if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) { + vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING; + vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting"); + } +} + +static void __inline +vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu) +{ + +#ifdef __FreeBSD__ + KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0, + ("intr_window_exiting not set: %#x", vmx->cap[vcpu].proc_ctls)); +#else + KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0, + ("intr_window_exiting not set: %x", vmx->cap[vcpu].proc_ctls)); +#endif + vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING; + vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting"); +} + +static void __inline +vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu) +{ + + if ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) == 0) { + vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING; + vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting"); + } +} + +static void __inline +vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu) +{ + +#ifdef __FreeBSD__ + KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0, + ("nmi_window_exiting not set %#x", vmx->cap[vcpu].proc_ctls)); +#else + KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0, + ("nmi_window_exiting not set %x", vmx->cap[vcpu].proc_ctls)); +#endif + vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING; + vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting"); +} + +int +vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset) +{ + int error; + + if ((vmx->cap[vcpu].proc_ctls & PROCBASED_TSC_OFFSET) == 0) { + vmx->cap[vcpu].proc_ctls |= PROCBASED_TSC_OFFSET; + vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + VCPU_CTR0(vmx->vm, vcpu, "Enabling TSC offsetting"); + } + + error = vmwrite(VMCS_TSC_OFFSET, offset); + + return (error); +} + +#define NMI_BLOCKING (VMCS_INTERRUPTIBILITY_NMI_BLOCKING | \ + VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) +#define HWINTR_BLOCKING (VMCS_INTERRUPTIBILITY_STI_BLOCKING | \ + VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) + +static void +vmx_inject_nmi(struct vmx *vmx, int vcpu) +{ + uint32_t gi, info; + + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); +#ifdef __FreeBSD__ + KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest " + "interruptibility-state %#x", gi)); +#else + KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest " + "interruptibility-state %x", gi)); +#endif + + info = vmcs_read(VMCS_ENTRY_INTR_INFO); +#ifdef __FreeBSD__ + KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid " + "VM-entry interruption information %#x", info)); +#else + KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid " + "VM-entry interruption information %x", info)); +#endif + + /* + * Inject the virtual NMI. The vector must be the NMI IDT entry + * or the VMCS entry check will fail. 
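+ *
+ * That is vector 2 (IDT_NMI) with interruption type NMI and the
+ * valid bit set, assembled just below.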
+ */ + info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID; + vmcs_write(VMCS_ENTRY_INTR_INFO, info); + + VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI"); + + /* Clear the request */ + vm_nmi_clear(vmx->vm, vcpu); +} + +static void +vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic) +{ + int vector, need_nmi_exiting, extint_pending; + uint64_t rflags, entryinfo; + uint32_t gi, info; + + if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) { +#ifdef __FreeBSD__ + KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry " + "intinfo is not valid: %#lx", __func__, entryinfo)); +#else + KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry " + "intinfo is not valid: %lx", __func__, entryinfo)); +#endif + + info = vmcs_read(VMCS_ENTRY_INTR_INFO); +#ifdef __FreeBSD__ + KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject " + "pending exception: %#lx/%#x", __func__, entryinfo, info)); +#else + KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject " + "pending exception: %lx/%x", __func__, entryinfo, info)); +#endif + + info = entryinfo; + vector = info & 0xff; + if (vector == IDT_BP || vector == IDT_OF) { + /* + * VT-x requires #BP and #OF to be injected as software + * exceptions. + */ + info &= ~VMCS_INTR_T_MASK; + info |= VMCS_INTR_T_SWEXCEPTION; + } + + if (info & VMCS_INTR_DEL_ERRCODE) + vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32); + + vmcs_write(VMCS_ENTRY_INTR_INFO, info); + } + + if (vm_nmi_pending(vmx->vm, vcpu)) { + /* + * If there are no conditions blocking NMI injection then + * inject it directly here otherwise enable "NMI window + * exiting" to inject it as soon as we can. + * + * We also check for STI_BLOCKING because some implementations + * don't allow NMI injection in this case. If we are running + * on a processor that doesn't have this restriction it will + * immediately exit and the NMI will be injected in the + * "NMI window exiting" handler. + */ + need_nmi_exiting = 1; + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) { + info = vmcs_read(VMCS_ENTRY_INTR_INFO); + if ((info & VMCS_INTR_VALID) == 0) { + vmx_inject_nmi(vmx, vcpu); + need_nmi_exiting = 0; + } else { + VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI " + "due to VM-entry intr info %#x", info); + } + } else { + VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to " + "Guest Interruptibility-state %#x", gi); + } + + if (need_nmi_exiting) + vmx_set_nmi_window_exiting(vmx, vcpu); + } + + extint_pending = vm_extint_pending(vmx->vm, vcpu); + +#ifdef __FreeBSD__ + if (!extint_pending && virtual_interrupt_delivery) { + vmx_inject_pir(vlapic); + return; + } +#endif + + /* + * If interrupt-window exiting is already in effect then don't bother + * checking for pending interrupts. This is just an optimization and + * not needed for correctness. + */ + if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) { + VCPU_CTR0(vmx->vm, vcpu, "Skip interrupt injection due to " + "pending int_window_exiting"); + return; + } + + if (!extint_pending) { + /* Ask the local apic for a vector to inject */ + if (!vlapic_pending_intr(vlapic, &vector)) + return; + + /* + * From the Intel SDM, Volume 3, Section "Maskable + * Hardware Interrupts": + * - maskable interrupt vectors [16,255] can be delivered + * through the local APIC. 
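+ * (The local APIC treats vectors 0-15 as illegal, hence the
+ * KASSERT below insisting on vector >= 16 for APIC-sourced
+ * interrupts.)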
+ */ + KASSERT(vector >= 16 && vector <= 255, + ("invalid vector %d from local APIC", vector)); + } else { + /* Ask the legacy pic for a vector to inject */ + vatpic_pending_intr(vmx->vm, &vector); + + /* + * From the Intel SDM, Volume 3, Section "Maskable + * Hardware Interrupts": + * - maskable interrupt vectors [0,255] can be delivered + * through the INTR pin. + */ + KASSERT(vector >= 0 && vector <= 255, + ("invalid vector %d from INTR", vector)); + } + + /* Check RFLAGS.IF and the interruptibility state of the guest */ + rflags = vmcs_read(VMCS_GUEST_RFLAGS); + if ((rflags & PSL_I) == 0) { + VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " + "rflags %#lx", vector, rflags); + goto cantinject; + } + + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + if (gi & HWINTR_BLOCKING) { + VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " + "Guest Interruptibility-state %#x", vector, gi); + goto cantinject; + } + + info = vmcs_read(VMCS_ENTRY_INTR_INFO); + if (info & VMCS_INTR_VALID) { + /* + * This is expected and could happen for multiple reasons: + * - A vectoring VM-entry was aborted due to astpending + * - A VM-exit happened during event injection. + * - An exception was injected above. + * - An NMI was injected above or after "NMI window exiting" + */ + VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " + "VM-entry intr info %#x", vector, info); + goto cantinject; + } + + /* Inject the interrupt */ + info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID; + info |= vector; + vmcs_write(VMCS_ENTRY_INTR_INFO, info); + + if (!extint_pending) { + /* Update the Local APIC ISR */ + vlapic_intr_accepted(vlapic, vector); + } else { + vm_extint_clear(vmx->vm, vcpu); + vatpic_intr_accepted(vmx->vm, vector); + + /* + * After we accepted the current ExtINT the PIC may + * have posted another one. If that is the case, set + * the Interrupt Window Exiting execution control so + * we can inject that one too. + * + * Also, interrupt window exiting allows us to inject any + * pending APIC vector that was preempted by the ExtINT + * as soon as possible. This applies both for the software + * emulated vlapic and the hardware assisted virtual APIC. + */ + vmx_set_int_window_exiting(vmx, vcpu); + } + + VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector); + + return; + +cantinject: + /* + * Set the Interrupt Window Exiting execution control so we can inject + * the interrupt as soon as blocking condition goes away. + */ + vmx_set_int_window_exiting(vmx, vcpu); +} + +/* + * If the Virtual NMIs execution control is '1' then the logical processor + * tracks virtual-NMI blocking in the Guest Interruptibility-state field of + * the VMCS. An IRET instruction in VMX non-root operation will remove any + * virtual-NMI blocking. + * + * This unblocking occurs even if the IRET causes a fault. In this case the + * hypervisor needs to restore virtual-NMI blocking before resuming the guest. 
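+ * vmx_restore_nmi_blocking() and vmx_clear_nmi_blocking() below do
+ * that bookkeeping by toggling the NMI-blocking bit in the guest
+ * interruptibility-state field.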
+ */ +static void +vmx_restore_nmi_blocking(struct vmx *vmx, int vcpuid) +{ + uint32_t gi; + + VCPU_CTR0(vmx->vm, vcpuid, "Restore Virtual-NMI blocking"); + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING; + vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); +} + +static void +vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid) +{ + uint32_t gi; + + VCPU_CTR0(vmx->vm, vcpuid, "Clear Virtual-NMI blocking"); + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING; + vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); +} + +static uint64_t +vmx_get_guest_reg(struct vmx *vmx, int vcpu, int ident) +{ + const struct vmxctx *vmxctx; + + vmxctx = &vmx->ctx[vcpu]; + + switch (ident) { + case 0: + return (vmxctx->guest_rax); + case 1: + return (vmxctx->guest_rcx); + case 2: + return (vmxctx->guest_rdx); + case 3: + return (vmxctx->guest_rbx); + case 4: + return (vmcs_read(VMCS_GUEST_RSP)); + case 5: + return (vmxctx->guest_rbp); + case 6: + return (vmxctx->guest_rsi); + case 7: + return (vmxctx->guest_rdi); + case 8: + return (vmxctx->guest_r8); + case 9: + return (vmxctx->guest_r9); + case 10: + return (vmxctx->guest_r10); + case 11: + return (vmxctx->guest_r11); + case 12: + return (vmxctx->guest_r12); + case 13: + return (vmxctx->guest_r13); + case 14: + return (vmxctx->guest_r14); + case 15: + return (vmxctx->guest_r15); + default: + panic("invalid vmx register %d", ident); + } +} + +static void +vmx_set_guest_reg(struct vmx *vmx, int vcpu, int ident, uint64_t regval) +{ + struct vmxctx *vmxctx; + + vmxctx = &vmx->ctx[vcpu]; + + switch (ident) { + case 0: + vmxctx->guest_rax = regval; + break; + case 1: + vmxctx->guest_rcx = regval; + break; + case 2: + vmxctx->guest_rdx = regval; + break; + case 3: + vmxctx->guest_rbx = regval; + break; + case 4: + vmcs_write(VMCS_GUEST_RSP, regval); + break; + case 5: + vmxctx->guest_rbp = regval; + break; + case 6: + vmxctx->guest_rsi = regval; + break; + case 7: + vmxctx->guest_rdi = regval; + break; + case 8: + vmxctx->guest_r8 = regval; + break; + case 9: + vmxctx->guest_r9 = regval; + break; + case 10: + vmxctx->guest_r10 = regval; + break; + case 11: + vmxctx->guest_r11 = regval; + break; + case 12: + vmxctx->guest_r12 = regval; + break; + case 13: + vmxctx->guest_r13 = regval; + break; + case 14: + vmxctx->guest_r14 = regval; + break; + case 15: + vmxctx->guest_r15 = regval; + break; + default: + panic("invalid vmx register %d", ident); + } +} + +static int +vmx_emulate_cr0_access(struct vmx *vmx, int vcpu, uint64_t exitqual) +{ + uint64_t crval, regval; + + /* We only handle mov to %cr0 at this time */ + if ((exitqual & 0xf0) != 0x00) + return (UNHANDLED); + + regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf); + + vmcs_write(VMCS_CR0_SHADOW, regval); + + crval = regval | cr0_ones_mask; + crval &= ~cr0_zeros_mask; + vmcs_write(VMCS_GUEST_CR0, crval); + + if (regval & CR0_PG) { + uint64_t efer, entry_ctls; + + /* + * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and + * the "IA-32e mode guest" bit in VM-entry control must be + * equal. 
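+ * Illustrative sequence: a guest that has already set EFER.LME and
+ * now writes CR0 with PG=1 is switching into long mode, so both
+ * EFER.LMA and VM_ENTRY_GUEST_LMA must be set here or the next VM
+ * entry will fail its consistency checks.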
+ */ + efer = vmcs_read(VMCS_GUEST_IA32_EFER); + if (efer & EFER_LME) { + efer |= EFER_LMA; + vmcs_write(VMCS_GUEST_IA32_EFER, efer); + entry_ctls = vmcs_read(VMCS_ENTRY_CTLS); + entry_ctls |= VM_ENTRY_GUEST_LMA; + vmcs_write(VMCS_ENTRY_CTLS, entry_ctls); + } + } + + return (HANDLED); +} + +static int +vmx_emulate_cr4_access(struct vmx *vmx, int vcpu, uint64_t exitqual) +{ + uint64_t crval, regval; + + /* We only handle mov to %cr4 at this time */ + if ((exitqual & 0xf0) != 0x00) + return (UNHANDLED); + + regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf); + + vmcs_write(VMCS_CR4_SHADOW, regval); + + crval = regval | cr4_ones_mask; + crval &= ~cr4_zeros_mask; + vmcs_write(VMCS_GUEST_CR4, crval); + + return (HANDLED); +} + +static int +vmx_emulate_cr8_access(struct vmx *vmx, int vcpu, uint64_t exitqual) +{ + struct vlapic *vlapic; + uint64_t cr8; + int regnum; + + /* We only handle mov %cr8 to/from a register at this time. */ + if ((exitqual & 0xe0) != 0x00) { + return (UNHANDLED); + } + + vlapic = vm_lapic(vmx->vm, vcpu); + regnum = (exitqual >> 8) & 0xf; + if (exitqual & 0x10) { + cr8 = vlapic_get_cr8(vlapic); + vmx_set_guest_reg(vmx, vcpu, regnum, cr8); + } else { + cr8 = vmx_get_guest_reg(vmx, vcpu, regnum); + vlapic_set_cr8(vlapic, cr8); + } + + return (HANDLED); +} + +/* + * From section "Guest Register State" in the Intel SDM: CPL = SS.DPL + */ +static int +vmx_cpl(void) +{ + uint32_t ssar; + + ssar = vmcs_read(VMCS_GUEST_SS_ACCESS_RIGHTS); + return ((ssar >> 5) & 0x3); +} + +static enum vm_cpu_mode +vmx_cpu_mode(void) +{ + uint32_t csar; + + if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) { + csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); + if (csar & 0x2000) + return (CPU_MODE_64BIT); /* CS.L = 1 */ + else + return (CPU_MODE_COMPATIBILITY); + } else if (vmcs_read(VMCS_GUEST_CR0) & CR0_PE) { + return (CPU_MODE_PROTECTED); + } else { + return (CPU_MODE_REAL); + } +} + +static enum vm_paging_mode +vmx_paging_mode(void) +{ + + if (!(vmcs_read(VMCS_GUEST_CR0) & CR0_PG)) + return (PAGING_MODE_FLAT); + if (!(vmcs_read(VMCS_GUEST_CR4) & CR4_PAE)) + return (PAGING_MODE_32); + if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME) + return (PAGING_MODE_64); + else + return (PAGING_MODE_PAE); +} + +static uint64_t +inout_str_index(struct vmx *vmx, int vcpuid, int in) +{ + uint64_t val; + int error; + enum vm_reg_name reg; + + reg = in ? 
VM_REG_GUEST_RDI : VM_REG_GUEST_RSI; + error = vmx_getreg(vmx, vcpuid, reg, &val); + KASSERT(error == 0, ("%s: vmx_getreg error %d", __func__, error)); + return (val); +} + +static uint64_t +inout_str_count(struct vmx *vmx, int vcpuid, int rep) +{ + uint64_t val; + int error; + + if (rep) { + error = vmx_getreg(vmx, vcpuid, VM_REG_GUEST_RCX, &val); + KASSERT(!error, ("%s: vmx_getreg error %d", __func__, error)); + } else { + val = 1; + } + return (val); +} + +static int +inout_str_addrsize(uint32_t inst_info) +{ + uint32_t size; + + size = (inst_info >> 7) & 0x7; + switch (size) { + case 0: + return (2); /* 16 bit */ + case 1: + return (4); /* 32 bit */ + case 2: + return (8); /* 64 bit */ + default: + panic("%s: invalid size encoding %d", __func__, size); + } +} + +static void +inout_str_seginfo(struct vmx *vmx, int vcpuid, uint32_t inst_info, int in, + struct vm_inout_str *vis) +{ + int error, s; + + if (in) { + vis->seg_name = VM_REG_GUEST_ES; + } else { + s = (inst_info >> 15) & 0x7; + vis->seg_name = vm_segment_name(s); + } + + error = vmx_getdesc(vmx, vcpuid, vis->seg_name, &vis->seg_desc); + KASSERT(error == 0, ("%s: vmx_getdesc error %d", __func__, error)); +} + +static void +vmx_paging_info(struct vm_guest_paging *paging) +{ + paging->cr3 = vmcs_guest_cr3(); + paging->cpl = vmx_cpl(); + paging->cpu_mode = vmx_cpu_mode(); + paging->paging_mode = vmx_paging_mode(); +} + +static void +vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla) +{ + struct vm_guest_paging *paging; + uint32_t csar; + + paging = &vmexit->u.inst_emul.paging; + + vmexit->exitcode = VM_EXITCODE_INST_EMUL; + vmexit->u.inst_emul.gpa = gpa; + vmexit->u.inst_emul.gla = gla; + vmx_paging_info(paging); + switch (paging->cpu_mode) { + case CPU_MODE_REAL: + vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); + vmexit->u.inst_emul.cs_d = 0; + break; + case CPU_MODE_PROTECTED: + case CPU_MODE_COMPATIBILITY: + vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); + csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); + vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(csar); + break; + default: + vmexit->u.inst_emul.cs_base = 0; + vmexit->u.inst_emul.cs_d = 0; + break; + } + vie_init(&vmexit->u.inst_emul.vie, NULL, 0); +} + +static int +ept_fault_type(uint64_t ept_qual) +{ + int fault_type; + + if (ept_qual & EPT_VIOLATION_DATA_WRITE) + fault_type = VM_PROT_WRITE; + else if (ept_qual & EPT_VIOLATION_INST_FETCH) + fault_type = VM_PROT_EXECUTE; + else + fault_type= VM_PROT_READ; + + return (fault_type); +} + +static boolean_t +ept_emulation_fault(uint64_t ept_qual) +{ + int read, write; + + /* EPT fault on an instruction fetch doesn't make sense here */ + if (ept_qual & EPT_VIOLATION_INST_FETCH) + return (FALSE); + + /* EPT fault must be a read fault or a write fault */ + read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0; + write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0; + if ((read | write) == 0) + return (FALSE); + + /* + * The EPT violation must have been caused by accessing a + * guest-physical address that is a translation of a guest-linear + * address. 
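+ * Put differently, a fault taken while the hardware walks the guest
+ * page tables themselves (XLAT_VALID clear) is not a candidate for
+ * instruction emulation, so the caller falls through without setting
+ * up an INST_EMUL exit.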
+ */ + if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 || + (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) { + return (FALSE); + } + + return (TRUE); +} + +static int +emulate_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu) +{ + int error; + + if (lapic_msr(num)) + error = lapic_wrmsr(vmx->vm, vcpuid, num, val, retu); + else + error = vmx_wrmsr(vmx, vcpuid, num, val, retu); + + return (error); +} + +static int +emulate_rdmsr(struct vmx *vmx, int vcpuid, u_int num, bool *retu) +{ + struct vmxctx *vmxctx; + uint64_t result; + uint32_t eax, edx; + int error; + + if (lapic_msr(num)) + error = lapic_rdmsr(vmx->vm, vcpuid, num, &result, retu); + else + error = vmx_rdmsr(vmx, vcpuid, num, &result, retu); + + if (error == 0) { + eax = result; + vmxctx = &vmx->ctx[vcpuid]; + error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RAX, eax); + KASSERT(error == 0, ("vmxctx_setreg(rax) error %d", error)); + + edx = result >> 32; + error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RDX, edx); + KASSERT(error == 0, ("vmxctx_setreg(rdx) error %d", error)); + } + + return (error); +} + +static int +vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) +{ + int error, handled, in; + struct vmcs *vmcs; + struct vmxctx *vmxctx; + struct vm_inout_str *vis; + uint32_t eax, ecx, edx, idtvec_info, intr_info, inst_info; + uint64_t qual, gla, gpa, cr3; + bool retu; + + CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0); + CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0); + + handled = UNHANDLED; + vmcs = &vmx->vmcs[vcpu]; + vmxctx = &vmx->ctx[vcpu]; + qual = vmexit->u.vmx.exit_qualification; + vmexit->exitcode = VM_EXITCODE_BOGUS; + + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1); + + switch (vmexit->u.vmx.exit_reason) { + case EXIT_REASON_CR_ACCESS: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1); + switch (qual & 0xf) { + case 0: + handled = vmx_emulate_cr0_access(vmx, vcpu, qual); + break; + case 4: + handled = vmx_emulate_cr4_access(vmx, vcpu, qual); + break; + case 8: + handled = vmx_emulate_cr8_access(vmx, vcpu, qual); + break; + } + break; + case EXIT_REASON_RDMSR: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1); + retu = false; + ecx = vmxctx->guest_rcx; + VCPU_CTR1(vmx->vm, vcpu, "rdmsr 0x%08x", ecx); + error = emulate_rdmsr(vmx, vcpu, ecx, &retu); + if (error) { + vmexit->exitcode = VM_EXITCODE_RDMSR; + vmexit->u.msr.code = ecx; + } else if (!retu) { + handled = HANDLED; + } else { + /* Return to userspace with a valid exitcode */ + KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, + ("emulate_rdmsr retu with bogus exitcode")); + } + break; + case EXIT_REASON_WRMSR: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1); + retu = false; + eax = vmxctx->guest_rax; + ecx = vmxctx->guest_rcx; + edx = vmxctx->guest_rdx; + VCPU_CTR2(vmx->vm, vcpu, "wrmsr 0x%08x value 0x%016lx", + ecx, (uint64_t)edx << 32 | eax); + error = emulate_wrmsr(vmx, vcpu, ecx, + (uint64_t)edx << 32 | eax, &retu); + if (error) { + vmexit->exitcode = VM_EXITCODE_WRMSR; + vmexit->u.msr.code = ecx; + vmexit->u.msr.wval = (uint64_t)edx << 32 | eax; + } else if (!retu) { + handled = HANDLED; + } else { + /* Return to userspace with a valid exitcode */ + KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, + ("emulate_wrmsr retu with bogus exitcode")); + } + break; + case EXIT_REASON_HLT: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1); + vmexit->exitcode = VM_EXITCODE_HLT; + vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS); + break; + case EXIT_REASON_MTF: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1); + 
vmexit->exitcode = VM_EXITCODE_MTRAP; + break; + case EXIT_REASON_PAUSE: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1); + vmexit->exitcode = VM_EXITCODE_PAUSE; + break; + case EXIT_REASON_INTR_WINDOW: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1); + vmx_clear_int_window_exiting(vmx, vcpu); + return (1); + case EXIT_REASON_EXT_INTR: + /* + * External interrupts serve only to cause VM exits and allow + * the host interrupt handler to run. + * + * If this external interrupt triggers a virtual interrupt + * to a VM, then that state will be recorded by the + * host interrupt handler in the VM's softc. We will inject + * this virtual interrupt during the subsequent VM enter. + */ + intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); + + /* + * XXX: Ignore this exit if VMCS_INTR_VALID is not set. + * This appears to be a bug in VMware Fusion? + */ + if (!(intr_info & VMCS_INTR_VALID)) + return (1); +#ifdef __FreeBSD__ + KASSERT((intr_info & VMCS_INTR_VALID) != 0 && + (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR, + ("VM exit interruption info invalid: %#x", intr_info)); +#else + KASSERT((intr_info & VMCS_INTR_VALID) != 0 && + (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR, + ("VM exit interruption info invalid: %x", intr_info)); +#endif +#if 0 /* XXX */ + vmx_trigger_hostintr(intr_info & 0xff); +#endif + + /* + * This is special. We want to treat this as an 'handled' + * VM-exit but not increment the instruction pointer. + */ + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1); + return (1); + case EXIT_REASON_NMI_WINDOW: + /* Exit to allow the pending virtual NMI to be injected */ + if (vm_nmi_pending(vmx->vm, vcpu)) + vmx_inject_nmi(vmx, vcpu); + vmx_clear_nmi_window_exiting(vmx, vcpu); + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1); + return (1); + case EXIT_REASON_INOUT: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1); + vmexit->exitcode = VM_EXITCODE_INOUT; + vmexit->u.inout.bytes = (qual & 0x7) + 1; + vmexit->u.inout.in = in = (qual & 0x8) ? 1 : 0; + vmexit->u.inout.string = (qual & 0x10) ? 1 : 0; + vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0; + vmexit->u.inout.port = (uint16_t)(qual >> 16); + vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax); + if (vmexit->u.inout.string) { + inst_info = vmcs_read(VMCS_EXIT_INSTRUCTION_INFO); + vmexit->exitcode = VM_EXITCODE_INOUT_STR; + vis = &vmexit->u.inout_str; + vmx_paging_info(&vis->paging); + vis->rflags = vmcs_read(VMCS_GUEST_RFLAGS); + vis->cr0 = vmcs_read(VMCS_GUEST_CR0); + vis->index = inout_str_index(vmx, vcpu, in); + vis->count = inout_str_count(vmx, vcpu, vis->inout.rep); + vis->addrsize = inout_str_addrsize(inst_info); + inout_str_seginfo(vmx, vcpu, inst_info, in, vis); + } + break; + case EXIT_REASON_CPUID: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1); + handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx); + break; + case EXIT_REASON_EXCEPTION: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1); + intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); +#ifdef __FreeBSD__ + KASSERT((intr_info & VMCS_INTR_VALID) != 0, + ("VM exit interruption info invalid: %#x", intr_info)); +#else + KASSERT((intr_info & VMCS_INTR_VALID) != 0, + ("VM exit interruption info invalid: %x", intr_info)); +#endif + + /* + * If Virtual NMIs control is 1 and the VM-exit is due to a + * fault encountered during the execution of IRET then we must + * restore the state of "virtual-NMI blocking" before resuming + * the guest. + * + * See "Resuming Guest Software after Handling an Exception". 
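+ * The "NMI unblocking due to IRET" indication tested below is the
+ * EXIT_QUAL_NMIUDTI bit (bit 12 of the exit interruption
+ * information); it is only meaningful when the exit did not occur
+ * during event delivery, hence the VMCS_IDT_VEC_VALID check.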
+ */ + if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && + (intr_info & 0xff) != IDT_DF && + (intr_info & EXIT_QUAL_NMIUDTI) != 0) + vmx_restore_nmi_blocking(vmx, vcpu); + + /* + * The NMI has already been handled in vmx_exit_handle_nmi(). + */ + if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) + return (1); + break; + case EXIT_REASON_EPT_FAULT: + gpa = vmcs_gpa(); + if (ept_emulation_fault(qual)) { + vmexit_inst_emul(vmexit, gpa, vmcs_gla()); + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1); + } + break; + default: + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1); + break; + } + + if (handled) { + /* + * It is possible that control is returned to userland + * even though we were able to handle the VM exit in the + * kernel. + * + * In such a case we want to make sure that the userland + * restarts guest execution at the instruction *after* + * the one we just processed. Therefore we update the + * guest rip in the VMCS and in 'vmexit'. + */ + vm_exit_update_rip(vmexit); + vmexit->rip += vmexit->inst_length; + vmexit->inst_length = 0; + + /* + * Special case for spinning up an AP - exit to userspace to + * give the controlling process a chance to intercept and + * spin up a thread for the AP. + */ + if (vmexit->exitcode == VM_EXITCODE_SPINUP_AP) + handled = 0; + } else { + if (vmexit->exitcode == VM_EXITCODE_BOGUS) { + /* + * If this VM exit was not claimed by anybody then + * treat it as a generic VMX exit. + */ + vmexit->exitcode = VM_EXITCODE_VMX; + vmexit->u.vmx.status = VM_SUCCESS; + vmexit->u.vmx.inst_type = 0; + vmexit->u.vmx.inst_error = 0; + } else { + /* + * The exitcode and collateral have been populated. + * The VM exit will be processed further in userland. + */ + } + } + return (handled); +} + +static int +vmx_run(void *arg, int vcpu, register_t rip) +{ + int error, vie, rc, handled, astpending; + uint32_t exit_reason; + struct vmx *vmx; + struct vm *vm; + struct vmxctx *vmxctx; + struct vmcs *vmcs; + struct vm_exit *vmexit; + struct vlapic *vlapic; + + vmx = arg; + vm = vmx->vm; + vmcs = &vmx->vmcs[vcpu]; + vmxctx = &vmx->ctx[vcpu]; + vlapic = vm_lapic(vm, vcpu); + vmxctx->launched = 0; + + astpending = 0; + vmexit = vm_exitinfo(vmx->vm, vcpu); + + vmx_msr_guest_enter(vmx, vcpu); + + VMPTRLD(vmcs); + + /* + * XXX + * We do this every time because we may setup the virtual machine + * from a different process than the one that actually runs it. + * + * If the life of a virtual machine was spent entirely in the context + * of a single process we could do this once in vmcs_set_defaults(). 
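+ * rcr3() returns the page-table root of the current address space,
+ * so reloading VMCS_HOST_CR3 on every entry keeps the host state
+ * correct no matter which process is driving this vcpu.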
+ */ + vmcs_write(VMCS_HOST_CR3, rcr3()); + + vmcs_write(VMCS_GUEST_RIP, rip); + vmx_set_pcpu_defaults(vmx, vcpu); + do { + vmx_inject_interrupts(vmx, vcpu, vlapic); + vmx_run_trace(vmx, vcpu); + rc = vmx_setjmp(vmxctx); +#ifdef SETJMP_TRACE + vmx_setjmp_trace(vmx, vcpu, vmxctx, rc); +#endif + switch (rc) { + case VMX_RETURN_DIRECT: + if (vmxctx->launched == 0) { + vmxctx->launched = 1; + vmx_launch(vmxctx); + } else + vmx_resume(vmxctx); + panic("vmx_launch/resume should not return"); + break; + case VMX_RETURN_LONGJMP: + break; /* vm exit */ + case VMX_RETURN_AST: + astpending = 1; + break; + case VMX_RETURN_VMRESUME: + vie = vmcs_instruction_error(); + if (vmxctx->launch_error == VM_FAIL_INVALID || + vie != VMRESUME_WITH_NON_LAUNCHED_VMCS) { + printf("vmresume error %d vmcs inst error %d\n", + vmxctx->launch_error, vie); + goto err_exit; + } + vmx_launch(vmxctx); /* try to launch the guest */ + panic("vmx_launch should not return"); + break; + case VMX_RETURN_VMLAUNCH: + vie = vmcs_instruction_error(); +#if 1 + printf("vmlaunch error %d vmcs inst error %d\n", + vmxctx->launch_error, vie); +#endif + goto err_exit; + default: + panic("vmx_setjmp returned %d", rc); + } + + /* collect some basic information for VM exit processing */ + vmexit->rip = rip = vmcs_guest_rip(); + vmexit->inst_length = vmexit_instruction_length(); + vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); + vmexit->u.vmx.exit_qualification = vmcs_exit_qualification(); + + /* Update 'nextrip' */ + vmx->state[vcpu].nextrip = rip; + + /* enable interrupts */ + enable_intr(); + + if (astpending) { + handled = 1; + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_BOGUS; + vmx_astpending_trace(vmx, vcpu, rip); + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_ASTPENDING, 1); + break; + } + + handled = vmx_exit_process(vmx, vcpu, vmexit); + vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled); + + } while (handled); + + /* + * If a VM exit has been handled then the exitcode must be BOGUS + * If a VM exit is not handled then the exitcode must not be BOGUS + */ + if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) || + (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) { + panic("Mismatch between handled (%d) and exitcode (%d)", + handled, vmexit->exitcode); + } + + if (!handled) + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_USERSPACE, 1); + + VCPU_CTR1(vmx->vm, vcpu, "goto userland: exitcode %d", + vmexit->exitcode); + + VMCLEAR(vmcs); + vmx_msr_guest_exit(vmx, vcpu); + + return (0); + +err_exit: + vmexit->exitcode = VM_EXITCODE_VMX; + vmexit->u.vmx.exit_reason = (uint32_t)-1; + vmexit->u.vmx.exit_qualification = (uint32_t)-1; + vmexit->u.vmx.status = ~0; + VMCLEAR(vmcs); + vmx_msr_guest_exit(vmx, vcpu); + + return (ENOEXEC); +} + +static void +vmx_vmcleanup(void *arg) +{ + int i, error; + struct vmx *vmx = arg; + + for (i = 0; i < VM_MAXCPU; i++) + vpid_free(vmx->state[i].vpid); + + /* + * XXXSMP we also need to clear the VMCS active on the other vcpus. 
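+ * Until that is fixed only vcpu 0's VMCS is vmclear'd below; a VMCS
+ * left active on another CPU may have cached VMCS data written back
+ * to it by that processor after this memory has been freed.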
+ */ + error = vmclear(&vmx->vmcs[0]); + if (error != 0) + panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error); + + ept_vmcleanup(vmx); + free(vmx, M_VMX); + + return; +} + +static register_t * +vmxctx_regptr(struct vmxctx *vmxctx, int reg) +{ + + switch (reg) { + case VM_REG_GUEST_RAX: + return (&vmxctx->guest_rax); + case VM_REG_GUEST_RBX: + return (&vmxctx->guest_rbx); + case VM_REG_GUEST_RCX: + return (&vmxctx->guest_rcx); + case VM_REG_GUEST_RDX: + return (&vmxctx->guest_rdx); + case VM_REG_GUEST_RSI: + return (&vmxctx->guest_rsi); + case VM_REG_GUEST_RDI: + return (&vmxctx->guest_rdi); + case VM_REG_GUEST_RBP: + return (&vmxctx->guest_rbp); + case VM_REG_GUEST_R8: + return (&vmxctx->guest_r8); + case VM_REG_GUEST_R9: + return (&vmxctx->guest_r9); + case VM_REG_GUEST_R10: + return (&vmxctx->guest_r10); + case VM_REG_GUEST_R11: + return (&vmxctx->guest_r11); + case VM_REG_GUEST_R12: + return (&vmxctx->guest_r12); + case VM_REG_GUEST_R13: + return (&vmxctx->guest_r13); + case VM_REG_GUEST_R14: + return (&vmxctx->guest_r14); + case VM_REG_GUEST_R15: + return (&vmxctx->guest_r15); + case VM_REG_GUEST_CR2: + return (&vmxctx->guest_cr2); + default: + break; + } + return (NULL); +} + +static int +vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval) +{ + register_t *regp; + + if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { + *retval = *regp; + return (0); + } else + return (EINVAL); +} + +static int +vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val) +{ + register_t *regp; + + if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { + *regp = val; + return (0); + } else + return (EINVAL); +} + +static int +vmx_shadow_reg(int reg) +{ + int shreg; + + shreg = -1; + + switch (reg) { + case VM_REG_GUEST_CR0: + shreg = VMCS_CR0_SHADOW; + break; + case VM_REG_GUEST_CR4: + shreg = VMCS_CR4_SHADOW; + break; + default: + break; + } + + return (shreg); +} + +static int +vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval) +{ + int running, hostcpu; + struct vmx *vmx = arg; + + running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); + if (running && hostcpu != curcpu) + panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu); + + if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0) + return (0); + + return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval)); +} + +static int +vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) +{ + int error, hostcpu, running, shadow; + uint64_t ctls; + struct vmx *vmx = arg; + + running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); + if (running && hostcpu != curcpu) + panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu); + + if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0) + return (0); + + error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val); + + if (error == 0) { + /* + * If the "load EFER" VM-entry control is 1 then the + * value of EFER.LMA must be identical to "IA-32e mode guest" + * bit in the VM-entry control. 
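+ * This mirrors the CR0 emulation in vmx_emulate_cr0_access(): when
+ * userspace sets a 64-bit guest's EFER with LMA=1, VM_ENTRY_GUEST_LMA
+ * must be set in the entry controls as well, and clearing LMA must
+ * clear it.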
+ */ + if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 && + (reg == VM_REG_GUEST_EFER)) { + vmcs_getreg(&vmx->vmcs[vcpu], running, + VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls); + if (val & EFER_LMA) + ctls |= VM_ENTRY_GUEST_LMA; + else + ctls &= ~VM_ENTRY_GUEST_LMA; + vmcs_setreg(&vmx->vmcs[vcpu], running, + VMCS_IDENT(VMCS_ENTRY_CTLS), ctls); + } + + shadow = vmx_shadow_reg(reg); + if (shadow > 0) { + /* + * Store the unmodified value in the shadow + */ + error = vmcs_setreg(&vmx->vmcs[vcpu], running, + VMCS_IDENT(shadow), val); + } + } + + return (error); +} + +static int +vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) +{ + int hostcpu, running; + struct vmx *vmx = arg; + + running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); + if (running && hostcpu != curcpu) + panic("vmx_getdesc: %s%d is running", vm_name(vmx->vm), vcpu); + + return (vmcs_getdesc(&vmx->vmcs[vcpu], running, reg, desc)); +} + +static int +vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) +{ + int hostcpu, running; + struct vmx *vmx = arg; + + running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); + if (running && hostcpu != curcpu) + panic("vmx_setdesc: %s%d is running", vm_name(vmx->vm), vcpu); + + return (vmcs_setdesc(&vmx->vmcs[vcpu], running, reg, desc)); +} + +static int +vmx_getcap(void *arg, int vcpu, int type, int *retval) +{ + struct vmx *vmx = arg; + int vcap; + int ret; + + ret = ENOENT; + + vcap = vmx->cap[vcpu].set; + + switch (type) { + case VM_CAP_HALT_EXIT: + if (cap_halt_exit) + ret = 0; + break; + case VM_CAP_PAUSE_EXIT: + if (cap_pause_exit) + ret = 0; + break; + case VM_CAP_MTRAP_EXIT: + if (cap_monitor_trap) + ret = 0; + break; + case VM_CAP_UNRESTRICTED_GUEST: + if (cap_unrestricted_guest) + ret = 0; + break; + default: + break; + } + + if (ret == 0) + *retval = (vcap & (1 << type)) ? 
1 : 0; + + return (ret); +} + +static int +vmx_setcap(void *arg, int vcpu, int type, int val) +{ + struct vmx *vmx = arg; + struct vmcs *vmcs = &vmx->vmcs[vcpu]; + uint32_t baseval; + uint32_t *pptr; + int error; + int flag; + int reg; + int retval; + + retval = ENOENT; + pptr = NULL; + + switch (type) { + case VM_CAP_HALT_EXIT: + if (cap_halt_exit) { + retval = 0; + pptr = &vmx->cap[vcpu].proc_ctls; + baseval = *pptr; + flag = PROCBASED_HLT_EXITING; + reg = VMCS_PRI_PROC_BASED_CTLS; + } + break; + case VM_CAP_MTRAP_EXIT: + if (cap_monitor_trap) { + retval = 0; + pptr = &vmx->cap[vcpu].proc_ctls; + baseval = *pptr; + flag = PROCBASED_MTF; + reg = VMCS_PRI_PROC_BASED_CTLS; + } + break; + case VM_CAP_PAUSE_EXIT: + if (cap_pause_exit) { + retval = 0; + pptr = &vmx->cap[vcpu].proc_ctls; + baseval = *pptr; + flag = PROCBASED_PAUSE_EXITING; + reg = VMCS_PRI_PROC_BASED_CTLS; + } + break; + case VM_CAP_UNRESTRICTED_GUEST: + if (cap_unrestricted_guest) { + retval = 0; + baseval = procbased_ctls2; + flag = PROCBASED2_UNRESTRICTED_GUEST; + reg = VMCS_SEC_PROC_BASED_CTLS; + } + break; + default: + break; + } + + if (retval == 0) { + if (val) { + baseval |= flag; + } else { + baseval &= ~flag; + } + VMPTRLD(vmcs); + error = vmwrite(reg, baseval); + VMCLEAR(vmcs); + + if (error) { + retval = error; + } else { + /* + * Update optional stored flags, and record + * setting + */ + if (pptr != NULL) { + *pptr = baseval; + } + + if (val) { + vmx->cap[vcpu].set |= (1 << type); + } else { + vmx->cap[vcpu].set &= ~(1 << type); + } + } + } + + return (retval); +} + +struct vlapic_vtx { + struct vlapic vlapic; + struct pir_desc *pir_desc; + struct vmx *vmx; +}; + +#define VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg) \ +do { \ + VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d", \ + level ? "level" : "edge", vector); \ + VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]); \ + VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]); \ + VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]); \ + VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]); \ + VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? "yes" : "no");\ +} while (0) + +/* + * vlapic->ops handlers that utilize the APICv hardware assist described in + * Chapter 29 of the Intel SDM. + */ +static int +vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level) +{ + struct vlapic_vtx *vlapic_vtx; + struct pir_desc *pir_desc; + uint64_t mask; + int idx, notify; + + vlapic_vtx = (struct vlapic_vtx *)vlapic; + pir_desc = vlapic_vtx->pir_desc; + + /* + * Keep track of interrupt requests in the PIR descriptor. This is + * because the virtual APIC page pointed to by the VMCS cannot be + * modified if the vcpu is running. + */ + idx = vector / 64; + mask = 1UL << (vector % 64); + atomic_set_long(&pir_desc->pir[idx], mask); + notify = atomic_cmpset_long(&pir_desc->pending, 0, 1); + + VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector, + level, "vmx_set_intr_ready"); + return (notify); +} + +static int +vmx_pending_intr(struct vlapic *vlapic, int *vecptr) +{ + struct vlapic_vtx *vlapic_vtx; + struct pir_desc *pir_desc; + struct LAPIC *lapic; + uint64_t pending, pirval; + uint32_t ppr, vpr; + int i; + + /* + * This function is only expected to be called from the 'HLT' exit + * handler which does not care about the vector that is pending. 
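+ * The caller only needs a yes/no answer: should the halted vcpu be
+ * woken? The priority test below therefore compares priority classes,
+ * the upper nibble of the PPR and of the pending vector, rather than
+ * exact vectors.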
+ */ + KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL")); + + vlapic_vtx = (struct vlapic_vtx *)vlapic; + pir_desc = vlapic_vtx->pir_desc; + + pending = atomic_load_acq_long(&pir_desc->pending); + if (!pending) + return (0); /* common case */ + + /* + * If there is an interrupt pending then it will be recognized only + * if its priority is greater than the processor priority. + * + * Special case: if the processor priority is zero then any pending + * interrupt will be recognized. + */ + lapic = vlapic->apic_page; + ppr = lapic->ppr & 0xf0; + if (ppr == 0) + return (1); + + VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d", + lapic->ppr); + + for (i = 3; i >= 0; i--) { + pirval = pir_desc->pir[i]; + if (pirval != 0) { + vpr = (i * 64 + flsl(pirval) - 1) & 0xf0; + return (vpr > ppr); + } + } + return (0); +} + +static void +vmx_intr_accepted(struct vlapic *vlapic, int vector) +{ + + panic("vmx_intr_accepted: not expected to be called"); +} + +static void +vmx_set_tmr(struct vlapic *vlapic, int vector, bool level) +{ + struct vlapic_vtx *vlapic_vtx; + struct vmx *vmx; + struct vmcs *vmcs; + uint64_t mask, val; + + KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); + KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL), + ("vmx_set_tmr: vcpu cannot be running")); + + vlapic_vtx = (struct vlapic_vtx *)vlapic; + vmx = vlapic_vtx->vmx; + vmcs = &vmx->vmcs[vlapic->vcpuid]; + mask = 1UL << (vector % 64); + + VMPTRLD(vmcs); + val = vmcs_read(VMCS_EOI_EXIT(vector)); + if (level) + val |= mask; + else + val &= ~mask; + vmcs_write(VMCS_EOI_EXIT(vector), val); + VMCLEAR(vmcs); +} + +static void +vmx_post_intr(struct vlapic *vlapic, int hostcpu) +{ + + ipi_cpu(hostcpu, pirvec); +} + +/* + * Transfer the pending interrupts in the PIR descriptor to the IRR + * in the virtual APIC page. + */ +static void +vmx_inject_pir(struct vlapic *vlapic) +{ + struct vlapic_vtx *vlapic_vtx; + struct pir_desc *pir_desc; + struct LAPIC *lapic; + uint64_t val, pirval; + int rvi, pirbase = -1; + uint16_t intr_status_old, intr_status_new; + + vlapic_vtx = (struct vlapic_vtx *)vlapic; + pir_desc = vlapic_vtx->pir_desc; + if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) { + VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: " + "no posted interrupt pending"); + return; + } + + pirval = 0; + pirbase = -1; + lapic = vlapic->apic_page; + + val = atomic_readandclear_long(&pir_desc->pir[0]); + if (val != 0) { + lapic->irr0 |= val; + lapic->irr1 |= val >> 32; + pirbase = 0; + pirval = val; + } + + val = atomic_readandclear_long(&pir_desc->pir[1]); + if (val != 0) { + lapic->irr2 |= val; + lapic->irr3 |= val >> 32; + pirbase = 64; + pirval = val; + } + + val = atomic_readandclear_long(&pir_desc->pir[2]); + if (val != 0) { + lapic->irr4 |= val; + lapic->irr5 |= val >> 32; + pirbase = 128; + pirval = val; + } + + val = atomic_readandclear_long(&pir_desc->pir[3]); + if (val != 0) { + lapic->irr6 |= val; + lapic->irr7 |= val >> 32; + pirbase = 192; + pirval = val; + } + + VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir"); + + /* + * Update RVI so the processor can evaluate pending virtual + * interrupts on VM-entry. + * + * It is possible for pirval to be 0 here, even though the + * pending bit has been set. The scenario is: + * CPU-Y is sending a posted interrupt to CPU-X, which + * is running a guest and processing posted interrupts in h/w. + * CPU-X will eventually exit and the state seen in s/w is + * the pending bit set, but no PIR bits set. 
+ * + * CPU-X CPU-Y + * (vm running) (host running) + * rx posted interrupt + * CLEAR pending bit + * SET PIR bit + * READ/CLEAR PIR bits + * SET pending bit + * (vm exit) + * pending bit set, PIR 0 + */ + if (pirval != 0) { + rvi = pirbase + flsl(pirval) - 1; + intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS); + intr_status_new = (intr_status_old & 0xFF00) | rvi; + if (intr_status_new > intr_status_old) { + vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new); + VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: " + "guest_intr_status changed from 0x%04x to 0x%04x", + intr_status_old, intr_status_new); + } + } +} + +static struct vlapic * +vmx_vlapic_init(void *arg, int vcpuid) +{ + struct vmx *vmx; + struct vlapic *vlapic; + struct vlapic_vtx *vlapic_vtx; + + vmx = arg; + + vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO); + vlapic->vm = vmx->vm; + vlapic->vcpuid = vcpuid; + vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid]; + + vlapic_vtx = (struct vlapic_vtx *)vlapic; + vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid]; + vlapic_vtx->vmx = vmx; + + if (virtual_interrupt_delivery) { + vlapic->ops.set_intr_ready = vmx_set_intr_ready; + vlapic->ops.pending_intr = vmx_pending_intr; + vlapic->ops.intr_accepted = vmx_intr_accepted; + vlapic->ops.set_tmr = vmx_set_tmr; +#ifdef __FreeBSD__ + vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode; +#endif + } + + if (posted_interrupts) + vlapic->ops.post_intr = vmx_post_intr; + + vlapic_init(vlapic); + + return (vlapic); +} + +static void +vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic) +{ + + vlapic_cleanup(vlapic); + free(vlapic, M_VLAPIC); +} + +struct vmm_ops vmm_ops_intel = { + vmx_init, + vmx_cleanup, + vmx_vminit, + vmx_run, + vmx_vmcleanup, + ept_vmmmap_set, + ept_vmmmap_get, + vmx_getreg, + vmx_setreg, + vmx_getdesc, + vmx_setdesc, + vmx_getcap, + vmx_setcap, + vmx_vlapic_init, + vmx_vlapic_cleanup, +}; diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx.h b/usr/src/uts/i86pc/io/vmm/intel/vmx.h new file mode 100644 index 0000000000..50ca62b371 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx.h @@ -0,0 +1,156 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD: head/sys/amd64/vmm/intel/vmx.h 284174 2015-06-09 00:14:47Z tychon $ + */ + +#ifndef _VMX_H_ +#define _VMX_H_ + +#include "vmcs.h" + +#ifndef __FreeBSD__ +#define GUEST_MSR_MAX_ENTRIES 64 /* arbitrary */ +#define HOST_MSR_MAX_ENTRIES 64 /* arbitrary */ +#endif + +struct vmxctx { + register_t tmpstk[32]; /* vmx_return() stack */ + register_t tmpstktop; + + register_t guest_rdi; /* Guest state */ + register_t guest_rsi; + register_t guest_rdx; + register_t guest_rcx; + register_t guest_r8; + register_t guest_r9; + register_t guest_rax; + register_t guest_rbx; + register_t guest_rbp; + register_t guest_r10; + register_t guest_r11; + register_t guest_r12; + register_t guest_r13; + register_t guest_r14; + register_t guest_r15; + register_t guest_cr2; + + register_t host_r15; /* Host state */ + register_t host_r14; + register_t host_r13; + register_t host_r12; + register_t host_rbp; + register_t host_rsp; + register_t host_rbx; + register_t host_rip; + /* + * XXX todo debug registers and fpu state + */ + + int launched; /* vmcs launch state */ + int launch_error; +}; + +struct vmxcap { + int set; + uint32_t proc_ctls; + uint32_t proc_ctls2; +}; + +struct vmxstate { + uint64_t nextrip; /* next instruction to be executed by guest */ + int lastcpu; /* host cpu that this 'vcpu' last ran on */ + uint16_t vpid; +}; + +struct apic_page { + uint32_t reg[PAGE_SIZE / 4]; +}; +CTASSERT(sizeof(struct apic_page) == PAGE_SIZE); + +/* Posted Interrupt Descriptor (described in section 29.6 of the Intel SDM) */ +struct pir_desc { + uint64_t pir[4]; + uint64_t pending; + uint64_t unused[3]; +} __aligned(64); +CTASSERT(sizeof(struct pir_desc) == 64); + +/* Index into the 'guest_msrs[]' array */ +enum { + IDX_MSR_LSTAR, + IDX_MSR_CSTAR, + IDX_MSR_STAR, + IDX_MSR_SF_MASK, + IDX_MSR_KGSBASE, + GUEST_MSR_NUM /* must be the last enumeration */ +}; + +/* virtual machine softc */ +struct vmx { + pml4_entry_t pml4ept[NPML4EPG]; + struct vmcs vmcs[VM_MAXCPU]; /* one vmcs per virtual cpu */ + struct apic_page apic_page[VM_MAXCPU]; /* one apic page per vcpu */ + char msr_bitmap[PAGE_SIZE]; + struct pir_desc pir_desc[VM_MAXCPU]; +#ifdef __FreeBSD__ + uint64_t guest_msrs[VM_MAXCPU][GUEST_MSR_NUM]; +#else + struct msr_entry guest_msrs[VM_MAXCPU][GUEST_MSR_MAX_ENTRIES]; + struct msr_entry host_msrs[VM_MAXCPU][HOST_MSR_MAX_ENTRIES]; +#endif + struct vmxctx ctx[VM_MAXCPU]; + struct vmxcap cap[VM_MAXCPU]; + struct vmxstate state[VM_MAXCPU]; + struct vm *vm; +}; +CTASSERT((offsetof(struct vmx, pml4ept) & PAGE_MASK) == 0); +CTASSERT((offsetof(struct vmx, vmcs) & PAGE_MASK) == 0); +CTASSERT((offsetof(struct vmx, msr_bitmap) & PAGE_MASK) == 0); + +#define VMX_RETURN_DIRECT 0 +#define VMX_RETURN_LONGJMP 1 +#define VMX_RETURN_VMRESUME 2 +#define VMX_RETURN_VMLAUNCH 3 +#define VMX_RETURN_AST 4 +/* + * vmx_setjmp() returns: + * - 0 when it returns directly + * - 1 when it returns from vmx_longjmp + * - 2 when it returns from vmx_resume (which would only be in the error case) + * - 3 when it returns from vmx_launch (which would only be in the error case) + * - 4 when it returns from vmx_resume or vmx_launch because of AST pending + */ +int vmx_setjmp(struct vmxctx *ctx); +void vmx_longjmp(void); /* returns via vmx_setjmp */ +void vmx_launch(struct vmxctx *ctx) __dead2; /* may return via vmx_setjmp */ +void vmx_resume(struct vmxctx *ctx) __dead2; /* may return via vmx_setjmp */ + +u_long vmx_fix_cr0(u_long cr0); +u_long vmx_fix_cr4(u_long cr4); + +int vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset); + +#endif 
diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_controls.h b/usr/src/uts/i86pc/io/vmm/intel/vmx_controls.h new file mode 100644 index 0000000000..08b1469f19 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_controls.h @@ -0,0 +1,96 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/intel/vmx_controls.h 260410 2014-01-07 21:04:49Z neel $ + */ + +#ifndef _VMX_CONTROLS_H_ +#define _VMX_CONTROLS_H_ + +/* Pin-Based VM-Execution Controls */ +#define PINBASED_EXTINT_EXITING (1 << 0) +#define PINBASED_NMI_EXITING (1 << 3) +#define PINBASED_VIRTUAL_NMI (1 << 5) +#define PINBASED_PREMPTION_TIMER (1 << 6) +#define PINBASED_POSTED_INTERRUPT (1 << 7) + +/* Primary Processor-Based VM-Execution Controls */ +#define PROCBASED_INT_WINDOW_EXITING (1 << 2) +#define PROCBASED_TSC_OFFSET (1 << 3) +#define PROCBASED_HLT_EXITING (1 << 7) +#define PROCBASED_INVLPG_EXITING (1 << 9) +#define PROCBASED_MWAIT_EXITING (1 << 10) +#define PROCBASED_RDPMC_EXITING (1 << 11) +#define PROCBASED_RDTSC_EXITING (1 << 12) +#define PROCBASED_CR3_LOAD_EXITING (1 << 15) +#define PROCBASED_CR3_STORE_EXITING (1 << 16) +#define PROCBASED_CR8_LOAD_EXITING (1 << 19) +#define PROCBASED_CR8_STORE_EXITING (1 << 20) +#define PROCBASED_USE_TPR_SHADOW (1 << 21) +#define PROCBASED_NMI_WINDOW_EXITING (1 << 22) +#define PROCBASED_MOV_DR_EXITING (1 << 23) +#define PROCBASED_IO_EXITING (1 << 24) +#define PROCBASED_IO_BITMAPS (1 << 25) +#define PROCBASED_MTF (1 << 27) +#define PROCBASED_MSR_BITMAPS (1 << 28) +#define PROCBASED_MONITOR_EXITING (1 << 29) +#define PROCBASED_PAUSE_EXITING (1 << 30) +#define PROCBASED_SECONDARY_CONTROLS (1U << 31) + +/* Secondary Processor-Based VM-Execution Controls */ +#define PROCBASED2_VIRTUALIZE_APIC_ACCESSES (1 << 0) +#define PROCBASED2_ENABLE_EPT (1 << 1) +#define PROCBASED2_DESC_TABLE_EXITING (1 << 2) +#define PROCBASED2_ENABLE_RDTSCP (1 << 3) +#define PROCBASED2_VIRTUALIZE_X2APIC_MODE (1 << 4) +#define PROCBASED2_ENABLE_VPID (1 << 5) +#define PROCBASED2_WBINVD_EXITING (1 << 6) +#define PROCBASED2_UNRESTRICTED_GUEST (1 << 7) +#define PROCBASED2_APIC_REGISTER_VIRTUALIZATION (1 << 8) +#define PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY (1 << 9) +#define PROCBASED2_PAUSE_LOOP_EXITING (1 << 10) +#define 
PROCBASED2_ENABLE_INVPCID (1 << 12) + +/* VM Exit Controls */ +#define VM_EXIT_SAVE_DEBUG_CONTROLS (1 << 2) +#define VM_EXIT_HOST_LMA (1 << 9) +#define VM_EXIT_LOAD_PERF_GLOBAL_CTRL (1 << 12) +#define VM_EXIT_ACKNOWLEDGE_INTERRUPT (1 << 15) +#define VM_EXIT_SAVE_PAT (1 << 18) +#define VM_EXIT_LOAD_PAT (1 << 19) +#define VM_EXIT_SAVE_EFER (1 << 20) +#define VM_EXIT_LOAD_EFER (1 << 21) +#define VM_EXIT_SAVE_PREEMPTION_TIMER (1 << 22) + +/* VM Entry Controls */ +#define VM_ENTRY_LOAD_DEBUG_CONTROLS (1 << 2) +#define VM_ENTRY_GUEST_LMA (1 << 9) +#define VM_ENTRY_INTO_SMM (1 << 10) +#define VM_ENTRY_DEACTIVATE_DUAL_MONITOR (1 << 11) +#define VM_ENTRY_LOAD_PERF_GLOBAL_CTRL (1 << 13) +#define VM_ENTRY_LOAD_PAT (1 << 14) +#define VM_ENTRY_LOAD_EFER (1 << 15) + +#endif diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_cpufunc.h b/usr/src/uts/i86pc/io/vmm/intel/vmx_cpufunc.h new file mode 100644 index 0000000000..9513f6c70b --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_cpufunc.h @@ -0,0 +1,245 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/intel/vmx_cpufunc.h 245678 2013-01-20 03:42:49Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _VMX_CPUFUNC_H_ +#define _VMX_CPUFUNC_H_ + +struct vmcs; + +/* + * Section 5.2 "Conventions" from Intel Architecture Manual 2B. 
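+ * CF set signals VMfailInvalid and ZF set signals VMfailValid; the
+ * VMX_SET_ERROR_CODE fragment below tests the flags in that order
+ * and maps them to: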
+ * + * error + * VMsucceed 0 + * VMFailInvalid 1 + * VMFailValid 2 see also VMCS VM-Instruction Error Field + */ +#define VM_SUCCESS 0 +#define VM_FAIL_INVALID 1 +#define VM_FAIL_VALID 2 +#define VMX_SET_ERROR_CODE \ + " jnc 1f;" \ + " mov $1, %[error];" /* CF: error = 1 */ \ + " jmp 3f;" \ + "1: jnz 2f;" \ + " mov $2, %[error];" /* ZF: error = 2 */ \ + " jmp 3f;" \ + "2: mov $0, %[error];" \ + "3:" + +/* returns 0 on success and non-zero on failure */ +static __inline int +vmxon(char *region) +{ + int error; + uint64_t addr; + + addr = vtophys(region); + __asm __volatile("vmxon %[addr];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [addr] "m" (*(uint64_t *)&addr) + : "memory"); + + return (error); +} + +/* returns 0 on success and non-zero on failure */ +static __inline int +vmxon_pa(vm_paddr_t addr) +{ + int error; + + __asm __volatile("vmxon %[addr];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [addr] "m" (*(uint64_t *)&addr) + : "memory"); + + return (error); +} + +/* returns 0 on success and non-zero on failure */ +static __inline int +vmclear(struct vmcs *vmcs) +{ + int error; + uint64_t addr; + + addr = vtophys(vmcs); + __asm __volatile("vmclear %[addr];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [addr] "m" (*(uint64_t *)&addr) + : "memory"); + return (error); +} + +static __inline void +vmxoff(void) +{ + + __asm __volatile("vmxoff"); +} + +static __inline void +vmptrst(uint64_t *addr) +{ + + __asm __volatile("vmptrst %[addr]" :: [addr]"m" (*addr) : "memory"); +} + +static __inline int +vmptrld(struct vmcs *vmcs) +{ + int error; + uint64_t addr; + + addr = vtophys(vmcs); + __asm __volatile("vmptrld %[addr];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [addr] "m" (*(uint64_t *)&addr) + : "memory"); + return (error); +} + +static __inline int +vmwrite(uint64_t reg, uint64_t val) +{ + int error; + + __asm __volatile("vmwrite %[val], %[reg];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [val] "r" (val), [reg] "r" (reg) + : "memory"); + + return (error); +} + +static __inline int +vmread(uint64_t r, uint64_t *addr) +{ + int error; + + __asm __volatile("vmread %[r], %[addr];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [r] "r" (r), [addr] "m" (*addr) + : "memory"); + + return (error); +} + +static void __inline +VMCLEAR(struct vmcs *vmcs) +{ + int err; + + err = vmclear(vmcs); + if (err != 0) + panic("%s: vmclear(%p) error %d", __func__, vmcs, err); + + critical_exit(); +} + +static void __inline +VMPTRLD(struct vmcs *vmcs) +{ + int err; + + critical_enter(); + + err = vmptrld(vmcs); + if (err != 0) + panic("%s: vmptrld(%p) error %d", __func__, vmcs, err); +} + +#define INVVPID_TYPE_ADDRESS 0UL +#define INVVPID_TYPE_SINGLE_CONTEXT 1UL +#define INVVPID_TYPE_ALL_CONTEXTS 2UL + +struct invvpid_desc { + uint16_t vpid; + uint16_t _res1; + uint32_t _res2; + uint64_t linear_addr; +}; +CTASSERT(sizeof(struct invvpid_desc) == 16); + +static void __inline +invvpid(uint64_t type, struct invvpid_desc desc) +{ + int error; + + __asm __volatile("invvpid %[desc], %[type];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [desc] "m" (desc), [type] "r" (type) + : "memory"); + + if (error) + panic("invvpid error %d", error); +} + +#define INVEPT_TYPE_SINGLE_CONTEXT 1UL +#define INVEPT_TYPE_ALL_CONTEXTS 2UL +struct invept_desc { + uint64_t eptp; + uint64_t _res; +}; +CTASSERT(sizeof(struct invept_desc) == 16); + +static void __inline +invept(uint64_t type, struct invept_desc desc) +{ + int error; + + __asm __volatile("invept %[desc], %[type];" + VMX_SET_ERROR_CODE + : 
[error] "=r" (error) + : [desc] "m" (desc), [type] "r" (type) + : "memory"); + + if (error) + panic("invept error %d", error); +} +#endif diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.c b/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.c new file mode 100644 index 0000000000..1ced311ca8 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.c @@ -0,0 +1,445 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/intel/vmx_msr.c 284174 2015-06-09 00:14:47Z tychon $ + */ + +#include +__FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/vmx_msr.c 284174 2015-06-09 00:14:47Z tychon $"); + +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifndef __FreeBSD__ +#include +#endif + +#include "vmx.h" +#include "vmx_msr.h" + +static boolean_t +vmx_ctl_allows_one_setting(uint64_t msr_val, int bitpos) +{ + + if (msr_val & (1UL << (bitpos + 32))) + return (TRUE); + else + return (FALSE); +} + +static boolean_t +vmx_ctl_allows_zero_setting(uint64_t msr_val, int bitpos) +{ + + if ((msr_val & (1UL << bitpos)) == 0) + return (TRUE); + else + return (FALSE); +} + +uint32_t +vmx_revision(void) +{ + + return (rdmsr(MSR_VMX_BASIC) & 0xffffffff); +} + +/* + * Generate a bitmask to be used for the VMCS execution control fields. + * + * The caller specifies what bits should be set to one in 'ones_mask' + * and what bits should be set to zero in 'zeros_mask'. The don't-care + * bits are set to the default value. The default values are obtained + * based on "Algorithm 3" in Section 27.5.1 "Algorithms for Determining + * VMX Capabilities". + * + * Returns zero on success and non-zero on error. 
+ */ +int +vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask, + uint32_t zeros_mask, uint32_t *retval) +{ + int i; + uint64_t val, trueval; + boolean_t true_ctls_avail, one_allowed, zero_allowed; + + /* We cannot ask the same bit to be set to both '1' and '0' */ + if ((ones_mask ^ zeros_mask) != (ones_mask | zeros_mask)) + return (EINVAL); + + if (rdmsr(MSR_VMX_BASIC) & (1UL << 55)) + true_ctls_avail = TRUE; + else + true_ctls_avail = FALSE; + + val = rdmsr(ctl_reg); + if (true_ctls_avail) + trueval = rdmsr(true_ctl_reg); /* step c */ + else + trueval = val; /* step a */ + + for (i = 0; i < 32; i++) { + one_allowed = vmx_ctl_allows_one_setting(trueval, i); + zero_allowed = vmx_ctl_allows_zero_setting(trueval, i); + + KASSERT(one_allowed || zero_allowed, + ("invalid zero/one setting for bit %d of ctl 0x%0x, " + "truectl 0x%0x\n", i, ctl_reg, true_ctl_reg)); + + if (zero_allowed && !one_allowed) { /* b(i),c(i) */ + if (ones_mask & (1 << i)) + return (EINVAL); + *retval &= ~(1 << i); + } else if (one_allowed && !zero_allowed) { /* b(i),c(i) */ + if (zeros_mask & (1 << i)) + return (EINVAL); + *retval |= 1 << i; + } else { + if (zeros_mask & (1 << i)) /* b(ii),c(ii) */ + *retval &= ~(1 << i); + else if (ones_mask & (1 << i)) /* b(ii), c(ii) */ + *retval |= 1 << i; + else if (!true_ctls_avail) + *retval &= ~(1 << i); /* b(iii) */ + else if (vmx_ctl_allows_zero_setting(val, i))/* c(iii)*/ + *retval &= ~(1 << i); + else if (vmx_ctl_allows_one_setting(val, i)) /* c(iv) */ + *retval |= 1 << i; + else { + panic("vmx_set_ctlreg: unable to determine " + "correct value of ctl bit %d for msr " + "0x%0x and true msr 0x%0x", i, ctl_reg, + true_ctl_reg); + } + } + } + + return (0); +} + +void +msr_bitmap_initialize(char *bitmap) +{ + + memset(bitmap, 0xff, PAGE_SIZE); +} + +int +msr_bitmap_change_access(char *bitmap, u_int msr, int access) +{ + int byte, bit; + + if (msr <= 0x00001FFF) + byte = msr / 8; + else if (msr >= 0xC0000000 && msr <= 0xC0001FFF) + byte = 1024 + (msr - 0xC0000000) / 8; + else + return (EINVAL); + + bit = msr & 0x7; + + if (access & MSR_BITMAP_ACCESS_READ) + bitmap[byte] &= ~(1 << bit); + else + bitmap[byte] |= 1 << bit; + + byte += 2048; + if (access & MSR_BITMAP_ACCESS_WRITE) + bitmap[byte] &= ~(1 << bit); + else + bitmap[byte] |= 1 << bit; + + return (0); +} + +static uint64_t misc_enable; +static uint64_t platform_info; +static uint64_t turbo_ratio_limit; +static uint64_t host_msrs[GUEST_MSR_NUM]; + +static bool +nehalem_cpu(void) +{ + u_int family, model; + + /* + * The family:model numbers belonging to the Nehalem microarchitecture + * are documented in Section 35.5, Intel SDM dated Feb 2014. + */ + family = CPUID_TO_FAMILY(cpu_id); + model = CPUID_TO_MODEL(cpu_id); + if (family == 0x6) { + switch (model) { + case 0x1A: + case 0x1E: + case 0x1F: + case 0x2E: + return (true); + default: + break; + } + } + return (false); +} + +static bool +westmere_cpu(void) +{ + u_int family, model; + + /* + * The family:model numbers belonging to the Westmere microarchitecture + * are documented in Section 35.6, Intel SDM dated Feb 2014. + */ + family = CPUID_TO_FAMILY(cpu_id); + model = CPUID_TO_MODEL(cpu_id); + if (family == 0x6) { + switch (model) { + case 0x25: + case 0x2C: + return (true); + default: + break; + } + } + return (false); +} + +void +vmx_msr_init(void) +{ + uint64_t bus_freq, ratio; + int i; + +#ifdef __FreeBSD__ + /* + * It is safe to cache the values of the following MSRs because + * they don't change based on curcpu, curproc or curthread. 
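+ *
+ * MSR_LSTAR, for instance, holds the host kernel's 64-bit SYSCALL entry
+ * point, which is written once at boot and never changes afterwards, so a
+ * single cached copy is valid for every thread on every CPU.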
+ */ + host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR); + host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR); + host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); + host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); +#endif + + /* + * Initialize emulated MSRs + */ + misc_enable = rdmsr(MSR_IA32_MISC_ENABLE); + /* + * Set mandatory bits + * 11: branch trace disabled + * 12: PEBS unavailable + * Clear unsupported features + * 16: SpeedStep enable + * 18: enable MONITOR FSM + */ + misc_enable |= (1 << 12) | (1 << 11); + misc_enable &= ~((1 << 18) | (1 << 16)); + + if (nehalem_cpu() || westmere_cpu()) + bus_freq = 133330000; /* 133Mhz */ + else + bus_freq = 100000000; /* 100Mhz */ + + /* + * XXXtime + * The ratio should really be based on the virtual TSC frequency as + * opposed to the host TSC. + */ + ratio = (tsc_freq / bus_freq) & 0xff; + + /* + * The register definition is based on the micro-architecture + * but the following bits are always the same: + * [15:8] Maximum Non-Turbo Ratio + * [28] Programmable Ratio Limit for Turbo Mode + * [29] Programmable TDC-TDP Limit for Turbo Mode + * [47:40] Maximum Efficiency Ratio + * + * The other bits can be safely set to 0 on all + * micro-architectures up to Haswell. + */ + platform_info = (ratio << 8) | (ratio << 40); + + /* + * The number of valid bits in the MSR_TURBO_RATIO_LIMITx register is + * dependent on the maximum cores per package supported by the micro- + * architecture. For e.g., Westmere supports 6 cores per package and + * uses the low 48 bits. Sandybridge support 8 cores per package and + * uses up all 64 bits. + * + * However, the unused bits are reserved so we pretend that all bits + * in this MSR are valid. + */ + for (i = 0; i < 8; i++) + turbo_ratio_limit = (turbo_ratio_limit << 8) | ratio; +} + +void +vmx_msr_guest_init(struct vmx *vmx, int vcpuid) +{ + /* + * The permissions bitmap is shared between all vcpus so initialize it + * once when initializing the vBSP. + */ + if (vcpuid == 0) { + guest_msr_rw(vmx, MSR_LSTAR); + guest_msr_rw(vmx, MSR_CSTAR); + guest_msr_rw(vmx, MSR_STAR); + guest_msr_rw(vmx, MSR_SF_MASK); + guest_msr_rw(vmx, MSR_KGSBASE); + } + return; +} + +void +vmx_msr_guest_enter(struct vmx *vmx, int vcpuid) +{ +#ifdef __FreeBSD__ + uint64_t *guest_msrs = vmx->guest_msrs[vcpuid]; + + /* Save host MSRs (if any) and restore guest MSRs */ + wrmsr(MSR_LSTAR, guest_msrs[IDX_MSR_LSTAR]); + wrmsr(MSR_CSTAR, guest_msrs[IDX_MSR_CSTAR]); + wrmsr(MSR_STAR, guest_msrs[IDX_MSR_STAR]); + wrmsr(MSR_SF_MASK, guest_msrs[IDX_MSR_SF_MASK]); + wrmsr(MSR_KGSBASE, guest_msrs[IDX_MSR_KGSBASE]); +#endif +} + +void +vmx_msr_guest_exit(struct vmx *vmx, int vcpuid) +{ +#ifdef __FreeBSD__ + uint64_t *guest_msrs = vmx->guest_msrs[vcpuid]; + + /* Save guest MSRs */ + guest_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR); + guest_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR); + guest_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); + guest_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); + guest_msrs[IDX_MSR_KGSBASE] = rdmsr(MSR_KGSBASE); + + /* Restore host MSRs */ + wrmsr(MSR_LSTAR, host_msrs[IDX_MSR_LSTAR]); + wrmsr(MSR_CSTAR, host_msrs[IDX_MSR_CSTAR]); + wrmsr(MSR_STAR, host_msrs[IDX_MSR_STAR]); + wrmsr(MSR_SF_MASK, host_msrs[IDX_MSR_SF_MASK]); + + /* MSR_KGSBASE will be restored on the way back to userspace */ +#endif +} + +int +vmx_rdmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t *val, bool *retu) +{ + int error = 0; + + switch (num) { + case MSR_MCG_CAP: + case MSR_MCG_STATUS: + *val = 0; + break; + case MSR_MTRRcap: + case MSR_MTRRdefType: + case MSR_MTRR4kBase ... 
MSR_MTRR4kBase + 8: + case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: + case MSR_MTRR64kBase: + *val = 0; + break; + case MSR_IA32_MISC_ENABLE: + *val = misc_enable; + break; + case MSR_PLATFORM_INFO: + *val = platform_info; + break; + case MSR_TURBO_RATIO_LIMIT: + case MSR_TURBO_RATIO_LIMIT1: + *val = turbo_ratio_limit; + break; + default: + error = EINVAL; + break; + } + return (error); +} + +int +vmx_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu) +{ + uint64_t changed; + int error; + + error = 0; + switch (num) { + case MSR_MCG_CAP: + case MSR_MCG_STATUS: + break; /* ignore writes */ + case MSR_MTRRcap: + vm_inject_gp(vmx->vm, vcpuid); + break; + case MSR_MTRRdefType: + case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8: + case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: + case MSR_MTRR64kBase: + break; /* Ignore writes */ + case MSR_IA32_MISC_ENABLE: + changed = val ^ misc_enable; + /* + * If the host has disabled the NX feature then the guest + * also cannot use it. However, a Linux guest will try to + * enable the NX feature by writing to the MISC_ENABLE MSR. + * + * This can be safely ignored because the memory management + * code looks at CPUID.80000001H:EDX.NX to check if the + * functionality is actually enabled. + */ + changed &= ~(1UL << 34); + + /* + * Punt to userspace if any other bits are being modified. + */ + if (changed) + error = EINVAL; + + break; + case MSR_TSC: + error = vmx_set_tsc_offset(vmx, vcpuid, val - rdtsc()); + break; + default: + error = EINVAL; + break; + } + + return (error); +} diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.h b/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.h new file mode 100644 index 0000000000..5300d14d9b --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.h @@ -0,0 +1,70 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD: head/sys/amd64/vmm/intel/vmx_msr.h 271888 2014-09-20 02:35:21Z neel $ + */ + +#ifndef _VMX_MSR_H_ +#define _VMX_MSR_H_ + +struct vmx; + +void vmx_msr_init(void); +void vmx_msr_guest_init(struct vmx *vmx, int vcpuid); +void vmx_msr_guest_enter(struct vmx *vmx, int vcpuid); +void vmx_msr_guest_exit(struct vmx *vmx, int vcpuid); +int vmx_rdmsr(struct vmx *, int vcpuid, u_int num, uint64_t *val, bool *retu); +int vmx_wrmsr(struct vmx *, int vcpuid, u_int num, uint64_t val, bool *retu); + +uint32_t vmx_revision(void); + +int vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask, + uint32_t zeros_mask, uint32_t *retval); + +/* + * According to Section 21.10.4 "Software Access to Related Structures", + * changes to data structures pointed to by the VMCS must be made only when + * there is no logical processor with a current VMCS that points to the + * data structure. + * + * This pretty much limits us to configuring the MSR bitmap before VMCS + * initialization for SMP VMs. Unless of course we do it the hard way - which + * would involve some form of synchronization between the vcpus to vmclear + * all VMCSs' that point to the bitmap. + */ +#define MSR_BITMAP_ACCESS_NONE 0x0 +#define MSR_BITMAP_ACCESS_READ 0x1 +#define MSR_BITMAP_ACCESS_WRITE 0x2 +#define MSR_BITMAP_ACCESS_RW (MSR_BITMAP_ACCESS_READ|MSR_BITMAP_ACCESS_WRITE) +void msr_bitmap_initialize(char *bitmap); +int msr_bitmap_change_access(char *bitmap, u_int msr, int access); + +#define guest_msr_rw(vmx, msr) \ + msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW) + +#define guest_msr_ro(vmx, msr) \ + msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_READ) + +#endif diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s b/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s new file mode 100644 index 0000000000..d57dde1093 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s @@ -0,0 +1,271 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/intel/vmx_support.S 245678 2013-01-20 03:42:49Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. 
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2013 Pluribus Networks Inc. + */ + +#include + +#include "vmx_assym.s" + +/* + * Disable interrupts before updating %rsp in VMX_CHECK_AST or + * VMX_GUEST_RESTORE. + * + * The location that %rsp points to is a 'vmxctx' and not a + * real stack so we don't want an interrupt handler to trash it + */ +#define VMX_DISABLE_INTERRUPTS cli + +/* + * If the thread hosting the vcpu has an ast pending then take care of it + * by returning from vmx_setjmp() with a return value of VMX_RETURN_AST. + * + * Assumes that %rdi holds a pointer to the 'vmxctx' and that interrupts + * are disabled. + */ +#ifdef __FreeBSD__ +#define VMX_CHECK_AST \ + movq PCPU(CURTHREAD),%rax; \ + testl $TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%rax); \ + je 9f; \ + movq $VMX_RETURN_AST,%rsi; \ + movq %rdi,%rsp; \ + addq $VMXCTX_TMPSTKTOP,%rsp; \ + callq vmx_return; \ +9: +#else +#define VMX_CHECK_AST \ + movq %gs:CPU_THREAD,%rax; \ + movl T_ASTFLAG(%rax),%eax; \ + test %al,%al; \ + je 9f; \ + movq $VMX_RETURN_AST,%rsi; \ + movq %rdi,%rsp; \ + addq $VMXCTX_TMPSTKTOP,%rsp; \ + callq vmx_return; \ +9: +#endif + +/* + * Assumes that %rdi holds a pointer to the 'vmxctx'. + * + * On "return" all registers are updated to reflect guest state. The two + * exceptions are %rip and %rsp. These registers are atomically switched + * by hardware from the guest area of the vmcs. + * + * We modify %rsp to point to the 'vmxctx' so we can use it to restore + * host context in case of an error with 'vmlaunch' or 'vmresume'. + */ +#define VMX_GUEST_RESTORE \ + movq %rdi,%rsp; \ + movq VMXCTX_GUEST_CR2(%rdi),%rsi; \ + movq %rsi,%cr2; \ + movq VMXCTX_GUEST_RSI(%rdi),%rsi; \ + movq VMXCTX_GUEST_RDX(%rdi),%rdx; \ + movq VMXCTX_GUEST_RCX(%rdi),%rcx; \ + movq VMXCTX_GUEST_R8(%rdi),%r8; \ + movq VMXCTX_GUEST_R9(%rdi),%r9; \ + movq VMXCTX_GUEST_RAX(%rdi),%rax; \ + movq VMXCTX_GUEST_RBX(%rdi),%rbx; \ + movq VMXCTX_GUEST_RBP(%rdi),%rbp; \ + movq VMXCTX_GUEST_R10(%rdi),%r10; \ + movq VMXCTX_GUEST_R11(%rdi),%r11; \ + movq VMXCTX_GUEST_R12(%rdi),%r12; \ + movq VMXCTX_GUEST_R13(%rdi),%r13; \ + movq VMXCTX_GUEST_R14(%rdi),%r14; \ + movq VMXCTX_GUEST_R15(%rdi),%r15; \ + movq VMXCTX_GUEST_RDI(%rdi),%rdi; /* restore rdi the last */ + +#define VM_INSTRUCTION_ERROR(reg) \ + jnc 1f; \ + movl $VM_FAIL_INVALID,reg; /* CF is set */ \ + jmp 3f; \ +1: jnz 2f; \ + movl $VM_FAIL_VALID,reg; /* ZF is set */ \ + jmp 3f; \ +2: movl $VM_SUCCESS,reg; \ +3: movl reg,VMXCTX_LAUNCH_ERROR(%rsp) + + .text +/* + * int vmx_setjmp(ctxp) + * %rdi = ctxp + * + * Return value is '0' when it returns directly from here. + * Return value is '1' when it returns after a vm exit through vmx_longjmp. + */ +ENTRY(vmx_setjmp) + movq (%rsp),%rax /* return address */ + movq %r15,VMXCTX_HOST_R15(%rdi) + movq %r14,VMXCTX_HOST_R14(%rdi) + movq %r13,VMXCTX_HOST_R13(%rdi) + movq %r12,VMXCTX_HOST_R12(%rdi) + movq %rbp,VMXCTX_HOST_RBP(%rdi) + movq %rsp,VMXCTX_HOST_RSP(%rdi) + movq %rbx,VMXCTX_HOST_RBX(%rdi) + movq %rax,VMXCTX_HOST_RIP(%rdi) + + /* + * XXX save host debug registers + */ + movl $VMX_RETURN_DIRECT,%eax + ret +END(vmx_setjmp) + +/* + * void vmx_return(struct vmxctx *ctxp, int retval) + * %rdi = ctxp + * %rsi = retval + * Return to vmm context through vmx_setjmp() with a value of 'retval'. 
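+ *
+ * The overall flow mirrors setjmp/longjmp.  A minimal sketch of the caller
+ * side (illustrative pseudocode only, not the actual vmx.c exit loop):
+ *
+ *	switch (vmx_setjmp(vmxctx)) {
+ *	case VMX_RETURN_DIRECT:		launch or resume the guest
+ *	case VMX_RETURN_LONGJMP:	guest exited; handle the exit
+ *	case VMX_RETURN_VMRESUME:	vmresume failed; check launch_error
+ *	case VMX_RETURN_VMLAUNCH:	vmlaunch failed; check launch_error
+ *	case VMX_RETURN_AST:		pending AST; service it and retry
+ *	}
+ *
+ * Every non-direct return funnels through vmx_return() below, which
+ * restores the host registers saved by vmx_setjmp() and hands back 'retval'
+ * as the apparent return value of the original vmx_setjmp() call.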
+ */ +ENTRY(vmx_return) + /* Restore host context. */ + movq VMXCTX_HOST_R15(%rdi),%r15 + movq VMXCTX_HOST_R14(%rdi),%r14 + movq VMXCTX_HOST_R13(%rdi),%r13 + movq VMXCTX_HOST_R12(%rdi),%r12 + movq VMXCTX_HOST_RBP(%rdi),%rbp + movq VMXCTX_HOST_RSP(%rdi),%rsp + movq VMXCTX_HOST_RBX(%rdi),%rbx + movq VMXCTX_HOST_RIP(%rdi),%rax + movq %rax,(%rsp) /* return address */ + + /* + * XXX restore host debug registers + */ + movl %esi,%eax + ret +END(vmx_return) + +/* + * void vmx_longjmp(void) + * %rsp points to the struct vmxctx + */ +ENTRY(vmx_longjmp) + /* + * Save guest state that is not automatically saved in the vmcs. + */ + movq %rdi,VMXCTX_GUEST_RDI(%rsp) + movq %rsi,VMXCTX_GUEST_RSI(%rsp) + movq %rdx,VMXCTX_GUEST_RDX(%rsp) + movq %rcx,VMXCTX_GUEST_RCX(%rsp) + movq %r8,VMXCTX_GUEST_R8(%rsp) + movq %r9,VMXCTX_GUEST_R9(%rsp) + movq %rax,VMXCTX_GUEST_RAX(%rsp) + movq %rbx,VMXCTX_GUEST_RBX(%rsp) + movq %rbp,VMXCTX_GUEST_RBP(%rsp) + movq %r10,VMXCTX_GUEST_R10(%rsp) + movq %r11,VMXCTX_GUEST_R11(%rsp) + movq %r12,VMXCTX_GUEST_R12(%rsp) + movq %r13,VMXCTX_GUEST_R13(%rsp) + movq %r14,VMXCTX_GUEST_R14(%rsp) + movq %r15,VMXCTX_GUEST_R15(%rsp) + + movq %cr2,%rdi + movq %rdi,VMXCTX_GUEST_CR2(%rsp) + + movq %rsp,%rdi + movq $VMX_RETURN_LONGJMP,%rsi + + addq $VMXCTX_TMPSTKTOP,%rsp + callq vmx_return +END(vmx_longjmp) + +/* + * void vmx_resume(struct vmxctx *ctxp) + * %rdi = ctxp + * + * Although the return type is a 'void' this function may return indirectly + * through vmx_setjmp() with a return value of 2. + */ +ENTRY(vmx_resume) + VMX_DISABLE_INTERRUPTS + + VMX_CHECK_AST + + /* + * Restore guest state that is not automatically loaded from the vmcs. + */ + VMX_GUEST_RESTORE + + vmresume + + /* + * Capture the reason why vmresume failed. + */ + VM_INSTRUCTION_ERROR(%eax) + + /* Return via vmx_setjmp with return value of VMX_RETURN_VMRESUME */ + movq %rsp,%rdi + movq $VMX_RETURN_VMRESUME,%rsi + + addq $VMXCTX_TMPSTKTOP,%rsp + callq vmx_return +END(vmx_resume) + +/* + * void vmx_launch(struct vmxctx *ctxp) + * %rdi = ctxp + * + * Although the return type is a 'void' this function may return indirectly + * through vmx_setjmp() with a return value of 3. + */ +ENTRY(vmx_launch) + VMX_DISABLE_INTERRUPTS + + VMX_CHECK_AST + + /* + * Restore guest state that is not automatically loaded from the vmcs. + */ + VMX_GUEST_RESTORE + + vmlaunch + + /* + * Capture the reason why vmlaunch failed. + */ + VM_INSTRUCTION_ERROR(%eax) + + /* Return via vmx_setjmp with return value of VMX_RETURN_VMLAUNCH */ + movq %rsp,%rdi + movq $VMX_RETURN_VMLAUNCH,%rsi + + addq $VMXCTX_TMPSTKTOP,%rsp + callq vmx_return +END(vmx_launch) diff --git a/usr/src/uts/i86pc/io/vmm/io/vatpic.c b/usr/src/uts/i86pc/io/vmm/io/vatpic.c new file mode 100644 index 0000000000..a93b252c91 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vatpic.c @@ -0,0 +1,809 @@ +/*- + * Copyright (c) 2014 Tycho Nightingale + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vatpic.c 279683 2015-03-06 02:05:45Z tychon $"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include "vmm_ktr.h" +#include "vmm_lapic.h" +#include "vioapic.h" +#include "vatpic.h" + +static MALLOC_DEFINE(M_VATPIC, "atpic", "bhyve virtual atpic (8259)"); + +#define VATPIC_LOCK(vatpic) mtx_lock_spin(&((vatpic)->mtx)) +#define VATPIC_UNLOCK(vatpic) mtx_unlock_spin(&((vatpic)->mtx)) +#define VATPIC_LOCKED(vatpic) mtx_owned(&((vatpic)->mtx)) + +enum irqstate { + IRQSTATE_ASSERT, + IRQSTATE_DEASSERT, + IRQSTATE_PULSE +}; + +struct atpic { + bool ready; + int icw_num; + int rd_cmd_reg; + + bool aeoi; + bool poll; + bool rotate; + bool sfn; /* special fully-nested mode */ + + int irq_base; + uint8_t request; /* Interrupt Request Register (IIR) */ + uint8_t service; /* Interrupt Service (ISR) */ + uint8_t mask; /* Interrupt Mask Register (IMR) */ + uint8_t smm; /* special mask mode */ + + int acnt[8]; /* sum of pin asserts and deasserts */ + int lowprio; /* lowest priority irq */ + + bool intr_raised; +}; + +struct vatpic { + struct vm *vm; + struct mtx mtx; + struct atpic atpic[2]; + uint8_t elc[2]; +}; + +#define VATPIC_CTR0(vatpic, fmt) \ + VM_CTR0((vatpic)->vm, fmt) + +#define VATPIC_CTR1(vatpic, fmt, a1) \ + VM_CTR1((vatpic)->vm, fmt, a1) + +#define VATPIC_CTR2(vatpic, fmt, a1, a2) \ + VM_CTR2((vatpic)->vm, fmt, a1, a2) + +#define VATPIC_CTR3(vatpic, fmt, a1, a2, a3) \ + VM_CTR3((vatpic)->vm, fmt, a1, a2, a3) + +#define VATPIC_CTR4(vatpic, fmt, a1, a2, a3, a4) \ + VM_CTR4((vatpic)->vm, fmt, a1, a2, a3, a4) + +/* + * Loop over all the pins in priority order from highest to lowest. + */ +#define ATPIC_PIN_FOREACH(pinvar, atpic, tmpvar) \ + for (tmpvar = 0, pinvar = (atpic->lowprio + 1) & 0x7; \ + tmpvar < 8; \ + tmpvar++, pinvar = (pinvar + 1) & 0x7) + +static void vatpic_set_pinstate(struct vatpic *vatpic, int pin, bool newstate); + +static __inline bool +master_atpic(struct vatpic *vatpic, struct atpic *atpic) +{ + + if (atpic == &vatpic->atpic[0]) + return (true); + else + return (false); +} + +static __inline int +vatpic_get_highest_isrpin(struct atpic *atpic) +{ + int bit, pin; + int i; + + ATPIC_PIN_FOREACH(pin, atpic, i) { + bit = (1 << pin); + + if (atpic->service & bit) { + /* + * An IS bit that is masked by an IMR bit will not be + * cleared by a non-specific EOI in Special Mask Mode. 
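+			 *
+			 * Worked example (illustrative): with service = 0x09
+			 * (pins 0 and 3 in service), mask = 0x01 and smm set,
+			 * pin 0 is skipped here and pin 3 is returned, so a
+			 * non-specific EOI ends up clearing IS3 rather than
+			 * IS0.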
+ */ + if (atpic->smm && (atpic->mask & bit) != 0) + continue; + else + return (pin); + } + } + + return (-1); +} + +static __inline int +vatpic_get_highest_irrpin(struct atpic *atpic) +{ + int serviced; + int bit, pin, tmp; + + /* + * In 'Special Fully-Nested Mode' when an interrupt request from + * a slave is in service, the slave is not locked out from the + * master's priority logic. + */ + serviced = atpic->service; + if (atpic->sfn) + serviced &= ~(1 << 2); + + /* + * In 'Special Mask Mode', when a mask bit is set in OCW1 it inhibits + * further interrupts at that level and enables interrupts from all + * other levels that are not masked. In other words the ISR has no + * bearing on the levels that can generate interrupts. + */ + if (atpic->smm) + serviced = 0; + + ATPIC_PIN_FOREACH(pin, atpic, tmp) { + bit = 1 << pin; + + /* + * If there is already an interrupt in service at the same + * or higher priority then bail. + */ + if ((serviced & bit) != 0) + break; + + /* + * If an interrupt is asserted and not masked then return + * the corresponding 'pin' to the caller. + */ + if ((atpic->request & bit) != 0 && (atpic->mask & bit) == 0) + return (pin); + } + + return (-1); +} + +static void +vatpic_notify_intr(struct vatpic *vatpic) +{ + struct atpic *atpic; + int pin; + + KASSERT(VATPIC_LOCKED(vatpic), ("vatpic_notify_intr not locked")); + + /* + * First check the slave. + */ + atpic = &vatpic->atpic[1]; + if (!atpic->intr_raised && + (pin = vatpic_get_highest_irrpin(atpic)) != -1) { + VATPIC_CTR4(vatpic, "atpic slave notify pin = %d " + "(imr 0x%x irr 0x%x isr 0x%x)", pin, + atpic->mask, atpic->request, atpic->service); + + /* + * Cascade the request from the slave to the master. + */ + atpic->intr_raised = true; + vatpic_set_pinstate(vatpic, 2, true); + vatpic_set_pinstate(vatpic, 2, false); + } else { + VATPIC_CTR3(vatpic, "atpic slave no eligible interrupts " + "(imr 0x%x irr 0x%x isr 0x%x)", + atpic->mask, atpic->request, atpic->service); + } + + /* + * Then check the master. + */ + atpic = &vatpic->atpic[0]; + if (!atpic->intr_raised && + (pin = vatpic_get_highest_irrpin(atpic)) != -1) { + VATPIC_CTR4(vatpic, "atpic master notify pin = %d " + "(imr 0x%x irr 0x%x isr 0x%x)", pin, + atpic->mask, atpic->request, atpic->service); + + /* + * From Section 3.6.2, "Interrupt Modes", in the + * MPtable Specification, Version 1.4 + * + * PIC interrupts are routed to both the Local APIC + * and the I/O APIC to support operation in 1 of 3 + * modes. + * + * 1. Legacy PIC Mode: the PIC effectively bypasses + * all APIC components. In this mode the local APIC is + * disabled and LINT0 is reconfigured as INTR to + * deliver the PIC interrupt directly to the CPU. + * + * 2. Virtual Wire Mode: the APIC is treated as a + * virtual wire which delivers interrupts from the PIC + * to the CPU. In this mode LINT0 is programmed as + * ExtINT to indicate that the PIC is the source of + * the interrupt. + * + * 3. Virtual Wire Mode via I/O APIC: PIC interrupts are + * fielded by the I/O APIC and delivered to the appropriate + * CPU. In this mode the I/O APIC input 0 is programmed + * as ExtINT to indicate that the PIC is the source of the + * interrupt. 
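+		 *
+		 * Because the guest may be operating in any of these modes,
+		 * the code below drives both paths: it requests LINT0
+		 * delivery via the local APIC and pulses I/O APIC input 0,
+		 * and the guest's own APIC programming determines which
+		 * delivery actually takes effect.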
+ */ + atpic->intr_raised = true; + lapic_set_local_intr(vatpic->vm, -1, APIC_LVT_LINT0); + vioapic_pulse_irq(vatpic->vm, 0); + } else { + VATPIC_CTR3(vatpic, "atpic master no eligible interrupts " + "(imr 0x%x irr 0x%x isr 0x%x)", + atpic->mask, atpic->request, atpic->service); + } +} + +static int +vatpic_icw1(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic icw1 0x%x", val); + + atpic->ready = false; + + atpic->icw_num = 1; + atpic->request = 0; + atpic->mask = 0; + atpic->lowprio = 7; + atpic->rd_cmd_reg = 0; + atpic->poll = 0; + atpic->smm = 0; + + if ((val & ICW1_SNGL) != 0) { + VATPIC_CTR0(vatpic, "vatpic cascade mode required"); + return (-1); + } + + if ((val & ICW1_IC4) == 0) { + VATPIC_CTR0(vatpic, "vatpic icw4 required"); + return (-1); + } + + atpic->icw_num++; + + return (0); +} + +static int +vatpic_icw2(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic icw2 0x%x", val); + + atpic->irq_base = val & 0xf8; + + atpic->icw_num++; + + return (0); +} + +static int +vatpic_icw3(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic icw3 0x%x", val); + + atpic->icw_num++; + + return (0); +} + +static int +vatpic_icw4(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic icw4 0x%x", val); + + if ((val & ICW4_8086) == 0) { + VATPIC_CTR0(vatpic, "vatpic microprocessor mode required"); + return (-1); + } + + if ((val & ICW4_AEOI) != 0) + atpic->aeoi = true; + + if ((val & ICW4_SFNM) != 0) { + if (master_atpic(vatpic, atpic)) { + atpic->sfn = true; + } else { + VATPIC_CTR1(vatpic, "Ignoring special fully nested " + "mode on slave atpic: %#x", val); + } + } + + atpic->icw_num = 0; + atpic->ready = true; + + return (0); +} + +static int +vatpic_ocw1(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic ocw1 0x%x", val); + + atpic->mask = val & 0xff; + + return (0); +} + +static int +vatpic_ocw2(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic ocw2 0x%x", val); + + atpic->rotate = ((val & OCW2_R) != 0); + + if ((val & OCW2_EOI) != 0) { + int isr_bit; + + if ((val & OCW2_SL) != 0) { + /* specific EOI */ + isr_bit = val & 0x7; + } else { + /* non-specific EOI */ + isr_bit = vatpic_get_highest_isrpin(atpic); + } + + if (isr_bit != -1) { + atpic->service &= ~(1 << isr_bit); + + if (atpic->rotate) + atpic->lowprio = isr_bit; + } + } else if ((val & OCW2_SL) != 0 && atpic->rotate == true) { + /* specific priority */ + atpic->lowprio = val & 0x7; + } + + return (0); +} + +static int +vatpic_ocw3(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic ocw3 0x%x", val); + + if (val & OCW3_ESMM) { + atpic->smm = val & OCW3_SMM ? 1 : 0; + VATPIC_CTR2(vatpic, "%s atpic special mask mode %s", + master_atpic(vatpic, atpic) ? "master" : "slave", + atpic->smm ? 
"enabled" : "disabled"); + } + + if (val & OCW3_RR) { + /* read register command */ + atpic->rd_cmd_reg = val & OCW3_RIS; + + /* Polling mode */ + atpic->poll = ((val & OCW3_P) != 0); + } + + return (0); +} + +static void +vatpic_set_pinstate(struct vatpic *vatpic, int pin, bool newstate) +{ + struct atpic *atpic; + int oldcnt, newcnt; + bool level; + + KASSERT(pin >= 0 && pin < 16, + ("vatpic_set_pinstate: invalid pin number %d", pin)); + KASSERT(VATPIC_LOCKED(vatpic), + ("vatpic_set_pinstate: vatpic is not locked")); + + atpic = &vatpic->atpic[pin >> 3]; + + oldcnt = atpic->acnt[pin & 0x7]; + if (newstate) + atpic->acnt[pin & 0x7]++; + else + atpic->acnt[pin & 0x7]--; + newcnt = atpic->acnt[pin & 0x7]; + + if (newcnt < 0) { + VATPIC_CTR2(vatpic, "atpic pin%d: bad acnt %d", pin, newcnt); + } + + level = ((vatpic->elc[pin >> 3] & (1 << (pin & 0x7))) != 0); + + if ((oldcnt == 0 && newcnt == 1) || (newcnt > 0 && level == true)) { + /* rising edge or level */ + VATPIC_CTR1(vatpic, "atpic pin%d: asserted", pin); + atpic->request |= (1 << (pin & 0x7)); + } else if (oldcnt == 1 && newcnt == 0) { + /* falling edge */ + VATPIC_CTR1(vatpic, "atpic pin%d: deasserted", pin); + if (level) + atpic->request &= ~(1 << (pin & 0x7)); + } else { + VATPIC_CTR3(vatpic, "atpic pin%d: %s, ignored, acnt %d", + pin, newstate ? "asserted" : "deasserted", newcnt); + } + + vatpic_notify_intr(vatpic); +} + +static int +vatpic_set_irqstate(struct vm *vm, int irq, enum irqstate irqstate) +{ + struct vatpic *vatpic; + struct atpic *atpic; + + if (irq < 0 || irq > 15) + return (EINVAL); + + vatpic = vm_atpic(vm); + atpic = &vatpic->atpic[irq >> 3]; + + if (atpic->ready == false) + return (0); + + VATPIC_LOCK(vatpic); + switch (irqstate) { + case IRQSTATE_ASSERT: + vatpic_set_pinstate(vatpic, irq, true); + break; + case IRQSTATE_DEASSERT: + vatpic_set_pinstate(vatpic, irq, false); + break; + case IRQSTATE_PULSE: + vatpic_set_pinstate(vatpic, irq, true); + vatpic_set_pinstate(vatpic, irq, false); + break; + default: + panic("vatpic_set_irqstate: invalid irqstate %d", irqstate); + } + VATPIC_UNLOCK(vatpic); + + return (0); +} + +int +vatpic_assert_irq(struct vm *vm, int irq) +{ + return (vatpic_set_irqstate(vm, irq, IRQSTATE_ASSERT)); +} + +int +vatpic_deassert_irq(struct vm *vm, int irq) +{ + return (vatpic_set_irqstate(vm, irq, IRQSTATE_DEASSERT)); +} + +int +vatpic_pulse_irq(struct vm *vm, int irq) +{ + return (vatpic_set_irqstate(vm, irq, IRQSTATE_PULSE)); +} + +int +vatpic_set_irq_trigger(struct vm *vm, int irq, enum vm_intr_trigger trigger) +{ + struct vatpic *vatpic; + + if (irq < 0 || irq > 15) + return (EINVAL); + + /* + * See comment in vatpic_elc_handler. These IRQs must be + * edge triggered. + */ + if (trigger == LEVEL_TRIGGER) { + switch (irq) { + case 0: + case 1: + case 2: + case 8: + case 13: + return (EINVAL); + } + } + + vatpic = vm_atpic(vm); + + VATPIC_LOCK(vatpic); + + if (trigger == LEVEL_TRIGGER) + vatpic->elc[irq >> 3] |= 1 << (irq & 0x7); + else + vatpic->elc[irq >> 3] &= ~(1 << (irq & 0x7)); + + VATPIC_UNLOCK(vatpic); + + return (0); +} + +void +vatpic_pending_intr(struct vm *vm, int *vecptr) +{ + struct vatpic *vatpic; + struct atpic *atpic; + int pin; + + vatpic = vm_atpic(vm); + + atpic = &vatpic->atpic[0]; + + VATPIC_LOCK(vatpic); + + pin = vatpic_get_highest_irrpin(atpic); + if (pin == 2) { + atpic = &vatpic->atpic[1]; + pin = vatpic_get_highest_irrpin(atpic); + } + + /* + * If there are no pins active at this moment then return the spurious + * interrupt vector instead. 
+ */ + if (pin == -1) + pin = 7; + + KASSERT(pin >= 0 && pin <= 7, ("%s: invalid pin %d", __func__, pin)); + *vecptr = atpic->irq_base + pin; + + VATPIC_UNLOCK(vatpic); +} + +static void +vatpic_pin_accepted(struct atpic *atpic, int pin) +{ + atpic->intr_raised = false; + + if (atpic->acnt[pin] == 0) + atpic->request &= ~(1 << pin); + + if (atpic->aeoi == true) { + if (atpic->rotate == true) + atpic->lowprio = pin; + } else { + atpic->service |= (1 << pin); + } +} + +void +vatpic_intr_accepted(struct vm *vm, int vector) +{ + struct vatpic *vatpic; + int pin; + + vatpic = vm_atpic(vm); + + VATPIC_LOCK(vatpic); + + pin = vector & 0x7; + + if ((vector & ~0x7) == vatpic->atpic[1].irq_base) { + vatpic_pin_accepted(&vatpic->atpic[1], pin); + /* + * If this vector originated from the slave, + * accept the cascaded interrupt too. + */ + vatpic_pin_accepted(&vatpic->atpic[0], 2); + } else { + vatpic_pin_accepted(&vatpic->atpic[0], pin); + } + + vatpic_notify_intr(vatpic); + + VATPIC_UNLOCK(vatpic); +} + +static int +vatpic_read(struct vatpic *vatpic, struct atpic *atpic, bool in, int port, + int bytes, uint32_t *eax) +{ + int pin; + + VATPIC_LOCK(vatpic); + + if (atpic->poll) { + atpic->poll = 0; + pin = vatpic_get_highest_irrpin(atpic); + if (pin >= 0) { + vatpic_pin_accepted(atpic, pin); + *eax = 0x80 | pin; + } else { + *eax = 0; + } + } else { + if (port & ICU_IMR_OFFSET) { + /* read interrrupt mask register */ + *eax = atpic->mask; + } else { + if (atpic->rd_cmd_reg == OCW3_RIS) { + /* read interrupt service register */ + *eax = atpic->service; + } else { + /* read interrupt request register */ + *eax = atpic->request; + } + } + } + + VATPIC_UNLOCK(vatpic); + + return (0); + +} + +static int +vatpic_write(struct vatpic *vatpic, struct atpic *atpic, bool in, int port, + int bytes, uint32_t *eax) +{ + int error; + uint8_t val; + + error = 0; + val = *eax; + + VATPIC_LOCK(vatpic); + + if (port & ICU_IMR_OFFSET) { + switch (atpic->icw_num) { + case 2: + error = vatpic_icw2(vatpic, atpic, val); + break; + case 3: + error = vatpic_icw3(vatpic, atpic, val); + break; + case 4: + error = vatpic_icw4(vatpic, atpic, val); + break; + default: + error = vatpic_ocw1(vatpic, atpic, val); + break; + } + } else { + if (val & (1 << 4)) + error = vatpic_icw1(vatpic, atpic, val); + + if (atpic->ready) { + if (val & (1 << 3)) + error = vatpic_ocw3(vatpic, atpic, val); + else + error = vatpic_ocw2(vatpic, atpic, val); + } + } + + if (atpic->ready) + vatpic_notify_intr(vatpic); + + VATPIC_UNLOCK(vatpic); + + return (error); +} + +int +vatpic_master_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *eax) +{ + struct vatpic *vatpic; + struct atpic *atpic; + + vatpic = vm_atpic(vm); + atpic = &vatpic->atpic[0]; + + if (bytes != 1) + return (-1); + + if (in) { + return (vatpic_read(vatpic, atpic, in, port, bytes, eax)); + } + + return (vatpic_write(vatpic, atpic, in, port, bytes, eax)); +} + +int +vatpic_slave_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *eax) +{ + struct vatpic *vatpic; + struct atpic *atpic; + + vatpic = vm_atpic(vm); + atpic = &vatpic->atpic[1]; + + if (bytes != 1) + return (-1); + + if (in) { + return (vatpic_read(vatpic, atpic, in, port, bytes, eax)); + } + + return (vatpic_write(vatpic, atpic, in, port, bytes, eax)); +} + +int +vatpic_elc_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *eax) +{ + struct vatpic *vatpic; + bool is_master; + + vatpic = vm_atpic(vm); + is_master = (port == IO_ELCR1); + + if (bytes != 
1) + return (-1); + + VATPIC_LOCK(vatpic); + + if (in) { + if (is_master) + *eax = vatpic->elc[0]; + else + *eax = vatpic->elc[1]; + } else { + /* + * For the master PIC the cascade channel (IRQ2), the + * heart beat timer (IRQ0), and the keyboard + * controller (IRQ1) cannot be programmed for level + * mode. + * + * For the slave PIC the real time clock (IRQ8) and + * the floating point error interrupt (IRQ13) cannot + * be programmed for level mode. + */ + if (is_master) + vatpic->elc[0] = (*eax & 0xf8); + else + vatpic->elc[1] = (*eax & 0xde); + } + + VATPIC_UNLOCK(vatpic); + + return (0); +} + +struct vatpic * +vatpic_init(struct vm *vm) +{ + struct vatpic *vatpic; + + vatpic = malloc(sizeof(struct vatpic), M_VATPIC, M_WAITOK | M_ZERO); + vatpic->vm = vm; + + mtx_init(&vatpic->mtx, "vatpic lock", NULL, MTX_SPIN); + + return (vatpic); +} + +void +vatpic_cleanup(struct vatpic *vatpic) +{ + free(vatpic, M_VATPIC); +} diff --git a/usr/src/uts/i86pc/io/vmm/io/vatpic.h b/usr/src/uts/i86pc/io/vmm/io/vatpic.h new file mode 100644 index 0000000000..ef5e51b158 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vatpic.h @@ -0,0 +1,57 @@ +/*- + * Copyright (c) 2014 Tycho Nightingale + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD: head/sys/amd64/vmm/io/vatpic.h 273706 2014-10-26 19:03:06Z neel $ + */ + +#ifndef _VATPIC_H_ +#define _VATPIC_H_ + +#include + +#define ICU_IMR_OFFSET 1 + +#define IO_ELCR1 0x4d0 +#define IO_ELCR2 0x4d1 + +struct vatpic *vatpic_init(struct vm *vm); +void vatpic_cleanup(struct vatpic *vatpic); + +int vatpic_master_handler(struct vm *vm, int vcpuid, bool in, int port, + int bytes, uint32_t *eax); +int vatpic_slave_handler(struct vm *vm, int vcpuid, bool in, int port, + int bytes, uint32_t *eax); +int vatpic_elc_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *eax); + +int vatpic_assert_irq(struct vm *vm, int irq); +int vatpic_deassert_irq(struct vm *vm, int irq); +int vatpic_pulse_irq(struct vm *vm, int irq); +int vatpic_set_irq_trigger(struct vm *vm, int irq, enum vm_intr_trigger trigger); + +void vatpic_pending_intr(struct vm *vm, int *vecptr); +void vatpic_intr_accepted(struct vm *vm, int vector); + +#endif /* _VATPIC_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/io/vatpit.c b/usr/src/uts/i86pc/io/vmm/io/vatpit.c new file mode 100644 index 0000000000..ce17bdc92c --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vatpit.c @@ -0,0 +1,458 @@ +/*- + * Copyright (c) 2014 Tycho Nightingale + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vatpit.c 273706 2014-10-26 19:03:06Z neel $"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "vmm_ktr.h" +#include "vatpic.h" +#include "vioapic.h" +#include "vatpit.h" + +static MALLOC_DEFINE(M_VATPIT, "atpit", "bhyve virtual atpit (8254)"); + +#define VATPIT_LOCK(vatpit) mtx_lock_spin(&((vatpit)->mtx)) +#define VATPIT_UNLOCK(vatpit) mtx_unlock_spin(&((vatpit)->mtx)) +#define VATPIT_LOCKED(vatpit) mtx_owned(&((vatpit)->mtx)) + +#define TIMER_SEL_MASK 0xc0 +#define TIMER_RW_MASK 0x30 +#define TIMER_MODE_MASK 0x0f +#define TIMER_SEL_READBACK 0xc0 + +#define TIMER_STS_OUT 0x80 +#define TIMER_STS_NULLCNT 0x40 + +#define TIMER_RB_LCTR 0x20 +#define TIMER_RB_LSTATUS 0x10 +#define TIMER_RB_CTR_2 0x08 +#define TIMER_RB_CTR_1 0x04 +#define TIMER_RB_CTR_0 0x02 + +#define TMR2_OUT_STS 0x20 + +#define PIT_8254_FREQ 1193182 +#define TIMER_DIV(freq, hz) (((freq) + (hz) / 2) / (hz)) + +struct vatpit_callout_arg { + struct vatpit *vatpit; + int channel_num; +}; + + +struct channel { + int mode; + uint16_t initial; /* initial counter value */ + sbintime_t now_sbt; /* uptime when counter was loaded */ + uint8_t cr[2]; + uint8_t ol[2]; + bool slatched; /* status latched */ + uint8_t status; + int crbyte; + int olbyte; + int frbyte; + struct callout callout; + sbintime_t callout_sbt; /* target time */ + struct vatpit_callout_arg callout_arg; +}; + +struct vatpit { + struct vm *vm; + struct mtx mtx; + + sbintime_t freq_sbt; + + struct channel channel[3]; +}; + +static void pit_timer_start_cntr0(struct vatpit *vatpit); + +static int +vatpit_get_out(struct vatpit *vatpit, int channel) +{ + struct channel *c; + sbintime_t delta_ticks; + int out; + + c = &vatpit->channel[channel]; + + switch (c->mode) { + case TIMER_INTTC: + delta_ticks = (sbinuptime() - c->now_sbt) / vatpit->freq_sbt; + out = ((c->initial - delta_ticks) <= 0); + break; + default: + out = 0; + break; + } + + return (out); +} + +static void +vatpit_callout_handler(void *a) +{ + struct vatpit_callout_arg *arg = a; + struct vatpit *vatpit; + struct callout *callout; + struct channel *c; + + vatpit = arg->vatpit; + c = &vatpit->channel[arg->channel_num]; + callout = &c->callout; + + VM_CTR1(vatpit->vm, "atpit t%d fired", arg->channel_num); + + VATPIT_LOCK(vatpit); + + if (callout_pending(callout)) /* callout was reset */ + goto done; + + if (!callout_active(callout)) /* callout was stopped */ + goto done; + + callout_deactivate(callout); + + if (c->mode == TIMER_RATEGEN) { + pit_timer_start_cntr0(vatpit); + } + + vatpic_pulse_irq(vatpit->vm, 0); + vioapic_pulse_irq(vatpit->vm, 2); + +done: + VATPIT_UNLOCK(vatpit); + return; +} + +static void +pit_timer_start_cntr0(struct vatpit *vatpit) +{ + struct channel *c; + sbintime_t now, delta, precision; + + c = &vatpit->channel[0]; + if (c->initial != 0) { + delta = c->initial * vatpit->freq_sbt; + precision = delta >> tc_precexp; + c->callout_sbt = c->callout_sbt + delta; + + /* + * Reset 'callout_sbt' if the time that the callout + * was supposed to fire is more than 'c->initial' + * ticks in the past. 
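+		 *
+		 * For scale: a guest programming the classic 100hz clock
+		 * loads an initial count of TIMER_DIV(PIT_8254_FREQ, 100) =
+		 * 11932, so 'delta' above corresponds to roughly 10ms of
+		 * uptime per timer period.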
+ */ + now = sbinuptime(); + if (c->callout_sbt < now) + c->callout_sbt = now + delta; + + callout_reset_sbt(&c->callout, c->callout_sbt, + precision, vatpit_callout_handler, &c->callout_arg, + C_ABSOLUTE); + } +} + +static uint16_t +pit_update_counter(struct vatpit *vatpit, struct channel *c, bool latch) +{ + uint16_t lval; + sbintime_t delta_ticks; + + /* cannot latch a new value until the old one has been consumed */ + if (latch && c->olbyte != 0) + return (0); + + if (c->initial == 0) { + /* + * This is possibly an o/s bug - reading the value of + * the timer without having set up the initial value. + * + * The original user-space version of this code set + * the timer to 100hz in this condition; do the same + * here. + */ + c->initial = TIMER_DIV(PIT_8254_FREQ, 100); + c->now_sbt = sbinuptime(); + c->status &= ~TIMER_STS_NULLCNT; + } + + delta_ticks = (sbinuptime() - c->now_sbt) / vatpit->freq_sbt; + + lval = c->initial - delta_ticks % c->initial; + + if (latch) { + c->olbyte = 2; + c->ol[1] = lval; /* LSB */ + c->ol[0] = lval >> 8; /* MSB */ + } + + return (lval); +} + +static int +pit_readback1(struct vatpit *vatpit, int channel, uint8_t cmd) +{ + struct channel *c; + + c = &vatpit->channel[channel]; + + /* + * Latch the count/status of the timer if not already latched. + * N.B. that the count/status latch-select bits are active-low. + */ + if (!(cmd & TIMER_RB_LCTR) && !c->olbyte) { + (void) pit_update_counter(vatpit, c, true); + } + + if (!(cmd & TIMER_RB_LSTATUS) && !c->slatched) { + c->slatched = true; + /* + * For mode 0, see if the elapsed time is greater + * than the initial value - this results in the + * output pin being set to 1 in the status byte. + */ + if (c->mode == TIMER_INTTC && vatpit_get_out(vatpit, channel)) + c->status |= TIMER_STS_OUT; + else + c->status &= ~TIMER_STS_OUT; + } + + return (0); +} + +static int +pit_readback(struct vatpit *vatpit, uint8_t cmd) +{ + int error; + + /* + * The readback command can apply to all timers. + */ + error = 0; + if (cmd & TIMER_RB_CTR_0) + error = pit_readback1(vatpit, 0, cmd); + if (!error && cmd & TIMER_RB_CTR_1) + error = pit_readback1(vatpit, 1, cmd); + if (!error && cmd & TIMER_RB_CTR_2) + error = pit_readback1(vatpit, 2, cmd); + + return (error); +} + + +static int +vatpit_update_mode(struct vatpit *vatpit, uint8_t val) +{ + struct channel *c; + int sel, rw, mode; + + sel = val & TIMER_SEL_MASK; + rw = val & TIMER_RW_MASK; + mode = val & TIMER_MODE_MASK; + + if (sel == TIMER_SEL_READBACK) + return (pit_readback(vatpit, val)); + + if (rw != TIMER_LATCH && rw != TIMER_16BIT) + return (-1); + + if (rw != TIMER_LATCH) { + /* + * Counter mode is not affected when issuing a + * latch command. 
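+		 *
+		 * Example decode (illustrative, using the 8254 encoding): a
+		 * control word of 0x34 selects counter 0, 16-bit LSB/MSB
+		 * access (TIMER_16BIT) and rate generator mode
+		 * (TIMER_RATEGEN), which is how most guests program the
+		 * system tick.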
+ */ + if (mode != TIMER_INTTC && + mode != TIMER_RATEGEN && + mode != TIMER_SQWAVE && + mode != TIMER_SWSTROBE) + return (-1); + } + + c = &vatpit->channel[sel >> 6]; + if (rw == TIMER_LATCH) + pit_update_counter(vatpit, c, true); + else { + c->mode = mode; + c->olbyte = 0; /* reset latch after reprogramming */ + c->status |= TIMER_STS_NULLCNT; + } + + return (0); +} + +int +vatpit_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *eax) +{ + struct vatpit *vatpit; + struct channel *c; + uint8_t val; + int error; + + vatpit = vm_atpit(vm); + + if (bytes != 1) + return (-1); + + val = *eax; + + if (port == TIMER_MODE) { + if (in) { + VM_CTR0(vatpit->vm, "vatpit attempt to read mode"); + return (-1); + } + + VATPIT_LOCK(vatpit); + error = vatpit_update_mode(vatpit, val); + VATPIT_UNLOCK(vatpit); + + return (error); + } + + /* counter ports */ + KASSERT(port >= TIMER_CNTR0 && port <= TIMER_CNTR2, + ("invalid port 0x%x", port)); + c = &vatpit->channel[port - TIMER_CNTR0]; + + VATPIT_LOCK(vatpit); + if (in && c->slatched) { + /* + * Return the status byte if latched + */ + *eax = c->status; + c->slatched = false; + c->status = 0; + } else if (in) { + /* + * The spec says that once the output latch is completely + * read it should revert to "following" the counter. Use + * the free running counter for this case (i.e. Linux + * TSC calibration). Assuming the access mode is 16-bit, + * toggle the MSB/LSB bit on each read. + */ + if (c->olbyte == 0) { + uint16_t tmp; + + tmp = pit_update_counter(vatpit, c, false); + if (c->frbyte) + tmp >>= 8; + tmp &= 0xff; + *eax = tmp; + c->frbyte ^= 1; + } else + *eax = c->ol[--c->olbyte]; + } else { + c->cr[c->crbyte++] = *eax; + if (c->crbyte == 2) { + c->status &= ~TIMER_STS_NULLCNT; + c->frbyte = 0; + c->crbyte = 0; + c->initial = c->cr[0] | (uint16_t)c->cr[1] << 8; + c->now_sbt = sbinuptime(); + /* Start an interval timer for channel 0 */ + if (port == TIMER_CNTR0) { + c->callout_sbt = c->now_sbt; + pit_timer_start_cntr0(vatpit); + } + if (c->initial == 0) + c->initial = 0xffff; + } + } + VATPIT_UNLOCK(vatpit); + + return (0); +} + +int +vatpit_nmisc_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *eax) +{ + struct vatpit *vatpit; + + vatpit = vm_atpit(vm); + + if (in) { + VATPIT_LOCK(vatpit); + if (vatpit_get_out(vatpit, 2)) + *eax = TMR2_OUT_STS; + else + *eax = 0; + + VATPIT_UNLOCK(vatpit); + } + + return (0); +} + +struct vatpit * +vatpit_init(struct vm *vm) +{ + struct vatpit *vatpit; + struct bintime bt; + struct vatpit_callout_arg *arg; + int i; + + vatpit = malloc(sizeof(struct vatpit), M_VATPIT, M_WAITOK | M_ZERO); + vatpit->vm = vm; + + mtx_init(&vatpit->mtx, "vatpit lock", NULL, MTX_SPIN); + + FREQ2BT(PIT_8254_FREQ, &bt); + vatpit->freq_sbt = bttosbt(bt); + + for (i = 0; i < 3; i++) { + callout_init(&vatpit->channel[i].callout, true); + arg = &vatpit->channel[i].callout_arg; + arg->vatpit = vatpit; + arg->channel_num = i; + } + + return (vatpit); +} + +void +vatpit_cleanup(struct vatpit *vatpit) +{ + int i; + + for (i = 0; i < 3; i++) + callout_drain(&vatpit->channel[i].callout); + + free(vatpit, M_VATPIT); +} diff --git a/usr/src/uts/i86pc/io/vmm/io/vatpit.h b/usr/src/uts/i86pc/io/vmm/io/vatpit.h new file mode 100644 index 0000000000..f20ad73e47 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vatpit.h @@ -0,0 +1,45 @@ +/*- + * Copyright (c) 2014 Tycho Nightingale + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/io/vatpit.h 273706 2014-10-26 19:03:06Z neel $ + */ + +#ifndef _VATPIT_H_ +#define _VATPIT_H_ + +#include + +#define NMISC_PORT 0x61 + +struct vatpit *vatpit_init(struct vm *vm); +void vatpit_cleanup(struct vatpit *vatpit); + +int vatpit_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *eax); +int vatpit_nmisc_handler(struct vm *vm, int vcpuid, bool in, int port, + int bytes, uint32_t *eax); + +#endif /* _VATPIT_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/io/vdev.c b/usr/src/uts/i86pc/io/vmm/io/vdev.c new file mode 100644 index 0000000000..0f835625f3 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vdev.c @@ -0,0 +1,282 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/io/vdev.c 245678 2013-01-20 03:42:49Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. 
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2013 Pluribus Networks Inc. + */ + +#include +__FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vdev.c 245678 2013-01-20 03:42:49Z neel $"); + +#include +#include +#include +#include + +#include "vdev.h" + +struct vdev { + SLIST_ENTRY(vdev) entry; + struct vdev_ops *ops; + void *dev; +}; +static SLIST_HEAD(, vdev) vdev_head; +static int vdev_count; + +struct vdev_region { + SLIST_ENTRY(vdev_region) entry; + struct vdev_ops *ops; + void *dev; + struct io_region *io; +}; +static SLIST_HEAD(, vdev_region) region_head; +static int region_count; + +static MALLOC_DEFINE(M_VDEV, "vdev", "vdev"); + +#define VDEV_INIT (0) +#define VDEV_RESET (1) +#define VDEV_HALT (2) + +// static const char* vdev_event_str[] = {"VDEV_INIT", "VDEV_RESET", "VDEV_HALT"}; + +static int +vdev_system_event(int event) +{ + struct vdev *vd; + int rc; + + // TODO: locking + SLIST_FOREACH(vd, &vdev_head, entry) { + // printf("%s : %s Device %s\n", __func__, vdev_event_str[event], vd->ops->name); + switch (event) { + case VDEV_INIT: + rc = vd->ops->init(vd->dev); + break; + case VDEV_RESET: + rc = vd->ops->reset(vd->dev); + break; + case VDEV_HALT: + rc = vd->ops->halt(vd->dev); + break; + default: + break; + } + if (rc) { + printf("vdev %s init failed rc=%d\n", + vd->ops->name, rc); + return rc; + } + } + return 0; +} + +int +vdev_init(void) +{ + return vdev_system_event(VDEV_INIT); +} + +int +vdev_reset(void) +{ + return vdev_system_event(VDEV_RESET); +} + +int +vdev_halt(void) +{ + return vdev_system_event(VDEV_HALT); +} + +void +vdev_vm_init(void) +{ + SLIST_INIT(&vdev_head); + vdev_count = 0; + + SLIST_INIT(®ion_head); + region_count = 0; +} +void +vdev_vm_cleanup(void) +{ + struct vdev *vd; + + // TODO: locking + while (!SLIST_EMPTY(&vdev_head)) { + vd = SLIST_FIRST(&vdev_head); + SLIST_REMOVE_HEAD(&vdev_head, entry); + free(vd, M_VDEV); + vdev_count--; + } +} + +int +vdev_register(struct vdev_ops *ops, void *dev) +{ + struct vdev *vd; + vd = malloc(sizeof(*vd), M_VDEV, M_WAITOK | M_ZERO); + vd->ops = ops; + vd->dev = dev; + + // TODO: locking + SLIST_INSERT_HEAD(&vdev_head, vd, entry); + vdev_count++; + return 0; +} + +void +vdev_unregister(void *dev) +{ + struct vdev *vd, *found; + + found = NULL; + // TODO: locking + SLIST_FOREACH(vd, &vdev_head, entry) { + if (vd->dev == dev) { + found = vd; + } + } + + if (found) { + SLIST_REMOVE(&vdev_head, found, vdev, entry); + free(found, M_VDEV); + } +} + +#define IN_RANGE(val, start, end) \ + (((val) >= (start)) && ((val) < (end))) + +static struct vdev_region* +vdev_find_region(struct io_region *io, void *dev) +{ + struct vdev_region *region, *found; + uint64_t region_base; + uint64_t region_end; + + found = NULL; + + // TODO: locking + // FIXME: we should verify we are in the context the current + // vcpu here as well. 
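+	// Match semantics (illustrative): a registered region with base
+	// 0x1000 and len 0x100 covers [0x1000, 0x1100); a lookup for
+	// io->base 0x10f0 with io->len 0x10 lies entirely inside it and is
+	// returned (provided the same 'dev' was supplied), while one ending
+	// at 0x1101 would not match.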
+ SLIST_FOREACH(region, ®ion_head, entry) { + region_base = region->io->base; + region_end = region_base + region->io->len; + if (IN_RANGE(io->base, region_base, region_end) && + IN_RANGE(io->base+io->len, region_base, region_end+1) && + (dev && dev == region->dev)) { + found = region; + break; + } + } + return found; +} + +int +vdev_register_region(struct vdev_ops *ops, void *dev, struct io_region *io) +{ + struct vdev_region *region; + + region = vdev_find_region(io, dev); + if (region) { + return -EEXIST; + } + + region = malloc(sizeof(*region), M_VDEV, M_WAITOK | M_ZERO); + region->io = io; + region->ops = ops; + region->dev = dev; + + // TODO: locking + SLIST_INSERT_HEAD(®ion_head, region, entry); + region_count++; + + return 0; +} + +void +vdev_unregister_region(void *dev, struct io_region *io) +{ + struct vdev_region *region; + + region = vdev_find_region(io, dev); + + if (region) { + SLIST_REMOVE(®ion_head, region, vdev_region, entry); + free(region, M_VDEV); + region_count--; + } +} + +static int +vdev_memrw(uint64_t gpa, opsize_t size, uint64_t *data, int read) +{ + struct vdev_region *region; + struct io_region io; + region_attr_t attr; + int rc; + + io.base = gpa; + io.len = size; + + region = vdev_find_region(&io, NULL); + if (!region) + return -EINVAL; + + attr = (read) ? MMIO_READ : MMIO_WRITE; + if (!(region->io->attr & attr)) + return -EPERM; + + if (read) + rc = region->ops->memread(region->dev, gpa, size, data); + else + rc = region->ops->memwrite(region->dev, gpa, size, *data); + + return rc; +} + +int +vdev_memread(uint64_t gpa, opsize_t size, uint64_t *data) +{ + return vdev_memrw(gpa, size, data, 1); +} + +int +vdev_memwrite(uint64_t gpa, opsize_t size, uint64_t data) +{ + return vdev_memrw(gpa, size, &data, 0); +} diff --git a/usr/src/uts/i86pc/io/vmm/io/vdev.h b/usr/src/uts/i86pc/io/vmm/io/vdev.h new file mode 100644 index 0000000000..dd2df75ad8 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vdev.h @@ -0,0 +1,96 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/io/vdev.h 245678 2013-01-20 03:42:49Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. 
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _VDEV_H_ +#define _VDEV_H_ + +typedef enum { + BYTE = 1, + WORD = 2, + DWORD = 4, + QWORD = 8, +} opsize_t; + +typedef enum { + MMIO_READ = 1, + MMIO_WRITE = 2, +} region_attr_t; + +struct io_region { + uint64_t base; + uint64_t len; + region_attr_t attr; + int vcpu; +}; + +typedef int (*vdev_init_t)(void* dev); +typedef int (*vdev_reset_t)(void* dev); +typedef int (*vdev_halt_t)(void* dev); +typedef int (*vdev_memread_t)(void* dev, uint64_t gpa, opsize_t size, uint64_t *data); +typedef int (*vdev_memwrite_t)(void* dev, uint64_t gpa, opsize_t size, uint64_t data); + + +struct vdev_ops { + const char *name; + vdev_init_t init; + vdev_reset_t reset; + vdev_halt_t halt; + vdev_memread_t memread; + vdev_memwrite_t memwrite; +}; + + +void vdev_vm_init(void); +void vdev_vm_cleanup(void); + +int vdev_register(struct vdev_ops *ops, void *dev); +void vdev_unregister(void *dev); + +int vdev_register_region(struct vdev_ops *ops, void *dev, struct io_region *io); +void vdev_unregister_region(void *dev, struct io_region *io); + +int vdev_init(void); +int vdev_reset(void); +int vdev_halt(void); +int vdev_memread(uint64_t gpa, opsize_t size, uint64_t *data); +int vdev_memwrite(uint64_t gpa, opsize_t size, uint64_t data); + +#endif /* _VDEV_H_ */ + diff --git a/usr/src/uts/i86pc/io/vmm/io/vhpet.c b/usr/src/uts/i86pc/io/vmm/io/vhpet.c new file mode 100644 index 0000000000..25f6013da0 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vhpet.c @@ -0,0 +1,821 @@ +/*- + * Copyright (c) 2013 Tycho Nightingale + * Copyright (c) 2013 Neel Natu + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
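+ *
+ * (A minimal sketch of the vdev_ops contract declared in vdev.h above,
+ * using a hypothetical no-op device; the names are illustrative only:
+ *
+ *	static int vnull_init(void *dev) { return 0; }
+ *	static int vnull_reset(void *dev) { return 0; }
+ *	static int vnull_halt(void *dev) { return 0; }
+ *	static int vnull_memread(void *dev, uint64_t gpa, opsize_t size,
+ *	    uint64_t *data) { *data = 0; return 0; }
+ *	static int vnull_memwrite(void *dev, uint64_t gpa, opsize_t size,
+ *	    uint64_t data) { return 0; }
+ *
+ *	static struct vdev_ops vnull_ops = {
+ *		.name = "vnull",
+ *		.init = vnull_init,
+ *		.reset = vnull_reset,
+ *		.halt = vnull_halt,
+ *		.memread = vnull_memread,
+ *		.memwrite = vnull_memwrite,
+ *	};
+ *
+ * after vdev_register(&vnull_ops, softc), the vdev_init()/vdev_reset()/
+ * vdev_halt() events fan out to the registered callbacks.)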
+ * + * $FreeBSD: head/sys/amd64/vmm/io/vhpet.c 263035 2014-03-11 16:56:00Z tychon $ + */ + +#include +__FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vhpet.c 263035 2014-03-11 16:56:00Z tychon $"); + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include "vmm_lapic.h" +#include "vatpic.h" +#include "vioapic.h" +#include "vhpet.h" + +#include "vmm_ktr.h" + +static MALLOC_DEFINE(M_VHPET, "vhpet", "bhyve virtual hpet"); + +#define HPET_FREQ 10000000 /* 10.0 Mhz */ +#define FS_PER_S 1000000000000000ul + +/* Timer N Configuration and Capabilities Register */ +#define HPET_TCAP_RO_MASK (HPET_TCAP_INT_ROUTE | \ + HPET_TCAP_FSB_INT_DEL | \ + HPET_TCAP_SIZE | \ + HPET_TCAP_PER_INT) +/* + * HPET requires at least 3 timers and up to 32 timers per block. + */ +#define VHPET_NUM_TIMERS 8 +CTASSERT(VHPET_NUM_TIMERS >= 3 && VHPET_NUM_TIMERS <= 32); + +struct vhpet_callout_arg { + struct vhpet *vhpet; + int timer_num; +}; + +struct vhpet { + struct vm *vm; + struct mtx mtx; + sbintime_t freq_sbt; + + uint64_t config; /* Configuration */ + uint64_t isr; /* Interrupt Status */ + uint32_t countbase; /* HPET counter base value */ + sbintime_t countbase_sbt; /* uptime corresponding to base value */ + + struct { + uint64_t cap_config; /* Configuration */ + uint64_t msireg; /* FSB interrupt routing */ + uint32_t compval; /* Comparator */ + uint32_t comprate; + struct callout callout; + sbintime_t callout_sbt; /* time when counter==compval */ + struct vhpet_callout_arg arg; + } timer[VHPET_NUM_TIMERS]; +}; + +#define VHPET_LOCK(vhp) mtx_lock(&((vhp)->mtx)) +#define VHPET_UNLOCK(vhp) mtx_unlock(&((vhp)->mtx)) + +static void vhpet_start_timer(struct vhpet *vhpet, int n, uint32_t counter, + sbintime_t now); + +static uint64_t +vhpet_capabilities(void) +{ + uint64_t cap = 0; + + cap |= 0x8086 << 16; /* vendor id */ + cap |= HPET_CAP_LEG_RT; /* legacy routing capable */ + cap |= (VHPET_NUM_TIMERS - 1) << 8; /* number of timers */ + cap |= 1; /* revision */ + cap &= ~HPET_CAP_COUNT_SIZE; /* 32-bit timer */ + + cap &= 0xffffffff; + cap |= (FS_PER_S / HPET_FREQ) << 32; /* tick period in fs */ + + return (cap); +} + +static __inline bool +vhpet_counter_enabled(struct vhpet *vhpet) +{ + + return ((vhpet->config & HPET_CNF_ENABLE) ? true : false); +} + +static __inline bool +vhpet_timer_msi_enabled(struct vhpet *vhpet, int n) +{ + const uint64_t msi_enable = HPET_TCAP_FSB_INT_DEL | HPET_TCNF_FSB_EN; + + /* + * LegacyReplacement Route configuration takes precedence over MSI + * for timers 0 and 1. + */ + if (n == 0 || n == 1) { + if (vhpet->config & HPET_CNF_LEG_RT) + return (false); + } + + if ((vhpet->timer[n].cap_config & msi_enable) == msi_enable) + return (true); + else + return (false); +} + +static __inline int +vhpet_timer_ioapic_pin(struct vhpet *vhpet, int n) +{ + /* + * If the timer is configured to use MSI then treat it as if the + * timer is not connected to the ioapic. + */ + if (vhpet_timer_msi_enabled(vhpet, n)) + return (0); + + if (vhpet->config & HPET_CNF_LEG_RT) { + /* + * In "legacy routing" timers 0 and 1 are connected to + * ioapic pins 2 and 8 respectively. + */ + switch (n) { + case 0: + return (2); + case 1: + return (8); + } + } + + return ((vhpet->timer[n].cap_config & HPET_TCNF_INT_ROUTE) >> 9); +} + +static __inline int +vhpet_timer_atpic_pin(struct vhpet *vhpet, int n) +{ + if (vhpet->config & HPET_CNF_LEG_RT) { + /* + * In "legacy routing" timers 0 and 1 are connected to + * 8259 master pin 0 and slave pin 0 respectively. 
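+		 *
+		 * (Arithmetic note for vhpet_capabilities() above: with
+		 * HPET_FREQ at 10 MHz the upper word is
+		 * FS_PER_S / HPET_FREQ = 10^15 / 10^7 = 100000000 fs,
+		 * i.e. a 100 ns tick, while the lower word packs vendor
+		 * 0x8086, (VHPET_NUM_TIMERS - 1) << 8 for eight timers,
+		 * and revision 1.)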
+		 */
+		switch (n) {
+		case 0:
+			return (0);
+		case 1:
+			return (8);
+		}
+	}
+
+	return (-1);
+}
+
+static uint32_t
+vhpet_counter(struct vhpet *vhpet, sbintime_t *nowptr)
+{
+	uint32_t val;
+	sbintime_t now, delta;
+
+	val = vhpet->countbase;
+	if (vhpet_counter_enabled(vhpet)) {
+		now = sbinuptime();
+		delta = now - vhpet->countbase_sbt;
+#ifdef __FreeBSD__
+		KASSERT(delta >= 0, ("vhpet_counter: uptime went backwards: "
+		    "%#lx to %#lx", vhpet->countbase_sbt, now));
+#else
+		KASSERT(delta >= 0, ("vhpet_counter: uptime went backwards: "
+		    "%lx to %lx", vhpet->countbase_sbt, now));
+#endif
+		val += delta / vhpet->freq_sbt;
+		if (nowptr != NULL)
+			*nowptr = now;
+	} else {
+		/*
+		 * The sbinuptime corresponding to the 'countbase' is
+		 * meaningless when the counter is disabled. Make sure
+		 * that the caller doesn't want to use it.
+		 */
+		KASSERT(nowptr == NULL, ("vhpet_counter: nowptr must be NULL"));
+	}
+	return (val);
+}
+
+static void
+vhpet_timer_clear_isr(struct vhpet *vhpet, int n)
+{
+	int pin, legacy_pin;
+
+	if (vhpet->isr & (1 << n)) {
+		pin = vhpet_timer_ioapic_pin(vhpet, n);
+		KASSERT(pin != 0, ("vhpet timer %d irq incorrectly routed", n));
+		vioapic_deassert_irq(vhpet->vm, pin);
+
+		legacy_pin = vhpet_timer_atpic_pin(vhpet, n);
+		if (legacy_pin != -1)
+			vatpic_deassert_irq(vhpet->vm, legacy_pin);
+
+		vhpet->isr &= ~(1 << n);
+	}
+}
+
+static __inline bool
+vhpet_periodic_timer(struct vhpet *vhpet, int n)
+{
+
+	return ((vhpet->timer[n].cap_config & HPET_TCNF_TYPE) != 0);
+}
+
+static __inline bool
+vhpet_timer_interrupt_enabled(struct vhpet *vhpet, int n)
+{
+
+	return ((vhpet->timer[n].cap_config & HPET_TCNF_INT_ENB) != 0);
+}
+
+static __inline bool
+vhpet_timer_edge_trig(struct vhpet *vhpet, int n)
+{
+
+	KASSERT(!vhpet_timer_msi_enabled(vhpet, n), ("vhpet_timer_edge_trig: "
+	    "timer %d is using MSI", n));
+
+	/* The legacy replacement interrupts are always edge triggered */
+	if (vhpet->config & HPET_CNF_LEG_RT) {
+		if (n == 0 || n == 1)
+			return (true);
+	}
+
+	if ((vhpet->timer[n].cap_config & HPET_TCNF_INT_TYPE) == 0)
+		return (true);
+	else
+		return (false);
+}
+
+static void
+vhpet_timer_interrupt(struct vhpet *vhpet, int n)
+{
+	int pin, legacy_pin;
+
+	/* If interrupts are not enabled for this timer then just return. */
+	if (!vhpet_timer_interrupt_enabled(vhpet, n))
+		return;
+
+	/*
+	 * If a level triggered interrupt is already asserted then just return.
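+	 *
+	 * (For illustration of vhpet_counter() above, not part of the
+	 * change: with freq_sbt representing one 10 MHz tick, the
+	 * reconstruction is
+	 *
+	 *	val += (sbinuptime() - countbase_sbt) / freq_sbt;
+	 *
+	 * so a guest reading the counter twice, 1 ms apart, sees it
+	 * advance by roughly 10000 ticks.)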
+ */ + if ((vhpet->isr & (1 << n)) != 0) { + VM_CTR1(vhpet->vm, "hpet t%d intr is already asserted", n); + return; + } + + if (vhpet_timer_msi_enabled(vhpet, n)) { + lapic_intr_msi(vhpet->vm, vhpet->timer[n].msireg >> 32, + vhpet->timer[n].msireg & 0xffffffff); + return; + } + + pin = vhpet_timer_ioapic_pin(vhpet, n); + if (pin == 0) { + VM_CTR1(vhpet->vm, "hpet t%d intr is not routed to ioapic", n); + return; + } + + legacy_pin = vhpet_timer_atpic_pin(vhpet, n); + + if (vhpet_timer_edge_trig(vhpet, n)) { + vioapic_pulse_irq(vhpet->vm, pin); + if (legacy_pin != -1) + vatpic_pulse_irq(vhpet->vm, legacy_pin); + } else { + vhpet->isr |= 1 << n; + vioapic_assert_irq(vhpet->vm, pin); + if (legacy_pin != -1) + vatpic_assert_irq(vhpet->vm, legacy_pin); + } +} + +static void +vhpet_adjust_compval(struct vhpet *vhpet, int n, uint32_t counter) +{ + uint32_t compval, comprate, compnext; + + KASSERT(vhpet->timer[n].comprate != 0, ("hpet t%d is not periodic", n)); + + compval = vhpet->timer[n].compval; + comprate = vhpet->timer[n].comprate; + + /* + * Calculate the comparator value to be used for the next periodic + * interrupt. + * + * This function is commonly called from the callout handler. + * In this scenario the 'counter' is ahead of 'compval'. To find + * the next value to program into the accumulator we divide the + * number space between 'compval' and 'counter' into 'comprate' + * sized units. The 'compval' is rounded up such that is "ahead" + * of 'counter'. + */ + compnext = compval + ((counter - compval) / comprate + 1) * comprate; + + vhpet->timer[n].compval = compnext; +} + +static void +vhpet_handler(void *a) +{ + int n; + uint32_t counter; + sbintime_t now; + struct vhpet *vhpet; + struct callout *callout; + struct vhpet_callout_arg *arg; + + arg = a; + vhpet = arg->vhpet; + n = arg->timer_num; + callout = &vhpet->timer[n].callout; + + VM_CTR1(vhpet->vm, "hpet t%d fired", n); + + VHPET_LOCK(vhpet); + + if (callout_pending(callout)) /* callout was reset */ + goto done; + + if (!callout_active(callout)) /* callout was stopped */ + goto done; + + callout_deactivate(callout); + + if (!vhpet_counter_enabled(vhpet)) + panic("vhpet(%p) callout with counter disabled", vhpet); + + counter = vhpet_counter(vhpet, &now); + vhpet_start_timer(vhpet, n, counter, now); + vhpet_timer_interrupt(vhpet, n); +done: + VHPET_UNLOCK(vhpet); + return; +} + +static void +vhpet_stop_timer(struct vhpet *vhpet, int n, sbintime_t now) +{ + + VM_CTR1(vhpet->vm, "hpet t%d stopped", n); + callout_stop(&vhpet->timer[n].callout); + + /* + * If the callout was scheduled to expire in the past but hasn't + * had a chance to execute yet then trigger the timer interrupt + * here. Failing to do so will result in a missed timer interrupt + * in the guest. This is especially bad in one-shot mode because + * the next interrupt has to wait for the counter to wrap around. + */ + if (vhpet->timer[n].callout_sbt < now) { + VM_CTR1(vhpet->vm, "hpet t%d interrupt triggered after " + "stopping timer", n); + vhpet_timer_interrupt(vhpet, n); + } +} + +static void +vhpet_start_timer(struct vhpet *vhpet, int n, uint32_t counter, sbintime_t now) +{ + sbintime_t delta, precision; + + /* If interrupts are not enabled for this timer then just return. */ + if (!vhpet_timer_interrupt_enabled(vhpet, n)) + return; + + if (vhpet->timer[n].comprate != 0) + vhpet_adjust_compval(vhpet, n, counter); + else { + /* + * In one-shot mode it is the guest's responsibility to make + * sure that the comparator value is not in the "past". 
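+		 *
+		 * (Worked example for vhpet_adjust_compval() above: with
+		 * compval = 50, comprate = 100 and counter = 260,
+		 * compnext is
+		 * 50 + ((260 - 50) / 100 + 1) * 100 = 350, the first
+		 * rate multiple strictly ahead of the current count.)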
The + * hardware doesn't have any belt-and-suspenders to deal with + * this so we don't either. + */ + } + + delta = (vhpet->timer[n].compval - counter) * vhpet->freq_sbt; + precision = delta >> tc_precexp; + vhpet->timer[n].callout_sbt = now + delta; + callout_reset_sbt(&vhpet->timer[n].callout, vhpet->timer[n].callout_sbt, + precision, vhpet_handler, &vhpet->timer[n].arg, C_ABSOLUTE); +} + +static void +vhpet_start_counting(struct vhpet *vhpet) +{ + int i; + + vhpet->countbase_sbt = sbinuptime(); + for (i = 0; i < VHPET_NUM_TIMERS; i++) { + /* + * Restart the timers based on the value of the main counter + * when it stopped counting. + */ + vhpet_start_timer(vhpet, i, vhpet->countbase, + vhpet->countbase_sbt); + } +} + +static void +vhpet_stop_counting(struct vhpet *vhpet, uint32_t counter, sbintime_t now) +{ + int i; + + vhpet->countbase = counter; + for (i = 0; i < VHPET_NUM_TIMERS; i++) + vhpet_stop_timer(vhpet, i, now); +} + +static __inline void +update_register(uint64_t *regptr, uint64_t data, uint64_t mask) +{ + + *regptr &= ~mask; + *regptr |= (data & mask); +} + +static void +vhpet_timer_update_config(struct vhpet *vhpet, int n, uint64_t data, + uint64_t mask) +{ + bool clear_isr; + int old_pin, new_pin; + uint32_t allowed_irqs; + uint64_t oldval, newval; + + if (vhpet_timer_msi_enabled(vhpet, n) || + vhpet_timer_edge_trig(vhpet, n)) { + if (vhpet->isr & (1 << n)) + panic("vhpet timer %d isr should not be asserted", n); + } + old_pin = vhpet_timer_ioapic_pin(vhpet, n); + oldval = vhpet->timer[n].cap_config; + + newval = oldval; + update_register(&newval, data, mask); + newval &= ~(HPET_TCAP_RO_MASK | HPET_TCNF_32MODE); + newval |= oldval & HPET_TCAP_RO_MASK; + + if (newval == oldval) + return; + + vhpet->timer[n].cap_config = newval; + VM_CTR2(vhpet->vm, "hpet t%d cap_config set to 0x%016x", n, newval); + + /* + * Validate the interrupt routing in the HPET_TCNF_INT_ROUTE field. + * If it does not match the bits set in HPET_TCAP_INT_ROUTE then set + * it to the default value of 0. + */ + allowed_irqs = vhpet->timer[n].cap_config >> 32; + new_pin = vhpet_timer_ioapic_pin(vhpet, n); + if (new_pin != 0 && (allowed_irqs & (1 << new_pin)) == 0) { + VM_CTR3(vhpet->vm, "hpet t%d configured invalid irq %d, " + "allowed_irqs 0x%08x", n, new_pin, allowed_irqs); + new_pin = 0; + vhpet->timer[n].cap_config &= ~HPET_TCNF_INT_ROUTE; + } + + if (!vhpet_periodic_timer(vhpet, n)) + vhpet->timer[n].comprate = 0; + + /* + * If the timer's ISR bit is set then clear it in the following cases: + * - interrupt is disabled + * - interrupt type is changed from level to edge or fsb. + * - interrupt routing is changed + * + * This is to ensure that this timer's level triggered interrupt does + * not remain asserted forever. 
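+	 *
+	 * (Note for update_register() above: it is a masked
+	 * read-modify-write, so e.g. a mask of 0xffffffff00000000
+	 * replaces only the high 32 bits; this is how the 4-byte MMIO
+	 * cases in vhpet_mmio_write() below fold into the 64-bit
+	 * register updates.)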
+ */ + if (vhpet->isr & (1 << n)) { + KASSERT(old_pin != 0, ("timer %d isr asserted to ioapic pin %d", + n, old_pin)); + if (!vhpet_timer_interrupt_enabled(vhpet, n)) + clear_isr = true; + else if (vhpet_timer_msi_enabled(vhpet, n)) + clear_isr = true; + else if (vhpet_timer_edge_trig(vhpet, n)) + clear_isr = true; + else if (vhpet_timer_ioapic_pin(vhpet, n) != old_pin) + clear_isr = true; + else + clear_isr = false; + + if (clear_isr) { + VM_CTR1(vhpet->vm, "hpet t%d isr cleared due to " + "configuration change", n); + vioapic_deassert_irq(vhpet->vm, old_pin); + vhpet->isr &= ~(1 << n); + } + } +} + +int +vhpet_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t val, int size, + void *arg) +{ + struct vhpet *vhpet; + uint64_t data, mask, oldval, val64; + uint32_t isr_clear_mask, old_compval, old_comprate, counter; + sbintime_t now, *nowptr; + int i, offset; + + vhpet = vm_hpet(vm); + offset = gpa - VHPET_BASE; + + VHPET_LOCK(vhpet); + + /* Accesses to the HPET should be 4 or 8 bytes wide */ + switch (size) { + case 8: + mask = 0xffffffffffffffff; + data = val; + break; + case 4: + mask = 0xffffffff; + data = val; + if ((offset & 0x4) != 0) { + mask <<= 32; + data <<= 32; + } + break; + default: + VM_CTR2(vhpet->vm, "hpet invalid mmio write: " + "offset 0x%08x, size %d", offset, size); + goto done; + } + + /* Access to the HPET should be naturally aligned to its width */ + if (offset & (size - 1)) { + VM_CTR2(vhpet->vm, "hpet invalid mmio write: " + "offset 0x%08x, size %d", offset, size); + goto done; + } + + if (offset == HPET_CONFIG || offset == HPET_CONFIG + 4) { + /* + * Get the most recent value of the counter before updating + * the 'config' register. If the HPET is going to be disabled + * then we need to update 'countbase' with the value right + * before it is disabled. + */ + nowptr = vhpet_counter_enabled(vhpet) ? &now : NULL; + counter = vhpet_counter(vhpet, nowptr); + oldval = vhpet->config; + update_register(&vhpet->config, data, mask); + if ((oldval ^ vhpet->config) & HPET_CNF_ENABLE) { + if (vhpet_counter_enabled(vhpet)) { + vhpet_start_counting(vhpet); + VM_CTR0(vhpet->vm, "hpet enabled"); + } else { + vhpet_stop_counting(vhpet, counter, now); + VM_CTR0(vhpet->vm, "hpet disabled"); + } + } + goto done; + } + + if (offset == HPET_ISR || offset == HPET_ISR + 4) { + isr_clear_mask = vhpet->isr & data; + for (i = 0; i < VHPET_NUM_TIMERS; i++) { + if ((isr_clear_mask & (1 << i)) != 0) { + VM_CTR1(vhpet->vm, "hpet t%d isr cleared", i); + vhpet_timer_clear_isr(vhpet, i); + } + } + goto done; + } + + if (offset == HPET_MAIN_COUNTER || offset == HPET_MAIN_COUNTER + 4) { + /* Zero-extend the counter to 64-bits before updating it */ + val64 = vhpet_counter(vhpet, NULL); + update_register(&val64, data, mask); + vhpet->countbase = val64; + if (vhpet_counter_enabled(vhpet)) + vhpet_start_counting(vhpet); + goto done; + } + + for (i = 0; i < VHPET_NUM_TIMERS; i++) { + if (offset == HPET_TIMER_CAP_CNF(i) || + offset == HPET_TIMER_CAP_CNF(i) + 4) { + vhpet_timer_update_config(vhpet, i, data, mask); + break; + } + + if (offset == HPET_TIMER_COMPARATOR(i) || + offset == HPET_TIMER_COMPARATOR(i) + 4) { + old_compval = vhpet->timer[i].compval; + old_comprate = vhpet->timer[i].comprate; + if (vhpet_periodic_timer(vhpet, i)) { + /* + * In periodic mode writes to the comparator + * change the 'compval' register only if the + * HPET_TCNF_VAL_SET bit is set in the config + * register. 
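+				 *
+				 * (Per the code below: in periodic mode a
+				 * comparator write updates comprate
+				 * unconditionally but refreshes compval
+				 * only while HPET_TCNF_VAL_SET is pending,
+				 * and the bit is consumed by the write, so
+				 * later writes change only the rate.)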
+ */ + val64 = vhpet->timer[i].comprate; + update_register(&val64, data, mask); + vhpet->timer[i].comprate = val64; + if ((vhpet->timer[i].cap_config & + HPET_TCNF_VAL_SET) != 0) { + vhpet->timer[i].compval = val64; + } + } else { + KASSERT(vhpet->timer[i].comprate == 0, + ("vhpet one-shot timer %d has invalid " + "rate %u", i, vhpet->timer[i].comprate)); + val64 = vhpet->timer[i].compval; + update_register(&val64, data, mask); + vhpet->timer[i].compval = val64; + } + vhpet->timer[i].cap_config &= ~HPET_TCNF_VAL_SET; + + if (vhpet->timer[i].compval != old_compval || + vhpet->timer[i].comprate != old_comprate) { + if (vhpet_counter_enabled(vhpet)) { + counter = vhpet_counter(vhpet, &now); + vhpet_start_timer(vhpet, i, counter, + now); + } + } + break; + } + + if (offset == HPET_TIMER_FSB_VAL(i) || + offset == HPET_TIMER_FSB_ADDR(i)) { + update_register(&vhpet->timer[i].msireg, data, mask); + break; + } + } +done: + VHPET_UNLOCK(vhpet); + return (0); +} + +int +vhpet_mmio_read(void *vm, int vcpuid, uint64_t gpa, uint64_t *rval, int size, + void *arg) +{ + int i, offset; + struct vhpet *vhpet; + uint64_t data; + + vhpet = vm_hpet(vm); + offset = gpa - VHPET_BASE; + + VHPET_LOCK(vhpet); + + /* Accesses to the HPET should be 4 or 8 bytes wide */ + if (size != 4 && size != 8) { + VM_CTR2(vhpet->vm, "hpet invalid mmio read: " + "offset 0x%08x, size %d", offset, size); + data = 0; + goto done; + } + + /* Access to the HPET should be naturally aligned to its width */ + if (offset & (size - 1)) { + VM_CTR2(vhpet->vm, "hpet invalid mmio read: " + "offset 0x%08x, size %d", offset, size); + data = 0; + goto done; + } + + if (offset == HPET_CAPABILITIES || offset == HPET_CAPABILITIES + 4) { + data = vhpet_capabilities(); + goto done; + } + + if (offset == HPET_CONFIG || offset == HPET_CONFIG + 4) { + data = vhpet->config; + goto done; + } + + if (offset == HPET_ISR || offset == HPET_ISR + 4) { + data = vhpet->isr; + goto done; + } + + if (offset == HPET_MAIN_COUNTER || offset == HPET_MAIN_COUNTER + 4) { + data = vhpet_counter(vhpet, NULL); + goto done; + } + + for (i = 0; i < VHPET_NUM_TIMERS; i++) { + if (offset == HPET_TIMER_CAP_CNF(i) || + offset == HPET_TIMER_CAP_CNF(i) + 4) { + data = vhpet->timer[i].cap_config; + break; + } + + if (offset == HPET_TIMER_COMPARATOR(i) || + offset == HPET_TIMER_COMPARATOR(i) + 4) { + data = vhpet->timer[i].compval; + break; + } + + if (offset == HPET_TIMER_FSB_VAL(i) || + offset == HPET_TIMER_FSB_ADDR(i)) { + data = vhpet->timer[i].msireg; + break; + } + } + + if (i >= VHPET_NUM_TIMERS) + data = 0; +done: + VHPET_UNLOCK(vhpet); + + if (size == 4) { + if (offset & 0x4) + data >>= 32; + } + *rval = data; + return (0); +} + +struct vhpet * +vhpet_init(struct vm *vm) +{ + int i, pincount; + struct vhpet *vhpet; + uint64_t allowed_irqs; + struct vhpet_callout_arg *arg; + struct bintime bt; + + vhpet = malloc(sizeof(struct vhpet), M_VHPET, M_WAITOK | M_ZERO); + vhpet->vm = vm; + mtx_init(&vhpet->mtx, "vhpet lock", NULL, MTX_DEF); + + FREQ2BT(HPET_FREQ, &bt); + vhpet->freq_sbt = bttosbt(bt); + + pincount = vioapic_pincount(vm); + if (pincount >= 24) + allowed_irqs = 0x00f00000; /* irqs 20, 21, 22 and 23 */ + else + allowed_irqs = 0; + + /* + * Initialize HPET timer hardware state. 
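+	 *
+	 * (Note: allowed_irqs lands in the high word of cap_config, so
+	 * HPET_TCAP_INT_ROUTE advertises only ioapic pins 20-23 when the
+	 * ioapic has 24 or more pins; a guest selecting any other pin is
+	 * caught by the allowed_irqs check in vhpet_timer_update_config()
+	 * and reset to the default of 0.)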
+ */ + for (i = 0; i < VHPET_NUM_TIMERS; i++) { + vhpet->timer[i].cap_config = allowed_irqs << 32; + vhpet->timer[i].cap_config |= HPET_TCAP_PER_INT; + vhpet->timer[i].cap_config |= HPET_TCAP_FSB_INT_DEL; + + vhpet->timer[i].compval = 0xffffffff; + callout_init(&vhpet->timer[i].callout, 1); + + arg = &vhpet->timer[i].arg; + arg->vhpet = vhpet; + arg->timer_num = i; + } + + return (vhpet); +} + +void +vhpet_cleanup(struct vhpet *vhpet) +{ + int i; + + for (i = 0; i < VHPET_NUM_TIMERS; i++) + callout_drain(&vhpet->timer[i].callout); + + free(vhpet, M_VHPET); +} + +int +vhpet_getcap(struct vm_hpet_cap *cap) +{ + + cap->capabilities = vhpet_capabilities(); + return (0); +} diff --git a/usr/src/uts/i86pc/io/vmm/io/vhpet.h b/usr/src/uts/i86pc/io/vmm/io/vhpet.h new file mode 100644 index 0000000000..868809d166 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vhpet.h @@ -0,0 +1,44 @@ +/*- + * Copyright (c) 2013 Tycho Nightingale + * Copyright (c) 2013 Neel Natu + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/io/vhpet.h 258579 2013-11-25 19:04:51Z neel $ + */ + +#ifndef _VHPET_H_ +#define _VHPET_H_ + +#define VHPET_BASE 0xfed00000 +#define VHPET_SIZE 1024 + +struct vhpet *vhpet_init(struct vm *vm); +void vhpet_cleanup(struct vhpet *vhpet); +int vhpet_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t val, + int size, void *arg); +int vhpet_mmio_read(void *vm, int vcpuid, uint64_t gpa, uint64_t *val, + int size, void *arg); +int vhpet_getcap(struct vm_hpet_cap *cap); + +#endif /* _VHPET_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/io/vioapic.c b/usr/src/uts/i86pc/io/vmm/io/vioapic.c new file mode 100644 index 0000000000..5adf5de16d --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vioapic.c @@ -0,0 +1,514 @@ +/*- + * Copyright (c) 2013 Tycho Nightingale + * Copyright (c) 2013 Neel Natu + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/io/vioapic.c 262139 2014-02-17 22:57:51Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + */ + +#include +__FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vioapic.c 262139 2014-02-17 22:57:51Z neel $"); + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "vmm_ktr.h" +#include "vmm_lapic.h" +#include "vlapic.h" +#include "vioapic.h" + +#define IOREGSEL 0x00 +#define IOWIN 0x10 + +#define REDIR_ENTRIES 24 +#define RTBL_RO_BITS ((uint64_t)(IOART_REM_IRR | IOART_DELIVS)) + +struct vioapic { + struct vm *vm; + struct mtx mtx; + uint32_t id; + uint32_t ioregsel; + struct { + uint64_t reg; + int acnt; /* sum of pin asserts (+1) and deasserts (-1) */ + } rtbl[REDIR_ENTRIES]; +}; + +#define VIOAPIC_LOCK(vioapic) mtx_lock_spin(&((vioapic)->mtx)) +#define VIOAPIC_UNLOCK(vioapic) mtx_unlock_spin(&((vioapic)->mtx)) +#define VIOAPIC_LOCKED(vioapic) mtx_owned(&((vioapic)->mtx)) + +static MALLOC_DEFINE(M_VIOAPIC, "vioapic", "bhyve virtual ioapic"); + +#define VIOAPIC_CTR1(vioapic, fmt, a1) \ + VM_CTR1((vioapic)->vm, fmt, a1) + +#define VIOAPIC_CTR2(vioapic, fmt, a1, a2) \ + VM_CTR2((vioapic)->vm, fmt, a1, a2) + +#define VIOAPIC_CTR3(vioapic, fmt, a1, a2, a3) \ + VM_CTR3((vioapic)->vm, fmt, a1, a2, a3) + +#define VIOAPIC_CTR4(vioapic, fmt, a1, a2, a3, a4) \ + VM_CTR4((vioapic)->vm, fmt, a1, a2, a3, a4) + +#ifdef KTR +static const char * +pinstate_str(bool asserted) +{ + + if (asserted) + return ("asserted"); + else + return ("deasserted"); +} +#endif + +static void +vioapic_send_intr(struct vioapic *vioapic, int pin) +{ + int vector, delmode; + uint32_t low, high, dest; + bool level, phys; + + KASSERT(pin >= 0 && pin < REDIR_ENTRIES, + ("vioapic_set_pinstate: invalid pin number %d", pin)); + + KASSERT(VIOAPIC_LOCKED(vioapic), + ("vioapic_set_pinstate: vioapic is not locked")); + + low = vioapic->rtbl[pin].reg; + high = vioapic->rtbl[pin].reg >> 32; + + if ((low & IOART_INTMASK) == IOART_INTMSET) { + VIOAPIC_CTR1(vioapic, "ioapic pin%d: masked", pin); + return; + } + + phys = ((low & IOART_DESTMOD) == IOART_DESTPHY); + delmode = low 
& IOART_DELMOD; + level = low & IOART_TRGRLVL ? true : false; + if (level) + vioapic->rtbl[pin].reg |= IOART_REM_IRR; + + vector = low & IOART_INTVEC; + dest = high >> APIC_ID_SHIFT; + vlapic_deliver_intr(vioapic->vm, level, dest, phys, delmode, vector); +} + +static void +vioapic_set_pinstate(struct vioapic *vioapic, int pin, bool newstate) +{ + int oldcnt, newcnt; + bool needintr; + + KASSERT(pin >= 0 && pin < REDIR_ENTRIES, + ("vioapic_set_pinstate: invalid pin number %d", pin)); + + KASSERT(VIOAPIC_LOCKED(vioapic), + ("vioapic_set_pinstate: vioapic is not locked")); + + oldcnt = vioapic->rtbl[pin].acnt; + if (newstate) + vioapic->rtbl[pin].acnt++; + else + vioapic->rtbl[pin].acnt--; + newcnt = vioapic->rtbl[pin].acnt; + + if (newcnt < 0) { + VIOAPIC_CTR2(vioapic, "ioapic pin%d: bad acnt %d", + pin, newcnt); + } + + needintr = false; + if (oldcnt == 0 && newcnt == 1) { + needintr = true; + VIOAPIC_CTR1(vioapic, "ioapic pin%d: asserted", pin); + } else if (oldcnt == 1 && newcnt == 0) { + VIOAPIC_CTR1(vioapic, "ioapic pin%d: deasserted", pin); + } else { + VIOAPIC_CTR3(vioapic, "ioapic pin%d: %s, ignored, acnt %d", + pin, pinstate_str(newstate), newcnt); + } + + if (needintr) + vioapic_send_intr(vioapic, pin); +} + +enum irqstate { + IRQSTATE_ASSERT, + IRQSTATE_DEASSERT, + IRQSTATE_PULSE +}; + +static int +vioapic_set_irqstate(struct vm *vm, int irq, enum irqstate irqstate) +{ + struct vioapic *vioapic; + + if (irq < 0 || irq >= REDIR_ENTRIES) + return (EINVAL); + + vioapic = vm_ioapic(vm); + + VIOAPIC_LOCK(vioapic); + switch (irqstate) { + case IRQSTATE_ASSERT: + vioapic_set_pinstate(vioapic, irq, true); + break; + case IRQSTATE_DEASSERT: + vioapic_set_pinstate(vioapic, irq, false); + break; + case IRQSTATE_PULSE: + vioapic_set_pinstate(vioapic, irq, true); + vioapic_set_pinstate(vioapic, irq, false); + break; + default: + panic("vioapic_set_irqstate: invalid irqstate %d", irqstate); + } + VIOAPIC_UNLOCK(vioapic); + + return (0); +} + +int +vioapic_assert_irq(struct vm *vm, int irq) +{ + + return (vioapic_set_irqstate(vm, irq, IRQSTATE_ASSERT)); +} + +int +vioapic_deassert_irq(struct vm *vm, int irq) +{ + + return (vioapic_set_irqstate(vm, irq, IRQSTATE_DEASSERT)); +} + +int +vioapic_pulse_irq(struct vm *vm, int irq) +{ + + return (vioapic_set_irqstate(vm, irq, IRQSTATE_PULSE)); +} + +/* + * Reset the vlapic's trigger-mode register to reflect the ioapic pin + * configuration. + */ +static void +vioapic_update_tmr(struct vm *vm, int vcpuid, void *arg) +{ + struct vioapic *vioapic; + struct vlapic *vlapic; + uint32_t low, high, dest; + int delmode, pin, vector; + bool level, phys; + + vlapic = vm_lapic(vm, vcpuid); + vioapic = vm_ioapic(vm); + + VIOAPIC_LOCK(vioapic); + /* + * Reset all vectors to be edge-triggered. + */ + vlapic_reset_tmr(vlapic); + for (pin = 0; pin < REDIR_ENTRIES; pin++) { + low = vioapic->rtbl[pin].reg; + high = vioapic->rtbl[pin].reg >> 32; + + level = low & IOART_TRGRLVL ? true : false; + if (!level) + continue; + + /* + * For a level-triggered 'pin' let the vlapic figure out if + * an assertion on this 'pin' would result in an interrupt + * being delivered to it. If yes, then it will modify the + * TMR bit associated with this vector to level-triggered. 
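+		 *
+		 * (Illustration of the acnt scheme in
+		 * vioapic_set_pinstate() above: two devices sharing a pin
+		 * each assert it, so acnt goes 0 -> 1 -> 2 and only the
+		 * 0 -> 1 edge injects an interrupt; both must deassert
+		 * (2 -> 1 -> 0) before a later assert fires again.)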
+ */ + phys = ((low & IOART_DESTMOD) == IOART_DESTPHY); + delmode = low & IOART_DELMOD; + vector = low & IOART_INTVEC; + dest = high >> APIC_ID_SHIFT; + vlapic_set_tmr_level(vlapic, dest, phys, delmode, vector); + } + VIOAPIC_UNLOCK(vioapic); +} + +static uint32_t +vioapic_read(struct vioapic *vioapic, int vcpuid, uint32_t addr) +{ + int regnum, pin, rshift; + + regnum = addr & 0xff; + switch (regnum) { + case IOAPIC_ID: + return (vioapic->id); + break; + case IOAPIC_VER: + return (((REDIR_ENTRIES - 1) << MAXREDIRSHIFT) | 0x11); + break; + case IOAPIC_ARB: + return (vioapic->id); + break; + default: + break; + } + + /* redirection table entries */ + if (regnum >= IOAPIC_REDTBL && + regnum < IOAPIC_REDTBL + REDIR_ENTRIES * 2) { + pin = (regnum - IOAPIC_REDTBL) / 2; + if ((regnum - IOAPIC_REDTBL) % 2) + rshift = 32; + else + rshift = 0; + + return (vioapic->rtbl[pin].reg >> rshift); + } + + return (0); +} + +static void +vioapic_write(struct vioapic *vioapic, int vcpuid, uint32_t addr, uint32_t data) +{ + uint64_t data64, mask64; + uint64_t last, changed; + int regnum, pin, lshift; + cpuset_t allvcpus; + + regnum = addr & 0xff; + switch (regnum) { + case IOAPIC_ID: + vioapic->id = data & APIC_ID_MASK; + break; + case IOAPIC_VER: + case IOAPIC_ARB: + /* readonly */ + break; + default: + break; + } + + /* redirection table entries */ + if (regnum >= IOAPIC_REDTBL && + regnum < IOAPIC_REDTBL + REDIR_ENTRIES * 2) { + pin = (regnum - IOAPIC_REDTBL) / 2; + if ((regnum - IOAPIC_REDTBL) % 2) + lshift = 32; + else + lshift = 0; + + last = vioapic->rtbl[pin].reg; + + data64 = (uint64_t)data << lshift; + mask64 = (uint64_t)0xffffffff << lshift; + vioapic->rtbl[pin].reg &= ~mask64 | RTBL_RO_BITS; + vioapic->rtbl[pin].reg |= data64 & ~RTBL_RO_BITS; + + VIOAPIC_CTR2(vioapic, "ioapic pin%d: redir table entry %#lx", + pin, vioapic->rtbl[pin].reg); + + /* + * If any fields in the redirection table entry (except mask + * or polarity) have changed then rendezvous all the vcpus + * to update their vlapic trigger-mode registers. + */ + changed = last ^ vioapic->rtbl[pin].reg; + if (changed & ~(IOART_INTMASK | IOART_INTPOL)) { + VIOAPIC_CTR1(vioapic, "ioapic pin%d: recalculate " + "vlapic trigger-mode register", pin); + VIOAPIC_UNLOCK(vioapic); +#if 0 /* XXX */ + allvcpus = vm_active_cpus(vioapic->vm); + vm_smp_rendezvous(vioapic->vm, vcpuid, allvcpus, + vioapic_update_tmr, NULL); +#endif + VIOAPIC_LOCK(vioapic); + } + + /* + * Generate an interrupt if the following conditions are met: + * - pin is not masked + * - previous interrupt has been EOIed + * - pin level is asserted + */ + if ((vioapic->rtbl[pin].reg & IOART_INTMASK) == IOART_INTMCLR && + (vioapic->rtbl[pin].reg & IOART_REM_IRR) == 0 && + (vioapic->rtbl[pin].acnt > 0)) { + VIOAPIC_CTR2(vioapic, "ioapic pin%d: asserted at rtbl " + "write, acnt %d", pin, vioapic->rtbl[pin].acnt); + vioapic_send_intr(vioapic, pin); + } + } +} + +static int +vioapic_mmio_rw(struct vioapic *vioapic, int vcpuid, uint64_t gpa, + uint64_t *data, int size, bool doread) +{ + uint64_t offset; + + offset = gpa - VIOAPIC_BASE; + + /* + * The IOAPIC specification allows 32-bit wide accesses to the + * IOREGSEL (offset 0) and IOWIN (offset 16) registers. 
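+	 *
+	 * (Note for vioapic_read()/vioapic_write() above: each
+	 * redirection entry is exposed as a pair of 32-bit registers, so
+	 * pin N's low word is regnum IOAPIC_REDTBL + 2*N and its high
+	 * word IOAPIC_REDTBL + 2*N + 1; assuming the conventional
+	 * IOAPIC_REDTBL value of 0x10, pin 2 occupies regnums 0x14 and
+	 * 0x15.)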
+ */ + if (size != 4 || (offset != IOREGSEL && offset != IOWIN)) { + if (doread) + *data = 0; + return (0); + } + + VIOAPIC_LOCK(vioapic); + if (offset == IOREGSEL) { + if (doread) + *data = vioapic->ioregsel; + else + vioapic->ioregsel = *data; + } else { + if (doread) { + *data = vioapic_read(vioapic, vcpuid, + vioapic->ioregsel); + } else { + vioapic_write(vioapic, vcpuid, vioapic->ioregsel, + *data); + } + } + VIOAPIC_UNLOCK(vioapic); + + return (0); +} + +int +vioapic_mmio_read(void *vm, int vcpuid, uint64_t gpa, uint64_t *rval, + int size, void *arg) +{ + int error; + struct vioapic *vioapic; + + vioapic = vm_ioapic(vm); + error = vioapic_mmio_rw(vioapic, vcpuid, gpa, rval, size, true); + return (error); +} + +int +vioapic_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t wval, + int size, void *arg) +{ + int error; + struct vioapic *vioapic; + + vioapic = vm_ioapic(vm); + error = vioapic_mmio_rw(vioapic, vcpuid, gpa, &wval, size, false); + return (error); +} + +void +vioapic_process_eoi(struct vm *vm, int vcpuid, int vector) +{ + struct vioapic *vioapic; + int pin; + + KASSERT(vector >= 0 && vector < 256, + ("vioapic_process_eoi: invalid vector %d", vector)); + + vioapic = vm_ioapic(vm); + VIOAPIC_CTR1(vioapic, "ioapic processing eoi for vector %d", vector); + + /* + * XXX keep track of the pins associated with this vector instead + * of iterating on every single pin each time. + */ + VIOAPIC_LOCK(vioapic); + for (pin = 0; pin < REDIR_ENTRIES; pin++) { + if ((vioapic->rtbl[pin].reg & IOART_REM_IRR) == 0) + continue; + if ((vioapic->rtbl[pin].reg & IOART_INTVEC) != vector) + continue; + vioapic->rtbl[pin].reg &= ~IOART_REM_IRR; + if (vioapic->rtbl[pin].acnt > 0) { + VIOAPIC_CTR2(vioapic, "ioapic pin%d: asserted at eoi, " + "acnt %d", pin, vioapic->rtbl[pin].acnt); + vioapic_send_intr(vioapic, pin); + } + } + VIOAPIC_UNLOCK(vioapic); +} + +struct vioapic * +vioapic_init(struct vm *vm) +{ + int i; + struct vioapic *vioapic; + + vioapic = malloc(sizeof(struct vioapic), M_VIOAPIC, M_WAITOK | M_ZERO); + + vioapic->vm = vm; + mtx_init(&vioapic->mtx, "vioapic lock", NULL, MTX_SPIN); + + /* Initialize all redirection entries to mask all interrupts */ + for (i = 0; i < REDIR_ENTRIES; i++) + vioapic->rtbl[i].reg = 0x0001000000010000UL; + + return (vioapic); +} + +void +vioapic_cleanup(struct vioapic *vioapic) +{ + + free(vioapic, M_VIOAPIC); +} + +int +vioapic_pincount(struct vm *vm) +{ + + return (REDIR_ENTRIES); +} diff --git a/usr/src/uts/i86pc/io/vmm/io/vioapic.h b/usr/src/uts/i86pc/io/vmm/io/vioapic.h new file mode 100644 index 0000000000..9479ebb10e --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vioapic.h @@ -0,0 +1,66 @@ +/*- + * Copyright (c) 2013 Tycho Nightingale + * Copyright (c) 2013 Neel Natu + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/io/vioapic.h 258699 2013-11-27 22:18:08Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _VIOAPIC_H_ +#define _VIOAPIC_H_ + +#define VIOAPIC_BASE 0xFEC00000 +#define VIOAPIC_SIZE 4096 + +#include "vdev.h" + +struct vm; + +struct vioapic *vioapic_init(struct vm *vm); +void vioapic_cleanup(struct vioapic *vioapic); + +int vioapic_assert_irq(struct vm *vm, int irq); +int vioapic_deassert_irq(struct vm *vm, int irq); +int vioapic_pulse_irq(struct vm *vm, int irq); + +int vioapic_mmio_write(void *vm, int vcpuid, uint64_t gpa, + uint64_t wval, int size, void *arg); +int vioapic_mmio_read(void *vm, int vcpuid, uint64_t gpa, + uint64_t *rval, int size, void *arg); + +int vioapic_pincount(struct vm *vm); +void vioapic_process_eoi(struct vm *vm, int vcpuid, int vector); +#endif diff --git a/usr/src/uts/i86pc/io/vmm/io/vlapic.c b/usr/src/uts/i86pc/io/vmm/io/vlapic.c new file mode 100644 index 0000000000..9a0a3058ea --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vlapic.c @@ -0,0 +1,1687 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD: head/sys/amd64/vmm/io/vlapic.c 273375 2014-10-21 07:10:43Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + */ + +#include +__FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vlapic.c 273375 2014-10-21 07:10:43Z neel $"); + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include + +#include "vmm_ipi.h" +#include "vmm_lapic.h" +#include "vmm_ktr.h" +#include "vmm_stat.h" + +#include "vlapic.h" +#include "vlapic_priv.h" +#include "vioapic.h" + +#define PRIO(x) ((x) >> 4) + +#define VLAPIC_VERSION (16) + +#define x2apic(vlapic) (((vlapic)->msr_apicbase & APICBASE_X2APIC) ? 1 : 0) + +/* + * The 'vlapic->timer_mtx' is used to provide mutual exclusion between the + * vlapic_callout_handler() and vcpu accesses to: + * - timer_freq_bt, timer_period_bt, timer_fire_bt + * - timer LVT register + */ +#define VLAPIC_TIMER_LOCK(vlapic) mtx_lock_spin(&((vlapic)->timer_mtx)) +#define VLAPIC_TIMER_UNLOCK(vlapic) mtx_unlock_spin(&((vlapic)->timer_mtx)) +#define VLAPIC_TIMER_LOCKED(vlapic) mtx_owned(&((vlapic)->timer_mtx)) + +#define VLAPIC_BUS_FREQ tsc_freq + +static __inline uint32_t +vlapic_get_id(struct vlapic *vlapic) +{ + + if (x2apic(vlapic)) + return (vlapic->vcpuid); + else + return (vlapic->vcpuid << 24); +} + +static uint32_t +x2apic_ldr(struct vlapic *vlapic) +{ + int apicid; + uint32_t ldr; + + apicid = vlapic_get_id(vlapic); + ldr = 1 << (apicid & 0xf); + ldr |= (apicid & 0xffff0) << 12; + return (ldr); +} + +void +vlapic_dfr_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + + lapic = vlapic->apic_page; + if (x2apic(vlapic)) { + VM_CTR1(vlapic->vm, "ignoring write to DFR in x2apic mode: %#x", + lapic->dfr); + lapic->dfr = 0; + return; + } + + lapic->dfr &= APIC_DFR_MODEL_MASK; + lapic->dfr |= APIC_DFR_RESERVED; + + if ((lapic->dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_FLAT) + VLAPIC_CTR0(vlapic, "vlapic DFR in Flat Model"); + else if ((lapic->dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_CLUSTER) + VLAPIC_CTR0(vlapic, "vlapic DFR in Cluster Model"); + else + VLAPIC_CTR1(vlapic, "DFR in Unknown Model %#x", lapic->dfr); +} + +void +vlapic_ldr_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + + lapic = vlapic->apic_page; + + /* LDR is read-only in x2apic mode */ + if (x2apic(vlapic)) { + VLAPIC_CTR1(vlapic, "ignoring write to LDR in x2apic mode: %#x", + lapic->ldr); + lapic->ldr = x2apic_ldr(vlapic); + } else { + lapic->ldr &= ~APIC_LDR_RESERVED; + VLAPIC_CTR1(vlapic, "vlapic LDR set to %#x", lapic->ldr); + } +} + +void +vlapic_id_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + + /* + * We don't allow the ID register to be modified so reset it back to + * its default value. 
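+	 *
+	 * (Worked example for x2apic_ldr() above: apicid 0x23 gives
+	 * 1 << (0x23 & 0xf) = 0x8 and (0x23 & 0xffff0) << 12 = 0x20000,
+	 * so the logical ID is 0x20008: cluster 2, bit 3.)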
+ */ + lapic = vlapic->apic_page; + lapic->id = vlapic_get_id(vlapic); +} + +static int +vlapic_timer_divisor(uint32_t dcr) +{ + switch (dcr & 0xB) { + case APIC_TDCR_1: + return (1); + case APIC_TDCR_2: + return (2); + case APIC_TDCR_4: + return (4); + case APIC_TDCR_8: + return (8); + case APIC_TDCR_16: + return (16); + case APIC_TDCR_32: + return (32); + case APIC_TDCR_64: + return (64); + case APIC_TDCR_128: + return (128); + default: + panic("vlapic_timer_divisor: invalid dcr 0x%08x", dcr); + } +} + +#if 0 +static inline void +vlapic_dump_lvt(uint32_t offset, uint32_t *lvt) +{ + printf("Offset %x: lvt %08x (V:%02x DS:%x M:%x)\n", offset, + *lvt, *lvt & APIC_LVTT_VECTOR, *lvt & APIC_LVTT_DS, + *lvt & APIC_LVTT_M); +} +#endif + +static uint32_t +vlapic_get_ccr(struct vlapic *vlapic) +{ + struct bintime bt_now, bt_rem; + struct LAPIC *lapic; + uint32_t ccr; + + ccr = 0; + lapic = vlapic->apic_page; + + VLAPIC_TIMER_LOCK(vlapic); + if (callout_active(&vlapic->callout)) { + /* + * If the timer is scheduled to expire in the future then + * compute the value of 'ccr' based on the remaining time. + */ + binuptime(&bt_now); + if (bintime_cmp(&vlapic->timer_fire_bt, &bt_now, >)) { + bt_rem = vlapic->timer_fire_bt; + bintime_sub(&bt_rem, &bt_now); + ccr += bt_rem.sec * BT2FREQ(&vlapic->timer_freq_bt); + ccr += bt_rem.frac / vlapic->timer_freq_bt.frac; + } + } +#ifdef __FreeBSD__ + KASSERT(ccr <= lapic->icr_timer, ("vlapic_get_ccr: invalid ccr %#x, " + "icr_timer is %#x", ccr, lapic->icr_timer)); +#else + KASSERT(ccr <= lapic->icr_timer, ("vlapic_get_ccr: invalid ccr %x, " + "icr_timer is %x", ccr, lapic->icr_timer)); +#endif + VLAPIC_CTR2(vlapic, "vlapic ccr_timer = %#x, icr_timer = %#x", + ccr, lapic->icr_timer); + VLAPIC_TIMER_UNLOCK(vlapic); + return (ccr); +} + +void +vlapic_dcr_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + int divisor; + + lapic = vlapic->apic_page; + VLAPIC_TIMER_LOCK(vlapic); + + divisor = vlapic_timer_divisor(lapic->dcr_timer); + VLAPIC_CTR2(vlapic, "vlapic dcr_timer=%#x, divisor=%d", + lapic->dcr_timer, divisor); + + /* + * Update the timer frequency and the timer period. + * + * XXX changes to the frequency divider will not take effect until + * the timer is reloaded. + */ + FREQ2BT(VLAPIC_BUS_FREQ / divisor, &vlapic->timer_freq_bt); + vlapic->timer_period_bt = vlapic->timer_freq_bt; + bintime_mul(&vlapic->timer_period_bt, lapic->icr_timer); + + VLAPIC_TIMER_UNLOCK(vlapic); +} + + +void +vlapic_esr_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + + lapic = vlapic->apic_page; + lapic->esr = vlapic->esr_pending; + vlapic->esr_pending = 0; +} + +int +vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level) +{ + struct LAPIC *lapic; + uint32_t *irrptr, *tmrptr, mask; + int idx; + + KASSERT(vector >= 0 && vector < 256, ("invalid vector %d", vector)); + + lapic = vlapic->apic_page; + if (!(lapic->svr & APIC_SVR_ENABLE)) { + VLAPIC_CTR1(vlapic, "vlapic is software disabled, ignoring " + "interrupt %d", vector); + return (0); + } + + if (vector < 16) { + vlapic_set_error(vlapic, APIC_ESR_RECEIVE_ILLEGAL_VECTOR); + VLAPIC_CTR1(vlapic, "vlapic ignoring interrupt to vector %d", + vector); + return (1); + } + + if (vlapic->ops.set_intr_ready) + return ((*vlapic->ops.set_intr_ready)(vlapic, vector, level)); + + idx = (vector / 32) * 4; + mask = 1 << (vector % 32); + + irrptr = &lapic->irr0; + atomic_set_int(&irrptr[idx], mask); + + /* + * Verify that the trigger-mode of the interrupt matches with + * the vlapic TMR registers. 
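+	 *
+	 * (Note: the IRR/ISR/TMR banks are eight 32-bit registers spaced
+	 * 16 bytes apart, hence idx = (vector / 32) * 4 and
+	 * mask = 1 << (vector % 32) above; vector 0x45 maps to
+	 * irrptr[8], bit 5.)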
+ */ + tmrptr = &lapic->tmr0; + if ((tmrptr[idx] & mask) != (level ? mask : 0)) { + VLAPIC_CTR3(vlapic, "vlapic TMR[%d] is 0x%08x but " + "interrupt is %s-triggered", idx / 4, tmrptr[idx], + level ? "level" : "edge"); + } + + VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready"); + return (1); +} + +static __inline uint32_t * +vlapic_get_lvtptr(struct vlapic *vlapic, uint32_t offset) +{ + struct LAPIC *lapic = vlapic->apic_page; + int i; + + switch (offset) { + case APIC_OFFSET_CMCI_LVT: + return (&lapic->lvt_cmci); + case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: + i = (offset - APIC_OFFSET_TIMER_LVT) >> 2; + return ((&lapic->lvt_timer) + i);; + default: + panic("vlapic_get_lvt: invalid LVT\n"); + } +} + +static __inline int +lvt_off_to_idx(uint32_t offset) +{ + int index; + + switch (offset) { + case APIC_OFFSET_CMCI_LVT: + index = APIC_LVT_CMCI; + break; + case APIC_OFFSET_TIMER_LVT: + index = APIC_LVT_TIMER; + break; + case APIC_OFFSET_THERM_LVT: + index = APIC_LVT_THERMAL; + break; + case APIC_OFFSET_PERF_LVT: + index = APIC_LVT_PMC; + break; + case APIC_OFFSET_LINT0_LVT: + index = APIC_LVT_LINT0; + break; + case APIC_OFFSET_LINT1_LVT: + index = APIC_LVT_LINT1; + break; + case APIC_OFFSET_ERROR_LVT: + index = APIC_LVT_ERROR; + break; + default: + index = -1; + break; + } +#ifdef __FreeBSD__ + KASSERT(index >= 0 && index <= VLAPIC_MAXLVT_INDEX, ("lvt_off_to_idx: " + "invalid lvt index %d for offset %#x", index, offset)); +#else + KASSERT(index >= 0 && index <= VLAPIC_MAXLVT_INDEX, ("lvt_off_to_idx: " + "invalid lvt index %d for offset %x", index, offset)); +#endif + + return (index); +} + +static __inline uint32_t +vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset) +{ + int idx; + uint32_t val; + + idx = lvt_off_to_idx(offset); + val = atomic_load_acq_32(&vlapic->lvt_last[idx]); + return (val); +} + +void +vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset) +{ + uint32_t *lvtptr, mask, val; + struct LAPIC *lapic; + int idx; + + lapic = vlapic->apic_page; + lvtptr = vlapic_get_lvtptr(vlapic, offset); + val = *lvtptr; + idx = lvt_off_to_idx(offset); + + if (!(lapic->svr & APIC_SVR_ENABLE)) + val |= APIC_LVT_M; + mask = APIC_LVT_M | APIC_LVT_DS | APIC_LVT_VECTOR; + switch (offset) { + case APIC_OFFSET_TIMER_LVT: + mask |= APIC_LVTT_TM; + break; + case APIC_OFFSET_ERROR_LVT: + break; + case APIC_OFFSET_LINT0_LVT: + case APIC_OFFSET_LINT1_LVT: + mask |= APIC_LVT_TM | APIC_LVT_RIRR | APIC_LVT_IIPP; + /* FALLTHROUGH */ + default: + mask |= APIC_LVT_DM; + break; + } + val &= mask; + *lvtptr = val; + atomic_store_rel_32(&vlapic->lvt_last[idx], val); +} + +static void +vlapic_mask_lvts(struct vlapic *vlapic) +{ + struct LAPIC *lapic = vlapic->apic_page; + + lapic->lvt_cmci |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_CMCI_LVT); + + lapic->lvt_timer |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_TIMER_LVT); + + lapic->lvt_thermal |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_THERM_LVT); + + lapic->lvt_pcint |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_PERF_LVT); + + lapic->lvt_lint0 |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT0_LVT); + + lapic->lvt_lint1 |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT1_LVT); + + lapic->lvt_error |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_ERROR_LVT); +} + +static int +vlapic_fire_lvt(struct vlapic *vlapic, uint32_t lvt) +{ + uint32_t vec, mode; + + if (lvt & APIC_LVT_M) + return (0); + + vec = lvt & APIC_LVT_VECTOR; + mode 
= lvt & APIC_LVT_DM;
+
+	switch (mode) {
+	case APIC_LVT_DM_FIXED:
+		if (vec < 16) {
+			vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR);
+			return (0);
+		}
+		if (vlapic_set_intr_ready(vlapic, vec, false))
+			vcpu_notify_event(vlapic->vm, vlapic->vcpuid, true);
+		break;
+	case APIC_LVT_DM_NMI:
+		vm_inject_nmi(vlapic->vm, vlapic->vcpuid);
+		break;
+	case APIC_LVT_DM_EXTINT:
+		vm_inject_extint(vlapic->vm, vlapic->vcpuid);
+		break;
+	default:
+		// Other modes ignored
+		return (0);
+	}
+	return (1);
+}
+
+#if 1
+static void
+dump_isrvec_stk(struct vlapic *vlapic)
+{
+	int i;
+	uint32_t *isrptr;
+
+	isrptr = &vlapic->apic_page->isr0;
+	for (i = 0; i < 8; i++)
+		printf("ISR%d 0x%08x\n", i, isrptr[i * 4]);
+
+	for (i = 0; i <= vlapic->isrvec_stk_top; i++)
+		printf("isrvec_stk[%d] = %d\n", i, vlapic->isrvec_stk[i]);
+}
+#endif
+
+/*
+ * Algorithm adopted from section "Interrupt, Task and Processor Priority"
+ * in Intel Architecture Manual Vol 3a.
+ */
+static void
+vlapic_update_ppr(struct vlapic *vlapic)
+{
+	int isrvec, tpr, ppr;
+
+	/*
+	 * Note that the value on the stack at index 0 is always 0.
+	 *
+	 * This is a placeholder for the value of ISRV when none of the
+	 * bits is set in the ISRx registers.
+	 */
+	isrvec = vlapic->isrvec_stk[vlapic->isrvec_stk_top];
+	tpr = vlapic->apic_page->tpr;
+
+#if 1
+	{
+		int i, lastprio, curprio, vector, idx;
+		uint32_t *isrptr;
+
+		if (vlapic->isrvec_stk_top == 0 && isrvec != 0)
+			panic("isrvec_stk is corrupted: %d", isrvec);
+
+		/*
+		 * Make sure that the priority of the nested interrupts is
+		 * always increasing.
+		 */
+		lastprio = -1;
+		for (i = 1; i <= vlapic->isrvec_stk_top; i++) {
+			curprio = PRIO(vlapic->isrvec_stk[i]);
+			if (curprio <= lastprio) {
+				dump_isrvec_stk(vlapic);
+				panic("isrvec_stk does not satisfy invariant");
+			}
+			lastprio = curprio;
+		}
+
+		/*
+		 * Make sure that each bit set in the ISRx registers has a
+		 * corresponding entry on the isrvec stack.
+		 */
+		i = 1;
+		isrptr = &vlapic->apic_page->isr0;
+		for (vector = 0; vector < 256; vector++) {
+			idx = (vector / 32) * 4;
+			if (isrptr[idx] & (1 << (vector % 32))) {
+				if (i > vlapic->isrvec_stk_top ||
+				    vlapic->isrvec_stk[i] != vector) {
+					dump_isrvec_stk(vlapic);
+					panic("ISR and isrvec_stk out of sync");
+				}
+				i++;
+			}
+		}
+	}
+#endif
+
+	if (PRIO(tpr) >= PRIO(isrvec))
+		ppr = tpr;
+	else
+		ppr = isrvec & 0xf0;
+
+	vlapic->apic_page->ppr = ppr;
+	VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr);
+}
+
+static void
+vlapic_process_eoi(struct vlapic *vlapic)
+{
+	struct LAPIC *lapic = vlapic->apic_page;
+	uint32_t *isrptr, *tmrptr;
+	int i, idx, bitpos, vector;
+
+	isrptr = &lapic->isr0;
+	tmrptr = &lapic->tmr0;
+
+	/*
+	 * The x86 architecture reserves the first 32 vectors for use
+	 * by the processor.
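+	 *
+	 * (Worked example for vlapic_update_ppr() above: with TPR 0x50
+	 * and highest in-service vector 0x45, PRIO(0x50) = 5 >=
+	 * PRIO(0x45) = 4, so PPR is the TPR value 0x50; with TPR 0x30 it
+	 * would instead be 0x45 & 0xf0 = 0x40.)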
+ */ + for (i = 7; i > 0; i--) { + idx = i * 4; + bitpos = fls(isrptr[idx]); + if (bitpos-- != 0) { + if (vlapic->isrvec_stk_top <= 0) { + panic("invalid vlapic isrvec_stk_top %d", + vlapic->isrvec_stk_top); + } + isrptr[idx] &= ~(1 << bitpos); + VLAPIC_CTR_ISR(vlapic, "vlapic_process_eoi"); + vlapic->isrvec_stk_top--; + vlapic_update_ppr(vlapic); + if ((tmrptr[idx] & (1 << bitpos)) != 0) { + vector = i * 32 + bitpos; + vioapic_process_eoi(vlapic->vm, vlapic->vcpuid, + vector); + } + return; + } + } +} + +static __inline int +vlapic_get_lvt_field(uint32_t lvt, uint32_t mask) +{ + + return (lvt & mask); +} + +static __inline int +vlapic_periodic_timer(struct vlapic *vlapic) +{ + uint32_t lvt; + + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); + + return (vlapic_get_lvt_field(lvt, APIC_LVTT_TM_PERIODIC)); +} + +static VMM_STAT(VLAPIC_INTR_ERROR, "error interrupts generated by vlapic"); + +void +vlapic_set_error(struct vlapic *vlapic, uint32_t mask) +{ + uint32_t lvt; + + vlapic->esr_pending |= mask; + if (vlapic->esr_firing) + return; + vlapic->esr_firing = 1; + + // The error LVT always uses the fixed delivery mode. + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_ERROR_LVT); + if (vlapic_fire_lvt(vlapic, lvt | APIC_LVT_DM_FIXED)) { + vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_ERROR, 1); + } + vlapic->esr_firing = 0; +} + +static VMM_STAT(VLAPIC_INTR_TIMER, "timer interrupts generated by vlapic"); + +static void +vlapic_fire_timer(struct vlapic *vlapic) +{ + uint32_t lvt; + + KASSERT(VLAPIC_TIMER_LOCKED(vlapic), ("vlapic_fire_timer not locked")); + + // The timer LVT always uses the fixed delivery mode. + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); + if (vlapic_fire_lvt(vlapic, lvt | APIC_LVT_DM_FIXED)) { + VLAPIC_CTR0(vlapic, "vlapic timer fired"); + vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_TIMER, 1); + } +} + +static VMM_STAT(VLAPIC_INTR_CMC, + "corrected machine check interrupts generated by vlapic"); + +void +vlapic_fire_cmci(struct vlapic *vlapic) +{ + uint32_t lvt; + + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_CMCI_LVT); + if (vlapic_fire_lvt(vlapic, lvt)) { + vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_CMC, 1); + } +} + +static VMM_STAT_ARRAY(LVTS_TRIGGERRED, VLAPIC_MAXLVT_INDEX + 1, + "lvts triggered"); + +int +vlapic_trigger_lvt(struct vlapic *vlapic, int vector) +{ + uint32_t lvt; + + if (vlapic_enabled(vlapic) == false) { + /* + * When the local APIC is global/hardware disabled, + * LINT[1:0] pins are configured as INTR and NMI pins, + * respectively. 
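+		 *
+		 * This is the classic "virtual wire" configuration: with the
+		 * APIC hardware-disabled the LINT pins bypass the LVTs
+		 * entirely, roughly
+		 *
+		 *	APIC_LVT_LINT0 -> vm_inject_extint()  (8259-style INTR)
+		 *	APIC_LVT_LINT1 -> vm_inject_nmi()
+		 *
+		 * and any other LVT input is simply dropped.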
+		 */
+		switch (vector) {
+		case APIC_LVT_LINT0:
+			vm_inject_extint(vlapic->vm, vlapic->vcpuid);
+			break;
+		case APIC_LVT_LINT1:
+			vm_inject_nmi(vlapic->vm, vlapic->vcpuid);
+			break;
+		default:
+			break;
+		}
+		return (0);
+	}
+
+	switch (vector) {
+	case APIC_LVT_LINT0:
+		lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_LINT0_LVT);
+		break;
+	case APIC_LVT_LINT1:
+		lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_LINT1_LVT);
+		break;
+	case APIC_LVT_TIMER:
+		lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT);
+		lvt |= APIC_LVT_DM_FIXED;
+		break;
+	case APIC_LVT_ERROR:
+		lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_ERROR_LVT);
+		lvt |= APIC_LVT_DM_FIXED;
+		break;
+	case APIC_LVT_PMC:
+		lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_PERF_LVT);
+		break;
+	case APIC_LVT_THERMAL:
+		lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_THERM_LVT);
+		break;
+	case APIC_LVT_CMCI:
+		lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_CMCI_LVT);
+		break;
+	default:
+		return (EINVAL);
+	}
+	if (vlapic_fire_lvt(vlapic, lvt)) {
+		vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid,
+		    LVTS_TRIGGERRED, vector, 1);
+	}
+	return (0);
+}
+
+static void
+vlapic_callout_handler(void *arg)
+{
+	struct vlapic *vlapic;
+	struct bintime bt, btnow;
+	sbintime_t rem_sbt;
+
+	vlapic = arg;
+
+	VLAPIC_TIMER_LOCK(vlapic);
+	if (callout_pending(&vlapic->callout))	/* callout was reset */
+		goto done;
+
+	if (!callout_active(&vlapic->callout))	/* callout was stopped */
+		goto done;
+
+	callout_deactivate(&vlapic->callout);
+
+	vlapic_fire_timer(vlapic);
+
+	if (vlapic_periodic_timer(vlapic)) {
+		binuptime(&btnow);
+#ifdef __FreeBSD__
+		KASSERT(bintime_cmp(&btnow, &vlapic->timer_fire_bt, >=),
+		    ("vlapic callout at %#lx.%#lx, expected at %#lx.%#lx",
+		    btnow.sec, btnow.frac, vlapic->timer_fire_bt.sec,
+		    vlapic->timer_fire_bt.frac));
+#else
+		KASSERT(bintime_cmp(&btnow, &vlapic->timer_fire_bt, >=),
+		    ("vlapic callout at %lx.%lx, expected at %lx.%lx",
+		    btnow.sec, btnow.frac, vlapic->timer_fire_bt.sec,
+		    vlapic->timer_fire_bt.frac));
+#endif
+
+		/*
+		 * Compute the delta between when the timer was supposed to
+		 * fire and the present time.
+		 */
+		bt = btnow;
+		bintime_sub(&bt, &vlapic->timer_fire_bt);
+
+		rem_sbt = bttosbt(vlapic->timer_period_bt);
+		if (bintime_cmp(&bt, &vlapic->timer_period_bt, <)) {
+			/*
+			 * Adjust the time until the next countdown downward
+			 * to account for the lost time.
+			 */
+			rem_sbt -= bttosbt(bt);
+		} else {
+			/*
+			 * If the delta is greater than the timer period then
+			 * just reset our time base instead of trying to
+			 * catch up.
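+			 *
+			 * As a worked example (illustrative numbers only):
+			 * with a 10ms period, a callout that runs 3ms late
+			 * leaves bt = 3ms, so the next shot is scheduled in
+			 *
+			 *	rem_sbt = 10ms - 3ms = 7ms
+			 *
+			 * keeping the long-run rate correct.  Only when the
+			 * lag reaches a full period (bt >= 10ms here) does
+			 * the rebase below kick in.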
+			 */
+			vlapic->timer_fire_bt = btnow;
+			VLAPIC_CTR2(vlapic, "vlapic timer lagging by %lu "
+			    "usecs, period is %lu usecs - resetting time base",
+			    bttosbt(bt) / SBT_1US,
+			    bttosbt(vlapic->timer_period_bt) / SBT_1US);
+		}
+
+		bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt);
+		callout_reset_sbt(&vlapic->callout, rem_sbt, 0,
+		    vlapic_callout_handler, vlapic, 0);
+	}
+done:
+	VLAPIC_TIMER_UNLOCK(vlapic);
+}
+
+void
+vlapic_icrtmr_write_handler(struct vlapic *vlapic)
+{
+	struct LAPIC *lapic;
+	sbintime_t sbt;
+	uint32_t icr_timer;
+
+	VLAPIC_TIMER_LOCK(vlapic);
+
+	lapic = vlapic->apic_page;
+	icr_timer = lapic->icr_timer;
+
+	vlapic->timer_period_bt = vlapic->timer_freq_bt;
+	bintime_mul(&vlapic->timer_period_bt, icr_timer);
+
+	if (icr_timer != 0) {
+		binuptime(&vlapic->timer_fire_bt);
+		bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt);
+
+		sbt = bttosbt(vlapic->timer_period_bt);
+		callout_reset_sbt(&vlapic->callout, sbt, 0,
+		    vlapic_callout_handler, vlapic, 0);
+	} else
+		callout_stop(&vlapic->callout);
+
+	VLAPIC_TIMER_UNLOCK(vlapic);
+}
+
+/*
+ * This function populates 'dmask' with the set of vcpus that match the
+ * addressing specified by the (dest, phys, lowprio) tuple.
+ *
+ * 'x2apic_dest' specifies whether 'dest' is interpreted as x2APIC (32-bit)
+ * or xAPIC (8-bit) destination field.
+ */
+static void
+vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys,
+    bool lowprio, bool x2apic_dest)
+{
+	struct vlapic *vlapic;
+	uint32_t dfr, ldr, ldest, cluster;
+	uint32_t mda_flat_ldest, mda_cluster_ldest, mda_ldest, mda_cluster_id;
+	cpuset_t amask;
+	int vcpuid;
+
+	if ((x2apic_dest && dest == 0xffffffff) ||
+	    (!x2apic_dest && dest == 0xff)) {
+		/*
+		 * Broadcast in both logical and physical modes.
+		 */
+		*dmask = vm_active_cpus(vm);
+		return;
+	}
+
+	if (phys) {
+		/*
+		 * Physical mode: destination is APIC ID.
+		 */
+		CPU_ZERO(dmask);
+		vcpuid = vm_apicid2vcpuid(vm, dest);
+		if (vcpuid < VM_MAXCPU)
+			CPU_SET(vcpuid, dmask);
+	} else {
+		/*
+		 * In the "Flat Model" the MDA is interpreted as an 8-bit wide
+		 * bitmask. This model is only available in the xAPIC mode.
+		 */
+		mda_flat_ldest = dest & 0xff;
+
+		/*
+		 * In the "Cluster Model" the MDA is used to identify a
+		 * specific cluster and a set of APICs in that cluster.
+		 */
+		if (x2apic_dest) {
+			mda_cluster_id = dest >> 16;
+			mda_cluster_ldest = dest & 0xffff;
+		} else {
+			mda_cluster_id = (dest >> 4) & 0xf;
+			mda_cluster_ldest = dest & 0xf;
+		}
+
+		/*
+		 * Logical mode: match each APIC that has a bit set
+		 * in its LDR that matches a bit in the ldest.
+		 */
+		CPU_ZERO(dmask);
+		amask = vm_active_cpus(vm);
+		while ((vcpuid = CPU_FFS(&amask)) != 0) {
+			vcpuid--;
+			CPU_CLR(vcpuid, &amask);
+
+			vlapic = vm_lapic(vm, vcpuid);
+			dfr = vlapic->apic_page->dfr;
+			ldr = vlapic->apic_page->ldr;
+
+			if ((dfr & APIC_DFR_MODEL_MASK) ==
+			    APIC_DFR_MODEL_FLAT) {
+				ldest = ldr >> 24;
+				mda_ldest = mda_flat_ldest;
+			} else if ((dfr & APIC_DFR_MODEL_MASK) ==
+			    APIC_DFR_MODEL_CLUSTER) {
+				if (x2apic(vlapic)) {
+					cluster = ldr >> 16;
+					ldest = ldr & 0xffff;
+				} else {
+					cluster = ldr >> 28;
+					ldest = (ldr >> 24) & 0xf;
+				}
+				if (cluster != mda_cluster_id)
+					continue;
+				mda_ldest = mda_cluster_ldest;
+			} else {
+				/*
+				 * Guest has configured a bad logical
+				 * model for this vcpu - skip it.
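+				 *
+				 * Only the flat and cluster DFR encodings
+				 * are architecturally defined, so anything
+				 * else lands here.  For the well-formed
+				 * cases above, a worked example with made-up
+				 * values: in xAPIC flat mode an LDR of
+				 * 0x02000000 yields ldest = 0x02, and an MDA
+				 * of 0x03 matches since (0x03 & 0x02) != 0.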
+ */ + VLAPIC_CTR1(vlapic, "vlapic has bad logical " + "model %x - cannot deliver interrupt", dfr); + continue; + } + + if ((mda_ldest & ldest) != 0) { + CPU_SET(vcpuid, dmask); + if (lowprio) + break; + } + } + } +} + +static VMM_STAT_ARRAY(IPIS_SENT, VM_MAXCPU, "ipis sent to vcpu"); + +static void +vlapic_set_tpr(struct vlapic *vlapic, uint8_t val) +{ + struct LAPIC *lapic = vlapic->apic_page; + + if (lapic->tpr != val) { + VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vlapic TPR changed " + "from %#x to %#x", lapic->tpr, val); + lapic->tpr = val; + vlapic_update_ppr(vlapic); + } +} + +static uint8_t +vlapic_get_tpr(struct vlapic *vlapic) +{ + struct LAPIC *lapic = vlapic->apic_page; + + return (lapic->tpr); +} + +void +vlapic_set_cr8(struct vlapic *vlapic, uint64_t val) +{ + uint8_t tpr; + + if (val & ~0xf) { + vm_inject_gp(vlapic->vm, vlapic->vcpuid); + return; + } + + tpr = val << 4; + vlapic_set_tpr(vlapic, tpr); +} + +uint64_t +vlapic_get_cr8(struct vlapic *vlapic) +{ + uint8_t tpr; + + tpr = vlapic_get_tpr(vlapic); + return (tpr >> 4); +} + +int +vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu) +{ + int i; + bool phys; + cpuset_t dmask; + uint64_t icrval; + uint32_t dest, vec, mode; + struct vlapic *vlapic2; + struct vm_exit *vmexit; + struct LAPIC *lapic; + + lapic = vlapic->apic_page; + lapic->icr_lo &= ~APIC_DELSTAT_PEND; + icrval = ((uint64_t)lapic->icr_hi << 32) | lapic->icr_lo; + + if (x2apic(vlapic)) + dest = icrval >> 32; + else + dest = icrval >> (32 + 24); + vec = icrval & APIC_VECTOR_MASK; + mode = icrval & APIC_DELMODE_MASK; + + if (mode == APIC_DELMODE_FIXED && vec < 16) { + vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR); + VLAPIC_CTR1(vlapic, "Ignoring invalid IPI %d", vec); + return (0); + } + + VLAPIC_CTR2(vlapic, "icrlo 0x%016lx triggered ipi %d", icrval, vec); + + if (mode == APIC_DELMODE_FIXED || mode == APIC_DELMODE_NMI) { + switch (icrval & APIC_DEST_MASK) { + case APIC_DEST_DESTFLD: + phys = ((icrval & APIC_DESTMODE_LOG) == 0); + vlapic_calcdest(vlapic->vm, &dmask, dest, phys, false, + x2apic(vlapic)); + break; + case APIC_DEST_SELF: + CPU_SETOF(vlapic->vcpuid, &dmask); + break; + case APIC_DEST_ALLISELF: + dmask = vm_active_cpus(vlapic->vm); + break; + case APIC_DEST_ALLESELF: + dmask = vm_active_cpus(vlapic->vm); + CPU_CLR(vlapic->vcpuid, &dmask); + break; + default: + CPU_ZERO(&dmask); /* satisfy gcc */ + break; + } + + while ((i = CPU_FFS(&dmask)) != 0) { + i--; + CPU_CLR(i, &dmask); + if (mode == APIC_DELMODE_FIXED) { + lapic_intr_edge(vlapic->vm, i, vec); + vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid, + IPIS_SENT, i, 1); + VLAPIC_CTR2(vlapic, "vlapic sending ipi %d " + "to vcpuid %d", vec, i); + } else { + vm_inject_nmi(vlapic->vm, i); + VLAPIC_CTR1(vlapic, "vlapic sending ipi nmi " + "to vcpuid %d", i); + } + } + + return (0); /* handled completely in the kernel */ + } + + if (mode == APIC_DELMODE_INIT) { + if ((icrval & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT) + return (0); + + if (vlapic->vcpuid == 0 && dest != 0 && dest < VM_MAXCPU) { + vlapic2 = vm_lapic(vlapic->vm, dest); + + /* move from INIT to waiting-for-SIPI state */ + if (vlapic2->boot_state == BS_INIT) { + vlapic2->boot_state = BS_SIPI; + } + + return (0); + } + } + + if (mode == APIC_DELMODE_STARTUP) { + if (vlapic->vcpuid == 0 && dest != 0 && dest < VM_MAXCPU) { + vlapic2 = vm_lapic(vlapic->vm, dest); + + /* + * Ignore SIPIs in any state other than wait-for-SIPI + */ + if (vlapic2->boot_state != BS_SIPI) + return (0); + + vlapic2->boot_state = BS_RUNNING; + + *retu = true; 
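+
+			/*
+			 * The AP's start address comes straight from the
+			 * SIPI vector in the architecturally defined way:
+			 * the vector selects a 4KB-aligned real-mode page,
+			 * so (illustrative value) a SIPI vector of 0x9f
+			 * yields a guest %rip of 0x9f000 below.
+			 */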
+			vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid);
+			vmexit->exitcode = VM_EXITCODE_SPINUP_AP;
+			vmexit->u.spinup_ap.vcpu = dest;
+			vmexit->u.spinup_ap.rip = vec << PAGE_SHIFT;
+
+			return (0);
+		}
+	}
+
+	/*
+	 * This will cause a return to userland.
+	 */
+	return (1);
+}
+
+void
+vlapic_self_ipi_handler(struct vlapic *vlapic, uint64_t val)
+{
+	int vec;
+
+	KASSERT(x2apic(vlapic), ("SELF_IPI does not exist in xAPIC mode"));
+
+	vec = val & 0xff;
+	lapic_intr_edge(vlapic->vm, vlapic->vcpuid, vec);
+	vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid, IPIS_SENT,
+	    vlapic->vcpuid, 1);
+	VLAPIC_CTR1(vlapic, "vlapic self-ipi %d", vec);
+}
+
+int
+vlapic_pending_intr(struct vlapic *vlapic, int *vecptr)
+{
+	struct LAPIC *lapic = vlapic->apic_page;
+	int idx, i, bitpos, vector;
+	uint32_t *irrptr, val;
+
+	if (vlapic->ops.pending_intr)
+		return ((*vlapic->ops.pending_intr)(vlapic, vecptr));
+
+	irrptr = &lapic->irr0;
+
+	/*
+	 * The x86 architecture reserves the first 32 vectors for use
+	 * by the processor.
+	 */
+	for (i = 7; i > 0; i--) {
+		idx = i * 4;
+		val = atomic_load_acq_int(&irrptr[idx]);
+		bitpos = fls(val);
+		if (bitpos != 0) {
+			vector = i * 32 + (bitpos - 1);
+			if (PRIO(vector) > PRIO(lapic->ppr)) {
+				VLAPIC_CTR1(vlapic, "pending intr %d", vector);
+				if (vecptr != NULL)
+					*vecptr = vector;
+				return (1);
+			} else
+				break;
+		}
+	}
+	return (0);
+}
+
+void
+vlapic_intr_accepted(struct vlapic *vlapic, int vector)
+{
+	struct LAPIC *lapic = vlapic->apic_page;
+	uint32_t *irrptr, *isrptr;
+	int idx, stk_top;
+
+	if (vlapic->ops.intr_accepted)
+		return ((*vlapic->ops.intr_accepted)(vlapic, vector));
+
+	/*
+	 * clear the ready bit for vector being accepted in irr
+	 * and set the vector as in service in isr.
+	 */
+	idx = (vector / 32) * 4;
+
+	irrptr = &lapic->irr0;
+	atomic_clear_int(&irrptr[idx], 1 << (vector % 32));
+	VLAPIC_CTR_IRR(vlapic, "vlapic_intr_accepted");
+
+	isrptr = &lapic->isr0;
+	isrptr[idx] |= 1 << (vector % 32);
+	VLAPIC_CTR_ISR(vlapic, "vlapic_intr_accepted");
+
+	/*
+	 * Update the PPR
+	 */
+	vlapic->isrvec_stk_top++;
+
+	stk_top = vlapic->isrvec_stk_top;
+	if (stk_top >= ISRVEC_STK_SIZE)
+		panic("isrvec_stk_top overflow %d", stk_top);
+
+	vlapic->isrvec_stk[stk_top] = vector;
+	vlapic_update_ppr(vlapic);
+}
+
+void
+vlapic_svr_write_handler(struct vlapic *vlapic)
+{
+	struct LAPIC *lapic;
+	uint32_t old, new, changed;
+
+	lapic = vlapic->apic_page;
+
+	new = lapic->svr;
+	old = vlapic->svr_last;
+	vlapic->svr_last = new;
+
+	changed = old ^ new;
+	if ((changed & APIC_SVR_ENABLE) != 0) {
+		if ((new & APIC_SVR_ENABLE) == 0) {
+			/*
+			 * The apic is now disabled so stop the apic timer
+			 * and mask all the LVT entries.
+			 */
+			VLAPIC_CTR0(vlapic, "vlapic is software-disabled");
+			VLAPIC_TIMER_LOCK(vlapic);
+			callout_stop(&vlapic->callout);
+			VLAPIC_TIMER_UNLOCK(vlapic);
+			vlapic_mask_lvts(vlapic);
+		} else {
+			/*
+			 * The apic is now enabled so restart the apic timer
+			 * if it is configured in periodic mode.
+ */ + VLAPIC_CTR0(vlapic, "vlapic is software-enabled"); + if (vlapic_periodic_timer(vlapic)) + vlapic_icrtmr_write_handler(vlapic); + } + } +} + +int +vlapic_read(struct vlapic *vlapic, int mmio_access, uint64_t offset, + uint64_t *data, bool *retu) +{ + struct LAPIC *lapic = vlapic->apic_page; + uint32_t *reg; + int i; + + /* Ignore MMIO accesses in x2APIC mode */ + if (x2apic(vlapic) && mmio_access) { + VLAPIC_CTR1(vlapic, "MMIO read from offset %#lx in x2APIC mode", + offset); + *data = 0; + goto done; + } + + if (!x2apic(vlapic) && !mmio_access) { + /* + * XXX Generate GP fault for MSR accesses in xAPIC mode + */ + VLAPIC_CTR1(vlapic, "x2APIC MSR read from offset %#lx in " + "xAPIC mode", offset); + *data = 0; + goto done; + } + + if (offset > sizeof(*lapic)) { + *data = 0; + goto done; + } + + offset &= ~3; + switch(offset) + { + case APIC_OFFSET_ID: + *data = lapic->id; + break; + case APIC_OFFSET_VER: + *data = lapic->version; + break; + case APIC_OFFSET_TPR: + *data = vlapic_get_tpr(vlapic); + break; + case APIC_OFFSET_APR: + *data = lapic->apr; + break; + case APIC_OFFSET_PPR: + *data = lapic->ppr; + break; + case APIC_OFFSET_EOI: + *data = lapic->eoi; + break; + case APIC_OFFSET_LDR: + *data = lapic->ldr; + break; + case APIC_OFFSET_DFR: + *data = lapic->dfr; + break; + case APIC_OFFSET_SVR: + *data = lapic->svr; + break; + case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: + i = (offset - APIC_OFFSET_ISR0) >> 2; + reg = &lapic->isr0; + *data = *(reg + i); + break; + case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: + i = (offset - APIC_OFFSET_TMR0) >> 2; + reg = &lapic->tmr0; + *data = *(reg + i); + break; + case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: + i = (offset - APIC_OFFSET_IRR0) >> 2; + reg = &lapic->irr0; + *data = atomic_load_acq_int(reg + i); + break; + case APIC_OFFSET_ESR: + *data = lapic->esr; + break; + case APIC_OFFSET_ICR_LOW: + *data = lapic->icr_lo; + if (x2apic(vlapic)) + *data |= (uint64_t)lapic->icr_hi << 32; + break; + case APIC_OFFSET_ICR_HI: + *data = lapic->icr_hi; + break; + case APIC_OFFSET_CMCI_LVT: + case APIC_OFFSET_TIMER_LVT ... 
APIC_OFFSET_ERROR_LVT: + *data = vlapic_get_lvt(vlapic, offset); +#ifdef INVARIANTS + reg = vlapic_get_lvtptr(vlapic, offset); + KASSERT(*data == *reg, ("inconsistent lvt value at " + "offset %#lx: %#lx/%#x", offset, *data, *reg)); +#endif + break; + case APIC_OFFSET_TIMER_ICR: + *data = lapic->icr_timer; + break; + case APIC_OFFSET_TIMER_CCR: + *data = vlapic_get_ccr(vlapic); + break; + case APIC_OFFSET_TIMER_DCR: + *data = lapic->dcr_timer; + break; + case APIC_OFFSET_SELF_IPI: + /* + * XXX generate a GP fault if vlapic is in x2apic mode + */ + *data = 0; + break; + case APIC_OFFSET_RRR: + default: + *data = 0; + break; + } +done: + VLAPIC_CTR2(vlapic, "vlapic read offset %#x, data %#lx", offset, *data); + return 0; +} + +int +vlapic_write(struct vlapic *vlapic, int mmio_access, uint64_t offset, + uint64_t data, bool *retu) +{ + struct LAPIC *lapic = vlapic->apic_page; + uint32_t *regptr; + int retval; + +#ifdef __FreeBSD__ + KASSERT((offset & 0xf) == 0 && offset < PAGE_SIZE, + ("vlapic_write: invalid offset %#lx", offset)); +#else + KASSERT((offset & 0xf) == 0 && offset < PAGE_SIZE, + ("vlapic_write: invalid offset %lx", offset)); +#endif + + VLAPIC_CTR2(vlapic, "vlapic write offset %#lx, data %#lx", + offset, data); + + if (offset > sizeof(*lapic)) + return (0); + + /* Ignore MMIO accesses in x2APIC mode */ + if (x2apic(vlapic) && mmio_access) { + VLAPIC_CTR2(vlapic, "MMIO write of %#lx to offset %#lx " + "in x2APIC mode", data, offset); + return (0); + } + + /* + * XXX Generate GP fault for MSR accesses in xAPIC mode + */ + if (!x2apic(vlapic) && !mmio_access) { + VLAPIC_CTR2(vlapic, "x2APIC MSR write of %#lx to offset %#lx " + "in xAPIC mode", data, offset); + return (0); + } + + retval = 0; + switch(offset) + { + case APIC_OFFSET_ID: + lapic->id = data; + vlapic_id_write_handler(vlapic); + break; + case APIC_OFFSET_TPR: + vlapic_set_tpr(vlapic, data & 0xff); + break; + case APIC_OFFSET_EOI: + vlapic_process_eoi(vlapic); + break; + case APIC_OFFSET_LDR: + lapic->ldr = data; + vlapic_ldr_write_handler(vlapic); + break; + case APIC_OFFSET_DFR: + lapic->dfr = data; + vlapic_dfr_write_handler(vlapic); + break; + case APIC_OFFSET_SVR: + lapic->svr = data; + vlapic_svr_write_handler(vlapic); + break; + case APIC_OFFSET_ICR_LOW: + lapic->icr_lo = data; + if (x2apic(vlapic)) + lapic->icr_hi = data >> 32; + retval = vlapic_icrlo_write_handler(vlapic, retu); + break; + case APIC_OFFSET_ICR_HI: + lapic->icr_hi = data; + break; + case APIC_OFFSET_CMCI_LVT: + case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: + regptr = vlapic_get_lvtptr(vlapic, offset); + *regptr = data; + vlapic_lvt_write_handler(vlapic, offset); + break; + case APIC_OFFSET_TIMER_ICR: + lapic->icr_timer = data; + vlapic_icrtmr_write_handler(vlapic); + break; + + case APIC_OFFSET_TIMER_DCR: + lapic->dcr_timer = data; + vlapic_dcr_write_handler(vlapic); + break; + + case APIC_OFFSET_ESR: + vlapic_esr_write_handler(vlapic); + break; + + case APIC_OFFSET_SELF_IPI: + if (x2apic(vlapic)) + vlapic_self_ipi_handler(vlapic, data); + break; + + case APIC_OFFSET_VER: + case APIC_OFFSET_APR: + case APIC_OFFSET_PPR: + case APIC_OFFSET_RRR: + case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: + case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: + case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: + case APIC_OFFSET_TIMER_CCR: + default: + // Read only. 
+ break; + } + + return (retval); +} + +static void +vlapic_reset(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + + lapic = vlapic->apic_page; + bzero(lapic, sizeof(struct LAPIC)); + + lapic->id = vlapic_get_id(vlapic); + lapic->version = VLAPIC_VERSION; + lapic->version |= (VLAPIC_MAXLVT_INDEX << MAXLVTSHIFT); + lapic->dfr = 0xffffffff; + lapic->svr = APIC_SVR_VECTOR; + vlapic_mask_lvts(vlapic); + vlapic_reset_tmr(vlapic); + + lapic->dcr_timer = 0; + vlapic_dcr_write_handler(vlapic); + + if (vlapic->vcpuid == 0) + vlapic->boot_state = BS_RUNNING; /* BSP */ + else + vlapic->boot_state = BS_INIT; /* AP */ + + vlapic->svr_last = lapic->svr; +} + +void +vlapic_init(struct vlapic *vlapic) +{ + KASSERT(vlapic->vm != NULL, ("vlapic_init: vm is not initialized")); + KASSERT(vlapic->vcpuid >= 0 && vlapic->vcpuid < VM_MAXCPU, + ("vlapic_init: vcpuid is not initialized")); + KASSERT(vlapic->apic_page != NULL, ("vlapic_init: apic_page is not " + "initialized")); + + /* + * If the vlapic is configured in x2apic mode then it will be + * accessed in the critical section via the MSR emulation code. + * + * Therefore the timer mutex must be a spinlock because blockable + * mutexes cannot be acquired in a critical section. + */ + mtx_init(&vlapic->timer_mtx, "vlapic timer mtx", NULL, MTX_SPIN); + callout_init(&vlapic->callout, 1); + + vlapic->msr_apicbase = DEFAULT_APIC_BASE | APICBASE_ENABLED; + + if (vlapic->vcpuid == 0) + vlapic->msr_apicbase |= APICBASE_BSP; + + vlapic_reset(vlapic); +} + +void +vlapic_cleanup(struct vlapic *vlapic) +{ + + callout_drain(&vlapic->callout); +} + +uint64_t +vlapic_get_apicbase(struct vlapic *vlapic) +{ + + return (vlapic->msr_apicbase); +} + +int +vlapic_set_apicbase(struct vlapic *vlapic, uint64_t new) +{ + + if (vlapic->msr_apicbase != new) { + VLAPIC_CTR2(vlapic, "Changing APIC_BASE MSR from %#lx to %#lx " + "not supported", vlapic->msr_apicbase, new); + return (-1); + } + + return (0); +} + +void +vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) +{ + struct vlapic *vlapic; + struct LAPIC *lapic; + + vlapic = vm_lapic(vm, vcpuid); + + if (state == X2APIC_DISABLED) + vlapic->msr_apicbase &= ~APICBASE_X2APIC; + else + vlapic->msr_apicbase |= APICBASE_X2APIC; + + /* + * Reset the local APIC registers whose values are mode-dependent. + * + * XXX this works because the APIC mode can be changed only at vcpu + * initialization time. + */ + lapic = vlapic->apic_page; + lapic->id = vlapic_get_id(vlapic); + if (x2apic(vlapic)) { + lapic->ldr = x2apic_ldr(vlapic); + lapic->dfr = 0; + } else { + lapic->ldr = 0; + lapic->dfr = 0xffffffff; + } + + if (state == X2APIC_ENABLED) { + if (vlapic->ops.enable_x2apic_mode) + (*vlapic->ops.enable_x2apic_mode)(vlapic); + } +} + +void +vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys, + int delmode, int vec) +{ + bool lowprio; + int vcpuid; + cpuset_t dmask; + + if (delmode != IOART_DELFIXED && + delmode != IOART_DELLOPRI && + delmode != IOART_DELEXINT) { + VM_CTR1(vm, "vlapic intr invalid delmode %#x", delmode); + return; + } + lowprio = (delmode == IOART_DELLOPRI); + + /* + * We don't provide any virtual interrupt redirection hardware so + * all interrupts originating from the ioapic or MSI specify the + * 'dest' in the legacy xAPIC format. 
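+	 *
+	 * For example (made-up values), a fixed-mode redirection entry with
+	 * dest = 0xff and phys = false is treated as an xAPIC logical
+	 * broadcast by vlapic_calcdest() and resolves to every active vcpu
+	 * in the loop below.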
+ */ + vlapic_calcdest(vm, &dmask, dest, phys, lowprio, false); + + while ((vcpuid = CPU_FFS(&dmask)) != 0) { + vcpuid--; + CPU_CLR(vcpuid, &dmask); + if (delmode == IOART_DELEXINT) { + vm_inject_extint(vm, vcpuid); + } else { + lapic_set_intr(vm, vcpuid, vec, level); + } + } +} + +void +vlapic_post_intr(struct vlapic *vlapic, int hostcpu, int ipinum) +{ + /* + * Post an interrupt to the vcpu currently running on 'hostcpu'. + * + * This is done by leveraging features like Posted Interrupts (Intel) + * Doorbell MSR (AMD AVIC) that avoid a VM exit. + * + * If neither of these features are available then fallback to + * sending an IPI to 'hostcpu'. + */ + if (vlapic->ops.post_intr) + (*vlapic->ops.post_intr)(vlapic, hostcpu); + else + ipi_cpu(hostcpu, ipinum); +} + +bool +vlapic_enabled(struct vlapic *vlapic) +{ + struct LAPIC *lapic = vlapic->apic_page; + + if ((vlapic->msr_apicbase & APICBASE_ENABLED) != 0 && + (lapic->svr & APIC_SVR_ENABLE) != 0) + return (true); + else + return (false); +} + +static void +vlapic_set_tmr(struct vlapic *vlapic, int vector, bool level) +{ + struct LAPIC *lapic; + uint32_t *tmrptr, mask; + int idx; + + lapic = vlapic->apic_page; + tmrptr = &lapic->tmr0; + idx = (vector / 32) * 4; + mask = 1 << (vector % 32); + if (level) + tmrptr[idx] |= mask; + else + tmrptr[idx] &= ~mask; + + if (vlapic->ops.set_tmr != NULL) + (*vlapic->ops.set_tmr)(vlapic, vector, level); +} + +void +vlapic_reset_tmr(struct vlapic *vlapic) +{ + int vector; + + VLAPIC_CTR0(vlapic, "vlapic resetting all vectors to edge-triggered"); + + for (vector = 0; vector <= 255; vector++) + vlapic_set_tmr(vlapic, vector, false); +} + +void +vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys, + int delmode, int vector) +{ + cpuset_t dmask; + bool lowprio; + + KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); + + /* + * A level trigger is valid only for fixed and lowprio delivery modes. + */ + if (delmode != APIC_DELMODE_FIXED && delmode != APIC_DELMODE_LOWPRIO) { + VLAPIC_CTR1(vlapic, "Ignoring level trigger-mode for " + "delivery-mode %d", delmode); + return; + } + + lowprio = (delmode == APIC_DELMODE_LOWPRIO); + vlapic_calcdest(vlapic->vm, &dmask, dest, phys, lowprio, false); + + if (!CPU_ISSET(vlapic->vcpuid, &dmask)) + return; + + VLAPIC_CTR1(vlapic, "vector %d set to level-triggered", vector); + vlapic_set_tmr(vlapic, vector, true); +} diff --git a/usr/src/uts/i86pc/io/vmm/io/vlapic.h b/usr/src/uts/i86pc/io/vmm/io/vlapic.h new file mode 100644 index 0000000000..3fa705d818 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vlapic.h @@ -0,0 +1,109 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/io/vlapic.h 262281 2014-02-21 06:03:54Z neel $ + */ + +#ifndef _VLAPIC_H_ +#define _VLAPIC_H_ + +struct vm; +enum x2apic_state; + +int vlapic_write(struct vlapic *vlapic, int mmio_access, uint64_t offset, + uint64_t data, bool *retu); +int vlapic_read(struct vlapic *vlapic, int mmio_access, uint64_t offset, + uint64_t *data, bool *retu); + +/* + * Returns 0 if there is no eligible vector that can be delivered to the + * guest at this time and non-zero otherwise. + * + * If an eligible vector number is found and 'vecptr' is not NULL then it will + * be stored in the location pointed to by 'vecptr'. + * + * Note that the vector does not automatically transition to the ISR as a + * result of calling this function. + */ +int vlapic_pending_intr(struct vlapic *vlapic, int *vecptr); + +/* + * Transition 'vector' from IRR to ISR. This function is called with the + * vector returned by 'vlapic_pending_intr()' when the guest is able to + * accept this interrupt (i.e. RFLAGS.IF = 1 and no conditions exist that + * block interrupt delivery). + */ +void vlapic_intr_accepted(struct vlapic *vlapic, int vector); + +/* + * Returns 1 if the vcpu needs to be notified of the interrupt and 0 otherwise. + */ +int vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level); + +/* + * Post an interrupt to the vcpu running on 'hostcpu'. This will use a + * hardware assist if available (e.g. Posted Interrupt) or fall back to + * sending an 'ipinum' to interrupt the 'hostcpu'. + */ +void vlapic_post_intr(struct vlapic *vlapic, int hostcpu, int ipinum); + +void vlapic_set_error(struct vlapic *vlapic, uint32_t mask); +void vlapic_fire_cmci(struct vlapic *vlapic); +int vlapic_trigger_lvt(struct vlapic *vlapic, int vector); + +uint64_t vlapic_get_apicbase(struct vlapic *vlapic); +int vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val); +void vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state s); +bool vlapic_enabled(struct vlapic *vlapic); + +void vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys, + int delmode, int vec); + +/* Reset the trigger-mode bits for all vectors to be edge-triggered */ +void vlapic_reset_tmr(struct vlapic *vlapic); + +/* + * Set the trigger-mode bit associated with 'vector' to level-triggered if + * the (dest,phys,delmode) tuple resolves to an interrupt being delivered to + * this 'vlapic'. 
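+ *
+ * A caller emulating a level-triggered ioapic pin might use it roughly as
+ * follows (a sketch, not a call site in this change):
+ *
+ *	vlapic_set_tmr_level(vlapic, dest, phys, APIC_DELMODE_FIXED, vec);
+ *
+ * which leaves the TMR untouched unless this vlapic is one of the resolved
+ * destinations.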
+ */ +void vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys, + int delmode, int vector); + +void vlapic_set_cr8(struct vlapic *vlapic, uint64_t val); +uint64_t vlapic_get_cr8(struct vlapic *vlapic); + +/* APIC write handlers */ +void vlapic_id_write_handler(struct vlapic *vlapic); +void vlapic_ldr_write_handler(struct vlapic *vlapic); +void vlapic_dfr_write_handler(struct vlapic *vlapic); +void vlapic_svr_write_handler(struct vlapic *vlapic); +void vlapic_esr_write_handler(struct vlapic *vlapic); +int vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu); +void vlapic_icrtmr_write_handler(struct vlapic *vlapic); +void vlapic_dcr_write_handler(struct vlapic *vlapic); +void vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset); +void vlapic_self_ipi_handler(struct vlapic *vlapic, uint64_t val); +#endif /* _VLAPIC_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h b/usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h new file mode 100644 index 0000000000..f9bd2e0e8b --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h @@ -0,0 +1,190 @@ +/*- + * Copyright (c) 2013 Neel Natu + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD: head/sys/amd64/vmm/io/vlapic_priv.h 263211 2014-03-15 23:09:34Z tychon $ + */ + +#ifndef _VLAPIC_PRIV_H_ +#define _VLAPIC_PRIV_H_ + +#include + +/* + * APIC Register: Offset Description + */ +#define APIC_OFFSET_ID 0x20 /* Local APIC ID */ +#define APIC_OFFSET_VER 0x30 /* Local APIC Version */ +#define APIC_OFFSET_TPR 0x80 /* Task Priority Register */ +#define APIC_OFFSET_APR 0x90 /* Arbitration Priority */ +#define APIC_OFFSET_PPR 0xA0 /* Processor Priority Register */ +#define APIC_OFFSET_EOI 0xB0 /* EOI Register */ +#define APIC_OFFSET_RRR 0xC0 /* Remote read */ +#define APIC_OFFSET_LDR 0xD0 /* Logical Destination */ +#define APIC_OFFSET_DFR 0xE0 /* Destination Format Register */ +#define APIC_OFFSET_SVR 0xF0 /* Spurious Vector Register */ +#define APIC_OFFSET_ISR0 0x100 /* In Service Register */ +#define APIC_OFFSET_ISR1 0x110 +#define APIC_OFFSET_ISR2 0x120 +#define APIC_OFFSET_ISR3 0x130 +#define APIC_OFFSET_ISR4 0x140 +#define APIC_OFFSET_ISR5 0x150 +#define APIC_OFFSET_ISR6 0x160 +#define APIC_OFFSET_ISR7 0x170 +#define APIC_OFFSET_TMR0 0x180 /* Trigger Mode Register */ +#define APIC_OFFSET_TMR1 0x190 +#define APIC_OFFSET_TMR2 0x1A0 +#define APIC_OFFSET_TMR3 0x1B0 +#define APIC_OFFSET_TMR4 0x1C0 +#define APIC_OFFSET_TMR5 0x1D0 +#define APIC_OFFSET_TMR6 0x1E0 +#define APIC_OFFSET_TMR7 0x1F0 +#define APIC_OFFSET_IRR0 0x200 /* Interrupt Request Register */ +#define APIC_OFFSET_IRR1 0x210 +#define APIC_OFFSET_IRR2 0x220 +#define APIC_OFFSET_IRR3 0x230 +#define APIC_OFFSET_IRR4 0x240 +#define APIC_OFFSET_IRR5 0x250 +#define APIC_OFFSET_IRR6 0x260 +#define APIC_OFFSET_IRR7 0x270 +#define APIC_OFFSET_ESR 0x280 /* Error Status Register */ +#define APIC_OFFSET_CMCI_LVT 0x2F0 /* Local Vector Table (CMCI) */ +#define APIC_OFFSET_ICR_LOW 0x300 /* Interrupt Command Register */ +#define APIC_OFFSET_ICR_HI 0x310 +#define APIC_OFFSET_TIMER_LVT 0x320 /* Local Vector Table (Timer) */ +#define APIC_OFFSET_THERM_LVT 0x330 /* Local Vector Table (Thermal) */ +#define APIC_OFFSET_PERF_LVT 0x340 /* Local Vector Table (PMC) */ +#define APIC_OFFSET_LINT0_LVT 0x350 /* Local Vector Table (LINT0) */ +#define APIC_OFFSET_LINT1_LVT 0x360 /* Local Vector Table (LINT1) */ +#define APIC_OFFSET_ERROR_LVT 0x370 /* Local Vector Table (ERROR) */ +#define APIC_OFFSET_TIMER_ICR 0x380 /* Timer's Initial Count */ +#define APIC_OFFSET_TIMER_CCR 0x390 /* Timer's Current Count */ +#define APIC_OFFSET_TIMER_DCR 0x3E0 /* Timer's Divide Configuration */ +#define APIC_OFFSET_SELF_IPI 0x3F0 /* Self IPI register */ + +#define VLAPIC_CTR0(vlapic, format) \ + VCPU_CTR0((vlapic)->vm, (vlapic)->vcpuid, format) + +#define VLAPIC_CTR1(vlapic, format, p1) \ + VCPU_CTR1((vlapic)->vm, (vlapic)->vcpuid, format, p1) + +#define VLAPIC_CTR2(vlapic, format, p1, p2) \ + VCPU_CTR2((vlapic)->vm, (vlapic)->vcpuid, format, p1, p2) + +#define VLAPIC_CTR3(vlapic, format, p1, p2, p3) \ + VCPU_CTR3((vlapic)->vm, (vlapic)->vcpuid, format, p1, p2, p3) + +#define VLAPIC_CTR_IRR(vlapic, msg) \ +do { \ + uint32_t *irrptr = &(vlapic)->apic_page->irr0; \ + irrptr[0] = irrptr[0]; /* silence compiler */ \ + VLAPIC_CTR1((vlapic), msg " irr0 0x%08x", irrptr[0 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr1 0x%08x", irrptr[1 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr2 0x%08x", irrptr[2 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr3 0x%08x", irrptr[3 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr4 0x%08x", irrptr[4 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr5 0x%08x", irrptr[5 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr6 0x%08x", irrptr[6 << 2]); \ + 
VLAPIC_CTR1((vlapic), msg " irr7 0x%08x", irrptr[7 << 2]); \ +} while (0) + +#define VLAPIC_CTR_ISR(vlapic, msg) \ +do { \ + uint32_t *isrptr = &(vlapic)->apic_page->isr0; \ + isrptr[0] = isrptr[0]; /* silence compiler */ \ + VLAPIC_CTR1((vlapic), msg " isr0 0x%08x", isrptr[0 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr1 0x%08x", isrptr[1 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr2 0x%08x", isrptr[2 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr3 0x%08x", isrptr[3 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr4 0x%08x", isrptr[4 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr5 0x%08x", isrptr[5 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr6 0x%08x", isrptr[6 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr7 0x%08x", isrptr[7 << 2]); \ +} while (0) + +enum boot_state { + BS_INIT, + BS_SIPI, + BS_RUNNING +}; + +/* + * 16 priority levels with at most one vector injected per level. + */ +#define ISRVEC_STK_SIZE (16 + 1) + +#define VLAPIC_MAXLVT_INDEX APIC_LVT_CMCI + +struct vlapic; + +struct vlapic_ops { + int (*set_intr_ready)(struct vlapic *vlapic, int vector, bool level); + int (*pending_intr)(struct vlapic *vlapic, int *vecptr); + void (*intr_accepted)(struct vlapic *vlapic, int vector); + void (*post_intr)(struct vlapic *vlapic, int hostcpu); + void (*set_tmr)(struct vlapic *vlapic, int vector, bool level); + void (*enable_x2apic_mode)(struct vlapic *vlapic); +}; + +struct vlapic { + struct vm *vm; + int vcpuid; + struct LAPIC *apic_page; + struct vlapic_ops ops; + + uint32_t esr_pending; + int esr_firing; + + struct callout callout; /* vlapic timer */ + struct bintime timer_fire_bt; /* callout expiry time */ + struct bintime timer_freq_bt; /* timer frequency */ + struct bintime timer_period_bt; /* timer period */ + struct mtx timer_mtx; + + /* + * The 'isrvec_stk' is a stack of vectors injected by the local apic. + * A vector is popped from the stack when the processor does an EOI. + * The vector on the top of the stack is used to compute the + * Processor Priority in conjunction with the TPR. + */ + uint8_t isrvec_stk[ISRVEC_STK_SIZE]; + int isrvec_stk_top; + + uint64_t msr_apicbase; + enum boot_state boot_state; + + /* + * Copies of some registers in the virtual APIC page. We do this for + * a couple of different reasons: + * - to be able to detect what changed (e.g. svr_last) + * - to maintain a coherent snapshot of the register (e.g. lvt_last) + */ + uint32_t svr_last; + uint32_t lvt_last[VLAPIC_MAXLVT_INDEX + 1]; +}; + +void vlapic_init(struct vlapic *vlapic); +void vlapic_cleanup(struct vlapic *vlapic); + +#endif /* _VLAPIC_PRIV_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/offsets.in b/usr/src/uts/i86pc/io/vmm/offsets.in new file mode 100644 index 0000000000..4b1fe1d6b6 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/offsets.in @@ -0,0 +1,72 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. 
+ */ + +#include +#include +#include +#include + +#include + +#include +#include "intel/vmx_cpufunc.h" +#include "intel/vmx.h" + +vmxctx + tmpstktop VMXCTX_TMPSTKTOP + guest_rdi VMXCTX_GUEST_RDI + guest_rsi VMXCTX_GUEST_RSI + guest_rdx VMXCTX_GUEST_RDX + guest_rcx VMXCTX_GUEST_RCX + guest_r8 VMXCTX_GUEST_R8 + guest_r9 VMXCTX_GUEST_R9 + guest_rax VMXCTX_GUEST_RAX + guest_rbx VMXCTX_GUEST_RBX + guest_rbp VMXCTX_GUEST_RBP + guest_r10 VMXCTX_GUEST_R10 + guest_r11 VMXCTX_GUEST_R11 + guest_r12 VMXCTX_GUEST_R12 + guest_r13 VMXCTX_GUEST_R13 + guest_r14 VMXCTX_GUEST_R14 + guest_r15 VMXCTX_GUEST_R15 + guest_cr2 VMXCTX_GUEST_CR2 + host_r15 VMXCTX_HOST_R15 + host_r14 VMXCTX_HOST_R14 + host_r13 VMXCTX_HOST_R13 + host_r12 VMXCTX_HOST_R12 + host_rbp VMXCTX_HOST_RBP + host_rsp VMXCTX_HOST_RSP + host_rbx VMXCTX_HOST_RBX + host_rip VMXCTX_HOST_RIP + launch_error VMXCTX_LAUNCH_ERROR + +vmx VMX_SIZE + +\#define VM_SUCCESS 0 +\#define VM_FAIL_INVALID 1 +\#define VM_FAIL_VALID 2 + +\#define VMX_RETURN_DIRECT 0 +\#define VMX_RETURN_LONGJMP 1 +\#define VMX_RETURN_VMRESUME 2 +\#define VMX_RETURN_VMLAUNCH 3 +\#define VMX_RETURN_AST 4 + +cpu + cpu_thread + +_kthread + t_lwp + _tu._ts._t_astflag T_ASTFLAG diff --git a/usr/src/uts/i86pc/io/vmm/vmm.c b/usr/src/uts/i86pc/io/vmm/vmm.c new file mode 100644 index 0000000000..7081368f4a --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm.c @@ -0,0 +1,1894 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/vmm.c 280929 2015-04-01 00:15:31Z tychon $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. 
+ */ + +#include +__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm.c 280929 2015-04-01 00:15:31Z tychon $"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include "vmm_ioport.h" +#include "vmm_ktr.h" +#include "vmm_host.h" +#include "vmm_mem.h" +#include "vmm_util.h" +#include "vatpic.h" +#include "vatpit.h" +#include "vhpet.h" +#include "vioapic.h" +#include "vlapic.h" +#include "vmm_ipi.h" +#include "vmm_stat.h" +#include "vmm_lapic.h" + +#ifdef __FreeBSD__ +#include "io/ppt.h" +#include "io/iommu.h" +#endif + +struct vhpet; +struct vioapic; +struct vlapic; + +struct vcpu { + int flags; + enum vcpu_state state; + struct mtx mtx; + int hostcpu; /* host cpuid this vcpu last ran on */ + struct vlapic *vlapic; + int vcpuid; + struct savefpu *guestfpu; /* guest fpu state */ + void *stats; + struct vm_exit exitinfo; + uint64_t nextrip; /* (x) next instruction to execute */ + enum x2apic_state x2apic_state; + uint64_t exitintinfo; + int nmi_pending; + int extint_pending; + struct vm_exception exception; + int exception_pending; +}; + +#define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN) +#define vcpu_lock(v) mtx_lock_spin(&((v)->mtx)) +#define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx)) +#define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED) + +#define VM_MAX_MEMORY_SEGMENTS 8 + +struct vm { + void *cookie; /* processor-specific data */ + void *iommu; /* iommu-specific data */ + struct vcpu vcpu[VM_MAXCPU]; + struct vhpet *vhpet; + struct vioapic *vioapic; /* virtual ioapic */ + struct vatpic *vatpic; /* virtual atpic */ + struct vatpit *vatpit; /* virtual atpit */ + int num_mem_segs; + struct vm_memory_segment mem_segs[VM_MAX_MEMORY_SEGMENTS]; + char name[VM_MAX_NAMELEN]; + + /* + * Set of active vcpus. + * An active vcpu is one that has been started implicitly (BSP) or + * explicitly (AP) by sending it a startup ipi. + */ + cpuset_t active_cpus; + + vm_rendezvous_func_t rendezvous_func; +}; + +static int vmm_initialized; + +static struct vmm_ops *ops; +#define VMM_INIT() (ops != NULL ? (*ops->init)() : 0) +#define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0) + +#define VMINIT(vm) (ops != NULL ? (*ops->vminit)(vm): NULL) +#define VMRUN(vmi, vcpu, rip) \ + (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip) : ENXIO) +#define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL) +#define VMMMAP_SET(vmi, gpa, hpa, len, attr, prot, spm) \ + (ops != NULL ? \ + (*ops->vmmmap_set)(vmi, gpa, hpa, len, attr, prot, spm) : \ + ENXIO) +#define VMMMAP_GET(vmi, gpa) \ + (ops != NULL ? (*ops->vmmmap_get)(vmi, gpa) : ENXIO) +#define VMGETREG(vmi, vcpu, num, retval) \ + (ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO) +#define VMSETREG(vmi, vcpu, num, val) \ + (ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO) +#define VMGETDESC(vmi, vcpu, num, desc) \ + (ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO) +#define VMSETDESC(vmi, vcpu, num, desc) \ + (ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO) +#define VMGETCAP(vmi, vcpu, num, retval) \ + (ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO) +#define VMSETCAP(vmi, vcpu, num, val) \ + (ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO) +#define VLAPIC_INIT(vmi, vcpu) \ + (ops != NULL ? 
(*ops->vlapic_init)(vmi, vcpu) : NULL) +#define VLAPIC_CLEANUP(vmi, vlapic) \ + (ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL) + +#define fpu_start_emulating() load_cr0(rcr0() | CR0_TS) +#define fpu_stop_emulating() clts() + +static MALLOC_DEFINE(M_VM, "vm", "vm"); + +/* statistics */ +static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); + +static int vmm_ipinum; +SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0, + "IPI vector used for vcpu notifications"); + +static void +vcpu_cleanup(struct vm *vm, int i) +{ + struct vcpu *vcpu = &vm->vcpu[i]; + + VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic); +#ifdef __FreeBSD__ + vmm_stat_free(vcpu->stats); +#endif + fpu_save_area_free(vcpu->guestfpu); +} + +static void +vcpu_init(struct vm *vm, uint32_t vcpu_id) +{ + struct vcpu *vcpu; + + vcpu = &vm->vcpu[vcpu_id]; + + vcpu_lock_init(vcpu); + vcpu->hostcpu = NOCPU; + vcpu->vcpuid = vcpu_id; + vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id); + vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED); + vcpu->exitintinfo = 0; + vcpu->guestfpu = fpu_save_area_alloc(); + fpu_save_area_reset(vcpu->guestfpu); +#ifdef __FreeBSD__ + vcpu->stats = vmm_stat_alloc(); +#endif +} + +struct vm_exit * +vm_exitinfo(struct vm *vm, int cpuid) +{ + struct vcpu *vcpu; + + if (cpuid < 0 || cpuid >= VM_MAXCPU) + panic("vm_exitinfo: invalid cpuid %d", cpuid); + + vcpu = &vm->vcpu[cpuid]; + + return (&vcpu->exitinfo); +} + +static int +vmm_init(void) +{ + int error; + +#ifndef __FreeBSD__ + vmm_sol_glue_init(); +#endif + + vmm_host_state_init(); +#ifdef __FreeBSD__ + vmm_ipi_init(); +#endif + + error = vmm_mem_init(); + if (error) + return (error); + + if (vmm_is_intel()) + ops = &vmm_ops_intel; + else if (vmm_is_amd()) + ops = &vmm_ops_amd; + else + return (ENXIO); + + return (VMM_INIT()); +} + +#ifdef __FreeBSD__ +static int +vmm_handler(module_t mod, int what, void *arg) +{ + int error; + + switch (what) { + case MOD_LOAD: + vmmdev_init(); + if (ppt_num_devices() > 0) + iommu_init(); + error = vmm_init(); + if (error == 0) + vmm_initialized = 1; + break; + case MOD_UNLOAD: + error = vmmdev_cleanup(); + if (error == 0) { +#ifndef __FreeBSD__ + vmm_sol_glue_cleanup(); +#endif + iommu_cleanup(); + vmm_ipi_cleanup(); + error = VMM_CLEANUP(); + /* + * Something bad happened - prevent new + * VMs from being created + */ + if (error) + vmm_initialized = 0; + } + break; + default: + error = 0; + break; + } + return (error); +} + +static moduledata_t vmm_kmod = { + "vmm", + vmm_handler, + NULL +}; + +/* + * vmm initialization has the following dependencies: + * + * - iommu initialization must happen after the pci passthru driver has had + * a chance to attach to any passthru devices (after SI_SUB_CONFIGURE). + * + * - VT-x initialization requires smp_rendezvous() and therefore must happen + * after SMP is fully functional (after SI_SUB_SMP). 
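+ *
+ * Both constraints are met by hanging the module off SI_SUB_SMP + 1 in the
+ * DECLARE_MODULE() below, since SI_SUB_SMP itself orders after
+ * SI_SUB_CONFIGURE.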
+ */ +DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY); +MODULE_VERSION(vmm, 1); + +SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL); +#else +int +vmm_mod_load() +{ + int error; + + vmmdev_init(); + error = vmm_init(); + if (error == 0) + vmm_initialized = 1; + + return (error); +} + +int +vmm_mod_unload() +{ + int error; + + error = vmmdev_cleanup(); + if (error) + return (error); + error = VMM_CLEANUP(); + if (error) + return (error); + vmm_initialized = 0; + + return (0); +} +#endif + +int +vm_create(const char *name, struct vm **retvm) +{ + int i; + struct vm *vm; + vm_paddr_t maxaddr; + + const int BSP = 0; + + /* + * If vmm.ko could not be successfully initialized then don't attempt + * to create the virtual machine. + */ + if (!vmm_initialized) + return (ENXIO); + + if (name == NULL || strlen(name) >= VM_MAX_NAMELEN) + return (EINVAL); + + vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO); + strcpy(vm->name, name); + vm->cookie = VMINIT(vm); + + vm->vioapic = vioapic_init(vm); + vm->vhpet = vhpet_init(vm); + vm->vatpic = vatpic_init(vm); + vm->vatpit = vatpit_init(vm); + + for (i = 0; i < VM_MAXCPU; i++) { + vcpu_init(vm, i); + } + +#ifdef __FreeBSD__ + maxaddr = vmm_mem_maxaddr(); + vm->iommu = iommu_create_domain(maxaddr); +#endif + + *retvm = vm; + return (0); +} + +static void +vm_free_mem_seg(struct vm *vm, struct vm_memory_segment *seg) +{ + size_t len; + vm_paddr_t hpa; + void *host_domain; + +#ifdef __FreeBSD__ + host_domain = iommu_host_domain(); +#endif + + len = 0; + while (len < seg->len) { + hpa = vm_gpa2hpa(vm, seg->gpa + len, PAGE_SIZE); + if (hpa == (vm_paddr_t)-1) { + panic("vm_free_mem_segs: cannot free hpa " + "associated with gpa 0x%016lx", seg->gpa + len); + } + +#ifdef __FreeBSD__ + /* + * Remove the 'gpa' to 'hpa' mapping in VMs domain. + * And resurrect the 1:1 mapping for 'hpa' in 'host_domain'. + */ + iommu_remove_mapping(vm->iommu, seg->gpa + len, PAGE_SIZE); + iommu_create_mapping(host_domain, hpa, hpa, PAGE_SIZE); +#endif + + vmm_mem_free(hpa, PAGE_SIZE); + + len += PAGE_SIZE; + } + +#ifdef __FreeBSD__ + /* + * Invalidate cached translations associated with 'vm->iommu' since + * we have now moved some pages from it. 
+ */ + iommu_invalidate_tlb(vm->iommu); +#endif + + bzero(seg, sizeof(struct vm_memory_segment)); +} + +void +vm_destroy(struct vm *vm) +{ + int i; + +#ifdef __FreeBSD__ + ppt_unassign_all(vm); +#endif + + for (i = 0; i < vm->num_mem_segs; i++) + vm_free_mem_seg(vm, &vm->mem_segs[i]); + + vm->num_mem_segs = 0; + + for (i = 0; i < VM_MAXCPU; i++) + vcpu_cleanup(vm, i); + + vatpit_cleanup(vm->vatpit); + vhpet_cleanup(vm->vhpet); + vatpic_cleanup(vm->vatpic); + vioapic_cleanup(vm->vioapic); + +#ifdef __FreeBSD__ + iommu_destroy_domain(vm->iommu); +#endif + + VMCLEANUP(vm->cookie); + + free(vm, M_VM); +} + +const char * +vm_name(struct vm *vm) +{ + return (vm->name); +} + +#ifdef __FreeBSD__ +int +vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) +{ + const boolean_t spok = TRUE; /* superpage mappings are ok */ + + return (VMMMAP_SET(vm->cookie, gpa, hpa, len, VM_MEMATTR_UNCACHEABLE, + VM_PROT_RW, spok)); +} + +int +vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) +{ + const boolean_t spok = TRUE; /* superpage mappings are ok */ + + return (VMMMAP_SET(vm->cookie, gpa, 0, len, 0, + VM_PROT_NONE, spok)); +} +#endif + +/* + * Returns TRUE if 'gpa' is available for allocation and FALSE otherwise + */ +static boolean_t +vm_gpa_available(struct vm *vm, vm_paddr_t gpa) +{ + int i; + vm_paddr_t gpabase, gpalimit; + + if (gpa & PAGE_MASK) + panic("vm_gpa_available: gpa (0x%016lx) not page aligned", gpa); + + for (i = 0; i < vm->num_mem_segs; i++) { + gpabase = vm->mem_segs[i].gpa; + gpalimit = gpabase + vm->mem_segs[i].len; + if (gpa >= gpabase && gpa < gpalimit) + return (FALSE); + } + + return (TRUE); +} + +int +vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len) +{ + int error, available, allocated; + struct vm_memory_segment *seg; + vm_paddr_t g, hpa; + void *host_domain; + + const boolean_t spok = TRUE; /* superpage mappings are ok */ + + if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0) + return (EINVAL); + + available = allocated = 0; + g = gpa; + while (g < gpa + len) { + if (vm_gpa_available(vm, g)) + available++; + else + allocated++; + + g += PAGE_SIZE; + } + + /* + * If there are some allocated and some available pages in the address + * range then it is an error. + */ + if (allocated && available) + return (EINVAL); + + /* + * If the entire address range being requested has already been + * allocated then there isn't anything more to do. + */ + if (allocated && available == 0) + return (0); + + if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS) + return (E2BIG); + +#ifdef __FreeBSD__ + host_domain = iommu_host_domain(); +#endif + + seg = &vm->mem_segs[vm->num_mem_segs]; + + error = 0; + seg->gpa = gpa; + seg->len = 0; + while (seg->len < len) { + hpa = vmm_mem_alloc(PAGE_SIZE); + if (hpa == 0) { + error = ENOMEM; + break; + } + + error = VMMMAP_SET(vm->cookie, gpa + seg->len, hpa, PAGE_SIZE, + VM_MEMATTR_WRITE_BACK, VM_PROT_ALL, spok); + if (error) + break; + +#ifdef __FreeBSD__ + /* + * Remove the 1:1 mapping for 'hpa' from the 'host_domain'. + * Add mapping for 'gpa + seg->len' to 'hpa' in the VMs domain. + */ + iommu_remove_mapping(host_domain, hpa, PAGE_SIZE); + iommu_create_mapping(vm->iommu, gpa + seg->len, hpa, PAGE_SIZE); +#endif + + seg->len += PAGE_SIZE; + } + + if (error) { + vm_free_mem_seg(vm, seg); + return (error); + } + +#ifdef __FreeBSD__ + /* + * Invalidate cached translations associated with 'host_domain' since + * we have now moved some pages from it. 
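+	 *
+	 * Without this flush a passthru device could keep using a stale
+	 * cached translation for a page that now belongs to the guest.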
+ */ + iommu_invalidate_tlb(host_domain); +#endif + + vm->num_mem_segs++; + + return (0); +} + +vm_paddr_t +vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t len) +{ + vm_paddr_t nextpage; + + nextpage = rounddown(gpa + PAGE_SIZE, PAGE_SIZE); + if (len > nextpage - gpa) + panic("vm_gpa2hpa: invalid gpa/len: 0x%016lx/%lu", gpa, len); + + return (VMMMAP_GET(vm->cookie, gpa)); +} + +void * +vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot, + void **cookie) +{ +#ifdef __FreeBSD__ + int count, pageoff; + vm_page_t m; + + pageoff = gpa & PAGE_MASK; + if (len > PAGE_SIZE - pageoff) + panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len); + + count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map, + trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1); + + if (count == 1) { + *cookie = m; + return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff)); + } else { + *cookie = NULL; + return (NULL); + } +#else + int pageoff; + vm_paddr_t hpa; + + pageoff = gpa & PAGE_MASK; + if (len > PAGE_SIZE - pageoff) + panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len); + + hpa = vm_gpa2hpa(vm, gpa, len); + if (hpa == (vm_paddr_t)-1) + return (NULL); + + return (hat_kpm_pfn2va(btop(hpa)) + pageoff); +#endif +} + +void +vm_gpa_release(void *cookie) +{ +#ifdef __FreeBSD__ + vm_page_t m = cookie; + + vm_page_lock(m); + vm_page_unhold(m); + vm_page_unlock(m); +#endif +} + +int +vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase, + struct vm_memory_segment *seg) +{ + int i; + + for (i = 0; i < vm->num_mem_segs; i++) { + if (gpabase == vm->mem_segs[i].gpa) { + *seg = vm->mem_segs[i]; + return (0); + } + } + return (-1); +} + +int +vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval) +{ + + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (reg >= VM_REG_LAST) + return (EINVAL); + + return (VMGETREG(vm->cookie, vcpu, reg, retval)); +} + +int +vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val) +{ + struct vcpu *vcpu; + int error; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + if (reg >= VM_REG_LAST) + return (EINVAL); + + error = VMSETREG(vm->cookie, vcpuid, reg, val); + if (error || reg != VM_REG_GUEST_RIP) + return (error); + + /* Set 'nextrip' to match the value of %rip */ + VCPU_CTR1(vm, vcpuid, "Setting nextrip to %#lx", val); + vcpu = &vm->vcpu[vcpuid]; + vcpu->nextrip = val; + return (0); +} + +static boolean_t +is_descriptor_table(int reg) +{ + + switch (reg) { + case VM_REG_GUEST_IDTR: + case VM_REG_GUEST_GDTR: + return (TRUE); + default: + return (FALSE); + } +} + +static boolean_t +is_segment_register(int reg) +{ + + switch (reg) { + case VM_REG_GUEST_ES: + case VM_REG_GUEST_CS: + case VM_REG_GUEST_SS: + case VM_REG_GUEST_DS: + case VM_REG_GUEST_FS: + case VM_REG_GUEST_GS: + case VM_REG_GUEST_TR: + case VM_REG_GUEST_LDTR: + return (TRUE); + default: + return (FALSE); + } +} + +int +vm_get_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *desc) +{ + + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (!is_segment_register(reg) && !is_descriptor_table(reg)) + return (EINVAL); + + return (VMGETDESC(vm->cookie, vcpu, reg, desc)); +} + +int +vm_set_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *desc) +{ + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (!is_segment_register(reg) && !is_descriptor_table(reg)) + return (EINVAL); + + return (VMSETDESC(vm->cookie, vcpu, reg, desc)); +} + +static void +restore_guest_fpustate(struct vcpu *vcpu) +{ + + /* flush host state to the pcb 
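+	 * so that the guest state loaded below does not clobber it;
+	 * fpu_stop_emulating() (a clts) then clears CR0.TS so that
+	 * fpurestore() itself does not trap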
*/ + fpuexit(curthread); + + /* restore guest FPU state */ + fpu_stop_emulating(); + fpurestore(vcpu->guestfpu); + + /* + * The FPU is now "dirty" with the guest's state so turn on emulation + * to trap any access to the FPU by the host. + */ + fpu_start_emulating(); +} + +static void +save_guest_fpustate(struct vcpu *vcpu) +{ + + if ((rcr0() & CR0_TS) == 0) + panic("fpu emulation not enabled in host!"); + + /* save guest FPU state */ + fpu_stop_emulating(); + fpusave(vcpu->guestfpu); + fpu_start_emulating(); +} + +static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle"); + +static int +vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, + bool from_idle) +{ + int error; + + vcpu_assert_locked(vcpu); + + /* + * State transitions from the vmmdev_ioctl() must always begin from + * the VCPU_IDLE state. This guarantees that there is only a single + * ioctl() operating on a vcpu at any point. + */ + if (from_idle) { + while (vcpu->state != VCPU_IDLE) + msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); + } else { + KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " + "vcpu idle state")); + } + + if (vcpu->state == VCPU_RUNNING) { + KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d " + "mismatch for running vcpu", curcpu, vcpu->hostcpu)); + } else { + KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a " + "vcpu that is not running", vcpu->hostcpu)); + } + + /* + * The following state transitions are allowed: + * IDLE -> FROZEN -> IDLE + * FROZEN -> RUNNING -> FROZEN + * FROZEN -> SLEEPING -> FROZEN + */ + switch (vcpu->state) { + case VCPU_IDLE: + case VCPU_RUNNING: + case VCPU_SLEEPING: + error = (newstate != VCPU_FROZEN); + break; + case VCPU_FROZEN: + error = (newstate == VCPU_FROZEN); + break; + default: + error = 1; + break; + } + + if (error) + return (EBUSY); + + vcpu->state = newstate; + if (newstate == VCPU_RUNNING) + vcpu->hostcpu = curcpu; + else + vcpu->hostcpu = NOCPU; + + if (newstate == VCPU_IDLE) + wakeup(&vcpu->state); + + return (0); +} + +static void +vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate) +{ + int error; + + if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0) + panic("Error %d setting state to %d\n", error, newstate); +} + +static void +vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate) +{ + int error; + + if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0) + panic("Error %d setting state to %d", error, newstate); +} + +/* + * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run. + */ +static int +vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu) +{ + struct vm_exit *vmexit; + struct vcpu *vcpu; + int t, timo, spindown; + + vcpu = &vm->vcpu[vcpuid]; + spindown = 0; + + vcpu_lock(vcpu); + + /* + * Do a final check for pending NMI or interrupts before + * really putting this thread to sleep. + * + * These interrupts could have happened any time after we + * returned from VMRUN() and before we grabbed the vcpu lock. + */ + if (vm->rendezvous_func == NULL && + !vm_nmi_pending(vm, vcpuid) && + (intr_disabled || !vlapic_pending_intr(vcpu->vlapic, NULL))) { + t = ticks; + vcpu_require_state_locked(vcpu, VCPU_SLEEPING); + if (vlapic_enabled(vcpu->vlapic)) { + /* + * XXX msleep_spin() is not interruptible so use the + * 'timo' to put an upper bound on the sleep time. 
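+			 * A bound of hz ticks (about one second, as set
+			 * just below) keeps the vcpu from sleeping forever
+			 * when no wakeup is ever posted.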
+ */ + timo = hz; + msleep_spin(vcpu, &vcpu->mtx, "vmidle", timo); + } else { + /* + * Spindown the vcpu if the apic is disabled and it + * had entered the halted state. + */ + spindown = 1; + } + vcpu_require_state_locked(vcpu, VCPU_FROZEN); + vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t); + } + vcpu_unlock(vcpu); + +#ifdef __FreeBSD__ + /* + * Since 'vm_deactivate_cpu()' grabs a sleep mutex we must call it + * outside the confines of the vcpu spinlock. + */ + if (spindown) { + *retu = true; + vmexit = vm_exitinfo(vm, vcpuid); + vmexit->exitcode = VM_EXITCODE_SPINDOWN_CPU; + vm_deactivate_cpu(vm, vcpuid); + VCPU_CTR0(vm, vcpuid, "spinning down cpu"); + } +#endif + + return (0); +} + +static int +vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu) +{ + struct vie *vie; + struct vcpu *vcpu; + struct vm_exit *vme; + uint64_t gla, gpa, cs_base; + struct vm_guest_paging *paging; + mem_region_read_t mread; + mem_region_write_t mwrite; + enum vm_cpu_mode cpu_mode; + int cs_d, error, length; + + vcpu = &vm->vcpu[vcpuid]; + vme = &vcpu->exitinfo; + + gla = vme->u.inst_emul.gla; + gpa = vme->u.inst_emul.gpa; + cs_base = vme->u.inst_emul.cs_base; + cs_d = vme->u.inst_emul.cs_d; + vie = &vme->u.inst_emul.vie; + paging = &vme->u.inst_emul.paging; + cpu_mode = paging->cpu_mode; + + VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %#lx", gpa); + + /* Fetch, decode and emulate the faulting instruction */ + if (vie->num_valid == 0) { + /* + * If the instruction length is not known then assume a + * maximum size instruction. + */ + length = vme->inst_length ? vme->inst_length : VIE_INST_SIZE; + error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip + + cs_base, length, vie); + } else { + /* + * The instruction bytes have already been copied into 'vie' + */ + error = 0; + } + if (error == 1) + return (0); /* Resume guest to handle page fault */ + else if (error == -1) + return (EFAULT); + else if (error != 0) + panic("%s: vmm_fetch_instruction error %d", __func__, error); + + if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0) + return (EFAULT); + + /* + * If the instruction length was not specified then update it now + * along with 'nextrip'. 
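+	 * The decoder recorded how many bytes it consumed in
+	 * 'vie->num_processed', which is precisely the length needed here.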
+ */ + if (vme->inst_length == 0) { + vme->inst_length = vie->num_processed; + vcpu->nextrip += vie->num_processed; + } + + /* return to userland unless this is an in-kernel emulated device */ + if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { + mread = lapic_mmio_read; + mwrite = lapic_mmio_write; + } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { + mread = vioapic_mmio_read; + mwrite = vioapic_mmio_write; + } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) { + mread = vhpet_mmio_read; + mwrite = vhpet_mmio_write; + } else { + *retu = true; + return (0); + } + + error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging, + mread, mwrite, retu); + + return (error); +} + +int +vm_run(struct vm *vm, struct vm_run *vmrun) +{ + int error, vcpuid; + struct vcpu *vcpu; + struct pcb *pcb; + uint64_t tscval; + struct vm_exit *vme; + bool retu, intr_disabled; + + vcpuid = vmrun->cpuid; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + vme = &vcpu->exitinfo; +restart: + critical_enter(); + + tscval = rdtsc(); + +#ifdef __FreeBSD__ + pcb = PCPU_GET(curpcb); + set_pcb_flags(pcb, PCB_FULL_IRET); +#endif + +#ifndef __FreeBSD__ + installctx(curthread, vcpu, save_guest_fpustate, + restore_guest_fpustate, NULL, NULL, NULL, NULL); +#endif + restore_guest_fpustate(vcpu); + + vcpu_require_state(vm, vcpuid, VCPU_RUNNING); + error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip); + vcpu_require_state(vm, vcpuid, VCPU_FROZEN); + + save_guest_fpustate(vcpu); +#ifndef __FreeBSD__ + removectx(curthread, vcpu, save_guest_fpustate, + restore_guest_fpustate, NULL, NULL, NULL, NULL); +#endif + + vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval); + + critical_exit(); + + if (error == 0) { + retu = false; + vcpu->nextrip = vme->rip + vme->inst_length; + switch (vme->exitcode) { + case VM_EXITCODE_HLT: + intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); + error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu); + break; + case VM_EXITCODE_INST_EMUL: + error = vm_handle_inst_emul(vm, vcpuid, &retu); + break; + case VM_EXITCODE_INOUT: + case VM_EXITCODE_INOUT_STR: + error = vm_handle_inout(vm, vcpuid, vme, &retu); + break; + default: + retu = true; /* handled in userland */ + break; + } + } + + if (error == 0 && retu == false) { + goto restart; + } + + /* copy the exit information */ + bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit)); + return (error); +} + +int +vm_restart_instruction(void *arg, int vcpuid) +{ + struct vm *vm; + struct vcpu *vcpu; + enum vcpu_state state; + uint64_t rip; + int error; + + vm = arg; + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + state = vcpu_get_state(vm, vcpuid, NULL); + if (state == VCPU_RUNNING) { + /* + * When a vcpu is "running" the next instruction is determined + * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'. + * Thus setting 'inst_length' to zero will cause the current + * instruction to be restarted. + */ + vcpu->exitinfo.inst_length = 0; + VCPU_CTR1(vm, vcpuid, "restarting instruction at %#lx by " + "setting inst_length to zero", vcpu->exitinfo.rip); + } else if (state == VCPU_FROZEN) { + /* + * When a vcpu is "frozen" it is outside the critical section + * around VMRUN() and 'nextrip' points to the next instruction. + * Thus instruction restart is achieved by setting 'nextrip' + * to the vcpu's %rip. 
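+		 *
+		 * An illustrative caller (hypothetical, not part of this
+		 * change):
+		 *
+		 *	if (exit_handler_wants_retry)
+		 *		(void) vm_restart_instruction(vm, vcpuid);
+		 *
+		 * after which the next VMRUN() resumes at the faulting %rip.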
+ */ + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip); + KASSERT(!error, ("%s: error %d getting rip", __func__, error)); + VCPU_CTR2(vm, vcpuid, "restarting instruction by updating " + "nextrip from %#lx to %#lx", vcpu->nextrip, rip); + vcpu->nextrip = rip; + } else { + panic("%s: invalid state %d", __func__, state); + } + return (0); +} + +int +vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info) +{ + struct vcpu *vcpu; + int type, vector; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + + if (info & VM_INTINFO_VALID) { + type = info & VM_INTINFO_TYPE; + vector = info & 0xff; + if (type == VM_INTINFO_NMI && vector != IDT_NMI) + return (EINVAL); + if (type == VM_INTINFO_HWEXCEPTION && vector >= 32) + return (EINVAL); + if (info & VM_INTINFO_RSVD) + return (EINVAL); + } else { + info = 0; + } + VCPU_CTR2(vm, vcpuid, "%s: info1(%#lx)", __func__, info); + vcpu->exitintinfo = info; + return (0); +} + +enum exc_class { + EXC_BENIGN, + EXC_CONTRIBUTORY, + EXC_PAGEFAULT +}; + +#define IDT_VE 20 /* Virtualization Exception (Intel specific) */ + +static enum exc_class +exception_class(uint64_t info) +{ + int type, vector; + +#ifdef __FreeBSD__ + KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info)); +#else + KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %lx", info)); +#endif + type = info & VM_INTINFO_TYPE; + vector = info & 0xff; + + /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */ + switch (type) { + case VM_INTINFO_HWINTR: + case VM_INTINFO_SWINTR: + case VM_INTINFO_NMI: + return (EXC_BENIGN); + default: + /* + * Hardware exception. + * + * SVM and VT-x use identical type values to represent NMI, + * hardware interrupt and software interrupt. + * + * SVM uses type '3' for all exceptions. VT-x uses type '3' + * for exceptions except #BP and #OF. #BP and #OF use a type + * value of '5' or '6'. Therefore we don't check for explicit + * values of 'type' to classify 'intinfo' into a hardware + * exception. + */ + break; + } + + switch (vector) { + case IDT_PF: + case IDT_VE: + return (EXC_PAGEFAULT); + case IDT_DE: + case IDT_TS: + case IDT_NP: + case IDT_SS: + case IDT_GP: + return (EXC_CONTRIBUTORY); + default: + return (EXC_BENIGN); + } +} + +static int +nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2, + uint64_t *retinfo) +{ + enum exc_class exc1, exc2; + int type1, vector1; + +#ifdef __FreeBSD__ + KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1)); + KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2)); +#else + KASSERT(info1 & VM_INTINFO_VALID, ("info1 %lx is not valid", info1)); + KASSERT(info2 & VM_INTINFO_VALID, ("info2 %lx is not valid", info2)); +#endif + + /* + * If an exception occurs while attempting to call the double-fault + * handler the processor enters shutdown mode (aka triple fault). + */ + type1 = info1 & VM_INTINFO_TYPE; + vector1 = info1 & 0xff; + if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) { + VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)", + info1, info2); +#ifdef __FreeBSD__ + vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT); +#endif + *retinfo = 0; + return (0); + } + + /* + * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3 + */ + exc1 = exception_class(info1); + exc2 = exception_class(info2); + if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) || + (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) { + /* Convert nested fault into a double fault. 
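+		 * #DF is delivered as a hardware exception with an error
+		 * code of zero, which is what the bits assembled below
+		 * encode.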
*/ + *retinfo = IDT_DF; + *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; + *retinfo |= VM_INTINFO_DEL_ERRCODE; + } else { + /* Handle exceptions serially */ + *retinfo = info2; + } + return (1); +} + +static uint64_t +vcpu_exception_intinfo(struct vcpu *vcpu) +{ + uint64_t info = 0; + + if (vcpu->exception_pending) { + info = vcpu->exception.vector & 0xff; + info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; + if (vcpu->exception.error_code_valid) { + info |= VM_INTINFO_DEL_ERRCODE; + info |= (uint64_t)vcpu->exception.error_code << 32; + } + } + return (info); +} + +int +vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo) +{ + struct vcpu *vcpu; + uint64_t info1, info2; + int valid; + + KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid)); + + vcpu = &vm->vcpu[vcpuid]; + + info1 = vcpu->exitintinfo; + vcpu->exitintinfo = 0; + + info2 = 0; + if (vcpu->exception_pending) { + info2 = vcpu_exception_intinfo(vcpu); + vcpu->exception_pending = 0; + VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx", + vcpu->exception.vector, info2); + } + + if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) { + valid = nested_fault(vm, vcpuid, info1, info2, retinfo); + } else if (info1 & VM_INTINFO_VALID) { + *retinfo = info1; + valid = 1; + } else if (info2 & VM_INTINFO_VALID) { + *retinfo = info2; + valid = 1; + } else { + valid = 0; + } + + if (valid) { + VCPU_CTR4(vm, vcpuid, "%s: info1(%#lx), info2(%#lx), " + "retinfo(%#lx)", __func__, info1, info2, *retinfo); + } + + return (valid); +} + +int +vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + if (exception->vector < 0 || exception->vector >= 32) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + + if (vcpu->exception_pending) { + VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to " + "pending exception %d", exception->vector, + vcpu->exception.vector); + return (EBUSY); + } + + vcpu->exception_pending = 1; + vcpu->exception = *exception; + VCPU_CTR1(vm, vcpuid, "Exception %d pending", exception->vector); + return (0); +} + +int +vm_exception_pending(struct vm *vm, int vcpuid, struct vm_exception *exception) +{ + struct vcpu *vcpu; + int pending; + + KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid)); + + vcpu = &vm->vcpu[vcpuid]; + pending = vcpu->exception_pending; + if (pending) { + vcpu->exception_pending = 0; + *exception = vcpu->exception; + VCPU_CTR1(vm, vcpuid, "Exception %d delivered", + exception->vector); + } + return (pending); +} + +void +vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid, + int errcode) +{ + struct vm_exception exception; + struct vm_exit *vmexit; + struct vm *vm; + int error; + + vm = vmarg; + + exception.vector = vector; + exception.error_code = errcode; + exception.error_code_valid = errcode_valid; + error = vm_inject_exception(vm, vcpuid, &exception); + KASSERT(error == 0, ("vm_inject_exception error %d", error)); + + /* + * A fault-like exception allows the instruction to be restarted + * after the exception handler returns. + * + * By setting the inst_length to 0 we ensure that the instruction + * pointer remains at the faulting instruction. 
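+	 *
+	 * vm_inject_pf() below builds on this: it loads %cr2 with the
+	 * faulting address and then injects #PF through this function.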
+ */ + vmexit = vm_exitinfo(vm, vcpuid); + vmexit->inst_length = 0; +} + +void +vm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2) +{ + struct vm *vm; + int error; + + vm = vmarg; + VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx", + error_code, cr2); + + error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2); + KASSERT(error == 0, ("vm_set_register(cr2) error %d", error)); + + vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code); +} + +static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); + +int +vm_inject_nmi(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + + vcpu->nmi_pending = 1; + vcpu_notify_event(vm, vcpuid, false); + + return (0); +} + +int +vm_nmi_pending(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + return (vcpu->nmi_pending); +} + +void +vm_nmi_clear(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + if (vcpu->nmi_pending == 0) + panic("vm_nmi_clear: inconsistent nmi_pending state"); + + vcpu->nmi_pending = 0; + vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1); +} + +static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu"); + +int +vm_inject_extint(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + + vcpu->extint_pending = 1; + vcpu_notify_event(vm, vcpuid, false); + + return (0); +} + +int +vm_extint_pending(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vm_extint_pending: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + return (vcpu->extint_pending); +} + +void +vm_extint_clear(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vm_extint_pending: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + if (vcpu->extint_pending == 0) + panic("vm_extint_clear: inconsistent extint_pending state"); + + vcpu->extint_pending = 0; + vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1); +} + +int +vm_get_capability(struct vm *vm, int vcpu, int type, int *retval) +{ + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (type < 0 || type >= VM_CAP_MAX) + return (EINVAL); + + return (VMGETCAP(vm->cookie, vcpu, type, retval)); +} + +int +vm_set_capability(struct vm *vm, int vcpu, int type, int val) +{ + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (type < 0 || type >= VM_CAP_MAX) + return (EINVAL); + + return (VMSETCAP(vm->cookie, vcpu, type, val)); +} + +struct vhpet * +vm_hpet(struct vm *vm) +{ + return (vm->vhpet); +} + +struct vioapic * +vm_ioapic(struct vm *vm) +{ + return (vm->vioapic); +} + +struct vlapic * +vm_lapic(struct vm *vm, int cpu) +{ + return (vm->vcpu[cpu].vlapic); +} + +#ifdef __FreeBSD__ +boolean_t +vmm_is_pptdev(int bus, int slot, int func) +{ + int found, i, n; + int b, s, f; + char *val, *cp, *cp2; + + /* + * XXX + * The length of an environment variable is limited to 128 bytes which + * puts an upper limit on the number of passthru devices that may be + * specified using a single environment variable. 
+ * + * Work around this by scanning multiple environment variable + * names instead of a single one - yuck! + */ + const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL }; + + /* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */ + found = 0; + for (i = 0; names[i] != NULL && !found; i++) { + cp = val = getenv(names[i]); + while (cp != NULL && *cp != '\0') { + if ((cp2 = strchr(cp, ' ')) != NULL) + *cp2 = '\0'; + + n = sscanf(cp, "%d/%d/%d", &b, &s, &f); + if (n == 3 && bus == b && slot == s && func == f) { + found = 1; + break; + } + + if (cp2 != NULL) + *cp2++ = ' '; + + cp = cp2; + } + freeenv(val); + } + return (found); +} +#endif + +void * +vm_iommu_domain(struct vm *vm) +{ + + return (vm->iommu); +} + +int +vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate, + bool from_idle) +{ + int error; + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vm_set_run_state: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + error = vcpu_set_state_locked(vcpu, newstate, from_idle); + vcpu_unlock(vcpu); + + return (error); +} + +enum vcpu_state +vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu) +{ + struct vcpu *vcpu; + enum vcpu_state state; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vm_get_run_state: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + state = vcpu->state; + if (hostcpu != NULL) + *hostcpu = vcpu->hostcpu; + vcpu_unlock(vcpu); + + return (state); +} + +int +vm_activate_cpu(struct vm *vm, int vcpuid) +{ + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + if (CPU_ISSET(vcpuid, &vm->active_cpus)) + return (EBUSY); + + VCPU_CTR0(vm, vcpuid, "activated"); + CPU_SET_ATOMIC(vcpuid, &vm->active_cpus); + return (0); +} + +cpuset_t +vm_active_cpus(struct vm *vm) +{ + + return (vm->active_cpus); +} + +void * +vcpu_stats(struct vm *vm, int vcpuid) +{ + + return (vm->vcpu[vcpuid].stats); +} + +int +vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state) +{ + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + *state = vm->vcpu[vcpuid].x2apic_state; + + return (0); +} + +int +vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) +{ + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + if (state >= X2APIC_STATE_LAST) + return (EINVAL); + + vm->vcpu[vcpuid].x2apic_state = state; + + vlapic_set_x2apic_state(vm, vcpuid, state); + + return (0); +} + +/* + * This function is called to ensure that a vcpu "sees" a pending event + * as soon as possible: + * - If the vcpu thread is sleeping then it is woken up. + * - If the vcpu is running on a different host_cpu then an IPI will be directed + * to the host_cpu to cause the vcpu to trap into the hypervisor. + */ +void +vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr) +{ + int hostcpu; + struct vcpu *vcpu; + + vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + hostcpu = vcpu->hostcpu; + if (vcpu->state == VCPU_RUNNING) { + KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); + if (hostcpu != curcpu) { + if (lapic_intr) { + vlapic_post_intr(vcpu->vlapic, hostcpu, + vmm_ipinum); + } else { + ipi_cpu(hostcpu, vmm_ipinum); + } + } else { + /* + * If the 'vcpu' is running on 'curcpu' then it must + * be sending a notification to itself (e.g. SELF_IPI). + * The pending event will be picked up when the vcpu + * transitions back to guest context. 
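+			 *
+			 * The vcpu lock is held across this entire check,
+			 * so the vcpu cannot change state while the
+			 * notification is being decided.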
+ */ + } + } else { + KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent " + "with hostcpu %d", vcpu->state, hostcpu)); + if (vcpu->state == VCPU_SLEEPING) + wakeup_one(vcpu); + } + vcpu_unlock(vcpu); +} + +int +vm_apicid2vcpuid(struct vm *vm, int apicid) +{ + /* + * XXX apic id is assumed to be numerically identical to vcpu id + */ + return (apicid); +} + +struct vatpic * +vm_atpic(struct vm *vm) +{ + return (vm->vatpic); +} + +struct vatpit * +vm_atpit(struct vm *vm) +{ + return (vm->vatpit); +} + +enum vm_reg_name +vm_segment_name(int seg) +{ + static enum vm_reg_name seg_names[] = { + VM_REG_GUEST_ES, + VM_REG_GUEST_CS, + VM_REG_GUEST_SS, + VM_REG_GUEST_DS, + VM_REG_GUEST_FS, + VM_REG_GUEST_GS + }; + + KASSERT(seg >= 0 && seg < nitems(seg_names), + ("%s: invalid segment encoding %d", __func__, seg)); + return (seg_names[seg]); +} + +void +vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, + int num_copyinfo) +{ + int idx; + +#ifdef __FreeBSD__ + for (idx = 0; idx < num_copyinfo; idx++) { + if (copyinfo[idx].cookie != NULL) + vm_gpa_release(copyinfo[idx].cookie); + } +#endif + bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo)); +} + +int +vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, + int num_copyinfo) +{ + int error, idx, nused; + size_t n, off, remaining; + void *hva, *cookie; + uint64_t gpa; + + bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo); + + nused = 0; + remaining = len; + while (remaining > 0) { + KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo")); + error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa); + if (error) + return (error); + off = gpa & PAGE_MASK; + n = min(remaining, PAGE_SIZE - off); + copyinfo[nused].gpa = gpa; + copyinfo[nused].len = n; + remaining -= n; + gla += n; + nused++; + } + + for (idx = 0; idx < nused; idx++) { + hva = vm_gpa_hold(vm, copyinfo[idx].gpa, copyinfo[idx].len, + prot, &cookie); + if (hva == NULL) + break; + copyinfo[idx].hva = hva; + copyinfo[idx].cookie = cookie; + } + + if (idx != nused) { + vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo); + return (-1); + } else { + return (0); + } +} + +void +vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr, + size_t len) +{ + char *dst; + int idx; + + dst = kaddr; + idx = 0; + while (len > 0) { + bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len); + len -= copyinfo[idx].len; + dst += copyinfo[idx].len; + idx++; + } +} + +void +vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, + struct vm_copyinfo *copyinfo, size_t len) +{ + const char *src; + int idx; + + src = kaddr; + idx = 0; + while (len > 0) { + bcopy(src, copyinfo[idx].hva, copyinfo[idx].len); + len -= copyinfo[idx].len; + src += copyinfo[idx].len; + idx++; + } +} diff --git a/usr/src/uts/i86pc/io/vmm/vmm.conf b/usr/src/uts/i86pc/io/vmm/vmm.conf new file mode 100644 index 0000000000..8833076014 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm.conf @@ -0,0 +1 @@ +name="vmm" parent="pseudo"; diff --git a/usr/src/uts/i86pc/io/vmm/vmm_host.c b/usr/src/uts/i86pc/io/vmm/vmm_host.c new file mode 100644 index 0000000000..b94caf4009 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_host.c @@ -0,0 +1,160 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/vmm/vmm_host.c 242275 2012-10-29 01:51:24Z neel $
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_host.c 242275 2012-10-29 01:51:24Z neel $");
+
+#include <sys/param.h>
+#include <sys/pcpu.h>
+
+#include <machine/cpufunc.h>
+#include <machine/segments.h>
+#include <machine/specialreg.h>
+
+#include "vmm_host.h"
+
+static uint64_t vmm_host_efer, vmm_host_pat, vmm_host_cr0, vmm_host_cr4;
+
+void
+vmm_host_state_init(void)
+{
+
+	vmm_host_efer = rdmsr(MSR_EFER);
+	vmm_host_pat = rdmsr(MSR_PAT);
+
+	/*
+	 * We always want CR0.TS to be set when the processor does a VM exit.
+	 *
+	 * With emulation turned on unconditionally after a VM exit, we are
+	 * able to trap inadvertent use of the FPU until the guest FPU state
+	 * has been safely squirreled away.
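+	 *
+	 * A sketch of how this value is consumed (the write routine named
+	 * here is illustrative of the VMX backend, not defined in this
+	 * file): the host state that the CPU reloads on every VM exit is
+	 * programmed with something like
+	 *
+	 *	vmwrite(VMCS_HOST_CR0, vmm_get_host_cr0());
+	 *
+	 * so the first host FPU access after an exit traps, and the saved
+	 * state can be restored lazily.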
+ */ + vmm_host_cr0 = rcr0() | CR0_TS; + + vmm_host_cr4 = rcr4(); +} + +uint64_t +vmm_get_host_pat(void) +{ + + return (vmm_host_pat); +} + +uint64_t +vmm_get_host_efer(void) +{ + + return (vmm_host_efer); +} + +uint64_t +vmm_get_host_cr0(void) +{ + + return (vmm_host_cr0); +} + +uint64_t +vmm_get_host_cr4(void) +{ + + return (vmm_host_cr4); +} + +uint64_t +vmm_get_host_datasel(void) +{ + +#ifdef __FreeBSD__ + return (GSEL(GDATA_SEL, SEL_KPL)); +#else + return (SEL_GDT(GDT_KDATA, SEL_KPL)); +#endif + +} + +uint64_t +vmm_get_host_codesel(void) +{ + +#ifdef __FreeBSD__ + return (GSEL(GCODE_SEL, SEL_KPL)); +#else + return (SEL_GDT(GDT_KCODE, SEL_KPL)); +#endif +} + + +uint64_t +vmm_get_host_tsssel(void) +{ + +#ifdef __FreeBSD__ + return (GSEL(GPROC0_SEL, SEL_KPL)); +#else + return (SEL_GDT(GDT_KTSS, SEL_KPL)); +#endif +} + +uint64_t +vmm_get_host_fsbase(void) +{ + +#ifdef __FreeBSD__ + return (0); +#else + return (rdmsr(MSR_FSBASE)); +#endif +} + +uint64_t +vmm_get_host_idtrbase(void) +{ + +#ifdef __FreeBSD__ + return (r_idt.rd_base); +#else + desctbr_t idtr; + + rd_idtr(&idtr); + return (idtr.dtr_base); +#endif +} diff --git a/usr/src/uts/i86pc/io/vmm/vmm_host.h b/usr/src/uts/i86pc/io/vmm/vmm_host.h new file mode 100644 index 0000000000..5de015a228 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_host.h @@ -0,0 +1,119 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/vmm_host.h 242275 2012-10-29 01:51:24Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2013 Pluribus Networks Inc. 
+ */ + +#ifndef _VMM_HOST_H_ +#define _VMM_HOST_H_ + +#ifndef __FreeBSD__ +#include +#endif + +#ifndef _KERNEL +#error "no user-servicable parts inside" +#endif + +void vmm_host_state_init(void); + +uint64_t vmm_get_host_pat(void); +uint64_t vmm_get_host_efer(void); +uint64_t vmm_get_host_cr0(void); +uint64_t vmm_get_host_cr4(void); +uint64_t vmm_get_host_datasel(void); +uint64_t vmm_get_host_codesel(void); +uint64_t vmm_get_host_tsssel(void); +uint64_t vmm_get_host_fsbase(void); +uint64_t vmm_get_host_idtrbase(void); + +/* + * Inline access to host state that is used on every VM entry + */ +static __inline uint64_t +vmm_get_host_trbase(void) +{ + +#ifdef __FreeBSD__ + return ((uint64_t)PCPU_GET(tssp)); +#else + return ((u_long)CPU->cpu_tss); +#endif +} + +static __inline uint64_t +vmm_get_host_gdtrbase(void) +{ + +#ifdef __FreeBSD__ + return ((uint64_t)&gdt[NGDT * curcpu]); +#else + desctbr_t gdtr; + + rd_gdtr(&gdtr); + return (gdtr.dtr_base); +#endif +} + +struct pcpu; +extern struct pcpu __pcpu[]; + +static __inline uint64_t +vmm_get_host_gsbase(void) +{ + +#ifdef __FreeBSD__ + return ((uint64_t)&__pcpu[curcpu]); +#else + return (rdmsr(MSR_GSBASE)); +#endif +} + +#ifndef __FreeBSD__ +static __inline uint64_t +vmm_get_host_fssel(void) +{ + return (KFS_SEL); +} + +static __inline uint64_t +vmm_get_host_gssel(void) +{ + return (KGS_SEL); +} +#endif +#endif diff --git a/usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c b/usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c new file mode 100644 index 0000000000..72c7056e26 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c @@ -0,0 +1,2370 @@ +/*- + * Copyright (c) 2012 Sandvine, Inc. + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/vmm_instruction_emul.c 281987 2015-04-25 19:02:06Z tychon $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. 
+ * + * Copyright 2015 Pluribus Networks Inc. + */ + +#include +__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_instruction_emul.c 281987 2015-04-25 19:02:06Z tychon $"); + +#ifdef _KERNEL +#include +#include +#include + +#include +#include + +#include +#include +#else /* !_KERNEL */ +#include +#include +#include + +#include + +#include +#include +#define KASSERT(exp,msg) assert((exp)) +#endif /* _KERNEL */ + +#include +#include +#include + +/* struct vie_op.op_type */ +enum { + VIE_OP_TYPE_NONE = 0, + VIE_OP_TYPE_MOV, + VIE_OP_TYPE_MOVSX, + VIE_OP_TYPE_MOVZX, + VIE_OP_TYPE_AND, + VIE_OP_TYPE_OR, + VIE_OP_TYPE_SUB, + VIE_OP_TYPE_TWO_BYTE, + VIE_OP_TYPE_PUSH, + VIE_OP_TYPE_CMP, + VIE_OP_TYPE_POP, + VIE_OP_TYPE_MOVS, + VIE_OP_TYPE_GROUP1, + VIE_OP_TYPE_STOS, + VIE_OP_TYPE_LAST +}; + +/* struct vie_op.op_flags */ +#define VIE_OP_F_IMM (1 << 0) /* 16/32-bit immediate operand */ +#define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */ +#define VIE_OP_F_MOFFSET (1 << 2) /* 16/32/64-bit immediate moffset */ +#define VIE_OP_F_NO_MODRM (1 << 3) +#define VIE_OP_F_NO_GLA_VERIFICATION (1 << 4) + +static const struct vie_op two_byte_opcodes[256] = { + [0xB6] = { + .op_byte = 0xB6, + .op_type = VIE_OP_TYPE_MOVZX, + }, + [0xB7] = { + .op_byte = 0xB7, + .op_type = VIE_OP_TYPE_MOVZX, + }, + [0xBE] = { + .op_byte = 0xBE, + .op_type = VIE_OP_TYPE_MOVSX, + }, +}; + +static const struct vie_op one_byte_opcodes[256] = { + [0x0F] = { + .op_byte = 0x0F, + .op_type = VIE_OP_TYPE_TWO_BYTE + }, + [0x2B] = { + .op_byte = 0x2B, + .op_type = VIE_OP_TYPE_SUB, + }, + [0x3B] = { + .op_byte = 0x3B, + .op_type = VIE_OP_TYPE_CMP, + }, + [0x88] = { + .op_byte = 0x88, + .op_type = VIE_OP_TYPE_MOV, + }, + [0x89] = { + .op_byte = 0x89, + .op_type = VIE_OP_TYPE_MOV, + }, + [0x8A] = { + .op_byte = 0x8A, + .op_type = VIE_OP_TYPE_MOV, + }, + [0x8B] = { + .op_byte = 0x8B, + .op_type = VIE_OP_TYPE_MOV, + }, + [0xA1] = { + .op_byte = 0xA1, + .op_type = VIE_OP_TYPE_MOV, + .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, + }, + [0xA3] = { + .op_byte = 0xA3, + .op_type = VIE_OP_TYPE_MOV, + .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, + }, + [0xA4] = { + .op_byte = 0xA4, + .op_type = VIE_OP_TYPE_MOVS, + .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION + }, + [0xA5] = { + .op_byte = 0xA5, + .op_type = VIE_OP_TYPE_MOVS, + .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION + }, + [0xAA] = { + .op_byte = 0xAA, + .op_type = VIE_OP_TYPE_STOS, + .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION + }, + [0xAB] = { + .op_byte = 0xAB, + .op_type = VIE_OP_TYPE_STOS, + .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION + }, + [0xC6] = { + /* XXX Group 11 extended opcode - not just MOV */ + .op_byte = 0xC6, + .op_type = VIE_OP_TYPE_MOV, + .op_flags = VIE_OP_F_IMM8, + }, + [0xC7] = { + .op_byte = 0xC7, + .op_type = VIE_OP_TYPE_MOV, + .op_flags = VIE_OP_F_IMM, + }, + [0x23] = { + .op_byte = 0x23, + .op_type = VIE_OP_TYPE_AND, + }, + [0x81] = { + /* XXX Group 1 extended opcode */ + .op_byte = 0x81, + .op_type = VIE_OP_TYPE_GROUP1, + .op_flags = VIE_OP_F_IMM, + }, + [0x83] = { + /* XXX Group 1 extended opcode */ + .op_byte = 0x83, + .op_type = VIE_OP_TYPE_GROUP1, + .op_flags = VIE_OP_F_IMM8, + }, + [0x8F] = { + /* XXX Group 1A extended opcode - not just POP */ + .op_byte = 0x8F, + .op_type = VIE_OP_TYPE_POP, + }, + [0xFF] = { + /* XXX Group 5 extended opcode - not just PUSH */ + .op_byte = 0xFF, + .op_type = VIE_OP_TYPE_PUSH, + } +}; + +/* struct vie.mod */ +#define VIE_MOD_INDIRECT 0 +#define 
VIE_MOD_INDIRECT_DISP8 1 +#define VIE_MOD_INDIRECT_DISP32 2 +#define VIE_MOD_DIRECT 3 + +/* struct vie.rm */ +#define VIE_RM_SIB 4 +#define VIE_RM_DISP32 5 + +#define GB (1024 * 1024 * 1024) + +static enum vm_reg_name gpr_map[16] = { + VM_REG_GUEST_RAX, + VM_REG_GUEST_RCX, + VM_REG_GUEST_RDX, + VM_REG_GUEST_RBX, + VM_REG_GUEST_RSP, + VM_REG_GUEST_RBP, + VM_REG_GUEST_RSI, + VM_REG_GUEST_RDI, + VM_REG_GUEST_R8, + VM_REG_GUEST_R9, + VM_REG_GUEST_R10, + VM_REG_GUEST_R11, + VM_REG_GUEST_R12, + VM_REG_GUEST_R13, + VM_REG_GUEST_R14, + VM_REG_GUEST_R15 +}; + +static uint64_t size2mask[] = { + [1] = 0xff, + [2] = 0xffff, + [4] = 0xffffffff, + [8] = 0xffffffffffffffff, +}; + +static int +vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval) +{ + int error; + + error = vm_get_register(vm, vcpuid, reg, rval); + + return (error); +} + +static void +vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr) +{ + *lhbr = 0; + *reg = gpr_map[vie->reg]; + + /* + * 64-bit mode imposes limitations on accessing legacy high byte + * registers (lhbr). + * + * The legacy high-byte registers cannot be addressed if the REX + * prefix is present. In this case the values 4, 5, 6 and 7 of the + * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively. + * + * If the REX prefix is not present then the values 4, 5, 6 and 7 + * of the 'ModRM:reg' field address the legacy high-byte registers, + * %ah, %ch, %dh and %bh respectively. + */ + if (!vie->rex_present) { + if (vie->reg & 0x4) { + *lhbr = 1; + *reg = gpr_map[vie->reg & 0x3]; + } + } +} + +static int +vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval) +{ + uint64_t val; + int error, lhbr; + enum vm_reg_name reg; + + vie_calc_bytereg(vie, ®, &lhbr); + error = vm_get_register(vm, vcpuid, reg, &val); + + /* + * To obtain the value of a legacy high byte register shift the + * base register right by 8 bits (%ah = %rax >> 8). + */ + if (lhbr) + *rval = val >> 8; + else + *rval = val; + return (error); +} + +static int +vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte) +{ + uint64_t origval, val, mask; + int error, lhbr; + enum vm_reg_name reg; + + vie_calc_bytereg(vie, ®, &lhbr); + error = vm_get_register(vm, vcpuid, reg, &origval); + if (error == 0) { + val = byte; + mask = 0xff; + if (lhbr) { + /* + * Shift left by 8 to store 'byte' in a legacy high + * byte register. + */ + val <<= 8; + mask <<= 8; + } + val |= origval & ~mask; + error = vm_set_register(vm, vcpuid, reg, val); + } + return (error); +} + +int +vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg, + uint64_t val, int size) +{ + int error; + uint64_t origval; + + switch (size) { + case 1: + case 2: + error = vie_read_register(vm, vcpuid, reg, &origval); + if (error) + return (error); + val &= size2mask[size]; + val |= origval & ~size2mask[size]; + break; + case 4: + val &= 0xffffffffUL; + break; + case 8: + break; + default: + return (EINVAL); + } + + error = vm_set_register(vm, vcpuid, reg, val); + return (error); +} + +#define RFLAGS_STATUS_BITS (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V) + +/* + * Return the status flags that would result from doing (x - y). 
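+ *
+ * The trick, in sketch form: execute a real 'sub' on the host and capture
+ * the resulting %rflags with pushfq/popq, i.e.
+ *
+ *	__asm __volatile("sub %2,%1; pushfq; popq %0"
+ *	    : "=r" (rflags), "+r" (x) : "m" (y));
+ *
+ * so that, e.g., getcc32(5, 5) returns a value with PSL_Z set, while
+ * getcc32(4, 5) comes back with PSL_C and PSL_N set instead.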
+ */ +#define GETCC(sz) \ +static u_long \ +getcc##sz(uint##sz##_t x, uint##sz##_t y) \ +{ \ + u_long rflags; \ + \ + __asm __volatile("sub %2,%1; pushfq; popq %0" : \ + "=r" (rflags), "+r" (x) : "m" (y)); \ + return (rflags); \ +} struct __hack + +GETCC(8); +GETCC(16); +GETCC(32); +GETCC(64); + +static u_long +getcc(int opsize, uint64_t x, uint64_t y) +{ + KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8, + ("getcc: invalid operand size %d", opsize)); + + if (opsize == 1) + return (getcc8(x, y)); + else if (opsize == 2) + return (getcc16(x, y)); + else if (opsize == 4) + return (getcc32(x, y)); + else + return (getcc64(x, y)); +} + +static int +emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +{ + int error, size; + enum vm_reg_name reg; + uint8_t byte; + uint64_t val; + + size = vie->opsize; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0x88: + /* + * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m) + * 88/r: mov r/m8, r8 + * REX + 88/r: mov r/m8, r8 (%ah, %ch, %dh, %bh not available) + */ + size = 1; /* override for byte operation */ + error = vie_read_bytereg(vm, vcpuid, vie, &byte); + if (error == 0) + error = memwrite(vm, vcpuid, gpa, byte, size, arg); + break; + case 0x89: + /* + * MOV from reg (ModRM:reg) to mem (ModRM:r/m) + * 89/r: mov r/m16, r16 + * 89/r: mov r/m32, r32 + * REX.W + 89/r mov r/m64, r64 + */ + reg = gpr_map[vie->reg]; + error = vie_read_register(vm, vcpuid, reg, &val); + if (error == 0) { + val &= size2mask[size]; + error = memwrite(vm, vcpuid, gpa, val, size, arg); + } + break; + case 0x8A: + /* + * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg) + * 8A/r: mov r8, r/m8 + * REX + 8A/r: mov r8, r/m8 + */ + size = 1; /* override for byte operation */ + error = memread(vm, vcpuid, gpa, &val, size, arg); + if (error == 0) + error = vie_write_bytereg(vm, vcpuid, vie, val); + break; + case 0x8B: + /* + * MOV from mem (ModRM:r/m) to reg (ModRM:reg) + * 8B/r: mov r16, r/m16 + * 8B/r: mov r32, r/m32 + * REX.W 8B/r: mov r64, r/m64 + */ + error = memread(vm, vcpuid, gpa, &val, size, arg); + if (error == 0) { + reg = gpr_map[vie->reg]; + error = vie_update_register(vm, vcpuid, reg, val, size); + } + break; + case 0xA1: + /* + * MOV from seg:moffset to AX/EAX/RAX + * A1: mov AX, moffs16 + * A1: mov EAX, moffs32 + * REX.W + A1: mov RAX, moffs64 + */ + error = memread(vm, vcpuid, gpa, &val, size, arg); + if (error == 0) { + reg = VM_REG_GUEST_RAX; + error = vie_update_register(vm, vcpuid, reg, val, size); + } + break; + case 0xA3: + /* + * MOV from AX/EAX/RAX to seg:moffset + * A3: mov moffs16, AX + * A3: mov moffs32, EAX + * REX.W + A3: mov moffs64, RAX + */ + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val); + if (error == 0) { + val &= size2mask[size]; + error = memwrite(vm, vcpuid, gpa, val, size, arg); + } + break; + case 0xC6: + /* + * MOV from imm8 to mem (ModRM:r/m) + * C6/0 mov r/m8, imm8 + * REX + C6/0 mov r/m8, imm8 + */ + size = 1; /* override for byte operation */ + error = memwrite(vm, vcpuid, gpa, vie->immediate, size, arg); + break; + case 0xC7: + /* + * MOV from imm16/imm32 to mem (ModRM:r/m) + * C7/0 mov r/m16, imm16 + * C7/0 mov r/m32, imm32 + * REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits) + */ + val = vie->immediate & size2mask[size]; + error = memwrite(vm, vcpuid, gpa, val, size, arg); + break; + default: + break; + } + + return (error); +} + +static int +emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + 
mem_region_read_t memread, mem_region_write_t memwrite, + void *arg) +{ + int error, size; + enum vm_reg_name reg; + uint64_t val; + + size = vie->opsize; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0xB6: + /* + * MOV and zero extend byte from mem (ModRM:r/m) to + * reg (ModRM:reg). + * + * 0F B6/r movzx r16, r/m8 + * 0F B6/r movzx r32, r/m8 + * REX.W + 0F B6/r movzx r64, r/m8 + */ + + /* get the first operand */ + error = memread(vm, vcpuid, gpa, &val, 1, arg); + if (error) + break; + + /* get the second operand */ + reg = gpr_map[vie->reg]; + + /* zero-extend byte */ + val = (uint8_t)val; + + /* write the result */ + error = vie_update_register(vm, vcpuid, reg, val, size); + break; + case 0xB7: + /* + * MOV and zero extend word from mem (ModRM:r/m) to + * reg (ModRM:reg). + * + * 0F B7/r movzx r32, r/m16 + * REX.W + 0F B7/r movzx r64, r/m16 + */ + error = memread(vm, vcpuid, gpa, &val, 2, arg); + if (error) + return (error); + + reg = gpr_map[vie->reg]; + + /* zero-extend word */ + val = (uint16_t)val; + + error = vie_update_register(vm, vcpuid, reg, val, size); + break; + case 0xBE: + /* + * MOV and sign extend byte from mem (ModRM:r/m) to + * reg (ModRM:reg). + * + * 0F BE/r movsx r16, r/m8 + * 0F BE/r movsx r32, r/m8 + * REX.W + 0F BE/r movsx r64, r/m8 + */ + + /* get the first operand */ + error = memread(vm, vcpuid, gpa, &val, 1, arg); + if (error) + break; + + /* get the second operand */ + reg = gpr_map[vie->reg]; + + /* sign extend byte */ + val = (int8_t)val; + + /* write the result */ + error = vie_update_register(vm, vcpuid, reg, val, size); + break; + default: + break; + } + return (error); +} + +/* + * Helper function to calculate and validate a linear address. + * + * Returns 0 on success and 1 if an exception was injected into the guest. 
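+ *
+ * Callers treat a return of 1 as "resume the guest": the injected
+ * #GP, #SS or #AC is taken on the next entry, so no further emulation
+ * is attempted for the current instruction.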
+ */ +static int +get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging, + int opsize, int addrsize, int prot, enum vm_reg_name seg, + enum vm_reg_name gpr, uint64_t *gla) +{ + struct seg_desc desc; + uint64_t cr0, val, rflags; + int error; + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0); + KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + error = vm_get_seg_desc(vm, vcpuid, seg, &desc); + KASSERT(error == 0, ("%s: error %d getting segment descriptor %d", + __func__, error, seg)); + + error = vie_read_register(vm, vcpuid, gpr, &val); + KASSERT(error == 0, ("%s: error %d getting register %d", __func__, + error, gpr)); + + if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize, + addrsize, prot, gla)) { + if (seg == VM_REG_GUEST_SS) + vm_inject_ss(vm, vcpuid, 0); + else + vm_inject_gp(vm, vcpuid); + return (1); + } + + if (vie_canonical_check(paging->cpu_mode, *gla)) { + if (seg == VM_REG_GUEST_SS) + vm_inject_ss(vm, vcpuid, 0); + else + vm_inject_gp(vm, vcpuid); + return (1); + } + + if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) { + vm_inject_ac(vm, vcpuid, 0); + return (1); + } + + return (0); +} + +static int +emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *arg) +{ +#ifdef _KERNEL + struct vm_copyinfo copyinfo[2]; +#else + struct iovec copyinfo[2]; +#endif + uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val; + uint64_t rcx, rdi, rsi, rflags; + int error, opsize, seg, repeat; + + opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize; + val = 0; + error = 0; + + /* + * XXX although the MOVS instruction is only supposed to be used with + * the "rep" prefix some guests like FreeBSD will use "repnz" instead. + * + * Empirically the "repnz" prefix has identical behavior to "rep" + * and the zero flag does not make a difference. + */ + repeat = vie->repz_present | vie->repnz_present; + + if (repeat) { + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx); + KASSERT(!error, ("%s: error %d getting rcx", __func__, error)); + + /* + * The count register is %rcx, %ecx or %cx depending on the + * address size of the instruction. + */ + if ((rcx & vie_size2mask(vie->addrsize)) == 0) + return (0); + } + + /* + * Source Destination Comments + * -------------------------------------------- + * (1) memory memory n/a + * (2) memory mmio emulated + * (3) mmio memory emulated + * (4) mmio mmio emulated + * + * At this point we don't have sufficient information to distinguish + * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this + * out because it will succeed only when operating on regular memory. + * + * XXX the emulation doesn't properly handle the case where 'gpa' + * is straddling the boundary between the normal memory and MMIO. + */ + + seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS; + error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize, + PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr); + if (error) + goto done; + + error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ, + copyinfo, nitems(copyinfo)); + if (error == 0) { + /* + * case (2): read from system memory and write to mmio. 
+ */ + vm_copyin(vm, vcpuid, copyinfo, &val, opsize); + vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); + error = memwrite(vm, vcpuid, gpa, val, opsize, arg); + if (error) + goto done; + } else if (error > 0) { + /* + * Resume guest execution to handle fault. + */ + goto done; + } else { + /* + * 'vm_copy_setup()' is expected to fail for cases (3) and (4) + * if 'srcaddr' is in the mmio space. + */ + + error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize, + PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr); + if (error) + goto done; + + error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize, + PROT_WRITE, copyinfo, nitems(copyinfo)); + if (error == 0) { + /* + * case (3): read from MMIO and write to system memory. + * + * A MMIO read can have side-effects so we + * commit to it only after vm_copy_setup() is + * successful. If a page-fault needs to be + * injected into the guest then it will happen + * before the MMIO read is attempted. + */ + error = memread(vm, vcpuid, gpa, &val, opsize, arg); + if (error) + goto done; + + vm_copyout(vm, vcpuid, &val, copyinfo, opsize); + vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); + } else if (error > 0) { + /* + * Resume guest execution to handle fault. + */ + goto done; + } else { + /* + * Case (4): read from and write to mmio. + */ + error = vm_gla2gpa(vm, vcpuid, paging, srcaddr, + PROT_READ, &srcgpa); + if (error) + goto done; + error = memread(vm, vcpuid, srcgpa, &val, opsize, arg); + if (error) + goto done; + + error = vm_gla2gpa(vm, vcpuid, paging, dstaddr, + PROT_WRITE, &dstgpa); + if (error) + goto done; + error = memwrite(vm, vcpuid, dstgpa, val, opsize, arg); + if (error) + goto done; + } + } + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi); + KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error)); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi); + KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error)); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + if (rflags & PSL_D) { + rsi -= opsize; + rdi -= opsize; + } else { + rsi += opsize; + rdi += opsize; + } + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSI, rsi, + vie->addrsize); + KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error)); + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi, + vie->addrsize); + KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error)); + + if (repeat) { + rcx = rcx - 1; + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX, + rcx, vie->addrsize); + KASSERT(!error, ("%s: error %d updating rcx", __func__, error)); + + /* + * Repeat the instruction if the count register is not zero. + */ + if ((rcx & vie_size2mask(vie->addrsize)) != 0) + vm_restart_instruction(vm, vcpuid); + } +done: + if (error < 0) + return (EFAULT); + else + return (0); +} + +static int +emulate_stos(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *arg) +{ + int error, opsize, repeat; + uint64_t val; + uint64_t rcx, rdi, rflags; + + opsize = (vie->op.op_byte == 0xAA) ? 
1 : vie->opsize; + repeat = vie->repz_present | vie->repnz_present; + + if (repeat) { + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx); + KASSERT(!error, ("%s: error %d getting rcx", __func__, error)); + + /* + * The count register is %rcx, %ecx or %cx depending on the + * address size of the instruction. + */ + if ((rcx & vie_size2mask(vie->addrsize)) == 0) + return (0); + } + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val); + KASSERT(!error, ("%s: error %d getting rax", __func__, error)); + + error = memwrite(vm, vcpuid, gpa, val, opsize, arg); + if (error) + return (error); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi); + KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error)); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + if (rflags & PSL_D) + rdi -= opsize; + else + rdi += opsize; + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi, + vie->addrsize); + KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error)); + + if (repeat) { + rcx = rcx - 1; + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX, + rcx, vie->addrsize); + KASSERT(!error, ("%s: error %d updating rcx", __func__, error)); + + /* + * Repeat the instruction if the count register is not zero. + */ + if ((rcx & vie_size2mask(vie->addrsize)) != 0) + vm_restart_instruction(vm, vcpuid); + } + + return (0); +} + +static int +emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +{ + int error, size; + enum vm_reg_name reg; + uint64_t result, rflags, rflags2, val1, val2; + + size = vie->opsize; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0x23: + /* + * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the + * result in reg. + * + * 23/r and r16, r/m16 + * 23/r and r32, r/m32 + * REX.W + 23/r and r64, r/m64 + */ + + /* get the first operand */ + reg = gpr_map[vie->reg]; + error = vie_read_register(vm, vcpuid, reg, &val1); + if (error) + break; + + /* get the second operand */ + error = memread(vm, vcpuid, gpa, &val2, size, arg); + if (error) + break; + + /* perform the operation and write the result */ + result = val1 & val2; + error = vie_update_register(vm, vcpuid, reg, result, size); + break; + case 0x81: + case 0x83: + /* + * AND mem (ModRM:r/m) with immediate and store the + * result in mem. + * + * 81 /4 and r/m16, imm16 + * 81 /4 and r/m32, imm32 + * REX.W + 81 /4 and r/m64, imm32 sign-extended to 64 + * + * 83 /4 and r/m16, imm8 sign-extended to 16 + * 83 /4 and r/m32, imm8 sign-extended to 32 + * REX.W + 83/4 and r/m64, imm8 sign-extended to 64 + */ + + /* get the first operand */ + error = memread(vm, vcpuid, gpa, &val1, size, arg); + if (error) + break; + + /* + * perform the operation with the pre-fetched immediate + * operand and write the result + */ + result = val1 & vie->immediate; + error = memwrite(vm, vcpuid, gpa, result, size, arg); + break; + default: + break; + } + if (error) + return (error); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + if (error) + return (error); + + /* + * OF and CF are cleared; the SF, ZF and PF flags are set according + * to the result; AF is undefined. + * + * The updated status flags are obtained by subtracting 0 from 'result'. 
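+	 * Computing 'result - 0' leaves CF and OF clear and sets SF, ZF
+	 * and PF from 'result' itself, which is exactly the behavior AND
+	 * requires.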
+ */ + rflags2 = getcc(size, result, 0); + rflags &= ~RFLAGS_STATUS_BITS; + rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); + return (error); +} + +static int +emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +{ + int error, size; + uint64_t val1, result, rflags, rflags2; + + size = vie->opsize; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0x81: + case 0x83: + /* + * OR mem (ModRM:r/m) with immediate and store the + * result in mem. + * + * 81 /1 or r/m16, imm16 + * 81 /1 or r/m32, imm32 + * REX.W + 81 /1 or r/m64, imm32 sign-extended to 64 + * + * 83 /1 or r/m16, imm8 sign-extended to 16 + * 83 /1 or r/m32, imm8 sign-extended to 32 + * REX.W + 83/1 or r/m64, imm8 sign-extended to 64 + */ + + /* get the first operand */ + error = memread(vm, vcpuid, gpa, &val1, size, arg); + if (error) + break; + + /* + * perform the operation with the pre-fetched immediate + * operand and write the result + */ + result = val1 | vie->immediate; + error = memwrite(vm, vcpuid, gpa, result, size, arg); + break; + default: + break; + } + if (error) + return (error); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + if (error) + return (error); + + /* + * OF and CF are cleared; the SF, ZF and PF flags are set according + * to the result; AF is undefined. + * + * The updated status flags are obtained by subtracting 0 from 'result'. + */ + rflags2 = getcc(size, result, 0); + rflags &= ~RFLAGS_STATUS_BITS; + rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); + return (error); +} + +static int +emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +{ + int error, size; + uint64_t op1, op2, rflags, rflags2; + enum vm_reg_name reg; + + size = vie->opsize; + switch (vie->op.op_byte) { + case 0x3B: + /* + * 3B/r CMP r16, r/m16 + * 3B/r CMP r32, r/m32 + * REX.W + 3B/r CMP r64, r/m64 + * + * Compare first operand (reg) with second operand (r/m) and + * set status flags in EFLAGS register. The comparison is + * performed by subtracting the second operand from the first + * operand and then setting the status flags. + */ + + /* Get the first operand */ + reg = gpr_map[vie->reg]; + error = vie_read_register(vm, vcpuid, reg, &op1); + if (error) + return (error); + + /* Get the second operand */ + error = memread(vm, vcpuid, gpa, &op2, size, arg); + if (error) + return (error); + + rflags2 = getcc(size, op1, op2); + break; + case 0x81: + case 0x83: + /* + * 81 /7 cmp r/m16, imm16 + * 81 /7 cmp r/m32, imm32 + * REX.W + 81 /7 cmp r/m64, imm32 sign-extended to 64 + * + * 83 /7 cmp r/m16, imm8 sign-extended to 16 + * 83 /7 cmp r/m32, imm8 sign-extended to 32 + * REX.W + 83 /7 cmp r/m64, imm8 sign-extended to 64 + * + * Compare mem (ModRM:r/m) with immediate and set + * status flags according to the results. The + * comparison is performed by subtracting the + * immediate from the first operand and then setting + * the status flags. 
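
To make the borrow and overflow bookkeeping concrete, here is a hedged standalone sketch of the flags a 64-bit CMP would produce; struct cmp_flags and cmp_flags64() are names invented for this example, and the driver itself simply reuses getcc() for every operand size.

    #include <stdbool.h>
    #include <stdint.h>

    /* Illustrative: the flags CMP op1, op2 sets for 8-byte operands. */
    struct cmp_flags {
        bool cf, zf, sf, of;
    };

    static struct cmp_flags
    cmp_flags64(uint64_t op1, uint64_t op2)
    {
        uint64_t res = op1 - op2;
        struct cmp_flags f;

        f.cf = (op1 < op2);         /* borrow out of the subtract */
        f.zf = (res == 0);
        f.sf = ((int64_t)res < 0);
        /*
         * Signed overflow: the operand signs differ and the result's
         * sign differs from the minuend's.
         */
        f.of = ((((op1 ^ op2) & (op1 ^ res)) >> 63) != 0);
        return (f);
    }
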
+	 *
+	 */
+
+		/* get the first operand */
+		error = memread(vm, vcpuid, gpa, &op1, size, arg);
+		if (error)
+			return (error);
+
+		rflags2 = getcc(size, op1, vie->immediate);
+		break;
+	default:
+		return (EINVAL);
+	}
+	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
+	if (error)
+		return (error);
+	rflags &= ~RFLAGS_STATUS_BITS;
+	rflags |= rflags2 & RFLAGS_STATUS_BITS;
+
+	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
+	return (error);
+}
+
+static int
+emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
+{
+	int error, size;
+	uint64_t nval, rflags, rflags2, val1, val2;
+	enum vm_reg_name reg;
+
+	size = vie->opsize;
+	error = EINVAL;
+
+	switch (vie->op.op_byte) {
+	case 0x2B:
+		/*
+		 * SUB r/m from r and store the result in r
+		 *
+		 * 2B/r		SUB r16, r/m16
+		 * 2B/r		SUB r32, r/m32
+		 * REX.W + 2B/r	SUB r64, r/m64
+		 */
+
+		/* get the first operand */
+		reg = gpr_map[vie->reg];
+		error = vie_read_register(vm, vcpuid, reg, &val1);
+		if (error)
+			break;
+
+		/* get the second operand */
+		error = memread(vm, vcpuid, gpa, &val2, size, arg);
+		if (error)
+			break;
+
+		/* perform the operation and write the result */
+		nval = val1 - val2;
+		error = vie_update_register(vm, vcpuid, reg, nval, size);
+		break;
+	default:
+		break;
+	}
+
+	if (!error) {
+		rflags2 = getcc(size, val1, val2);
+		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
+		    &rflags);
+		if (error)
+			return (error);
+
+		rflags &= ~RFLAGS_STATUS_BITS;
+		rflags |= rflags2 & RFLAGS_STATUS_BITS;
+		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
+		    rflags, 8);
+	}
+
+	return (error);
+}
+
+static int
+emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
+    struct vm_guest_paging *paging, mem_region_read_t memread,
+    mem_region_write_t memwrite, void *arg)
+{
+#ifdef _KERNEL
+	struct vm_copyinfo copyinfo[2];
+#else
+	struct iovec copyinfo[2];
+#endif
+	struct seg_desc ss_desc;
+	uint64_t cr0, rflags, rsp, stack_gla, val;
+	int error, size, stackaddrsize, pushop;
+
+	val = 0;
+	size = vie->opsize;
+	pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0;
+
+	/*
+	 * From "Address-Size Attributes for Stack Accesses", Intel SDM, Vol 1
+	 */
+	if (paging->cpu_mode == CPU_MODE_REAL) {
+		stackaddrsize = 2;
+	} else if (paging->cpu_mode == CPU_MODE_64BIT) {
+		/*
+		 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
+		 * - Stack pointer size is always 64-bits.
+		 * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
+		 * - 16-bit PUSH/POP is supported by using the operand size
+		 *   override prefix (66H).
+		 */
+		stackaddrsize = 8;
+		size = vie->opsize_override ? 2 : 8;
+	} else {
+		/*
+		 * In protected or compatibility mode the 'B' flag in the
+		 * stack-segment descriptor determines the size of the
+		 * stack pointer.
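
The three-way mode check that follows can be boiled down to a few lines. This is an illustrative distillation under invented names (enum mode, stack_ptr_width()), not the driver's code:

    /* Width in bytes of the stack pointer used for the access. */
    enum mode { MODE_REAL, MODE_PROTECTED, MODE_64BIT };

    static int
    stack_ptr_width(enum mode m, int ss_def32)
    {
        if (m == MODE_REAL)
            return (2);             /* %sp */
        if (m == MODE_64BIT)
            return (8);             /* %rsp, independent of operand size */
        return (ss_def32 ? 4 : 2);  /* SS descriptor's B flag decides */
    }
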
+ */ + error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc); + KASSERT(error == 0, ("%s: error %d getting SS descriptor", + __func__, error)); + if (SEG_DESC_DEF32(ss_desc.access)) + stackaddrsize = 4; + else + stackaddrsize = 2; + } + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0); + KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp); + KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error)); + if (pushop) { + rsp -= size; + } + + if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc, + rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ, + &stack_gla)) { + vm_inject_ss(vm, vcpuid, 0); + return (0); + } + + if (vie_canonical_check(paging->cpu_mode, stack_gla)) { + vm_inject_ss(vm, vcpuid, 0); + return (0); + } + + if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) { + vm_inject_ac(vm, vcpuid, 0); + return (0); + } + + error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size, + pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo)); + if (error == -1) { + /* + * XXX cannot return a negative error value here because it + * ends up being the return value of the VM_RUN() ioctl and + * is interpreted as a pseudo-error (for e.g. ERESTART). + */ + return (EFAULT); + } else if (error == 1) { + /* Resume guest execution to handle page fault */ + return (0); + } + + if (pushop) { + error = memread(vm, vcpuid, mmio_gpa, &val, size, arg); + if (error == 0) + vm_copyout(vm, vcpuid, &val, copyinfo, size); + } else { + vm_copyin(vm, vcpuid, copyinfo, &val, size); + error = memwrite(vm, vcpuid, mmio_gpa, val, size, arg); + rsp += size; + } + vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); + + if (error == 0) { + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp, + stackaddrsize); + KASSERT(error == 0, ("error %d updating rsp", error)); + } + return (error); +} + +static int +emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *arg) +{ + int error; + + /* + * Table A-6, "Opcode Extensions", Intel SDM, Vol 2. + * + * PUSH is part of the group 5 extended opcodes and is identified + * by ModRM:reg = b110. + */ + if ((vie->reg & 7) != 6) + return (EINVAL); + + error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread, + memwrite, arg); + return (error); +} + +static int +emulate_pop(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *arg) +{ + int error; + + /* + * Table A-6, "Opcode Extensions", Intel SDM, Vol 2. + * + * POP is part of the group 1A extended opcodes and is identified + * by ModRM:reg = b000. 
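
Both stack-op entry points key off the reg field of the ModRM byte, which doubles as an opcode extension in these groups. A small standalone example of the field extraction (modrm_fields() is an invented name):

    #include <stdint.h>
    #include <stdio.h>

    /* ModRM packs mod (bits 7:6), reg/opcode extension (5:3), r/m (2:0). */
    static void
    modrm_fields(uint8_t m, int *mod, int *reg, int *rm)
    {
        *mod = (m >> 6) & 0x3;
        *reg = (m >> 3) & 0x7;
        *rm = m & 0x7;
    }

    int
    main(void)
    {
        int mod, reg, rm;

        /* FF /6 (PUSH r/m): ModRM 0x37 gives mod=0, reg=6, rm=7. */
        modrm_fields(0x37, &mod, &reg, &rm);
        (void) printf("mod=%d reg=%d rm=%d\n", mod, reg, rm);
        return (0);
    }

With reg=6 the group-5 PUSH check above passes; the group-1A POP check below instead demands reg=0.
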
+ */ + if ((vie->reg & 7) != 0) + return (EINVAL); + + error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread, + memwrite, arg); + return (error); +} + +static int +emulate_group1(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *memarg) +{ + int error; + + switch (vie->reg & 7) { + case 0x1: /* OR */ + error = emulate_or(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case 0x4: /* AND */ + error = emulate_and(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case 0x7: /* CMP */ + error = emulate_cmp(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + default: + error = EINVAL; + break; + } + + return (error); +} + +int +vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *memarg) +{ + int error; + + if (!vie->decoded) + return (EINVAL); + + switch (vie->op.op_type) { + case VIE_OP_TYPE_GROUP1: + error = emulate_group1(vm, vcpuid, gpa, vie, paging, memread, + memwrite, memarg); + break; + case VIE_OP_TYPE_POP: + error = emulate_pop(vm, vcpuid, gpa, vie, paging, memread, + memwrite, memarg); + break; + case VIE_OP_TYPE_PUSH: + error = emulate_push(vm, vcpuid, gpa, vie, paging, memread, + memwrite, memarg); + break; + case VIE_OP_TYPE_CMP: + error = emulate_cmp(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_MOV: + error = emulate_mov(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_MOVSX: + case VIE_OP_TYPE_MOVZX: + error = emulate_movx(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_MOVS: + error = emulate_movs(vm, vcpuid, gpa, vie, paging, memread, + memwrite, memarg); + break; + case VIE_OP_TYPE_STOS: + error = emulate_stos(vm, vcpuid, gpa, vie, paging, memread, + memwrite, memarg); + break; + case VIE_OP_TYPE_AND: + error = emulate_and(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_OR: + error = emulate_or(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_SUB: + error = emulate_sub(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + default: + error = EINVAL; + break; + } + + return (error); +} + +int +vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla) +{ + KASSERT(size == 1 || size == 2 || size == 4 || size == 8, + ("%s: invalid size %d", __func__, size)); + KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl)); + + if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0) + return (0); + + return ((gla & (size - 1)) ? 1 : 0); +} + +int +vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla) +{ + uint64_t mask; + + if (cpu_mode != CPU_MODE_64BIT) + return (0); + + /* + * The value of the bit 47 in the 'gla' should be replicated in the + * most significant 16 bits. 
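
The sign-extension rule for canonical addresses is easy to check directly. A minimal standalone version of the test below (is_canonical() is an invented name; the driver's vie_canonical_check() returns non-zero on a violation instead):

    #include <stdbool.h>
    #include <stdint.h>

    /* Canonical iff bits 63:48 replicate bit 47. */
    static bool
    is_canonical(uint64_t gla)
    {
        uint64_t mask = ~((1ULL << 48) - 1);    /* bits 63:48 */

        if (gla & (1ULL << 47))
            return ((gla & mask) == mask);
        return ((gla & mask) == 0);
    }

For example, 0x00007fffffffffff and 0xffff800000000000 are canonical, while 0x0000800000000000 is not.
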
+ */ + mask = ~((1UL << 48) - 1); + if (gla & (1UL << 47)) + return ((gla & mask) != mask); + else + return ((gla & mask) != 0); +} + +uint64_t +vie_size2mask(int size) +{ + KASSERT(size == 1 || size == 2 || size == 4 || size == 8, + ("vie_size2mask: invalid size %d", size)); + return (size2mask[size]); +} + +int +vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, + struct seg_desc *desc, uint64_t offset, int length, int addrsize, + int prot, uint64_t *gla) +{ + uint64_t firstoff, low_limit, high_limit, segbase; + int glasize, type; + + KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS, + ("%s: invalid segment %d", __func__, seg)); + KASSERT(length == 1 || length == 2 || length == 4 || length == 8, + ("%s: invalid operand size %d", __func__, length)); +#ifdef __FreeBSD__ + KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0, + ("%s: invalid prot %#x", __func__, prot)); +#else + KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0, + ("%s: invalid prot %x", __func__, prot)); +#endif + + firstoff = offset; + if (cpu_mode == CPU_MODE_64BIT) { + KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address " + "size %d for cpu_mode %d", __func__, addrsize, cpu_mode)); + glasize = 8; + } else { + KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address " + "size %d for cpu mode %d", __func__, addrsize, cpu_mode)); + glasize = 4; + /* + * If the segment selector is loaded with a NULL selector + * then the descriptor is unusable and attempting to use + * it results in a #GP(0). + */ + if (SEG_DESC_UNUSABLE(desc->access)) + return (-1); + + /* + * The processor generates a #NP exception when a segment + * register is loaded with a selector that points to a + * descriptor that is not present. If this was the case then + * it would have been checked before the VM-exit. + */ +#ifdef __FreeBSD__ + KASSERT(SEG_DESC_PRESENT(desc->access), + ("segment %d not present: %#x", seg, desc->access)); +#else + KASSERT(SEG_DESC_PRESENT(desc->access), + ("segment %d not present: %x", seg, desc->access)); +#endif + + /* + * The descriptor type must indicate a code/data segment. + */ + type = SEG_DESC_TYPE(desc->access); +#ifdef __FreeBSD__ + KASSERT(type >= 16 && type <= 31, ("segment %d has invalid " + "descriptor type %#x", seg, type)); +#else + KASSERT(type >= 16 && type <= 31, ("segment %d has invalid " + "descriptor type %x", seg, type)); +#endif + + if (prot & PROT_READ) { + /* #GP on a read access to a exec-only code segment */ + if ((type & 0xA) == 0x8) + return (-1); + } + + if (prot & PROT_WRITE) { + /* + * #GP on a write access to a code segment or a + * read-only data segment. + */ + if (type & 0x8) /* code segment */ + return (-1); + + if ((type & 0xA) == 0) /* read-only data seg */ + return (-1); + } + + /* + * 'desc->limit' is fully expanded taking granularity into + * account. + */ + if ((type & 0xC) == 0x4) { + /* expand-down data segment */ + low_limit = desc->limit + 1; + high_limit = SEG_DESC_DEF32(desc->access) ? + 0xffffffff : 0xffff; + } else { + /* code segment or expand-up data segment */ + low_limit = 0; + high_limit = desc->limit; + } + + while (length > 0) { + offset &= vie_size2mask(addrsize); + if (offset < low_limit || offset > high_limit) + return (-1); + offset++; + length--; + } + } + + /* + * In 64-bit mode all segments except %fs and %gs have a segment + * base address of 0. 
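
The remaining arithmetic is a pair of truncations around an addition. A hedged sketch of just that step, with invented helper names:

    #include <stdint.h>

    static uint64_t
    size_mask(int bytes)
    {
        return ((bytes == 8) ? ~0ULL : ((1ULL << (bytes * 8)) - 1));
    }

    /*
     * The effective offset is truncated to the address size, added to
     * the segment base, and the sum truncated to the linear-address
     * width (4 or 8 bytes).
     */
    static uint64_t
    calc_gla_sketch(uint64_t segbase, uint64_t offset, int addrsize,
        int glasize)
    {
        return ((segbase + (offset & size_mask(addrsize))) &
            size_mask(glasize));
    }
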
+ */ + if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS && + seg != VM_REG_GUEST_GS) { + segbase = 0; + } else { + segbase = desc->base; + } + + /* + * Truncate 'firstoff' to the effective address size before adding + * it to the segment base. + */ + firstoff &= vie_size2mask(addrsize); + *gla = (segbase + firstoff) & vie_size2mask(glasize); + return (0); +} + +#ifdef _KERNEL +void +vie_init(struct vie *vie, const char *inst_bytes, int inst_length) +{ + KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE, + ("%s: invalid instruction length (%d)", __func__, inst_length)); + + bzero(vie, sizeof(struct vie)); + + vie->base_register = VM_REG_LAST; + vie->index_register = VM_REG_LAST; + vie->segment_register = VM_REG_LAST; + + if (inst_length) { + bcopy(inst_bytes, vie->inst, inst_length); + vie->num_valid = inst_length; + } +} + +static int +pf_error_code(int usermode, int prot, int rsvd, uint64_t pte) +{ + int error_code = 0; + + if (pte & PG_V) + error_code |= PGEX_P; + if (prot & VM_PROT_WRITE) + error_code |= PGEX_W; + if (usermode) + error_code |= PGEX_U; + if (rsvd) + error_code |= PGEX_RSV; + if (prot & VM_PROT_EXECUTE) + error_code |= PGEX_I; + + return (error_code); +} + +static void +ptp_release(void **cookie) +{ + if (*cookie != NULL) { + vm_gpa_release(*cookie); + *cookie = NULL; + } +} + +static void * +ptp_hold(struct vm *vm, vm_paddr_t ptpphys, size_t len, void **cookie) +{ + void *ptr; + + ptp_release(cookie); + ptr = vm_gpa_hold(vm, ptpphys, len, VM_PROT_RW, cookie); + return (ptr); +} + +int +vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa) +{ + int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable; +#ifdef __FreeBSD__ +#endif + u_int retries; + uint64_t *ptpbase, ptpphys, pte, pgsize; + uint32_t *ptpbase32, pte32; + void *cookie; + + usermode = (paging->cpl == 3 ? 1 : 0); + writable = prot & VM_PROT_WRITE; + cookie = NULL; + retval = 0; +#ifdef __FreeBSD__ + retries = 0; +#endif +restart: + ptpphys = paging->cr3; /* root of the page tables */ + ptp_release(&cookie); +#ifdef __FreeBSD__ + if (retries++ > 0) + maybe_yield(); +#endif + + if (vie_canonical_check(paging->cpu_mode, gla)) { + /* + * XXX assuming a non-stack reference otherwise a stack fault + * should be generated. + */ + vm_inject_gp(vm, vcpuid); + goto fault; + } + + if (paging->paging_mode == PAGING_MODE_FLAT) { + *gpa = gla; + goto done; + } + + if (paging->paging_mode == PAGING_MODE_32) { + nlevels = 2; + while (--nlevels >= 0) { + /* Zero out the lower 12 bits. */ + ptpphys &= ~0xfff; + + ptpbase32 = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie); + + if (ptpbase32 == NULL) + goto error; + + ptpshift = PAGE_SHIFT + nlevels * 10; + ptpindex = (gla >> ptpshift) & 0x3FF; + pgsize = 1UL << ptpshift; + + pte32 = ptpbase32[ptpindex]; + + if ((pte32 & PG_V) == 0 || + (usermode && (pte32 & PG_U) == 0) || + (writable && (pte32 & PG_RW) == 0)) { + pfcode = pf_error_code(usermode, prot, 0, + pte32); + vm_inject_pf(vm, vcpuid, pfcode, gla); + goto fault; + } + + /* + * Emulate the x86 MMU's management of the accessed + * and dirty flags. While the accessed flag is set + * at every level of the page table, the dirty flag + * is only set at the last level providing the guest + * physical address. 
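
The accessed-bit update that follows must tolerate the guest racing with the walk, so it uses a compare-and-swap and restarts on failure. A standalone sketch of the same idea using C11 atomics (the PG_A value is the architectural bit; pte_mark_accessed() is an invented name):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define PG_A    0x20ULL     /* accessed, architectural bit 5 */

    /*
     * Returns false when the PTE changed underneath us, in which case
     * the caller should restart the walk, as vm_gla2gpa() does.
     */
    static bool
    pte_mark_accessed(_Atomic uint64_t *ptep, uint64_t pte)
    {
        if (pte & PG_A)
            return (true);
        return (atomic_compare_exchange_strong(ptep, &pte, pte | PG_A));
    }

The kernel code uses atomic_cmpset_32/atomic_cmpset_64 rather than C11 atomics, but the restart-on-failure shape is the same.
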
+ */ + if ((pte32 & PG_A) == 0) { + if (atomic_cmpset_32(&ptpbase32[ptpindex], + pte32, pte32 | PG_A) == 0) { + goto restart; + } + } + + /* XXX must be ignored if CR4.PSE=0 */ + if (nlevels > 0 && (pte32 & PG_PS) != 0) + break; + + ptpphys = pte32; + } + + /* Set the dirty bit in the page table entry if necessary */ + if (writable && (pte32 & PG_M) == 0) { + if (atomic_cmpset_32(&ptpbase32[ptpindex], + pte32, pte32 | PG_M) == 0) { + goto restart; + } + } + + /* Zero out the lower 'ptpshift' bits */ + pte32 >>= ptpshift; pte32 <<= ptpshift; + *gpa = pte32 | (gla & (pgsize - 1)); + goto done; + } + + if (paging->paging_mode == PAGING_MODE_PAE) { + /* Zero out the lower 5 bits and the upper 32 bits */ + ptpphys &= 0xffffffe0UL; + + ptpbase = ptp_hold(vm, ptpphys, sizeof(*ptpbase) * 4, &cookie); + if (ptpbase == NULL) + goto error; + + ptpindex = (gla >> 30) & 0x3; + + pte = ptpbase[ptpindex]; + + if ((pte & PG_V) == 0) { + pfcode = pf_error_code(usermode, prot, 0, pte); + vm_inject_pf(vm, vcpuid, pfcode, gla); + goto fault; + } + + ptpphys = pte; + + nlevels = 2; + } else + nlevels = 4; + while (--nlevels >= 0) { + /* Zero out the lower 12 bits and the upper 12 bits */ + ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12; + + ptpbase = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie); + if (ptpbase == NULL) + goto error; + + ptpshift = PAGE_SHIFT + nlevels * 9; + ptpindex = (gla >> ptpshift) & 0x1FF; + pgsize = 1UL << ptpshift; + + pte = ptpbase[ptpindex]; + + if ((pte & PG_V) == 0 || + (usermode && (pte & PG_U) == 0) || + (writable && (pte & PG_RW) == 0)) { + pfcode = pf_error_code(usermode, prot, 0, pte); + vm_inject_pf(vm, vcpuid, pfcode, gla); + goto fault; + } + + /* Set the accessed bit in the page table entry */ + if ((pte & PG_A) == 0) { + if (atomic_cmpset_64(&ptpbase[ptpindex], + pte, pte | PG_A) == 0) { + goto restart; + } + } + + if (nlevels > 0 && (pte & PG_PS) != 0) { + if (pgsize > 1 * GB) { + pfcode = pf_error_code(usermode, prot, 1, pte); + vm_inject_pf(vm, vcpuid, pfcode, gla); + goto fault; + } + break; + } + + ptpphys = pte; + } + + /* Set the dirty bit in the page table entry if necessary */ + if (writable && (pte & PG_M) == 0) { + if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0) + goto restart; + } + + /* Zero out the lower 'ptpshift' bits and the upper 12 bits */ + pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12; + *gpa = pte | (gla & (pgsize - 1)); +done: + ptp_release(&cookie); + return (retval); +error: + retval = -1; + goto done; +fault: + retval = 1; + goto done; +} + +int +vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t rip, int inst_length, struct vie *vie) +{ + struct vm_copyinfo copyinfo[2]; + int error, prot; + + if (inst_length > VIE_INST_SIZE) + panic("vmm_fetch_instruction: invalid length %d", inst_length); + + prot = PROT_READ | PROT_EXEC; + error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot, + copyinfo, nitems(copyinfo)); + if (error == 0) { + vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length); + vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); + vie->num_valid = inst_length; + } + return (error); +} + +static int +vie_peek(struct vie *vie, uint8_t *x) +{ + + if (vie->num_processed < vie->num_valid) { + *x = vie->inst[vie->num_processed]; + return (0); + } else + return (-1); +} + +static void +vie_advance(struct vie *vie) +{ + + vie->num_processed++; +} + +static bool +segment_override(uint8_t x, int *seg) +{ + + switch (x) { + case 0x2E: + *seg = VM_REG_GUEST_CS; + break; 
+ case 0x36: + *seg = VM_REG_GUEST_SS; + break; + case 0x3E: + *seg = VM_REG_GUEST_DS; + break; + case 0x26: + *seg = VM_REG_GUEST_ES; + break; + case 0x64: + *seg = VM_REG_GUEST_FS; + break; + case 0x65: + *seg = VM_REG_GUEST_GS; + break; + default: + return (false); + } + return (true); +} + +static int +decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d) +{ + uint8_t x; + + while (1) { + if (vie_peek(vie, &x)) + return (-1); + + if (x == 0x66) + vie->opsize_override = 1; + else if (x == 0x67) + vie->addrsize_override = 1; + else if (x == 0xF3) + vie->repz_present = 1; + else if (x == 0xF2) + vie->repnz_present = 1; + else if (segment_override(x, &vie->segment_register)) + vie->segment_override = 1; + else + break; + + vie_advance(vie); + } + + /* + * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2: + * - Only one REX prefix is allowed per instruction. + * - The REX prefix must immediately precede the opcode byte or the + * escape opcode byte. + * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3) + * the mandatory prefix must come before the REX prefix. + */ + if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) { + vie->rex_present = 1; + vie->rex_w = x & 0x8 ? 1 : 0; + vie->rex_r = x & 0x4 ? 1 : 0; + vie->rex_x = x & 0x2 ? 1 : 0; + vie->rex_b = x & 0x1 ? 1 : 0; + vie_advance(vie); + } + + /* + * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1 + */ + if (cpu_mode == CPU_MODE_64BIT) { + /* + * Default address size is 64-bits and default operand size + * is 32-bits. + */ + vie->addrsize = vie->addrsize_override ? 4 : 8; + if (vie->rex_w) + vie->opsize = 8; + else if (vie->opsize_override) + vie->opsize = 2; + else + vie->opsize = 4; + } else if (cs_d) { + /* Default address and operand sizes are 32-bits */ + vie->addrsize = vie->addrsize_override ? 2 : 4; + vie->opsize = vie->opsize_override ? 2 : 4; + } else { + /* Default address and operand sizes are 16-bits */ + vie->addrsize = vie->addrsize_override ? 4 : 2; + vie->opsize = vie->opsize_override ? 4 : 2; + } + return (0); +} + +static int +decode_two_byte_opcode(struct vie *vie) +{ + uint8_t x; + + if (vie_peek(vie, &x)) + return (-1); + + vie->op = two_byte_opcodes[x]; + + if (vie->op.op_type == VIE_OP_TYPE_NONE) + return (-1); + + vie_advance(vie); + return (0); +} + +static int +decode_opcode(struct vie *vie) +{ + uint8_t x; + + if (vie_peek(vie, &x)) + return (-1); + + vie->op = one_byte_opcodes[x]; + + if (vie->op.op_type == VIE_OP_TYPE_NONE) + return (-1); + + vie_advance(vie); + + if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE) + return (decode_two_byte_opcode(vie)); + + return (0); +} + +static int +decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode) +{ + uint8_t x; + + if (vie->op.op_flags & VIE_OP_F_NO_MODRM) + return (0); + + if (cpu_mode == CPU_MODE_REAL) + return (-1); + + if (vie_peek(vie, &x)) + return (-1); + + vie->mod = (x >> 6) & 0x3; + vie->rm = (x >> 0) & 0x7; + vie->reg = (x >> 3) & 0x7; + + /* + * A direct addressing mode makes no sense in the context of an EPT + * fault. There has to be a memory access involved to cause the + * EPT fault. + */ + if (vie->mod == VIE_MOD_DIRECT) + return (-1); + + if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) || + (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) { + /* + * Table 2-5: Special Cases of REX Encodings + * + * mod=0, r/m=5 is used in the compatibility mode to + * indicate a disp32 without a base register. 
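
The operand- and address-size selection in decode_prefixes() above is a small decision table. Restated as a standalone sketch (struct and function names invented):

    #include <stdbool.h>

    struct attr_sizes {
        int addrsize;   /* bytes */
        int opsize;     /* bytes */
    };

    static struct attr_sizes
    decode_attr_sizes(bool mode64, bool cs_d, bool rex_w, bool osz66,
        bool asz67)
    {
        struct attr_sizes s;

        if (mode64) {
            s.addrsize = asz67 ? 4 : 8;
            s.opsize = rex_w ? 8 : (osz66 ? 2 : 4);
        } else if (cs_d) {
            s.addrsize = asz67 ? 2 : 4;     /* 32-bit defaults */
            s.opsize = osz66 ? 2 : 4;
        } else {
            s.addrsize = asz67 ? 4 : 2;     /* 16-bit defaults */
            s.opsize = osz66 ? 4 : 2;
        }
        return (s);
    }

Note that REX.W wins over a 66H prefix in 64-bit mode, matching the precedence in the decoder above.
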
+		 *
+		 * mod!=3, r/m=4 is used in the compatibility mode to
+		 * indicate that the SIB byte is present.
+		 *
+		 * The 'b' bit in the REX prefix is a don't-care in
+		 * this case.
+		 */
+	} else {
+		vie->rm |= (vie->rex_b << 3);
+	}
+
+	vie->reg |= (vie->rex_r << 3);
+
+	/* SIB */
+	if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
+		goto done;
+
+	vie->base_register = gpr_map[vie->rm];
+
+	switch (vie->mod) {
+	case VIE_MOD_INDIRECT_DISP8:
+		vie->disp_bytes = 1;
+		break;
+	case VIE_MOD_INDIRECT_DISP32:
+		vie->disp_bytes = 4;
+		break;
+	case VIE_MOD_INDIRECT:
+		if (vie->rm == VIE_RM_DISP32) {
+			vie->disp_bytes = 4;
+			/*
+			 * Table 2-7. RIP-Relative Addressing
+			 *
+			 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
+			 * whereas in compatibility mode it just implies disp32.
+			 */
+
+			if (cpu_mode == CPU_MODE_64BIT)
+				vie->base_register = VM_REG_GUEST_RIP;
+			else
+				vie->base_register = VM_REG_LAST;
+		}
+		break;
+	}
+
+done:
+	vie_advance(vie);
+
+	return (0);
+}
+
+static int
+decode_sib(struct vie *vie)
+{
+	uint8_t x;
+
+	/* Proceed only if SIB byte is present */
+	if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
+		return (0);
+
+	if (vie_peek(vie, &x))
+		return (-1);
+
+	/* De-construct the SIB byte */
+	vie->ss = (x >> 6) & 0x3;
+	vie->index = (x >> 3) & 0x7;
+	vie->base = (x >> 0) & 0x7;
+
+	/* Apply the REX prefix modifiers */
+	vie->index |= vie->rex_x << 3;
+	vie->base |= vie->rex_b << 3;
+
+	switch (vie->mod) {
+	case VIE_MOD_INDIRECT_DISP8:
+		vie->disp_bytes = 1;
+		break;
+	case VIE_MOD_INDIRECT_DISP32:
+		vie->disp_bytes = 4;
+		break;
+	}
+
+	if (vie->mod == VIE_MOD_INDIRECT &&
+	    (vie->base == 5 || vie->base == 13)) {
+		/*
+		 * Special case when base register is unused if mod = 0
+		 * and base = %rbp or %r13.
+		 *
+		 * Documented in:
+		 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
+		 * Table 2-5: Special Cases of REX Encodings
+		 */
+		vie->disp_bytes = 4;
+	} else {
+		vie->base_register = gpr_map[vie->base];
+	}
+
+	/*
+	 * All encodings of 'index' are valid except for %rsp (4).
+	 *
+	 * Documented in:
+	 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
+	 * Table 2-5: Special Cases of REX Encodings
+	 */
+	if (vie->index != 4)
+		vie->index_register = gpr_map[vie->index];
+
+	/* 'scale' makes sense only in the context of an index register */
+	if (vie->index_register < VM_REG_LAST)
+		vie->scale = 1 << vie->ss;
+
+	vie_advance(vie);
+
+	return (0);
+}
+
+static int
+decode_displacement(struct vie *vie)
+{
+	int n, i;
+	uint8_t x;
+
+	union {
+		char	buf[4];
+		int8_t	signed8;
+		int32_t	signed32;
+	} u;
+
+	if ((n = vie->disp_bytes) == 0)
+		return (0);
+
+	if (n != 1 && n != 4)
+		panic("decode_displacement: invalid disp_bytes %d", n);
+
+	for (i = 0; i < n; i++) {
+		if (vie_peek(vie, &x))
+			return (-1);
+
+		u.buf[i] = x;
+		vie_advance(vie);
+	}
+
+	if (n == 1)
+		vie->displacement = u.signed8;		/* sign-extended */
+	else
+		vie->displacement = u.signed32;		/* sign-extended */
+
+	return (0);
+}
+
+static int
+decode_immediate(struct vie *vie)
+{
+	int i, n;
+	uint8_t x;
+	union {
+		char	buf[4];
+		int8_t	signed8;
+		int16_t	signed16;
+		int32_t	signed32;
+	} u;
+
+	/* Figure out immediate operand size (if any) */
+	if (vie->op.op_flags & VIE_OP_F_IMM) {
+		/*
+		 * Section 2.2.1.5 "Immediates", Intel SDM:
+		 * In 64-bit mode the typical size of immediate operands
+		 * remains 32-bits. When the operand size is 64-bits, the
+		 * processor sign-extends all immediates to 64-bits prior
+		 * to their use.
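
The narrowing-through-a-signed-type trick used by decode_displacement() and decode_immediate() can be shown in isolation. A hedged sketch for little-endian immediates (sext_imm() is an invented name):

    #include <stdint.h>

    static int64_t
    sext_imm(const uint8_t *buf, int n)
    {
        if (n == 1)
            return ((int8_t)buf[0]);
        if (n == 2)
            return ((int16_t)(buf[0] | ((uint16_t)buf[1] << 8)));
        return ((int32_t)((uint32_t)buf[0] | ((uint32_t)buf[1] << 8) |
            ((uint32_t)buf[2] << 16) | ((uint32_t)buf[3] << 24)));
    }

For instance, the single byte 0xf0 decodes to -16, which is how an 83 /4 instruction carries a 64-bit mask like 0xfffffffffffffff0 in one byte.
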
+ */ + if (vie->opsize == 4 || vie->opsize == 8) + vie->imm_bytes = 4; + else + vie->imm_bytes = 2; + } else if (vie->op.op_flags & VIE_OP_F_IMM8) { + vie->imm_bytes = 1; + } + + if ((n = vie->imm_bytes) == 0) + return (0); + + KASSERT(n == 1 || n == 2 || n == 4, + ("%s: invalid number of immediate bytes: %d", __func__, n)); + + for (i = 0; i < n; i++) { + if (vie_peek(vie, &x)) + return (-1); + + u.buf[i] = x; + vie_advance(vie); + } + + /* sign-extend the immediate value before use */ + if (n == 1) + vie->immediate = u.signed8; + else if (n == 2) + vie->immediate = u.signed16; + else + vie->immediate = u.signed32; + + return (0); +} + +static int +decode_moffset(struct vie *vie) +{ + int i, n; + uint8_t x; + union { + char buf[8]; + uint64_t u64; + } u; + + if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0) + return (0); + + /* + * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM: + * The memory offset size follows the address-size of the instruction. + */ + n = vie->addrsize; + KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n)); + + u.u64 = 0; + for (i = 0; i < n; i++) { + if (vie_peek(vie, &x)) + return (-1); + + u.buf[i] = x; + vie_advance(vie); + } + vie->displacement = u.u64; + return (0); +} + +/* + * Verify that all the bytes in the instruction buffer were consumed. + */ +static int +verify_inst_length(struct vie *vie) +{ + + if (vie->num_processed) + return (0); + else + return (-1); +} + +/* + * Verify that the 'guest linear address' provided as collateral of the nested + * page table fault matches with our instruction decoding. + */ +static int +verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie) +{ + int error; + uint64_t base, idx, gla2; + + /* Skip 'gla' verification */ + if (gla == VIE_INVALID_GLA) + return (0); + + base = 0; + if (vie->base_register != VM_REG_LAST) { + error = vm_get_register(vm, cpuid, vie->base_register, &base); + if (error) { + printf("verify_gla: error %d getting base reg %d\n", + error, vie->base_register); + return (-1); + } + + /* + * RIP-relative addressing starts from the following + * instruction + */ + if (vie->base_register == VM_REG_GUEST_RIP) + base += vie->num_valid; + } + + idx = 0; + if (vie->index_register != VM_REG_LAST) { + error = vm_get_register(vm, cpuid, vie->index_register, &idx); + if (error) { + printf("verify_gla: error %d getting index reg %d\n", + error, vie->index_register); + return (-1); + } + } + + /* XXX assuming that the base address of the segment is 0 */ + gla2 = base + vie->scale * idx + vie->displacement; + gla2 &= size2mask[vie->addrsize]; + if (gla != gla2) { + printf("verify_gla mismatch: " + "base(0x%0lx), scale(%d), index(0x%0lx), " + "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n", + base, vie->scale, idx, vie->displacement, gla, gla2); + return (-1); + } + + return (0); +} + +int +vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, + enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie) +{ + + if (decode_prefixes(vie, cpu_mode, cs_d)) + return (-1); + + if (decode_opcode(vie)) + return (-1); + + if (decode_modrm(vie, cpu_mode)) + return (-1); + + if (decode_sib(vie)) + return (-1); + + if (decode_displacement(vie)) + return (-1); + + if (decode_immediate(vie)) + return (-1); + + if (decode_moffset(vie)) + return (-1); + + if (verify_inst_length(vie)) + return (-1); + + if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) { + if (verify_gla(vm, cpuid, gla, vie)) + return (-1); + } + + vie->decoded = 1; /* success */ + + return (0); +} +#endif /* _KERNEL */ 
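
verify_gla() above recomputes the linear address from the decoded components and compares it against the one reported by the hardware. Its core, distilled into a standalone sketch with invented names:

    #include <stdbool.h>
    #include <stdint.h>

    static bool
    gla_matches(uint64_t hw_gla, uint64_t base, uint64_t idx, int scale,
        int64_t disp, uint64_t addrmask)
    {
        uint64_t gla2;

        gla2 = (base + (uint64_t)scale * idx + (uint64_t)disp) & addrmask;
        return (gla2 == hw_gla);
    }

For RIP-relative operands the caller first biases 'base' by the instruction length, as verify_gla() does with vie->num_valid, since RIP-relative addressing is relative to the following instruction.
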
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_ioport.c b/usr/src/uts/i86pc/io/vmm/vmm_ioport.c new file mode 100644 index 0000000000..bea750f162 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_ioport.c @@ -0,0 +1,174 @@ +/*- + * Copyright (c) 2014 Tycho Nightingale + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_ioport.c 277168 2015-01-14 07:18:51Z neel $"); + +#include +#include +#include +#include +#include + +#include +#include + +#include "vatpic.h" +#include "vatpit.h" +#include "vmm_ioport.h" +#include "vmm_ktr.h" + +#define MAX_IOPORTS 1280 + +ioport_handler_func_t ioport_handler[MAX_IOPORTS] = { + [TIMER_MODE] = vatpit_handler, + [TIMER_CNTR0] = vatpit_handler, + [TIMER_CNTR1] = vatpit_handler, + [TIMER_CNTR2] = vatpit_handler, + [NMISC_PORT] = vatpit_nmisc_handler, + [IO_ICU1] = vatpic_master_handler, + [IO_ICU1 + ICU_IMR_OFFSET] = vatpic_master_handler, + [IO_ICU2] = vatpic_slave_handler, + [IO_ICU2 + ICU_IMR_OFFSET] = vatpic_slave_handler, + [IO_ELCR1] = vatpic_elc_handler, + [IO_ELCR2] = vatpic_elc_handler, +}; + +#ifdef KTR +static const char * +inout_instruction(struct vm_exit *vmexit) +{ + int index; + + static const char *iodesc[] = { + "outb", "outw", "outl", + "inb", "inw", "inl", + "outsb", "outsw", "outsd", + "insb", "insw", "insd", + }; + + switch (vmexit->u.inout.bytes) { + case 1: + index = 0; + break; + case 2: + index = 1; + break; + default: + index = 2; + break; + } + + if (vmexit->u.inout.in) + index += 3; + + if (vmexit->u.inout.string) + index += 6; + + KASSERT(index < nitems(iodesc), ("%s: invalid index %d", + __func__, index)); + + return (iodesc[index]); +} +#endif /* KTR */ + +static int +emulate_inout_port(struct vm *vm, int vcpuid, struct vm_exit *vmexit, + bool *retu) +{ + ioport_handler_func_t handler; + uint32_t mask, val; + int error; + + /* + * If there is no handler for the I/O port then punt to userspace. 
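
The dispatch described in this comment is a sparse table of function pointers indexed by port number, with a miss meaning "hand the exit to userspace". A hedged standalone sketch of that shape (names invented; the real handler signature also carries the vm and vcpu):

    #include <stddef.h>
    #include <stdint.h>

    #define MAX_PORTS   1280
    typedef int (*port_handler_t)(int in, int port, int bytes,
        uint32_t *val);

    static port_handler_t port_handlers[MAX_PORTS];

    /* Returns 1 to stand in for "punt to userspace". */
    static int
    dispatch_inout(int in, int port, int bytes, uint32_t *val)
    {
        port_handler_t h;

        if (port < 0 || port >= MAX_PORTS ||
            (h = port_handlers[port]) == NULL)
            return (1);
        return (h(in, port, bytes, val));
    }
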
+ */ + if (vmexit->u.inout.port >= MAX_IOPORTS || + (handler = ioport_handler[vmexit->u.inout.port]) == NULL) { + *retu = true; + return (0); + } + + mask = vie_size2mask(vmexit->u.inout.bytes); + + if (!vmexit->u.inout.in) { + val = vmexit->u.inout.eax & mask; + } + + error = (*handler)(vm, vcpuid, vmexit->u.inout.in, + vmexit->u.inout.port, vmexit->u.inout.bytes, &val); + if (error) { + /* + * The value returned by this function is also the return value + * of vm_run(). This needs to be a positive number otherwise it + * can be interpreted as a "pseudo-error" like ERESTART. + * + * Enforce this by mapping all errors to EIO. + */ + return (EIO); + } + + if (vmexit->u.inout.in) { + vmexit->u.inout.eax &= ~mask; + vmexit->u.inout.eax |= val & mask; + error = vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX, + vmexit->u.inout.eax); + KASSERT(error == 0, ("emulate_ioport: error %d setting guest " + "rax register", error)); + } + *retu = false; + return (0); +} + +static int +emulate_inout_str(struct vm *vm, int vcpuid, struct vm_exit *vmexit, bool *retu) +{ + *retu = true; + return (0); /* Return to userspace to finish emulation */ +} + +int +vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vmexit, bool *retu) +{ + int bytes, error; + + bytes = vmexit->u.inout.bytes; + KASSERT(bytes == 1 || bytes == 2 || bytes == 4, + ("vm_handle_inout: invalid operand size %d", bytes)); + + if (vmexit->u.inout.string) + error = emulate_inout_str(vm, vcpuid, vmexit, retu); + else + error = emulate_inout_port(vm, vcpuid, vmexit, retu); + + VCPU_CTR4(vm, vcpuid, "%s%s 0x%04x: %s", + vmexit->u.inout.rep ? "rep " : "", + inout_instruction(vmexit), + vmexit->u.inout.port, + error ? "error" : (*retu ? "userspace" : "handled")); + + return (error); +} diff --git a/usr/src/uts/i86pc/io/vmm/vmm_ioport.h b/usr/src/uts/i86pc/io/vmm/vmm_ioport.h new file mode 100644 index 0000000000..624dd8f1d8 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_ioport.h @@ -0,0 +1,37 @@ +/*- + * Copyright (c) 2014 Tycho Nightingale + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD: head/sys/amd64/vmm/vmm_ioport.h 273706 2014-10-26 19:03:06Z neel $ + */ + +#ifndef _VMM_IOPORT_H_ +#define _VMM_IOPORT_H_ + +typedef int (*ioport_handler_func_t)(struct vm *vm, int vcpuid, + bool in, int port, int bytes, uint32_t *val); + +int vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme, bool *retu); + +#endif /* _VMM_IOPORT_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/vmm_ipi.h b/usr/src/uts/i86pc/io/vmm/vmm_ipi.h new file mode 100644 index 0000000000..4dff03ba1f --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_ipi.h @@ -0,0 +1,37 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/vmm_ipi.h 260466 2014-01-09 03:25:54Z neel $ + */ + +#ifndef _VMM_IPI_H_ +#define _VMM_IPI_H_ + +#ifdef __FreeBSD__ +int vmm_ipi_alloc(void); +void vmm_ipi_free(int num); +#endif + +#endif diff --git a/usr/src/uts/i86pc/io/vmm/vmm_ktr.h b/usr/src/uts/i86pc/io/vmm/vmm_ktr.h new file mode 100644 index 0000000000..917c7f83a4 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_ktr.h @@ -0,0 +1,69 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/vmm_ktr.h 258699 2013-11-27 22:18:08Z neel $ + */ + +#ifndef _VMM_KTR_H_ +#define _VMM_KTR_H_ + +#include +#include + +#ifndef KTR_VMM +#define KTR_VMM KTR_GEN +#endif + +#define VCPU_CTR0(vm, vcpuid, format) \ +CTR2(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid)) + +#define VCPU_CTR1(vm, vcpuid, format, p1) \ +CTR3(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), (p1)) + +#define VCPU_CTR2(vm, vcpuid, format, p1, p2) \ +CTR4(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), (p1), (p2)) + +#define VCPU_CTR3(vm, vcpuid, format, p1, p2, p3) \ +CTR5(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), (p1), (p2), (p3)) + +#define VCPU_CTR4(vm, vcpuid, format, p1, p2, p3, p4) \ +CTR6(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), \ + (p1), (p2), (p3), (p4)) + +#define VM_CTR0(vm, format) \ +CTR1(KTR_VMM, "vm %s: " format, vm_name((vm))) + +#define VM_CTR1(vm, format, p1) \ +CTR2(KTR_VMM, "vm %s: " format, vm_name((vm)), (p1)) + +#define VM_CTR2(vm, format, p1, p2) \ +CTR3(KTR_VMM, "vm %s: " format, vm_name((vm)), (p1), (p2)) + +#define VM_CTR3(vm, format, p1, p2, p3) \ +CTR4(KTR_VMM, "vm %s: " format, vm_name((vm)), (p1), (p2), (p3)) + +#define VM_CTR4(vm, format, p1, p2, p3, p4) \ +CTR5(KTR_VMM, "vm %s: " format, vm_name((vm)), (p1), (p2), (p3), (p4)) +#endif diff --git a/usr/src/uts/i86pc/io/vmm/vmm_lapic.c b/usr/src/uts/i86pc/io/vmm/vmm_lapic.c new file mode 100644 index 0000000000..3215c74a44 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_lapic.c @@ -0,0 +1,256 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD: head/sys/amd64/vmm/vmm_lapic.c 264509 2014-04-15 17:06:26Z tychon $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + */ + +#include +__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_lapic.c 264509 2014-04-15 17:06:26Z tychon $"); + +#include +#include +#include + +#include +#include + +#include +#include "vmm_ipi.h" +#include "vmm_ktr.h" +#include "vmm_lapic.h" +#include "vlapic.h" + +/* + * Some MSI message definitions + */ +#define MSI_X86_ADDR_MASK 0xfff00000 +#define MSI_X86_ADDR_BASE 0xfee00000 +#define MSI_X86_ADDR_RH 0x00000008 /* Redirection Hint */ +#define MSI_X86_ADDR_LOG 0x00000004 /* Destination Mode */ + +int +lapic_set_intr(struct vm *vm, int cpu, int vector, bool level) +{ + struct vlapic *vlapic; + + if (cpu < 0 || cpu >= VM_MAXCPU) + return (EINVAL); + + if (vector < 32 || vector > 255) + return (EINVAL); + + vlapic = vm_lapic(vm, cpu); + if (vlapic_set_intr_ready(vlapic, vector, level)) + vcpu_notify_event(vm, cpu, true); + return (0); +} + +int +lapic_set_local_intr(struct vm *vm, int cpu, int vector) +{ + struct vlapic *vlapic; + cpuset_t dmask; + int error; + + if (cpu < -1 || cpu >= VM_MAXCPU) + return (EINVAL); + + if (cpu == -1) + dmask = vm_active_cpus(vm); + else + CPU_SETOF(cpu, &dmask); + error = 0; + while ((cpu = CPU_FFS(&dmask)) != 0) { + cpu--; + CPU_CLR(cpu, &dmask); + vlapic = vm_lapic(vm, cpu); + error = vlapic_trigger_lvt(vlapic, vector); + if (error) + break; + } + + return (error); +} + +int +lapic_intr_msi(struct vm *vm, uint64_t addr, uint64_t msg) +{ + int delmode, vec; + uint32_t dest; + bool phys; + + VM_CTR2(vm, "lapic MSI addr: %#lx msg: %#lx", addr, msg); + + if ((addr & MSI_X86_ADDR_MASK) != MSI_X86_ADDR_BASE) { + VM_CTR1(vm, "lapic MSI invalid addr %#lx", addr); + return (-1); + } + + /* + * Extract the x86-specific fields from the MSI addr/msg + * params according to the Intel Arch spec, Vol3 Ch 10. + * + * The PCI specification does not support level triggered + * MSI/MSI-X so ignore trigger level in 'msg'. + * + * The 'dest' is interpreted as a logical APIC ID if both + * the Redirection Hint and Destination Mode are '1' and + * physical otherwise. + */ + dest = (addr >> 12) & 0xff; + phys = ((addr & (MSI_X86_ADDR_RH | MSI_X86_ADDR_LOG)) != + (MSI_X86_ADDR_RH | MSI_X86_ADDR_LOG)); + delmode = msg & APIC_DELMODE_MASK; + vec = msg & 0xff; + + VM_CTR3(vm, "lapic MSI %s dest %#x, vec %d", + phys ? 
"physical" : "logical", dest, vec); + + vlapic_deliver_intr(vm, LAPIC_TRIG_EDGE, dest, phys, delmode, vec); + return (0); +} + +static boolean_t +x2apic_msr(u_int msr) +{ + if (msr >= 0x800 && msr <= 0xBFF) + return (TRUE); + else + return (FALSE); +} + +static u_int +x2apic_msr_to_regoff(u_int msr) +{ + + return ((msr - 0x800) << 4); +} + +boolean_t +lapic_msr(u_int msr) +{ + + if (x2apic_msr(msr) || (msr == MSR_APICBASE)) + return (TRUE); + else + return (FALSE); +} + +int +lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval, bool *retu) +{ + int error; + u_int offset; + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, cpu); + + if (msr == MSR_APICBASE) { + *rval = vlapic_get_apicbase(vlapic); + error = 0; + } else { + offset = x2apic_msr_to_regoff(msr); + error = vlapic_read(vlapic, 0, offset, rval, retu); + } + + return (error); +} + +int +lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t val, bool *retu) +{ + int error; + u_int offset; + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, cpu); + + if (msr == MSR_APICBASE) { + error = vlapic_set_apicbase(vlapic, val); + } else { + offset = x2apic_msr_to_regoff(msr); + error = vlapic_write(vlapic, 0, offset, val, retu); + } + + return (error); +} + +int +lapic_mmio_write(void *vm, int cpu, uint64_t gpa, uint64_t wval, int size, + void *arg) +{ + int error; + uint64_t off; + struct vlapic *vlapic; + + off = gpa - DEFAULT_APIC_BASE; + + /* + * Memory mapped local apic accesses must be 4 bytes wide and + * aligned on a 16-byte boundary. + */ + if (size != 4 || off & 0xf) + return (EINVAL); + + vlapic = vm_lapic(vm, cpu); + error = vlapic_write(vlapic, 1, off, wval, arg); + return (error); +} + +int +lapic_mmio_read(void *vm, int cpu, uint64_t gpa, uint64_t *rval, int size, + void *arg) +{ + int error; + uint64_t off; + struct vlapic *vlapic; + + off = gpa - DEFAULT_APIC_BASE; + + /* + * Memory mapped local apic accesses should be aligned on a + * 16-byte boundary. They are also suggested to be 4 bytes + * wide, alas not all OSes follow suggestions. + */ + off &= ~3; + if (off & 0xf) + return (EINVAL); + + vlapic = vm_lapic(vm, cpu); + error = vlapic_read(vlapic, 1, off, rval, arg); + return (error); +} diff --git a/usr/src/uts/i86pc/io/vmm/vmm_lapic.h b/usr/src/uts/i86pc/io/vmm/vmm_lapic.h new file mode 100644 index 0000000000..ee47ee7783 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_lapic.h @@ -0,0 +1,87 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/vmm_lapic.h 259863 2013-12-25 06:46:31Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _VMM_LAPIC_H_ +#define _VMM_LAPIC_H_ + +struct vm; + +boolean_t lapic_msr(u_int num); +int lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval, + bool *retu); +int lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t wval, + bool *retu); + +int lapic_mmio_read(void *vm, int cpu, uint64_t gpa, + uint64_t *rval, int size, void *arg); +int lapic_mmio_write(void *vm, int cpu, uint64_t gpa, + uint64_t wval, int size, void *arg); + +/* + * Signals to the LAPIC that an interrupt at 'vector' needs to be generated + * to the 'cpu', the state is recorded in IRR. + */ +int lapic_set_intr(struct vm *vm, int cpu, int vector, bool trig); + +#define LAPIC_TRIG_LEVEL true +#define LAPIC_TRIG_EDGE false +static __inline int +lapic_intr_level(struct vm *vm, int cpu, int vector) +{ + + return (lapic_set_intr(vm, cpu, vector, LAPIC_TRIG_LEVEL)); +} + +static __inline int +lapic_intr_edge(struct vm *vm, int cpu, int vector) +{ + + return (lapic_set_intr(vm, cpu, vector, LAPIC_TRIG_EDGE)); +} + +/* + * Triggers the LAPIC local interrupt (LVT) 'vector' on 'cpu'. 'cpu' can + * be set to -1 to trigger the interrupt on all CPUs. + */ +int lapic_set_local_intr(struct vm *vm, int cpu, int vector); + +int lapic_intr_msi(struct vm *vm, uint64_t addr, uint64_t msg); + +#endif diff --git a/usr/src/uts/i86pc/io/vmm/vmm_mem.h b/usr/src/uts/i86pc/io/vmm/vmm_mem.h new file mode 100644 index 0000000000..05dc37fb9a --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_mem.h @@ -0,0 +1,49 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/vmm_mem.h 245678 2013-01-20 03:42:49Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _VMM_MEM_H_ +#define _VMM_MEM_H_ + +int vmm_mem_init(void); +vm_paddr_t vmm_mem_alloc(size_t size); +void vmm_mem_free(vm_paddr_t start, size_t size); +vm_paddr_t vmm_mem_maxaddr(void); + +#endif diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c new file mode 100644 index 0000000000..79e1cb1a44 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c @@ -0,0 +1,1040 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Pluribus Networks Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +/* + * struct modctl in contains "void *__unused". + * Do this ugly workaround to avoid it. + */ +#undef __unused +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include "io/vatpic.h" +#include "io/vioapic.h" +#include "vmm_lapic.h" + +static dev_info_t *vmm_dip; +static void *vmm_statep; + +static SLIST_HEAD(, vmm_softc) head; + +static kmutex_t vmmdev_mtx; + +/* + * vmm trace ring + */ +int vmm_dmsg_ring_size = VMM_DMSG_RING_SIZE; +static vmm_trace_rbuf_t *vmm_debug_rbuf; +static vmm_trace_dmsg_t *vmm_trace_dmsg_alloc(void); +static void vmm_trace_dmsg_free(void); +static void vmm_trace_rbuf_alloc(void); +static void vmm_trace_rbuf_free(void); + +/* + * This routine is used to manage debug messages + * on ring buffer. + */ +static vmm_trace_dmsg_t * +vmm_trace_dmsg_alloc(void) +{ + vmm_trace_dmsg_t *dmsg_alloc, *dmsg = vmm_debug_rbuf->dmsgp; + + if (vmm_debug_rbuf->looped == TRUE) { + vmm_debug_rbuf->dmsgp = dmsg->next; + return (vmm_debug_rbuf->dmsgp); + } + + /* + * If we're looping for the first time, + * connect the ring. 
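
The trace ring grows as a singly linked list until it reaches its size budget, then the tail is pointed back at the head and entries are recycled. A self-contained sketch of that append-or-reuse policy (struct and function names invented, calloc standing in for kmem_zalloc):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdlib.h>

    struct tnode {
        struct tnode *next;
        char buf[128];
    };

    struct tring {
        struct tnode *head, *cur;
        size_t size, maxsize;
        bool looped;
    };

    static struct tnode *
    ring_next(struct tring *r)
    {
        struct tnode *n;

        if (r->looped) {                /* steady state: reuse entries */
            r->cur = r->cur->next;
            return (r->cur);
        }
        if (r->size + sizeof (*n) > r->maxsize && r->head != NULL) {
            r->cur->next = r->head;     /* connect the ring */
            r->cur = r->head;
            r->looped = true;
            return (r->cur);
        }
        if ((n = calloc(1, sizeof (*n))) == NULL)
            return (NULL);              /* caller drops the message */
        r->size += sizeof (*n);
        if (r->head == NULL)
            r->head = n;
        else
            r->cur->next = n;
        r->cur = n;
        return (n);
    }
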
+	 */
+	if (((vmm_debug_rbuf->size + (sizeof (vmm_trace_dmsg_t))) >
+	    vmm_debug_rbuf->maxsize) && (vmm_debug_rbuf->dmsgh != NULL)) {
+		dmsg->next = vmm_debug_rbuf->dmsgh;
+		vmm_debug_rbuf->dmsgp = vmm_debug_rbuf->dmsgh;
+		vmm_debug_rbuf->looped = TRUE;
+		return (vmm_debug_rbuf->dmsgp);
+	}
+
+	/* If we've gotten this far then memory allocation is needed */
+	dmsg_alloc = kmem_zalloc(sizeof (vmm_trace_dmsg_t), KM_NOSLEEP);
+	if (dmsg_alloc == NULL) {
+		vmm_debug_rbuf->allocfailed++;
+		return (dmsg_alloc);
+	} else {
+		vmm_debug_rbuf->size += sizeof (vmm_trace_dmsg_t);
+	}
+
+	if (vmm_debug_rbuf->dmsgp != NULL) {
+		dmsg->next = dmsg_alloc;
+		vmm_debug_rbuf->dmsgp = dmsg->next;
+		return (vmm_debug_rbuf->dmsgp);
+	} else {
+		/*
+		 * We should only be here if we're initializing
+		 * the ring buffer.
+		 */
+		if (vmm_debug_rbuf->dmsgh == NULL) {
+			vmm_debug_rbuf->dmsgh = dmsg_alloc;
+		} else {
+			/* Something is wrong */
+			kmem_free(dmsg_alloc, sizeof (vmm_trace_dmsg_t));
+			return (NULL);
+		}
+
+		vmm_debug_rbuf->dmsgp = dmsg_alloc;
+		return (vmm_debug_rbuf->dmsgp);
+	}
+}
+
+/*
+ * Free all messages on debug ring buffer.
+ */
+static void
+vmm_trace_dmsg_free(void)
+{
+	vmm_trace_dmsg_t *dmsg_next, *dmsg = vmm_debug_rbuf->dmsgh;
+
+	while (dmsg != NULL) {
+		dmsg_next = dmsg->next;
+		kmem_free(dmsg, sizeof (vmm_trace_dmsg_t));
+
+		/*
+		 * If we've looped around the ring then we're done.
+		 */
+		if (dmsg_next == vmm_debug_rbuf->dmsgh) {
+			break;
+		} else {
+			dmsg = dmsg_next;
+		}
+	}
+}
+
+static void
+vmm_trace_rbuf_alloc(void)
+{
+	vmm_debug_rbuf = kmem_zalloc(sizeof (vmm_trace_rbuf_t), KM_SLEEP);
+
+	mutex_init(&vmm_debug_rbuf->lock, NULL, MUTEX_DRIVER, NULL);
+
+	if (vmm_dmsg_ring_size > 0) {
+		vmm_debug_rbuf->maxsize = vmm_dmsg_ring_size;
+	}
+}
+
+static void
+vmm_trace_rbuf_free(void)
+{
+	vmm_trace_dmsg_free();
+	mutex_destroy(&vmm_debug_rbuf->lock);
+	kmem_free(vmm_debug_rbuf, sizeof (vmm_trace_rbuf_t));
+}
+
+static void
+vmm_vtrace_log(const char *fmt, va_list ap)
+{
+	vmm_trace_dmsg_t *dmsg;
+
+	if (vmm_debug_rbuf == NULL) {
+		return;
+	}
+
+	/*
+	 * If max size of ring buffer is smaller than size
+	 * required for one debug message then just return
+	 * since we have no room for the debug message.
+	 */
+	if (vmm_debug_rbuf->maxsize < (sizeof (vmm_trace_dmsg_t))) {
+		return;
+	}
+
+	mutex_enter(&vmm_debug_rbuf->lock);
+
+	/* alloc or reuse on ring buffer */
+	dmsg = vmm_trace_dmsg_alloc();
+
+	if (dmsg == NULL) {
+		/* resource allocation failed */
+		mutex_exit(&vmm_debug_rbuf->lock);
+		return;
+	}
+
+	gethrestime(&dmsg->timestamp);
+
+	(void) vsnprintf(dmsg->buf, sizeof (dmsg->buf), fmt, ap);
+
+	mutex_exit(&vmm_debug_rbuf->lock);
+}
+
+void
+vmm_trace_log(const char *fmt, ...)
+{ + va_list ap; + + va_start(ap, fmt); + vmm_vtrace_log(fmt, ap); + va_end(ap); +} + +void +vmmdev_init(void) +{ + vmm_trace_rbuf_alloc(); +} + +int +vmmdev_cleanup(void) +{ + int error; + + if (SLIST_EMPTY(&head)) + error = 0; + else + error = EBUSY; + + if (error == 0) + vmm_trace_dmsg_free(); + + return (error); +} + +int +vmmdev_do_ioctl(struct vmm_softc *sc, int cmd, intptr_t arg, int mode, + cred_t *credp, int *rvalp) +{ + int error, vcpu, state_changed; + struct vm_memory_segment seg; + struct vm_register vmreg; + struct vm_seg_desc vmsegdesc; + struct vm_run vmrun; + struct vm_exception vmexc; + struct vm_lapic_irq vmirq; + struct vm_lapic_msi vmmsi; + struct vm_ioapic_irq ioapic_irq; + struct vm_isa_irq isa_irq; + struct vm_capability vmcap; + struct vm_nmi vmnmi; + struct vm_x2apic x2apic; + struct vm_gla2gpa gg; + struct vm_activate_cpu vac; + int pincount; + int i; + + vcpu = -1; + state_changed = 0; + + /* + * Some VMM ioctls can operate only on vcpus that are not running. + */ + switch (cmd) { + case VM_RUN: + case VM_GET_REGISTER: + case VM_SET_REGISTER: + case VM_GET_SEGMENT_DESCRIPTOR: + case VM_SET_SEGMENT_DESCRIPTOR: + case VM_INJECT_EXCEPTION: + case VM_GET_CAPABILITY: + case VM_SET_CAPABILITY: + case VM_PPTDEV_MSI: + case VM_PPTDEV_MSIX: + case VM_SET_X2APIC_STATE: + case VM_GLA2GPA: + case VM_ACTIVATE_CPU: + case VM_RESTART_INSTRUCTION: + /* + * XXX fragile, handle with care + * Assumes that the first field of the ioctl data is the vcpu. + */ + if (ddi_copyin((void *)arg, &vcpu, sizeof (vcpu), mode)) { + return (EFAULT); + } + if (vcpu < 0 || vcpu >= VM_MAXCPU) { + error = EINVAL; + goto done; + } + + error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true); + if (error) + goto done; + + state_changed = 1; + break; + case VM_MAP_MEMORY: + /* + * ioctls that operate on the entire virtual machine must + * prevent all vcpus from running. 
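+		 * Each vcpu is moved to VCPU_FROZEN in turn; if any
+		 * transition fails, the vcpus frozen so far are returned
+		 * to VCPU_IDLE before bailing out.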
+		 */
+		error = 0;
+		for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) {
+			error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true);
+			if (error)
+				break;
+		}
+
+		if (error) {
+			while (--vcpu >= 0)
+				vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false);
+			goto done;
+		}
+
+		state_changed = 2;
+		break;
+
+	default:
+		break;
+	}
+
+	switch (cmd) {
+	case VM_RUN:
+		if (ddi_copyin((void *)arg, &vmrun,
+		    sizeof (struct vm_run), mode)) {
+			return (EFAULT);
+		}
+		error = vm_run(sc->vm, &vmrun);
+		if (ddi_copyout(&vmrun, (void *)arg,
+		    sizeof (struct vm_run), mode)) {
+			return (EFAULT);
+		}
+		break;
+	case VM_LAPIC_IRQ:
+		if (ddi_copyin((void *)arg, &vmirq,
+		    sizeof (struct vm_lapic_irq), mode)) {
+			return (EFAULT);
+		}
+		error = lapic_intr_edge(sc->vm, vmirq.cpuid, vmirq.vector);
+		if (ddi_copyout(&vmirq, (void *)arg,
+		    sizeof (struct vm_lapic_irq), mode)) {
+			return (EFAULT);
+		}
+		break;
+	case VM_LAPIC_LOCAL_IRQ:
+		if (ddi_copyin((void *)arg, &vmirq,
+		    sizeof (struct vm_lapic_irq), mode)) {
+			return (EFAULT);
+		}
+		error = lapic_set_local_intr(sc->vm, vmirq.cpuid,
+		    vmirq.vector);
+		if (ddi_copyout(&vmirq, (void *)arg,
+		    sizeof (struct vm_lapic_irq), mode)) {
+			return (EFAULT);
+		}
+		break;
+	case VM_LAPIC_MSI:
+		if (ddi_copyin((void *)arg, &vmmsi,
+		    sizeof (struct vm_lapic_msi), mode)) {
+			return (EFAULT);
+		}
+		error = lapic_intr_msi(sc->vm, vmmsi.addr, vmmsi.msg);
+		if (ddi_copyout(&vmmsi, (void *)arg,
+		    sizeof (struct vm_lapic_msi), mode)) {
+			return (EFAULT);
+		}
+		break;
+	case VM_IOAPIC_ASSERT_IRQ:
+		if (ddi_copyin((void *)arg, &ioapic_irq,
+		    sizeof (struct vm_ioapic_irq), mode)) {
+			return (EFAULT);
+		}
+		error = vioapic_assert_irq(sc->vm, ioapic_irq.irq);
+		if (ddi_copyout(&ioapic_irq, (void *)arg,
+		    sizeof (struct vm_ioapic_irq), mode)) {
+			return (EFAULT);
+		}
+		break;
+	case VM_IOAPIC_DEASSERT_IRQ:
+		if (ddi_copyin((void *)arg, &ioapic_irq,
+		    sizeof (struct vm_ioapic_irq), mode)) {
+			return (EFAULT);
+		}
+		error = vioapic_deassert_irq(sc->vm, ioapic_irq.irq);
+		if (ddi_copyout(&ioapic_irq, (void *)arg,
+		    sizeof (struct vm_ioapic_irq), mode)) {
+			return (EFAULT);
+		}
+		break;
+	case VM_IOAPIC_PULSE_IRQ:
+		if (ddi_copyin((void *)arg, &ioapic_irq,
+		    sizeof (struct vm_ioapic_irq), mode)) {
+			return (EFAULT);
+		}
+		error = vioapic_pulse_irq(sc->vm, ioapic_irq.irq);
+		if (ddi_copyout(&ioapic_irq, (void *)arg,
+		    sizeof (struct vm_ioapic_irq), mode)) {
+			return (EFAULT);
+		}
+		break;
+	case VM_IOAPIC_PINCOUNT:
+		error = 0;
+		pincount = vioapic_pincount(sc->vm);
+		if (ddi_copyout(&pincount, (void *)arg, sizeof (int), mode)) {
+			return (EFAULT);
+		}
+		break;
+	case VM_ISA_ASSERT_IRQ:
+		if (ddi_copyin((void *)arg, &isa_irq,
+		    sizeof (struct vm_isa_irq), mode)) {
+			return (EFAULT);
+		}
+		error = vatpic_assert_irq(sc->vm, isa_irq.atpic_irq);
+		if (error == 0 && isa_irq.ioapic_irq != -1)
+			error = vioapic_assert_irq(sc->vm,
+			    isa_irq.ioapic_irq);
+		if (ddi_copyout(&isa_irq, (void *)arg,
+		    sizeof (struct vm_isa_irq), mode)) {
+			return (EFAULT);
+		}
+		break;
+	case VM_ISA_DEASSERT_IRQ:
+		if (ddi_copyin((void *)arg, &isa_irq,
+		    sizeof (struct vm_isa_irq), mode)) {
+			return (EFAULT);
+		}
+		error = vatpic_deassert_irq(sc->vm, isa_irq.atpic_irq);
+		if (error == 0 && isa_irq.ioapic_irq != -1)
+			error = vioapic_deassert_irq(sc->vm,
+			    isa_irq.ioapic_irq);
+		if (ddi_copyout(&isa_irq, (void *)arg,
+		    sizeof (struct vm_isa_irq), mode)) {
+			return (EFAULT);
+		}
+		break;
+	case VM_ISA_PULSE_IRQ:
+		if (ddi_copyin((void *)arg, &isa_irq,
+		    sizeof (struct vm_isa_irq), mode)) {
+			return (EFAULT);
+		}
+		error = vatpic_pulse_irq(sc->vm,
isa_irq.atpic_irq); + if (error == 0 && isa_irq.ioapic_irq != -1) + error = vioapic_pulse_irq(sc->vm, isa_irq.ioapic_irq); + if (ddi_copyout(&isa_irq, (void *)arg, + sizeof (struct vm_isa_irq), mode)) { + return (EFAULT); + + } + break; + case VM_MAP_MEMORY: + if (ddi_copyin((void *)arg, &seg, + sizeof (struct vm_memory_segment), mode)) { + return (EFAULT); + } + error = vm_malloc(sc->vm, seg.gpa, seg.len); + break; + case VM_GET_MEMORY_SEG: + if (ddi_copyin((void *)arg, &seg, + sizeof (struct vm_memory_segment), mode)) { + return (EFAULT); + } + seg.len = 0; + (void)vm_gpabase2memseg(sc->vm, seg.gpa, &seg); + if (ddi_copyout(&seg, (void *)arg, + sizeof (struct vm_memory_segment), mode)) { + return (EFAULT); + } + error = 0; + break; + case VM_GET_REGISTER: + if (ddi_copyin((void *)arg, &vmreg, + sizeof (struct vm_register), mode)) { + return (EFAULT); + } + error = vm_get_register(sc->vm, vmreg.cpuid, vmreg.regnum, + &vmreg.regval); + if (!error) { + if (ddi_copyout(&vmreg, (void *)arg, + sizeof (struct vm_register), mode)) { + return (EFAULT); + } + } + break; + case VM_SET_REGISTER: + if (ddi_copyin((void *)arg, &vmreg, + sizeof (struct vm_register), mode)) { + return (EFAULT); + } + error = vm_set_register(sc->vm, vmreg.cpuid, vmreg.regnum, + vmreg.regval); + break; + case VM_SET_SEGMENT_DESCRIPTOR: + if (ddi_copyin((void *)arg, &vmsegdesc, + sizeof (struct vm_seg_desc), mode)) { + return (EFAULT); + } + error = vm_set_seg_desc(sc->vm, vmsegdesc.cpuid, + vmsegdesc.regnum, + &vmsegdesc.desc); + break; + case VM_GET_SEGMENT_DESCRIPTOR: + if (ddi_copyin((void *)arg, &vmsegdesc, + sizeof (struct vm_seg_desc), mode)) { + return (EFAULT); + } + error = vm_get_seg_desc(sc->vm, vmsegdesc.cpuid, + vmsegdesc.regnum, + &vmsegdesc.desc); + if (!error) { + if (ddi_copyout(&vmsegdesc, (void *)arg, + sizeof (struct vm_seg_desc), mode)) { + return (EFAULT); + } + } + break; + case VM_GET_CAPABILITY: + if (ddi_copyin((void *)arg, &vmcap, + sizeof (struct vm_capability), mode)) { + return (EFAULT); + } + error = vm_get_capability(sc->vm, vmcap.cpuid, + vmcap.captype, + &vmcap.capval); + if (!error) { + if (ddi_copyout(&vmcap, (void *)arg, + sizeof (struct vm_capability), mode)) { + return (EFAULT); + } + } + break; + case VM_SET_CAPABILITY: + if (ddi_copyin((void *)arg, &vmcap, + sizeof (struct vm_capability), mode)) { + return (EFAULT); + } + error = vm_set_capability(sc->vm, vmcap.cpuid, + vmcap.captype, + vmcap.capval); + break; + case VM_SET_X2APIC_STATE: + if (ddi_copyin((void *)arg, &x2apic, + sizeof (struct vm_x2apic), mode)) { + return (EFAULT); + } + error = vm_set_x2apic_state(sc->vm, + x2apic.cpuid, x2apic.state); + break; + case VM_GET_X2APIC_STATE: + if (ddi_copyin((void *)arg, &x2apic, + sizeof (struct vm_x2apic), mode)) { + return (EFAULT); + } + error = vm_get_x2apic_state(sc->vm, + x2apic.cpuid, &x2apic.state); + if (!error) { + if (ddi_copyout(&x2apic, (void *)arg, + sizeof (struct vm_x2apic), mode)) { + return (EFAULT); + } + } + break; + case VM_GLA2GPA: { + CTASSERT(PROT_READ == VM_PROT_READ); + CTASSERT(PROT_WRITE == VM_PROT_WRITE); + CTASSERT(PROT_EXEC == VM_PROT_EXECUTE); + if (ddi_copyin((void *)arg, &gg, + sizeof (struct vm_gla2gpa), mode)) { + return (EFAULT); + } + error = vm_gla2gpa(sc->vm, gg.vcpuid, &gg.paging, gg.gla, + gg.prot, &gg.gpa); + KASSERT(error == 0 || error == 1 || error == -1, + ("%s: vm_gla2gpa unknown error %d", __func__, error)); + if (error >= 0) { + /* + * error = 0: the translation was successful + * error = 1: a fault was injected into the guest + */ + 
gg.fault = error;
+			error = 0;
+			if (ddi_copyout(&gg, (void *)arg,
+			    sizeof (struct vm_gla2gpa), mode)) {
+				return (EFAULT);
+			}
+		} else {
+			error = EFAULT;
+		}
+		break;
+	}
+	case VM_ACTIVATE_CPU:
+		if (ddi_copyin((void *)arg, &vac,
+		    sizeof (struct vm_activate_cpu), mode)) {
+			return (EFAULT);
+		}
+		error = vm_activate_cpu(sc->vm, vac.vcpuid);
+		break;
+	case VM_RESTART_INSTRUCTION:
+		error = vm_restart_instruction(sc->vm, vcpu);
+		break;
+	default:
+		error = ENOTTY;
+		break;
+	}
+
+	if (state_changed == 1) {
+		vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false);
+	} else if (state_changed == 2) {
+		for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++)
+			vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false);
+	}
+
+done:
+	/* Make sure that no handler returns a bogus value like ERESTART */
+	KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error));
+	return (error);
+}
+
+static minor_t
+vmm_find_free_minor(void)
+{
+	minor_t minor;
+
+	for (minor = 1; ; minor++) {
+		if (ddi_get_soft_state(vmm_statep, minor) == NULL)
+			break;
+	}
+
+	return (minor);
+}
+
+int
+vmmdev_do_vm_create(dev_info_t *dip, char *name)
+{
+	struct vmm_softc *sc;
+	minor_t minor;
+	int error;
+
+	mutex_enter(&vmmdev_mtx);
+
+	if (strlen(name) >= VM_MAX_NAMELEN) {
+		mutex_exit(&vmmdev_mtx);
+		return (EINVAL);
+	}
+
+	minor = vmm_find_free_minor();
+	if (ddi_soft_state_zalloc(vmm_statep, minor) == DDI_FAILURE) {
+		mutex_exit(&vmmdev_mtx);
+		return (DDI_FAILURE);
+	}
+
+	if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
+		ddi_soft_state_free(vmm_statep, minor);
+		mutex_exit(&vmmdev_mtx);
+		return (DDI_FAILURE);
+	}
+	(void) strcpy(sc->name, name);
+	sc->minor = minor;
+
+	if (ddi_create_minor_node(dip, name, S_IFCHR, minor,
+	    DDI_PSEUDO, 0) == DDI_FAILURE) {
+		ddi_soft_state_free(vmm_statep, minor);
+		mutex_exit(&vmmdev_mtx);
+		return (DDI_FAILURE);
+	}
+
+	error = vm_create(name, &sc->vm);
+	if (error != 0) {
+		ddi_soft_state_free(vmm_statep, minor);
+		ddi_remove_minor_node(dip, name);
+		mutex_exit(&vmmdev_mtx);
+		return (error);
+	}
+	SLIST_INSERT_HEAD(&head, sc, link);
+
+	mutex_exit(&vmmdev_mtx);
+
+	return (0);
+}
+
+static struct vmm_softc *
+vmm_lookup(char *name)
+{
+	struct vmm_softc *sc;
+
+	SLIST_FOREACH(sc, &head, link) {
+		if (strcmp(sc->name, name) == 0) {
+			break;
+		}
+	}
+
+	return (sc);
+}
+
+struct vm *
+vm_lookup_by_name(char *name)
+{
+	struct vmm_softc *sc;
+
+	mutex_enter(&vmmdev_mtx);
+
+	if ((sc = vmm_lookup(name)) == NULL) {
+		mutex_exit(&vmmdev_mtx);
+		return (NULL);
+	}
+
+	mutex_exit(&vmmdev_mtx);
+
+	return (sc->vm);
+}
+
+int
+vmmdev_do_vm_destroy(dev_info_t *dip, char *name)
+{
+	struct vmm_softc *sc;
+	dev_info_t *pdip = ddi_get_parent(dip);
+
+	mutex_enter(&vmmdev_mtx);
+
+	if ((sc = vmm_lookup(name)) == NULL) {
+		mutex_exit(&vmmdev_mtx);
+		return (ENOENT);
+	}
+
+	if (sc->open) {
+		mutex_exit(&vmmdev_mtx);
+		return (EBUSY);
+	}
+
+	vm_destroy(sc->vm);
+	SLIST_REMOVE(&head, sc, vmm_softc, link);
+	ddi_remove_minor_node(dip, name);
+	ddi_soft_state_free(vmm_statep, sc->minor);
+	(void) devfs_clean(pdip, NULL, DV_CLEAN_FORCE);
+
+	mutex_exit(&vmmdev_mtx);
+
+	return (0);
+}
+
+int
+vmmdev_do_vm_mmap(struct vmm_softc *vmm_sc, off_t off, int nprot)
+{
+	vm_paddr_t paddr;
+
+	mutex_enter(&vmmdev_mtx);
+
+	paddr = vm_gpa2hpa(vmm_sc->vm, (vm_paddr_t)off, PAGE_SIZE);
+	if (paddr == -1) {
+		mutex_exit(&vmmdev_mtx);
+		return (-1);
+	}
+
+	mutex_exit(&vmmdev_mtx);
+
+	return (btop(paddr));
+}
+
+static int
+vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp)
+{
+	minor_t minor;
+	struct vmm_softc *sc;
+
+	minor =
getminor(*devp); + if (minor == VMM_CTL_MINOR) { + /* + * Master control device must be opened exclusively. + */ + if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) { + return (EINVAL); + } + + return (0); + } + + mutex_enter(&vmmdev_mtx); + sc = ddi_get_soft_state(vmm_statep, minor); + if (sc == NULL) { + mutex_exit(&vmmdev_mtx); + return (ENXIO); + } + + if (sc->open) { + mutex_exit(&vmmdev_mtx); + return (EBUSY); + } + sc->open = B_TRUE; + mutex_exit(&vmmdev_mtx); + + return (0); +} + +static int +vmm_close(dev_t dev, int flag, int otyp, cred_t *credp) +{ + minor_t minor; + struct vmm_softc *sc; + + minor = getminor(dev); + if (minor == VMM_CTL_MINOR) + return (0); + + mutex_enter(&vmmdev_mtx); + sc = ddi_get_soft_state(vmm_statep, minor); + if (sc == NULL) { + mutex_exit(&vmmdev_mtx); + return (ENXIO); + } + + sc->open = B_FALSE; + mutex_exit(&vmmdev_mtx); + + return (0); +} + +static int +vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rvalp) +{ + struct vmm_softc *sc; + struct vmm_ioctl kvi; + minor_t minor; + + minor = getminor(dev); + + if (minor == VMM_CTL_MINOR) { + if (ddi_copyin((void *)arg, &kvi, sizeof (struct vmm_ioctl), + mode)) { + return (EFAULT); + } + switch (cmd) { + case VMM_CREATE_VM: + if ((mode & FWRITE) == 0) + return (EPERM); + return (vmmdev_do_vm_create(vmm_dip, kvi.vmm_name)); + case VMM_DESTROY_VM: + if ((mode & FWRITE) == 0) + return (EPERM); + return (vmmdev_do_vm_destroy(vmm_dip, kvi.vmm_name)); + default: + break; + } + } + + sc = ddi_get_soft_state(vmm_statep, minor); + ASSERT(sc); + + return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp)); +} + +static int +vmm_mmap(dev_t dev, off_t off, int prot) +{ + struct vmm_softc *sc; + + sc = ddi_get_soft_state(vmm_statep, getminor(dev)); + ASSERT(sc); + + return (vmmdev_do_vm_mmap(sc, off, prot)); +} + +static int +vmm_segmap(dev_t dev, off_t off, struct as *as, + caddr_t *addrp, off_t len, unsigned int prot, + unsigned int maxprot, unsigned int flags, cred_t *credp) +{ + struct segdev_crargs dev_a; + int error; + + as_rangelock(as); + + error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); + if (error != 0) { + as_rangeunlock(as); + return (error); + } + + dev_a.mapfunc = vmm_mmap; + dev_a.dev = dev; + dev_a.offset = off; + dev_a.type = (flags & MAP_TYPE); + dev_a.prot = (uchar_t)prot; + dev_a.maxprot = (uchar_t)maxprot; + dev_a.hat_attr = 0; + dev_a.hat_flags = HAT_LOAD_NOCONSIST; + dev_a.devmap_data = NULL; + + error = as_map(as, *addrp, len, segdev_create, &dev_a); + + as_rangeunlock(as); + + return (error); +} + +static int +vmm_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) +{ + return (0); +} + +static int +vmm_probe(dev_info_t *dip) +{ + if (driver_installed(ddi_name_to_major("kvm"))) { + cmn_err(CE_WARN, "kvm is installed\n"); + return (DDI_PROBE_FAILURE); + } + + return (DDI_PROBE_SUCCESS); +} + +static int +vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + switch (cmd) { + case DDI_ATTACH: + break; + default: + return (DDI_FAILURE); + } + + if (vmm_mod_load()) { + return (DDI_FAILURE); + } + + vmm_dip = dip; + + /* + * Create control node. Other nodes will be created on demand. 
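+	 * Per-VM minor nodes are created by the VMM_CREATE_VM ioctl and
+	 * torn down again by VMM_DESTROY_VM.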
+	 */
+	if (ddi_create_minor_node(dip, VMM_CTL_MINOR_NODE, S_IFCHR,
+	    VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) {
+		return (DDI_FAILURE);
+	}
+
+	ddi_report_dev(dip);
+
+	return (DDI_SUCCESS);
+}
+
+static int
+vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+	switch (cmd) {
+	case DDI_DETACH:
+		break;
+	default:
+		return (DDI_FAILURE);
+	}
+
+	if (vmm_mod_unload()) {
+		return (DDI_FAILURE);
+	}
+
+	/*
+	 * Remove the control node.
+	 */
+	ddi_remove_minor_node(dip, VMM_CTL_MINOR_NODE);
+	vmm_dip = NULL;
+
+	return (DDI_SUCCESS);
+}
+
+static struct cb_ops vmm_cb_ops = {
+	vmm_open,
+	vmm_close,
+	nodev,		/* strategy */
+	nodev,		/* print */
+	nodev,		/* dump */
+	nodev,		/* read */
+	nodev,		/* write */
+	vmm_ioctl,
+	nodev,		/* devmap */
+	vmm_mmap,
+	vmm_segmap,
+	nochpoll,	/* poll */
+	ddi_prop_op,
+	NULL,
+	D_NEW | D_MP | D_DEVMAP
+};
+
+static struct dev_ops vmm_ops = {
+	DEVO_REV,
+	0,
+	ddi_no_info,
+	nulldev,	/* identify */
+	vmm_probe,
+	vmm_attach,
+	vmm_detach,
+	nodev,		/* reset */
+	&vmm_cb_ops,
+	(struct bus_ops *)NULL
+};
+
+static struct modldrv modldrv = {
+	&mod_driverops,
+	"vmm",
+	&vmm_ops
+};
+
+static struct modlinkage modlinkage = {
+	MODREV_1,
+	&modldrv,
+	NULL
+};
+
+int
+_init(void)
+{
+	int error;
+
+	mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL);
+
+	error = ddi_soft_state_init(&vmm_statep, sizeof (struct vmm_softc), 0);
+	if (error) {
+		return (error);
+	}
+
+	error = mod_install(&modlinkage);
+	if (error) {
+		ddi_soft_state_fini(&vmm_statep);
+	}
+
+	return (error);
+}
+
+int
+_fini(void)
+{
+	int error;
+
+	error = mod_remove(&modlinkage);
+	if (error) {
+		return (error);
+	}
+	ddi_soft_state_fini(&vmm_statep);
+
+	return (0);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&modlinkage, modinfop));
+}
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c
new file mode 100644
index 0000000000..6588f5a46d
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c
@@ -0,0 +1,779 @@
+/*
+ * Copyright (c) 2004 John Baldwin <jhb@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/kern/subr_sleepqueue.c 261520 2014-02-05 18:13:27Z jhb $
+ */
+/*-
+ * Copyright (c) 2004 Poul-Henning Kamp
+ * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/kern/subr_unit.c 255057 2013-08-30 07:37:45Z kib $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. 
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+vm_paddr_t
+pmap_kextract(vm_offset_t va)
+{
+	pfn_t pfn;
+
+	pfn = hat_getpfnum(kas.a_hat, (caddr_t)va);
+	ASSERT(pfn != PFN_INVALID);
+	return ((pfn << PAGE_SHIFT) | ((uintptr_t)va & PAGE_MASK));
+}
+
+int
+cpusetobj_ffs(const cpuset_t *set)
+{
+#if CPUSET_WORDS > 1
+	int i, cbit;
+
+	cbit = 0;
+	for (i = 0; i < CPUSET_WORDS; i++) {
+		if (set->cpub[i] != 0) {
+			cbit = ffsl(set->cpub[i]);
+			/* scale the word index to a bit offset */
+			cbit += i * sizeof (set->cpub[0]) * NBBY;
+			break;
+		}
+	}
+	return (cbit);
+#else
+	return (ffsl(*set));
+#endif
+}
+
+void
+smp_rendezvous(void (* setup_func)(void *),
+    void (* action_func)(void *),
+    void (* teardown_func)(void *),
+    void *arg)
+{
+	cpuset_t cpuset;
+
+	ASSERT(setup_func == NULL);
+	ASSERT(teardown_func == NULL);
+
+	CPUSET_ALL(cpuset);
+	xc_sync((xc_arg_t)arg, 0, 0, CPUSET2BV(cpuset), (xc_func_t)action_func);
+}
+
+struct kmem_item {
+	void			*addr;
+	size_t			size;
+	LIST_ENTRY(kmem_item)	next;
+};
+static kmutex_t kmem_items_lock;
+static LIST_HEAD(, kmem_item) kmem_items;
+
+void *
+malloc(unsigned long size, struct malloc_type *mtp, int flags)
+{
+	void *p;
+	struct kmem_item *i;
+	int kmem_flag = KM_SLEEP;
+
+	if (flags & M_NOWAIT)
+		kmem_flag = KM_NOSLEEP;
+
+	if (flags & M_ZERO) {
+		p = kmem_zalloc(size + sizeof (struct kmem_item), kmem_flag);
+	} else {
+		p = kmem_alloc(size + sizeof (struct kmem_item), kmem_flag);
+	}
+
+	/* KM_NOSLEEP allocations may fail; honor that for M_NOWAIT callers */
+	if (p == NULL)
+		return (NULL);
+
+	mutex_enter(&kmem_items_lock);
+	i = (struct kmem_item *)((caddr_t)p + size);
+	i->addr = p;
+	i->size = size;
+
+	LIST_INSERT_HEAD(&kmem_items, i, next);
+	mutex_exit(&kmem_items_lock);
+
+	return (p);
+}
+
+void
+free(void *addr, struct malloc_type *mtp)
+{
+	struct kmem_item *i;
+
+	mutex_enter(&kmem_items_lock);
+	LIST_FOREACH(i, &kmem_items, next) {
+		if (i->addr == addr)
+			break;
+	}
+	ASSERT(i != NULL);
+	LIST_REMOVE(i, next);
+	mutex_exit(&kmem_items_lock);
+
+	kmem_free(addr, i->size + sizeof (struct kmem_item));
+}
+
+void
+mtx_init(struct mtx *mtx, char *name, const char *type_name, int opts)
+{
+	if (opts & MTX_SPIN) {
+		mutex_init(&mtx->m, name, MUTEX_SPIN,
+		    (ddi_iblock_cookie_t)ipltospl(DISP_LEVEL));
+	} else {
+		mutex_init(&mtx->m, name, MUTEX_DRIVER, NULL);
+	}
+}
+
+void
+mtx_destroy(struct mtx *mtx)
+{
+	mutex_destroy(&mtx->m);
+}
+
+void
+critical_enter(void)
+{
+	kpreempt_disable();
+	thread_affinity_set(curthread, CPU_CURRENT);
+}
+
+void
+critical_exit(void)
+{
+	thread_affinity_clear(curthread);
+	kpreempt_enable();
+}
+
+struct unr {
+	u_int		item;
+	struct unr	*link;
+};
+
+#define	UNR_HASHSIZE	8
+
+struct unrhdr {
+	struct mtx	*mtx;
+	struct unr	*hash[UNR_HASHSIZE];
+	u_int		min;
+	u_int		max;
+	u_int		next;
+};
+
+#define	HASH_UNR(uh, i)	((uh)->hash[(i) & ((UNR_HASHSIZE) - 1)])
+
+static struct mtx unr_mtx;
+
+/*
+ * Allocate a new unrheader set.
+ *
+ * Highest and lowest valid values given as parameters.
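+ * If the caller does not supply a mutex, the allocator falls back to
+ * the global unr_mtx.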
+ */ +struct unrhdr * +new_unrhdr(int low, int high, struct mtx *mtx) +{ + struct unrhdr *uh; + + uh = kmem_zalloc(sizeof (struct unrhdr), KM_SLEEP); + if (mtx) { + uh->mtx = mtx; + } else { + uh->mtx = &unr_mtx; + } + uh->min = low; + uh->max = high; + uh->next = uh->min; + + return (uh); +} + +void +delete_unrhdr(struct unrhdr *uh) +{ + kmem_free(uh, sizeof (struct unrhdr)); +} + +static struct unr * +unr_lookup(struct unrhdr *uh, int item) +{ + struct unr *unr; + + ASSERT(MUTEX_HELD(&uh->mtx->m)); + + for (unr = HASH_UNR(uh, item); unr != NULL; unr = unr->link) { + if (unr->item == item) + break; + } + + return (unr); +} + +int +alloc_unr(struct unrhdr *uh) +{ + struct unr *unr; + int item, start; + + mutex_enter(&uh->mtx->m); + start = uh->next; + for (;;) { + item = uh->next; + if (++uh->next == uh->max) { + uh->next = uh->min; + } + + if (unr_lookup(uh, item) == NULL) { + unr = kmem_zalloc(sizeof (struct unr), KM_SLEEP); + unr->item = item; + unr->link = HASH_UNR(uh, item); + HASH_UNR(uh, item) = unr; + break; + } + + if (item == start) { + item = -1; + break; + } + } + mutex_exit(&uh->mtx->m); + + return (item); +} + +void +free_unr(struct unrhdr *uh, u_int item) +{ + struct unr *unr, **unrp; + + mutex_enter(&uh->mtx->m); + unrp = &HASH_UNR(uh, item); + for (;;) { + ASSERT(*unrp != NULL); + if ((*unrp)->item == item) + break; + unrp = &(*unrp)->link; + } + unr = *unrp; + *unrp = unr->link; + mutex_exit(&uh->mtx->m); + kmem_free(unr, sizeof(struct unr)); +} + + +static void +vmm_glue_callout_handler(void *arg) +{ + struct callout *c = arg; + + c->c_flags &= ~CALLOUT_PENDING; + if (c->c_flags & CALLOUT_ACTIVE) { + (c->c_func)(c->c_arg); + } +} + +void +vmm_glue_callout_init(struct callout *c, int mpsafe) +{ + cyc_handler_t hdlr; + cyc_time_t when; + + hdlr.cyh_level = CY_LOW_LEVEL; + hdlr.cyh_func = vmm_glue_callout_handler; + hdlr.cyh_arg = c; + when.cyt_when = CY_INFINITY; + when.cyt_interval = CY_INFINITY; + + mutex_enter(&cpu_lock); + c->c_cyc_id = cyclic_add(&hdlr, &when); + c->c_flags |= CALLOUT_ACTIVE; + mutex_exit(&cpu_lock); +} + +int +vmm_glue_callout_reset_sbt(struct callout *c, sbintime_t sbt, sbintime_t pr, + void (*func)(void *), void *arg, int flags) +{ + ASSERT(c->c_cyc_id != CYCLIC_NONE); + + c->c_func = func; + c->c_arg = arg; + c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING); + + if (flags & C_ABSOLUTE) + cyclic_reprogram(c->c_cyc_id, sbt); + else + cyclic_reprogram(c->c_cyc_id, sbt + gethrtime()); + + return (0); +} + +int +vmm_glue_callout_stop(struct callout *c) +{ + ASSERT(c->c_cyc_id != CYCLIC_NONE); + cyclic_reprogram(c->c_cyc_id, CY_INFINITY); + c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING); + + return (0); +} + +int +vmm_glue_callout_drain(struct callout *c) +{ + ASSERT(c->c_cyc_id != CYCLIC_NONE); + mutex_enter(&cpu_lock); + cyclic_remove(c->c_cyc_id); + c->c_cyc_id = CYCLIC_NONE; + c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING); + mutex_exit(&cpu_lock); + + return (0); +} + +static int +ipi_cpu_justreturn(xc_arg_t a1, xc_arg_t a2, xc_arg_t a3) +{ + return (0); +} + +void +ipi_cpu(int cpu, u_int ipi) +{ + cpuset_t set; + + CPUSET_ONLY(set, cpu); + xc_call_nowait(NULL, NULL, NULL, CPUSET2BV(set), + ipi_cpu_justreturn); +} + +#define SC_TABLESIZE 256 /* Must be power of 2. */ +#define SC_MASK (SC_TABLESIZE - 1) +#define SC_SHIFT 8 +#define SC_HASH(wc) ((((uintptr_t)(wc) >> SC_SHIFT) ^ (uintptr_t)(wc)) & \ + SC_MASK) +#define SC_LOOKUP(wc) &sleepq_chains[SC_HASH(wc)] + +struct sleepqueue { + u_int sq_blockedcnt; /* Num. of blocked threads. 
*/ + LIST_ENTRY(sleepqueue) sq_hash; /* Chain. */ + void *sq_wchan; /* Wait channel. */ + kcondvar_t sq_cv; +}; + +struct sleepqueue_chain { + LIST_HEAD(, sleepqueue) sc_queues; /* List of sleep queues. */ + struct mtx sc_lock; /* Spin lock for this chain. */ +}; + +static struct sleepqueue_chain sleepq_chains[SC_TABLESIZE]; + +#define SLEEPQ_CACHE_SZ (64) +static kmem_cache_t *vmm_sleepq_cache; + +static int +vmm_sleepq_cache_init(void *buf, void *user_arg, int kmflags) +{ + struct sleepqueue *sq = (struct sleepqueue *)buf; + + bzero(sq, sizeof (struct sleepqueue)); + cv_init(&sq->sq_cv, NULL, CV_DRIVER, NULL); + + return (0); +} + +static void +vmm_sleepq_cache_fini(void *buf, void *user_arg) +{ + struct sleepqueue *sq = (struct sleepqueue *)buf; + cv_destroy(&sq->sq_cv); +} + +static void +init_sleepqueues(void) +{ + int i; + + for (i = 0; i < SC_TABLESIZE; i++) { + LIST_INIT(&sleepq_chains[i].sc_queues); + mtx_init(&sleepq_chains[i].sc_lock, "sleepq chain", NULL, + MTX_SPIN); + } + + vmm_sleepq_cache = kmem_cache_create("vmm_sleepq_cache", + sizeof (struct sleepqueue), SLEEPQ_CACHE_SZ, vmm_sleepq_cache_init, + vmm_sleepq_cache_fini, NULL, NULL, NULL, 0); + +} + +/* + * Lock the sleep queue chain associated with the specified wait channel. + */ +static void +sleepq_lock(void *wchan) +{ + struct sleepqueue_chain *sc; + + sc = SC_LOOKUP(wchan); + mtx_lock_spin(&sc->sc_lock); +} + +/* + * Look up the sleep queue associated with a given wait channel in the hash + * table locking the associated sleep queue chain. If no queue is found in + * the table, NULL is returned. + */ +static struct sleepqueue * +sleepq_lookup(void *wchan) +{ + struct sleepqueue_chain *sc; + struct sleepqueue *sq; + + KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__)); + sc = SC_LOOKUP(wchan); + mtx_assert(&sc->sc_lock, MA_OWNED); + LIST_FOREACH(sq, &sc->sc_queues, sq_hash) + if (sq->sq_wchan == wchan) + return (sq); + return (NULL); +} + +/* + * Unlock the sleep queue chain associated with a given wait channel. + */ +static void +sleepq_release(void *wchan) +{ + struct sleepqueue_chain *sc; + + sc = SC_LOOKUP(wchan); + mtx_unlock_spin(&sc->sc_lock); +} + +struct sleepqueue * +sleepq_add(void *wchan) +{ + struct sleepqueue_chain *sc; + struct sleepqueue *sq; + + sc = SC_LOOKUP(wchan); + + /* Look up the sleep queue associated with the wait channel 'wchan'. 
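+	 * If no queue exists yet for this channel, allocate one from
+	 * vmm_sleepq_cache and insert it into the chain.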
	 */
+	sq = sleepq_lookup(wchan);
+
+	if (sq == NULL) {
+		sq = kmem_cache_alloc(vmm_sleepq_cache, KM_SLEEP);
+		LIST_INSERT_HEAD(&sc->sc_queues, sq, sq_hash);
+		sq->sq_wchan = wchan;
+	}
+
+	sq->sq_blockedcnt++;
+
+	return (sq);
+}
+
+void
+sleepq_remove(struct sleepqueue *sq)
+{
+	sq->sq_blockedcnt--;
+
+	if (sq->sq_blockedcnt == 0) {
+		LIST_REMOVE(sq, sq_hash);
+		kmem_cache_free(vmm_sleepq_cache, sq);
+	}
+}
+
+int
+msleep_spin(void *chan, struct mtx *mtx, const char *wmesg, int ticks)
+{
+	struct sleepqueue *sq;
+	int error;
+
+	sleepq_lock(chan);
+	sq = sleepq_add(chan);
+	sleepq_release(chan);
+
+	/* cv_reltimedwait() returns -1 when the timeout expires first */
+	if (cv_reltimedwait(&sq->sq_cv, &mtx->m, ticks, TR_CLOCK_TICK) == -1)
+		error = EWOULDBLOCK;
+	else
+		error = 0;
+
+	sleepq_lock(chan);
+	sleepq_remove(sq);
+	sleepq_release(chan);
+
+	return (error);
+}
+
+void
+wakeup(void *chan)
+{
+	struct sleepqueue *sq;
+
+	sleepq_lock(chan);
+	sq = sleepq_lookup(chan);
+	if (sq != NULL) {
+		cv_broadcast(&sq->sq_cv);
+	}
+	sleepq_release(chan);
+}
+
+void
+wakeup_one(void *chan)
+{
+	struct sleepqueue *sq;
+
+	sleepq_lock(chan);
+	sq = sleepq_lookup(chan);
+	if (sq != NULL) {
+		cv_signal(&sq->sq_cv);
+	}
+	sleepq_release(chan);
+}
+
+u_int	cpu_high;	/* Highest arg to CPUID */
+u_int	cpu_exthigh;	/* Highest arg to extended CPUID */
+u_int	cpu_id;		/* Stepping ID */
+char	cpu_vendor[20];	/* CPU Origin code */
+
+static void
+vmm_cpuid_init(void)
+{
+	u_int regs[4];
+
+	do_cpuid(0, regs);
+	cpu_high = regs[0];
+	((u_int *)&cpu_vendor)[0] = regs[1];
+	((u_int *)&cpu_vendor)[1] = regs[3];
+	((u_int *)&cpu_vendor)[2] = regs[2];
+	cpu_vendor[12] = '\0';
+
+	do_cpuid(1, regs);
+	cpu_id = regs[0];
+
+	do_cpuid(0x80000000, regs);
+	cpu_exthigh = regs[0];
+}
+
+struct savefpu {
+	fpu_ctx_t	fsa_fp_ctx;
+};
+
+static vmem_t *fpu_save_area_arena;
+
+static void
+fpu_save_area_init(void)
+{
+	fpu_save_area_arena = vmem_create("fpu_save_area",
+	    NULL, 0, XSAVE_AREA_ALIGN,
+	    segkmem_alloc, segkmem_free, heap_arena, 0, VM_BESTFIT | VM_SLEEP);
+}
+
+static void
+fpu_save_area_cleanup(void)
+{
+	vmem_destroy(fpu_save_area_arena);
+}
+
+struct savefpu *
+fpu_save_area_alloc(void)
+{
+	return (vmem_alloc(fpu_save_area_arena, sizeof (struct savefpu),
+	    VM_SLEEP));
+}
+
+void
+fpu_save_area_free(struct savefpu *fsa)
+{
+	vmem_free(fpu_save_area_arena, fsa, sizeof (struct savefpu));
+}
+
+void
+fpu_save_area_reset(struct savefpu *fsa)
+{
+	extern const struct fxsave_state sse_initial;
+	extern const struct xsave_state avx_initial;
+	struct fpu_ctx *fp;
+	struct fxsave_state *fx;
+	struct xsave_state *xs;
+
+	fp = &fsa->fsa_fp_ctx;
+
+	fp->fpu_regs.kfpu_status = 0;
+	fp->fpu_regs.kfpu_xstatus = 0;
+
+	switch (fp_save_mech) {
+	case FP_FXSAVE:
+		fx = &fp->fpu_regs.kfpu_u.kfpu_fx;
+		bcopy(&sse_initial, fx, sizeof (*fx));
+		break;
+	case FP_XSAVE:
+		fp->fpu_xsave_mask = (XFEATURE_ENABLED_X87 |
+		    XFEATURE_ENABLED_SSE | XFEATURE_ENABLED_AVX);
+		xs = &fp->fpu_regs.kfpu_u.kfpu_xs;
+		bcopy(&avx_initial, xs, sizeof (*xs));
+		break;
+	default:
+		panic("Invalid fp_save_mech");
+		/*NOTREACHED*/
+	}
+}
+
+void
+fpuexit(kthread_t *td)
+{
+	fp_save(&curthread->t_lwp->lwp_pcb.pcb_fpu);
+}
+
+static __inline void
+vmm_fxrstor(struct fxsave_state *addr)
+{
+	__asm __volatile("fxrstor %0" : : "m" (*(addr)));
+}
+
+static __inline void
+vmm_fxsave(struct fxsave_state *addr)
+{
+	__asm __volatile("fxsave %0" : "=m" (*(addr)));
+}
+
+static __inline void
+vmm_xrstor(struct xsave_state *addr, uint64_t mask)
+{
+	uint32_t low, hi;
+
+	low = mask;
+	hi = mask >> 32;
+	__asm __volatile("xrstor %0" : : "m" (*addr), "a" (low), "d" (hi));
+}
+
+static __inline
void +vmm_xsave(struct xsave_state *addr, uint64_t mask) +{ + uint32_t low, hi; + + low = mask; + hi = mask >> 32; + __asm __volatile("xsave %0" : "=m" (*addr) : "a" (low), "d" (hi) : + "memory"); +} + +void +fpurestore(void *arg) +{ + struct savefpu *fsa = (struct savefpu *)arg; + struct fpu_ctx *fp; + + fp = &fsa->fsa_fp_ctx; + + switch (fp_save_mech) { + case FP_FXSAVE: + vmm_fxrstor(&fp->fpu_regs.kfpu_u.kfpu_fx); + break; + case FP_XSAVE: + vmm_xrstor(&fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask); + break; + default: + panic("Invalid fp_save_mech"); + /*NOTREACHED*/ + } +} + +void +fpusave(void *arg) +{ + struct savefpu *fsa = (struct savefpu *)arg; + struct fpu_ctx *fp; + + fp = &fsa->fsa_fp_ctx; + + switch (fp_save_mech) { + case FP_FXSAVE: + vmm_fxsave(&fp->fpu_regs.kfpu_u.kfpu_fx); + break; + case FP_XSAVE: + vmm_xsave(&fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask); + break; + default: + panic("Invalid fp_save_mech"); + /*NOTREACHED*/ + } +} + +void +vmm_sol_glue_init(void) +{ + vmm_cpuid_init(); + fpu_save_area_init(); + init_sleepqueues(); +} + +void +vmm_sol_glue_cleanup(void) +{ + fpu_save_area_cleanup(); + kmem_cache_destroy(vmm_sleepq_cache); +} diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_mem.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_mem.c new file mode 100644 index 0000000000..3bb5412d16 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_mem.c @@ -0,0 +1,111 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/vmm_mem.c 245678 2013-01-20 03:42:49Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2013 Pluribus Networks Inc. 
+ */ + +#include +__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_mem.c 245678 2013-01-20 03:42:49Z neel $"); + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include "vmm_util.h" +#include "vmm_mem.h" + +int +vmm_mem_init(void) +{ + return (0); +} + +vm_paddr_t +vmm_mem_alloc(size_t size) +{ + clock_t usec = 2 * 1000000; + vm_paddr_t pa; + caddr_t addr; + + if (size != PAGE_SIZE) + panic("vmm_mem_alloc: invalid allocation size %lu", size); + + while (usec > 0) { + if ((addr = kmem_zalloc(PAGE_SIZE, KM_NOSLEEP)) != NULL) { + ASSERT(((uintptr_t)addr & PAGE_MASK) == 0); + pa = vtophys((vm_offset_t)addr); + return (pa); + } + delay(drv_usectohz((clock_t)500000)); + usec -= 500000; + } + + return (NULL); +} + +void +vmm_mem_free(vm_paddr_t base, size_t length) +{ + page_t *pp; + + if (base & PAGE_MASK) { + panic("vmm_mem_free: base 0x%0lx must be aligned on a " + "0x%0x boundary\n", base, PAGE_SIZE); + } + + if (length != PAGE_SIZE) { + panic("vmm_mem_free: invalid length %lu", length); + } + + pp = page_numtopp_nolock(btop(base)); + kmem_free((void *)pp->p_offset, PAGE_SIZE); +} + +vm_paddr_t +vmm_mem_maxaddr(void) +{ + + return (ptob(physmax + 1)); +} diff --git a/usr/src/uts/i86pc/io/vmm/vmm_stat.h b/usr/src/uts/i86pc/io/vmm/vmm_stat.h new file mode 100644 index 0000000000..9bf7a60e0b --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_stat.h @@ -0,0 +1,127 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
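For context, vmm_mem_alloc() above is strictly page-granular and polls for up to two seconds before giving up, so callers must treat a return of 0 as failure. A minimal, hypothetical caller (the function name is illustrative, not part of this change):

	static int
	guest_page_example(vm_paddr_t *pap)
	{
		vm_paddr_t pa;

		/* vmm_mem_alloc() serves one page and returns 0 on timeout */
		pa = vmm_mem_alloc(PAGE_SIZE);
		if (pa == 0)
			return (ENOMEM);

		*pap = pa;
		return (0);
	}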
+ * + * $FreeBSD: head/sys/amd64/vmm/vmm_stat.h 250427 2013-05-10 02:59:49Z neel $ + */ + +#ifndef _VMM_STAT_H_ +#define _VMM_STAT_H_ + +struct vm; + +#define MAX_VMM_STAT_ELEMS 64 /* arbitrary */ + +enum vmm_stat_scope { + VMM_STAT_SCOPE_ANY, + VMM_STAT_SCOPE_INTEL, /* Intel VMX specific statistic */ + VMM_STAT_SCOPE_AMD, /* AMD SVM specific statistic */ +}; + +struct vmm_stat_type { + int index; /* position in the stats buffer */ + int nelems; /* standalone or array */ + const char *desc; /* description of statistic */ + enum vmm_stat_scope scope; +}; + +void vmm_stat_init(void *arg); + +#define VMM_STAT_DEFINE(type, nelems, desc, scope) \ + struct vmm_stat_type type[1] = { \ + { -1, nelems, desc, scope } \ + }; \ + SYSINIT(type##_stat, SI_SUB_KLD, SI_ORDER_ANY, vmm_stat_init, type) + +#define VMM_STAT_DECLARE(type) \ + extern struct vmm_stat_type type[1] + +#define VMM_STAT(type, desc) \ + VMM_STAT_DEFINE(type, 1, desc, VMM_STAT_SCOPE_ANY) +#define VMM_STAT_INTEL(type, desc) \ + VMM_STAT_DEFINE(type, 1, desc, VMM_STAT_SCOPE_INTEL) +#define VMM_STAT_AMD(type, desc) \ + VMM_STAT_DEFINE(type, 1, desc, VMM_STAT_SCOPE_AMD) + +#define VMM_STAT_ARRAY(type, nelems, desc) \ + VMM_STAT_DEFINE(type, nelems, desc, VMM_STAT_SCOPE_ANY) + +void *vmm_stat_alloc(void); +void vmm_stat_free(void *vp); + +/* + * 'buf' should be at least fit 'MAX_VMM_STAT_TYPES' entries + */ +int vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf); +int vmm_stat_desc_copy(int index, char *buf, int buflen); + +static void __inline +vmm_stat_array_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, + int statidx, uint64_t x) +{ +#ifdef VMM_KEEP_STATS + uint64_t *stats; + + stats = vcpu_stats(vm, vcpu); + + if (vst->index >= 0 && statidx < vst->nelems) + stats[vst->index + statidx] += x; +#endif +} + + +static void __inline +vmm_stat_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t x) +{ + +#ifdef VMM_KEEP_STATS + vmm_stat_array_incr(vm, vcpu, vst, 0, x); +#endif +} + +VMM_STAT_DECLARE(VCPU_MIGRATIONS); +VMM_STAT_DECLARE(VMEXIT_COUNT); +VMM_STAT_DECLARE(VMEXIT_EXTINT); +VMM_STAT_DECLARE(VMEXIT_HLT); +VMM_STAT_DECLARE(VMEXIT_CR_ACCESS); +VMM_STAT_DECLARE(VMEXIT_RDMSR); +VMM_STAT_DECLARE(VMEXIT_WRMSR); +VMM_STAT_DECLARE(VMEXIT_MTRAP); +VMM_STAT_DECLARE(VMEXIT_PAUSE); +VMM_STAT_DECLARE(VMEXIT_INTR_WINDOW); +VMM_STAT_DECLARE(VMEXIT_NMI_WINDOW); +VMM_STAT_DECLARE(VMEXIT_INOUT); +VMM_STAT_DECLARE(VMEXIT_CPUID); +VMM_STAT_DECLARE(VMEXIT_NESTED_FAULT); +VMM_STAT_DECLARE(VMEXIT_INST_EMUL); +VMM_STAT_DECLARE(VMEXIT_UNKNOWN); +VMM_STAT_DECLARE(VMEXIT_ASTPENDING); +VMM_STAT_DECLARE(VMEXIT_USERSPACE); +VMM_STAT_DECLARE(VMEXIT_RENDEZVOUS); +VMM_STAT_DECLARE(VMEXIT_USERSPACE); +VMM_STAT_DECLARE(VMEXIT_EXCEPTION); +#endif diff --git a/usr/src/uts/i86pc/io/vmm/vmm_util.c b/usr/src/uts/i86pc/io/vmm/vmm_util.c new file mode 100644 index 0000000000..fabd42e13c --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_util.c @@ -0,0 +1,125 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
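As an illustration of the stat machinery declared above, a hedged sketch of how a backend might define and bump a counter; the stat name and handler are hypothetical, the SYSINIT hook is whatever shim this port supplies, and vmm_stat_incr() compiles to a no-op unless VMM_KEEP_STATS is defined:

	VMM_STAT(VMEXIT_EXAMPLE, "hypothetical example exit");

	static void
	example_exit_handler(struct vm *vm, int vcpuid)
	{
		/* counted only when built with -DVMM_KEEP_STATS */
		vmm_stat_incr(vm, vcpuid, VMEXIT_EXAMPLE, 1);
	}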
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/vmm_util.c 245678 2013-01-20 03:42:49Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2013 Pluribus Networks Inc. + */ + +#include +__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_util.c 245678 2013-01-20 03:42:49Z neel $"); + +#include +#include + +#include + +#include "vmm_util.h" + +boolean_t +vmm_is_intel(void) +{ + + if (strcmp(cpu_vendor, "GenuineIntel") == 0) + return (TRUE); + else + return (FALSE); +} + +boolean_t +vmm_is_amd(void) +{ + if (strcmp(cpu_vendor, "AuthenticAMD") == 0) + return (TRUE); + else + return (FALSE); +} + +boolean_t +vmm_supports_1G_pages(void) +{ + unsigned int regs[4]; + + /* + * CPUID.80000001:EDX[bit 26] = 1 indicates support for 1GB pages + * + * Both Intel and AMD support this bit. + */ + if (cpu_exthigh >= 0x80000001) { + do_cpuid(0x80000001, regs); + if (regs[3] & (1 << 26)) + return (TRUE); + } + return (FALSE); +} + +#ifdef __FreeBSD__ +#include +#include +#define DUMP_REG(x) printf(#x "\t\t0x%016lx\n", (long)(tf->tf_ ## x)) +#define DUMP_SEG(x) printf(#x "\t\t0x%04x\n", (unsigned)(tf->tf_ ## x)) +void +dump_trapframe(struct trapframe *tf) +{ + DUMP_REG(rdi); + DUMP_REG(rsi); + DUMP_REG(rdx); + DUMP_REG(rcx); + DUMP_REG(r8); + DUMP_REG(r9); + DUMP_REG(rax); + DUMP_REG(rbx); + DUMP_REG(rbp); + DUMP_REG(r10); + DUMP_REG(r11); + DUMP_REG(r12); + DUMP_REG(r13); + DUMP_REG(r14); + DUMP_REG(r15); + DUMP_REG(trapno); + DUMP_REG(addr); + DUMP_REG(flags); + DUMP_REG(err); + DUMP_REG(rip); + DUMP_REG(rflags); + DUMP_REG(rsp); + DUMP_SEG(cs); + DUMP_SEG(ss); + DUMP_SEG(fs); + DUMP_SEG(gs); + DUMP_SEG(es); + DUMP_SEG(ds); +} +#endif diff --git a/usr/src/uts/i86pc/io/vmm/vmm_util.h b/usr/src/uts/i86pc/io/vmm/vmm_util.h new file mode 100644 index 0000000000..fe1c1c9449 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_util.h @@ -0,0 +1,40 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/vmm_util.h 245678 2013-01-20 03:42:49Z neel $ + */ + +#ifndef _VMM_UTIL_H_ +#define _VMM_UTIL_H_ + +struct trapframe; + +boolean_t vmm_is_intel(void); +boolean_t vmm_is_amd(void); +boolean_t vmm_supports_1G_pages(void); + +void dump_trapframe(struct trapframe *tf); + +#endif diff --git a/usr/src/uts/i86pc/io/vmm/vmx_assym.s b/usr/src/uts/i86pc/io/vmm/vmx_assym.s new file mode 100644 index 0000000000..d84ca30275 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmx_assym.s @@ -0,0 +1 @@ +#include "vmx_assym.h" diff --git a/usr/src/uts/i86pc/io/vmm/x86.c b/usr/src/uts/i86pc/io/vmm/x86.c new file mode 100644 index 0000000000..02222ef5e7 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/x86.c @@ -0,0 +1,276 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/vmm/x86.c 255645 2013-09-17 17:56:53Z grehan $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2014 Pluribus Networks Inc. + */ + +#include +__FBSDID("$FreeBSD: head/sys/amd64/vmm/x86.c 255645 2013-09-17 17:56:53Z grehan $"); + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include "x86.h" + +#define CPUID_VM_HIGH 0x40000000 + +static const char bhyve_id[12] = "bhyve bhyve "; + +static uint64_t bhyve_xcpuids; + +int +x86_emulate_cpuid(struct vm *vm, int vcpu_id, + uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) +{ + int error; + unsigned int func, regs[4]; + enum x2apic_state x2apic_state; + + /* + * Requests for invalid CPUID levels should map to the highest + * available level instead. + */ + if (cpu_exthigh != 0 && *eax >= 0x80000000) { + if (*eax > cpu_exthigh) + *eax = cpu_exthigh; + } else if (*eax >= 0x40000000) { + if (*eax > CPUID_VM_HIGH) + *eax = CPUID_VM_HIGH; + } else if (*eax > cpu_high) { + *eax = cpu_high; + } + + func = *eax; + + /* + * In general the approach used for CPU topology is to + * advertise a flat topology where all CPUs are packages with + * no multi-core or SMT. + */ + switch (func) { + /* + * Pass these through to the guest + */ + case CPUID_0000_0000: + case CPUID_0000_0002: + case CPUID_0000_0003: + case CPUID_8000_0000: + case CPUID_8000_0002: + case CPUID_8000_0003: + case CPUID_8000_0004: + case CPUID_8000_0006: + case CPUID_8000_0008: + cpuid_count(*eax, *ecx, regs); + break; + + case CPUID_8000_0001: + /* + * Hide rdtscp/ia32_tsc_aux until we know how + * to deal with them. + */ + cpuid_count(*eax, *ecx, regs); + regs[3] &= ~AMDID_RDTSCP; + break; + + case CPUID_8000_0007: + cpuid_count(*eax, *ecx, regs); +#ifdef __FreeBSD__ + /* + * If the host TSCs are not synchronized across + * physical cpus then we cannot advertise an + * invariant tsc to a vcpu. + * + * XXX This still falls short because the vcpu + * can observe the TSC moving backwards as it + * migrates across physical cpus. But at least + * it should discourage the guest from using the + * TSC to keep track of time. + */ + if (!smp_tsc) + regs[3] &= ~AMDPM_TSC_INVARIANT; +#endif + break; + + case CPUID_0000_0001: + do_cpuid(1, regs); + + error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state); + if (error) { + panic("x86_emulate_cpuid: error %d " + "fetching x2apic state", error); + } + + /* + * Override the APIC ID only in ebx + */ + regs[1] &= ~(CPUID_LOCAL_APIC_ID); + regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT); + + /* + * Don't expose VMX, SpeedStep or TME capability. + * Advertise x2APIC capability and Hypervisor guest. + */ + regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2); + + regs[2] |= CPUID2_HV; + + if (x2apic_state != X2APIC_DISABLED) + regs[2] |= CPUID2_X2APIC; + + /* + * Hide xsave/osxsave/avx until the FPU save/restore + * issues are resolved + */ + regs[2] &= ~(CPUID2_XSAVE | CPUID2_OSXSAVE | + CPUID2_AVX); + + /* + * Hide monitor/mwait until we know how to deal with + * these instructions. + */ + regs[2] &= ~CPUID2_MON; + + /* + * Hide the performance and debug features. + */ + regs[2] &= ~CPUID2_PDCM; + + /* + * No TSC deadline support in the APIC yet + */ + regs[2] &= ~CPUID2_TSCDLT; + + /* + * Hide thermal monitoring + */ + regs[3] &= ~(CPUID_ACPI | CPUID_TM); + + /* + * Machine check handling is done in the host. + */ + regs[3] &= ~(CPUID_MCA | CPUID_MCE); + + /* + * Hide the debug store capability. + */ + regs[3] &= ~CPUID_DS; + + /* + * Disable multi-core. 
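+		 * Clearing CPUID_HTT and the HTT core-count field below
+		 * advertises a single hardware thread per package,
+		 * matching the flat topology described above.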
+ */ + regs[1] &= ~CPUID_HTT_CORES; + regs[3] &= ~CPUID_HTT; + break; + + case CPUID_0000_0004: + do_cpuid(4, regs); + + /* + * Do not expose topology. + */ + regs[0] &= 0xffff8000; + /* + * The maximum number of processor cores in + * this physical processor package and the + * maximum number of threads sharing this + * cache are encoded with "plus 1" encoding. + * Adding one to the value in this register + * field to obtains the actual value. + * + * Therefore 0 for both indicates 1 core + * per package and no cache sharing. + */ + break; + + case CPUID_0000_0006: + case CPUID_0000_0007: + case CPUID_0000_000A: + case CPUID_0000_000D: + /* + * Handle the access, but report 0 for + * all options + */ + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + + case CPUID_0000_000B: + /* + * Processor topology enumeration + */ + regs[0] = 0; + regs[1] = 0; + regs[2] = *ecx & 0xff; + regs[3] = vcpu_id; + break; + + case 0x40000000: + regs[0] = CPUID_VM_HIGH; + bcopy(bhyve_id, ®s[1], 4); + bcopy(bhyve_id + 4, ®s[2], 4); + bcopy(bhyve_id + 8, ®s[3], 4); + break; + + default: + /* + * The leaf value has already been clamped so + * simply pass this through, keeping count of + * how many unhandled leaf values have been seen. + */ + atomic_add_long(&bhyve_xcpuids, 1); + cpuid_count(*eax, *ecx, regs); + break; + } + + *eax = regs[0]; + *ebx = regs[1]; + *ecx = regs[2]; + *edx = regs[3]; + + return (1); +} diff --git a/usr/src/uts/i86pc/io/vmm/x86.h b/usr/src/uts/i86pc/io/vmm/x86.h new file mode 100644 index 0000000000..db2340b37b --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/x86.h @@ -0,0 +1,65 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
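For illustration, a guest can recognize this hypervisor by reading back the signature that the 0x40000000 leaf above returns in EBX/ECX/EDX; a sketch only, with the function name and kernel context assumed:

	static int
	guest_on_bhyve(void)
	{
		unsigned int regs[4];
		char sig[13];

		do_cpuid(0x40000000, regs);
		bcopy(&regs[1], &sig[0], 4);
		bcopy(&regs[2], &sig[4], 4);
		bcopy(&regs[3], &sig[8], 4);
		sig[12] = '\0';

		return (strncmp(sig, "bhyve bhyve ", 12) == 0);
	}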
+ * + * $FreeBSD: head/sys/amd64/vmm/x86.h 255287 2013-09-06 05:16:10Z grehan $ + */ + +#ifndef _X86_H_ +#define _X86_H_ + +#define CPUID_0000_0000 (0x0) +#define CPUID_0000_0001 (0x1) +#define CPUID_0000_0002 (0x2) +#define CPUID_0000_0003 (0x3) +#define CPUID_0000_0004 (0x4) +#define CPUID_0000_0006 (0x6) +#define CPUID_0000_0007 (0x7) +#define CPUID_0000_000A (0xA) +#define CPUID_0000_000B (0xB) +#define CPUID_0000_000D (0xD) +#define CPUID_8000_0000 (0x80000000) +#define CPUID_8000_0001 (0x80000001) +#define CPUID_8000_0002 (0x80000002) +#define CPUID_8000_0003 (0x80000003) +#define CPUID_8000_0004 (0x80000004) +#define CPUID_8000_0006 (0x80000006) +#define CPUID_8000_0007 (0x80000007) +#define CPUID_8000_0008 (0x80000008) + +/* + * CPUID instruction Fn0000_0001: + */ +#define CPUID_0000_0001_APICID_MASK (0xff<<24) +#define CPUID_0000_0001_APICID_SHIFT 24 + +/* + * CPUID instruction Fn0000_0001 ECX + */ +#define CPUID_0000_0001_FEAT0_VMX (1<<5) + +int x86_emulate_cpuid(struct vm *vm, int vcpu_id, uint32_t *eax, uint32_t *ebx, + uint32_t *ecx, uint32_t *edx); + +#endif diff --git a/usr/src/uts/i86pc/sys/viona_io.h b/usr/src/uts/i86pc/sys/viona_io.h new file mode 100644 index 0000000000..a4fb0f2527 --- /dev/null +++ b/usr/src/uts/i86pc/sys/viona_io.h @@ -0,0 +1,45 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2013 Pluribus Networks Inc. + */ + +#ifndef _VIONA_IO_H_ +#define _VIONA_IO_H_ + +#define VNA_IOC (('V' << 16)|('C' << 8)) +#define VNA_IOC_CREATE (VNA_IOC | 1) +#define VNA_IOC_DELETE (VNA_IOC | 2) +#define VNA_IOC_RX_RING_INIT (VNA_IOC | 3) +#define VNA_IOC_TX_RING_INIT (VNA_IOC | 4) +#define VNA_IOC_RX_RING_RESET (VNA_IOC | 5) +#define VNA_IOC_TX_RING_RESET (VNA_IOC | 6) +#define VNA_IOC_RX_RING_KICK (VNA_IOC | 7) +#define VNA_IOC_TX_RING_KICK (VNA_IOC | 8) +#define VNA_IOC_RX_INTR_CLR (VNA_IOC | 9) +#define VNA_IOC_TX_INTR_CLR (VNA_IOC | 10) +#define VNA_IOC_SET_FEATURES (VNA_IOC | 11) +#define VNA_IOC_GET_FEATURES (VNA_IOC | 12) + +typedef struct vioc_create { + datalink_id_t c_linkid; + char c_vmname[64]; + size_t c_lomem_size; + size_t c_himem_size; +} vioc_create_t; + +typedef struct vioc_ring_init { + uint16_t ri_qsize; + uint64_t ri_qaddr; +} vioc_ring_init_t; + +#endif /* _VIONA_IO_H_ */ diff --git a/usr/src/uts/i86pc/sys/vmm.h b/usr/src/uts/i86pc/sys/vmm.h new file mode 100644 index 0000000000..e876ce748f --- /dev/null +++ b/usr/src/uts/i86pc/sys/vmm.h @@ -0,0 +1,565 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/include/vmm.h 273375 2014-10-21 07:10:43Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + */ + +#ifndef _VMM_H_ +#define _VMM_H_ + +#include + +enum vm_suspend_how { + VM_SUSPEND_NONE, + VM_SUSPEND_RESET, + VM_SUSPEND_POWEROFF, + VM_SUSPEND_HALT, + VM_SUSPEND_LAST +}; + +/* + * Identifiers for architecturally defined registers. + */ +enum vm_reg_name { + VM_REG_GUEST_RAX, + VM_REG_GUEST_RBX, + VM_REG_GUEST_RCX, + VM_REG_GUEST_RDX, + VM_REG_GUEST_RSI, + VM_REG_GUEST_RDI, + VM_REG_GUEST_RBP, + VM_REG_GUEST_R8, + VM_REG_GUEST_R9, + VM_REG_GUEST_R10, + VM_REG_GUEST_R11, + VM_REG_GUEST_R12, + VM_REG_GUEST_R13, + VM_REG_GUEST_R14, + VM_REG_GUEST_R15, + VM_REG_GUEST_CR0, + VM_REG_GUEST_CR3, + VM_REG_GUEST_CR4, + VM_REG_GUEST_DR7, + VM_REG_GUEST_RSP, + VM_REG_GUEST_RIP, + VM_REG_GUEST_RFLAGS, + VM_REG_GUEST_ES, + VM_REG_GUEST_CS, + VM_REG_GUEST_SS, + VM_REG_GUEST_DS, + VM_REG_GUEST_FS, + VM_REG_GUEST_GS, + VM_REG_GUEST_LDTR, + VM_REG_GUEST_TR, + VM_REG_GUEST_IDTR, + VM_REG_GUEST_GDTR, + VM_REG_GUEST_EFER, + VM_REG_GUEST_CR2, + VM_REG_LAST +}; + +enum x2apic_state { + X2APIC_DISABLED, + X2APIC_ENABLED, + X2APIC_STATE_LAST +}; + +#define VM_INTINFO_VECTOR(info) ((info) & 0xff) +#define VM_INTINFO_DEL_ERRCODE 0x800 +#define VM_INTINFO_RSVD 0x7ffff000 +#define VM_INTINFO_VALID 0x80000000 +#define VM_INTINFO_TYPE 0x700 +#define VM_INTINFO_HWINTR (0 << 8) +#define VM_INTINFO_NMI (2 << 8) +#define VM_INTINFO_HWEXCEPTION (3 << 8) +#define VM_INTINFO_SWINTR (4 << 8) + +#define VM_MAX_NAMELEN 32 + +#ifdef _KERNEL + +struct vm; +struct vm_exception; +struct vm_memory_segment; +struct seg_desc; +struct vm_exit; +struct vm_run; +struct vhpet; +struct vioapic; +struct vlapic; +struct vm_guest_paging; + +typedef int (*vmm_init_func_t)(void); +typedef int (*vmm_cleanup_func_t)(void); +typedef void * (*vmi_init_func_t)(struct vm *vm); /* instance specific apis */ +typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip); +typedef void (*vmi_cleanup_func_t)(void *vmi); +typedef int (*vmi_mmap_set_func_t)(void *vmi, vm_paddr_t gpa, + vm_paddr_t hpa, size_t length, + vm_memattr_t attr, int prot, + boolean_t superpages_ok); +typedef vm_paddr_t (*vmi_mmap_get_func_t)(void *vmi, vm_paddr_t gpa); +typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num, + uint64_t *retval); +typedef int 
(*vmi_set_register_t)(void *vmi, int vcpu, int num, + uint64_t val); +typedef int (*vmi_get_desc_t)(void *vmi, int vcpu, int num, + struct seg_desc *desc); +typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num, + struct seg_desc *desc); +typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval); +typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val); +typedef struct vlapic * (*vmi_vlapic_init)(void *vmi, int vcpu); +typedef void (*vmi_vlapic_cleanup)(void *vmi, struct vlapic *vlapic); + +struct vmm_ops { + vmm_init_func_t init; /* module wide initialization */ + vmm_cleanup_func_t cleanup; + + vmi_init_func_t vminit; /* vm-specific initialization */ + vmi_run_func_t vmrun; + vmi_cleanup_func_t vmcleanup; + vmi_mmap_set_func_t vmmmap_set; + vmi_mmap_get_func_t vmmmap_get; + vmi_get_register_t vmgetreg; + vmi_set_register_t vmsetreg; + vmi_get_desc_t vmgetdesc; + vmi_set_desc_t vmsetdesc; + vmi_get_cap_t vmgetcap; + vmi_set_cap_t vmsetcap; + vmi_vlapic_init vlapic_init; + vmi_vlapic_cleanup vlapic_cleanup; +}; + +extern struct vmm_ops vmm_ops_intel; +extern struct vmm_ops vmm_ops_amd; + +int vm_create(const char *name, struct vm **retvm); +void vm_destroy(struct vm *vm); +const char *vm_name(struct vm *vm); +int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len); +#ifdef __FreeBSD__ +int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); +#endif +int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len); +#ifndef __FreeBSD__ +vm_paddr_t vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t size); +#endif +void *vm_gpa_hold(struct vm *, vm_paddr_t gpa, size_t len, int prot, + void **cookie); +void vm_gpa_release(void *cookie); +int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase, + struct vm_memory_segment *seg); +int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval); +int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val); +int vm_get_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *ret_desc); +int vm_set_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *desc); +int vm_run(struct vm *vm, struct vm_run *vmrun); +int vm_inject_nmi(struct vm *vm, int vcpu); +int vm_nmi_pending(struct vm *vm, int vcpuid); +void vm_nmi_clear(struct vm *vm, int vcpuid); +int vm_inject_extint(struct vm *vm, int vcpu); +int vm_extint_pending(struct vm *vm, int vcpuid); +void vm_extint_clear(struct vm *vm, int vcpuid); +struct vlapic *vm_lapic(struct vm *vm, int cpu); +struct vioapic *vm_ioapic(struct vm *vm); +struct vhpet *vm_hpet(struct vm *vm); +int vm_get_capability(struct vm *vm, int vcpu, int type, int *val); +int vm_set_capability(struct vm *vm, int vcpu, int type, int val); +int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state); +int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state); +int vm_apicid2vcpuid(struct vm *vm, int apicid); +int vm_activate_cpu(struct vm *vm, int vcpu); +cpuset_t vm_active_cpus(struct vm *vm); +struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid); + +typedef void (*vm_rendezvous_func_t)(struct vm *vm, int vcpuid, void *arg); + +/* + * Return 1 if device indicated by bus/slot/func is supposed to be a + * pci passthrough device. + * + * Return 0 otherwise. 
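+ *
+ * A hypothetical caller sketch (for illustration only; the context
+ * is assumed): a host driver attach path could skip devices that
+ * are reserved for passthrough with
+ *
+ *	if (vmm_is_pptdev(bus, slot, func))
+ *		return (ENXIO);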
+ */ +int vmm_is_pptdev(int bus, int slot, int func); + +void *vm_iommu_domain(struct vm *vm); + +enum vcpu_state { + VCPU_IDLE, + VCPU_FROZEN, + VCPU_RUNNING, + VCPU_SLEEPING, +}; + +int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state, + bool from_idle); +enum vcpu_state vcpu_get_state(struct vm *vm, int vcpu, int *hostcpu); + +static int __inline +vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu) +{ + return (vcpu_get_state(vm, vcpu, hostcpu) == VCPU_RUNNING); +} + +void *vcpu_stats(struct vm *vm, int vcpu); +void vm_interrupt_hostcpu(struct vm *vm, int vcpu); +void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr); +struct vatpic *vm_atpic(struct vm *vm); +struct vatpit *vm_atpit(struct vm *vm); + +/* + * Inject exception 'vme' into the guest vcpu. This function returns 0 on + * success and non-zero on failure. + * + * Wrapper functions like 'vm_inject_gp()' should be preferred to calling + * this function directly because they enforce the trap-like or fault-like + * behavior of an exception. + * + * This function should only be called in the context of the thread that is + * executing this vcpu. + */ +int vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *vme); + +/* + * This function is called after a VM-exit that occurred during exception or + * interrupt delivery through the IDT. The format of 'intinfo' is described + * in Figure 15-1, "EXITINTINFO for All Intercepts", APM, Vol 2. + * + * If a VM-exit handler completes the event delivery successfully then it + * should call vm_exit_intinfo() to extinguish the pending event. For e.g., + * if the task switch emulation is triggered via a task gate then it should + * call this function with 'intinfo=0' to indicate that the external event + * is not pending anymore. + * + * Return value is 0 on success and non-zero on failure. + */ +int vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t intinfo); + +/* + * This function is called before every VM-entry to retrieve a pending + * event that should be injected into the guest. This function combines + * nested events into a double or triple fault. + * + * Returns 0 if there are no events that need to be injected into the guest + * and non-zero otherwise. + */ +int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *info); + +int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2); + +enum vm_reg_name vm_segment_name(int seg_encoding); + +struct vm_copyinfo { + uint64_t gpa; + size_t len; + void *hva; + void *cookie; +}; + +/* + * Set up 'copyinfo[]' to copy to/from guest linear address space starting + * at 'gla' and 'len' bytes long. The 'prot' should be set to PROT_READ for + * a copyin or PROT_WRITE for a copyout. + * + * Returns 0 on success. + * Returns 1 if an exception was injected into the guest. + * Returns -1 otherwise. + * + * The 'copyinfo[]' can be passed to 'vm_copyin()' or 'vm_copyout()' only if + * the return value is 0. The 'copyinfo[]' resources should be freed by calling + * 'vm_copy_teardown()' after the copy is done. 
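+ *
+ * A minimal usage sketch, assuming 'vm', 'vcpuid', 'paging', 'gla',
+ * 'len' and a kernel buffer 'buf' are in hand:
+ *
+ *	struct vm_copyinfo copyinfo[2];
+ *
+ *	if (vm_copy_setup(vm, vcpuid, paging, gla, len, PROT_READ,
+ *	    copyinfo, 2) == 0) {
+ *		vm_copyin(vm, vcpuid, copyinfo, buf, len);
+ *		vm_copy_teardown(vm, vcpuid, copyinfo, 2);
+ *	}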
+ */ +int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, + int num_copyinfo); +void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, + int num_copyinfo); +void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, + void *kaddr, size_t len); +void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, + struct vm_copyinfo *copyinfo, size_t len); +#endif /* KERNEL */ + +#define VM_MAXCPU 16 /* maximum virtual cpus */ + +/* + * Identifiers for optional vmm capabilities + */ +enum vm_cap_type { + VM_CAP_HALT_EXIT, + VM_CAP_MTRAP_EXIT, + VM_CAP_PAUSE_EXIT, + VM_CAP_UNRESTRICTED_GUEST, + VM_CAP_ENABLE_INVPCID, + VM_CAP_MAX +}; + +enum vm_intr_trigger { + EDGE_TRIGGER, + LEVEL_TRIGGER +}; + +/* + * The 'access' field has the format specified in Table 21-2 of the Intel + * Architecture Manual vol 3b. + * + * XXX The contents of the 'access' field are architecturally defined except + * bit 16 - Segment Unusable. + */ +struct seg_desc { + uint64_t base; + uint32_t limit; + uint32_t access; +}; + +#define SEG_DESC_TYPE(access) ((access) & 0x001f) +#define SEG_DESC_DPL(access) (((access) >> 5) & 0x3) +#define SEG_DESC_PRESENT(access) (((access) & 0x0080) ? 1 : 0) +#define SEG_DESC_DEF32(access) (((access) & 0x4000) ? 1 : 0) +#define SEG_DESC_GRANULARITY(access) (((access) & 0x8000) ? 1 : 0) +#define SEG_DESC_UNUSABLE(access) (((access) & 0x10000) ? 1 : 0) + +enum vm_cpu_mode { + CPU_MODE_REAL, + CPU_MODE_PROTECTED, + CPU_MODE_COMPATIBILITY, /* IA-32E mode (CS.L = 0) */ + CPU_MODE_64BIT, /* IA-32E mode (CS.L = 1) */ +}; + +enum vm_paging_mode { + PAGING_MODE_FLAT, + PAGING_MODE_32, + PAGING_MODE_PAE, + PAGING_MODE_64, +}; + +struct vm_guest_paging { + uint64_t cr3; + int cpl; + enum vm_cpu_mode cpu_mode; + enum vm_paging_mode paging_mode; +}; + +/* + * The data structures 'vie' and 'vie_op' are meant to be opaque to the + * consumers of instruction decoding. The only reason why their contents + * need to be exposed is because they are part of the 'vm_exit' structure. + */ +struct vie_op { + uint8_t op_byte; /* actual opcode byte */ + uint8_t op_type; /* type of operation (e.g. 
MOV) */ + uint16_t op_flags; +}; + +#define VIE_INST_SIZE 15 +struct vie { + uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */ + uint8_t num_valid; /* size of the instruction */ + uint8_t num_processed; + + uint8_t addrsize:4, opsize:4; /* address and operand sizes */ + uint8_t rex_w:1, /* REX prefix */ + rex_r:1, + rex_x:1, + rex_b:1, + rex_present:1, + repz_present:1, /* REP/REPE/REPZ prefix */ + repnz_present:1, /* REPNE/REPNZ prefix */ + opsize_override:1, /* Operand size override */ + addrsize_override:1, /* Address size override */ + segment_override:1; /* Segment override */ + + uint8_t mod:2, /* ModRM byte */ + reg:4, + rm:4; + + uint8_t ss:2, /* SIB byte */ + index:4, + base:4; + + uint8_t disp_bytes; + uint8_t imm_bytes; + + uint8_t scale; + int base_register; /* VM_REG_GUEST_xyz */ + int index_register; /* VM_REG_GUEST_xyz */ + int segment_register; /* VM_REG_GUEST_xyz */ + + int64_t displacement; /* optional addr displacement */ + int64_t immediate; /* optional immediate operand */ + + uint8_t decoded; /* set to 1 if successfully decoded */ + + struct vie_op op; /* opcode description */ +}; + +enum vm_exitcode { + VM_EXITCODE_INOUT, + VM_EXITCODE_VMX, + VM_EXITCODE_BOGUS, + VM_EXITCODE_RDMSR, + VM_EXITCODE_WRMSR, + VM_EXITCODE_HLT, + VM_EXITCODE_MTRAP, + VM_EXITCODE_PAUSE, + VM_EXITCODE_PAGING, + VM_EXITCODE_INST_EMUL, + VM_EXITCODE_SPINUP_AP, + VM_EXITCODE_DEPRECATED1, /* used to be SPINDOWN_CPU */ + VM_EXITCODE_INOUT_STR, + VM_EXITCODE_MAX +}; + +struct vm_inout { + uint16_t bytes:3; /* 1 or 2 or 4 */ + uint16_t in:1; + uint16_t string:1; + uint16_t rep:1; + uint16_t port; + uint32_t eax; /* valid for out */ +}; + +struct vm_inout_str { + struct vm_inout inout; /* must be the first element */ + struct vm_guest_paging paging; + uint64_t rflags; + uint64_t cr0; + uint64_t index; + uint64_t count; /* rep=1 (%rcx), rep=0 (1) */ + int addrsize; + enum vm_reg_name seg_name; + struct seg_desc seg_desc; +}; + +struct vm_exit { + enum vm_exitcode exitcode; + int inst_length; /* 0 means unknown */ + uint64_t rip; + union { + struct vm_inout inout; + struct vm_inout_str inout_str; + struct { + uint64_t gpa; + int fault_type; + } paging; + struct { + uint64_t gpa; + uint64_t gla; + uint64_t cs_base; + int cs_d; /* CS.D */ + struct vm_guest_paging paging; + struct vie vie; + } inst_emul; + /* + * VMX specific payload. Used when there is no "better" + * exitcode to represent the VM-exit. + */ + struct { + int status; /* vmx inst status */ + /* + * 'exit_reason' and 'exit_qualification' are valid + * only if 'status' is zero. + */ + uint32_t exit_reason; + uint64_t exit_qualification; + /* + * 'inst_error' and 'inst_type' are valid + * only if 'status' is non-zero. 
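+		 *
+		 * A consumer sketch, assuming 'vme' points at the
+		 * enclosing struct vm_exit:
+		 *
+		 *	if (vme->u.vmx.status == 0)
+		 *		reason = vme->u.vmx.exit_reason;
+		 *	else
+		 *		error = vme->u.vmx.inst_error;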
+ */ + int inst_type; + int inst_error; + } vmx; + struct { + uint32_t code; /* ecx value */ + uint64_t wval; + } msr; + struct { + int vcpu; + uint64_t rip; + } spinup_ap; + struct { + uint64_t rflags; + } hlt; + } u; +}; + +/* APIs to inject faults into the guest */ +void vm_inject_fault(void *vm, int vcpuid, int vector, int errcode_valid, + int errcode); + +static __inline void +vm_inject_ud(void *vm, int vcpuid) +{ + vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0); +} + +static __inline void +vm_inject_gp(void *vm, int vcpuid) +{ + vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0); +} + +static __inline void +vm_inject_ac(void *vm, int vcpuid, int errcode) +{ + vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode); +} + +static __inline void +vm_inject_ss(void *vm, int vcpuid, int errcode) +{ + vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode); +} + +void vm_inject_pf(void *vm, int vcpuid, int error_code, uint64_t cr2); + +int vm_restart_instruction(void *vm, int vcpuid); + +#ifndef __FreeBSD__ +#ifdef _KERNEL +extern void vmm_sol_glue_init(void); +extern void vmm_sol_glue_cleanup(void); + +extern int vmm_mod_load(void); +extern int vmm_mod_unload(void); +#endif +#endif + +#endif /* _VMM_H_ */ diff --git a/usr/src/uts/i86pc/sys/vmm_dev.h b/usr/src/uts/i86pc/sys/vmm_dev.h new file mode 100644 index 0000000000..3e74eb8786 --- /dev/null +++ b/usr/src/uts/i86pc/sys/vmm_dev.h @@ -0,0 +1,334 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/amd64/include/vmm_dev.h 268889 2014-07-19 20:59:08Z neel $ + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. 
+ */ + +#ifndef _VMM_DEV_H_ +#define _VMM_DEV_H_ + +#ifdef _KERNEL +void vmmdev_init(void); +int vmmdev_cleanup(void); +#endif + +struct vm_memory_segment { + vm_paddr_t gpa; /* in */ + size_t len; + int wired; +}; + +struct vm_register { + int cpuid; + int regnum; /* enum vm_reg_name */ + uint64_t regval; +}; + +struct vm_seg_desc { /* data or code segment */ + int cpuid; + int regnum; /* enum vm_reg_name */ + struct seg_desc desc; +}; + +struct vm_run { + int cpuid; + struct vm_exit vm_exit; +}; + +struct vm_exception { + int cpuid; + int vector; + uint32_t error_code; + int error_code_valid; + int restart_instruction; +}; + +struct vm_lapic_msi { + uint64_t msg; + uint64_t addr; +}; + +struct vm_lapic_irq { + int cpuid; + int vector; +}; + +struct vm_ioapic_irq { + int irq; +}; + +struct vm_isa_irq { + int atpic_irq; + int ioapic_irq; +}; + +struct vm_isa_irq_trigger { + int atpic_irq; + enum vm_intr_trigger trigger; +}; + +struct vm_capability { + int cpuid; + enum vm_cap_type captype; + int capval; + int allcpus; +}; + +struct vm_pptdev { + int bus; + int slot; + int func; +}; + +struct vm_pptdev_mmio { + int bus; + int slot; + int func; + vm_paddr_t gpa; + vm_paddr_t hpa; + size_t len; +}; + +struct vm_pptdev_msi { + int vcpu; + int bus; + int slot; + int func; + int numvec; /* 0 means disabled */ + uint32_t msg; + uint64_t addr; +}; + +struct vm_pptdev_msix { + int vcpu; + int bus; + int slot; + int func; + int idx; + uint32_t msg; + uint32_t vector_control; + uint64_t addr; +}; + +struct vm_nmi { + int cpuid; +}; + +#define MAX_VM_STATS 64 +struct vm_stats { + int cpuid; /* in */ + int num_entries; /* out */ + struct timeval tv; + uint64_t statbuf[MAX_VM_STATS]; +}; + +struct vm_stat_desc { + int index; /* in */ + char desc[128]; /* out */ +}; + +struct vm_x2apic { + int cpuid; + enum x2apic_state state; +}; + +struct vm_gpa_pte { + uint64_t gpa; /* in */ + uint64_t pte[4]; /* out */ + int ptenum; +}; + +struct vm_hpet_cap { + uint32_t capabilities; /* lower 32 bits of HPET capabilities */ +}; + +struct vm_activate_cpu { + int vcpuid; +}; + +struct vm_gla2gpa { + int vcpuid; /* inputs */ + int prot; /* PROT_READ or PROT_WRITE */ + uint64_t gla; + struct vm_guest_paging paging; + int fault; /* outputs */ + uint64_t gpa; +}; + +struct vm_cpuset { + int which; + int cpusetsize; + cpuset_t *cpus; +}; +#define VM_ACTIVE_CPUS 0 +#define VM_SUSPENDED_CPUS 1 + +enum { + /* general routines */ + IOCNUM_ABIVERS = 0, + IOCNUM_RUN = 1, + IOCNUM_SET_CAPABILITY = 2, + IOCNUM_GET_CAPABILITY = 3, + + /* memory apis */ + IOCNUM_MAP_MEMORY = 10, + IOCNUM_GET_MEMORY_SEG = 11, + IOCNUM_GET_GPA_PMAP = 12, + IOCNUM_GLA2GPA = 13, + + /* register/state accessors */ + IOCNUM_SET_REGISTER = 20, + IOCNUM_GET_REGISTER = 21, + IOCNUM_SET_SEGMENT_DESCRIPTOR = 22, + IOCNUM_GET_SEGMENT_DESCRIPTOR = 23, + + /* interrupt injection */ + IOCNUM_INJECT_EXCEPTION = 30, + IOCNUM_LAPIC_IRQ = 31, + IOCNUM_INJECT_NMI = 32, + IOCNUM_IOAPIC_ASSERT_IRQ = 33, + IOCNUM_IOAPIC_DEASSERT_IRQ = 34, + IOCNUM_IOAPIC_PULSE_IRQ = 35, + IOCNUM_LAPIC_MSI = 36, + IOCNUM_LAPIC_LOCAL_IRQ = 37, + IOCNUM_IOAPIC_PINCOUNT = 38, + IOCNUM_RESTART_INSTRUCTION = 39, + + /* PCI pass-thru */ + IOCNUM_BIND_PPTDEV = 40, + IOCNUM_UNBIND_PPTDEV = 41, + IOCNUM_MAP_PPTDEV_MMIO = 42, + IOCNUM_PPTDEV_MSI = 43, + IOCNUM_PPTDEV_MSIX = 44, + + /* statistics */ + IOCNUM_VM_STATS = 50, + IOCNUM_VM_STAT_DESC = 51, + + /* kernel device state */ + IOCNUM_SET_X2APIC_STATE = 60, + IOCNUM_GET_X2APIC_STATE = 61, + IOCNUM_GET_HPET_CAPABILITIES = 62, + + /* legacy 
interrupt injection */ + IOCNUM_ISA_ASSERT_IRQ = 80, + IOCNUM_ISA_DEASSERT_IRQ = 81, + IOCNUM_ISA_PULSE_IRQ = 82, + IOCNUM_ISA_SET_IRQ_TRIGGER = 83, + + /* vm_cpuset */ + IOCNUM_ACTIVATE_CPU = 90, + IOCNUM_GET_CPUSET = 91, +}; + +#define VM_RUN \ + _IOWR('v', IOCNUM_RUN, struct vm_run) +#define VM_MAP_MEMORY \ + _IOWR('v', IOCNUM_MAP_MEMORY, struct vm_memory_segment) +#define VM_GET_MEMORY_SEG \ + _IOWR('v', IOCNUM_GET_MEMORY_SEG, struct vm_memory_segment) +#define VM_SET_REGISTER \ + _IOW('v', IOCNUM_SET_REGISTER, struct vm_register) +#define VM_GET_REGISTER \ + _IOWR('v', IOCNUM_GET_REGISTER, struct vm_register) +#define VM_SET_SEGMENT_DESCRIPTOR \ + _IOW('v', IOCNUM_SET_SEGMENT_DESCRIPTOR, struct vm_seg_desc) +#define VM_GET_SEGMENT_DESCRIPTOR \ + _IOWR('v', IOCNUM_GET_SEGMENT_DESCRIPTOR, struct vm_seg_desc) +#define VM_INJECT_EXCEPTION \ + _IOW('v', IOCNUM_INJECT_EXCEPTION, struct vm_exception) +#define VM_LAPIC_IRQ \ + _IOW('v', IOCNUM_LAPIC_IRQ, struct vm_lapic_irq) +#define VM_LAPIC_LOCAL_IRQ \ + _IOW('v', IOCNUM_LAPIC_LOCAL_IRQ, struct vm_lapic_irq) +#define VM_LAPIC_MSI \ + _IOW('v', IOCNUM_LAPIC_MSI, struct vm_lapic_msi) +#define VM_IOAPIC_ASSERT_IRQ \ + _IOW('v', IOCNUM_IOAPIC_ASSERT_IRQ, struct vm_ioapic_irq) +#define VM_IOAPIC_DEASSERT_IRQ \ + _IOW('v', IOCNUM_IOAPIC_DEASSERT_IRQ, struct vm_ioapic_irq) +#define VM_IOAPIC_PULSE_IRQ \ + _IOW('v', IOCNUM_IOAPIC_PULSE_IRQ, struct vm_ioapic_irq) +#define VM_IOAPIC_PINCOUNT \ + _IOR('v', IOCNUM_IOAPIC_PINCOUNT, int) +#define VM_ISA_ASSERT_IRQ \ + _IOW('v', IOCNUM_ISA_ASSERT_IRQ, struct vm_isa_irq) +#define VM_ISA_DEASSERT_IRQ \ + _IOW('v', IOCNUM_ISA_DEASSERT_IRQ, struct vm_isa_irq) +#define VM_ISA_PULSE_IRQ \ + _IOW('v', IOCNUM_ISA_PULSE_IRQ, struct vm_isa_irq) +#define VM_ISA_SET_IRQ_TRIGGER \ + _IOW('v', IOCNUM_ISA_SET_IRQ_TRIGGER, struct vm_isa_irq_trigger) +#define VM_SET_CAPABILITY \ + _IOW('v', IOCNUM_SET_CAPABILITY, struct vm_capability) +#define VM_GET_CAPABILITY \ + _IOWR('v', IOCNUM_GET_CAPABILITY, struct vm_capability) +#define VM_BIND_PPTDEV \ + _IOW('v', IOCNUM_BIND_PPTDEV, struct vm_pptdev) +#define VM_UNBIND_PPTDEV \ + _IOW('v', IOCNUM_UNBIND_PPTDEV, struct vm_pptdev) +#define VM_MAP_PPTDEV_MMIO \ + _IOW('v', IOCNUM_MAP_PPTDEV_MMIO, struct vm_pptdev_mmio) +#define VM_PPTDEV_MSI \ + _IOW('v', IOCNUM_PPTDEV_MSI, struct vm_pptdev_msi) +#define VM_PPTDEV_MSIX \ + _IOW('v', IOCNUM_PPTDEV_MSIX, struct vm_pptdev_msix) +#define VM_INJECT_NMI \ + _IOW('v', IOCNUM_INJECT_NMI, struct vm_nmi) +#ifdef __FreeBSD__ +#define VM_STATS \ + _IOWR('v', IOCNUM_VM_STATS, struct vm_stats) +#endif +#define VM_STAT_DESC \ + _IOWR('v', IOCNUM_VM_STAT_DESC, struct vm_stat_desc) +#define VM_SET_X2APIC_STATE \ + _IOW('v', IOCNUM_SET_X2APIC_STATE, struct vm_x2apic) +#define VM_GET_X2APIC_STATE \ + _IOWR('v', IOCNUM_GET_X2APIC_STATE, struct vm_x2apic) +#define VM_GET_HPET_CAPABILITIES \ + _IOR('v', IOCNUM_GET_HPET_CAPABILITIES, struct vm_hpet_cap) +#define VM_GET_GPA_PMAP \ + _IOWR('v', IOCNUM_GET_GPA_PMAP, struct vm_gpa_pte) +#define VM_GLA2GPA \ + _IOWR('v', IOCNUM_GLA2GPA, struct vm_gla2gpa) +#define VM_ACTIVATE_CPU \ + _IOW('v', IOCNUM_ACTIVATE_CPU, struct vm_activate_cpu) +#define VM_GET_CPUS \ + _IOW('v', IOCNUM_GET_CPUSET, struct vm_cpuset) +#define VM_RESTART_INSTRUCTION \ + _IOW('v', IOCNUM_RESTART_INSTRUCTION, int) +#endif diff --git a/usr/src/uts/i86pc/sys/vmm_impl.h b/usr/src/uts/i86pc/sys/vmm_impl.h new file mode 100644 index 0000000000..1602fa286d --- /dev/null +++ b/usr/src/uts/i86pc/sys/vmm_impl.h @@ -0,0 +1,86 @@ +/* + * This file 
and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2014 Pluribus Networks Inc. + */ + +#ifndef _VMM_IMPL_H_ +#define _VMM_IMPL_H_ + +#include +#include +#include + +/* + * /dev names: + * /dev/vmmctl - control device + * /dev/vmm/ - vm devices + */ +#define VMM_DRIVER_NAME "vmm" + +#define VMM_CTL_MINOR_NODE "ctl" +#define VMM_CTL_MINOR_NAME VMM_DRIVER_NAME VMM_CTL_NODE +#define VMM_CTL_MINOR 0 + +#define VMM_IOC_BASE (('V' << 16) | ('M' << 8)) + +#define VMM_CREATE_VM (VMM_IOC_BASE | 0x01) +#define VMM_DESTROY_VM (VMM_IOC_BASE | 0x02) + +struct vmm_ioctl { + char vmm_name[VM_MAX_NAMELEN]; +}; + +#ifdef _KERNEL +struct vmm_softc { + boolean_t open; + minor_t minor; + struct vm *vm; + char name[VM_MAX_NAMELEN]; + SLIST_ENTRY(vmm_softc) link; +}; +#endif + +/* + * VMM trace ring buffer constants + */ +#define VMM_DMSG_RING_SIZE 0x100000 /* 1MB */ +#define VMM_DMSG_BUF_SIZE 256 + +/* + * VMM trace ring buffer content + */ +typedef struct vmm_trace_dmsg { + timespec_t timestamp; + char buf[VMM_DMSG_BUF_SIZE]; + struct vmm_trace_dmsg *next; +} vmm_trace_dmsg_t; + +/* + * VMM trace ring buffer header + */ +typedef struct vmm_trace_rbuf { + kmutex_t lock; /* lock to avoid clutter */ + int looped; /* completed ring */ + int allocfailed; /* dmsg mem alloc failed */ + size_t size; /* current size */ + size_t maxsize; /* max size */ + vmm_trace_dmsg_t *dmsgh; /* messages head */ + vmm_trace_dmsg_t *dmsgp; /* ptr to last message */ +} vmm_trace_rbuf_t; + +/* + * VMM trace ring buffer interfaces + */ +void vmm_trace_log(const char *fmt, ...); + +#endif /* _VMM_IMPL_H_ */ diff --git a/usr/src/uts/i86pc/sys/vmm_instruction_emul.h b/usr/src/uts/i86pc/sys/vmm_instruction_emul.h new file mode 100644 index 0000000000..8138890a2c --- /dev/null +++ b/usr/src/uts/i86pc/sys/vmm_instruction_emul.h @@ -0,0 +1,126 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ * $FreeBSD: head/sys/amd64/include/vmm_instruction_emul.h 276479 2014-12-31 20:31:32Z dim $
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2015 Pluribus Networks Inc.
+ */
+
+#ifndef _VMM_INSTRUCTION_EMUL_H_
+#define _VMM_INSTRUCTION_EMUL_H_
+
+#include
+
+/*
+ * Callback functions to read and write memory regions.
+ */
+typedef int (*mem_region_read_t)(void *vm, int cpuid, uint64_t gpa,
+    uint64_t *rval, int rsize, void *arg);
+
+typedef int (*mem_region_write_t)(void *vm, int cpuid, uint64_t gpa,
+    uint64_t wval, int wsize, void *arg);
+
+/*
+ * Emulate the decoded 'vie' instruction.
+ *
+ * The callbacks 'mrr' and 'mrw' emulate reads and writes to the memory region
+ * containing 'gpa'. 'mrarg' is an opaque argument that is passed into the
+ * callback functions.
+ *
+ * 'void *vm' should be 'struct vm *' when called from kernel context and
+ * 'struct vmctx *' when called from user context.
+ */
+int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie,
+    struct vm_guest_paging *paging, mem_region_read_t mrr,
+    mem_region_write_t mrw, void *mrarg);
+
+int vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
+    uint64_t val, int size);
+
+/*
+ * Returns 1 if an alignment check exception should be injected and 0 otherwise.
+ */
+int vie_alignment_check(int cpl, int operand_size, uint64_t cr0,
+    uint64_t rflags, uint64_t gla);
+
+/* Returns 1 if the 'gla' is not canonical and 0 otherwise. */
+int vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla);
+
+uint64_t vie_size2mask(int size);
+
+int vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
+    struct seg_desc *desc, uint64_t off, int length, int addrsize, int prot,
+    uint64_t *gla);
+
+#ifdef _KERNEL
+/*
+ * APIs to fetch and decode the instruction from nested page fault handler.
+ *
+ * 'vie' must be initialized before calling 'vmm_fetch_instruction()'
+ */
+int vmm_fetch_instruction(struct vm *vm, int cpuid,
+    struct vm_guest_paging *guest_paging,
+    uint64_t rip, int inst_length, struct vie *vie);
+
+/*
+ * Translate the guest linear address 'gla' to a guest physical address.
+ *
+ * Returns 0 on success and '*gpa' contains the result of the translation.
+ * Returns 1 if an exception was injected into the guest.
+ * Returns -1 otherwise.
+ */
+int vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
+    uint64_t gla, int prot, uint64_t *gpa);
+
+void vie_init(struct vie *vie, const char *inst_bytes, int inst_length);
+
+/*
+ * Decode the instruction fetched into 'vie' so it can be emulated.
+ *
+ * 'gla' is the guest linear address provided by the hardware assist
+ * that caused the nested page table fault. It is used to verify that
+ * the software instruction decoding is in agreement with the hardware.
+ *
+ * Some hardware assists do not provide the 'gla' to the hypervisor.
+ * To skip the 'gla' verification for this or any other reason pass
+ * in VIE_INVALID_GLA instead.
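+ *
+ * Taken together with the declarations above, a nested page fault
+ * handler might run the fetch/decode/emulate pipeline as follows
+ * (a sketch; error handling omitted and the surrounding locals
+ * assumed):
+ *
+ *	vie_init(&vie, NULL, 0);
+ *	(void) vmm_fetch_instruction(vm, cpuid, paging, rip,
+ *	    inst_length, &vie);
+ *	(void) vmm_decode_instruction(vm, cpuid, gla, cpu_mode,
+ *	    cs_d, &vie);
+ *	(void) vmm_emulate_instruction(vm, cpuid, gpa, &vie, paging,
+ *	    mrr, mrw, mrarg);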
+ */ +#define VIE_INVALID_GLA (1UL << 63) /* a non-canonical address */ +int vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, + enum vm_cpu_mode cpu_mode, int csd, struct vie *vie); +#endif /* _KERNEL */ + +#endif /* _VMM_INSTRUCTION_EMUL_H_ */ diff --git a/usr/src/uts/i86pc/viona/Makefile b/usr/src/uts/i86pc/viona/Makefile new file mode 100644 index 0000000000..c2b8bd8dcf --- /dev/null +++ b/usr/src/uts/i86pc/viona/Makefile @@ -0,0 +1,72 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2013 Pluribus Networks Inc. +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = viona +OBJECTS = $(VIONA_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(VIONA_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(USR_DRV_DIR)/$(MODULE) +CONF_SRCDIR = $(UTSBASE)/i86pc/io/viona + +# +# Include common rules. +# +include $(UTSBASE)/i86pc/Makefile.i86pc + +# +# Define targets +# +ALL_TARGET = $(BINARY) $(SRC_CONFILE) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) + +# +# Overrides +# +CFLAGS += $(CCVERBOSE) +LDFLAGS += -dy -Ndrv/dld -Nmisc/mac -Nmisc/dls -Ndrv/vmm + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/i86pc/Makefile.targ diff --git a/usr/src/uts/i86pc/vmm/Makefile b/usr/src/uts/i86pc/vmm/Makefile new file mode 100644 index 0000000000..b3ab735781 --- /dev/null +++ b/usr/src/uts/i86pc/vmm/Makefile @@ -0,0 +1,94 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2013 Pluribus Networks Inc. +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = vmm +OBJECTS = $(VMM_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(VMM_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(USR_DRV_DIR)/$(MODULE) +CONF_SRCDIR = $(UTSBASE)/i86pc/io/vmm + +# +# Include common rules. +# +include $(UTSBASE)/i86pc/Makefile.i86pc + +# +# Define targets +# +ALL_TARGET = $(BINARY) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) + +# +# Overrides and additions +# + +# These sources only compile with gcc. Workaround a confluence of cruft +# regarding dmake and shadow compilation by neutering the sun compiler. 
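+# Passing -_cc=-xdryrun makes the cw wrapper run the Studio compiler in
+# dry-run mode, so only the gcc-built objects are actually produced.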
+amd64_CC = $(ONBLD_TOOLS)/bin/$(MACH)/cw -_gcc +CFLAGS += -_cc=-xdryrun + +ALL_BUILDS = $(ALL_BUILDSONLY64) +DEF_BUILDS = $(DEF_BUILDSONLY64) +PRE_INC_PATH = -I$(COMPAT)/freebsd -I$(COMPAT)/freebsd/amd64 \ + -I$(CONTRIB)/freebsd -I$(CONTRIB)/freebsd/amd64 +INC_PATH += -I$(UTSBASE)/i86pc/io/vmm -I$(UTSBASE)/i86pc/io/vmm/io +AS_INC_PATH += -I$(UTSBASE)/i86pc/io/vmm -I$(OBJS_DIR) + +CFLAGS += -_gcc=-Wimplicit-function-declaration + +OFFSETS_SRC = $(CONF_SRCDIR)/offsets.in +ASSYM_H = $(OBJS_DIR)/vmx_assym.h + +CLEANFILES += $(ASSYM_H) + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/i86pc/Makefile.targ + +$(OBJECTS): $(ASSYM_H) + +$(ASSYM_H): $(OFFSETS_SRC) $(GENASSYM) + $(OFFSETS_CREATE) -I../../i86pc/io/vmm < $(OFFSETS_SRC) >$@ -- cgit v1.2.3 From 8c6284f26f84d01dcbb0f93a15e9cf0da38c36de Mon Sep 17 00:00:00 2001 From: Patrick Mooney Date: Mon, 11 May 2020 13:36:12 -0500 Subject: OS-8076 Panic in vlapic_callout_handler (#296) Reviewed by: Hans Rosenfeld Reviewed by: Jason King Reviewed by: Mike Zeller Approved by: Mike Zeller --- usr/src/compat/freebsd/sys/callout.h | 24 +++++++++++++------- usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c | 39 ++++++++++++++++----------------- 2 files changed, 35 insertions(+), 28 deletions(-) (limited to 'usr/src/uts/i86pc') diff --git a/usr/src/compat/freebsd/sys/callout.h b/usr/src/compat/freebsd/sys/callout.h index 6087a09f54..11823e6321 100644 --- a/usr/src/compat/freebsd/sys/callout.h +++ b/usr/src/compat/freebsd/sys/callout.h @@ -12,6 +12,7 @@ /* * Copyright 2014 Pluribus Networks Inc. * Copyright 2018 Joyent, Inc. + * Copyright 2020 Oxide Computer Company */ #ifndef _COMPAT_FREEBSD_SYS_CALLOUT_H_ @@ -21,20 +22,27 @@ struct callout { cyclic_id_t c_cyc_id; - int c_flags; + hrtime_t c_target; + hrtime_t c_fired; void (*c_func)(void *); void *c_arg; - }; -#define CALLOUT_ACTIVE 0x0002 /* callout is currently active */ -#define CALLOUT_PENDING 0x0004 /* callout is waiting for timeout */ - #define C_ABSOLUTE 0x0200 /* event time is absolute. */ -#define callout_active(c) ((c)->c_flags & CALLOUT_ACTIVE) -#define callout_deactivate(c) ((c)->c_flags &= ~CALLOUT_ACTIVE) -#define callout_pending(c) ((c)->c_flags & CALLOUT_PENDING) +/* Callout considered active if t_target has not been zeroed */ +#define callout_active(c) ((c)->c_target != 0) +#define callout_deactivate(c) ((c)->c_target = 0) + +/* + * If a callout is rescheduled (into the future) while its handler is running, + * it will be able to detect the pending invocation by the target time being + * greater than the time at which the handler was fired. + * + * This is only valid when checked from the callout handler, which is the only + * place where it is used by bhyve today. + */ +#define callout_pending(c) ((c)->c_target > (c)->c_fired) void vmm_glue_callout_init(struct callout *c, int mpsafe); int vmm_glue_callout_reset_sbt(struct callout *c, sbintime_t sbt, diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c index c8d5aa24e9..2401774ab7 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c @@ -37,6 +37,7 @@ * * Copyright 2014 Pluribus Networks Inc. * Copyright 2019 Joyent, Inc. 
+ * Copyright 2020 Oxide Computer Company */ #include @@ -320,8 +321,13 @@ vmm_glue_callout_handler(void *arg) { struct callout *c = arg; - c->c_flags &= ~CALLOUT_PENDING; - if (c->c_flags & CALLOUT_ACTIVE) { + if (callout_active(c)) { + /* + * Record the handler fire time so that callout_pending() is + * able to detect if the callout becomes rescheduled during the + * course of the handler. + */ + c->c_fired = gethrtime(); (c->c_func)(c->c_arg); } } @@ -337,17 +343,9 @@ vmm_glue_callout_init(struct callout *c, int mpsafe) hdlr.cyh_arg = c; when.cyt_when = CY_INFINITY; when.cyt_interval = CY_INFINITY; + bzero(c, sizeof (*c)); mutex_enter(&cpu_lock); -#if 0 - /* - * XXXJOY: according to the freebsd sources, callouts do not begin - * their life in the ACTIVE state. - */ - c->c_flags |= CALLOUT_ACTIVE; -#else - bzero(c, sizeof (*c)); -#endif c->c_cyc_id = cyclic_add(&hdlr, &when); mutex_exit(&cpu_lock); } @@ -367,15 +365,14 @@ vmm_glue_callout_reset_sbt(struct callout *c, sbintime_t sbt, sbintime_t pr, ASSERT(c->c_cyc_id != CYCLIC_NONE); + if ((flags & C_ABSOLUTE) == 0) { + target += gethrtime(); + } + c->c_func = func; c->c_arg = arg; - c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING); - - if (flags & C_ABSOLUTE) { - cyclic_reprogram(c->c_cyc_id, target); - } else { - cyclic_reprogram(c->c_cyc_id, target + gethrtime()); - } + c->c_target = target; + cyclic_reprogram(c->c_cyc_id, target); return (0); } @@ -384,8 +381,9 @@ int vmm_glue_callout_stop(struct callout *c) { ASSERT(c->c_cyc_id != CYCLIC_NONE); + + c->c_target = 0; cyclic_reprogram(c->c_cyc_id, CY_INFINITY); - c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING); return (0); } @@ -394,10 +392,11 @@ int vmm_glue_callout_drain(struct callout *c) { ASSERT(c->c_cyc_id != CYCLIC_NONE); + + c->c_target = 0; mutex_enter(&cpu_lock); cyclic_remove(c->c_cyc_id); c->c_cyc_id = CYCLIC_NONE; - c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING); mutex_exit(&cpu_lock); return (0); -- cgit v1.2.3 From 4c87aefe8930bd07275b8dd2e96ea5f24d93a52e Mon Sep 17 00:00:00 2001 From: Patrick Mooney Date: Tue, 10 Oct 2017 12:37:29 +0200 Subject: 12665 want modern bhyve Portions contributed by: Hans Rosenfeld Portions contributed by: John Levon Portions contributed by: Mike Gerdts Portions contributed by: Andy Fiddaman Portions contributed by: Dominik Hassler Portions contributed by: Jerry Jelinek Portions contributed by: Robert Mustacchi Portions contributed by: Mike Zeller Reviewed by: Andy Fiddaman Approved by: Dan McDonald --- exception_lists/check_rtime | 2 + exception_lists/copyright | 84 + exception_lists/cstyle | 83 + exception_lists/hdrchk | 50 + exception_lists/packaging | 6 + exception_lists/wscheck | 81 + usr/contrib/freebsd/amd64/machine/pmap.h | 455 ++++ usr/contrib/freebsd/amd64/machine/specialreg.h | 6 - usr/contrib/freebsd/dev/io/iodev.h | 44 + usr/contrib/freebsd/dev/mii/mii.h | 239 ++ usr/contrib/freebsd/dev/nvme/nvme.h | 1511 +++++++++++ usr/contrib/freebsd/dev/usb/controller/xhcireg.h | 224 ++ usr/contrib/freebsd/dev/usb/usb.h | 801 ++++++ usr/contrib/freebsd/dev/usb/usb_endian.h | 121 + usr/contrib/freebsd/dev/usb/usb_freebsd.h | 101 + usr/contrib/freebsd/dev/usb/usbdi.h | 657 +++++ usr/contrib/freebsd/isa/rtc.h | 125 + usr/contrib/freebsd/lib/libutil/humanize_number.c | 179 ++ usr/contrib/freebsd/sys/ata.h | 392 ++- usr/contrib/freebsd/sys/linker_set.h | 119 - usr/contrib/freebsd/sys/pciio.h | 146 ++ usr/contrib/freebsd/sys/queue.h | 787 ++++++ usr/contrib/freebsd/x86/segments.h | 274 ++ usr/contrib/freebsd/x86/specialreg.h | 320 ++- 
usr/src/Makefile.master | 6 + usr/src/cmd/Makefile | 2 + usr/src/cmd/bhyve/Makefile | 129 +- usr/src/cmd/bhyve/Makefile.com | 94 - usr/src/cmd/bhyve/acpi.c | 1007 ++++++++ usr/src/cmd/bhyve/acpi.h | 4 +- usr/src/cmd/bhyve/ahci.h | 534 ++-- usr/src/cmd/bhyve/amd64/Makefile | 21 - usr/src/cmd/bhyve/atkbdc.c | 218 +- usr/src/cmd/bhyve/atkbdc.h | 2 +- usr/src/cmd/bhyve/bhyve_sol_glue.c | 57 +- usr/src/cmd/bhyve/bhyvegc.c | 35 +- usr/src/cmd/bhyve/bhyvegc.h | 6 +- usr/src/cmd/bhyve/bhyverun.c | 772 +++++- usr/src/cmd/bhyve/bhyverun.h | 11 +- usr/src/cmd/bhyve/block_if.c | 576 ++++- usr/src/cmd/bhyve/block_if.h | 25 +- usr/src/cmd/bhyve/bootrom.c | 113 + usr/src/cmd/bhyve/bootrom.h | 40 + usr/src/cmd/bhyve/console.c | 41 +- usr/src/cmd/bhyve/console.h | 19 +- usr/src/cmd/bhyve/consport.c | 45 +- usr/src/cmd/bhyve/dbgport.c | 180 ++ usr/src/cmd/bhyve/dbgport.h | 4 +- usr/src/cmd/bhyve/fwctl.c | 552 ++++ usr/src/cmd/bhyve/fwctl.h | 56 + usr/src/cmd/bhyve/gdb.c | 1523 +++++++++++ usr/src/cmd/bhyve/gdb.h | 38 + usr/src/cmd/bhyve/inout.c | 44 +- usr/src/cmd/bhyve/inout.h | 4 +- usr/src/cmd/bhyve/ioapic.c | 15 +- usr/src/cmd/bhyve/ioapic.h | 10 +- usr/src/cmd/bhyve/iov.c | 148 ++ usr/src/cmd/bhyve/iov.h | 44 + usr/src/cmd/bhyve/mem.c | 135 +- usr/src/cmd/bhyve/mem.h | 10 +- usr/src/cmd/bhyve/mevent.c | 680 +++++ usr/src/cmd/bhyve/mevent.h | 53 + usr/src/cmd/bhyve/mevent_test.c | 282 ++ usr/src/cmd/bhyve/mptbl.c | 6 +- usr/src/cmd/bhyve/mptbl.h | 4 +- usr/src/cmd/bhyve/pci_ahci.c | 1060 +++++--- usr/src/cmd/bhyve/pci_e82545.c | 2418 +++++++++++++++++ usr/src/cmd/bhyve/pci_emul.c | 210 +- usr/src/cmd/bhyve/pci_emul.h | 12 +- usr/src/cmd/bhyve/pci_fbuf.c | 467 ++++ usr/src/cmd/bhyve/pci_hostbridge.c | 170 +- usr/src/cmd/bhyve/pci_irq.c | 47 +- usr/src/cmd/bhyve/pci_irq.h | 8 +- usr/src/cmd/bhyve/pci_lpc.c | 66 +- usr/src/cmd/bhyve/pci_lpc.h | 6 +- usr/src/cmd/bhyve/pci_nvme.c | 1953 ++++++++++++++ usr/src/cmd/bhyve/pci_passthru.c | 937 +++++++ usr/src/cmd/bhyve/pci_uart.c | 121 + usr/src/cmd/bhyve/pci_virtio_block.c | 284 +- usr/src/cmd/bhyve/pci_virtio_console.c | 701 +++++ usr/src/cmd/bhyve/pci_virtio_net.c | 723 ++++-- usr/src/cmd/bhyve/pci_virtio_rnd.c | 209 ++ usr/src/cmd/bhyve/pci_virtio_scsi.c | 737 ++++++ usr/src/cmd/bhyve/pci_virtio_viona.c | 9 +- usr/src/cmd/bhyve/pci_xhci.c | 2855 +++++++++++++++++++++ usr/src/cmd/bhyve/pci_xhci.h | 355 +++ usr/src/cmd/bhyve/pm.c | 65 +- usr/src/cmd/bhyve/pmtmr.c | 212 -- usr/src/cmd/bhyve/post.c | 6 +- usr/src/cmd/bhyve/ps2kbd.c | 317 +-- usr/src/cmd/bhyve/ps2kbd.h | 2 + usr/src/cmd/bhyve/ps2mouse.c | 79 +- usr/src/cmd/bhyve/ps2mouse.h | 6 +- usr/src/cmd/bhyve/rfb.c | 914 ++++++- usr/src/cmd/bhyve/rfb.h | 8 +- usr/src/cmd/bhyve/rtc.c | 317 +-- usr/src/cmd/bhyve/rtc.h | 8 +- usr/src/cmd/bhyve/smbiostbl.c | 98 +- usr/src/cmd/bhyve/smbiostbl.h | 9 +- usr/src/cmd/bhyve/sockstream.c | 86 + usr/src/cmd/bhyve/sockstream.h | 35 + usr/src/cmd/bhyve/spinup_ap.c | 6 +- usr/src/cmd/bhyve/spinup_ap.h | 4 +- usr/src/cmd/bhyve/task_switch.c | 941 +++++++ usr/src/cmd/bhyve/test/Makefile | 18 + usr/src/cmd/bhyve/test/Makefile.com | 61 + usr/src/cmd/bhyve/test/Makefile.subdirs | 29 + usr/src/cmd/bhyve/test/Makefile.targ | 55 + usr/src/cmd/bhyve/test/scripts/Makefile | 28 + usr/src/cmd/bhyve/test/scripts/bhyvetest.ksh | 231 ++ usr/src/cmd/bhyve/test/tst/Makefile | 18 + usr/src/cmd/bhyve/test/tst/mevent/Makefile | 30 + usr/src/cmd/bhyve/test/tst/mevent/lists.delete.c | 172 ++ usr/src/cmd/bhyve/test/tst/mevent/mevent.c | 57 + usr/src/cmd/bhyve/test/tst/mevent/read.disable.c | 163 
++ usr/src/cmd/bhyve/test/tst/mevent/read.pause.c | 152 ++ usr/src/cmd/bhyve/test/tst/mevent/read.requeue.c | 108 + usr/src/cmd/bhyve/test/tst/mevent/testlib.c | 70 + usr/src/cmd/bhyve/test/tst/mevent/testlib.h | 93 + usr/src/cmd/bhyve/uart_emul.c | 987 ++++--- usr/src/cmd/bhyve/uart_emul.h | 4 +- usr/src/cmd/bhyve/usb_emul.c | 78 + usr/src/cmd/bhyve/usb_emul.h | 164 ++ usr/src/cmd/bhyve/usb_mouse.c | 809 ++++++ usr/src/cmd/bhyve/vga.c | 106 +- usr/src/cmd/bhyve/vga.h | 80 +- usr/src/cmd/bhyve/virtio.c | 68 +- usr/src/cmd/bhyve/virtio.h | 75 +- usr/src/cmd/bhyve/xmsr.c | 21 +- usr/src/cmd/bhyve/xmsr.h | 4 +- usr/src/cmd/bhyveconsole/Makefile | 41 - usr/src/cmd/bhyveconsole/bhyveconsole.c | 360 --- usr/src/cmd/bhyveconsole/i386/Makefile | 43 - usr/src/cmd/bhyvectl/Makefile | 45 +- usr/src/cmd/bhyvectl/Makefile.com | 48 - usr/src/cmd/bhyvectl/amd64/Makefile | 21 - usr/src/cmd/bhyvectl/bhyvectl.c | 2347 +++++++++++------ usr/src/cmd/bhyveload-uefi/Makefile | 41 - usr/src/cmd/bhyveload-uefi/Makefile.com | 52 - usr/src/cmd/bhyveload-uefi/amd64/Makefile | 21 - usr/src/cmd/bhyveload-uefi/bhyveload-uefi.c | 190 -- usr/src/cmd/bhyveload-uefi/i386/Makefile | 18 - usr/src/cmd/devfsadm/i386/misc_link_i386.c | 18 + usr/src/cmd/mdb/intel/amd64/vmm/Makefile | 20 - usr/src/cmd/mdb/intel/amd64/vmm/amd64/Makefile | 32 - usr/src/cmd/mdb/intel/amd64/vmm/vmm.c | 238 -- usr/src/compat/freebsd/amd64/machine/asmacros.h | 3 + usr/src/compat/freebsd/amd64/machine/atomic.h | 92 +- usr/src/compat/freebsd/amd64/machine/cpufunc.h | 154 +- usr/src/compat/freebsd/amd64/machine/fpu.h | 3 +- usr/src/compat/freebsd/amd64/machine/iodev.h | 19 + usr/src/compat/freebsd/amd64/machine/md_var.h | 4 + usr/src/compat/freebsd/amd64/machine/param.h | 2 + usr/src/compat/freebsd/amd64/machine/pmap.h | 461 +++- usr/src/compat/freebsd/amd64/machine/reg.h | 23 + usr/src/compat/freebsd/amd64/machine/smp.h | 11 + usr/src/compat/freebsd/amd64/machine/specialreg.h | 61 + usr/src/compat/freebsd/amd64/machine/vmm.h | 3 + usr/src/compat/freebsd/amd64/machine/vmparam.h | 26 + usr/src/compat/freebsd/err.h | 23 + usr/src/compat/freebsd/libutil.h | 14 + usr/src/compat/freebsd/net/ethernet.h | 14 + usr/src/compat/freebsd/pthread_np.h | 4 +- usr/src/compat/freebsd/sys/_cpuset.h | 33 + usr/src/compat/freebsd/sys/callout.h | 4 + usr/src/compat/freebsd/sys/cdefs.h | 67 +- usr/src/compat/freebsd/sys/clock.h | 110 + usr/src/compat/freebsd/sys/cpuset.h | 112 +- usr/src/compat/freebsd/sys/endian.h | 11 + usr/src/compat/freebsd/sys/eventhandler.h | 19 + usr/src/compat/freebsd/sys/ioctl.h | 2 + usr/src/compat/freebsd/sys/kernel.h | 19 +- usr/src/compat/freebsd/sys/limits.h | 5 + usr/src/compat/freebsd/sys/lock.h | 23 + usr/src/compat/freebsd/sys/malloc.h | 5 + usr/src/compat/freebsd/sys/mutex.h | 5 +- usr/src/compat/freebsd/sys/param.h | 9 + usr/src/compat/freebsd/sys/sdt.h | 37 + usr/src/compat/freebsd/sys/sglist.h | 29 + usr/src/compat/freebsd/sys/smp.h | 6 +- usr/src/compat/freebsd/sys/socket.h | 23 + usr/src/compat/freebsd/sys/systm.h | 9 - usr/src/compat/freebsd/sys/time.h | 28 +- usr/src/compat/freebsd/sys/types.h | 15 +- usr/src/compat/freebsd/unistd.h | 23 + usr/src/compat/freebsd/vm/pmap.h | 21 - usr/src/compat/freebsd/vm/vm.h | 31 +- usr/src/compat/freebsd/vm/vm_param.h | 21 + usr/src/compat/freebsd/x86/_types.h | 2 + usr/src/compat/freebsd/x86/segments.h | 15 +- usr/src/head/bhyve.h | 25 - usr/src/lib/Makefile | 6 +- usr/src/lib/libvmmapi/Makefile | 4 +- usr/src/lib/libvmmapi/Makefile.com | 16 +- usr/src/lib/libvmmapi/common/mapfile-vers | 140 +- 
usr/src/lib/libvmmapi/common/vmmapi.c | 911 +++++-- usr/src/lib/libvmmapi/common/vmmapi.h | 126 +- usr/src/pkg/manifests/system-bhyve-tests.mf | 35 + usr/src/pkg/manifests/system-bhyve.mf | 46 + usr/src/pkg/manifests/system-library-bhyve.mf | 31 + usr/src/req.flg | 2 + usr/src/tools/scripts/build_cscope.conf | 4 +- usr/src/tools/scripts/gensetdefs.pl | 31 - usr/src/uts/Makefile.targ | 3 +- usr/src/uts/common/Makefile.files | 6 +- usr/src/uts/i86pc/Makefile.files | 42 +- usr/src/uts/i86pc/Makefile.i86pc | 1 + usr/src/uts/i86pc/Makefile.rules | 29 + usr/src/uts/i86pc/io/viona/viona.c | 17 +- usr/src/uts/i86pc/io/vmm/README.sync | 18 + usr/src/uts/i86pc/io/vmm/amd/amdv.c | 137 +- usr/src/uts/i86pc/io/vmm/amd/amdvi_hw.c | 1461 +++++++++++ usr/src/uts/i86pc/io/vmm/amd/amdvi_priv.h | 431 ++++ usr/src/uts/i86pc/io/vmm/amd/ivrs_drv.c | 735 ++++++ usr/src/uts/i86pc/io/vmm/amd/npt.c | 87 + usr/src/uts/i86pc/io/vmm/amd/npt.h | 38 + usr/src/uts/i86pc/io/vmm/amd/offsets.in | 36 + usr/src/uts/i86pc/io/vmm/amd/svm.c | 2446 ++++++++++++++++++ usr/src/uts/i86pc/io/vmm/amd/svm.h | 74 + usr/src/uts/i86pc/io/vmm/amd/svm_msr.c | 199 ++ usr/src/uts/i86pc/io/vmm/amd/svm_msr.h | 46 + usr/src/uts/i86pc/io/vmm/amd/svm_softc.h | 131 + usr/src/uts/i86pc/io/vmm/amd/svm_support.s | 164 ++ usr/src/uts/i86pc/io/vmm/amd/vmcb.c | 454 ++++ usr/src/uts/i86pc/io/vmm/amd/vmcb.h | 336 +++ usr/src/uts/i86pc/io/vmm/intel/ept.c | 362 +-- usr/src/uts/i86pc/io/vmm/intel/ept.h | 18 +- usr/src/uts/i86pc/io/vmm/intel/offsets.in | 62 + usr/src/uts/i86pc/io/vmm/intel/vmcs.c | 127 +- usr/src/uts/i86pc/io/vmm/intel/vmcs.h | 102 +- usr/src/uts/i86pc/io/vmm/intel/vmx.c | 2407 +++++++++++++---- usr/src/uts/i86pc/io/vmm/intel/vmx.h | 92 +- usr/src/uts/i86pc/io/vmm/intel/vmx_controls.h | 4 +- usr/src/uts/i86pc/io/vmm/intel/vmx_cpufunc.h | 39 +- usr/src/uts/i86pc/io/vmm/intel/vmx_msr.c | 101 +- usr/src/uts/i86pc/io/vmm/intel/vmx_msr.h | 4 +- usr/src/uts/i86pc/io/vmm/intel/vmx_support.s | 445 ++-- usr/src/uts/i86pc/io/vmm/intel/vtd.c | 690 +++++ usr/src/uts/i86pc/io/vmm/io/iommu.h | 76 + usr/src/uts/i86pc/io/vmm/io/ppt.h | 56 + usr/src/uts/i86pc/io/vmm/io/sol_iommu.c | 86 + usr/src/uts/i86pc/io/vmm/io/sol_ppt.c | 92 + usr/src/uts/i86pc/io/vmm/io/vatpic.c | 5 +- usr/src/uts/i86pc/io/vmm/io/vatpic.h | 2 +- usr/src/uts/i86pc/io/vmm/io/vatpit.c | 87 +- usr/src/uts/i86pc/io/vmm/io/vatpit.h | 8 +- usr/src/uts/i86pc/io/vmm/io/vhpet.c | 104 +- usr/src/uts/i86pc/io/vmm/io/vhpet.h | 12 +- usr/src/uts/i86pc/io/vmm/io/vioapic.c | 176 +- usr/src/uts/i86pc/io/vmm/io/vioapic.h | 8 +- usr/src/uts/i86pc/io/vmm/io/vlapic.c | 236 +- usr/src/uts/i86pc/io/vmm/io/vlapic.h | 27 +- usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h | 23 +- usr/src/uts/i86pc/io/vmm/io/vpmtmr.c | 105 + usr/src/uts/i86pc/io/vmm/io/vpmtmr.h | 44 + usr/src/uts/i86pc/io/vmm/io/vrtc.c | 1061 ++++++++ usr/src/uts/i86pc/io/vmm/io/vrtc.h | 60 + usr/src/uts/i86pc/io/vmm/offsets.in | 72 - usr/src/uts/i86pc/io/vmm/vm/pmap.h | 27 + usr/src/uts/i86pc/io/vmm/vm/vm_extern.h | 35 + usr/src/uts/i86pc/io/vmm/vm/vm_glue.h | 99 + usr/src/uts/i86pc/io/vmm/vm/vm_map.h | 63 + usr/src/uts/i86pc/io/vmm/vm/vm_object.h | 31 + usr/src/uts/i86pc/io/vmm/vm/vm_page.h | 28 + usr/src/uts/i86pc/io/vmm/vm/vm_pager.h | 23 + usr/src/uts/i86pc/io/vmm/vmm.c | 2235 ++++++++++++---- usr/src/uts/i86pc/io/vmm/vmm.mapfile | 62 + usr/src/uts/i86pc/io/vmm/vmm_host.c | 57 +- usr/src/uts/i86pc/io/vmm/vmm_host.h | 17 +- usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c | 552 +++- usr/src/uts/i86pc/io/vmm/vmm_ioport.c | 38 +- 
usr/src/uts/i86pc/io/vmm/vmm_ioport.h | 4 +- usr/src/uts/i86pc/io/vmm/vmm_ipi.h | 37 - usr/src/uts/i86pc/io/vmm/vmm_ktr.h | 4 +- usr/src/uts/i86pc/io/vmm/vmm_lapic.c | 17 +- usr/src/uts/i86pc/io/vmm/vmm_lapic.h | 4 +- usr/src/uts/i86pc/io/vmm/vmm_mem.c | 124 + usr/src/uts/i86pc/io/vmm/vmm_mem.h | 12 +- usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c | 2378 ++++++++++++----- usr/src/uts/i86pc/io/vmm/vmm_sol_ept.c | 268 ++ usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c | 907 +++---- usr/src/uts/i86pc/io/vmm/vmm_sol_mem.c | 111 - usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c | 297 +++ usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c | 1016 ++++++++ usr/src/uts/i86pc/io/vmm/vmm_stat.c | 172 ++ usr/src/uts/i86pc/io/vmm/vmm_stat.h | 69 +- usr/src/uts/i86pc/io/vmm/vmm_support.s | 54 + usr/src/uts/i86pc/io/vmm/vmm_util.c | 6 +- usr/src/uts/i86pc/io/vmm/vmm_util.h | 4 +- usr/src/uts/i86pc/io/vmm/vmm_zsd.c | 218 ++ usr/src/uts/i86pc/io/vmm/vmx_assym.s | 1 - usr/src/uts/i86pc/io/vmm/x86.c | 475 +++- usr/src/uts/i86pc/io/vmm/x86.h | 19 +- usr/src/uts/i86pc/os/gipt.c | 566 ++++ usr/src/uts/i86pc/sys/gipt.h | 92 + usr/src/uts/i86pc/sys/viona_io.h | 5 +- usr/src/uts/i86pc/sys/vmm.h | 258 +- usr/src/uts/i86pc/sys/vmm_dev.h | 181 +- usr/src/uts/i86pc/sys/vmm_drv.h | 50 + usr/src/uts/i86pc/sys/vmm_impl.h | 105 +- usr/src/uts/i86pc/sys/vmm_instruction_emul.h | 23 +- usr/src/uts/i86pc/viona/Makefile | 9 +- usr/src/uts/i86pc/vmm/Makefile | 74 +- usr/src/uts/req.flg | 3 + 304 files changed, 56348 insertions(+), 9173 deletions(-) create mode 100644 usr/contrib/freebsd/amd64/machine/pmap.h delete mode 100644 usr/contrib/freebsd/amd64/machine/specialreg.h create mode 100644 usr/contrib/freebsd/dev/io/iodev.h create mode 100644 usr/contrib/freebsd/dev/mii/mii.h create mode 100644 usr/contrib/freebsd/dev/nvme/nvme.h create mode 100644 usr/contrib/freebsd/dev/usb/controller/xhcireg.h create mode 100644 usr/contrib/freebsd/dev/usb/usb.h create mode 100644 usr/contrib/freebsd/dev/usb/usb_endian.h create mode 100644 usr/contrib/freebsd/dev/usb/usb_freebsd.h create mode 100644 usr/contrib/freebsd/dev/usb/usbdi.h create mode 100644 usr/contrib/freebsd/isa/rtc.h create mode 100644 usr/contrib/freebsd/lib/libutil/humanize_number.c delete mode 100644 usr/contrib/freebsd/sys/linker_set.h create mode 100644 usr/contrib/freebsd/sys/pciio.h create mode 100644 usr/contrib/freebsd/sys/queue.h create mode 100644 usr/contrib/freebsd/x86/segments.h delete mode 100644 usr/src/cmd/bhyve/Makefile.com create mode 100644 usr/src/cmd/bhyve/acpi.c delete mode 100644 usr/src/cmd/bhyve/amd64/Makefile create mode 100644 usr/src/cmd/bhyve/bootrom.c create mode 100644 usr/src/cmd/bhyve/bootrom.h create mode 100644 usr/src/cmd/bhyve/dbgport.c create mode 100644 usr/src/cmd/bhyve/fwctl.c create mode 100644 usr/src/cmd/bhyve/fwctl.h create mode 100644 usr/src/cmd/bhyve/gdb.c create mode 100644 usr/src/cmd/bhyve/gdb.h create mode 100644 usr/src/cmd/bhyve/iov.c create mode 100644 usr/src/cmd/bhyve/iov.h create mode 100644 usr/src/cmd/bhyve/mevent.c create mode 100644 usr/src/cmd/bhyve/mevent.h create mode 100644 usr/src/cmd/bhyve/mevent_test.c create mode 100644 usr/src/cmd/bhyve/pci_e82545.c create mode 100644 usr/src/cmd/bhyve/pci_fbuf.c create mode 100644 usr/src/cmd/bhyve/pci_nvme.c create mode 100644 usr/src/cmd/bhyve/pci_passthru.c create mode 100644 usr/src/cmd/bhyve/pci_uart.c create mode 100644 usr/src/cmd/bhyve/pci_virtio_console.c create mode 100644 usr/src/cmd/bhyve/pci_virtio_rnd.c create mode 100644 usr/src/cmd/bhyve/pci_virtio_scsi.c create mode 100644 
usr/src/cmd/bhyve/pci_xhci.c create mode 100644 usr/src/cmd/bhyve/pci_xhci.h delete mode 100644 usr/src/cmd/bhyve/pmtmr.c create mode 100644 usr/src/cmd/bhyve/sockstream.c create mode 100644 usr/src/cmd/bhyve/sockstream.h create mode 100644 usr/src/cmd/bhyve/task_switch.c create mode 100644 usr/src/cmd/bhyve/test/Makefile create mode 100644 usr/src/cmd/bhyve/test/Makefile.com create mode 100644 usr/src/cmd/bhyve/test/Makefile.subdirs create mode 100644 usr/src/cmd/bhyve/test/Makefile.targ create mode 100644 usr/src/cmd/bhyve/test/scripts/Makefile create mode 100644 usr/src/cmd/bhyve/test/scripts/bhyvetest.ksh create mode 100644 usr/src/cmd/bhyve/test/tst/Makefile create mode 100644 usr/src/cmd/bhyve/test/tst/mevent/Makefile create mode 100644 usr/src/cmd/bhyve/test/tst/mevent/lists.delete.c create mode 100644 usr/src/cmd/bhyve/test/tst/mevent/mevent.c create mode 100644 usr/src/cmd/bhyve/test/tst/mevent/read.disable.c create mode 100644 usr/src/cmd/bhyve/test/tst/mevent/read.pause.c create mode 100644 usr/src/cmd/bhyve/test/tst/mevent/read.requeue.c create mode 100644 usr/src/cmd/bhyve/test/tst/mevent/testlib.c create mode 100644 usr/src/cmd/bhyve/test/tst/mevent/testlib.h create mode 100644 usr/src/cmd/bhyve/usb_emul.c create mode 100644 usr/src/cmd/bhyve/usb_emul.h create mode 100644 usr/src/cmd/bhyve/usb_mouse.c delete mode 100644 usr/src/cmd/bhyveconsole/Makefile delete mode 100644 usr/src/cmd/bhyveconsole/bhyveconsole.c delete mode 100644 usr/src/cmd/bhyveconsole/i386/Makefile delete mode 100644 usr/src/cmd/bhyvectl/Makefile.com delete mode 100644 usr/src/cmd/bhyvectl/amd64/Makefile delete mode 100644 usr/src/cmd/bhyveload-uefi/Makefile delete mode 100644 usr/src/cmd/bhyveload-uefi/Makefile.com delete mode 100644 usr/src/cmd/bhyveload-uefi/amd64/Makefile delete mode 100644 usr/src/cmd/bhyveload-uefi/bhyveload-uefi.c delete mode 100644 usr/src/cmd/bhyveload-uefi/i386/Makefile delete mode 100644 usr/src/cmd/mdb/intel/amd64/vmm/Makefile delete mode 100644 usr/src/cmd/mdb/intel/amd64/vmm/amd64/Makefile delete mode 100644 usr/src/cmd/mdb/intel/amd64/vmm/vmm.c create mode 100644 usr/src/compat/freebsd/amd64/machine/iodev.h create mode 100644 usr/src/compat/freebsd/amd64/machine/reg.h create mode 100644 usr/src/compat/freebsd/amd64/machine/specialreg.h create mode 100644 usr/src/compat/freebsd/err.h create mode 100644 usr/src/compat/freebsd/sys/_cpuset.h create mode 100644 usr/src/compat/freebsd/sys/clock.h create mode 100644 usr/src/compat/freebsd/sys/eventhandler.h create mode 100644 usr/src/compat/freebsd/sys/lock.h create mode 100644 usr/src/compat/freebsd/sys/sdt.h create mode 100644 usr/src/compat/freebsd/sys/sglist.h create mode 100644 usr/src/compat/freebsd/sys/socket.h create mode 100644 usr/src/compat/freebsd/unistd.h delete mode 100644 usr/src/compat/freebsd/vm/pmap.h create mode 100644 usr/src/compat/freebsd/vm/vm_param.h delete mode 100644 usr/src/head/bhyve.h create mode 100644 usr/src/pkg/manifests/system-bhyve-tests.mf create mode 100644 usr/src/pkg/manifests/system-bhyve.mf create mode 100644 usr/src/pkg/manifests/system-library-bhyve.mf delete mode 100644 usr/src/tools/scripts/gensetdefs.pl create mode 100644 usr/src/uts/i86pc/io/vmm/README.sync create mode 100644 usr/src/uts/i86pc/io/vmm/amd/amdvi_hw.c create mode 100644 usr/src/uts/i86pc/io/vmm/amd/amdvi_priv.h create mode 100644 usr/src/uts/i86pc/io/vmm/amd/ivrs_drv.c create mode 100644 usr/src/uts/i86pc/io/vmm/amd/npt.c create mode 100644 usr/src/uts/i86pc/io/vmm/amd/npt.h create mode 100644 
usr/src/uts/i86pc/io/vmm/amd/offsets.in create mode 100644 usr/src/uts/i86pc/io/vmm/amd/svm.c create mode 100644 usr/src/uts/i86pc/io/vmm/amd/svm.h create mode 100644 usr/src/uts/i86pc/io/vmm/amd/svm_msr.c create mode 100644 usr/src/uts/i86pc/io/vmm/amd/svm_msr.h create mode 100644 usr/src/uts/i86pc/io/vmm/amd/svm_softc.h create mode 100644 usr/src/uts/i86pc/io/vmm/amd/svm_support.s create mode 100644 usr/src/uts/i86pc/io/vmm/amd/vmcb.c create mode 100644 usr/src/uts/i86pc/io/vmm/amd/vmcb.h create mode 100644 usr/src/uts/i86pc/io/vmm/intel/offsets.in create mode 100644 usr/src/uts/i86pc/io/vmm/intel/vtd.c create mode 100644 usr/src/uts/i86pc/io/vmm/io/iommu.h create mode 100644 usr/src/uts/i86pc/io/vmm/io/ppt.h create mode 100644 usr/src/uts/i86pc/io/vmm/io/sol_iommu.c create mode 100644 usr/src/uts/i86pc/io/vmm/io/sol_ppt.c create mode 100644 usr/src/uts/i86pc/io/vmm/io/vpmtmr.c create mode 100644 usr/src/uts/i86pc/io/vmm/io/vpmtmr.h create mode 100644 usr/src/uts/i86pc/io/vmm/io/vrtc.c create mode 100644 usr/src/uts/i86pc/io/vmm/io/vrtc.h delete mode 100644 usr/src/uts/i86pc/io/vmm/offsets.in create mode 100644 usr/src/uts/i86pc/io/vmm/vm/pmap.h create mode 100644 usr/src/uts/i86pc/io/vmm/vm/vm_extern.h create mode 100644 usr/src/uts/i86pc/io/vmm/vm/vm_glue.h create mode 100644 usr/src/uts/i86pc/io/vmm/vm/vm_map.h create mode 100644 usr/src/uts/i86pc/io/vmm/vm/vm_object.h create mode 100644 usr/src/uts/i86pc/io/vmm/vm/vm_page.h create mode 100644 usr/src/uts/i86pc/io/vmm/vm/vm_pager.h create mode 100644 usr/src/uts/i86pc/io/vmm/vmm.mapfile delete mode 100644 usr/src/uts/i86pc/io/vmm/vmm_ipi.h create mode 100644 usr/src/uts/i86pc/io/vmm/vmm_mem.c create mode 100644 usr/src/uts/i86pc/io/vmm/vmm_sol_ept.c delete mode 100644 usr/src/uts/i86pc/io/vmm/vmm_sol_mem.c create mode 100644 usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c create mode 100644 usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c create mode 100644 usr/src/uts/i86pc/io/vmm/vmm_stat.c create mode 100644 usr/src/uts/i86pc/io/vmm/vmm_support.s create mode 100644 usr/src/uts/i86pc/io/vmm/vmm_zsd.c delete mode 100644 usr/src/uts/i86pc/io/vmm/vmx_assym.s create mode 100644 usr/src/uts/i86pc/os/gipt.c create mode 100644 usr/src/uts/i86pc/sys/gipt.h create mode 100644 usr/src/uts/i86pc/sys/vmm_drv.h (limited to 'usr/src/uts/i86pc') diff --git a/exception_lists/check_rtime b/exception_lists/check_rtime index 01bb189dca..42964957d4 100644 --- a/exception_lists/check_rtime +++ b/exception_lists/check_rtime @@ -24,6 +24,7 @@ # Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. # Copyright 2018 OmniOS Community Edition (OmniOSce) Association. # Copyright 2019 Peter Tribble. +# Copyright 2018 Joyent, Inc. 
# Copyright 2020 Oxide Computer Company # @@ -83,6 +84,7 @@ EXEC_STACK ^opt/os-tests/tests/secflags/stacky$ # Objects for which we allow relocations to the text segment TEXTREL ^platform/.*/MACH(kernel)/unix$ +TEXTREL ^usr/sbin/amd64/bhyve$ # Directories and files that are allowed to have no direct bound symbols NODIRECT ^platform/.*/MACH(kernel)/unix$ diff --git a/exception_lists/copyright b/exception_lists/copyright index 647bc46b60..c62835e304 100644 --- a/exception_lists/copyright +++ b/exception_lists/copyright @@ -466,3 +466,87 @@ usr/src/uts/common/sys/THIRDPARTYLICENSE.firmload usr/src/uts/common/sys/THIRDPARTYLICENSE.firmload.descrip usr/src/uts/common/sys/scsi/adapters/mpt_sas/mpi/* usr/src/uts/sparc/nsmb/ioc_check.ref + +# bhyve sources +usr/src/cmd/bhyve/acpi.[ch] +usr/src/cmd/bhyve/ahci.h +usr/src/cmd/bhyve/atkbdc.[ch] +usr/src/cmd/bhyve/bhyvegc.[ch] +usr/src/cmd/bhyve/bhyverun.[ch] +usr/src/cmd/bhyve/block_if.[ch] +usr/src/cmd/bhyve/bootrom.[ch] +usr/src/cmd/bhyve/console.[ch] +usr/src/cmd/bhyve/consport.c +usr/src/cmd/bhyve/dbgport.[ch] +usr/src/cmd/bhyve/fwctl.[ch] +usr/src/cmd/bhyve/gdb.[ch] +usr/src/cmd/bhyve/inout.[ch] +usr/src/cmd/bhyve/ioapic.[ch] +usr/src/cmd/bhyve/mem.[ch] +usr/src/cmd/bhyve/mevent.[ch] +usr/src/cmd/bhyve/mevent_test.c +usr/src/cmd/bhyve/mptbl.[ch] +usr/src/cmd/bhyve/pci_ahci.c +usr/src/cmd/bhyve/pci_e82545.c +usr/src/cmd/bhyve/pci_emul.[ch] +usr/src/cmd/bhyve/pci_fbuf.c +usr/src/cmd/bhyve/pci_hostbridge.c +usr/src/cmd/bhyve/pci_irq.[ch] +usr/src/cmd/bhyve/pci_lpc.[ch] +usr/src/cmd/bhyve/pci_nvme.c +usr/src/cmd/bhyve/pci_passthru.c +usr/src/cmd/bhyve/pci_uart.c +usr/src/cmd/bhyve/pci_virtio_block.c +usr/src/cmd/bhyve/pci_virtio_net.c +usr/src/cmd/bhyve/pci_virtio_rnd.c +usr/src/cmd/bhyve/pci_virtio_scsi.c +usr/src/cmd/bhyve/pci_xhci.[ch] +usr/src/cmd/bhyve/pm.c +usr/src/cmd/bhyve/pmtmr.c +usr/src/cmd/bhyve/post.c +usr/src/cmd/bhyve/ps2kbd.[ch] +usr/src/cmd/bhyve/ps2mouse.[ch] +usr/src/cmd/bhyve/rfb.[ch] +usr/src/cmd/bhyve/rtc.[ch] +usr/src/cmd/bhyve/smbiostbl.[ch] +usr/src/cmd/bhyve/sockstream.[ch] +usr/src/cmd/bhyve/spinup_ap.[ch] +usr/src/cmd/bhyve/task_switch.c +usr/src/cmd/bhyve/uart_emul.[ch] +usr/src/cmd/bhyve/usb_emul.[ch] +usr/src/cmd/bhyve/usb_mouse.c +usr/src/cmd/bhyve/vga.[ch] +usr/src/cmd/bhyve/virtio.[ch] +usr/src/cmd/bhyve/xmsr.[ch] +usr/src/cmd/bhyvectl/bhyvectl.c +usr/src/compat/freebsd/*.h +usr/src/compat/freebsd/*/*.h +usr/src/compat/freebsd/amd64/machine/*.h +usr/contrib/freebsd/*/*.h +usr/contrib/freebsd/*/*/*.h +usr/contrib/freebsd/lib/libutil/*.c +usr/src/lib/libvmmapi/common/vmmapi.[ch] +usr/src/tools/scripts/gensetdefs.pl +usr/src/uts/i86pc/io/vmm/amd/*.[chs] +usr/src/uts/i86pc/io/vmm/intel/*.[chs] +usr/src/uts/i86pc/io/vmm/intel/offsets.in +usr/src/uts/i86pc/io/vmm/io/*.[ch] +usr/src/uts/i86pc/io/vmm/README.sync +usr/src/uts/i86pc/io/vmm/vmm.c +usr/src/uts/i86pc/io/vmm/vmm_host.[ch] +usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c +usr/src/uts/i86pc/io/vmm/vmm_ioport.[ch] +usr/src/uts/i86pc/io/vmm/vmm_ipi.h +usr/src/uts/i86pc/io/vmm/vmm_ktr.h +usr/src/uts/i86pc/io/vmm/vmm_lapic.[ch] +usr/src/uts/i86pc/io/vmm/vmm_mem.[ch] +usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c +usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c +usr/src/uts/i86pc/io/vmm/vmm_sol_mem.c +usr/src/uts/i86pc/io/vmm/vmm_stat.[ch] +usr/src/uts/i86pc/io/vmm/vmm_util.[ch] +usr/src/uts/i86pc/io/vmm/vmx_assym.s +usr/src/uts/i86pc/io/vmm/x86.[ch] +usr/src/uts/i86pc/sys/vmm.h +usr/src/uts/i86pc/sys/vmm_dev.h +usr/src/uts/i86pc/sys/vmm_instruction_emul.h diff --git 
a/exception_lists/cstyle b/exception_lists/cstyle index d320dcfacc..73edc10e88 100644 --- a/exception_lists/cstyle +++ b/exception_lists/cstyle @@ -1326,3 +1326,86 @@ usr/src/uts/intel/sys/acpi/platform/acos2.h usr/src/uts/intel/sys/acpi/platform/acsolaris.h usr/src/uts/intel/sys/acpi/platform/acwin.h usr/src/uts/intel/sys/acpi/platform/acwin64.h + +# bhyve sources +syntax: glob +usr/src/cmd/bhyve/acpi.[ch] +usr/src/cmd/bhyve/ahci.h +usr/src/cmd/bhyve/atkbdc.[ch] +usr/src/cmd/bhyve/bhyvegc.[ch] +usr/src/cmd/bhyve/bhyverun.[ch] +usr/src/cmd/bhyve/block_if.[ch] +usr/src/cmd/bhyve/bootrom.[ch] +usr/src/cmd/bhyve/console.[ch] +usr/src/cmd/bhyve/consport.c +usr/src/cmd/bhyve/dbgport.[ch] +usr/src/cmd/bhyve/fwctl.[ch] +usr/src/cmd/bhyve/gdb.[ch] +usr/src/cmd/bhyve/inout.[ch] +usr/src/cmd/bhyve/ioapic.[ch] +usr/src/cmd/bhyve/iov.[ch] +usr/src/cmd/bhyve/mem.[ch] +usr/src/cmd/bhyve/mevent.[ch] +usr/src/cmd/bhyve/mevent_test.c +usr/src/cmd/bhyve/mptbl.[ch] +usr/src/cmd/bhyve/pci_ahci.c +usr/src/cmd/bhyve/pci_e82545.c +usr/src/cmd/bhyve/pci_emul.[ch] +usr/src/cmd/bhyve/pci_fbuf.c +usr/src/cmd/bhyve/pci_hostbridge.c +usr/src/cmd/bhyve/pci_irq.[ch] +usr/src/cmd/bhyve/pci_lpc.[ch] +usr/src/cmd/bhyve/pci_nvme.c +usr/src/cmd/bhyve/pci_passthru.c +usr/src/cmd/bhyve/pci_uart.c +usr/src/cmd/bhyve/pci_virtio_block.c +usr/src/cmd/bhyve/pci_virtio_console.c +usr/src/cmd/bhyve/pci_virtio_net.c +usr/src/cmd/bhyve/pci_virtio_rnd.c +usr/src/cmd/bhyve/pci_virtio_scsi.c +usr/src/cmd/bhyve/pci_xhci.[ch] +usr/src/cmd/bhyve/pm.c +usr/src/cmd/bhyve/pmtmr.c +usr/src/cmd/bhyve/post.c +usr/src/cmd/bhyve/ps2kbd.[ch] +usr/src/cmd/bhyve/ps2mouse.[ch] +usr/src/cmd/bhyve/rfb.[ch] +usr/src/cmd/bhyve/rtc.[ch] +usr/src/cmd/bhyve/smbiostbl.[ch] +usr/src/cmd/bhyve/sockstream.[ch] +usr/src/cmd/bhyve/spinup_ap.[ch] +usr/src/cmd/bhyve/task_switch.c +usr/src/cmd/bhyve/uart_emul.[ch] +usr/src/cmd/bhyve/usb_emul.[ch] +usr/src/cmd/bhyve/usb_mouse.c +usr/src/cmd/bhyve/vga.[ch] +usr/src/cmd/bhyve/virtio.[ch] +usr/src/cmd/bhyve/xmsr.[ch] +usr/src/cmd/bhyveconsole/bhyveconsole.c +usr/src/cmd/bhyvectl/bhyvectl.c +usr/src/compat/freebsd/*.h +usr/src/compat/freebsd/*/*.h +usr/src/compat/freebsd/amd64/machine/*.h +usr/contrib/freebsd/*/*.h +usr/contrib/freebsd/*/*/*.h +usr/contrib/freebsd/lib/libutil/*.c +usr/src/lib/libvmmapi/common/vmmapi.[ch] +usr/src/uts/i86pc/io/vmm/amd/*.[ch] +usr/src/uts/i86pc/io/vmm/intel/*.[chs] +usr/src/uts/i86pc/io/vmm/io/*.[ch] +usr/src/uts/i86pc/io/vmm/vmm.c +usr/src/uts/i86pc/io/vmm/vmm_host.[ch] +usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c +usr/src/uts/i86pc/io/vmm/vmm_ioport.[ch] +usr/src/uts/i86pc/io/vmm/vmm_ipi.h +usr/src/uts/i86pc/io/vmm/vmm_ktr.h +usr/src/uts/i86pc/io/vmm/vmm_lapic.[ch] +usr/src/uts/i86pc/io/vmm/vmm_mem.[ch] +usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c +usr/src/uts/i86pc/io/vmm/vmm_stat.[ch] +usr/src/uts/i86pc/io/vmm/vmm_util.[ch] +usr/src/uts/i86pc/io/vmm/vmx_assym.s +usr/src/uts/i86pc/io/vmm/x86.[ch] +usr/src/uts/i86pc/sys/vmm.h +usr/src/uts/i86pc/sys/vmm_dev.h +usr/src/uts/i86pc/sys/vmm_instruction_emul.h diff --git a/exception_lists/hdrchk b/exception_lists/hdrchk index c8edb3e5ae..7fa467f735 100644 --- a/exception_lists/hdrchk +++ b/exception_lists/hdrchk @@ -374,3 +374,53 @@ usr/src/uts/intel/sys/acpi/acresrc.h usr/src/uts/intel/sys/acpi/acstruct.h usr/src/uts/intel/sys/acpi/amlresrc.h usr/src/uts/intel/sys/acpi/platform/acwin64.h + +# bhyve sources +syntax: glob +usr/src/cmd/bhyve/acpi.h +usr/src/cmd/bhyve/ahci.h +usr/src/cmd/bhyve/atkbdc.h +usr/src/cmd/bhyve/bhyvegc.h 
+usr/src/cmd/bhyve/bhyverun.h +usr/src/cmd/bhyve/block_if.h +usr/src/cmd/bhyve/bootrom.h +usr/src/cmd/bhyve/console.h +usr/src/cmd/bhyve/dbgport.h +usr/src/cmd/bhyve/inout.h +usr/src/cmd/bhyve/ioapic.h +usr/src/cmd/bhyve/mem.h +usr/src/cmd/bhyve/mptbl.h +usr/src/cmd/bhyve/pci_emul.h +usr/src/cmd/bhyve/pci_irq.h +usr/src/cmd/bhyve/pci_lpc.h +usr/src/cmd/bhyve/ps2kbd.h +usr/src/cmd/bhyve/ps2mouse.h +usr/src/cmd/bhyve/rfb.h +usr/src/cmd/bhyve/rtc.h +usr/src/cmd/bhyve/smbiostbl.h +usr/src/cmd/bhyve/sockstream.h +usr/src/cmd/bhyve/spinup_ap.h +usr/src/cmd/bhyve/uart_emul.h +usr/src/cmd/bhyve/vga.h +usr/src/cmd/bhyve/virtio.h +usr/src/cmd/bhyve/xmsr.h +usr/src/compat/freebsd/*.h +usr/src/compat/freebsd/*/*.h +usr/src/compat/freebsd/amd64/machine/*.h +usr/contrib/freebsd/*/*.h +usr/contrib/freebsd/*/*/*.h +usr/src/lib/libvmmapi/common/vmmapi.h +usr/src/uts/i86pc/io/vmm/intel/*.h +usr/src/uts/i86pc/io/vmm/io/*.h +usr/src/uts/i86pc/io/vmm/vmm_host.h +usr/src/uts/i86pc/io/vmm/vmm_ioport.h +usr/src/uts/i86pc/io/vmm/vmm_ipi.h +usr/src/uts/i86pc/io/vmm/vmm_ktr.h +usr/src/uts/i86pc/io/vmm/vmm_lapic.h +usr/src/uts/i86pc/io/vmm/vmm_mem.h +usr/src/uts/i86pc/io/vmm/vmm_stat.h +usr/src/uts/i86pc/io/vmm/vmm_util.h +usr/src/uts/i86pc/io/vmm/x86.h +usr/src/uts/i86pc/sys/vmm.h +usr/src/uts/i86pc/sys/vmm_dev.h +usr/src/uts/i86pc/sys/vmm_instruction_emul.h diff --git a/exception_lists/packaging b/exception_lists/packaging index 41ca551cc2..cd1e8ed230 100644 --- a/exception_lists/packaging +++ b/exception_lists/packaging @@ -816,6 +816,12 @@ usr/lib/amd64/libsff.so i386 usr/lib/sparcv9/libsff.so sparc usr/lib/libsff.so +# +# private bhyve files +# +lib/amd64/libvmmapi.so i386 +usr/include/vmmapi.h i386 + # # libcustr is private # diff --git a/exception_lists/wscheck b/exception_lists/wscheck index 489668a350..ac16cc54b2 100644 --- a/exception_lists/wscheck +++ b/exception_lists/wscheck @@ -26,3 +26,84 @@ usr/src/uts/common/io/e1000api/* usr/src/uts/common/io/qede/* usr/src/uts/common/io/i40e/core/* usr/src/uts/common/io/ixgbe/core/* + +# bhyve sources +usr/src/cmd/bhyve/acpi.[ch] +usr/src/cmd/bhyve/ahci.h +usr/src/cmd/bhyve/atkbdc.[ch] +usr/src/cmd/bhyve/bhyvegc.[ch] +usr/src/cmd/bhyve/bhyverun.[ch] +usr/src/cmd/bhyve/block_if.[ch] +usr/src/cmd/bhyve/bootrom.[ch] +usr/src/cmd/bhyve/console.[ch] +usr/src/cmd/bhyve/consport.c +usr/src/cmd/bhyve/dbgport.[ch] +usr/src/cmd/bhyve/fwctl.[ch] +usr/src/cmd/bhyve/gdb.[ch] +usr/src/cmd/bhyve/inout.[ch] +usr/src/cmd/bhyve/ioapic.[ch] +usr/src/cmd/bhyve/mem.[ch] +usr/src/cmd/bhyve/mevent.[ch] +usr/src/cmd/bhyve/mevent_test.c +usr/src/cmd/bhyve/mptbl.[ch] +usr/src/cmd/bhyve/pci_ahci.c +usr/src/cmd/bhyve/pci_e82545.c +usr/src/cmd/bhyve/pci_emul.[ch] +usr/src/cmd/bhyve/pci_fbuf.c +usr/src/cmd/bhyve/pci_hostbridge.c +usr/src/cmd/bhyve/pci_irq.[ch] +usr/src/cmd/bhyve/pci_lpc.[ch] +usr/src/cmd/bhyve/pci_nvme.c +usr/src/cmd/bhyve/pci_passthru.c +usr/src/cmd/bhyve/pci_uart.c +usr/src/cmd/bhyve/pci_virtio_block.c +usr/src/cmd/bhyve/pci_virtio_console.c +usr/src/cmd/bhyve/pci_virtio_net.c +usr/src/cmd/bhyve/pci_virtio_rnd.c +usr/src/cmd/bhyve/pci_virtio_scsi.c +usr/src/cmd/bhyve/pci_xhci.[ch] +usr/src/cmd/bhyve/pm.c +usr/src/cmd/bhyve/pmtmr.c +usr/src/cmd/bhyve/post.c +usr/src/cmd/bhyve/ps2kbd.[ch] +usr/src/cmd/bhyve/ps2mouse.[ch] +usr/src/cmd/bhyve/rfb.[ch] +usr/src/cmd/bhyve/rtc.[ch] +usr/src/cmd/bhyve/smbiostbl.[ch] +usr/src/cmd/bhyve/sockstream.[ch] +usr/src/cmd/bhyve/spinup_ap.[ch] +usr/src/cmd/bhyve/task_switch.c +usr/src/cmd/bhyve/uart_emul.[ch] +usr/src/cmd/bhyve/usb_emul.[ch] 
+usr/src/cmd/bhyve/usb_mouse.c +usr/src/cmd/bhyve/vga.[ch] +usr/src/cmd/bhyve/virtio.[ch] +usr/src/cmd/bhyve/xmsr.[ch] +usr/src/cmd/bhyveconsole/bhyveconsole.c +usr/src/cmd/bhyvectl/bhyvectl.c +usr/src/compat/freebsd/*.h +usr/src/compat/freebsd/*/*.h +usr/src/compat/freebsd/amd64/machine/*.h +usr/contrib/freebsd/*/*.h +usr/contrib/freebsd/*/*/*.h +usr/contrib/freebsd/lib/libutil/*.c +usr/src/lib/libvmmapi/common/vmmapi.[ch] +usr/src/uts/i86pc/io/vmm/amd/*.[ch] +usr/src/uts/i86pc/io/vmm/intel/*.[chs] +usr/src/uts/i86pc/io/vmm/io/*.[ch] +usr/src/uts/i86pc/io/vmm/vmm.c +usr/src/uts/i86pc/io/vmm/vmm_host.[ch] +usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c +usr/src/uts/i86pc/io/vmm/vmm_ioport.[ch] +usr/src/uts/i86pc/io/vmm/vmm_ipi.h +usr/src/uts/i86pc/io/vmm/vmm_ktr.h +usr/src/uts/i86pc/io/vmm/vmm_lapic.[ch] +usr/src/uts/i86pc/io/vmm/vmm_mem.[ch] +usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c +usr/src/uts/i86pc/io/vmm/vmm_stat.[ch] +usr/src/uts/i86pc/io/vmm/vmm_util.[ch] +usr/src/uts/i86pc/io/vmm/vmx_assym.s +usr/src/uts/i86pc/io/vmm/x86.[ch] +usr/src/uts/i86pc/sys/vmm.h +usr/src/uts/i86pc/sys/vmm_dev.h +usr/src/uts/i86pc/sys/vmm_instruction_emul.h diff --git a/usr/contrib/freebsd/amd64/machine/pmap.h b/usr/contrib/freebsd/amd64/machine/pmap.h new file mode 100644 index 0000000000..a0b8ee37f2 --- /dev/null +++ b/usr/contrib/freebsd/amd64/machine/pmap.h @@ -0,0 +1,455 @@ +/*- + * Copyright (c) 2003 Peter Wemm. + * Copyright (c) 1991 Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department and William Jolitz of UUNET Technologies Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Derived from hp300 version by Mike Hibler, this version by William + * Jolitz uses a recursive map [a pde points to the page directory] to + * map the page tables using the pagetables themselves. 
This is done to + * reduce the impact on kernel virtual memory for lots of sparse address + * space, and to reduce the cost of memory to each process. + * + * from: hp300: @(#)pmap.h 7.2 (Berkeley) 12/16/90 + * from: @(#)pmap.h 7.4 (Berkeley) 5/12/91 + * $FreeBSD$ + */ + +#ifndef _MACHINE_PMAP_H_ +#define _MACHINE_PMAP_H_ + +/* + * Page-directory and page-table entries follow this format, with a few + * of the fields not present here and there, depending on a lot of things. + */ + /* ---- Intel Nomenclature ---- */ +#define X86_PG_V 0x001 /* P Valid */ +#define X86_PG_RW 0x002 /* R/W Read/Write */ +#define X86_PG_U 0x004 /* U/S User/Supervisor */ +#define X86_PG_NC_PWT 0x008 /* PWT Write through */ +#define X86_PG_NC_PCD 0x010 /* PCD Cache disable */ +#define X86_PG_A 0x020 /* A Accessed */ +#define X86_PG_M 0x040 /* D Dirty */ +#define X86_PG_PS 0x080 /* PS Page size (0=4k,1=2M) */ +#define X86_PG_PTE_PAT 0x080 /* PAT PAT index */ +#define X86_PG_G 0x100 /* G Global */ +#define X86_PG_AVAIL1 0x200 /* / Available for system */ +#define X86_PG_AVAIL2 0x400 /* < programmers use */ +#define X86_PG_AVAIL3 0x800 /* \ */ +#define X86_PG_PDE_PAT 0x1000 /* PAT PAT index */ +#define X86_PG_NX (1ul<<63) /* No-execute */ +#define X86_PG_AVAIL(x) (1ul << (x)) + +/* Page level cache control fields used to determine the PAT type */ +#define X86_PG_PDE_CACHE (X86_PG_PDE_PAT | X86_PG_NC_PWT | X86_PG_NC_PCD) +#define X86_PG_PTE_CACHE (X86_PG_PTE_PAT | X86_PG_NC_PWT | X86_PG_NC_PCD) + +/* + * Intel extended page table (EPT) bit definitions. + */ +#define EPT_PG_READ 0x001 /* R Read */ +#define EPT_PG_WRITE 0x002 /* W Write */ +#define EPT_PG_EXECUTE 0x004 /* X Execute */ +#define EPT_PG_IGNORE_PAT 0x040 /* IPAT Ignore PAT */ +#define EPT_PG_PS 0x080 /* PS Page size */ +#define EPT_PG_A 0x100 /* A Accessed */ +#define EPT_PG_M 0x200 /* D Dirty */ +#define EPT_PG_MEMORY_TYPE(x) ((x) << 3) /* MT Memory Type */ + +/* + * Define the PG_xx macros in terms of the bits on x86 PTEs. + */ +#define PG_V X86_PG_V +#define PG_RW X86_PG_RW +#define PG_U X86_PG_U +#define PG_NC_PWT X86_PG_NC_PWT +#define PG_NC_PCD X86_PG_NC_PCD +#define PG_A X86_PG_A +#define PG_M X86_PG_M +#define PG_PS X86_PG_PS +#define PG_PTE_PAT X86_PG_PTE_PAT +#define PG_G X86_PG_G +#define PG_AVAIL1 X86_PG_AVAIL1 +#define PG_AVAIL2 X86_PG_AVAIL2 +#define PG_AVAIL3 X86_PG_AVAIL3 +#define PG_PDE_PAT X86_PG_PDE_PAT +#define PG_NX X86_PG_NX +#define PG_PDE_CACHE X86_PG_PDE_CACHE +#define PG_PTE_CACHE X86_PG_PTE_CACHE + +/* Our various interpretations of the above */ +#define PG_W X86_PG_AVAIL3 /* "Wired" pseudoflag */ +#define PG_MANAGED X86_PG_AVAIL2 +#define EPT_PG_EMUL_V X86_PG_AVAIL(52) +#define EPT_PG_EMUL_RW X86_PG_AVAIL(53) +#define PG_PROMOTED X86_PG_AVAIL(54) /* PDE only */ +#define PG_FRAME (0x000ffffffffff000ul) +#define PG_PS_FRAME (0x000fffffffe00000ul) + +/* + * Promotion to a 2MB (PDE) page mapping requires that the corresponding 4KB + * (PTE) page mappings have identical settings for the following fields: + */ +#define PG_PTE_PROMOTE (PG_NX | PG_MANAGED | PG_W | PG_G | PG_PTE_CACHE | \ + PG_M | PG_A | PG_U | PG_RW | PG_V) + +/* + * Page Protection Exception bits + */ + +#define PGEX_P 0x01 /* Protection violation vs. 
not present */ +#define PGEX_W 0x02 /* during a Write cycle */ +#define PGEX_U 0x04 /* access from User mode (UPL) */ +#define PGEX_RSV 0x08 /* reserved PTE field is non-zero */ +#define PGEX_I 0x10 /* during an instruction fetch */ + +/* + * undef the PG_xx macros that define bits in the regular x86 PTEs that + * have a different position in nested PTEs. This is done when compiling + * code that needs to be aware of the differences between regular x86 and + * nested PTEs. + * + * The appropriate bitmask will be calculated at runtime based on the pmap + * type. + */ +#ifdef AMD64_NPT_AWARE +#undef PG_AVAIL1 /* X86_PG_AVAIL1 aliases with EPT_PG_M */ +#undef PG_G +#undef PG_A +#undef PG_M +#undef PG_PDE_PAT +#undef PG_PDE_CACHE +#undef PG_PTE_PAT +#undef PG_PTE_CACHE +#undef PG_RW +#undef PG_V +#endif + +/* + * Pte related macros. This is complicated by having to deal with + * the sign extension of the 48th bit. + */ +#define KVADDR(l4, l3, l2, l1) ( \ + ((unsigned long)-1 << 47) | \ + ((unsigned long)(l4) << PML4SHIFT) | \ + ((unsigned long)(l3) << PDPSHIFT) | \ + ((unsigned long)(l2) << PDRSHIFT) | \ + ((unsigned long)(l1) << PAGE_SHIFT)) + +#define UVADDR(l4, l3, l2, l1) ( \ + ((unsigned long)(l4) << PML4SHIFT) | \ + ((unsigned long)(l3) << PDPSHIFT) | \ + ((unsigned long)(l2) << PDRSHIFT) | \ + ((unsigned long)(l1) << PAGE_SHIFT)) + +/* + * Number of kernel PML4 slots. Can be anywhere from 1 to 64 or so, + * but setting it larger than NDMPML4E makes no sense. + * + * Each slot provides .5 TB of kernel virtual space. + */ +#define NKPML4E 4 + +#define NUPML4E (NPML4EPG/2) /* number of userland PML4 pages */ +#define NUPDPE (NUPML4E*NPDPEPG)/* number of userland PDP pages */ +#define NUPDE (NUPDPE*NPDEPG) /* number of userland PD entries */ + +/* + * NDMPML4E is the maximum number of PML4 entries that will be + * used to implement the direct map. It must be a power of two, + * and should generally exceed NKPML4E. The maximum possible + * value is 64; using 128 will make the direct map intrude into + * the recursive page table map. + */ +#define NDMPML4E 8 + +/* + * These values control the layout of virtual memory. The starting address + * of the direct map, which is controlled by DMPML4I, must be a multiple of + * its size. (See the PHYS_TO_DMAP() and DMAP_TO_PHYS() macros.) + * + * Note: KPML4I is the index of the (single) level 4 page that maps + * the KVA that holds KERNBASE, while KPML4BASE is the index of the + * first level 4 page that maps VM_MIN_KERNEL_ADDRESS. If NKPML4E + * is 1, these are the same, otherwise KPML4BASE < KPML4I and extra + * level 4 PDEs are needed to map from VM_MIN_KERNEL_ADDRESS up to + * KERNBASE. + * + * (KPML4I combines with KPDPI to choose where KERNBASE starts. + * Or, in other words, KPML4I provides bits 39..47 of KERNBASE, + * and KPDPI provides bits 30..38.) + */ +#define PML4PML4I (NPML4EPG/2) /* Index of recursive pml4 mapping */ + +#define KPML4BASE (NPML4EPG-NKPML4E) /* KVM at highest addresses */ +#define DMPML4I rounddown(KPML4BASE-NDMPML4E, NDMPML4E) /* Below KVM */ + +#define KPML4I (NPML4EPG-1) +#define KPDPI (NPDPEPG-2) /* kernbase at -2GB */ + +/* + * XXX doesn't really belong here I guess... 
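+ * (The constants below describe the legacy ISA hole: 0xa0000 is 640KB, so
+ * the hole covers the VGA/BIOS region from 640KB up to the 1MB boundary.)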
+ */ +#define ISA_HOLE_START 0xa0000 +#define ISA_HOLE_LENGTH (0x100000-ISA_HOLE_START) + +#define PMAP_PCID_NONE 0xffffffff +#define PMAP_PCID_KERN 0 +#define PMAP_PCID_OVERMAX 0x1000 + +#ifndef LOCORE + +#include +#include +#include +#include + +#include + +typedef u_int64_t pd_entry_t; +typedef u_int64_t pt_entry_t; +typedef u_int64_t pdp_entry_t; +typedef u_int64_t pml4_entry_t; + +/* + * Address of current address space page table maps and directories. + */ +#ifdef _KERNEL +#define addr_PTmap (KVADDR(PML4PML4I, 0, 0, 0)) +#define addr_PDmap (KVADDR(PML4PML4I, PML4PML4I, 0, 0)) +#define addr_PDPmap (KVADDR(PML4PML4I, PML4PML4I, PML4PML4I, 0)) +#define addr_PML4map (KVADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I)) +#define addr_PML4pml4e (addr_PML4map + (PML4PML4I * sizeof(pml4_entry_t))) +#define PTmap ((pt_entry_t *)(addr_PTmap)) +#define PDmap ((pd_entry_t *)(addr_PDmap)) +#define PDPmap ((pd_entry_t *)(addr_PDPmap)) +#define PML4map ((pd_entry_t *)(addr_PML4map)) +#define PML4pml4e ((pd_entry_t *)(addr_PML4pml4e)) + +extern int nkpt; /* Initial number of kernel page tables */ +extern u_int64_t KPDPphys; /* physical address of kernel level 3 */ +extern u_int64_t KPML4phys; /* physical address of kernel level 4 */ + +/* + * virtual address to page table entry and + * to physical address. + * Note: these work recursively, thus vtopte of a pte will give + * the corresponding pde that in turn maps it. + */ +pt_entry_t *vtopte(vm_offset_t); +#define vtophys(va) pmap_kextract(((vm_offset_t) (va))) + +#define pte_load_store(ptep, pte) atomic_swap_long(ptep, pte) +#define pte_load_clear(ptep) atomic_swap_long(ptep, 0) +#define pte_store(ptep, pte) do { \ + *(u_long *)(ptep) = (u_long)(pte); \ +} while (0) +#define pte_clear(ptep) pte_store(ptep, 0) + +#define pde_store(pdep, pde) pte_store(pdep, pde) + +extern pt_entry_t pg_nx; + +#endif /* _KERNEL */ + +/* + * Pmap stuff + */ +struct pv_entry; +struct pv_chunk; + +/* + * Locks + * (p) PV list lock + */ +struct md_page { + TAILQ_HEAD(, pv_entry) pv_list; /* (p) */ + int pv_gen; /* (p) */ + int pat_mode; +}; + +enum pmap_type { + PT_X86, /* regular x86 page tables */ + PT_EPT, /* Intel's nested page tables */ + PT_RVI, /* AMD's nested page tables */ +}; + +struct pmap_pcids { + uint32_t pm_pcid; + uint32_t pm_gen; +}; + +/* + * The kernel virtual address (KVA) of the level 4 page table page is always + * within the direct map (DMAP) region. 
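 * (pm_pml4 below holds that KVA; pm_cr3 appears to cache the matching
 * value to be loaded when the pmap is activated, and pm_type selects
 * between the regular x86, EPT, and RVI formats of enum pmap_type.)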
+ */ +struct pmap { + struct mtx pm_mtx; + pml4_entry_t *pm_pml4; /* KVA of level 4 page table */ + uint64_t pm_cr3; + TAILQ_HEAD(,pv_chunk) pm_pvchunk; /* list of mappings in pmap */ + cpuset_t pm_active; /* active on cpus */ + enum pmap_type pm_type; /* regular or nested tables */ + struct pmap_statistics pm_stats; /* pmap statistics */ + struct vm_radix pm_root; /* spare page table pages */ + long pm_eptgen; /* EPT pmap generation id */ + int pm_flags; + struct pmap_pcids pm_pcids[MAXCPU]; +}; + +/* flags */ +#define PMAP_NESTED_IPIMASK 0xff +#define PMAP_PDE_SUPERPAGE (1 << 8) /* supports 2MB superpages */ +#define PMAP_EMULATE_AD_BITS (1 << 9) /* needs A/D bits emulation */ +#define PMAP_SUPPORTS_EXEC_ONLY (1 << 10) /* execute only mappings ok */ + +typedef struct pmap *pmap_t; + +#ifdef _KERNEL +extern struct pmap kernel_pmap_store; +#define kernel_pmap (&kernel_pmap_store) + +#define PMAP_LOCK(pmap) mtx_lock(&(pmap)->pm_mtx) +#define PMAP_LOCK_ASSERT(pmap, type) \ + mtx_assert(&(pmap)->pm_mtx, (type)) +#define PMAP_LOCK_DESTROY(pmap) mtx_destroy(&(pmap)->pm_mtx) +#define PMAP_LOCK_INIT(pmap) mtx_init(&(pmap)->pm_mtx, "pmap", \ + NULL, MTX_DEF | MTX_DUPOK) +#define PMAP_LOCKED(pmap) mtx_owned(&(pmap)->pm_mtx) +#define PMAP_MTX(pmap) (&(pmap)->pm_mtx) +#define PMAP_TRYLOCK(pmap) mtx_trylock(&(pmap)->pm_mtx) +#define PMAP_UNLOCK(pmap) mtx_unlock(&(pmap)->pm_mtx) + +int pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags); +int pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype); +#endif + +/* + * For each vm_page_t, there is a list of all currently valid virtual + * mappings of that page. An entry is a pv_entry_t, the list is pv_list. + */ +typedef struct pv_entry { + vm_offset_t pv_va; /* virtual address for mapping */ + TAILQ_ENTRY(pv_entry) pv_next; +} *pv_entry_t; + +/* + * pv_entries are allocated in chunks per-process. This avoids the + * need to track per-pmap assignments. 
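 * (With _NPCPV = 168 entries per chunk and _NPCM = 3 64-bit pc_map words
 * below, 168 of the 192 bitmap bits are meaningful; the 24 surplus bits
 * never mark a free entry.)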
+ */ +#define _NPCM 3 +#define _NPCPV 168 +struct pv_chunk { + pmap_t pc_pmap; + TAILQ_ENTRY(pv_chunk) pc_list; + uint64_t pc_map[_NPCM]; /* bitmap; 1 = free */ + TAILQ_ENTRY(pv_chunk) pc_lru; + struct pv_entry pc_pventry[_NPCPV]; +}; + +#ifdef _KERNEL + +extern caddr_t CADDR1; +extern pt_entry_t *CMAP1; +extern vm_paddr_t phys_avail[]; +extern vm_paddr_t dump_avail[]; +extern vm_offset_t virtual_avail; +extern vm_offset_t virtual_end; +extern vm_paddr_t dmaplimit; +extern int pmap_pcid_enabled; +extern int invpcid_works; + +#define pmap_page_get_memattr(m) ((vm_memattr_t)(m)->md.pat_mode) +#define pmap_page_is_write_mapped(m) (((m)->aflags & PGA_WRITEABLE) != 0) +#define pmap_unmapbios(va, sz) pmap_unmapdev((va), (sz)) + +struct thread; + +void pmap_activate_sw(struct thread *); +void pmap_bootstrap(vm_paddr_t *); +int pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde); +int pmap_change_attr(vm_offset_t, vm_size_t, int); +void pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate); +void pmap_init_pat(void); +void pmap_kenter(vm_offset_t va, vm_paddr_t pa); +void *pmap_kenter_temporary(vm_paddr_t pa, int i); +vm_paddr_t pmap_kextract(vm_offset_t); +void pmap_kremove(vm_offset_t); +void *pmap_mapbios(vm_paddr_t, vm_size_t); +void *pmap_mapdev(vm_paddr_t, vm_size_t); +void *pmap_mapdev_attr(vm_paddr_t, vm_size_t, int); +boolean_t pmap_page_is_mapped(vm_page_t m); +void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma); +void pmap_pinit_pml4(vm_page_t); +void pmap_unmapdev(vm_offset_t, vm_size_t); +void pmap_invalidate_page(pmap_t, vm_offset_t); +void pmap_invalidate_range(pmap_t, vm_offset_t, vm_offset_t); +void pmap_invalidate_all(pmap_t); +void pmap_invalidate_cache(void); +void pmap_invalidate_cache_pages(vm_page_t *pages, int count); +void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, + boolean_t force); +void pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num); +boolean_t pmap_map_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t); +void pmap_unmap_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t); +#endif /* _KERNEL */ + +/* Return various clipped indexes for a given VA */ +static __inline vm_pindex_t +pmap_pte_index(vm_offset_t va) +{ + + return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1)); +} + +static __inline vm_pindex_t +pmap_pde_index(vm_offset_t va) +{ + + return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1)); +} + +static __inline vm_pindex_t +pmap_pdpe_index(vm_offset_t va) +{ + + return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1)); +} + +static __inline vm_pindex_t +pmap_pml4e_index(vm_offset_t va) +{ + + return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1)); +} + +#endif /* !LOCORE */ + +#endif /* !_MACHINE_PMAP_H_ */ diff --git a/usr/contrib/freebsd/amd64/machine/specialreg.h b/usr/contrib/freebsd/amd64/machine/specialreg.h deleted file mode 100644 index 41d4125cb9..0000000000 --- a/usr/contrib/freebsd/amd64/machine/specialreg.h +++ /dev/null @@ -1,6 +0,0 @@ -/*- - * This file is in the public domain. - */ -/* $FreeBSD: head/sys/amd64/include/specialreg.h 233207 2012-03-19 21:34:11Z tijl $ */ - -#include diff --git a/usr/contrib/freebsd/dev/io/iodev.h b/usr/contrib/freebsd/dev/io/iodev.h new file mode 100644 index 0000000000..d040fcccf4 --- /dev/null +++ b/usr/contrib/freebsd/dev/io/iodev.h @@ -0,0 +1,44 @@ +/*- + * Copyright (c) 2010 Marcel Moolenaar + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _DEV_IODEV_H_ +#define _DEV_IODEV_H_ + +#define IODEV_PIO_READ 0 +#define IODEV_PIO_WRITE 1 + +struct iodev_pio_req { + u_int access; + u_int port; + u_int width; + u_int val; +}; + +#define IODEV_PIO _IOWR('I', 0, struct iodev_pio_req) + +#endif /* _DEV_IODEV_H_ */ diff --git a/usr/contrib/freebsd/dev/mii/mii.h b/usr/contrib/freebsd/dev/mii/mii.h new file mode 100644 index 0000000000..fa1ec84eaa --- /dev/null +++ b/usr/contrib/freebsd/dev/mii/mii.h @@ -0,0 +1,239 @@ +/* $NetBSD: mii.h,v 1.18 2014/06/16 14:43:22 msaitoh Exp $ */ + +/*- + * Copyright (c) 1997 Manuel Bouyer. All rights reserved. + * + * Modification to match BSD/OS 3.0 MII interface by Jason R. Thorpe, + * Numerical Aerospace Simulation Facility, NASA Ames Research Center. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _DEV_MII_MII_H_ +#define _DEV_MII_MII_H_ + +/* + * Registers common to all PHYs. 
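+ * (Addresses 0x00 through 0x0f below are the IEEE-defined register set;
+ * 0x10 and above are left for vendor-specific use.)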
*/
+
+#define MII_NPHY 32 /* max # of PHYs per MII */
+
+/*
+ * MII commands, used if a device must drive the MII lines
+ * manually.
+ */
+#define MII_COMMAND_START 0x01
+#define MII_COMMAND_READ 0x02
+#define MII_COMMAND_WRITE 0x01
+#define MII_COMMAND_ACK 0x02
+
+#define MII_BMCR 0x00 /* Basic mode control register (rw) */
+#define BMCR_RESET 0x8000 /* reset */
+#define BMCR_LOOP 0x4000 /* loopback */
+#define BMCR_SPEED0 0x2000 /* speed selection (LSB) */
+#define BMCR_AUTOEN 0x1000 /* autonegotiation enable */
+#define BMCR_PDOWN 0x0800 /* power down */
+#define BMCR_ISO 0x0400 /* isolate */
+#define BMCR_STARTNEG 0x0200 /* restart autonegotiation */
+#define BMCR_FDX 0x0100 /* Set duplex mode */
+#define BMCR_CTEST 0x0080 /* collision test */
+#define BMCR_SPEED1 0x0040 /* speed selection (MSB) */
+
+#define BMCR_S10 0x0000 /* 10 Mb/s */
+#define BMCR_S100 BMCR_SPEED0 /* 100 Mb/s */
+#define BMCR_S1000 BMCR_SPEED1 /* 1000 Mb/s */
+
+#define BMCR_SPEED(x) ((x) & (BMCR_SPEED0|BMCR_SPEED1))
+
+#define MII_BMSR 0x01 /* Basic mode status register (ro) */
+#define BMSR_100T4 0x8000 /* 100 base T4 capable */
+#define BMSR_100TXFDX 0x4000 /* 100 base Tx full duplex capable */
+#define BMSR_100TXHDX 0x2000 /* 100 base Tx half duplex capable */
+#define BMSR_10TFDX 0x1000 /* 10 base T full duplex capable */
+#define BMSR_10THDX 0x0800 /* 10 base T half duplex capable */
+#define BMSR_100T2FDX 0x0400 /* 100 base T2 full duplex capable */
+#define BMSR_100T2HDX 0x0200 /* 100 base T2 half duplex capable */
+#define BMSR_EXTSTAT 0x0100 /* Extended status in register 15 */
+#define BMSR_MFPS 0x0040 /* MII Frame Preamble Suppression */
+#define BMSR_ACOMP 0x0020 /* Autonegotiation complete */
+#define BMSR_RFAULT 0x0010 /* Link partner fault */
+#define BMSR_ANEG 0x0008 /* Autonegotiation capable */
+#define BMSR_LINK 0x0004 /* Link status */
+#define BMSR_JABBER 0x0002 /* Jabber detected */
+#define BMSR_EXTCAP 0x0001 /* Extended capability */
+
+#define BMSR_DEFCAPMASK 0xffffffff
+
+/*
+ * Note that the EXTSTAT bit indicates that there is extended status
+ * info available in register 15, but 802.3 section 22.2.4.3 also
+ * states that all 1000 Mb/s capable PHYs will set this bit to 1.
+ */
+
+#define BMSR_MEDIAMASK (BMSR_100T4|BMSR_100TXFDX|BMSR_100TXHDX| \
+ BMSR_10TFDX|BMSR_10THDX|BMSR_100T2FDX|BMSR_100T2HDX)
+
+/*
+ * Convert BMSR media capabilities to ANAR bits for autonegotiation.
+ * Note the shift chops off the BMSR_ANEG bit.
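+ * For example, BMSR_100TXFDX (bit 14, 0x4000) shifts down to ANAR_TX_FD
+ * (bit 8, 0x0100), and BMSR_10THDX (bit 11, 0x0800) shifts down to
+ * ANAR_10 (bit 5, 0x0020).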
*/
+#define BMSR_MEDIA_TO_ANAR(x) (((x) & BMSR_MEDIAMASK) >> 6)
+
+#define MII_PHYIDR1 0x02 /* ID register 1 (ro) */
+
+#define MII_PHYIDR2 0x03 /* ID register 2 (ro) */
+#define IDR2_OUILSB 0xfc00 /* OUI LSB */
+#define IDR2_MODEL 0x03f0 /* vendor model */
+#define IDR2_REV 0x000f /* vendor revision */
+
+#define MII_ANAR 0x04 /* Autonegotiation advertisement (rw) */
+ /* section 28.2.4.1 and 37.2.6.1 */
+#define ANAR_NP 0x8000 /* Next page (ro) */
+#define ANAR_ACK 0x4000 /* link partner abilities acknowledged (ro) */
+#define ANAR_RF 0x2000 /* remote fault (ro) */
+ /* Annex 28B.2 */
+#define ANAR_FC 0x0400 /* local device supports PAUSE */
+#define ANAR_T4 0x0200 /* local device supports 100bT4 */
+#define ANAR_TX_FD 0x0100 /* local device supports 100bTx FD */
+#define ANAR_TX 0x0080 /* local device supports 100bTx */
+#define ANAR_10_FD 0x0040 /* local device supports 10bT FD */
+#define ANAR_10 0x0020 /* local device supports 10bT */
+#define ANAR_CSMA 0x0001 /* protocol selector CSMA/CD */
+#define ANAR_PAUSE_NONE (0 << 10)
+#define ANAR_PAUSE_SYM (1 << 10)
+#define ANAR_PAUSE_ASYM (2 << 10)
+#define ANAR_PAUSE_TOWARDS (3 << 10)
+
+ /* Annex 28D */
+#define ANAR_X_FD 0x0020 /* local device supports 1000BASE-X FD */
+#define ANAR_X_HD 0x0040 /* local device supports 1000BASE-X HD */
+#define ANAR_X_PAUSE_NONE (0 << 7)
+#define ANAR_X_PAUSE_SYM (1 << 7)
+#define ANAR_X_PAUSE_ASYM (2 << 7)
+#define ANAR_X_PAUSE_TOWARDS (3 << 7)
+
+#define MII_ANLPAR 0x05 /* Autonegotiation link partner abilities (rw) */
+ /* section 28.2.4.1 and 37.2.6.1 */
+#define ANLPAR_NP 0x8000 /* Next page (ro) */
+#define ANLPAR_ACK 0x4000 /* link partner accepted ACK (ro) */
+#define ANLPAR_RF 0x2000 /* remote fault (ro) */
+#define ANLPAR_FC 0x0400 /* link partner supports PAUSE */
+#define ANLPAR_T4 0x0200 /* link partner supports 100bT4 */
+#define ANLPAR_TX_FD 0x0100 /* link partner supports 100bTx FD */
+#define ANLPAR_TX 0x0080 /* link partner supports 100bTx */
+#define ANLPAR_10_FD 0x0040 /* link partner supports 10bT FD */
+#define ANLPAR_10 0x0020 /* link partner supports 10bT */
+#define ANLPAR_CSMA 0x0001 /* protocol selector CSMA/CD */
+#define ANLPAR_PAUSE_MASK (3 << 10)
+#define ANLPAR_PAUSE_NONE (0 << 10)
+#define ANLPAR_PAUSE_SYM (1 << 10)
+#define ANLPAR_PAUSE_ASYM (2 << 10)
+#define ANLPAR_PAUSE_TOWARDS (3 << 10)
+
+#define ANLPAR_X_FD 0x0020 /* link partner supports 1000BASE-X FD */
+#define ANLPAR_X_HD 0x0040 /* link partner supports 1000BASE-X HD */
+#define ANLPAR_X_PAUSE_MASK (3 << 7)
+#define ANLPAR_X_PAUSE_NONE (0 << 7)
+#define ANLPAR_X_PAUSE_SYM (1 << 7)
+#define ANLPAR_X_PAUSE_ASYM (2 << 7)
+#define ANLPAR_X_PAUSE_TOWARDS (3 << 7)
+
+#define MII_ANER 0x06 /* Autonegotiation expansion (ro) */
+ /* section 28.2.4.1 and 37.2.6.1 */
+#define ANER_MLF 0x0010 /* multiple link detection fault */
+#define ANER_LPNP 0x0008 /* link partner next page-able */
+#define ANER_NP 0x0004 /* next page-able */
+#define ANER_PAGE_RX 0x0002 /* Page received */
+#define ANER_LPAN 0x0001 /* link partner autoneg-able */
+
+#define MII_ANNP 0x07 /* Autonegotiation next page */
+ /* section 28.2.4.1 and 37.2.6.1 */
+
+#define MII_ANLPRNP 0x08 /* Autonegotiation link partner rx next page */
+ /* section 32.5.1 and 37.2.6.1 */
+
+ /* This is also the 1000baseT control register */
+#define MII_100T2CR 0x09 /* 100base-T2 control register */
+#define GTCR_TEST_MASK 0xe000 /* see 802.3ab ss. 40.6.1.1.2 */
+#define GTCR_MAN_MS 0x1000 /* enable manual master/slave control */
+#define GTCR_ADV_MS 0x0800 /* 1 = adv.
master, 0 = adv. slave */ +#define GTCR_PORT_TYPE 0x0400 /* 1 = DCE, 0 = DTE (NIC) */ +#define GTCR_ADV_1000TFDX 0x0200 /* adv. 1000baseT FDX */ +#define GTCR_ADV_1000THDX 0x0100 /* adv. 1000baseT HDX */ + + /* This is also the 1000baseT status register */ +#define MII_100T2SR 0x0a /* 100base-T2 status register */ +#define GTSR_MAN_MS_FLT 0x8000 /* master/slave config fault */ +#define GTSR_MS_RES 0x4000 /* result: 1 = master, 0 = slave */ +#define GTSR_LRS 0x2000 /* local rx status, 1 = ok */ +#define GTSR_RRS 0x1000 /* remote rx status, 1 = ok */ +#define GTSR_LP_1000TFDX 0x0800 /* link partner 1000baseT FDX capable */ +#define GTSR_LP_1000THDX 0x0400 /* link partner 1000baseT HDX capable */ +#define GTSR_LP_ASM_DIR 0x0200 /* link partner asym. pause dir. capable */ +#define GTSR_IDLE_ERR 0x00ff /* IDLE error count */ + +#define MII_PSECR 0x0b /* PSE control register */ +#define PSECR_PACTLMASK 0x000c /* pair control mask */ +#define PSECR_PSEENMASK 0x0003 /* PSE enable mask */ +#define PSECR_PINOUTB 0x0008 /* PSE pinout Alternative B */ +#define PSECR_PINOUTA 0x0004 /* PSE pinout Alternative A */ +#define PSECR_FOPOWTST 0x0002 /* Force Power Test Mode */ +#define PSECR_PSEEN 0x0001 /* PSE Enabled */ +#define PSECR_PSEDIS 0x0000 /* PSE Disabled */ + +#define MII_PSESR 0x0c /* PSE status register */ +#define PSESR_PWRDENIED 0x1000 /* Power Denied */ +#define PSESR_VALSIG 0x0800 /* Valid PD signature detected */ +#define PSESR_INVALSIG 0x0400 /* Invalid PD signature detected */ +#define PSESR_SHORTCIRC 0x0200 /* Short circuit condition detected */ +#define PSESR_OVERLOAD 0x0100 /* Overload condition detected */ +#define PSESR_MPSABSENT 0x0080 /* MPS absent condition detected */ +#define PSESR_PDCLMASK 0x0070 /* PD Class mask */ +#define PSESR_STATMASK 0x000e /* PSE Status mask */ +#define PSESR_PAIRCTABL 0x0001 /* PAIR Control Ability */ +#define PSESR_PDCL_4 (4 << 4) /* Class 4 */ +#define PSESR_PDCL_3 (3 << 4) /* Class 3 */ +#define PSESR_PDCL_2 (2 << 4) /* Class 2 */ +#define PSESR_PDCL_1 (1 << 4) /* Class 1 */ +#define PSESR_PDCL_0 (0 << 4) /* Class 0 */ + +#define MII_MMDACR 0x0d /* MMD access control register */ +#define MMDACR_FUNCMASK 0xc000 /* function */ +#define MMDACR_DADDRMASK 0x001f /* device address */ +#define MMDACR_FN_ADDRESS (0 << 14) /* address */ +#define MMDACR_FN_DATANPI (1 << 14) /* data, no post increment */ +#define MMDACR_FN_DATAPIRW (2 << 14) /* data, post increment on r/w */ +#define MMDACR_FN_DATAPIW (3 << 14) /* data, post increment on wr only */ + +#define MII_MMDAADR 0x0e /* MMD access address data register */ + +#define MII_EXTSR 0x0f /* Extended status register */ +#define EXTSR_1000XFDX 0x8000 /* 1000X full-duplex capable */ +#define EXTSR_1000XHDX 0x4000 /* 1000X half-duplex capable */ +#define EXTSR_1000TFDX 0x2000 /* 1000T full-duplex capable */ +#define EXTSR_1000THDX 0x1000 /* 1000T half-duplex capable */ + +#define EXTSR_MEDIAMASK (EXTSR_1000XFDX|EXTSR_1000XHDX| \ + EXTSR_1000TFDX|EXTSR_1000THDX) + +#endif /* _DEV_MII_MII_H_ */ diff --git a/usr/contrib/freebsd/dev/nvme/nvme.h b/usr/contrib/freebsd/dev/nvme/nvme.h new file mode 100644 index 0000000000..c7f6496426 --- /dev/null +++ b/usr/contrib/freebsd/dev/nvme/nvme.h @@ -0,0 +1,1511 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (C) 2012-2013 Intel Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ *
+ * Copyright 2019 Joyent, Inc.
+ */
+
+#ifndef __NVME_H__
+#define __NVME_H__
+
+#ifdef _KERNEL
+#include
+#endif
+
+#include
+#include
+
+#define NVME_PASSTHROUGH_CMD _IOWR('n', 0, struct nvme_pt_command)
+#define NVME_RESET_CONTROLLER _IO('n', 1)
+
+#define NVME_IO_TEST _IOWR('n', 100, struct nvme_io_test)
+#define NVME_BIO_TEST _IOWR('n', 101, struct nvme_io_test)
+
+/*
+ * Macros to deal with NVME revisions, as defined in the VS register
+ */
+#define NVME_REV(x, y) (((x) << 16) | ((y) << 8))
+#define NVME_MAJOR(r) (((r) >> 16) & 0xffff)
+#define NVME_MINOR(r) (((r) >> 8) & 0xff)
+
+/*
+ * Use to mark a command to apply to all namespaces, or to retrieve global
+ * log pages.
+ */
+#define NVME_GLOBAL_NAMESPACE_TAG ((uint32_t)0xFFFFFFFF)
+
+/* Cap nvme to 1MB transfers; the driver explodes with larger sizes */
+#define NVME_MAX_XFER_SIZE (MAXPHYS < (1<<20) ? MAXPHYS : (1<<20))
+
+/* Register field definitions */
+#define NVME_CAP_LO_REG_MQES_SHIFT (0)
+#define NVME_CAP_LO_REG_MQES_MASK (0xFFFF)
+#define NVME_CAP_LO_REG_CQR_SHIFT (16)
+#define NVME_CAP_LO_REG_CQR_MASK (0x1)
+#define NVME_CAP_LO_REG_AMS_SHIFT (17)
+#define NVME_CAP_LO_REG_AMS_MASK (0x3)
+#define NVME_CAP_LO_REG_TO_SHIFT (24)
+#define NVME_CAP_LO_REG_TO_MASK (0xFF)
+
+#define NVME_CAP_HI_REG_DSTRD_SHIFT (0)
+#define NVME_CAP_HI_REG_DSTRD_MASK (0xF)
+#define NVME_CAP_HI_REG_CSS_NVM_SHIFT (5)
+#define NVME_CAP_HI_REG_CSS_NVM_MASK (0x1)
+#define NVME_CAP_HI_REG_MPSMIN_SHIFT (16)
+#define NVME_CAP_HI_REG_MPSMIN_MASK (0xF)
+#define NVME_CAP_HI_REG_MPSMAX_SHIFT (20)
+#define NVME_CAP_HI_REG_MPSMAX_MASK (0xF)
+
+#define NVME_CC_REG_EN_SHIFT (0)
+#define NVME_CC_REG_EN_MASK (0x1)
+#define NVME_CC_REG_CSS_SHIFT (4)
+#define NVME_CC_REG_CSS_MASK (0x7)
+#define NVME_CC_REG_MPS_SHIFT (7)
+#define NVME_CC_REG_MPS_MASK (0xF)
+#define NVME_CC_REG_AMS_SHIFT (11)
+#define NVME_CC_REG_AMS_MASK (0x7)
+#define NVME_CC_REG_SHN_SHIFT (14)
+#define NVME_CC_REG_SHN_MASK (0x3)
+#define NVME_CC_REG_IOSQES_SHIFT (16)
+#define NVME_CC_REG_IOSQES_MASK (0xF)
+#define NVME_CC_REG_IOCQES_SHIFT (20)
+#define NVME_CC_REG_IOCQES_MASK (0xF)
+
+#define NVME_CSTS_REG_RDY_SHIFT (0)
+#define NVME_CSTS_REG_RDY_MASK (0x1)
+#define NVME_CSTS_REG_CFS_SHIFT (1)
+#define NVME_CSTS_REG_CFS_MASK (0x1)
+#define NVME_CSTS_REG_SHST_SHIFT (2)
+#define NVME_CSTS_REG_SHST_MASK (0x3)
+
+#define NVME_CSTS_GET_SHST(csts) (((csts) >> NVME_CSTS_REG_SHST_SHIFT) & NVME_CSTS_REG_SHST_MASK)
+
+#define NVME_AQA_REG_ASQS_SHIFT (0)
+#define NVME_AQA_REG_ASQS_MASK (0xFFF)
+#define NVME_AQA_REG_ACQS_SHIFT (16)
+#define NVME_AQA_REG_ACQS_MASK (0xFFF)
+
+/* Command field definitions */
+
+#define NVME_CMD_FUSE_SHIFT (8)
+#define NVME_CMD_FUSE_MASK (0x3)
+
+#define NVME_STATUS_P_SHIFT (0)
+#define NVME_STATUS_P_MASK (0x1)
+#define NVME_STATUS_SC_SHIFT (1)
+#define NVME_STATUS_SC_MASK (0xFF)
+#define NVME_STATUS_SCT_SHIFT (9)
+#define NVME_STATUS_SCT_MASK (0x7)
+#define NVME_STATUS_M_SHIFT (14)
+#define NVME_STATUS_M_MASK (0x1)
+#define NVME_STATUS_DNR_SHIFT (15)
+#define NVME_STATUS_DNR_MASK (0x1)
+
+#define NVME_STATUS_GET_P(st) (((st) >> NVME_STATUS_P_SHIFT) & NVME_STATUS_P_MASK)
+#define NVME_STATUS_GET_SC(st) (((st) >> NVME_STATUS_SC_SHIFT) & NVME_STATUS_SC_MASK)
+#define NVME_STATUS_GET_SCT(st) (((st) >> NVME_STATUS_SCT_SHIFT) & NVME_STATUS_SCT_MASK)
+#define NVME_STATUS_GET_M(st) (((st) >> NVME_STATUS_M_SHIFT) & NVME_STATUS_M_MASK)
+#define NVME_STATUS_GET_DNR(st) (((st) >> NVME_STATUS_DNR_SHIFT) & NVME_STATUS_DNR_MASK)
+
+#define NVME_PWR_ST_MPS_SHIFT (0)
+#define NVME_PWR_ST_MPS_MASK (0x1)
+#define NVME_PWR_ST_NOPS_SHIFT (1)
+#define NVME_PWR_ST_NOPS_MASK (0x1)
+#define NVME_PWR_ST_RRT_SHIFT (0)
+#define NVME_PWR_ST_RRT_MASK (0x1F)
+#define NVME_PWR_ST_RRL_SHIFT (0)
+#define NVME_PWR_ST_RRL_MASK (0x1F)
+#define NVME_PWR_ST_RWT_SHIFT (0)
+#define NVME_PWR_ST_RWT_MASK (0x1F)
+#define NVME_PWR_ST_RWL_SHIFT (0)
+#define NVME_PWR_ST_RWL_MASK (0x1F)
+#define NVME_PWR_ST_IPS_SHIFT (6)
+#define NVME_PWR_ST_IPS_MASK (0x3)
+#define NVME_PWR_ST_APW_SHIFT (0)
+#define NVME_PWR_ST_APW_MASK (0x7)
+#define NVME_PWR_ST_APS_SHIFT (6)
+#define NVME_PWR_ST_APS_MASK (0x3)
+
+/** Controller Multi-path I/O and Namespace Sharing Capabilities */
+/* More than one port */
+#define NVME_CTRLR_DATA_MIC_MPORTS_SHIFT (0)
+#define NVME_CTRLR_DATA_MIC_MPORTS_MASK (0x1)
+/* More than one controller */
+#define
NVME_CTRLR_DATA_MIC_MCTRLRS_MASK (0x1) +/* SR-IOV Virtual Function */ +#define NVME_CTRLR_DATA_MIC_SRIOVVF_SHIFT (2) +#define NVME_CTRLR_DATA_MIC_SRIOVVF_MASK (0x1) + +/** OACS - optional admin command support */ +/* supports security send/receive commands */ +#define NVME_CTRLR_DATA_OACS_SECURITY_SHIFT (0) +#define NVME_CTRLR_DATA_OACS_SECURITY_MASK (0x1) +/* supports format nvm command */ +#define NVME_CTRLR_DATA_OACS_FORMAT_SHIFT (1) +#define NVME_CTRLR_DATA_OACS_FORMAT_MASK (0x1) +/* supports firmware activate/download commands */ +#define NVME_CTRLR_DATA_OACS_FIRMWARE_SHIFT (2) +#define NVME_CTRLR_DATA_OACS_FIRMWARE_MASK (0x1) +/* supports namespace management commands */ +#define NVME_CTRLR_DATA_OACS_NSMGMT_SHIFT (3) +#define NVME_CTRLR_DATA_OACS_NSMGMT_MASK (0x1) +/* supports Device Self-test command */ +#define NVME_CTRLR_DATA_OACS_SELFTEST_SHIFT (4) +#define NVME_CTRLR_DATA_OACS_SELFTEST_MASK (0x1) +/* supports Directives */ +#define NVME_CTRLR_DATA_OACS_DIRECTIVES_SHIFT (5) +#define NVME_CTRLR_DATA_OACS_DIRECTIVES_MASK (0x1) +/* supports NVMe-MI Send/Receive */ +#define NVME_CTRLR_DATA_OACS_NVMEMI_SHIFT (6) +#define NVME_CTRLR_DATA_OACS_NVMEMI_MASK (0x1) +/* supports Virtualization Management */ +#define NVME_CTRLR_DATA_OACS_VM_SHIFT (7) +#define NVME_CTRLR_DATA_OACS_VM_MASK (0x1) +/* supports Doorbell Buffer Config */ +#define NVME_CTRLR_DATA_OACS_DBBUFFER_SHIFT (8) +#define NVME_CTRLR_DATA_OACS_DBBUFFER_MASK (0x1) + +/** firmware updates */ +/* first slot is read-only */ +#define NVME_CTRLR_DATA_FRMW_SLOT1_RO_SHIFT (0) +#define NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK (0x1) +/* number of firmware slots */ +#define NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT (1) +#define NVME_CTRLR_DATA_FRMW_NUM_SLOTS_MASK (0x7) + +/** log page attributes */ +/* per namespace smart/health log page */ +#define NVME_CTRLR_DATA_LPA_NS_SMART_SHIFT (0) +#define NVME_CTRLR_DATA_LPA_NS_SMART_MASK (0x1) + +/** AVSCC - admin vendor specific command configuration */ +/* admin vendor specific commands use spec format */ +#define NVME_CTRLR_DATA_AVSCC_SPEC_FORMAT_SHIFT (0) +#define NVME_CTRLR_DATA_AVSCC_SPEC_FORMAT_MASK (0x1) + +/** Autonomous Power State Transition Attributes */ +/* Autonomous Power State Transitions supported */ +#define NVME_CTRLR_DATA_APSTA_APST_SUPP_SHIFT (0) +#define NVME_CTRLR_DATA_APSTA_APST_SUPP_MASK (0x1) + +/** submission queue entry size */ +#define NVME_CTRLR_DATA_SQES_MIN_SHIFT (0) +#define NVME_CTRLR_DATA_SQES_MIN_MASK (0xF) +#define NVME_CTRLR_DATA_SQES_MAX_SHIFT (4) +#define NVME_CTRLR_DATA_SQES_MAX_MASK (0xF) + +/** completion queue entry size */ +#define NVME_CTRLR_DATA_CQES_MIN_SHIFT (0) +#define NVME_CTRLR_DATA_CQES_MIN_MASK (0xF) +#define NVME_CTRLR_DATA_CQES_MAX_SHIFT (4) +#define NVME_CTRLR_DATA_CQES_MAX_MASK (0xF) + +/** optional nvm command support */ +#define NVME_CTRLR_DATA_ONCS_COMPARE_SHIFT (0) +#define NVME_CTRLR_DATA_ONCS_COMPARE_MASK (0x1) +#define NVME_CTRLR_DATA_ONCS_WRITE_UNC_SHIFT (1) +#define NVME_CTRLR_DATA_ONCS_WRITE_UNC_MASK (0x1) +#define NVME_CTRLR_DATA_ONCS_DSM_SHIFT (2) +#define NVME_CTRLR_DATA_ONCS_DSM_MASK (0x1) +#define NVME_CTRLR_DATA_ONCS_WRZERO_SHIFT (3) +#define NVME_CTRLR_DATA_ONCS_WRZERO_MASK (0x1) +#define NVME_CTRLR_DATA_ONCS_SAVEFEAT_SHIFT (4) +#define NVME_CTRLR_DATA_ONCS_SAVEFEAT_MASK (0x1) +#define NVME_CTRLR_DATA_ONCS_RESERV_SHIFT (5) +#define NVME_CTRLR_DATA_ONCS_RESERV_MASK (0x1) +#define NVME_CTRLR_DATA_ONCS_TIMESTAMP_SHIFT (6) +#define NVME_CTRLR_DATA_ONCS_TIMESTAMP_MASK (0x1) + +/** Fused Operation Support */ +#define 
NVME_CTRLR_DATA_FUSES_CNW_SHIFT (0) +#define NVME_CTRLR_DATA_FUSES_CNW_MASK (0x1) + +/** Format NVM Attributes */ +#define NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT (0) +#define NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK (0x1) +#define NVME_CTRLR_DATA_FNA_ERASE_ALL_SHIFT (1) +#define NVME_CTRLR_DATA_FNA_ERASE_ALL_MASK (0x1) +#define NVME_CTRLR_DATA_FNA_CRYPTO_ERASE_SHIFT (2) +#define NVME_CTRLR_DATA_FNA_CRYPTO_ERASE_MASK (0x1) + +/** volatile write cache */ +#define NVME_CTRLR_DATA_VWC_PRESENT_SHIFT (0) +#define NVME_CTRLR_DATA_VWC_PRESENT_MASK (0x1) + +/** namespace features */ +/* thin provisioning */ +#define NVME_NS_DATA_NSFEAT_THIN_PROV_SHIFT (0) +#define NVME_NS_DATA_NSFEAT_THIN_PROV_MASK (0x1) +/* NAWUN, NAWUPF, and NACWU fields are valid */ +#define NVME_NS_DATA_NSFEAT_NA_FIELDS_SHIFT (1) +#define NVME_NS_DATA_NSFEAT_NA_FIELDS_MASK (0x1) +/* Deallocated or Unwritten Logical Block errors supported */ +#define NVME_NS_DATA_NSFEAT_DEALLOC_SHIFT (2) +#define NVME_NS_DATA_NSFEAT_DEALLOC_MASK (0x1) +/* NGUID and EUI64 fields are not reusable */ +#define NVME_NS_DATA_NSFEAT_NO_ID_REUSE_SHIFT (3) +#define NVME_NS_DATA_NSFEAT_NO_ID_REUSE_MASK (0x1) + +/** formatted lba size */ +#define NVME_NS_DATA_FLBAS_FORMAT_SHIFT (0) +#define NVME_NS_DATA_FLBAS_FORMAT_MASK (0xF) +#define NVME_NS_DATA_FLBAS_EXTENDED_SHIFT (4) +#define NVME_NS_DATA_FLBAS_EXTENDED_MASK (0x1) + +/** metadata capabilities */ +/* metadata can be transferred as part of data prp list */ +#define NVME_NS_DATA_MC_EXTENDED_SHIFT (0) +#define NVME_NS_DATA_MC_EXTENDED_MASK (0x1) +/* metadata can be transferred with separate metadata pointer */ +#define NVME_NS_DATA_MC_POINTER_SHIFT (1) +#define NVME_NS_DATA_MC_POINTER_MASK (0x1) + +/** end-to-end data protection capabilities */ +/* protection information type 1 */ +#define NVME_NS_DATA_DPC_PIT1_SHIFT (0) +#define NVME_NS_DATA_DPC_PIT1_MASK (0x1) +/* protection information type 2 */ +#define NVME_NS_DATA_DPC_PIT2_SHIFT (1) +#define NVME_NS_DATA_DPC_PIT2_MASK (0x1) +/* protection information type 3 */ +#define NVME_NS_DATA_DPC_PIT3_SHIFT (2) +#define NVME_NS_DATA_DPC_PIT3_MASK (0x1) +/* first eight bytes of metadata */ +#define NVME_NS_DATA_DPC_MD_START_SHIFT (3) +#define NVME_NS_DATA_DPC_MD_START_MASK (0x1) +/* last eight bytes of metadata */ +#define NVME_NS_DATA_DPC_MD_END_SHIFT (4) +#define NVME_NS_DATA_DPC_MD_END_MASK (0x1) + +/** end-to-end data protection type settings */ +/* protection information type */ +#define NVME_NS_DATA_DPS_PIT_SHIFT (0) +#define NVME_NS_DATA_DPS_PIT_MASK (0x7) +/* 1 == protection info transferred at start of metadata */ +/* 0 == protection info transferred at end of metadata */ +#define NVME_NS_DATA_DPS_MD_START_SHIFT (3) +#define NVME_NS_DATA_DPS_MD_START_MASK (0x1) + +/** Namespace Multi-path I/O and Namespace Sharing Capabilities */ +/* the namespace may be attached to two or more controllers */ +#define NVME_NS_DATA_NMIC_MAY_BE_SHARED_SHIFT (0) +#define NVME_NS_DATA_NMIC_MAY_BE_SHARED_MASK (0x1) + +/** Reservation Capabilities */ +/* Persist Through Power Loss */ +#define NVME_NS_DATA_RESCAP_PTPL_SHIFT (0) +#define NVME_NS_DATA_RESCAP_PTPL_MASK (0x1) +/* supports the Write Exclusive */ +#define NVME_NS_DATA_RESCAP_WR_EX_SHIFT (1) +#define NVME_NS_DATA_RESCAP_WR_EX_MASK (0x1) +/* supports the Exclusive Access */ +#define NVME_NS_DATA_RESCAP_EX_AC_SHIFT (2) +#define NVME_NS_DATA_RESCAP_EX_AC_MASK (0x1) +/* supports the Write Exclusive – Registrants Only */ +#define NVME_NS_DATA_RESCAP_WR_EX_RO_SHIFT (3) +#define NVME_NS_DATA_RESCAP_WR_EX_RO_MASK (0x1) +/* 
supports the Exclusive Access - Registrants Only */ +#define NVME_NS_DATA_RESCAP_EX_AC_RO_SHIFT (4) +#define NVME_NS_DATA_RESCAP_EX_AC_RO_MASK (0x1) +/* supports the Write Exclusive – All Registrants */ +#define NVME_NS_DATA_RESCAP_WR_EX_AR_SHIFT (5) +#define NVME_NS_DATA_RESCAP_WR_EX_AR_MASK (0x1) +/* supports the Exclusive Access - All Registrants */ +#define NVME_NS_DATA_RESCAP_EX_AC_AR_SHIFT (6) +#define NVME_NS_DATA_RESCAP_EX_AC_AR_MASK (0x1) +/* Ignore Existing Key is used as defined in revision 1.3 or later */ +#define NVME_NS_DATA_RESCAP_IEKEY13_SHIFT (7) +#define NVME_NS_DATA_RESCAP_IEKEY13_MASK (0x1) + +/** Format Progress Indicator */ +/* percentage of the Format NVM command that remains to be completed */ +#define NVME_NS_DATA_FPI_PERC_SHIFT (0) +#define NVME_NS_DATA_FPI_PERC_MASK (0x7f) +/* namespace supports the Format Progress Indicator */ +#define NVME_NS_DATA_FPI_SUPP_SHIFT (7) +#define NVME_NS_DATA_FPI_SUPP_MASK (0x1) + +/** lba format support */ +/* metadata size */ +#define NVME_NS_DATA_LBAF_MS_SHIFT (0) +#define NVME_NS_DATA_LBAF_MS_MASK (0xFFFF) +/* lba data size */ +#define NVME_NS_DATA_LBAF_LBADS_SHIFT (16) +#define NVME_NS_DATA_LBAF_LBADS_MASK (0xFF) +/* relative performance */ +#define NVME_NS_DATA_LBAF_RP_SHIFT (24) +#define NVME_NS_DATA_LBAF_RP_MASK (0x3) + +enum nvme_critical_warning_state { + NVME_CRIT_WARN_ST_AVAILABLE_SPARE = 0x1, + NVME_CRIT_WARN_ST_TEMPERATURE = 0x2, + NVME_CRIT_WARN_ST_DEVICE_RELIABILITY = 0x4, + NVME_CRIT_WARN_ST_READ_ONLY = 0x8, + NVME_CRIT_WARN_ST_VOLATILE_MEMORY_BACKUP = 0x10, +}; +#define NVME_CRIT_WARN_ST_RESERVED_MASK (0xE0) + +/* slot for current FW */ +#define NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT (0) +#define NVME_FIRMWARE_PAGE_AFI_SLOT_MASK (0x7) + +/* CC register SHN field values */ +enum shn_value { + NVME_SHN_NORMAL = 0x1, + NVME_SHN_ABRUPT = 0x2, +}; + +/* CSTS register SHST field values */ +enum shst_value { + NVME_SHST_NORMAL = 0x0, + NVME_SHST_OCCURRING = 0x1, + NVME_SHST_COMPLETE = 0x2, +}; + +struct nvme_registers +{ + /** controller capabilities */ + uint32_t cap_lo; + uint32_t cap_hi; + + uint32_t vs; /* version */ + uint32_t intms; /* interrupt mask set */ + uint32_t intmc; /* interrupt mask clear */ + + /** controller configuration */ + uint32_t cc; + + uint32_t reserved1; + + /** controller status */ + uint32_t csts; + + uint32_t reserved2; + + /** admin queue attributes */ + uint32_t aqa; + + uint64_t asq; /* admin submission queue base addr */ + uint64_t acq; /* admin completion queue base addr */ + uint32_t reserved3[0x3f2]; + + struct { + uint32_t sq_tdbl; /* submission queue tail doorbell */ + uint32_t cq_hdbl; /* completion queue head doorbell */ + } doorbell[1] __packed; +} __packed; + +_Static_assert(sizeof(struct nvme_registers) == 0x1008, "bad size for nvme_registers"); + +struct nvme_command +{ + /* dword 0 */ + uint8_t opc; /* opcode */ + uint8_t fuse; /* fused operation */ + uint16_t cid; /* command identifier */ + + /* dword 1 */ + uint32_t nsid; /* namespace identifier */ + + /* dword 2-3 */ + uint32_t rsvd2; + uint32_t rsvd3; + + /* dword 4-5 */ + uint64_t mptr; /* metadata pointer */ + + /* dword 6-7 */ + uint64_t prp1; /* prp entry 1 */ + + /* dword 8-9 */ + uint64_t prp2; /* prp entry 2 */ + + /* dword 10-15 */ + uint32_t cdw10; /* command-specific */ + uint32_t cdw11; /* command-specific */ + uint32_t cdw12; /* command-specific */ + uint32_t cdw13; /* command-specific */ + uint32_t cdw14; /* command-specific */ + uint32_t cdw15; /* command-specific */ +} __packed; + +_Static_assert(sizeof(struct 
nvme_command) == 16 * 4, "bad size for nvme_command");
+
+struct nvme_completion {
+
+ /* dword 0 */
+ uint32_t cdw0; /* command-specific */
+
+ /* dword 1 */
+ uint32_t rsvd1;
+
+ /* dword 2 */
+ uint16_t sqhd; /* submission queue head pointer */
+ uint16_t sqid; /* submission queue identifier */
+
+ /* dword 3 */
+ uint16_t cid; /* command identifier */
+ uint16_t status;
+} __packed;
+
+_Static_assert(sizeof(struct nvme_completion) == 4 * 4, "bad size for nvme_completion");
+
+struct nvme_dsm_range {
+ uint32_t attributes;
+ uint32_t length;
+ uint64_t starting_lba;
+} __packed;
+
+/* Largest DSM Trim that can be done */
+#define NVME_MAX_DSM_TRIM 4096
+
+_Static_assert(sizeof(struct nvme_dsm_range) == 16, "bad size for nvme_dsm_range");
+
+/* status code types */
+enum nvme_status_code_type {
+ NVME_SCT_GENERIC = 0x0,
+ NVME_SCT_COMMAND_SPECIFIC = 0x1,
+ NVME_SCT_MEDIA_ERROR = 0x2,
+ /* 0x3-0x6 - reserved */
+ NVME_SCT_VENDOR_SPECIFIC = 0x7,
+};
+
+/* generic command status codes */
+enum nvme_generic_command_status_code {
+ NVME_SC_SUCCESS = 0x00,
+ NVME_SC_INVALID_OPCODE = 0x01,
+ NVME_SC_INVALID_FIELD = 0x02,
+ NVME_SC_COMMAND_ID_CONFLICT = 0x03,
+ NVME_SC_DATA_TRANSFER_ERROR = 0x04,
+ NVME_SC_ABORTED_POWER_LOSS = 0x05,
+ NVME_SC_INTERNAL_DEVICE_ERROR = 0x06,
+ NVME_SC_ABORTED_BY_REQUEST = 0x07,
+ NVME_SC_ABORTED_SQ_DELETION = 0x08,
+ NVME_SC_ABORTED_FAILED_FUSED = 0x09,
+ NVME_SC_ABORTED_MISSING_FUSED = 0x0a,
+ NVME_SC_INVALID_NAMESPACE_OR_FORMAT = 0x0b,
+ NVME_SC_COMMAND_SEQUENCE_ERROR = 0x0c,
+ NVME_SC_INVALID_SGL_SEGMENT_DESCR = 0x0d,
+ NVME_SC_INVALID_NUMBER_OF_SGL_DESCR = 0x0e,
+ NVME_SC_DATA_SGL_LENGTH_INVALID = 0x0f,
+ NVME_SC_METADATA_SGL_LENGTH_INVALID = 0x10,
+ NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID = 0x11,
+ NVME_SC_INVALID_USE_OF_CMB = 0x12,
+ NVME_SC_PRP_OFFET_INVALID = 0x13,
+ NVME_SC_ATOMIC_WRITE_UNIT_EXCEEDED = 0x14,
+ NVME_SC_OPERATION_DENIED = 0x15,
+ NVME_SC_SGL_OFFSET_INVALID = 0x16,
+ /* 0x17 - reserved */
+ NVME_SC_HOST_ID_INCONSISTENT_FORMAT = 0x18,
+ NVME_SC_KEEP_ALIVE_TIMEOUT_EXPIRED = 0x19,
+ NVME_SC_KEEP_ALIVE_TIMEOUT_INVALID = 0x1a,
+ NVME_SC_ABORTED_DUE_TO_PREEMPT = 0x1b,
+ NVME_SC_SANITIZE_FAILED = 0x1c,
+ NVME_SC_SANITIZE_IN_PROGRESS = 0x1d,
+ NVME_SC_SGL_DATA_BLOCK_GRAN_INVALID = 0x1e,
+ NVME_SC_NOT_SUPPORTED_IN_CMB = 0x1f,
+
+ NVME_SC_LBA_OUT_OF_RANGE = 0x80,
+ NVME_SC_CAPACITY_EXCEEDED = 0x81,
+ NVME_SC_NAMESPACE_NOT_READY = 0x82,
+ NVME_SC_RESERVATION_CONFLICT = 0x83,
+ NVME_SC_FORMAT_IN_PROGRESS = 0x84,
+};
+
+/* command specific status codes */
+enum nvme_command_specific_status_code {
+ NVME_SC_COMPLETION_QUEUE_INVALID = 0x00,
+ NVME_SC_INVALID_QUEUE_IDENTIFIER = 0x01,
+ NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED = 0x02,
+ NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED = 0x03,
+ /* 0x04 - reserved */
+ NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED = 0x05,
+ NVME_SC_INVALID_FIRMWARE_SLOT = 0x06,
+ NVME_SC_INVALID_FIRMWARE_IMAGE = 0x07,
+ NVME_SC_INVALID_INTERRUPT_VECTOR = 0x08,
+ NVME_SC_INVALID_LOG_PAGE = 0x09,
+ NVME_SC_INVALID_FORMAT = 0x0a,
+ NVME_SC_FIRMWARE_REQUIRES_RESET = 0x0b,
+ NVME_SC_INVALID_QUEUE_DELETION = 0x0c,
+ NVME_SC_FEATURE_NOT_SAVEABLE = 0x0d,
+ NVME_SC_FEATURE_NOT_CHANGEABLE = 0x0e,
+ NVME_SC_FEATURE_NOT_NS_SPECIFIC = 0x0f,
+ NVME_SC_FW_ACT_REQUIRES_NVMS_RESET = 0x10,
+ NVME_SC_FW_ACT_REQUIRES_RESET = 0x11,
+ NVME_SC_FW_ACT_REQUIRES_TIME = 0x12,
+ NVME_SC_FW_ACT_PROHIBITED = 0x13,
+ NVME_SC_OVERLAPPING_RANGE = 0x14,
+ NVME_SC_NS_INSUFFICIENT_CAPACITY = 0x15,
+ NVME_SC_NS_ID_UNAVAILABLE = 0x16,
+ /* 0x17 - reserved */
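/*
 * Usage sketch (editor's illustration, not part of the imported header):
 * the 16-bit status word of a completion packs the phase tag, the status
 * code (SC), and the status code type (SCT).  Assuming the
 * NVME_STATUS_GET_SCT()/NVME_STATUS_GET_SC() accessors this header
 * provides (see nvme_completion_is_error() further below), a consumer can
 * classify a completion against the enums defined here:
 *
 *	static int
 *	nvme_cpl_is_media_error(const struct nvme_completion *cpl)
 *	{
 *		return (NVME_STATUS_GET_SCT(cpl->status) ==
 *		    NVME_SCT_MEDIA_ERROR);
 *	}
 */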
NVME_SC_NS_ALREADY_ATTACHED = 0x18,
+ NVME_SC_NS_IS_PRIVATE = 0x19,
+ NVME_SC_NS_NOT_ATTACHED = 0x1a,
+ NVME_SC_THIN_PROV_NOT_SUPPORTED = 0x1b,
+ NVME_SC_CTRLR_LIST_INVALID = 0x1c,
+ NVME_SC_SELT_TEST_IN_PROGRESS = 0x1d,
+ NVME_SC_BOOT_PART_WRITE_PROHIB = 0x1e,
+ NVME_SC_INVALID_CTRLR_ID = 0x1f,
+ NVME_SC_INVALID_SEC_CTRLR_STATE = 0x20,
+ NVME_SC_INVALID_NUM_OF_CTRLR_RESRC = 0x21,
+ NVME_SC_INVALID_RESOURCE_ID = 0x22,
+
+ NVME_SC_CONFLICTING_ATTRIBUTES = 0x80,
+ NVME_SC_INVALID_PROTECTION_INFO = 0x81,
+ NVME_SC_ATTEMPTED_WRITE_TO_RO_PAGE = 0x82,
+};
+
+/* media error status codes */
+enum nvme_media_error_status_code {
+ NVME_SC_WRITE_FAULTS = 0x80,
+ NVME_SC_UNRECOVERED_READ_ERROR = 0x81,
+ NVME_SC_GUARD_CHECK_ERROR = 0x82,
+ NVME_SC_APPLICATION_TAG_CHECK_ERROR = 0x83,
+ NVME_SC_REFERENCE_TAG_CHECK_ERROR = 0x84,
+ NVME_SC_COMPARE_FAILURE = 0x85,
+ NVME_SC_ACCESS_DENIED = 0x86,
+ NVME_SC_DEALLOCATED_OR_UNWRITTEN = 0x87,
+};
+
+/* admin opcodes */
+enum nvme_admin_opcode {
+ NVME_OPC_DELETE_IO_SQ = 0x00,
+ NVME_OPC_CREATE_IO_SQ = 0x01,
+ NVME_OPC_GET_LOG_PAGE = 0x02,
+ /* 0x03 - reserved */
+ NVME_OPC_DELETE_IO_CQ = 0x04,
+ NVME_OPC_CREATE_IO_CQ = 0x05,
+ NVME_OPC_IDENTIFY = 0x06,
+ /* 0x07 - reserved */
+ NVME_OPC_ABORT = 0x08,
+ NVME_OPC_SET_FEATURES = 0x09,
+ NVME_OPC_GET_FEATURES = 0x0a,
+ /* 0x0b - reserved */
+ NVME_OPC_ASYNC_EVENT_REQUEST = 0x0c,
+ NVME_OPC_NAMESPACE_MANAGEMENT = 0x0d,
+ /* 0x0e-0x0f - reserved */
+ NVME_OPC_FIRMWARE_ACTIVATE = 0x10,
+ NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD = 0x11,
+ NVME_OPC_DEVICE_SELF_TEST = 0x14,
+ NVME_OPC_NAMESPACE_ATTACHMENT = 0x15,
+ NVME_OPC_KEEP_ALIVE = 0x18,
+ NVME_OPC_DIRECTIVE_SEND = 0x19,
+ NVME_OPC_DIRECTIVE_RECEIVE = 0x1a,
+ NVME_OPC_VIRTUALIZATION_MANAGEMENT = 0x1c,
+ NVME_OPC_NVME_MI_SEND = 0x1d,
+ NVME_OPC_NVME_MI_RECEIVE = 0x1e,
+ NVME_OPC_DOORBELL_BUFFER_CONFIG = 0x7c,
+
+ NVME_OPC_FORMAT_NVM = 0x80,
+ NVME_OPC_SECURITY_SEND = 0x81,
+ NVME_OPC_SECURITY_RECEIVE = 0x82,
+ NVME_OPC_SANITIZE = 0x84,
+};
+
+/* nvme nvm opcodes */
+enum nvme_nvm_opcode {
+ NVME_OPC_FLUSH = 0x00,
+ NVME_OPC_WRITE = 0x01,
+ NVME_OPC_READ = 0x02,
+ /* 0x03 - reserved */
+ NVME_OPC_WRITE_UNCORRECTABLE = 0x04,
+ NVME_OPC_COMPARE = 0x05,
+ /* 0x06-0x07 - reserved */
+ NVME_OPC_WRITE_ZEROES = 0x08,
+ NVME_OPC_DATASET_MANAGEMENT = 0x09,
+ /* 0x0a-0x0c - reserved */
+ NVME_OPC_RESERVATION_REGISTER = 0x0d,
+ NVME_OPC_RESERVATION_REPORT = 0x0e,
+ /* 0x0f-0x10 - reserved */
+ NVME_OPC_RESERVATION_ACQUIRE = 0x11,
+ /* 0x12-0x14 - reserved */
+ NVME_OPC_RESERVATION_RELEASE = 0x15,
+};
+
+enum nvme_feature {
+ /* 0x00 - reserved */
+ NVME_FEAT_ARBITRATION = 0x01,
+ NVME_FEAT_POWER_MANAGEMENT = 0x02,
+ NVME_FEAT_LBA_RANGE_TYPE = 0x03,
+ NVME_FEAT_TEMPERATURE_THRESHOLD = 0x04,
+ NVME_FEAT_ERROR_RECOVERY = 0x05,
+ NVME_FEAT_VOLATILE_WRITE_CACHE = 0x06,
+ NVME_FEAT_NUMBER_OF_QUEUES = 0x07,
+ NVME_FEAT_INTERRUPT_COALESCING = 0x08,
+ NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION = 0x09,
+ NVME_FEAT_WRITE_ATOMICITY = 0x0A,
+ NVME_FEAT_ASYNC_EVENT_CONFIGURATION = 0x0B,
+ NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION = 0x0C,
+ NVME_FEAT_HOST_MEMORY_BUFFER = 0x0D,
+ NVME_FEAT_TIMESTAMP = 0x0E,
+ NVME_FEAT_KEEP_ALIVE_TIMER = 0x0F,
+ NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT = 0x10,
+ NVME_FEAT_NON_OP_POWER_STATE_CONFIG = 0x11,
+ /* 0x12-0x77 - reserved */
+ /* 0x78-0x7f - NVMe Management Interface */
+ NVME_FEAT_SOFTWARE_PROGRESS_MARKER = 0x80,
+ /* 0x81-0xBF - command set specific (reserved) */
+ /* 0xC0-0xFF - vendor specific */
+};
+
+enum nvme_dsm_attribute {
+
NVME_DSM_ATTR_INTEGRAL_READ = 0x1, + NVME_DSM_ATTR_INTEGRAL_WRITE = 0x2, + NVME_DSM_ATTR_DEALLOCATE = 0x4, +}; + +enum nvme_activate_action { + NVME_AA_REPLACE_NO_ACTIVATE = 0x0, + NVME_AA_REPLACE_ACTIVATE = 0x1, + NVME_AA_ACTIVATE = 0x2, +}; + +struct nvme_power_state { + /** Maximum Power */ + uint16_t mp; /* Maximum Power */ + uint8_t ps_rsvd1; + uint8_t mps_nops; /* Max Power Scale, Non-Operational State */ + + uint32_t enlat; /* Entry Latency */ + uint32_t exlat; /* Exit Latency */ + + uint8_t rrt; /* Relative Read Throughput */ + uint8_t rrl; /* Relative Read Latency */ + uint8_t rwt; /* Relative Write Throughput */ + uint8_t rwl; /* Relative Write Latency */ + + uint16_t idlp; /* Idle Power */ + uint8_t ips; /* Idle Power Scale */ + uint8_t ps_rsvd8; + + uint16_t actp; /* Active Power */ + uint8_t apw_aps; /* Active Power Workload, Active Power Scale */ + uint8_t ps_rsvd10[9]; +} __packed; + +_Static_assert(sizeof(struct nvme_power_state) == 32, "bad size for nvme_power_state"); + +#define NVME_SERIAL_NUMBER_LENGTH 20 +#define NVME_MODEL_NUMBER_LENGTH 40 +#define NVME_FIRMWARE_REVISION_LENGTH 8 + +struct nvme_controller_data { + + /* bytes 0-255: controller capabilities and features */ + + /** pci vendor id */ + uint16_t vid; + + /** pci subsystem vendor id */ + uint16_t ssvid; + + /** serial number */ + uint8_t sn[NVME_SERIAL_NUMBER_LENGTH]; + + /** model number */ + uint8_t mn[NVME_MODEL_NUMBER_LENGTH]; + + /** firmware revision */ + uint8_t fr[NVME_FIRMWARE_REVISION_LENGTH]; + + /** recommended arbitration burst */ + uint8_t rab; + + /** ieee oui identifier */ + uint8_t ieee[3]; + + /** multi-interface capabilities */ + uint8_t mic; + + /** maximum data transfer size */ + uint8_t mdts; + + /** Controller ID */ + uint16_t ctrlr_id; + + /** Version */ + uint32_t ver; + + /** RTD3 Resume Latency */ + uint32_t rtd3r; + + /** RTD3 Enter Latency */ + uint32_t rtd3e; + + /** Optional Asynchronous Events Supported */ + uint32_t oaes; /* bitfield really */ + + /** Controller Attributes */ + uint32_t ctratt; /* bitfield really */ + + uint8_t reserved1[12]; + + /** FRU Globally Unique Identifier */ + uint8_t fguid[16]; + + uint8_t reserved2[128]; + + /* bytes 256-511: admin command set attributes */ + + /** optional admin command support */ + uint16_t oacs; + + /** abort command limit */ + uint8_t acl; + + /** asynchronous event request limit */ + uint8_t aerl; + + /** firmware updates */ + uint8_t frmw; + + /** log page attributes */ + uint8_t lpa; + + /** error log page entries */ + uint8_t elpe; + + /** number of power states supported */ + uint8_t npss; + + /** admin vendor specific command configuration */ + uint8_t avscc; + + /** Autonomous Power State Transition Attributes */ + uint8_t apsta; + + /** Warning Composite Temperature Threshold */ + uint16_t wctemp; + + /** Critical Composite Temperature Threshold */ + uint16_t cctemp; + + /** Maximum Time for Firmware Activation */ + uint16_t mtfa; + + /** Host Memory Buffer Preferred Size */ + uint32_t hmpre; + + /** Host Memory Buffer Minimum Size */ + uint32_t hmmin; + + /** Name space capabilities */ + struct { + /* if nsmgmt, report tnvmcap and unvmcap */ + uint8_t tnvmcap[16]; + uint8_t unvmcap[16]; + } __packed untncap; + + /** Replay Protected Memory Block Support */ + uint32_t rpmbs; /* Really a bitfield */ + + /** Extended Device Self-test Time */ + uint16_t edstt; + + /** Device Self-test Options */ + uint8_t dsto; /* Really a bitfield */ + + /** Firmware Update Granularity */ + uint8_t fwug; + + /** Keep Alive Support */ + 
uint16_t kas;
+
+ /** Host Controlled Thermal Management Attributes */
+ uint16_t hctma; /* Really a bitfield */
+
+ /** Minimum Thermal Management Temperature */
+ uint16_t mntmt;
+
+ /** Maximum Thermal Management Temperature */
+ uint16_t mxtmt;
+
+ /** Sanitize Capabilities */
+ uint32_t sanicap; /* Really a bitfield */
+
+ uint8_t reserved3[180];
+ /* bytes 512-703: nvm command set attributes */
+
+ /** submission queue entry size */
+ uint8_t sqes;
+
+ /** completion queue entry size */
+ uint8_t cqes;
+
+ /** Maximum Outstanding Commands */
+ uint16_t maxcmd;
+
+ /** number of namespaces */
+ uint32_t nn;
+
+ /** optional nvm command support */
+ uint16_t oncs;
+
+ /** fused operation support */
+ uint16_t fuses;
+
+ /** format nvm attributes */
+ uint8_t fna;
+
+ /** volatile write cache */
+ uint8_t vwc;
+
+ /** Atomic Write Unit Normal */
+ uint16_t awun;
+
+ /** Atomic Write Unit Power Fail */
+ uint16_t awupf;
+
+ /** NVM Vendor Specific Command Configuration */
+ uint8_t nvscc;
+ uint8_t reserved5;
+
+ /** Atomic Compare & Write Unit */
+ uint16_t acwu;
+ uint16_t reserved6;
+
+ /** SGL Support */
+ uint32_t sgls;
+
+ /* bytes 540-767: Reserved */
+ uint8_t reserved7[228];
+
+ /** NVM Subsystem NVMe Qualified Name */
+ uint8_t subnqn[256];
+
+ /* bytes 1024-1791: Reserved */
+ uint8_t reserved8[768];
+
+ /* bytes 1792-2047: NVMe over Fabrics specification */
+ uint8_t reserved9[256];
+
+ /* bytes 2048-3071: power state descriptors */
+ struct nvme_power_state power_state[32];
+
+ /* bytes 3072-4095: vendor specific */
+ uint8_t vs[1024];
+} __packed __aligned(4);
+
+_Static_assert(sizeof(struct nvme_controller_data) == 4096, "bad size for nvme_controller_data");
+
+struct nvme_namespace_data {
+
+ /** namespace size */
+ uint64_t nsze;
+
+ /** namespace capacity */
+ uint64_t ncap;
+
+ /** namespace utilization */
+ uint64_t nuse;
+
+ /** namespace features */
+ uint8_t nsfeat;
+
+ /** number of lba formats */
+ uint8_t nlbaf;
+
+ /** formatted lba size */
+ uint8_t flbas;
+
+ /** metadata capabilities */
+ uint8_t mc;
+
+ /** end-to-end data protection capabilities */
+ uint8_t dpc;
+
+ /** end-to-end data protection type settings */
+ uint8_t dps;
+
+ /** Namespace Multi-path I/O and Namespace Sharing Capabilities */
+ uint8_t nmic;
+
+ /** Reservation Capabilities */
+ uint8_t rescap;
+
+ /** Format Progress Indicator */
+ uint8_t fpi;
+
+ /** Deallocate Logical Block Features */
+ uint8_t dlfeat;
+
+ /** Namespace Atomic Write Unit Normal */
+ uint16_t nawun;
+
+ /** Namespace Atomic Write Unit Power Fail */
+ uint16_t nawupf;
+
+ /** Namespace Atomic Compare & Write Unit */
+ uint16_t nacwu;
+
+ /** Namespace Atomic Boundary Size Normal */
+ uint16_t nabsn;
+
+ /** Namespace Atomic Boundary Offset */
+ uint16_t nabo;
+
+ /** Namespace Atomic Boundary Size Power Fail */
+ uint16_t nabspf;
+
+ /** Namespace Optimal IO Boundary */
+ uint16_t noiob;
+
+ /** NVM Capacity */
+ uint8_t nvmcap[16];
+
+ /* bytes 64-103: Reserved */
+ uint8_t reserved5[40];
+
+ /** Namespace Globally Unique Identifier */
+ uint8_t nguid[16];
+
+ /** IEEE Extended Unique Identifier */
+ uint8_t eui64[8];
+
+ /** lba format support */
+ uint32_t lbaf[16];
+
+ uint8_t reserved6[192];
+
+ uint8_t vendor_specific[3712];
+} __packed __aligned(4);
+
+_Static_assert(sizeof(struct nvme_namespace_data) == 4096, "bad size for nvme_namespace_data");
+
+enum nvme_log_page {
+
+ /* 0x00 - reserved */
+ NVME_LOG_ERROR = 0x01,
+ NVME_LOG_HEALTH_INFORMATION = 0x02,
+ NVME_LOG_FIRMWARE_SLOT = 0x03,
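/*
 * Usage sketch (editor's illustration, not part of the imported header):
 * combining struct nvme_namespace_data above with the FLBAS and LBAF
 * shift-and-mask definitions earlier in this header to recover the
 * logical block size of the active format.  LBADS holds log2 of the LBA
 * data size, so a value of 9 means 512-byte blocks.
 *
 *	static uint32_t
 *	nvme_ns_lba_size(const struct nvme_namespace_data *nd)
 *	{
 *		uint8_t fmt = (nd->flbas >> NVME_NS_DATA_FLBAS_FORMAT_SHIFT) &
 *		    NVME_NS_DATA_FLBAS_FORMAT_MASK;
 *		uint8_t lbads = (nd->lbaf[fmt] >>
 *		    NVME_NS_DATA_LBAF_LBADS_SHIFT) &
 *		    NVME_NS_DATA_LBAF_LBADS_MASK;
 *
 *		return (1U << lbads);
 *	}
 */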
NVME_LOG_CHANGED_NAMESPACE = 0x04,
+ NVME_LOG_COMMAND_EFFECT = 0x05,
+ /* 0x06-0x7F - reserved */
+ /* 0x80-0xBF - I/O command set specific */
+ NVME_LOG_RES_NOTIFICATION = 0x80,
+ /* 0xC0-0xFF - vendor specific */
+
+ /*
+ * The following are Intel Specific log pages, but they seem
+ * to be widely implemented.
+ */
+ INTEL_LOG_READ_LAT_LOG = 0xc1,
+ INTEL_LOG_WRITE_LAT_LOG = 0xc2,
+ INTEL_LOG_TEMP_STATS = 0xc5,
+ INTEL_LOG_ADD_SMART = 0xca,
+ INTEL_LOG_DRIVE_MKT_NAME = 0xdd,
+
+ /*
+ * HGST log page, with lots of sub pages.
+ */
+ HGST_INFO_LOG = 0xc1,
+};
+
+struct nvme_error_information_entry {
+
+ uint64_t error_count;
+ uint16_t sqid;
+ uint16_t cid;
+ uint16_t status;
+ uint16_t error_location;
+ uint64_t lba;
+ uint32_t nsid;
+ uint8_t vendor_specific;
+ uint8_t reserved[35];
+} __packed __aligned(4);
+
+_Static_assert(sizeof(struct nvme_error_information_entry) == 64, "bad size for nvme_error_information_entry");
+
+struct nvme_health_information_page {
+
+ uint8_t critical_warning;
+ uint16_t temperature;
+ uint8_t available_spare;
+ uint8_t available_spare_threshold;
+ uint8_t percentage_used;
+
+ uint8_t reserved[26];
+
+ /*
+ * Note that the following are 128-bit values, but are
+ * defined as an array of 2 64-bit values.
+ */
+ /* Data Units Read is always in 512-byte units. */
+ uint64_t data_units_read[2];
+ /* Data Units Written is always in 512-byte units. */
+ uint64_t data_units_written[2];
+ /* For NVM command set, this includes Compare commands. */
+ uint64_t host_read_commands[2];
+ uint64_t host_write_commands[2];
+ /* Controller Busy Time is reported in minutes. */
+ uint64_t controller_busy_time[2];
+ uint64_t power_cycles[2];
+ uint64_t power_on_hours[2];
+ uint64_t unsafe_shutdowns[2];
+ uint64_t media_errors[2];
+ uint64_t num_error_info_log_entries[2];
+ uint32_t warning_temp_time;
+ uint32_t error_temp_time;
+ uint16_t temp_sensor[8];
+
+ uint8_t reserved2[296];
+} __packed __aligned(4);
+
+/* Currently sparse/smatch incorrectly packs this struct in some situations. */
+#ifndef __CHECKER__
+_Static_assert(sizeof(struct nvme_health_information_page) == 512, "bad size for nvme_health_information_page");
+#endif
+
+struct nvme_firmware_page {
+
+ uint8_t afi;
+ uint8_t reserved[7];
+ uint64_t revision[7]; /* revisions for 7 slots */
+ uint8_t reserved2[448];
+} __packed __aligned(4);
+
+_Static_assert(sizeof(struct nvme_firmware_page) == 512, "bad size for nvme_firmware_page");
+
+struct nvme_ns_list {
+ uint32_t ns[1024];
+} __packed __aligned(4);
+
+_Static_assert(sizeof(struct nvme_ns_list) == 4096, "bad size for nvme_ns_list");
+
+struct intel_log_temp_stats
+{
+ uint64_t current;
+ uint64_t overtemp_flag_last;
+ uint64_t overtemp_flag_life;
+ uint64_t max_temp;
+ uint64_t min_temp;
+ uint64_t _rsvd[5];
+ uint64_t max_oper_temp;
+ uint64_t min_oper_temp;
+ uint64_t est_offset;
+} __packed __aligned(4);
+
+_Static_assert(sizeof(struct intel_log_temp_stats) == 13 * 8, "bad size for intel_log_temp_stats");
+
+#define NVME_TEST_MAX_THREADS 128
+
+struct nvme_io_test {
+
+ enum nvme_nvm_opcode opc;
+ uint32_t size;
+ uint32_t time; /* in seconds */
+ uint32_t num_threads;
+ uint32_t flags;
+ uint64_t io_completed[NVME_TEST_MAX_THREADS];
+};
+
+enum nvme_io_test_flags {
+
+ /*
+ * Specifies whether dev_refthread/dev_relthread should be
+ * called during NVME_BIO_TEST. Ignored for other test
+ * types.
+ */
+ NVME_TEST_FLAG_REFTHREAD = 0x1,
+};
+
+struct nvme_pt_command {
+
+ /*
+ * cmd is used to specify a passthrough command to a controller or
+ * namespace.
+ * + * The following fields from cmd may be specified by the caller: + * * opc (opcode) + * * nsid (namespace id) - for admin commands only + * * cdw10-cdw15 + * + * Remaining fields must be set to 0 by the caller. + */ + struct nvme_command cmd; + + /* + * cpl returns completion status for the passthrough command + * specified by cmd. + * + * The following fields will be filled out by the driver, for + * consumption by the caller: + * * cdw0 + * * status (except for phase) + * + * Remaining fields will be set to 0 by the driver. + */ + struct nvme_completion cpl; + + /* buf is the data buffer associated with this passthrough command. */ + void * buf; + + /* + * len is the length of the data buffer associated with this + * passthrough command. + */ + uint32_t len; + + /* + * is_read = 1 if the passthrough command will read data into the + * supplied buffer from the controller. + * + * is_read = 0 if the passthrough command will write data from the + * supplied buffer to the controller. + */ + uint32_t is_read; + + /* + * driver_lock is used by the driver only. It must be set to 0 + * by the caller. + */ + struct mtx * driver_lock; +}; + +#define nvme_completion_is_error(cpl) \ + (NVME_STATUS_GET_SC((cpl)->status) != 0 || NVME_STATUS_GET_SCT((cpl)->status) != 0) + +void nvme_strvis(uint8_t *dst, const uint8_t *src, int dstlen, int srclen); + +#ifdef _KERNEL + +struct bio; + +struct nvme_namespace; +struct nvme_controller; +struct nvme_consumer; + +typedef void (*nvme_cb_fn_t)(void *, const struct nvme_completion *); + +typedef void *(*nvme_cons_ns_fn_t)(struct nvme_namespace *, void *); +typedef void *(*nvme_cons_ctrlr_fn_t)(struct nvme_controller *); +typedef void (*nvme_cons_async_fn_t)(void *, const struct nvme_completion *, + uint32_t, void *, uint32_t); +typedef void (*nvme_cons_fail_fn_t)(void *); + +enum nvme_namespace_flags { + NVME_NS_DEALLOCATE_SUPPORTED = 0x1, + NVME_NS_FLUSH_SUPPORTED = 0x2, +}; + +int nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr, + struct nvme_pt_command *pt, + uint32_t nsid, int is_user_buffer, + int is_admin_cmd); + +/* Admin functions */ +void nvme_ctrlr_cmd_set_feature(struct nvme_controller *ctrlr, + uint8_t feature, uint32_t cdw11, + void *payload, uint32_t payload_size, + nvme_cb_fn_t cb_fn, void *cb_arg); +void nvme_ctrlr_cmd_get_feature(struct nvme_controller *ctrlr, + uint8_t feature, uint32_t cdw11, + void *payload, uint32_t payload_size, + nvme_cb_fn_t cb_fn, void *cb_arg); +void nvme_ctrlr_cmd_get_log_page(struct nvme_controller *ctrlr, + uint8_t log_page, uint32_t nsid, + void *payload, uint32_t payload_size, + nvme_cb_fn_t cb_fn, void *cb_arg); + +/* NVM I/O functions */ +int nvme_ns_cmd_write(struct nvme_namespace *ns, void *payload, + uint64_t lba, uint32_t lba_count, nvme_cb_fn_t cb_fn, + void *cb_arg); +int nvme_ns_cmd_write_bio(struct nvme_namespace *ns, struct bio *bp, + nvme_cb_fn_t cb_fn, void *cb_arg); +int nvme_ns_cmd_read(struct nvme_namespace *ns, void *payload, + uint64_t lba, uint32_t lba_count, nvme_cb_fn_t cb_fn, + void *cb_arg); +int nvme_ns_cmd_read_bio(struct nvme_namespace *ns, struct bio *bp, + nvme_cb_fn_t cb_fn, void *cb_arg); +int nvme_ns_cmd_deallocate(struct nvme_namespace *ns, void *payload, + uint8_t num_ranges, nvme_cb_fn_t cb_fn, + void *cb_arg); +int nvme_ns_cmd_flush(struct nvme_namespace *ns, nvme_cb_fn_t cb_fn, + void *cb_arg); +int nvme_ns_dump(struct nvme_namespace *ns, void *virt, off_t offset, + size_t len); + +/* Registration functions */ +struct nvme_consumer * 
nvme_register_consumer(nvme_cons_ns_fn_t ns_fn,
+ nvme_cons_ctrlr_fn_t ctrlr_fn,
+ nvme_cons_async_fn_t async_fn,
+ nvme_cons_fail_fn_t fail_fn);
+void nvme_unregister_consumer(struct nvme_consumer *consumer);
+
+/* Controller helper functions */
+device_t nvme_ctrlr_get_device(struct nvme_controller *ctrlr);
+const struct nvme_controller_data *
+ nvme_ctrlr_get_data(struct nvme_controller *ctrlr);
+
+/* Namespace helper functions */
+uint32_t nvme_ns_get_max_io_xfer_size(struct nvme_namespace *ns);
+uint32_t nvme_ns_get_sector_size(struct nvme_namespace *ns);
+uint64_t nvme_ns_get_num_sectors(struct nvme_namespace *ns);
+uint64_t nvme_ns_get_size(struct nvme_namespace *ns);
+uint32_t nvme_ns_get_flags(struct nvme_namespace *ns);
+const char * nvme_ns_get_serial_number(struct nvme_namespace *ns);
+const char * nvme_ns_get_model_number(struct nvme_namespace *ns);
+const struct nvme_namespace_data *
+ nvme_ns_get_data(struct nvme_namespace *ns);
+uint32_t nvme_ns_get_stripesize(struct nvme_namespace *ns);
+
+int nvme_ns_bio_process(struct nvme_namespace *ns, struct bio *bp,
+ nvme_cb_fn_t cb_fn);
+
+/*
+ * Command building helper functions -- shared with CAM.
+ * These functions assume the allocator zeros out the cmd structure;
+ * CAM's xpt_get_ccb and the request allocator for nvme both
+ * do zero'd allocations.
+ */
+static inline
+void nvme_ns_flush_cmd(struct nvme_command *cmd, uint32_t nsid)
+{
+
+ cmd->opc = NVME_OPC_FLUSH;
+ cmd->nsid = htole32(nsid);
+}
+
+static inline
+void nvme_ns_rw_cmd(struct nvme_command *cmd, uint32_t rwcmd, uint32_t nsid,
+ uint64_t lba, uint32_t count)
+{
+ cmd->opc = rwcmd;
+ cmd->nsid = htole32(nsid);
+ cmd->cdw10 = htole32(lba & 0xffffffffu);
+ cmd->cdw11 = htole32(lba >> 32);
+ cmd->cdw12 = htole32(count-1);
+}
+
+static inline
+void nvme_ns_write_cmd(struct nvme_command *cmd, uint32_t nsid,
+ uint64_t lba, uint32_t count)
+{
+ nvme_ns_rw_cmd(cmd, NVME_OPC_WRITE, nsid, lba, count);
+}
+
+static inline
+void nvme_ns_read_cmd(struct nvme_command *cmd, uint32_t nsid,
+ uint64_t lba, uint32_t count)
+{
+ nvme_ns_rw_cmd(cmd, NVME_OPC_READ, nsid, lba, count);
+}
+
+static inline
+void nvme_ns_trim_cmd(struct nvme_command *cmd, uint32_t nsid,
+ uint32_t num_ranges)
+{
+ cmd->opc = NVME_OPC_DATASET_MANAGEMENT;
+ cmd->nsid = htole32(nsid);
+ cmd->cdw10 = htole32(num_ranges - 1);
+ cmd->cdw11 = htole32(NVME_DSM_ATTR_DEALLOCATE);
+}
+
+extern int nvme_use_nvd;
+
+#endif /* _KERNEL */
+
+/* Endianness conversion functions for NVMe structs */
+static inline
+void nvme_completion_swapbytes(struct nvme_completion *s)
+{
+
+ s->cdw0 = le32toh(s->cdw0);
+ /* omit rsvd1 */
+ s->sqhd = le16toh(s->sqhd);
+ s->sqid = le16toh(s->sqid);
+ /* omit cid */
+ s->status = le16toh(s->status);
+}
+
+static inline
+void nvme_power_state_swapbytes(struct nvme_power_state *s)
+{
+
+ s->mp = le16toh(s->mp);
+ s->enlat = le32toh(s->enlat);
+ s->exlat = le32toh(s->exlat);
+ s->idlp = le16toh(s->idlp);
+ s->actp = le16toh(s->actp);
+}
+
+static inline
+void nvme_controller_data_swapbytes(struct nvme_controller_data *s)
+{
+ int i;
+
+ s->vid = le16toh(s->vid);
+ s->ssvid = le16toh(s->ssvid);
+ s->ctrlr_id = le16toh(s->ctrlr_id);
+ s->ver = le32toh(s->ver);
+ s->rtd3r = le32toh(s->rtd3r);
+ s->rtd3e = le32toh(s->rtd3e);
+ s->oaes = le32toh(s->oaes);
+ s->ctratt = le32toh(s->ctratt);
+ s->oacs = le16toh(s->oacs);
+ s->wctemp = le16toh(s->wctemp);
+ s->cctemp = le16toh(s->cctemp);
+ s->mtfa = le16toh(s->mtfa);
+ s->hmpre = le32toh(s->hmpre);
+ s->hmmin = le32toh(s->hmmin);
+ s->rpmbs =
le32toh(s->rpmbs); + s->edstt = le16toh(s->edstt); + s->kas = le16toh(s->kas); + s->hctma = le16toh(s->hctma); + s->mntmt = le16toh(s->mntmt); + s->mxtmt = le16toh(s->mxtmt); + s->sanicap = le32toh(s->sanicap); + s->maxcmd = le16toh(s->maxcmd); + s->nn = le32toh(s->nn); + s->oncs = le16toh(s->oncs); + s->fuses = le16toh(s->fuses); + s->awun = le16toh(s->awun); + s->awupf = le16toh(s->awupf); + s->acwu = le16toh(s->acwu); + s->sgls = le32toh(s->sgls); + for (i = 0; i < 32; i++) + nvme_power_state_swapbytes(&s->power_state[i]); +} + +static inline +void nvme_namespace_data_swapbytes(struct nvme_namespace_data *s) +{ + int i; + + s->nsze = le64toh(s->nsze); + s->ncap = le64toh(s->ncap); + s->nuse = le64toh(s->nuse); + s->nawun = le16toh(s->nawun); + s->nawupf = le16toh(s->nawupf); + s->nacwu = le16toh(s->nacwu); + s->nabsn = le16toh(s->nabsn); + s->nabo = le16toh(s->nabo); + s->nabspf = le16toh(s->nabspf); + s->noiob = le16toh(s->noiob); + for (i = 0; i < 16; i++) + s->lbaf[i] = le32toh(s->lbaf[i]); +} + +static inline +void nvme_error_information_entry_swapbytes(struct nvme_error_information_entry *s) +{ + + s->error_count = le64toh(s->error_count); + s->sqid = le16toh(s->sqid); + s->cid = le16toh(s->cid); + s->status = le16toh(s->status); + s->error_location = le16toh(s->error_location); + s->lba = le64toh(s->lba); + s->nsid = le32toh(s->nsid); +} + +static inline +void nvme_le128toh(void *p) +{ + /* + * Upstream, this uses the following comparison: + * #if _BYTE_ORDER != _LITTLE_ENDIAN + * + * Rather than keep this file in compat with only that little bit + * changed, we'll just float a little patch here for now. + */ +#ifndef _LITTLE_ENDIAN + /* Swap 16 bytes in place */ + char *tmp = (char*)p; + char b; + int i; + for (i = 0; i < 8; i++) { + b = tmp[i]; + tmp[i] = tmp[15-i]; + tmp[15-i] = b; + } +#else + (void)p; +#endif +} + +static inline +void nvme_health_information_page_swapbytes(struct nvme_health_information_page *s) +{ + int i; + + s->temperature = le16toh(s->temperature); + nvme_le128toh((void *)s->data_units_read); + nvme_le128toh((void *)s->data_units_written); + nvme_le128toh((void *)s->host_read_commands); + nvme_le128toh((void *)s->host_write_commands); + nvme_le128toh((void *)s->controller_busy_time); + nvme_le128toh((void *)s->power_cycles); + nvme_le128toh((void *)s->power_on_hours); + nvme_le128toh((void *)s->unsafe_shutdowns); + nvme_le128toh((void *)s->media_errors); + nvme_le128toh((void *)s->num_error_info_log_entries); + s->warning_temp_time = le32toh(s->warning_temp_time); + s->error_temp_time = le32toh(s->error_temp_time); + for (i = 0; i < 8; i++) + s->temp_sensor[i] = le16toh(s->temp_sensor[i]); +} + + +static inline +void nvme_firmware_page_swapbytes(struct nvme_firmware_page *s) +{ + int i; + + for (i = 0; i < 7; i++) + s->revision[i] = le64toh(s->revision[i]); +} + +static inline +void nvme_ns_list_swapbytes(struct nvme_ns_list *s) +{ + int i; + + for (i = 0; i < 1024; i++) + s->ns[i] = le32toh(s->ns[i]); +} + +static inline +void intel_log_temp_stats_swapbytes(struct intel_log_temp_stats *s) +{ + + s->current = le64toh(s->current); + s->overtemp_flag_last = le64toh(s->overtemp_flag_last); + s->overtemp_flag_life = le64toh(s->overtemp_flag_life); + s->max_temp = le64toh(s->max_temp); + s->min_temp = le64toh(s->min_temp); + /* omit _rsvd[] */ + s->max_oper_temp = le64toh(s->max_oper_temp); + s->min_oper_temp = le64toh(s->min_oper_temp); + s->est_offset = le64toh(s->est_offset); +} + +#endif /* __NVME_H__ */ diff --git 
a/usr/contrib/freebsd/dev/usb/controller/xhcireg.h b/usr/contrib/freebsd/dev/usb/controller/xhcireg.h new file mode 100644 index 0000000000..0e588ecba3 --- /dev/null +++ b/usr/contrib/freebsd/dev/usb/controller/xhcireg.h @@ -0,0 +1,224 @@ +/* $FreeBSD$ */ + +/*- + * Copyright (c) 2010 Hans Petter Selasky. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _XHCIREG_H_ +#define _XHCIREG_H_ + +/* XHCI PCI config registers */ +#define PCI_XHCI_CBMEM 0x10 /* configuration base MEM */ +#define PCI_XHCI_USBREV 0x60 /* RO USB protocol revision */ +#define PCI_USB_REV_3_0 0x30 /* USB 3.0 */ +#define PCI_XHCI_FLADJ 0x61 /* RW frame length adjust */ + +#define PCI_XHCI_INTEL_XUSB2PR 0xD0 /* Intel USB2 Port Routing */ +#define PCI_XHCI_INTEL_USB2PRM 0xD4 /* Intel USB2 Port Routing Mask */ +#define PCI_XHCI_INTEL_USB3_PSSEN 0xD8 /* Intel USB3 Port SuperSpeed Enable */ +#define PCI_XHCI_INTEL_USB3PRM 0xDC /* Intel USB3 Port Routing Mask */ + +/* XHCI capability registers */ +#define XHCI_CAPLENGTH 0x00 /* RO capability */ +#define XHCI_RESERVED 0x01 /* Reserved */ +#define XHCI_HCIVERSION 0x02 /* RO Interface version number */ +#define XHCI_HCIVERSION_0_9 0x0090 /* xHCI version 0.9 */ +#define XHCI_HCIVERSION_1_0 0x0100 /* xHCI version 1.0 */ +#define XHCI_HCSPARAMS1 0x04 /* RO structural parameters 1 */ +#define XHCI_HCS1_DEVSLOT_MAX(x)((x) & 0xFF) +#define XHCI_HCS1_IRQ_MAX(x) (((x) >> 8) & 0x3FF) +#define XHCI_HCS1_N_PORTS(x) (((x) >> 24) & 0xFF) +#define XHCI_HCSPARAMS2 0x08 /* RO structural parameters 2 */ +#define XHCI_HCS2_IST(x) ((x) & 0xF) +#define XHCI_HCS2_ERST_MAX(x) (((x) >> 4) & 0xF) +#define XHCI_HCS2_SPR(x) (((x) >> 26) & 0x1) +#define XHCI_HCS2_SPB_MAX(x) ((((x) >> 16) & 0x3E0) | (((x) >> 27) & 0x1F)) +#define XHCI_HCSPARAMS3 0x0C /* RO structural parameters 3 */ +#define XHCI_HCS3_U1_DEL(x) ((x) & 0xFF) +#define XHCI_HCS3_U2_DEL(x) (((x) >> 16) & 0xFFFF) +#define XHCI_HCSPARAMS0 0x10 /* RO capability parameters */ +#define XHCI_HCS0_AC64(x) ((x) & 0x1) /* 64-bit capable */ +#define XHCI_HCS0_BNC(x) (((x) >> 1) & 0x1) /* BW negotiation */ +#define XHCI_HCS0_CSZ(x) (((x) >> 2) & 0x1) /* context size */ +#define XHCI_HCS0_PPC(x) (((x) >> 3) & 0x1) /* port power control */ +#define XHCI_HCS0_PIND(x) (((x) >> 4) & 0x1) /* port 
indicators */ +#define XHCI_HCS0_LHRC(x) (((x) >> 5) & 0x1) /* light HC reset */ +#define XHCI_HCS0_LTC(x) (((x) >> 6) & 0x1) /* latency tolerance msg */ +#define XHCI_HCS0_NSS(x) (((x) >> 7) & 0x1) /* no secondary sid */ +#define XHCI_HCS0_PSA_SZ_MAX(x) (((x) >> 12) & 0xF) /* max pri. stream array size */ +#define XHCI_HCS0_XECP(x) (((x) >> 16) & 0xFFFF) /* extended capabilities pointer */ +#define XHCI_DBOFF 0x14 /* RO doorbell offset */ +#define XHCI_RTSOFF 0x18 /* RO runtime register space offset */ + +/* XHCI operational registers. Offset given by XHCI_CAPLENGTH register */ +#define XHCI_USBCMD 0x00 /* XHCI command */ +#define XHCI_CMD_RS 0x00000001 /* RW Run/Stop */ +#define XHCI_CMD_HCRST 0x00000002 /* RW Host Controller Reset */ +#define XHCI_CMD_INTE 0x00000004 /* RW Interrupter Enable */ +#define XHCI_CMD_HSEE 0x00000008 /* RW Host System Error Enable */ +#define XHCI_CMD_LHCRST 0x00000080 /* RO/RW Light Host Controller Reset */ +#define XHCI_CMD_CSS 0x00000100 /* RW Controller Save State */ +#define XHCI_CMD_CRS 0x00000200 /* RW Controller Restore State */ +#define XHCI_CMD_EWE 0x00000400 /* RW Enable Wrap Event */ +#define XHCI_CMD_EU3S 0x00000800 /* RW Enable U3 MFINDEX Stop */ +#define XHCI_USBSTS 0x04 /* XHCI status */ +#define XHCI_STS_HCH 0x00000001 /* RO - Host Controller Halted */ +#define XHCI_STS_HSE 0x00000004 /* RW - Host System Error */ +#define XHCI_STS_EINT 0x00000008 /* RW - Event Interrupt */ +#define XHCI_STS_PCD 0x00000010 /* RW - Port Change Detect */ +#define XHCI_STS_SSS 0x00000100 /* RO - Save State Status */ +#define XHCI_STS_RSS 0x00000200 /* RO - Restore State Status */ +#define XHCI_STS_SRE 0x00000400 /* RW - Save/Restore Error */ +#define XHCI_STS_CNR 0x00000800 /* RO - Controller Not Ready */ +#define XHCI_STS_HCE 0x00001000 /* RO - Host Controller Error */ +#define XHCI_PAGESIZE 0x08 /* XHCI page size mask */ +#define XHCI_PAGESIZE_4K 0x00000001 /* 4K Page Size */ +#define XHCI_PAGESIZE_8K 0x00000002 /* 8K Page Size */ +#define XHCI_PAGESIZE_16K 0x00000004 /* 16K Page Size */ +#define XHCI_PAGESIZE_32K 0x00000008 /* 32K Page Size */ +#define XHCI_PAGESIZE_64K 0x00000010 /* 64K Page Size */ +#define XHCI_DNCTRL 0x14 /* XHCI device notification control */ +#define XHCI_DNCTRL_MASK(n) (1U << (n)) +#define XHCI_CRCR_LO 0x18 /* XHCI command ring control */ +#define XHCI_CRCR_LO_RCS 0x00000001 /* RW - consumer cycle state */ +#define XHCI_CRCR_LO_CS 0x00000002 /* RW - command stop */ +#define XHCI_CRCR_LO_CA 0x00000004 /* RW - command abort */ +#define XHCI_CRCR_LO_CRR 0x00000008 /* RW - command ring running */ +#define XHCI_CRCR_LO_MASK 0x0000000F +#define XHCI_CRCR_HI 0x1C /* XHCI command ring control */ +#define XHCI_DCBAAP_LO 0x30 /* XHCI dev context BA pointer */ +#define XHCI_DCBAAP_HI 0x34 /* XHCI dev context BA pointer */ +#define XHCI_CONFIG 0x38 +#define XHCI_CONFIG_SLOTS_MASK 0x000000FF /* RW - number of device slots enabled */ + +/* XHCI port status registers */ +#define XHCI_PORTSC(n) (0x3F0 + (0x10 * (n))) /* XHCI port status */ +#define XHCI_PS_CCS 0x00000001 /* RO - current connect status */ +#define XHCI_PS_PED 0x00000002 /* RW - port enabled / disabled */ +#define XHCI_PS_OCA 0x00000008 /* RO - over current active */ +#define XHCI_PS_PR 0x00000010 /* RW - port reset */ +#define XHCI_PS_PLS_GET(x) (((x) >> 5) & 0xF) /* RW - port link state */ +#define XHCI_PS_PLS_SET(x) (((x) & 0xF) << 5) /* RW - port link state */ +#define XHCI_PS_PP 0x00000200 /* RW - port power */ +#define XHCI_PS_SPEED_GET(x) (((x) >> 10) & 0xF) /* RO - port speed */ 
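/*
 * Usage sketch (editor's illustration, not part of the imported header):
 * decoding a PORTSC value with the definitions above.  xhci_read_4() is
 * a hypothetical accessor standing in for whatever register-read routine
 * the consumer provides; sc and port are likewise placeholders.  Note
 * that the change bits (XHCI_PS_CSC through XHCI_PS_CEC below) are
 * write-1-to-clear in real hardware.
 *
 *	uint32_t portsc = xhci_read_4(sc, XHCI_PORTSC(port));
 *	int connected = (portsc & XHCI_PS_CCS) != 0;
 *	int enabled = (portsc & XHCI_PS_PED) != 0;
 *	uint8_t link_state = XHCI_PS_PLS_GET(portsc);
 *	uint8_t speed = XHCI_PS_SPEED_GET(portsc);
 */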
+#define XHCI_PS_PIC_GET(x) (((x) >> 14) & 0x3) /* RW - port indicator */
+#define XHCI_PS_PIC_SET(x) (((x) & 0x3) << 14) /* RW - port indicator */
+#define XHCI_PS_LWS 0x00010000 /* RW - port link state write strobe */
+#define XHCI_PS_CSC 0x00020000 /* RW - connect status change */
+#define XHCI_PS_PEC 0x00040000 /* RW - port enable/disable change */
+#define XHCI_PS_WRC 0x00080000 /* RW - warm port reset change */
+#define XHCI_PS_OCC 0x00100000 /* RW - over-current change */
+#define XHCI_PS_PRC 0x00200000 /* RW - port reset change */
+#define XHCI_PS_PLC 0x00400000 /* RW - port link state change */
+#define XHCI_PS_CEC 0x00800000 /* RW - config error change */
+#define XHCI_PS_CAS 0x01000000 /* RO - cold attach status */
+#define XHCI_PS_WCE 0x02000000 /* RW - wake on connect enable */
+#define XHCI_PS_WDE 0x04000000 /* RW - wake on disconnect enable */
+#define XHCI_PS_WOE 0x08000000 /* RW - wake on over-current enable */
+#define XHCI_PS_DR 0x40000000 /* RO - device removable */
+#define XHCI_PS_WPR 0x80000000U /* RW - warm port reset */
+#define XHCI_PS_CLEAR 0x80FF01FFU /* command bits */
+
+#define XHCI_PORTPMSC(n) (0x3F4 + (0x10 * (n))) /* XHCI status and control */
+#define XHCI_PM3_U1TO_GET(x) (((x) >> 0) & 0xFF) /* RW - U1 timeout */
+#define XHCI_PM3_U1TO_SET(x) (((x) & 0xFF) << 0) /* RW - U1 timeout */
+#define XHCI_PM3_U2TO_GET(x) (((x) >> 8) & 0xFF) /* RW - U2 timeout */
+#define XHCI_PM3_U2TO_SET(x) (((x) & 0xFF) << 8) /* RW - U2 timeout */
+#define XHCI_PM3_FLA 0x00010000 /* RW - Force Link PM Accept */
+#define XHCI_PM2_L1S_GET(x) (((x) >> 0) & 0x7) /* RO - L1 status */
+#define XHCI_PM2_RWE 0x00000008 /* RW - remote wakeup enable */
+#define XHCI_PM2_HIRD_GET(x) (((x) >> 4) & 0xF) /* RW - host initiated resume duration */
+#define XHCI_PM2_HIRD_SET(x) (((x) & 0xF) << 4) /* RW - host initiated resume duration */
+#define XHCI_PM2_L1SLOT_GET(x) (((x) >> 8) & 0xFF) /* RW - L1 device slot */
+#define XHCI_PM2_L1SLOT_SET(x) (((x) & 0xFF) << 8) /* RW - L1 device slot */
+#define XHCI_PM2_HLE 0x00010000 /* RW - hardware LPM enable */
+#define XHCI_PORTLI(n) (0x3F8 + (0x10 * (n))) /* XHCI port link info */
+#define XHCI_PLI3_ERR_GET(x) (((x) >> 0) & 0xFFFF) /* RO - port link errors */
+#define XHCI_PORTRSV(n) (0x3FC + (0x10 * (n))) /* XHCI port reserved */
+
+/* XHCI runtime registers.
Offset given by XHCI_CAPLENGTH + XHCI_RTSOFF registers */ +#define XHCI_MFINDEX 0x0000 /* RO - microframe index */ +#define XHCI_MFINDEX_GET(x) ((x) & 0x3FFF) +#define XHCI_IMAN(n) (0x0020 + (0x20 * (n))) /* XHCI interrupt management */ +#define XHCI_IMAN_INTR_PEND 0x00000001 /* RW - interrupt pending */ +#define XHCI_IMAN_INTR_ENA 0x00000002 /* RW - interrupt enable */ +#define XHCI_IMOD(n) (0x0024 + (0x20 * (n))) /* XHCI interrupt moderation */ +#define XHCI_IMOD_IVAL_GET(x) (((x) >> 0) & 0xFFFF) /* 250ns unit */ +#define XHCI_IMOD_IVAL_SET(x) (((x) & 0xFFFF) << 0) /* 250ns unit */ +#define XHCI_IMOD_ICNT_GET(x) (((x) >> 16) & 0xFFFF) /* 250ns unit */ +#define XHCI_IMOD_ICNT_SET(x) (((x) & 0xFFFF) << 16) /* 250ns unit */ +#define XHCI_IMOD_DEFAULT 0x000001F4U /* 8000 IRQs/second */ +#define XHCI_IMOD_DEFAULT_LP 0x000003F8U /* 4000 IRQs/second - LynxPoint */ +#define XHCI_ERSTSZ(n) (0x0028 + (0x20 * (n))) /* XHCI event ring segment table size */ +#define XHCI_ERSTS_GET(x) ((x) & 0xFFFF) +#define XHCI_ERSTS_SET(x) ((x) & 0xFFFF) +#define XHCI_ERSTBA_LO(n) (0x0030 + (0x20 * (n))) /* XHCI event ring segment table BA */ +#define XHCI_ERSTBA_HI(n) (0x0034 + (0x20 * (n))) /* XHCI event ring segment table BA */ +#define XHCI_ERDP_LO(n) (0x0038 + (0x20 * (n))) /* XHCI event ring dequeue pointer */ +#define XHCI_ERDP_LO_SINDEX(x) ((x) & 0x7) /* RO - dequeue segment index */ +#define XHCI_ERDP_LO_BUSY 0x00000008 /* RW - event handler busy */ +#define XHCI_ERDP_HI(n) (0x003C + (0x20 * (n))) /* XHCI event ring dequeue pointer */ + +/* XHCI doorbell registers. Offset given by XHCI_CAPLENGTH + XHCI_DBOFF registers */ +#define XHCI_DOORBELL(n) (0x0000 + (4 * (n))) +#define XHCI_DB_TARGET_GET(x) ((x) & 0xFF) /* RW - doorbell target */ +#define XHCI_DB_TARGET_SET(x) ((x) & 0xFF) /* RW - doorbell target */ +#define XHCI_DB_SID_GET(x) (((x) >> 16) & 0xFFFF) /* RW - doorbell stream ID */ +#define XHCI_DB_SID_SET(x) (((x) & 0xFFFF) << 16) /* RW - doorbell stream ID */ + +/* XHCI legacy support */ +#define XHCI_XECP_ID(x) ((x) & 0xFF) +#define XHCI_XECP_NEXT(x) (((x) >> 8) & 0xFF) +#define XHCI_XECP_BIOS_SEM 0x0002 +#define XHCI_XECP_OS_SEM 0x0003 + +/* XHCI capability ID's */ +#define XHCI_ID_USB_LEGACY 0x0001 +#define XHCI_ID_PROTOCOLS 0x0002 +#define XHCI_ID_POWER_MGMT 0x0003 +#define XHCI_ID_VIRTUALIZATION 0x0004 +#define XHCI_ID_MSG_IRQ 0x0005 +#define XHCI_ID_USB_LOCAL_MEM 0x0006 + +/* XHCI register R/W wrappers */ +#define XREAD1(sc, what, a) \ + bus_space_read_1((sc)->sc_io_tag, (sc)->sc_io_hdl, \ + (a) + (sc)->sc_##what##_off) +#define XREAD2(sc, what, a) \ + bus_space_read_2((sc)->sc_io_tag, (sc)->sc_io_hdl, \ + (a) + (sc)->sc_##what##_off) +#define XREAD4(sc, what, a) \ + bus_space_read_4((sc)->sc_io_tag, (sc)->sc_io_hdl, \ + (a) + (sc)->sc_##what##_off) +#define XWRITE1(sc, what, a, x) \ + bus_space_write_1((sc)->sc_io_tag, (sc)->sc_io_hdl, \ + (a) + (sc)->sc_##what##_off, (x)) +#define XWRITE2(sc, what, a, x) \ + bus_space_write_2((sc)->sc_io_tag, (sc)->sc_io_hdl, \ + (a) + (sc)->sc_##what##_off, (x)) +#define XWRITE4(sc, what, a, x) \ + bus_space_write_4((sc)->sc_io_tag, (sc)->sc_io_hdl, \ + (a) + (sc)->sc_##what##_off, (x)) + +#endif /* _XHCIREG_H_ */ diff --git a/usr/contrib/freebsd/dev/usb/usb.h b/usr/contrib/freebsd/dev/usb/usb.h new file mode 100644 index 0000000000..bcea2ac8bd --- /dev/null +++ b/usr/contrib/freebsd/dev/usb/usb.h @@ -0,0 +1,801 @@ +/* $FreeBSD$ */ +/*- + * Copyright (c) 2008 Hans Petter Selasky. All rights reserved. + * Copyright (c) 1998 The NetBSD Foundation, Inc. 
All rights reserved. + * Copyright (c) 1998 Lennart Augustsson. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * This file contains standard definitions for the following USB + * protocol versions: + * + * USB v1.0 + * USB v1.1 + * USB v2.0 + * USB v3.0 + */ + +#ifndef _USB_STANDARD_H_ +#define _USB_STANDARD_H_ + +#if defined(_KERNEL) +#ifndef USB_GLOBAL_INCLUDE_FILE +#include "opt_usb.h" +#endif + +/* Declare parent SYSCTL USB node. */ +#ifdef SYSCTL_DECL +SYSCTL_DECL(_hw_usb); +#endif + +#ifndef USB_GLOBAL_INCLUDE_FILE +#include +#endif + +MALLOC_DECLARE(M_USB); +MALLOC_DECLARE(M_USBDEV); +#endif /* _KERNEL */ + +#ifndef USB_GLOBAL_INCLUDE_FILE +#include +#include +#endif + +#define USB_STACK_VERSION 2000 /* 2.0 */ + +/* Definition of some hardcoded USB constants. */ + +#define USB_MAX_IPACKET 8 /* initial USB packet size */ +#define USB_EP_MAX (2*16) /* hardcoded */ +#define USB_ROOT_HUB_ADDR 1 /* index */ +#define USB_MIN_DEVICES 2 /* unused + root HUB */ +#define USB_UNCONFIG_INDEX 0xFF /* internal use only */ +#define USB_IFACE_INDEX_ANY 0xFF /* internal use only */ +#define USB_START_ADDR 0 /* default USB device BUS address + * after USB bus reset */ +#define USB_CONTROL_ENDPOINT 0 /* default control endpoint */ + +#define USB_FRAMES_PER_SECOND_FS 1000 /* full speed */ +#define USB_FRAMES_PER_SECOND_HS 8000 /* high speed */ + +#define USB_FS_BYTES_PER_HS_UFRAME 188 /* bytes */ +#define USB_HS_MICRO_FRAMES_MAX 8 /* units */ + +#define USB_ISOC_TIME_MAX 128 /* ms */ + +/* + * Minimum time a device needs to be powered down to go through a + * power cycle. These values are not in the USB specification. + */ +#define USB_POWER_DOWN_TIME 200 /* ms */ +#define USB_PORT_POWER_DOWN_TIME 100 /* ms */ + +/* Definition of software USB power modes */ +#define USB_POWER_MODE_OFF 0 /* turn off device */ +#define USB_POWER_MODE_ON 1 /* always on */ +#define USB_POWER_MODE_SAVE 2 /* automatic suspend and resume */ +#define USB_POWER_MODE_SUSPEND 3 /* force suspend */ +#define USB_POWER_MODE_RESUME 4 /* force resume */ + +/* These are the values from the USB specification. 
*/ +#define USB_PORT_RESET_DELAY_SPEC 10 /* ms */ +#define USB_PORT_ROOT_RESET_DELAY_SPEC 50 /* ms */ +#define USB_PORT_RESET_RECOVERY_SPEC 10 /* ms */ +#define USB_PORT_POWERUP_DELAY_SPEC 100 /* ms */ +#define USB_PORT_RESUME_DELAY_SPEC 20 /* ms */ +#define USB_SET_ADDRESS_SETTLE_SPEC 2 /* ms */ +#define USB_RESUME_DELAY_SPEC (20*5) /* ms */ +#define USB_RESUME_WAIT_SPEC 10 /* ms */ +#define USB_RESUME_RECOVERY_SPEC 10 /* ms */ +#define USB_EXTRA_POWER_UP_TIME_SPEC 0 /* ms */ + +/* Allow for marginal and non-conforming devices. */ +#define USB_PORT_RESET_DELAY 50 /* ms */ +#define USB_PORT_ROOT_RESET_DELAY 200 /* ms */ +#define USB_PORT_RESET_RECOVERY 250 /* ms */ +#define USB_PORT_POWERUP_DELAY 300 /* ms */ +#define USB_PORT_RESUME_DELAY (20*2) /* ms */ +#define USB_SET_ADDRESS_SETTLE 10 /* ms */ +#define USB_RESUME_DELAY (50*5) /* ms */ +#define USB_RESUME_WAIT 50 /* ms */ +#define USB_RESUME_RECOVERY 50 /* ms */ +#define USB_EXTRA_POWER_UP_TIME 20 /* ms */ + +#define USB_MIN_POWER 100 /* mA */ +#define USB_MAX_POWER 500 /* mA */ + +#define USB_BUS_RESET_DELAY 100 /* ms */ + +/* + * USB record layout in memory: + * + * - USB config 0 + * - USB interfaces + * - USB alternative interfaces + * - USB endpoints + * + * - USB config 1 + * - USB interfaces + * - USB alternative interfaces + * - USB endpoints + */ + +/* Declaration of USB records */ + +struct usb_device_request { + uByte bmRequestType; + uByte bRequest; + uWord wValue; + uWord wIndex; + uWord wLength; +} __packed; +typedef struct usb_device_request usb_device_request_t; + +#define UT_WRITE 0x00 +#define UT_READ 0x80 +#define UT_STANDARD 0x00 +#define UT_CLASS 0x20 +#define UT_VENDOR 0x40 +#define UT_DEVICE 0x00 +#define UT_INTERFACE 0x01 +#define UT_ENDPOINT 0x02 +#define UT_OTHER 0x03 + +#define UT_READ_DEVICE (UT_READ | UT_STANDARD | UT_DEVICE) +#define UT_READ_INTERFACE (UT_READ | UT_STANDARD | UT_INTERFACE) +#define UT_READ_ENDPOINT (UT_READ | UT_STANDARD | UT_ENDPOINT) +#define UT_WRITE_DEVICE (UT_WRITE | UT_STANDARD | UT_DEVICE) +#define UT_WRITE_INTERFACE (UT_WRITE | UT_STANDARD | UT_INTERFACE) +#define UT_WRITE_ENDPOINT (UT_WRITE | UT_STANDARD | UT_ENDPOINT) +#define UT_READ_CLASS_DEVICE (UT_READ | UT_CLASS | UT_DEVICE) +#define UT_READ_CLASS_INTERFACE (UT_READ | UT_CLASS | UT_INTERFACE) +#define UT_READ_CLASS_OTHER (UT_READ | UT_CLASS | UT_OTHER) +#define UT_READ_CLASS_ENDPOINT (UT_READ | UT_CLASS | UT_ENDPOINT) +#define UT_WRITE_CLASS_DEVICE (UT_WRITE | UT_CLASS | UT_DEVICE) +#define UT_WRITE_CLASS_INTERFACE (UT_WRITE | UT_CLASS | UT_INTERFACE) +#define UT_WRITE_CLASS_OTHER (UT_WRITE | UT_CLASS | UT_OTHER) +#define UT_WRITE_CLASS_ENDPOINT (UT_WRITE | UT_CLASS | UT_ENDPOINT) +#define UT_READ_VENDOR_DEVICE (UT_READ | UT_VENDOR | UT_DEVICE) +#define UT_READ_VENDOR_INTERFACE (UT_READ | UT_VENDOR | UT_INTERFACE) +#define UT_READ_VENDOR_OTHER (UT_READ | UT_VENDOR | UT_OTHER) +#define UT_READ_VENDOR_ENDPOINT (UT_READ | UT_VENDOR | UT_ENDPOINT) +#define UT_WRITE_VENDOR_DEVICE (UT_WRITE | UT_VENDOR | UT_DEVICE) +#define UT_WRITE_VENDOR_INTERFACE (UT_WRITE | UT_VENDOR | UT_INTERFACE) +#define UT_WRITE_VENDOR_OTHER (UT_WRITE | UT_VENDOR | UT_OTHER) +#define UT_WRITE_VENDOR_ENDPOINT (UT_WRITE | UT_VENDOR | UT_ENDPOINT) + +/* Requests */ +#define UR_GET_STATUS 0x00 +#define UR_CLEAR_FEATURE 0x01 +#define UR_SET_FEATURE 0x03 +#define UR_SET_ADDRESS 0x05 +#define UR_GET_DESCRIPTOR 0x06 +#define UDESC_DEVICE 0x01 +#define UDESC_CONFIG 0x02 +#define UDESC_STRING 0x03 +#define USB_LANGUAGE_TABLE 0x00 /* language ID string index */ 
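/*
 * Usage sketch (editor's illustration, not part of the imported header):
 * the first control transfer of enumeration typically fetches the
 * initial bytes of the device descriptor with GET_DESCRIPTOR; wValue
 * carries the descriptor type in its high byte and the index in its low
 * byte, and USB_MAX_IPACKET above is the safe initial request length.
 * USETW()/USETW2() are the little-endian uWord store helpers from
 * FreeBSD's usb_endian.h, which supplies the uWord/uByte types used
 * throughout this file.
 *
 *	struct usb_device_request req;
 *
 *	req.bmRequestType = UT_READ_DEVICE;
 *	req.bRequest = UR_GET_DESCRIPTOR;
 *	USETW2(req.wValue, UDESC_DEVICE, 0);
 *	USETW(req.wIndex, 0);
 *	USETW(req.wLength, USB_MAX_IPACKET);
 */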
+#define UDESC_INTERFACE 0x04 +#define UDESC_ENDPOINT 0x05 +#define UDESC_DEVICE_QUALIFIER 0x06 +#define UDESC_OTHER_SPEED_CONFIGURATION 0x07 +#define UDESC_INTERFACE_POWER 0x08 +#define UDESC_OTG 0x09 +#define UDESC_DEBUG 0x0A +#define UDESC_IFACE_ASSOC 0x0B /* interface association */ +#define UDESC_BOS 0x0F /* binary object store */ +#define UDESC_DEVICE_CAPABILITY 0x10 +#define UDESC_CS_DEVICE 0x21 /* class specific */ +#define UDESC_CS_CONFIG 0x22 +#define UDESC_CS_STRING 0x23 +#define UDESC_CS_INTERFACE 0x24 +#define UDESC_CS_ENDPOINT 0x25 +#define UDESC_HUB 0x29 +#define UDESC_SS_HUB 0x2A /* super speed */ +#define UDESC_ENDPOINT_SS_COMP 0x30 /* super speed */ +#define UR_SET_DESCRIPTOR 0x07 +#define UR_GET_CONFIG 0x08 +#define UR_SET_CONFIG 0x09 +#define UR_GET_INTERFACE 0x0a +#define UR_SET_INTERFACE 0x0b +#define UR_SYNCH_FRAME 0x0c +#define UR_SET_SEL 0x30 +#define UR_ISOCH_DELAY 0x31 + +/* HUB specific request */ +#define UR_GET_BUS_STATE 0x02 +#define UR_CLEAR_TT_BUFFER 0x08 +#define UR_RESET_TT 0x09 +#define UR_GET_TT_STATE 0x0a +#define UR_STOP_TT 0x0b +#define UR_SET_AND_TEST 0x0c /* USB 2.0 only */ +#define UR_SET_HUB_DEPTH 0x0c /* USB 3.0 only */ +#define USB_SS_HUB_DEPTH_MAX 5 +#define UR_GET_PORT_ERR_COUNT 0x0d + +/* Feature numbers */ +#define UF_ENDPOINT_HALT 0 +#define UF_DEVICE_REMOTE_WAKEUP 1 +#define UF_TEST_MODE 2 +#define UF_U1_ENABLE 0x30 +#define UF_U2_ENABLE 0x31 +#define UF_LTM_ENABLE 0x32 + +/* HUB specific features */ +#define UHF_C_HUB_LOCAL_POWER 0 +#define UHF_C_HUB_OVER_CURRENT 1 +#define UHF_PORT_CONNECTION 0 +#define UHF_PORT_ENABLE 1 +#define UHF_PORT_SUSPEND 2 +#define UHF_PORT_OVER_CURRENT 3 +#define UHF_PORT_RESET 4 +#define UHF_PORT_LINK_STATE 5 +#define UHF_PORT_POWER 8 +#define UHF_PORT_LOW_SPEED 9 +#define UHF_PORT_L1 10 +#define UHF_C_PORT_CONNECTION 16 +#define UHF_C_PORT_ENABLE 17 +#define UHF_C_PORT_SUSPEND 18 +#define UHF_C_PORT_OVER_CURRENT 19 +#define UHF_C_PORT_RESET 20 +#define UHF_PORT_TEST 21 +#define UHF_PORT_INDICATOR 22 +#define UHF_C_PORT_L1 23 + +/* SuperSpeed HUB specific features */ +#define UHF_PORT_U1_TIMEOUT 23 +#define UHF_PORT_U2_TIMEOUT 24 +#define UHF_C_PORT_LINK_STATE 25 +#define UHF_C_PORT_CONFIG_ERROR 26 +#define UHF_PORT_REMOTE_WAKE_MASK 27 +#define UHF_BH_PORT_RESET 28 +#define UHF_C_BH_PORT_RESET 29 +#define UHF_FORCE_LINKPM_ACCEPT 30 + +struct usb_descriptor { + uByte bLength; + uByte bDescriptorType; + uByte bDescriptorSubtype; +} __packed; +typedef struct usb_descriptor usb_descriptor_t; + +struct usb_device_descriptor { + uByte bLength; + uByte bDescriptorType; + uWord bcdUSB; +#define UD_USB_2_0 0x0200 +#define UD_USB_3_0 0x0300 +#define UD_IS_USB2(d) ((d)->bcdUSB[1] == 0x02) +#define UD_IS_USB3(d) ((d)->bcdUSB[1] == 0x03) + uByte bDeviceClass; + uByte bDeviceSubClass; + uByte bDeviceProtocol; + uByte bMaxPacketSize; + /* The fields below are not part of the initial descriptor. 
*/ + uWord idVendor; + uWord idProduct; + uWord bcdDevice; + uByte iManufacturer; + uByte iProduct; + uByte iSerialNumber; + uByte bNumConfigurations; +} __packed; +typedef struct usb_device_descriptor usb_device_descriptor_t; + +/* Binary Device Object Store (BOS) */ +struct usb_bos_descriptor { + uByte bLength; + uByte bDescriptorType; + uWord wTotalLength; + uByte bNumDeviceCaps; +} __packed; +typedef struct usb_bos_descriptor usb_bos_descriptor_t; + +/* Binary Device Object Store Capability */ +struct usb_bos_cap_descriptor { + uByte bLength; + uByte bDescriptorType; + uByte bDevCapabilityType; +#define USB_DEVCAP_RESERVED 0x00 +#define USB_DEVCAP_WUSB 0x01 +#define USB_DEVCAP_USB2EXT 0x02 +#define USB_DEVCAP_SUPER_SPEED 0x03 +#define USB_DEVCAP_CONTAINER_ID 0x04 + /* data ... */ +} __packed; +typedef struct usb_bos_cap_descriptor usb_bos_cap_descriptor_t; + +struct usb_devcap_usb2ext_descriptor { + uByte bLength; + uByte bDescriptorType; + uByte bDevCapabilityType; + uDWord bmAttributes; +#define USB_V2EXT_LPM (1U << 1) +#define USB_V2EXT_BESL_SUPPORTED (1U << 2) +#define USB_V2EXT_BESL_BASELINE_VALID (1U << 3) +#define USB_V2EXT_BESL_DEEP_VALID (1U << 4) +#define USB_V2EXT_BESL_BASELINE_GET(x) (((x) >> 8) & 0xF) +#define USB_V2EXT_BESL_DEEP_GET(x) (((x) >> 12) & 0xF) +} __packed; +typedef struct usb_devcap_usb2ext_descriptor usb_devcap_usb2ext_descriptor_t; + +struct usb_devcap_ss_descriptor { + uByte bLength; + uByte bDescriptorType; + uByte bDevCapabilityType; + uByte bmAttributes; + uWord wSpeedsSupported; + uByte bFunctionalitySupport; + uByte bU1DevExitLat; + uWord wU2DevExitLat; +} __packed; +typedef struct usb_devcap_ss_descriptor usb_devcap_ss_descriptor_t; + +struct usb_devcap_container_id_descriptor { + uByte bLength; + uByte bDescriptorType; + uByte bDevCapabilityType; + uByte bReserved; + uByte bContainerID; +} __packed; +typedef struct usb_devcap_container_id_descriptor + usb_devcap_container_id_descriptor_t; + +/* Device class codes */ +#define UDCLASS_IN_INTERFACE 0x00 +#define UDCLASS_COMM 0x02 +#define UDCLASS_HUB 0x09 +#define UDSUBCLASS_HUB 0x00 +#define UDPROTO_FSHUB 0x00 +#define UDPROTO_HSHUBSTT 0x01 +#define UDPROTO_HSHUBMTT 0x02 +#define UDPROTO_SSHUB 0x03 +#define UDCLASS_DIAGNOSTIC 0xdc +#define UDCLASS_WIRELESS 0xe0 +#define UDSUBCLASS_RF 0x01 +#define UDPROTO_BLUETOOTH 0x01 +#define UDCLASS_VENDOR 0xff + +struct usb_config_descriptor { + uByte bLength; + uByte bDescriptorType; + uWord wTotalLength; + uByte bNumInterface; + uByte bConfigurationValue; +#define USB_UNCONFIG_NO 0 + uByte iConfiguration; + uByte bmAttributes; +#define UC_BUS_POWERED 0x80 +#define UC_SELF_POWERED 0x40 +#define UC_REMOTE_WAKEUP 0x20 + uByte bMaxPower; /* max current in 2 mA units */ +#define UC_POWER_FACTOR 2 +} __packed; +typedef struct usb_config_descriptor usb_config_descriptor_t; + +struct usb_interface_descriptor { + uByte bLength; + uByte bDescriptorType; + uByte bInterfaceNumber; + uByte bAlternateSetting; + uByte bNumEndpoints; + uByte bInterfaceClass; + uByte bInterfaceSubClass; + uByte bInterfaceProtocol; + uByte iInterface; +} __packed; +typedef struct usb_interface_descriptor usb_interface_descriptor_t; + +struct usb_interface_assoc_descriptor { + uByte bLength; + uByte bDescriptorType; + uByte bFirstInterface; + uByte bInterfaceCount; + uByte bFunctionClass; + uByte bFunctionSubClass; + uByte bFunctionProtocol; + uByte iFunction; +} __packed; +typedef struct usb_interface_assoc_descriptor usb_interface_assoc_descriptor_t; + +/* Interface class codes */ +#define 
UICLASS_UNSPEC 0x00 +#define UICLASS_AUDIO 0x01 /* audio */ +#define UISUBCLASS_AUDIOCONTROL 1 +#define UISUBCLASS_AUDIOSTREAM 2 +#define UISUBCLASS_MIDISTREAM 3 + +#define UICLASS_CDC 0x02 /* communication */ +#define UISUBCLASS_DIRECT_LINE_CONTROL_MODEL 1 +#define UISUBCLASS_ABSTRACT_CONTROL_MODEL 2 +#define UISUBCLASS_TELEPHONE_CONTROL_MODEL 3 +#define UISUBCLASS_MULTICHANNEL_CONTROL_MODEL 4 +#define UISUBCLASS_CAPI_CONTROLMODEL 5 +#define UISUBCLASS_ETHERNET_NETWORKING_CONTROL_MODEL 6 +#define UISUBCLASS_ATM_NETWORKING_CONTROL_MODEL 7 +#define UISUBCLASS_WIRELESS_HANDSET_CM 8 +#define UISUBCLASS_DEVICE_MGMT 9 +#define UISUBCLASS_MOBILE_DIRECT_LINE_MODEL 10 +#define UISUBCLASS_OBEX 11 +#define UISUBCLASS_ETHERNET_EMULATION_MODEL 12 +#define UISUBCLASS_NETWORK_CONTROL_MODEL 13 + +#define UIPROTO_CDC_NONE 0 +#define UIPROTO_CDC_AT 1 + +#define UICLASS_HID 0x03 +#define UISUBCLASS_BOOT 1 +#define UIPROTO_BOOT_KEYBOARD 1 +#define UIPROTO_MOUSE 2 + +#define UICLASS_PHYSICAL 0x05 +#define UICLASS_IMAGE 0x06 +#define UISUBCLASS_SIC 1 /* still image class */ +#define UICLASS_PRINTER 0x07 +#define UISUBCLASS_PRINTER 1 +#define UIPROTO_PRINTER_UNI 1 +#define UIPROTO_PRINTER_BI 2 +#define UIPROTO_PRINTER_1284 3 + +#define UICLASS_MASS 0x08 +#define UISUBCLASS_RBC 1 +#define UISUBCLASS_SFF8020I 2 +#define UISUBCLASS_QIC157 3 +#define UISUBCLASS_UFI 4 +#define UISUBCLASS_SFF8070I 5 +#define UISUBCLASS_SCSI 6 +#define UIPROTO_MASS_CBI_I 0 +#define UIPROTO_MASS_CBI 1 +#define UIPROTO_MASS_BBB_OLD 2 /* Not in the spec anymore */ +#define UIPROTO_MASS_BBB 80 /* 'P' for the Iomega Zip drive */ + +#define UICLASS_HUB 0x09 +#define UISUBCLASS_HUB 0 +#define UIPROTO_FSHUB 0 +#define UIPROTO_HSHUBSTT 0 /* Yes, same as previous */ +#define UIPROTO_HSHUBMTT 1 + +#define UICLASS_CDC_DATA 0x0a +#define UISUBCLASS_DATA 0x00 +#define UIPROTO_DATA_ISDNBRI 0x30 /* Physical iface */ +#define UIPROTO_DATA_HDLC 0x31 /* HDLC */ +#define UIPROTO_DATA_TRANSPARENT 0x32 /* Transparent */ +#define UIPROTO_DATA_Q921M 0x50 /* Management for Q921 */ +#define UIPROTO_DATA_Q921 0x51 /* Data for Q921 */ +#define UIPROTO_DATA_Q921TM 0x52 /* TEI multiplexer for Q921 */ +#define UIPROTO_DATA_V42BIS 0x90 /* Data compression */ +#define UIPROTO_DATA_Q931 0x91 /* Euro-ISDN */ +#define UIPROTO_DATA_V120 0x92 /* V.24 rate adaption */ +#define UIPROTO_DATA_CAPI 0x93 /* CAPI 2.0 commands */ +#define UIPROTO_DATA_HOST_BASED 0xfd /* Host based driver */ +#define UIPROTO_DATA_PUF 0xfe /* see Prot. Unit Func. Desc. 
*/ +#define UIPROTO_DATA_VENDOR 0xff /* Vendor specific */ +#define UIPROTO_DATA_NCM 0x01 /* Network Control Model */ + +#define UICLASS_SMARTCARD 0x0b +#define UICLASS_FIRM_UPD 0x0c +#define UICLASS_SECURITY 0x0d +#define UICLASS_DIAGNOSTIC 0xdc +#define UICLASS_WIRELESS 0xe0 +#define UISUBCLASS_RF 0x01 +#define UIPROTO_BLUETOOTH 0x01 +#define UIPROTO_RNDIS 0x03 + +#define UICLASS_IAD 0xEF /* Interface Association Descriptor */ +#define UISUBCLASS_SYNC 0x01 +#define UIPROTO_ACTIVESYNC 0x01 + +#define UICLASS_APPL_SPEC 0xfe +#define UISUBCLASS_FIRMWARE_DOWNLOAD 1 +#define UISUBCLASS_IRDA 2 +#define UIPROTO_IRDA 0 + +#define UICLASS_VENDOR 0xff +#define UISUBCLASS_XBOX360_CONTROLLER 0x5d +#define UIPROTO_XBOX360_GAMEPAD 0x01 + +struct usb_endpoint_descriptor { + uByte bLength; + uByte bDescriptorType; + uByte bEndpointAddress; +#define UE_GET_DIR(a) ((a) & 0x80) +#define UE_SET_DIR(a,d) ((a) | (((d)&1) << 7)) +#define UE_DIR_IN 0x80 /* IN-token endpoint, fixed */ +#define UE_DIR_OUT 0x00 /* OUT-token endpoint, fixed */ +#define UE_DIR_RX 0xfd /* for internal use only! */ +#define UE_DIR_TX 0xfe /* for internal use only! */ +#define UE_DIR_ANY 0xff /* for internal use only! */ +#define UE_ADDR 0x0f +#define UE_ADDR_ANY 0xff /* for internal use only! */ +#define UE_GET_ADDR(a) ((a) & UE_ADDR) + uByte bmAttributes; +#define UE_XFERTYPE 0x03 +#define UE_CONTROL 0x00 +#define UE_ISOCHRONOUS 0x01 +#define UE_BULK 0x02 +#define UE_INTERRUPT 0x03 +#define UE_BULK_INTR 0xfe /* for internal use only! */ +#define UE_TYPE_ANY 0xff /* for internal use only! */ +#define UE_GET_XFERTYPE(a) ((a) & UE_XFERTYPE) +#define UE_ISO_TYPE 0x0c +#define UE_ISO_ASYNC 0x04 +#define UE_ISO_ADAPT 0x08 +#define UE_ISO_SYNC 0x0c +#define UE_GET_ISO_TYPE(a) ((a) & UE_ISO_TYPE) +#define UE_ISO_USAGE 0x30 +#define UE_ISO_USAGE_DATA 0x00 +#define UE_ISO_USAGE_FEEDBACK 0x10 +#define UE_ISO_USAGE_IMPLICT_FB 0x20 +#define UE_GET_ISO_USAGE(a) ((a) & UE_ISO_USAGE) + uWord wMaxPacketSize; +#define UE_ZERO_MPS 0xFFFF /* for internal use only */ + uByte bInterval; +} __packed; +typedef struct usb_endpoint_descriptor usb_endpoint_descriptor_t; + +struct usb_endpoint_ss_comp_descriptor { + uByte bLength; + uByte bDescriptorType; + uByte bMaxBurst; + uByte bmAttributes; +#define UE_GET_BULK_STREAMS(x) ((x) & 0x0F) +#define UE_GET_SS_ISO_MULT(x) ((x) & 0x03) + uWord wBytesPerInterval; +} __packed; +typedef struct usb_endpoint_ss_comp_descriptor + usb_endpoint_ss_comp_descriptor_t; + +struct usb_string_descriptor { + uByte bLength; + uByte bDescriptorType; + uWord bString[126]; + uByte bUnused; +} __packed; +typedef struct usb_string_descriptor usb_string_descriptor_t; + +#define USB_MAKE_STRING_DESC(m,name) \ +static const struct { \ + uByte bLength; \ + uByte bDescriptorType; \ + uByte bData[sizeof((uint8_t []){m})]; \ +} __packed name = { \ + .bLength = sizeof(name), \ + .bDescriptorType = UDESC_STRING, \ + .bData = { m }, \ +} + +struct usb_string_lang { + uByte bLength; + uByte bDescriptorType; + uByte bData[2]; +} __packed; +typedef struct usb_string_lang usb_string_lang_t; + +struct usb_hub_descriptor { + uByte bDescLength; + uByte bDescriptorType; + uByte bNbrPorts; + uWord wHubCharacteristics; +#define UHD_PWR 0x0003 +#define UHD_PWR_GANGED 0x0000 +#define UHD_PWR_INDIVIDUAL 0x0001 +#define UHD_PWR_NO_SWITCH 0x0002 +#define UHD_COMPOUND 0x0004 +#define UHD_OC 0x0018 +#define UHD_OC_GLOBAL 0x0000 +#define UHD_OC_INDIVIDUAL 0x0008 +#define UHD_OC_NONE 0x0010 +#define UHD_TT_THINK 0x0060 +#define UHD_TT_THINK_8 0x0000 +#define 
UHD_TT_THINK_16 0x0020 +#define UHD_TT_THINK_24 0x0040 +#define UHD_TT_THINK_32 0x0060 +#define UHD_PORT_IND 0x0080 + uByte bPwrOn2PwrGood; /* delay in 2 ms units */ +#define UHD_PWRON_FACTOR 2 + uByte bHubContrCurrent; + uByte DeviceRemovable[32]; /* max 255 ports */ +#define UHD_NOT_REMOV(desc, i) \ + (((desc)->DeviceRemovable[(i)/8] >> ((i) % 8)) & 1) + uByte PortPowerCtrlMask[1]; /* deprecated */ +} __packed; +typedef struct usb_hub_descriptor usb_hub_descriptor_t; + +struct usb_hub_ss_descriptor { + uByte bLength; + uByte bDescriptorType; + uByte bNbrPorts; + uWord wHubCharacteristics; + uByte bPwrOn2PwrGood; /* delay in 2 ms units */ + uByte bHubContrCurrent; + uByte bHubHdrDecLat; + uWord wHubDelay; + uByte DeviceRemovable[32]; /* max 255 ports */ +} __packed; +typedef struct usb_hub_ss_descriptor usb_hub_ss_descriptor_t; + +/* minimum HUB descriptor (8-ports maximum) */ +struct usb_hub_descriptor_min { + uByte bDescLength; + uByte bDescriptorType; + uByte bNbrPorts; + uWord wHubCharacteristics; + uByte bPwrOn2PwrGood; + uByte bHubContrCurrent; + uByte DeviceRemovable[1]; + uByte PortPowerCtrlMask[1]; +} __packed; +typedef struct usb_hub_descriptor_min usb_hub_descriptor_min_t; + +struct usb_device_qualifier { + uByte bLength; + uByte bDescriptorType; + uWord bcdUSB; + uByte bDeviceClass; + uByte bDeviceSubClass; + uByte bDeviceProtocol; + uByte bMaxPacketSize0; + uByte bNumConfigurations; + uByte bReserved; +} __packed; +typedef struct usb_device_qualifier usb_device_qualifier_t; + +struct usb_otg_descriptor { + uByte bLength; + uByte bDescriptorType; + uByte bmAttributes; +#define UOTG_SRP 0x01 +#define UOTG_HNP 0x02 +} __packed; +typedef struct usb_otg_descriptor usb_otg_descriptor_t; + +/* OTG feature selectors */ +#define UOTG_B_HNP_ENABLE 3 +#define UOTG_A_HNP_SUPPORT 4 +#define UOTG_A_ALT_HNP_SUPPORT 5 + +struct usb_status { + uWord wStatus; +/* Device status flags */ +#define UDS_SELF_POWERED 0x0001 +#define UDS_REMOTE_WAKEUP 0x0002 +/* Endpoint status flags */ +#define UES_HALT 0x0001 +} __packed; +typedef struct usb_status usb_status_t; + +struct usb_hub_status { + uWord wHubStatus; +#define UHS_LOCAL_POWER 0x0001 +#define UHS_OVER_CURRENT 0x0002 + uWord wHubChange; +} __packed; +typedef struct usb_hub_status usb_hub_status_t; + +struct usb_port_status { + uWord wPortStatus; +#define UPS_CURRENT_CONNECT_STATUS 0x0001 +#define UPS_PORT_ENABLED 0x0002 +#define UPS_SUSPEND 0x0004 +#define UPS_OVERCURRENT_INDICATOR 0x0008 +#define UPS_RESET 0x0010 +#define UPS_PORT_L1 0x0020 /* USB 2.0 only */ +/* The link-state bits are valid for Super-Speed USB HUBs */ +#define UPS_PORT_LINK_STATE_GET(x) (((x) >> 5) & 0xF) +#define UPS_PORT_LINK_STATE_SET(x) (((x) & 0xF) << 5) +#define UPS_PORT_LS_U0 0x00 +#define UPS_PORT_LS_U1 0x01 +#define UPS_PORT_LS_U2 0x02 +#define UPS_PORT_LS_U3 0x03 +#define UPS_PORT_LS_SS_DIS 0x04 +#define UPS_PORT_LS_RX_DET 0x05 +#define UPS_PORT_LS_SS_INA 0x06 +#define UPS_PORT_LS_POLL 0x07 +#define UPS_PORT_LS_RECOVER 0x08 +#define UPS_PORT_LS_HOT_RST 0x09 +#define UPS_PORT_LS_COMP_MODE 0x0A +#define UPS_PORT_LS_LOOPBACK 0x0B +#define UPS_PORT_LS_RESUME 0x0F +#define UPS_PORT_POWER 0x0100 +#define UPS_PORT_POWER_SS 0x0200 /* super-speed only */ +#define UPS_LOW_SPEED 0x0200 +#define UPS_HIGH_SPEED 0x0400 +#define UPS_OTHER_SPEED 0x0600 /* currently FreeBSD specific */ +#define UPS_PORT_TEST 0x0800 +#define UPS_PORT_INDICATOR 0x1000 +#define UPS_PORT_MODE_DEVICE 0x8000 /* currently FreeBSD specific */ + uWord wPortChange; +#define UPS_C_CONNECT_STATUS 0x0001 
+#define UPS_C_PORT_ENABLED 0x0002 +#define UPS_C_SUSPEND 0x0004 +#define UPS_C_OVERCURRENT_INDICATOR 0x0008 +#define UPS_C_PORT_RESET 0x0010 +#define UPS_C_PORT_L1 0x0020 /* USB 2.0 only */ +#define UPS_C_BH_PORT_RESET 0x0020 /* USB 3.0 only */ +#define UPS_C_PORT_LINK_STATE 0x0040 +#define UPS_C_PORT_CONFIG_ERROR 0x0080 +} __packed; +typedef struct usb_port_status usb_port_status_t; + +/* + * The "USB_SPEED" macros defines all the supported USB speeds. + */ +enum usb_dev_speed { + USB_SPEED_VARIABLE, + USB_SPEED_LOW, + USB_SPEED_FULL, + USB_SPEED_HIGH, + USB_SPEED_SUPER, +}; +#define USB_SPEED_MAX (USB_SPEED_SUPER+1) + +/* + * The "USB_REV" macros defines all the supported USB revisions. + */ +enum usb_revision { + USB_REV_UNKNOWN, + USB_REV_PRE_1_0, + USB_REV_1_0, + USB_REV_1_1, + USB_REV_2_0, + USB_REV_2_5, + USB_REV_3_0 +}; +#define USB_REV_MAX (USB_REV_3_0+1) + +/* + * Supported host controller modes. + */ +enum usb_hc_mode { + USB_MODE_HOST, /* initiates transfers */ + USB_MODE_DEVICE, /* bus transfer target */ + USB_MODE_DUAL /* can be host or device */ +}; +#define USB_MODE_MAX (USB_MODE_DUAL+1) + +/* + * The "USB_STATE" enums define all the supported device states. + */ +enum usb_dev_state { + USB_STATE_DETACHED, + USB_STATE_ATTACHED, + USB_STATE_POWERED, + USB_STATE_ADDRESSED, + USB_STATE_CONFIGURED, +}; +#define USB_STATE_MAX (USB_STATE_CONFIGURED+1) + +/* + * The "USB_EP_MODE" macros define all the currently supported + * endpoint modes. + */ +enum usb_ep_mode { + USB_EP_MODE_DEFAULT, + USB_EP_MODE_STREAMS, /* USB3.0 specific */ + USB_EP_MODE_HW_MASS_STORAGE, + USB_EP_MODE_HW_SERIAL, + USB_EP_MODE_HW_ETHERNET_CDC, + USB_EP_MODE_HW_ETHERNET_NCM, + USB_EP_MODE_MAX +}; +#endif /* _USB_STANDARD_H_ */ diff --git a/usr/contrib/freebsd/dev/usb/usb_endian.h b/usr/contrib/freebsd/dev/usb/usb_endian.h new file mode 100644 index 0000000000..0bbcb9bf82 --- /dev/null +++ b/usr/contrib/freebsd/dev/usb/usb_endian.h @@ -0,0 +1,121 @@ +/* $FreeBSD$ */ +/* + * Copyright (c) 2008 Hans Petter Selasky. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _USB_ENDIAN_H_ +#define _USB_ENDIAN_H_ + +#ifndef USB_GLOBAL_INCLUDE_FILE +#include +#include +#endif + +/* + * Declare the basic USB record types. 
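+ * These types are plain byte arrays holding little-endian values, so
+ * they impose no alignment requirements and can be copied bytewise.
+ * An illustrative use of the accessor macros defined below:
+ *
+ *	uWord w;
+ *	USETW(w, 0x1234);	now w[0] == 0x34 and w[1] == 0x12
+ *	value = UGETW(w);	reassembles 0x1234 on any CPU
+ *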
USB records have an alignment + * of 1 byte and are always packed. + */ +typedef uint8_t uByte; +typedef uint8_t uWord[2]; +typedef uint8_t uDWord[4]; +typedef uint8_t uQWord[8]; + +/* + * Define a set of macros that can get and set data independent of + * CPU endianness and CPU alignment requirements: + */ +#define UGETB(w) \ + ((w)[0]) + +#define UGETW(w) \ + ((w)[0] | \ + (((uint16_t)((w)[1])) << 8)) + +#define UGETDW(w) \ + ((w)[0] | \ + (((uint16_t)((w)[1])) << 8) | \ + (((uint32_t)((w)[2])) << 16) | \ + (((uint32_t)((w)[3])) << 24)) + +#define UGETQW(w) \ + ((w)[0] | \ + (((uint16_t)((w)[1])) << 8) | \ + (((uint32_t)((w)[2])) << 16) | \ + (((uint32_t)((w)[3])) << 24) | \ + (((uint64_t)((w)[4])) << 32) | \ + (((uint64_t)((w)[5])) << 40) | \ + (((uint64_t)((w)[6])) << 48) | \ + (((uint64_t)((w)[7])) << 56)) + +#define USETB(w,v) do { \ + (w)[0] = (uint8_t)(v); \ +} while (0) + +#define USETW(w,v) do { \ + (w)[0] = (uint8_t)(v); \ + (w)[1] = (uint8_t)((v) >> 8); \ +} while (0) + +#define USETDW(w,v) do { \ + (w)[0] = (uint8_t)(v); \ + (w)[1] = (uint8_t)((v) >> 8); \ + (w)[2] = (uint8_t)((v) >> 16); \ + (w)[3] = (uint8_t)((v) >> 24); \ +} while (0) + +#define USETQW(w,v) do { \ + (w)[0] = (uint8_t)(v); \ + (w)[1] = (uint8_t)((v) >> 8); \ + (w)[2] = (uint8_t)((v) >> 16); \ + (w)[3] = (uint8_t)((v) >> 24); \ + (w)[4] = (uint8_t)((v) >> 32); \ + (w)[5] = (uint8_t)((v) >> 40); \ + (w)[6] = (uint8_t)((v) >> 48); \ + (w)[7] = (uint8_t)((v) >> 56); \ +} while (0) + +#define USETW2(w,b1,b0) do { \ + (w)[0] = (uint8_t)(b0); \ + (w)[1] = (uint8_t)(b1); \ +} while (0) + +#define USETW4(w,b3,b2,b1,b0) do { \ + (w)[0] = (uint8_t)(b0); \ + (w)[1] = (uint8_t)(b1); \ + (w)[2] = (uint8_t)(b2); \ + (w)[3] = (uint8_t)(b3); \ +} while (0) + +#define USETW8(w,b7,b6,b5,b4,b3,b2,b1,b0) do { \ + (w)[0] = (uint8_t)(b0); \ + (w)[1] = (uint8_t)(b1); \ + (w)[2] = (uint8_t)(b2); \ + (w)[3] = (uint8_t)(b3); \ + (w)[4] = (uint8_t)(b4); \ + (w)[5] = (uint8_t)(b5); \ + (w)[6] = (uint8_t)(b6); \ + (w)[7] = (uint8_t)(b7); \ +} while (0) + +#endif /* _USB_ENDIAN_H_ */ diff --git a/usr/contrib/freebsd/dev/usb/usb_freebsd.h b/usr/contrib/freebsd/dev/usb/usb_freebsd.h new file mode 100644 index 0000000000..3bc9d2c1eb --- /dev/null +++ b/usr/contrib/freebsd/dev/usb/usb_freebsd.h @@ -0,0 +1,101 @@ +/* $FreeBSD$ */ +/*- + * Copyright (c) 2008 Hans Petter Selasky. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Including this file is mandatory for all USB related c-files in the kernel. + */ + +#ifndef _USB_FREEBSD_H_ +#define _USB_FREEBSD_H_ + +/* Default USB configuration */ +#define USB_HAVE_UGEN 1 +#define USB_HAVE_DEVCTL 1 +#define USB_HAVE_BUSDMA 1 +#define USB_HAVE_COMPAT_LINUX 1 +#define USB_HAVE_USER_IO 1 +#define USB_HAVE_MBUF 1 +#define USB_HAVE_TT_SUPPORT 1 +#define USB_HAVE_POWERD 1 +#define USB_HAVE_MSCTEST 1 +#define USB_HAVE_MSCTEST_DETACH 1 +#define USB_HAVE_PF 1 +#define USB_HAVE_ROOT_MOUNT_HOLD 1 +#define USB_HAVE_ID_SECTION 1 +#define USB_HAVE_PER_BUS_PROCESS 1 +#define USB_HAVE_FIXED_ENDPOINT 0 +#define USB_HAVE_FIXED_IFACE 0 +#define USB_HAVE_FIXED_CONFIG 0 +#define USB_HAVE_FIXED_PORT 0 +#define USB_HAVE_DISABLE_ENUM 1 + +/* define zero ticks callout value */ +#define USB_CALLOUT_ZERO_TICKS 1 + +#define USB_TD_GET_PROC(td) (td)->td_proc +#define USB_PROC_GET_GID(td) (td)->p_pgid + +#if (!defined(USB_HOST_ALIGN)) || (USB_HOST_ALIGN <= 0) +/* Use default value. */ +#undef USB_HOST_ALIGN +#if defined(__arm__) || defined(__mips__) || defined(__powerpc__) +#define USB_HOST_ALIGN 32 /* Arm and MIPS need at least this much, if not more */ +#else +#define USB_HOST_ALIGN 8 /* bytes, must be power of two */ +#endif +#endif +/* Sanity check for USB_HOST_ALIGN: Verify power of two. */ +#if ((-USB_HOST_ALIGN) & USB_HOST_ALIGN) != USB_HOST_ALIGN +#error "USB_HOST_ALIGN is not power of two." +#endif +#define USB_FS_ISOC_UFRAME_MAX 4 /* exclusive unit */ +#define USB_BUS_MAX 256 /* units */ +#define USB_MAX_DEVICES 128 /* units */ +#define USB_CONFIG_MAX 65535 /* bytes */ +#define USB_IFACE_MAX 32 /* units */ +#define USB_FIFO_MAX 128 /* units */ +#define USB_MAX_EP_STREAMS 8 /* units */ +#define USB_MAX_EP_UNITS 32 /* units */ +#define USB_MAX_PORTS 255 /* units */ + +#define USB_MAX_FS_ISOC_FRAMES_PER_XFER (120) /* units */ +#define USB_MAX_HS_ISOC_FRAMES_PER_XFER (8*120) /* units */ + +#define USB_HUB_MAX_DEPTH 5 +#define USB_EP0_BUFSIZE 1024 /* bytes */ +#define USB_CS_RESET_LIMIT 20 /* failures = 20 * 50 ms = 1sec */ + +#define USB_MAX_AUTO_QUIRK 8 /* maximum number of dynamic quirks */ + +typedef uint32_t usb_timeout_t; /* milliseconds */ +typedef uint32_t usb_frlength_t; /* bytes */ +typedef uint32_t usb_frcount_t; /* units */ +typedef uint32_t usb_size_t; /* bytes */ +typedef uint32_t usb_ticks_t; /* system defined */ +typedef uint16_t usb_power_mask_t; /* see "USB_HW_POWER_XXX" */ +typedef uint16_t usb_stream_t; /* stream ID */ + +#endif /* _USB_FREEBSD_H_ */ diff --git a/usr/contrib/freebsd/dev/usb/usbdi.h b/usr/contrib/freebsd/dev/usb/usbdi.h new file mode 100644 index 0000000000..202ad89fa7 --- /dev/null +++ b/usr/contrib/freebsd/dev/usb/usbdi.h @@ -0,0 +1,657 @@ +/*- + * Copyright (c) 2009 Andrew Thompson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ +#ifndef _USB_USBDI_H_ +#define _USB_USBDI_H_ + +struct usb_fifo; +struct usb_xfer; +struct usb_device; +struct usb_attach_arg; +struct usb_interface; +struct usb_endpoint; +struct usb_page_cache; +struct usb_page_search; +struct usb_process; +struct usb_proc_msg; +struct usb_mbuf; +struct usb_fs_privdata; +struct mbuf; + +typedef enum { /* keep in sync with usb_errstr_table */ + USB_ERR_NORMAL_COMPLETION = 0, + USB_ERR_PENDING_REQUESTS, /* 1 */ + USB_ERR_NOT_STARTED, /* 2 */ + USB_ERR_INVAL, /* 3 */ + USB_ERR_NOMEM, /* 4 */ + USB_ERR_CANCELLED, /* 5 */ + USB_ERR_BAD_ADDRESS, /* 6 */ + USB_ERR_BAD_BUFSIZE, /* 7 */ + USB_ERR_BAD_FLAG, /* 8 */ + USB_ERR_NO_CALLBACK, /* 9 */ + USB_ERR_IN_USE, /* 10 */ + USB_ERR_NO_ADDR, /* 11 */ + USB_ERR_NO_PIPE, /* 12 */ + USB_ERR_ZERO_NFRAMES, /* 13 */ + USB_ERR_ZERO_MAXP, /* 14 */ + USB_ERR_SET_ADDR_FAILED, /* 15 */ + USB_ERR_NO_POWER, /* 16 */ + USB_ERR_TOO_DEEP, /* 17 */ + USB_ERR_IOERROR, /* 18 */ + USB_ERR_NOT_CONFIGURED, /* 19 */ + USB_ERR_TIMEOUT, /* 20 */ + USB_ERR_SHORT_XFER, /* 21 */ + USB_ERR_STALLED, /* 22 */ + USB_ERR_INTERRUPTED, /* 23 */ + USB_ERR_DMA_LOAD_FAILED, /* 24 */ + USB_ERR_BAD_CONTEXT, /* 25 */ + USB_ERR_NO_ROOT_HUB, /* 26 */ + USB_ERR_NO_INTR_THREAD, /* 27 */ + USB_ERR_NOT_LOCKED, /* 28 */ + USB_ERR_MAX +} usb_error_t; + +/* + * Flags for transfers + */ +#define USB_FORCE_SHORT_XFER 0x0001 /* force a short transmit last */ +#define USB_SHORT_XFER_OK 0x0004 /* allow short reads */ +#define USB_DELAY_STATUS_STAGE 0x0010 /* insert delay before STATUS stage */ +#define USB_USER_DATA_PTR 0x0020 /* internal flag */ +#define USB_MULTI_SHORT_OK 0x0040 /* allow multiple short frames */ +#define USB_MANUAL_STATUS 0x0080 /* manual ctrl status */ + +#define USB_NO_TIMEOUT 0 +#define USB_DEFAULT_TIMEOUT 5000 /* 5000 ms = 5 seconds */ + +#if defined(_KERNEL) +/* typedefs */ + +typedef void (usb_callback_t)(struct usb_xfer *, usb_error_t); +typedef void (usb_proc_callback_t)(struct usb_proc_msg *); +typedef usb_error_t (usb_handle_req_t)(struct usb_device *, + struct usb_device_request *, const void **, uint16_t *); + +typedef int (usb_fifo_open_t)(struct usb_fifo *fifo, int fflags); +typedef void (usb_fifo_close_t)(struct usb_fifo *fifo, int fflags); +typedef int (usb_fifo_ioctl_t)(struct usb_fifo *fifo, u_long cmd, void *addr, int fflags); +typedef void (usb_fifo_cmd_t)(struct usb_fifo *fifo); +typedef void (usb_fifo_filter_t)(struct usb_fifo 
*fifo, struct usb_mbuf *m);
+
+
+/* USB events */
+#ifndef USB_GLOBAL_INCLUDE_FILE
+#include <sys/eventhandler.h>
+#endif
+typedef void (*usb_dev_configured_t)(void *, struct usb_device *,
+ struct usb_attach_arg *);
+EVENTHANDLER_DECLARE(usb_dev_configured, usb_dev_configured_t);
+
+/*
+ * The following macros are used to convert milliseconds into
+ * HZ. We use 1024 instead of 1000 milliseconds per second to save a
+ * full division.
+ */
+#define USB_MS_HZ 1024
+
+#define USB_MS_TO_TICKS(ms) \
+ (((uint32_t)((((uint32_t)(ms)) * ((uint32_t)(hz))) + USB_MS_HZ - 1)) / USB_MS_HZ)
+
+/*
+ * Common queue structure for USB transfers.
+ */
+struct usb_xfer_queue {
+ TAILQ_HEAD(, usb_xfer) head;
+ struct usb_xfer *curr; /* current USB transfer processed */
+ void (*command) (struct usb_xfer_queue *pq);
+ uint8_t recurse_1:1;
+ uint8_t recurse_2:1;
+ uint8_t recurse_3:1;
+ uint8_t reserved:5;
+};
+
+/*
+ * The following structure defines a USB endpoint.
+ */
+struct usb_endpoint {
+ /* queue of USB transfers */
+ struct usb_xfer_queue endpoint_q[USB_MAX_EP_STREAMS];
+
+ struct usb_endpoint_descriptor *edesc;
+ struct usb_endpoint_ss_comp_descriptor *ecomp;
+ const struct usb_pipe_methods *methods; /* set by HC driver */
+
+ uint16_t isoc_next;
+
+ uint8_t toggle_next:1; /* next data toggle value */
+ uint8_t is_stalled:1; /* set if endpoint is stalled */
+ uint8_t is_synced:1; /* set if we are synchronised */
+ uint8_t unused:5;
+ uint8_t iface_index; /* not used by "default endpoint" */
+
+ uint8_t refcount_alloc; /* allocation refcount */
+ uint8_t refcount_bw; /* bandwidth refcount */
+#define USB_EP_REF_MAX 0x3f
+
+ /* High-Speed resource allocation (valid if "refcount_bw" > 0) */
+
+ uint8_t usb_smask; /* USB start mask */
+ uint8_t usb_cmask; /* USB complete mask */
+ uint8_t usb_uframe; /* USB microframe */
+
+ /* USB endpoint mode, see USB_EP_MODE_XXX */
+
+ uint8_t ep_mode;
+};
+
+/*
+ * The following structure defines a USB interface.
+ */
+struct usb_interface {
+ struct usb_interface_descriptor *idesc;
+ device_t subdev;
+ uint8_t alt_index;
+ uint8_t parent_iface_index;
+
+ /* Linux compat */
+ struct usb_host_interface *altsetting;
+ struct usb_host_interface *cur_altsetting;
+ struct usb_device *linux_udev;
+ void *bsd_priv_sc; /* device specific information */
+ char *pnpinfo; /* additional PnP-info for this interface */
+ uint8_t num_altsetting; /* number of alternate settings */
+ uint8_t bsd_iface_index;
+};
+
+/*
+ * The following structure defines a set of USB transfer flags.
+ */
+struct usb_xfer_flags {
+ uint8_t force_short_xfer:1; /* force a short transmit transfer
+ * last */
+ uint8_t short_xfer_ok:1; /* allow short receive transfers */
+ uint8_t short_frames_ok:1; /* allow short frames */
+ uint8_t pipe_bof:1; /* block pipe on failure */
+ uint8_t proxy_buffer:1; /* makes buffer size a factor of
+ * "max_frame_size" */
+ uint8_t ext_buffer:1; /* uses external DMA buffer */
+ uint8_t manual_status:1; /* non automatic status stage on
+ * control transfers */
+ uint8_t no_pipe_ok:1; /* set if "USB_ERR_NO_PIPE" error can
+ * be ignored */
+ uint8_t stall_pipe:1; /* set if the endpoint belonging to
+ * this USB transfer should be stalled
+ * before starting this transfer! */
+ uint8_t pre_scale_frames:1; /* "usb_config->frames" is
+ * assumed to give the
+ * buffering time in
+ * milliseconds and is
+ * converted into the nearest
+ * number of frames when the
+ * USB transfer is setup. This
+ * option only has effect for
+ * ISOCHRONOUS transfers.
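+ * As a rough illustration, a full speed device (1 ms
+ * frames) asking for 50 ms of buffering would get about
+ * 50 frames, while a high speed device (125 us
+ * microframes) would get about 400.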
+ */ +}; + +/* + * The following structure define an USB configuration, that basically + * is used when setting up an USB transfer. + */ +struct usb_config { + usb_callback_t *callback; /* USB transfer callback */ + usb_frlength_t bufsize; /* total pipe buffer size in bytes */ + usb_frcount_t frames; /* maximum number of USB frames */ + usb_timeout_t interval; /* interval in milliseconds */ +#define USB_DEFAULT_INTERVAL 0 + usb_timeout_t timeout; /* transfer timeout in milliseconds */ + struct usb_xfer_flags flags; /* transfer flags */ + usb_stream_t stream_id; /* USB3.0 specific */ + enum usb_hc_mode usb_mode; /* host or device mode */ + uint8_t type; /* pipe type */ + uint8_t endpoint; /* pipe number */ + uint8_t direction; /* pipe direction */ + uint8_t ep_index; /* pipe index match to use */ + uint8_t if_index; /* "ifaces" index to use */ +}; + +/* + * Use these macro when defining USB device ID arrays if you want to + * have your driver module automatically loaded in host, device or + * both modes respectively: + */ +#if USB_HAVE_ID_SECTION +#define STRUCT_USB_HOST_ID \ + struct usb_device_id __section("usb_host_id") +#define STRUCT_USB_DEVICE_ID \ + struct usb_device_id __section("usb_device_id") +#define STRUCT_USB_DUAL_ID \ + struct usb_device_id __section("usb_dual_id") +#else +#define STRUCT_USB_HOST_ID \ + struct usb_device_id +#define STRUCT_USB_DEVICE_ID \ + struct usb_device_id +#define STRUCT_USB_DUAL_ID \ + struct usb_device_id +#endif /* USB_HAVE_ID_SECTION */ + +/* + * The following structure is used when looking up an USB driver for + * an USB device. It is inspired by the Linux structure called + * "usb_device_id". + */ +struct usb_device_id { + + /* Select which fields to match against */ +#if BYTE_ORDER == LITTLE_ENDIAN + uint16_t + match_flag_vendor:1, + match_flag_product:1, + match_flag_dev_lo:1, + match_flag_dev_hi:1, + + match_flag_dev_class:1, + match_flag_dev_subclass:1, + match_flag_dev_protocol:1, + match_flag_int_class:1, + + match_flag_int_subclass:1, + match_flag_int_protocol:1, + match_flag_unused:6; +#else + uint16_t + match_flag_unused:6, + match_flag_int_protocol:1, + match_flag_int_subclass:1, + + match_flag_int_class:1, + match_flag_dev_protocol:1, + match_flag_dev_subclass:1, + match_flag_dev_class:1, + + match_flag_dev_hi:1, + match_flag_dev_lo:1, + match_flag_product:1, + match_flag_vendor:1; +#endif + + /* Used for product specific matches; the BCD range is inclusive */ + uint16_t idVendor; + uint16_t idProduct; + uint16_t bcdDevice_lo; + uint16_t bcdDevice_hi; + + /* Used for device class matches */ + uint8_t bDeviceClass; + uint8_t bDeviceSubClass; + uint8_t bDeviceProtocol; + + /* Used for interface class matches */ + uint8_t bInterfaceClass; + uint8_t bInterfaceSubClass; + uint8_t bInterfaceProtocol; + +#if USB_HAVE_COMPAT_LINUX + /* which fields to match against */ + uint16_t match_flags; +#define USB_DEVICE_ID_MATCH_VENDOR 0x0001 +#define USB_DEVICE_ID_MATCH_PRODUCT 0x0002 +#define USB_DEVICE_ID_MATCH_DEV_LO 0x0004 +#define USB_DEVICE_ID_MATCH_DEV_HI 0x0008 +#define USB_DEVICE_ID_MATCH_DEV_CLASS 0x0010 +#define USB_DEVICE_ID_MATCH_DEV_SUBCLASS 0x0020 +#define USB_DEVICE_ID_MATCH_DEV_PROTOCOL 0x0040 +#define USB_DEVICE_ID_MATCH_INT_CLASS 0x0080 +#define USB_DEVICE_ID_MATCH_INT_SUBCLASS 0x0100 +#define USB_DEVICE_ID_MATCH_INT_PROTOCOL 0x0200 +#endif + + /* Hook for driver specific information */ + unsigned long driver_info; +} __aligned(32); + +#define USB_STD_PNP_INFO "M16:mask;U16:vendor;U16:product;L16:product;G16:product;" \ + 
"U8:devclass;U8:devsubclass;U8:devprotocol;" \ + "U8:intclass;U8:intsubclass;U8:intprotocol;" +#define USB_STD_PNP_HOST_INFO USB_STD_PNP_INFO "T:mode=host;" +#define USB_STD_PNP_DEVICE_INFO USB_STD_PNP_INFO "T:mode=device;" +#define USB_PNP_HOST_INFO(table) \ + MODULE_PNP_INFO(USB_STD_PNP_HOST_INFO, usb, table, table, sizeof(table[0]), \ + sizeof(table) / sizeof(table[0])) +#define USB_PNP_DEVICE_INFO(table) \ + MODULE_PNP_INFO(USB_STD_PNP_DEVICE_INFO, usb, table, table, sizeof(table[0]), \ + sizeof(table) / sizeof(table[0])) +#define USB_PNP_DUAL_INFO(table) \ + MODULE_PNP_INFO(USB_STD_PNP_INFO, usb, table, table, sizeof(table[0]), \ + sizeof(table) / sizeof(table[0])) + +/* check that the size of the structure above is correct */ +extern char usb_device_id_assert[(sizeof(struct usb_device_id) == 32) ? 1 : -1]; + +#define USB_VENDOR(vend) \ + .match_flag_vendor = 1, .idVendor = (vend) + +#define USB_PRODUCT(prod) \ + .match_flag_product = 1, .idProduct = (prod) + +#define USB_VP(vend,prod) \ + USB_VENDOR(vend), USB_PRODUCT(prod) + +#define USB_VPI(vend,prod,info) \ + USB_VENDOR(vend), USB_PRODUCT(prod), USB_DRIVER_INFO(info) + +#define USB_DEV_BCD_GTEQ(lo) /* greater than or equal */ \ + .match_flag_dev_lo = 1, .bcdDevice_lo = (lo) + +#define USB_DEV_BCD_LTEQ(hi) /* less than or equal */ \ + .match_flag_dev_hi = 1, .bcdDevice_hi = (hi) + +#define USB_DEV_CLASS(dc) \ + .match_flag_dev_class = 1, .bDeviceClass = (dc) + +#define USB_DEV_SUBCLASS(dsc) \ + .match_flag_dev_subclass = 1, .bDeviceSubClass = (dsc) + +#define USB_DEV_PROTOCOL(dp) \ + .match_flag_dev_protocol = 1, .bDeviceProtocol = (dp) + +#define USB_IFACE_CLASS(ic) \ + .match_flag_int_class = 1, .bInterfaceClass = (ic) + +#define USB_IFACE_SUBCLASS(isc) \ + .match_flag_int_subclass = 1, .bInterfaceSubClass = (isc) + +#define USB_IFACE_PROTOCOL(ip) \ + .match_flag_int_protocol = 1, .bInterfaceProtocol = (ip) + +#define USB_IF_CSI(class,subclass,info) \ + USB_IFACE_CLASS(class), USB_IFACE_SUBCLASS(subclass), USB_DRIVER_INFO(info) + +#define USB_DRIVER_INFO(n) \ + .driver_info = (n) + +#define USB_GET_DRIVER_INFO(did) \ + (did)->driver_info + +/* + * The following structure keeps information that is used to match + * against an array of "usb_device_id" elements. + */ +struct usbd_lookup_info { + uint16_t idVendor; + uint16_t idProduct; + uint16_t bcdDevice; + uint8_t bDeviceClass; + uint8_t bDeviceSubClass; + uint8_t bDeviceProtocol; + uint8_t bInterfaceClass; + uint8_t bInterfaceSubClass; + uint8_t bInterfaceProtocol; + uint8_t bIfaceIndex; + uint8_t bIfaceNum; + uint8_t bConfigIndex; + uint8_t bConfigNum; +}; + +/* Structure used by probe and attach */ + +struct usb_attach_arg { + struct usbd_lookup_info info; + device_t temp_dev; /* for internal use */ + unsigned long driver_info; /* for internal use */ + void *driver_ivar; + struct usb_device *device; /* current device */ + struct usb_interface *iface; /* current interface */ + enum usb_hc_mode usb_mode; /* host or device mode */ + uint8_t port; + uint8_t dev_state; +#define UAA_DEV_READY 0 +#define UAA_DEV_DISABLED 1 +#define UAA_DEV_EJECTING 2 +}; + +/* + * The following is a wrapper for the callout structure to ease + * porting the code to other platforms. 
+ */ +struct usb_callout { + struct callout co; +}; +#define usb_callout_init_mtx(c,m,f) callout_init_mtx(&(c)->co,m,f) +#define usb_callout_reset(c,t,f,d) callout_reset(&(c)->co,t,f,d) +#define usb_callout_stop(c) callout_stop(&(c)->co) +#define usb_callout_drain(c) callout_drain(&(c)->co) +#define usb_callout_pending(c) callout_pending(&(c)->co) + +/* USB transfer states */ + +#define USB_ST_SETUP 0 +#define USB_ST_TRANSFERRED 1 +#define USB_ST_ERROR 2 + +/* USB handle request states */ +#define USB_HR_NOT_COMPLETE 0 +#define USB_HR_COMPLETE_OK 1 +#define USB_HR_COMPLETE_ERR 2 + +/* + * The following macro will return the current state of an USB + * transfer like defined by the "USB_ST_XXX" enums. + */ +#define USB_GET_STATE(xfer) (usbd_xfer_state(xfer)) + +/* + * The following structure defines the USB process message header. + */ +struct usb_proc_msg { + TAILQ_ENTRY(usb_proc_msg) pm_qentry; + usb_proc_callback_t *pm_callback; + usb_size_t pm_num; +}; + +#define USB_FIFO_TX 0 +#define USB_FIFO_RX 1 + +/* + * Locking note for the following functions. All the + * "usb_fifo_cmd_t" and "usb_fifo_filter_t" functions are called + * locked. The others are called unlocked. + */ +struct usb_fifo_methods { + usb_fifo_open_t *f_open; + usb_fifo_close_t *f_close; + usb_fifo_ioctl_t *f_ioctl; + /* + * NOTE: The post-ioctl callback is called after the USB reference + * gets locked in the IOCTL handler: + */ + usb_fifo_ioctl_t *f_ioctl_post; + usb_fifo_cmd_t *f_start_read; + usb_fifo_cmd_t *f_stop_read; + usb_fifo_cmd_t *f_start_write; + usb_fifo_cmd_t *f_stop_write; + usb_fifo_filter_t *f_filter_read; + usb_fifo_filter_t *f_filter_write; + const char *basename[4]; + const char *postfix[4]; +}; + +struct usb_fifo_sc { + struct usb_fifo *fp[2]; + struct usb_fs_privdata *dev; +}; + +const char *usbd_errstr(usb_error_t error); +void *usbd_find_descriptor(struct usb_device *udev, void *id, + uint8_t iface_index, uint8_t type, uint8_t type_mask, + uint8_t subtype, uint8_t subtype_mask); +struct usb_config_descriptor *usbd_get_config_descriptor( + struct usb_device *udev); +struct usb_device_descriptor *usbd_get_device_descriptor( + struct usb_device *udev); +struct usb_interface *usbd_get_iface(struct usb_device *udev, + uint8_t iface_index); +struct usb_interface_descriptor *usbd_get_interface_descriptor( + struct usb_interface *iface); +struct usb_endpoint *usbd_get_endpoint(struct usb_device *udev, uint8_t iface_index, + const struct usb_config *setup); +struct usb_endpoint *usbd_get_ep_by_addr(struct usb_device *udev, uint8_t ea_val); +usb_error_t usbd_interface_count(struct usb_device *udev, uint8_t *count); +enum usb_hc_mode usbd_get_mode(struct usb_device *udev); +enum usb_dev_speed usbd_get_speed(struct usb_device *udev); +void device_set_usb_desc(device_t dev); +void usb_pause_mtx(struct mtx *mtx, int _ticks); +usb_error_t usbd_set_pnpinfo(struct usb_device *udev, + uint8_t iface_index, const char *pnpinfo); +usb_error_t usbd_add_dynamic_quirk(struct usb_device *udev, + uint16_t quirk); +usb_error_t usbd_set_endpoint_mode(struct usb_device *udev, + struct usb_endpoint *ep, uint8_t ep_mode); +uint8_t usbd_get_endpoint_mode(struct usb_device *udev, + struct usb_endpoint *ep); + +const struct usb_device_id *usbd_lookup_id_by_info( + const struct usb_device_id *id, usb_size_t sizeof_id, + const struct usbd_lookup_info *info); +int usbd_lookup_id_by_uaa(const struct usb_device_id *id, + usb_size_t sizeof_id, struct usb_attach_arg *uaa); + +usb_error_t usbd_do_request_flags(struct usb_device *udev, 
struct mtx *mtx, + struct usb_device_request *req, void *data, uint16_t flags, + uint16_t *actlen, usb_timeout_t timeout); +#define usbd_do_request(u,m,r,d) \ + usbd_do_request_flags(u,m,r,d,0,NULL,USB_DEFAULT_TIMEOUT) + +uint8_t usbd_clear_stall_callback(struct usb_xfer *xfer1, + struct usb_xfer *xfer2); +uint8_t usbd_get_interface_altindex(struct usb_interface *iface); +usb_error_t usbd_set_alt_interface_index(struct usb_device *udev, + uint8_t iface_index, uint8_t alt_index); +uint32_t usbd_get_isoc_fps(struct usb_device *udev); +usb_error_t usbd_transfer_setup(struct usb_device *udev, + const uint8_t *ifaces, struct usb_xfer **pxfer, + const struct usb_config *setup_start, uint16_t n_setup, + void *priv_sc, struct mtx *priv_mtx); +void usbd_transfer_submit(struct usb_xfer *xfer); +void usbd_transfer_clear_stall(struct usb_xfer *xfer); +void usbd_transfer_drain(struct usb_xfer *xfer); +uint8_t usbd_transfer_pending(struct usb_xfer *xfer); +void usbd_transfer_start(struct usb_xfer *xfer); +void usbd_transfer_stop(struct usb_xfer *xfer); +void usbd_transfer_unsetup(struct usb_xfer **pxfer, uint16_t n_setup); +void usbd_transfer_poll(struct usb_xfer **ppxfer, uint16_t max); +void usbd_set_parent_iface(struct usb_device *udev, uint8_t iface_index, + uint8_t parent_index); +uint8_t usbd_get_bus_index(struct usb_device *udev); +uint8_t usbd_get_device_index(struct usb_device *udev); +void usbd_set_power_mode(struct usb_device *udev, uint8_t power_mode); +uint8_t usbd_filter_power_mode(struct usb_device *udev, uint8_t power_mode); +uint8_t usbd_device_attached(struct usb_device *udev); + +usb_frlength_t + usbd_xfer_old_frame_length(struct usb_xfer *xfer, usb_frcount_t frindex); +void usbd_xfer_status(struct usb_xfer *xfer, int *actlen, int *sumlen, + int *aframes, int *nframes); +struct usb_page_cache *usbd_xfer_get_frame(struct usb_xfer *, usb_frcount_t); +void *usbd_xfer_get_frame_buffer(struct usb_xfer *, usb_frcount_t); +void *usbd_xfer_softc(struct usb_xfer *xfer); +void *usbd_xfer_get_priv(struct usb_xfer *xfer); +void usbd_xfer_set_priv(struct usb_xfer *xfer, void *); +void usbd_xfer_set_interval(struct usb_xfer *xfer, int); +uint8_t usbd_xfer_state(struct usb_xfer *xfer); +void usbd_xfer_set_frame_data(struct usb_xfer *xfer, usb_frcount_t frindex, + void *ptr, usb_frlength_t len); +void usbd_xfer_frame_data(struct usb_xfer *xfer, usb_frcount_t frindex, + void **ptr, int *len); +void usbd_xfer_set_frame_offset(struct usb_xfer *xfer, usb_frlength_t offset, + usb_frcount_t frindex); +usb_frlength_t usbd_xfer_max_len(struct usb_xfer *xfer); +usb_frlength_t usbd_xfer_max_framelen(struct usb_xfer *xfer); +usb_frcount_t usbd_xfer_max_frames(struct usb_xfer *xfer); +uint8_t usbd_xfer_get_fps_shift(struct usb_xfer *xfer); +usb_frlength_t usbd_xfer_frame_len(struct usb_xfer *xfer, + usb_frcount_t frindex); +void usbd_xfer_set_frame_len(struct usb_xfer *xfer, usb_frcount_t frindex, + usb_frlength_t len); +void usbd_xfer_set_timeout(struct usb_xfer *xfer, int timeout); +void usbd_xfer_set_frames(struct usb_xfer *xfer, usb_frcount_t n); +void usbd_xfer_set_stall(struct usb_xfer *xfer); +int usbd_xfer_is_stalled(struct usb_xfer *xfer); +void usbd_xfer_set_flag(struct usb_xfer *xfer, int flag); +void usbd_xfer_clr_flag(struct usb_xfer *xfer, int flag); +uint16_t usbd_xfer_get_timestamp(struct usb_xfer *xfer); +uint8_t usbd_xfer_maxp_was_clamped(struct usb_xfer *xfer); + +void usbd_copy_in(struct usb_page_cache *cache, usb_frlength_t offset, + const void *ptr, usb_frlength_t len); +int 
usbd_copy_in_user(struct usb_page_cache *cache, usb_frlength_t offset, + const void *ptr, usb_frlength_t len); +void usbd_copy_out(struct usb_page_cache *cache, usb_frlength_t offset, + void *ptr, usb_frlength_t len); +int usbd_copy_out_user(struct usb_page_cache *cache, usb_frlength_t offset, + void *ptr, usb_frlength_t len); +void usbd_get_page(struct usb_page_cache *pc, usb_frlength_t offset, + struct usb_page_search *res); +void usbd_m_copy_in(struct usb_page_cache *cache, usb_frlength_t dst_offset, + struct mbuf *m, usb_size_t src_offset, usb_frlength_t src_len); +void usbd_frame_zero(struct usb_page_cache *cache, usb_frlength_t offset, + usb_frlength_t len); +void usbd_start_re_enumerate(struct usb_device *udev); +usb_error_t + usbd_start_set_config(struct usb_device *, uint8_t); + +int usb_fifo_attach(struct usb_device *udev, void *priv_sc, + struct mtx *priv_mtx, struct usb_fifo_methods *pm, + struct usb_fifo_sc *f_sc, uint16_t unit, int16_t subunit, + uint8_t iface_index, uid_t uid, gid_t gid, int mode); +void usb_fifo_detach(struct usb_fifo_sc *f_sc); +int usb_fifo_alloc_buffer(struct usb_fifo *f, uint32_t bufsize, + uint16_t nbuf); +void usb_fifo_free_buffer(struct usb_fifo *f); +uint32_t usb_fifo_put_bytes_max(struct usb_fifo *fifo); +void usb_fifo_put_data(struct usb_fifo *fifo, struct usb_page_cache *pc, + usb_frlength_t offset, usb_frlength_t len, uint8_t what); +void usb_fifo_put_data_linear(struct usb_fifo *fifo, void *ptr, + usb_size_t len, uint8_t what); +uint8_t usb_fifo_put_data_buffer(struct usb_fifo *f, void *ptr, usb_size_t len); +void usb_fifo_put_data_error(struct usb_fifo *fifo); +uint8_t usb_fifo_get_data(struct usb_fifo *fifo, struct usb_page_cache *pc, + usb_frlength_t offset, usb_frlength_t len, usb_frlength_t *actlen, + uint8_t what); +uint8_t usb_fifo_get_data_linear(struct usb_fifo *fifo, void *ptr, + usb_size_t len, usb_size_t *actlen, uint8_t what); +uint8_t usb_fifo_get_data_buffer(struct usb_fifo *f, void **pptr, + usb_size_t *plen); +void usb_fifo_reset(struct usb_fifo *f); +void usb_fifo_wakeup(struct usb_fifo *f); +void usb_fifo_get_data_error(struct usb_fifo *fifo); +void *usb_fifo_softc(struct usb_fifo *fifo); +void usb_fifo_set_close_zlp(struct usb_fifo *, uint8_t); +void usb_fifo_set_write_defrag(struct usb_fifo *, uint8_t); +void usb_fifo_free(struct usb_fifo *f); +#endif /* _KERNEL */ +#endif /* _USB_USBDI_H_ */ diff --git a/usr/contrib/freebsd/isa/rtc.h b/usr/contrib/freebsd/isa/rtc.h new file mode 100644 index 0000000000..bb964ddf6a --- /dev/null +++ b/usr/contrib/freebsd/isa/rtc.h @@ -0,0 +1,125 @@ +/*- + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)rtc.h 7.1 (Berkeley) 5/12/91 + * $FreeBSD$ + */ + +#ifndef _I386_ISA_RTC_H_ +#define _I386_ISA_RTC_H_ 1 + +/* + * MC146818 RTC Register locations + */ + +#define RTC_SEC 0x00 /* seconds */ +#define RTC_SECALRM 0x01 /* seconds alarm */ +#define RTC_MIN 0x02 /* minutes */ +#define RTC_MINALRM 0x03 /* minutes alarm */ +#define RTC_HRS 0x04 /* hours */ +#define RTC_HRSALRM 0x05 /* hours alarm */ +#define RTC_WDAY 0x06 /* week day */ +#define RTC_DAY 0x07 /* day of month */ +#define RTC_MONTH 0x08 /* month of year */ +#define RTC_YEAR 0x09 /* month of year */ + +#define RTC_STATUSA 0x0a /* status register A */ +#define RTCSA_TUP 0x80 /* time update, don't look now */ +#define RTCSA_RESET 0x70 /* reset divider */ +#define RTCSA_DIVIDER 0x20 /* divider correct for 32768 Hz */ +#define RTCSA_8192 0x03 /* 8192 Hz interrupt */ +#define RTCSA_4096 0x04 +#define RTCSA_2048 0x05 +#define RTCSA_1024 0x06 /* default for profiling */ +#define RTCSA_PROF RTCSA_1024 +#define RTC_PROFRATE 1024 +#define RTCSA_512 0x07 +#define RTCSA_256 0x08 +#define RTCSA_128 0x09 +#define RTCSA_NOPROF RTCSA_128 +#define RTC_NOPROFRATE 128 +#define RTCSA_64 0x0a +#define RTCSA_32 0x0b /* 32 Hz interrupt */ + +#define RTC_STATUSB 0x0b /* status register B */ +#define RTCSB_DST 0x01 /* USA Daylight Savings Time enable */ +#define RTCSB_24HR 0x02 /* 0 = 12 hours, 1 = 24 hours */ +#define RTCSB_BCD 0x04 /* 0 = BCD, 1 = Binary coded time */ +#define RTCSB_SQWE 0x08 /* 1 = output sqare wave at SQW pin */ +#define RTCSB_UINTR 0x10 /* 1 = enable update-ended interrupt */ +#define RTCSB_AINTR 0x20 /* 1 = enable alarm interrupt */ +#define RTCSB_PINTR 0x40 /* 1 = enable periodic clock interrupt */ +#define RTCSB_HALT 0x80 /* stop clock updates */ + +#define RTC_INTR 0x0c /* status register C (R) interrupt source */ +#define RTCIR_UPDATE 0x10 /* update intr */ +#define RTCIR_ALARM 0x20 /* alarm intr */ +#define RTCIR_PERIOD 0x40 /* periodic intr */ +#define RTCIR_INT 0x80 /* interrupt output signal */ + +#define RTC_STATUSD 0x0d /* status register D (R) Lost Power */ +#define RTCSD_PWR 0x80 /* clock power OK */ + +#define RTC_DIAG 0x0e /* status register E - bios diagnostic */ +#define RTCDG_BITS "\020\010clock_battery\007ROM_cksum\006config_unit\005memory_size\004fixed_disk\003invalid_time" + +#define RTC_RESET 0x0f /* status register F - reset code byte */ +#define RTCRS_RST 0x00 /* normal reset */ +#define RTCRS_LOAD 0x04 /* load system */ + +#define RTC_FDISKETTE 0x10 /* diskette drive type in upper/lower nibble */ +#define RTCFDT_NONE 0 /* none present */ +#define RTCFDT_360K 0x10 /* 360K */ +#define RTCFDT_12M 0x20 /* 1.2M */ +#define RTCFDT_720K 0x30 /* 720K */ +#define RTCFDT_144M 0x40 /* 1.44M */ +#define RTCFDT_288M_1 0x50 /* 2.88M, 
some BIOSes */ +#define RTCFDT_288M 0x60 /* 2.88M */ + +#define RTC_BASELO 0x15 /* low byte of basemem size */ +#define RTC_BASEHI 0x16 /* high byte of basemem size */ +#define RTC_EXTLO 0x17 /* low byte of extended mem size */ +#define RTC_EXTHI 0x18 /* low byte of extended mem size */ + +#define RTC_CENTURY 0x32 /* current century */ + +#ifdef __FreeBSD__ +#ifdef _KERNEL +extern struct mtx clock_lock; +extern int atrtcclock_disable; +int rtcin(int reg); +void atrtc_restore(void); +void writertc(int reg, u_char val); +void atrtc_set(struct timespec *ts); +#endif +#endif + +#endif /* _I386_ISA_RTC_H_ */ diff --git a/usr/contrib/freebsd/lib/libutil/humanize_number.c b/usr/contrib/freebsd/lib/libutil/humanize_number.c new file mode 100644 index 0000000000..675a969aaa --- /dev/null +++ b/usr/contrib/freebsd/lib/libutil/humanize_number.c @@ -0,0 +1,179 @@ +/* $NetBSD: humanize_number.c,v 1.14 2008/04/28 20:22:59 martin Exp $ */ + +/* + * Copyright (c) 1997, 1998, 1999, 2002 The NetBSD Foundation, Inc. + * Copyright 2013 John-Mark Gurney + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, + * NASA Ames Research Center, by Luke Mewburn and by Tomas Svensson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include + +static const int maxscale = 6; + +int +humanize_number(char *buf, size_t len, int64_t quotient, + const char *suffix, int scale, int flags) +{ + const char *prefixes, *sep; + int i, r, remainder, s1, s2, sign; + int divisordeccut; + int64_t divisor, max; + size_t baselen; + + /* Since so many callers don't check -1, NUL terminate the buffer */ + if (len > 0) + buf[0] = '\0'; + + /* validate args */ + if (buf == NULL || suffix == NULL) + return (-1); + if (scale < 0) + return (-1); + else if (scale > maxscale && + ((scale & ~(HN_AUTOSCALE|HN_GETSCALE)) != 0)) + return (-1); + if ((flags & HN_DIVISOR_1000) && (flags & HN_IEC_PREFIXES)) + return (-1); + + /* setup parameters */ + remainder = 0; + + if (flags & HN_IEC_PREFIXES) { + baselen = 2; + /* + * Use the prefixes for power of two recommended by + * the International Electrotechnical Commission + * (IEC) in IEC 80000-3 (i.e. Ki, Mi, Gi...). + * + * HN_IEC_PREFIXES implies a divisor of 1024 here + * (use of HN_DIVISOR_1000 would have triggered + * an assertion earlier). + */ + divisor = 1024; + divisordeccut = 973; /* ceil(.95 * 1024) */ + if (flags & HN_B) + prefixes = "B\0\0Ki\0Mi\0Gi\0Ti\0Pi\0Ei"; + else + prefixes = "\0\0\0Ki\0Mi\0Gi\0Ti\0Pi\0Ei"; + } else { + baselen = 1; + if (flags & HN_DIVISOR_1000) { + divisor = 1000; + divisordeccut = 950; + if (flags & HN_B) + prefixes = "B\0\0k\0\0M\0\0G\0\0T\0\0P\0\0E"; + else + prefixes = "\0\0\0k\0\0M\0\0G\0\0T\0\0P\0\0E"; + } else { + divisor = 1024; + divisordeccut = 973; /* ceil(.95 * 1024) */ + if (flags & HN_B) + prefixes = "B\0\0K\0\0M\0\0G\0\0T\0\0P\0\0E"; + else + prefixes = "\0\0\0K\0\0M\0\0G\0\0T\0\0P\0\0E"; + } + } + +#define SCALE2PREFIX(scale) (&prefixes[(scale) * 3]) + + if (quotient < 0) { + sign = -1; + quotient = -quotient; + baselen += 2; /* sign, digit */ + } else { + sign = 1; + baselen += 1; /* digit */ + } + if (flags & HN_NOSPACE) + sep = ""; + else { + sep = " "; + baselen++; + } + baselen += strlen(suffix); + + /* Check if enough room for `x y' + suffix + `\0' */ + if (len < baselen + 1) + return (-1); + + if (scale & (HN_AUTOSCALE | HN_GETSCALE)) { + /* See if there is additional columns can be used. */ + for (max = 1, i = len - baselen; i-- > 0;) + max *= 10; + + /* + * Divide the number until it fits the given column. + * If there will be an overflow by the rounding below, + * divide once more. + */ + for (i = 0; + (quotient >= max || (quotient == max - 1 && + remainder >= divisordeccut)) && i < maxscale; i++) { + remainder = quotient % divisor; + quotient /= divisor; + } + + if (scale & HN_GETSCALE) + return (i); + } else { + for (i = 0; i < scale && i < maxscale; i++) { + remainder = quotient % divisor; + quotient /= divisor; + } + } + + /* If a value <= 9.9 after rounding and ... */ + /* + * XXX - should we make sure there is enough space for the decimal + * place and if not, don't do HN_DECIMAL? 
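+	 * Worked example, assuming a "." locale decimal point:
+	 * humanize_number(buf, 5, 1536, "", HN_AUTOSCALE,
+	 * HN_DECIMAL | HN_NOSPACE) divides 1536 by 1024 once,
+	 * leaving quotient == 1 and remainder == 512, so s1 == 1,
+	 * s2 == 5 and the buffer receives "1.5K".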
+ */ + if (((quotient == 9 && remainder < divisordeccut) || quotient < 9) && + i > 0 && flags & HN_DECIMAL) { + s1 = (int)quotient + ((remainder * 10 + divisor / 2) / + divisor / 10); + s2 = ((remainder * 10 + divisor / 2) / divisor) % 10; + r = snprintf(buf, len, "%d%s%d%s%s%s", + sign * s1, localeconv()->decimal_point, s2, + sep, SCALE2PREFIX(i), suffix); + } else + r = snprintf(buf, len, "%" PRId64 "%s%s%s", + sign * (quotient + (remainder + divisor / 2) / divisor), + sep, SCALE2PREFIX(i), suffix); + + return (r); +} diff --git a/usr/contrib/freebsd/sys/ata.h b/usr/contrib/freebsd/sys/ata.h index 705460355f..223bd7b3eb 100644 --- a/usr/contrib/freebsd/sys/ata.h +++ b/usr/contrib/freebsd/sys/ata.h @@ -23,7 +23,7 @@ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - * $FreeBSD: head/sys/sys/ata.h 264853 2014-04-24 01:28:14Z smh $ + * $FreeBSD$ */ #ifndef _SYS_ATA_H_ @@ -105,6 +105,10 @@ struct ata_params { /*069*/ u_int16_t support3; #define ATA_SUPPORT_RZAT 0x0020 #define ATA_SUPPORT_DRAT 0x4000 +#define ATA_SUPPORT_ZONE_MASK 0x0003 +#define ATA_SUPPORT_ZONE_NR 0x0000 +#define ATA_SUPPORT_ZONE_HOST_AWARE 0x0001 +#define ATA_SUPPORT_ZONE_DEV_MANAGED 0x0002 u_int16_t reserved70; /*071*/ u_int16_t rlsovlap; /* rel time (us) for overlap */ /*072*/ u_int16_t rlsservice; /* rel time (us) for service */ @@ -228,7 +232,14 @@ struct ata_params { #define ATA_SUPPORT_RWLOGDMAEXT 0x0008 #define ATA_SUPPORT_MICROCODE3 0x0010 #define ATA_SUPPORT_FREEFALL 0x0020 +#define ATA_SUPPORT_SENSE_REPORT 0x0040 +#define ATA_SUPPORT_EPC 0x0080 /*120*/ u_int16_t enabled2; +#define ATA_ENABLED_WRITEREADVERIFY 0x0002 +#define ATA_ENABLED_WRITEUNCORREXT 0x0004 +#define ATA_ENABLED_FREEFALL 0x0020 +#define ATA_ENABLED_SENSE_REPORT 0x0040 +#define ATA_ENABLED_EPC 0x0080 u_int16_t reserved121[6]; /*127*/ u_int16_t removable_status; /*128*/ u_int16_t security_status; @@ -252,7 +263,7 @@ struct ata_params { u_int16_t reserved170[6]; /*176*/ u_int8_t media_serial[60]; /*206*/ u_int16_t sct; - u_int16_t reserved206[2]; + u_int16_t reserved207[2]; /*209*/ u_int16_t lsalign; /*210*/ u_int16_t wrv_sectors_m3_1; u_int16_t wrv_sectors_m3_2; @@ -298,8 +309,14 @@ struct ata_params { #define ATA_MAX_28BIT_LBA 268435455UL /* ATA Status Register */ -#define ATA_STATUS_ERROR 0x01 -#define ATA_STATUS_DEVICE_FAULT 0x20 +#define ATA_STATUS_ERROR 0x01 +#define ATA_STATUS_SENSE_AVAIL 0x02 +#define ATA_STATUS_ALIGN_ERR 0x04 +#define ATA_STATUS_DATA_REQ 0x08 +#define ATA_STATUS_DEF_WRITE_ERR 0x10 +#define ATA_STATUS_DEVICE_FAULT 0x20 +#define ATA_STATUS_DEVICE_READY 0x40 +#define ATA_STATUS_BUSY 0x80 /* ATA Error Register */ #define ATA_ERROR_ABORT 0x04 @@ -335,6 +352,7 @@ struct ata_params { #define ATA_UDMA6 0x46 #define ATA_SA150 0x47 #define ATA_SA300 0x48 +#define ATA_SA600 0x49 #define ATA_DMA_MAX 0x4f @@ -367,13 +385,36 @@ struct ata_params { #define ATA_WRITE_LOG_EXT 0x3f #define ATA_READ_VERIFY 0x40 #define ATA_READ_VERIFY48 0x42 +#define ATA_WRITE_UNCORRECTABLE48 0x45 /* write uncorrectable 48bit LBA */ +#define ATA_WU_PSEUDO 0x55 /* pseudo-uncorrectable error */ +#define ATA_WU_FLAGGED 0xaa /* flagged-uncorrectable error */ #define ATA_READ_LOG_DMA_EXT 0x47 /* read log DMA ext - PIO Data-In */ +#define ATA_ZAC_MANAGEMENT_IN 0x4a /* ZAC management in */ +#define ATA_ZM_REPORT_ZONES 0x00 /* report zones */ #define ATA_READ_FPDMA_QUEUED 0x60 /* read DMA NCQ */ #define ATA_WRITE_FPDMA_QUEUED 0x61 /* write DMA NCQ */ +#define 
ATA_NCQ_NON_DATA 0x63 /* NCQ non-data command */ +#define ATA_ABORT_NCQ_QUEUE 0x00 /* abort NCQ queue */ +#define ATA_DEADLINE_HANDLING 0x01 /* deadline handling */ +#define ATA_SET_FEATURES 0x05 /* set features */ +#define ATA_ZERO_EXT 0x06 /* zero ext */ +#define ATA_NCQ_ZAC_MGMT_OUT 0x07 /* NCQ ZAC mgmt out no data */ #define ATA_SEND_FPDMA_QUEUED 0x64 /* send DMA NCQ */ -#define ATA_RECV_FPDMA_QUEUED 0x65 /* recieve DMA NCQ */ +#define ATA_SFPDMA_DSM 0x00 /* Data set management */ +#define ATA_SFPDMA_DSM_TRIM 0x01 /* Set trim bit in auxiliary */ +#define ATA_SFPDMA_HYBRID_EVICT 0x01 /* Hybrid Evict */ +#define ATA_SFPDMA_WLDMA 0x02 /* Write Log DMA EXT */ +#define ATA_SFPDMA_ZAC_MGMT_OUT 0x03 /* NCQ ZAC mgmt out w/data */ +#define ATA_RECV_FPDMA_QUEUED 0x65 /* receive DMA NCQ */ +#define ATA_RFPDMA_RL_DMA_EXT 0x00 /* Read Log DMA EXT */ +#define ATA_RFPDMA_ZAC_MGMT_IN 0x02 /* NCQ ZAC mgmt in w/data */ #define ATA_SEP_ATTN 0x67 /* SEP request */ #define ATA_SEEK 0x70 /* seek */ +#define ATA_ZAC_MANAGEMENT_OUT 0x9f /* ZAC management out */ +#define ATA_ZM_CLOSE_ZONE 0x01 /* close zone */ +#define ATA_ZM_FINISH_ZONE 0x02 /* finish zone */ +#define ATA_ZM_OPEN_ZONE 0x03 /* open zone */ +#define ATA_ZM_RWP 0x04 /* reset write pointer */ #define ATA_PACKET_CMD 0xa0 /* packet command */ #define ATA_ATAPI_IDENTIFY 0xa1 /* get ATAPI params*/ #define ATA_SERVICE 0xa2 /* service command */ @@ -393,24 +434,36 @@ struct ata_params { #define ATA_IDLE_CMD 0xe3 /* idle */ #define ATA_READ_BUFFER 0xe4 /* read buffer */ #define ATA_READ_PM 0xe4 /* read portmultiplier */ +#define ATA_CHECK_POWER_MODE 0xe5 /* device power mode */ #define ATA_SLEEP 0xe6 /* sleep */ #define ATA_FLUSHCACHE 0xe7 /* flush cache to disk */ #define ATA_WRITE_PM 0xe8 /* write portmultiplier */ #define ATA_FLUSHCACHE48 0xea /* flush cache to disk */ #define ATA_ATA_IDENTIFY 0xec /* get ATA params */ #define ATA_SETFEATURES 0xef /* features command */ -#define ATA_SF_SETXFER 0x03 /* set transfer mode */ #define ATA_SF_ENAB_WCACHE 0x02 /* enable write cache */ #define ATA_SF_DIS_WCACHE 0x82 /* disable write cache */ +#define ATA_SF_SETXFER 0x03 /* set transfer mode */ +#define ATA_SF_APM 0x05 /* Enable APM feature set */ #define ATA_SF_ENAB_PUIS 0x06 /* enable PUIS */ #define ATA_SF_DIS_PUIS 0x86 /* disable PUIS */ #define ATA_SF_PUIS_SPINUP 0x07 /* PUIS spin-up */ +#define ATA_SF_WRV 0x0b /* Enable Write-Read-Verify */ +#define ATA_SF_DLC 0x0c /* Enable device life control */ +#define ATA_SF_SATA 0x10 /* Enable use of SATA feature */ +#define ATA_SF_FFC 0x41 /* Free-fall Control */ +#define ATA_SF_MHIST 0x43 /* Set Max Host Sect. 
Times */ +#define ATA_SF_RATE 0x45 /* Set Rate Basis */ +#define ATA_SF_EPC 0x4A /* Extended Power Conditions */ #define ATA_SF_ENAB_RCACHE 0xaa /* enable readahead cache */ #define ATA_SF_DIS_RCACHE 0x55 /* disable readahead cache */ #define ATA_SF_ENAB_RELIRQ 0x5d /* enable release interrupt */ #define ATA_SF_DIS_RELIRQ 0xdd /* disable release interrupt */ #define ATA_SF_ENAB_SRVIRQ 0x5e /* enable service interrupt */ #define ATA_SF_DIS_SRVIRQ 0xde /* disable service interrupt */ +#define ATA_SF_LPSAERC 0x62 /* Long Phys Sect Align ErrRep*/ +#define ATA_SF_DSN 0x63 /* Device Stats Notification */ +#define ATA_CHECK_POWER_MODE 0xe5 /* Check Power Mode */ #define ATA_SECURITY_SET_PASSWORD 0xf1 /* set drive password */ #define ATA_SECURITY_UNLOCK 0xf2 /* unlock drive using passwd */ #define ATA_SECURITY_ERASE_PREPARE 0xf3 /* prepare to erase drive */ @@ -537,6 +590,333 @@ struct atapi_sense { u_int8_t specific2; /* sense key specific */ } __packed; +/* + * SET FEATURES subcommands + */ + +/* + * SET FEATURES command + * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A) + * These values go in the LBA 3:0. + */ +#define ATA_SF_EPC_RESTORE 0x00 /* Restore Power Condition Settings */ +#define ATA_SF_EPC_GOTO 0x01 /* Go To Power Condition */ +#define ATA_SF_EPC_SET_TIMER 0x02 /* Set Power Condition Timer */ +#define ATA_SF_EPC_SET_STATE 0x03 /* Set Power Condition State */ +#define ATA_SF_EPC_ENABLE 0x04 /* Enable the EPC feature set */ +#define ATA_SF_EPC_DISABLE 0x05 /* Disable the EPC feature set */ +#define ATA_SF_EPC_SET_SOURCE 0x06 /* Set EPC Power Source */ + +/* + * SET FEATURES command + * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A) + * Power Condition ID field + * These values go in the count register. + */ +#define ATA_EPC_STANDBY_Z 0x00 /* Substate of PM2:Standby */ +#define ATA_EPC_STANDBY_Y 0x01 /* Substate of PM2:Standby */ +#define ATA_EPC_IDLE_A 0x81 /* Substate of PM1:Idle */ +#define ATA_EPC_IDLE_B 0x82 /* Substate of PM1:Idle */ +#define ATA_EPC_IDLE_C 0x83 /* Substate of PM1:Idle */ +#define ATA_EPC_ALL 0xff /* All supported power conditions */ + +/* + * SET FEATURES command + * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A) + * Restore Power Conditions Settings subcommand + * These values go in the LBA register. + */ +#define ATA_SF_EPC_RST_DFLT 0x40 /* 1=Rst from Default, 0= from Saved */ +#define ATA_SF_EPC_RST_SAVE 0x10 /* 1=Save on completion */ + +/* + * SET FEATURES command + * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A) + * Got To Power Condition subcommand + * These values go in the LBA register. + */ +#define ATA_SF_EPC_GOTO_DELAY 0x02000000 /* Delayed entry bit */ +#define ATA_SF_EPC_GOTO_HOLD 0x01000000 /* Hold Power Cond bit */ + +/* + * SET FEATURES command + * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A) + * Set Power Condition Timer subcommand + * These values go in the LBA register. + */ +#define ATA_SF_EPC_TIMER_MASK 0x00ffff00 /* Timer field */ +#define ATA_SF_EPC_TIMER_SHIFT 8 +#define ATA_SF_EPC_TIMER_SEC 0x00000080 /* Timer units, 1=sec, 0=.1s */ +#define ATA_SF_EPC_TIMER_EN 0x00000020 /* Enable/disable cond. */ +#define ATA_SF_EPC_TIMER_SAVE 0x00000010 /* Save settings on comp. */ + +/* + * SET FEATURES command + * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A) + * Set Power Condition State subcommand + * These values go in the LBA register. + */ +#define ATA_SF_EPC_SETCON_EN 0x00000020 /* Enable power cond. 
*/
+#define	ATA_SF_EPC_SETCON_SAVE	0x00000010	/* Save settings on comp */
+
+/*
+ * SET FEATURES command
+ * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A)
+ * Set EPC Power Source subcommand
+ * These values go in the count register.
+ */
+#define	ATA_SF_EPC_SRC_UNKNOWN	0x0000	/* Unknown source */
+#define	ATA_SF_EPC_SRC_BAT	0x0001	/* battery source */
+#define	ATA_SF_EPC_SRC_NOT_BAT	0x0002	/* not battery source */
+
+#define	ATA_LOG_DIRECTORY	0x00	/* Directory of all logs */
+#define	ATA_POWER_COND_LOG	0x08	/* Power Conditions Log */
+#define	ATA_PCL_IDLE		0x00	/* Idle Power Conditions Page */
+#define	ATA_PCL_STANDBY		0x01	/* Standby Power Conditions Page */
+#define	ATA_IDENTIFY_DATA_LOG	0x30	/* Identify Device Data Log */
+#define	ATA_IDL_PAGE_LIST	0x00	/* List of supported pages */
+#define	ATA_IDL_IDENTIFY_DATA	0x01	/* Copy of Identify Device data */
+#define	ATA_IDL_CAPACITY	0x02	/* Capacity */
+#define	ATA_IDL_SUP_CAP		0x03	/* Supported Capabilities */
+#define	ATA_IDL_CUR_SETTINGS	0x04	/* Current Settings */
+#define	ATA_IDL_ATA_STRINGS	0x05	/* ATA Strings */
+#define	ATA_IDL_SECURITY	0x06	/* Security */
+#define	ATA_IDL_PARALLEL_ATA	0x07	/* Parallel ATA */
+#define	ATA_IDL_SERIAL_ATA	0x08	/* Serial ATA */
+#define	ATA_IDL_ZDI		0x09	/* Zoned Device Information */
+
+struct ata_gp_log_dir {
+	uint8_t header[2];
+#define	ATA_GP_LOG_DIR_VERSION		0x0001
+	uint8_t num_pages[255*2];	/* Number of log pages at address */
+};
+
+/*
+ * ATA Power Conditions log descriptor
+ */
+struct ata_power_cond_log_desc {
+	uint8_t reserved1;
+	uint8_t flags;
+#define	ATA_PCL_COND_SUPPORTED		0x80
+#define	ATA_PCL_COND_SAVEABLE		0x40
+#define	ATA_PCL_COND_CHANGEABLE		0x20
+#define	ATA_PCL_DEFAULT_TIMER_EN	0x10
+#define	ATA_PCL_SAVED_TIMER_EN		0x08
+#define	ATA_PCL_CURRENT_TIMER_EN	0x04
+#define	ATA_PCL_HOLD_PC_NOT_SUP		0x02
+	uint8_t reserved2[2];
+	uint8_t default_timer[4];
+	uint8_t saved_timer[4];
+	uint8_t current_timer[4];
+	uint8_t nom_time_to_active[4];
+	uint8_t min_timer[4];
+	uint8_t max_timer[4];
+	uint8_t num_transitions_to_pc[4];
+	uint8_t hours_in_pc[4];
+	uint8_t reserved3[28];
+};
+
+/*
+ * ATA Power Conditions Log (0x08), Idle power conditions page (0x00)
+ */
+struct ata_power_cond_log_idle {
+	struct ata_power_cond_log_desc idle_a_desc;
+	struct ata_power_cond_log_desc idle_b_desc;
+	struct ata_power_cond_log_desc idle_c_desc;
+	uint8_t reserved[320];
+};
+
+/*
+ * ATA Power Conditions Log (0x08), Standby power conditions page (0x01)
+ */
+struct ata_power_cond_log_standby {
+	uint8_t reserved[384];
+	struct ata_power_cond_log_desc standby_y_desc;
+	struct ata_power_cond_log_desc standby_z_desc;
+};
+
+/*
+ * ATA IDENTIFY DEVICE data log (0x30) page 0x00
+ * List of Supported IDENTIFY DEVICE data pages.
+ */
+struct ata_identify_log_pages {
+	uint8_t header[8];
+#define	ATA_IDLOG_REVISION	0x0000000000000001
+	uint8_t entry_count;
+	uint8_t entries[503];
+};
+
+/*
+ * ATA IDENTIFY DEVICE data log (0x30)
+ * Capacity (Page 0x02).
+ */ +struct ata_identify_log_capacity { + uint8_t header[8]; +#define ATA_CAP_HEADER_VALID 0x8000000000000000 +#define ATA_CAP_PAGE_NUM_MASK 0x0000000000ff0000 +#define ATA_CAP_PAGE_NUM_SHIFT 16 +#define ATA_CAP_REV_MASK 0x00000000000000ff + uint8_t capacity[8]; +#define ATA_CAP_CAPACITY_VALID 0x8000000000000000 +#define ATA_CAP_ACCESSIBLE_CAP 0x0000ffffffffffff + uint8_t phys_logical_sect_size[8]; +#define ATA_CAP_PL_VALID 0x8000000000000000 +#define ATA_CAP_LTOP_REL_SUP 0x4000000000000000 +#define ATA_CAP_LOG_SECT_SUP 0x2000000000000000 +#define ATA_CAP_ALIGN_ERR_MASK 0x0000000000300000 +#define ATA_CAP_LTOP_MASK 0x00000000000f0000 +#define ATA_CAP_LOG_SECT_OFF 0x000000000000ffff + uint8_t logical_sect_size[8]; +#define ATA_CAP_LOG_SECT_VALID 0x8000000000000000 +#define ATA_CAP_LOG_SECT_SIZE 0x00000000ffffffff + uint8_t nominal_buffer_size[8]; +#define ATA_CAP_NOM_BUF_VALID 0x8000000000000000 +#define ATA_CAP_NOM_BUF_SIZE 0x7fffffffffffffff + uint8_t reserved[472]; +}; + +/* + * ATA IDENTIFY DEVICE data log (0x30) + * Supported Capabilities (Page 0x03). + */ + +struct ata_identify_log_sup_cap { + uint8_t header[8]; +#define ATA_SUP_CAP_HEADER_VALID 0x8000000000000000 +#define ATA_SUP_CAP_PAGE_NUM_MASK 0x0000000000ff0000 +#define ATA_SUP_CAP_PAGE_NUM_SHIFT 16 +#define ATA_SUP_CAP_REV_MASK 0x00000000000000ff + uint8_t sup_cap[8]; +#define ATA_SUP_CAP_VALID 0x8000000000000000 +#define ATA_SC_SET_SECT_CONFIG_SUP 0x0002000000000000 /* Set Sect Conf*/ +#define ATA_SC_ZERO_EXT_SUP 0x0001000000000000 /* Zero EXT */ +#define ATA_SC_SUCC_NCQ_SENSE_SUP 0x0000800000000000 /* Succ. NCQ Sns */ +#define ATA_SC_DLC_SUP 0x0000400000000000 /* DLC */ +#define ATA_SC_RQSN_DEV_FAULT_SUP 0x0000200000000000 /* Req Sns Dev Flt*/ +#define ATA_SC_DSN_SUP 0x0000100000000000 /* DSN */ +#define ATA_SC_LP_STANDBY_SUP 0x0000080000000000 /* LP Standby */ +#define ATA_SC_SET_EPC_PS_SUP 0x0000040000000000 /* Set EPC PS */ +#define ATA_SC_AMAX_ADDR_SUP 0x0000020000000000 /* AMAX Addr */ +#define ATA_SC_DRAT_SUP 0x0000008000000000 /* DRAT */ +#define ATA_SC_LPS_MISALGN_SUP 0x0000004000000000 /* LPS Misalign */ +#define ATA_SC_RB_DMA_SUP 0x0000001000000000 /* Read Buf DMA */ +#define ATA_SC_WB_DMA_SUP 0x0000000800000000 /* Write Buf DMA */ +#define ATA_SC_DNLD_MC_DMA_SUP 0x0000000200000000 /* DL MCode DMA */ +#define ATA_SC_28BIT_SUP 0x0000000100000000 /* 28-bit */ +#define ATA_SC_RZAT_SUP 0x0000000080000000 /* RZAT */ +#define ATA_SC_NOP_SUP 0x0000000020000000 /* NOP */ +#define ATA_SC_READ_BUFFER_SUP 0x0000000010000000 /* Read Buffer */ +#define ATA_SC_WRITE_BUFFER_SUP 0x0000000008000000 /* Write Buffer */ +#define ATA_SC_READ_LOOK_AHEAD_SUP 0x0000000002000000 /* Read Look-Ahead*/ +#define ATA_SC_VOLATILE_WC_SUP 0x0000000001000000 /* Volatile WC */ +#define ATA_SC_SMART_SUP 0x0000000000800000 /* SMART */ +#define ATA_SC_FLUSH_CACHE_EXT_SUP 0x0000000000400000 /* Flush Cache Ext */ +#define ATA_SC_48BIT_SUP 0x0000000000100000 /* 48-Bit */ +#define ATA_SC_SPINUP_SUP 0x0000000000040000 /* Spin-Up */ +#define ATA_SC_PUIS_SUP 0x0000000000020000 /* PUIS */ +#define ATA_SC_APM_SUP 0x0000000000010000 /* APM */ +#define ATA_SC_DL_MICROCODE_SUP 0x0000000000004000 /* DL Microcode */ +#define ATA_SC_UNLOAD_SUP 0x0000000000002000 /* Unload */ +#define ATA_SC_WRITE_FUA_EXT_SUP 0x0000000000001000 /* Write FUA EXT */ +#define ATA_SC_GPL_SUP 0x0000000000000800 /* GPL */ +#define ATA_SC_STREAMING_SUP 0x0000000000000400 /* Streaming */ +#define ATA_SC_SMART_SELFTEST_SUP 0x0000000000000100 /* SMART self-test */ +#define 
ATA_SC_SMART_ERR_LOG_SUP 0x0000000000000080 /* SMART Err Log */ +#define ATA_SC_EPC_SUP 0x0000000000000040 /* EPC */ +#define ATA_SC_SENSE_SUP 0x0000000000000020 /* Sense data */ +#define ATA_SC_FREEFALL_SUP 0x0000000000000010 /* Free-Fall */ +#define ATA_SC_DM_MODE3_SUP 0x0000000000000008 /* DM Mode 3 */ +#define ATA_SC_GPL_DMA_SUP 0x0000000000000004 /* GPL DMA */ +#define ATA_SC_WRITE_UNCOR_SUP 0x0000000000000002 /* Write uncorr. */ +#define ATA_SC_WRV_SUP 0x0000000000000001 /* WRV */ + uint8_t download_code_cap[8]; +#define ATA_DL_CODE_VALID 0x8000000000000000 +#define ATA_DLC_DM_OFFSETS_DEFER_SUP 0x0000000400000000 +#define ATA_DLC_DM_IMMED_SUP 0x0000000200000000 +#define ATA_DLC_DM_OFF_IMMED_SUP 0x0000000100000000 +#define ATA_DLC_DM_MAX_XFER_SIZE_MASK 0x00000000ffff0000 +#define ATA_DLC_DM_MAX_XFER_SIZE_SHIFT 16 +#define ATA_DLC_DM_MIN_XFER_SIZE_MASK 0x000000000000ffff + uint8_t nom_media_rotation_rate[8]; +#define ATA_NOM_MEDIA_ROTATION_VALID 0x8000000000000000 +#define ATA_ROTATION_MASK 0x000000000000ffff + uint8_t form_factor[8]; +#define ATA_FORM_FACTOR_VALID 0x8000000000000000 +#define ATA_FF_MASK 0x000000000000000f +#define ATA_FF_NOT_REPORTED 0x0000000000000000 /* Not reported */ +#define ATA_FF_525_IN 0x0000000000000001 /* 5.25 inch */ +#define ATA_FF_35_IN 0x0000000000000002 /* 3.5 inch */ +#define ATA_FF_25_IN 0x0000000000000003 /* 2.5 inch */ +#define ATA_FF_18_IN 0x0000000000000004 /* 1.8 inch */ +#define ATA_FF_LT_18_IN 0x0000000000000005 /* < 1.8 inch */ +#define ATA_FF_MSATA 0x0000000000000006 /* mSATA */ +#define ATA_FF_M2 0x0000000000000007 /* M.2 */ +#define ATA_FF_MICROSSD 0x0000000000000008 /* MicroSSD */ +#define ATA_FF_CFAST 0x0000000000000009 /* CFast */ + uint8_t wrv_sec_cnt_mode3[8]; +#define ATA_WRV_MODE3_VALID 0x8000000000000000 +#define ATA_WRV_MODE3_COUNT 0x00000000ffffffff + uint8_t wrv_sec_cnt_mode2[8]; +#define ATA_WRV_MODE2_VALID 0x8000000000000000 +#define ATA_WRV_MODE2_COUNT 0x00000000ffffffff + uint8_t wwn[16]; + /* XXX KDM need to figure out how to handle 128-bit fields */ + uint8_t dsm[8]; +#define ATA_DSM_VALID 0x8000000000000000 +#define ATA_LB_MARKUP_SUP 0x000000000000ff00 +#define ATA_TRIM_SUP 0x0000000000000001 + uint8_t util_per_unit_time[16]; + /* XXX KDM need to figure out how to handle 128-bit fields */ + uint8_t util_usage_rate_sup[8]; +#define ATA_UTIL_USAGE_RATE_VALID 0x8000000000000000 +#define ATA_SETTING_RATE_SUP 0x0000000000800000 +#define ATA_SINCE_POWERON_SUP 0x0000000000000100 +#define ATA_POH_RATE_SUP 0x0000000000000010 +#define ATA_DATE_TIME_RATE_SUP 0x0000000000000001 + uint8_t zoned_cap[8]; +#define ATA_ZONED_VALID 0x8000000000000000 +#define ATA_ZONED_MASK 0x0000000000000003 + uint8_t sup_zac_cap[8]; +#define ATA_SUP_ZAC_CAP_VALID 0x8000000000000000 +#define ATA_ND_RWP_SUP 0x0000000000000010 /* Reset Write Ptr*/ +#define ATA_ND_FINISH_ZONE_SUP 0x0000000000000008 /* Finish Zone */ +#define ATA_ND_CLOSE_ZONE_SUP 0x0000000000000004 /* Close Zone */ +#define ATA_ND_OPEN_ZONE_SUP 0x0000000000000002 /* Open Zone */ +#define ATA_REPORT_ZONES_SUP 0x0000000000000001 /* Report Zones */ + uint8_t reserved[392]; +}; + +/* + * ATA Identify Device Data Log Zoned Device Information Page (0x09). + * Current as of ZAC r04a, August 25, 2015. 
+ */ +struct ata_zoned_info_log { + uint8_t header[8]; +#define ATA_ZDI_HEADER_VALID 0x8000000000000000 +#define ATA_ZDI_PAGE_NUM_MASK 0x0000000000ff0000 +#define ATA_ZDI_PAGE_NUM_SHIFT 16 +#define ATA_ZDI_REV_MASK 0x00000000000000ff + uint8_t zoned_cap[8]; +#define ATA_ZDI_CAP_VALID 0x8000000000000000 +#define ATA_ZDI_CAP_URSWRZ 0x0000000000000001 + uint8_t zoned_settings[8]; +#define ATA_ZDI_SETTINGS_VALID 0x8000000000000000 + uint8_t optimal_seq_zones[8]; +#define ATA_ZDI_OPT_SEQ_VALID 0x8000000000000000 +#define ATA_ZDI_OPT_SEQ_MASK 0x00000000ffffffff + uint8_t optimal_nonseq_zones[8]; +#define ATA_ZDI_OPT_NS_VALID 0x8000000000000000 +#define ATA_ZDI_OPT_NS_MASK 0x00000000ffffffff + uint8_t max_seq_req_zones[8]; +#define ATA_ZDI_MAX_SEQ_VALID 0x8000000000000000 +#define ATA_ZDI_MAX_SEQ_MASK 0x00000000ffffffff + uint8_t version_info[8]; +#define ATA_ZDI_VER_VALID 0x8000000000000000 +#define ATA_ZDI_VER_ZAC_SUP 0x0100000000000000 +#define ATA_ZDI_VER_ZAC_MASK 0x00000000000000ff + uint8_t reserved[456]; +}; + struct ata_ioc_request { union { struct { diff --git a/usr/contrib/freebsd/sys/linker_set.h b/usr/contrib/freebsd/sys/linker_set.h deleted file mode 100644 index 393dfbc131..0000000000 --- a/usr/contrib/freebsd/sys/linker_set.h +++ /dev/null @@ -1,119 +0,0 @@ -/*- - * Copyright (c) 1999 John D. Polstra - * Copyright (c) 1999,2001 Peter Wemm - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD: head/sys/sys/linker_set.h 215701 2010-11-22 19:32:54Z dim $ - */ - -#ifndef _SYS_LINKER_SET_H_ -#define _SYS_LINKER_SET_H_ - -#ifdef __FreeBSD__ -#ifndef _SYS_CDEFS_H_ -#error this file needs sys/cdefs.h as a prerequisite -#endif -#else -#ifndef _COMPAT_FREEBSD_SYS_CDEFS_H_ -#error this file needs sys/cdefs.h as a prerequisite -#endif -#endif - -/* - * The following macros are used to declare global sets of objects, which - * are collected by the linker into a `linker_set' as defined below. - * For ELF, this is done by constructing a separate segment for each set. - */ - -/* - * Private macros, not to be used outside this header file. 
- */ -#ifdef __GNUCLIKE___SECTION -#ifdef __FreeBSD__ -#define __MAKE_SET(set, sym) \ - __GLOBL(__CONCAT(__start_set_,set)); \ - __GLOBL(__CONCAT(__stop_set_,set)); \ - static void const * const __set_##set##_sym_##sym \ - __section("set_" #set) __used = &sym -#else -#define __MAKE_SET(set, sym) \ - static void const * const __set_##set##_sym_##sym \ - __section("set_" #set) __used = &sym -#endif -#else /* !__GNUCLIKE___SECTION */ -#ifndef lint -#error this file needs to be ported to your compiler -#endif /* lint */ -#define __MAKE_SET(set, sym) extern void const * const (__set_##set##_sym_##sym) -#endif /* __GNUCLIKE___SECTION */ - -/* - * Public macros. - */ -#define TEXT_SET(set, sym) __MAKE_SET(set, sym) -#define DATA_SET(set, sym) __MAKE_SET(set, sym) -#define BSS_SET(set, sym) __MAKE_SET(set, sym) -#define ABS_SET(set, sym) __MAKE_SET(set, sym) -#define SET_ENTRY(set, sym) __MAKE_SET(set, sym) - -/* - * Initialize before referring to a given linker set. - */ -#ifdef __FreeBSD__ -#define SET_DECLARE(set, ptype) \ - extern ptype *__CONCAT(__start_set_,set); \ - extern ptype *__CONCAT(__stop_set_,set) -#else -#define SET_DECLARE(set, ptype) \ - _Pragma(__XSTRING(weak __CONCAT(__start_set_,set))) \ - _Pragma(__XSTRING(weak __CONCAT(__stop_set_,set))) \ - extern ptype *__CONCAT(__start_set_,set); \ - extern ptype *__CONCAT(__stop_set_,set) -#endif - -#define SET_BEGIN(set) \ - (&__CONCAT(__start_set_,set)) -#define SET_LIMIT(set) \ - (&__CONCAT(__stop_set_,set)) - -/* - * Iterate over all the elements of a set. - * - * Sets always contain addresses of things, and "pvar" points to words - * containing those addresses. Thus is must be declared as "type **pvar", - * and the address of each set item is obtained inside the loop by "*pvar". - */ -#define SET_FOREACH(pvar, set) \ - for (pvar = SET_BEGIN(set); pvar < SET_LIMIT(set); pvar++) - -#define SET_ITEM(set, i) \ - ((SET_BEGIN(set))[i]) - -/* - * Provide a count of the items in a set. - */ -#define SET_COUNT(set) \ - (SET_LIMIT(set) - SET_BEGIN(set)) - -#endif /* _SYS_LINKER_SET_H_ */ diff --git a/usr/contrib/freebsd/sys/pciio.h b/usr/contrib/freebsd/sys/pciio.h new file mode 100644 index 0000000000..d70bfbcf6f --- /dev/null +++ b/usr/contrib/freebsd/sys/pciio.h @@ -0,0 +1,146 @@ +/*- + * Copyright (c) 1997, Stefan Esser + * Copyright (c) 1997, 1998, 1999, Kenneth D. Merry + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ *
+ */
+
+#ifndef _SYS_PCIIO_H_
+#define	_SYS_PCIIO_H_
+
+#include <sys/ioccom.h>
+
+#define	PCI_MAXNAMELEN	16
+
+typedef enum {
+	PCI_GETCONF_LAST_DEVICE,
+	PCI_GETCONF_LIST_CHANGED,
+	PCI_GETCONF_MORE_DEVS,
+	PCI_GETCONF_ERROR
+} pci_getconf_status;
+
+typedef enum {
+	PCI_GETCONF_NO_MATCH		= 0x0000,
+	PCI_GETCONF_MATCH_DOMAIN	= 0x0001,
+	PCI_GETCONF_MATCH_BUS		= 0x0002,
+	PCI_GETCONF_MATCH_DEV		= 0x0004,
+	PCI_GETCONF_MATCH_FUNC		= 0x0008,
+	PCI_GETCONF_MATCH_NAME		= 0x0010,
+	PCI_GETCONF_MATCH_UNIT		= 0x0020,
+	PCI_GETCONF_MATCH_VENDOR	= 0x0040,
+	PCI_GETCONF_MATCH_DEVICE	= 0x0080,
+	PCI_GETCONF_MATCH_CLASS		= 0x0100
+} pci_getconf_flags;
+
+struct pcisel {
+	u_int32_t	pc_domain;	/* domain number */
+	u_int8_t	pc_bus;		/* bus number */
+	u_int8_t	pc_dev;		/* device on this bus */
+	u_int8_t	pc_func;	/* function on this device */
+};
+
+struct pci_conf {
+	struct pcisel	pc_sel;		/* domain+bus+slot+function */
+	u_int8_t	pc_hdr;		/* PCI header type */
+	u_int16_t	pc_subvendor;	/* card vendor ID */
+	u_int16_t	pc_subdevice;	/* card device ID, assigned by
+					   card vendor */
+	u_int16_t	pc_vendor;	/* chip vendor ID */
+	u_int16_t	pc_device;	/* chip device ID, assigned by
+					   chip vendor */
+	u_int8_t	pc_class;	/* chip PCI class */
+	u_int8_t	pc_subclass;	/* chip PCI subclass */
+	u_int8_t	pc_progif;	/* chip PCI programming interface */
+	u_int8_t	pc_revid;	/* chip revision ID */
+	char		pd_name[PCI_MAXNAMELEN + 1];	/* device name */
+	u_long		pd_unit;	/* device unit number */
+};
+
+struct pci_match_conf {
+	struct pcisel		pc_sel;		/* domain+bus+slot+function */
+	char			pd_name[PCI_MAXNAMELEN + 1];	/* device name */
+	u_long			pd_unit;	/* Unit number */
+	u_int16_t		pc_vendor;	/* PCI Vendor ID */
+	u_int16_t		pc_device;	/* PCI Device ID */
+	u_int8_t		pc_class;	/* PCI class */
+	pci_getconf_flags	flags;		/* Matching expression */
+};
+
+struct pci_conf_io {
+	u_int32_t		pat_buf_len;	/* pattern buffer length */
+	u_int32_t		num_patterns;	/* number of patterns */
+	struct pci_match_conf	*patterns;	/* pattern buffer */
+	u_int32_t		match_buf_len;	/* match buffer length */
+	u_int32_t		num_matches;	/* number of matches returned */
+	struct pci_conf		*matches;	/* match buffer */
+	u_int32_t		offset;		/* offset into device list */
+	u_int32_t		generation;	/* device list generation */
+	pci_getconf_status	status;		/* request status */
+};
+
+struct pci_io {
+	struct pcisel	pi_sel;		/* device to operate on */
+	int		pi_reg;		/* configuration register to examine */
+	int		pi_width;	/* width (in bytes) of read or write */
+	u_int32_t	pi_data;	/* data to write or result of read */
+};
+
+struct pci_bar_io {
+	struct pcisel	pbi_sel;	/* device to operate on */
+	int		pbi_reg;	/* starting address of BAR */
+	int		pbi_enabled;	/* decoding enabled */
+	uint64_t	pbi_base;	/* current value of BAR */
+	uint64_t	pbi_length;	/* length of BAR */
+};
+
+struct pci_vpd_element {
+	char		pve_keyword[2];
+	uint8_t		pve_flags;
+	uint8_t		pve_datalen;
+	uint8_t		pve_data[0];
+};
+
+#define	PVE_FLAG_IDENT		0x01	/* Element is the string identifier */
+#define PVE_FLAG_RW 0x02 /* Element is read/write */ + +#define PVE_NEXT(pve) \ + ((struct pci_vpd_element *)((char *)(pve) + \ + sizeof(struct pci_vpd_element) + (pve)->pve_datalen)) + +struct pci_list_vpd_io { + struct pcisel plvi_sel; /* device to operate on */ + size_t plvi_len; /* size of the data area */ + struct pci_vpd_element *plvi_data; +}; + +#define PCIOCGETCONF _IOWR('p', 5, struct pci_conf_io) +#define PCIOCREAD _IOWR('p', 2, struct pci_io) +#define PCIOCWRITE _IOWR('p', 3, struct pci_io) +#define PCIOCATTACHED _IOWR('p', 4, struct pci_io) +#define PCIOCGETBAR _IOWR('p', 6, struct pci_bar_io) +#define PCIOCLISTVPD _IOWR('p', 7, struct pci_list_vpd_io) + +#endif /* !_SYS_PCIIO_H_ */ diff --git a/usr/contrib/freebsd/sys/queue.h b/usr/contrib/freebsd/sys/queue.h new file mode 100644 index 0000000000..f26c492af1 --- /dev/null +++ b/usr/contrib/freebsd/sys/queue.h @@ -0,0 +1,787 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)queue.h 8.5 (Berkeley) 8/20/94 + * $FreeBSD$ + */ + +#ifndef _SYS_QUEUE_H_ +#define _SYS_QUEUE_H_ + +#include + +/* + * This file defines four types of data structures: singly-linked lists, + * singly-linked tail queues, lists and tail queues. + * + * A singly-linked list is headed by a single forward pointer. The elements + * are singly linked for minimum space and pointer manipulation overhead at + * the expense of O(n) removal for arbitrary elements. New elements can be + * added to the list after an existing element or at the head of the list. + * Elements being removed from the head of the list should use the explicit + * macro for this purpose for optimum efficiency. A singly-linked list may + * only be traversed in the forward direction. Singly-linked lists are ideal + * for applications with large datasets and few or no removals or for + * implementing a LIFO queue. + * + * A singly-linked tail queue is headed by a pair of pointers, one to the + * head of the list and the other to the tail of the list. 
The elements are + * singly linked for minimum space and pointer manipulation overhead at the + * expense of O(n) removal for arbitrary elements. New elements can be added + * to the list after an existing element, at the head of the list, or at the + * end of the list. Elements being removed from the head of the tail queue + * should use the explicit macro for this purpose for optimum efficiency. + * A singly-linked tail queue may only be traversed in the forward direction. + * Singly-linked tail queues are ideal for applications with large datasets + * and few or no removals or for implementing a FIFO queue. + * + * A list is headed by a single forward pointer (or an array of forward + * pointers for a hash table header). The elements are doubly linked + * so that an arbitrary element can be removed without a need to + * traverse the list. New elements can be added to the list before + * or after an existing element or at the head of the list. A list + * may be traversed in either direction. + * + * A tail queue is headed by a pair of pointers, one to the head of the + * list and the other to the tail of the list. The elements are doubly + * linked so that an arbitrary element can be removed without a need to + * traverse the list. New elements can be added to the list before or + * after an existing element, at the head of the list, or at the end of + * the list. A tail queue may be traversed in either direction. + * + * For details on the use of these macros, see the queue(3) manual page. + * + * Below is a summary of implemented functions where: + * + means the macro is available + * - means the macro is not available + * s means the macro is available but is slow (runs in O(n) time) + * + * SLIST LIST STAILQ TAILQ + * _HEAD + + + + + * _CLASS_HEAD + + + + + * _HEAD_INITIALIZER + + + + + * _ENTRY + + + + + * _CLASS_ENTRY + + + + + * _INIT + + + + + * _EMPTY + + + + + * _FIRST + + + + + * _NEXT + + + + + * _PREV - + - + + * _LAST - - + + + * _FOREACH + + + + + * _FOREACH_FROM + + + + + * _FOREACH_SAFE + + + + + * _FOREACH_FROM_SAFE + + + + + * _FOREACH_REVERSE - - - + + * _FOREACH_REVERSE_FROM - - - + + * _FOREACH_REVERSE_SAFE - - - + + * _FOREACH_REVERSE_FROM_SAFE - - - + + * _INSERT_HEAD + + + + + * _INSERT_BEFORE - + - + + * _INSERT_AFTER + + + + + * _INSERT_TAIL - - + + + * _CONCAT s s + + + * _REMOVE_AFTER + - + - + * _REMOVE_HEAD + - + - + * _REMOVE s + s + + * _SWAP + + + + + * + */ +#ifdef QUEUE_MACRO_DEBUG +/* Store the last 2 places the queue element or head was altered */ +struct qm_trace { + unsigned long lastline; + unsigned long prevline; + const char *lastfile; + const char *prevfile; +}; + +#define TRACEBUF struct qm_trace trace; +#define TRACEBUF_INITIALIZER { __LINE__, 0, __FILE__, NULL } , +#define TRASHIT(x) do {(x) = (void *)-1;} while (0) +#define QMD_SAVELINK(name, link) void **name = (void *)&(link) + +#define QMD_TRACE_HEAD(head) do { \ + (head)->trace.prevline = (head)->trace.lastline; \ + (head)->trace.prevfile = (head)->trace.lastfile; \ + (head)->trace.lastline = __LINE__; \ + (head)->trace.lastfile = __FILE__; \ +} while (0) + +#define QMD_TRACE_ELEM(elem) do { \ + (elem)->trace.prevline = (elem)->trace.lastline; \ + (elem)->trace.prevfile = (elem)->trace.lastfile; \ + (elem)->trace.lastline = __LINE__; \ + (elem)->trace.lastfile = __FILE__; \ +} while (0) + +#else +#define QMD_TRACE_ELEM(elem) +#define QMD_TRACE_HEAD(head) +#define QMD_SAVELINK(name, link) +#define TRACEBUF +#define TRACEBUF_INITIALIZER +#define TRASHIT(x) +#endif /* QUEUE_MACRO_DEBUG */ 
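+
+/*
+ * Usage sketch (editorial illustration, not part of the upstream header):
+ * "struct foo", "foolist", "f_link" and "foo_example" are hypothetical
+ * names. The fragment shows declaration, insertion, traversal and safe
+ * removal with the tail queue macros defined later in this file:
+ *
+ *	struct foo {
+ *		int f_value;
+ *		TAILQ_ENTRY(foo) f_link;
+ *	};
+ *	static TAILQ_HEAD(foolist, foo) foos = TAILQ_HEAD_INITIALIZER(foos);
+ *
+ *	void
+ *	foo_example(void)
+ *	{
+ *		struct foo *fp, *tmp;
+ *
+ *		fp = malloc(sizeof (*fp));
+ *		fp->f_value = 42;
+ *		TAILQ_INSERT_TAIL(&foos, fp, f_link);
+ *
+ *		TAILQ_FOREACH(fp, &foos, f_link)
+ *			printf("%d\n", fp->f_value);
+ *
+ *		TAILQ_FOREACH_SAFE(fp, &foos, f_link, tmp) {
+ *			TAILQ_REMOVE(&foos, fp, f_link);
+ *			free(fp);
+ *		}
+ *	}
+ *
+ * TAILQ_FOREACH_SAFE caches the next pointer in "tmp" before the loop body
+ * runs, which is what makes removing (and freeing) the current element
+ * mid-traversal legal; plain TAILQ_FOREACH would follow freed memory.
+ */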
+ +#ifdef __cplusplus +/* + * In C++ there can be structure lists and class lists: + */ +#define QUEUE_TYPEOF(type) type +#else +#define QUEUE_TYPEOF(type) struct type +#endif + +/* + * Singly-linked List declarations. + */ +#define SLIST_HEAD(name, type) \ +struct name { \ + struct type *slh_first; /* first element */ \ +} + +#define SLIST_CLASS_HEAD(name, type) \ +struct name { \ + class type *slh_first; /* first element */ \ +} + +#define SLIST_HEAD_INITIALIZER(head) \ + { NULL } + +#define SLIST_ENTRY(type) \ +struct { \ + struct type *sle_next; /* next element */ \ +} + +#define SLIST_CLASS_ENTRY(type) \ +struct { \ + class type *sle_next; /* next element */ \ +} + +/* + * Singly-linked List functions. + */ +#define SLIST_CONCAT(head1, head2, type, field) do { \ + QUEUE_TYPEOF(type) *curelm = SLIST_FIRST(head1); \ + if (curelm == NULL) { \ + if ((SLIST_FIRST(head1) = SLIST_FIRST(head2)) != NULL) \ + SLIST_INIT(head2); \ + } else if (SLIST_FIRST(head2) != NULL) { \ + while (SLIST_NEXT(curelm, field) != NULL) \ + curelm = SLIST_NEXT(curelm, field); \ + SLIST_NEXT(curelm, field) = SLIST_FIRST(head2); \ + SLIST_INIT(head2); \ + } \ +} while (0) + +#define SLIST_EMPTY(head) ((head)->slh_first == NULL) + +#define SLIST_FIRST(head) ((head)->slh_first) + +#define SLIST_FOREACH(var, head, field) \ + for ((var) = SLIST_FIRST((head)); \ + (var); \ + (var) = SLIST_NEXT((var), field)) + +#define SLIST_FOREACH_FROM(var, head, field) \ + for ((var) = ((var) ? (var) : SLIST_FIRST((head))); \ + (var); \ + (var) = SLIST_NEXT((var), field)) + +#define SLIST_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = SLIST_FIRST((head)); \ + (var) && ((tvar) = SLIST_NEXT((var), field), 1); \ + (var) = (tvar)) + +#define SLIST_FOREACH_FROM_SAFE(var, head, field, tvar) \ + for ((var) = ((var) ? (var) : SLIST_FIRST((head))); \ + (var) && ((tvar) = SLIST_NEXT((var), field), 1); \ + (var) = (tvar)) + +#define SLIST_FOREACH_PREVPTR(var, varp, head, field) \ + for ((varp) = &SLIST_FIRST((head)); \ + ((var) = *(varp)) != NULL; \ + (varp) = &SLIST_NEXT((var), field)) + +#define SLIST_INIT(head) do { \ + SLIST_FIRST((head)) = NULL; \ +} while (0) + +#define SLIST_INSERT_AFTER(slistelm, elm, field) do { \ + SLIST_NEXT((elm), field) = SLIST_NEXT((slistelm), field); \ + SLIST_NEXT((slistelm), field) = (elm); \ +} while (0) + +#define SLIST_INSERT_HEAD(head, elm, field) do { \ + SLIST_NEXT((elm), field) = SLIST_FIRST((head)); \ + SLIST_FIRST((head)) = (elm); \ +} while (0) + +#define SLIST_NEXT(elm, field) ((elm)->field.sle_next) + +#define SLIST_REMOVE(head, elm, type, field) do { \ + QMD_SAVELINK(oldnext, (elm)->field.sle_next); \ + if (SLIST_FIRST((head)) == (elm)) { \ + SLIST_REMOVE_HEAD((head), field); \ + } \ + else { \ + QUEUE_TYPEOF(type) *curelm = SLIST_FIRST(head); \ + while (SLIST_NEXT(curelm, field) != (elm)) \ + curelm = SLIST_NEXT(curelm, field); \ + SLIST_REMOVE_AFTER(curelm, field); \ + } \ + TRASHIT(*oldnext); \ +} while (0) + +#define SLIST_REMOVE_AFTER(elm, field) do { \ + SLIST_NEXT(elm, field) = \ + SLIST_NEXT(SLIST_NEXT(elm, field), field); \ +} while (0) + +#define SLIST_REMOVE_HEAD(head, field) do { \ + SLIST_FIRST((head)) = SLIST_NEXT(SLIST_FIRST((head)), field); \ +} while (0) + +#define SLIST_SWAP(head1, head2, type) do { \ + QUEUE_TYPEOF(type) *swap_first = SLIST_FIRST(head1); \ + SLIST_FIRST(head1) = SLIST_FIRST(head2); \ + SLIST_FIRST(head2) = swap_first; \ +} while (0) + +/* + * Singly-linked Tail queue declarations. 
+ */ +#define STAILQ_HEAD(name, type) \ +struct name { \ + struct type *stqh_first;/* first element */ \ + struct type **stqh_last;/* addr of last next element */ \ +} + +#define STAILQ_CLASS_HEAD(name, type) \ +struct name { \ + class type *stqh_first; /* first element */ \ + class type **stqh_last; /* addr of last next element */ \ +} + +#define STAILQ_HEAD_INITIALIZER(head) \ + { NULL, &(head).stqh_first } + +#define STAILQ_ENTRY(type) \ +struct { \ + struct type *stqe_next; /* next element */ \ +} + +#define STAILQ_CLASS_ENTRY(type) \ +struct { \ + class type *stqe_next; /* next element */ \ +} + +/* + * Singly-linked Tail queue functions. + */ +#define STAILQ_CONCAT(head1, head2) do { \ + if (!STAILQ_EMPTY((head2))) { \ + *(head1)->stqh_last = (head2)->stqh_first; \ + (head1)->stqh_last = (head2)->stqh_last; \ + STAILQ_INIT((head2)); \ + } \ +} while (0) + +#define STAILQ_EMPTY(head) ((head)->stqh_first == NULL) + +#define STAILQ_FIRST(head) ((head)->stqh_first) + +#define STAILQ_FOREACH(var, head, field) \ + for((var) = STAILQ_FIRST((head)); \ + (var); \ + (var) = STAILQ_NEXT((var), field)) + +#define STAILQ_FOREACH_FROM(var, head, field) \ + for ((var) = ((var) ? (var) : STAILQ_FIRST((head))); \ + (var); \ + (var) = STAILQ_NEXT((var), field)) + +#define STAILQ_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = STAILQ_FIRST((head)); \ + (var) && ((tvar) = STAILQ_NEXT((var), field), 1); \ + (var) = (tvar)) + +#define STAILQ_FOREACH_FROM_SAFE(var, head, field, tvar) \ + for ((var) = ((var) ? (var) : STAILQ_FIRST((head))); \ + (var) && ((tvar) = STAILQ_NEXT((var), field), 1); \ + (var) = (tvar)) + +#define STAILQ_INIT(head) do { \ + STAILQ_FIRST((head)) = NULL; \ + (head)->stqh_last = &STAILQ_FIRST((head)); \ +} while (0) + +#define STAILQ_INSERT_AFTER(head, tqelm, elm, field) do { \ + if ((STAILQ_NEXT((elm), field) = STAILQ_NEXT((tqelm), field)) == NULL)\ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ + STAILQ_NEXT((tqelm), field) = (elm); \ +} while (0) + +#define STAILQ_INSERT_HEAD(head, elm, field) do { \ + if ((STAILQ_NEXT((elm), field) = STAILQ_FIRST((head))) == NULL) \ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ + STAILQ_FIRST((head)) = (elm); \ +} while (0) + +#define STAILQ_INSERT_TAIL(head, elm, field) do { \ + STAILQ_NEXT((elm), field) = NULL; \ + *(head)->stqh_last = (elm); \ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ +} while (0) + +#define STAILQ_LAST(head, type, field) \ + (STAILQ_EMPTY((head)) ? 
NULL : \ + __containerof((head)->stqh_last, \ + QUEUE_TYPEOF(type), field.stqe_next)) + +#define STAILQ_NEXT(elm, field) ((elm)->field.stqe_next) + +#define STAILQ_REMOVE(head, elm, type, field) do { \ + QMD_SAVELINK(oldnext, (elm)->field.stqe_next); \ + if (STAILQ_FIRST((head)) == (elm)) { \ + STAILQ_REMOVE_HEAD((head), field); \ + } \ + else { \ + QUEUE_TYPEOF(type) *curelm = STAILQ_FIRST(head); \ + while (STAILQ_NEXT(curelm, field) != (elm)) \ + curelm = STAILQ_NEXT(curelm, field); \ + STAILQ_REMOVE_AFTER(head, curelm, field); \ + } \ + TRASHIT(*oldnext); \ +} while (0) + +#define STAILQ_REMOVE_AFTER(head, elm, field) do { \ + if ((STAILQ_NEXT(elm, field) = \ + STAILQ_NEXT(STAILQ_NEXT(elm, field), field)) == NULL) \ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ +} while (0) + +#define STAILQ_REMOVE_HEAD(head, field) do { \ + if ((STAILQ_FIRST((head)) = \ + STAILQ_NEXT(STAILQ_FIRST((head)), field)) == NULL) \ + (head)->stqh_last = &STAILQ_FIRST((head)); \ +} while (0) + +#define STAILQ_SWAP(head1, head2, type) do { \ + QUEUE_TYPEOF(type) *swap_first = STAILQ_FIRST(head1); \ + QUEUE_TYPEOF(type) **swap_last = (head1)->stqh_last; \ + STAILQ_FIRST(head1) = STAILQ_FIRST(head2); \ + (head1)->stqh_last = (head2)->stqh_last; \ + STAILQ_FIRST(head2) = swap_first; \ + (head2)->stqh_last = swap_last; \ + if (STAILQ_EMPTY(head1)) \ + (head1)->stqh_last = &STAILQ_FIRST(head1); \ + if (STAILQ_EMPTY(head2)) \ + (head2)->stqh_last = &STAILQ_FIRST(head2); \ +} while (0) + + +/* + * List declarations. + */ +#define LIST_HEAD(name, type) \ +struct name { \ + struct type *lh_first; /* first element */ \ +} + +#define LIST_CLASS_HEAD(name, type) \ +struct name { \ + class type *lh_first; /* first element */ \ +} + +#define LIST_HEAD_INITIALIZER(head) \ + { NULL } + +#define LIST_ENTRY(type) \ +struct { \ + struct type *le_next; /* next element */ \ + struct type **le_prev; /* address of previous next element */ \ +} + +#define LIST_CLASS_ENTRY(type) \ +struct { \ + class type *le_next; /* next element */ \ + class type **le_prev; /* address of previous next element */ \ +} + +/* + * List functions. 
+ */ + +#if (defined(_KERNEL) && defined(INVARIANTS)) +#define QMD_LIST_CHECK_HEAD(head, field) do { \ + if (LIST_FIRST((head)) != NULL && \ + LIST_FIRST((head))->field.le_prev != \ + &LIST_FIRST((head))) \ + panic("Bad list head %p first->prev != head", (head)); \ +} while (0) + +#define QMD_LIST_CHECK_NEXT(elm, field) do { \ + if (LIST_NEXT((elm), field) != NULL && \ + LIST_NEXT((elm), field)->field.le_prev != \ + &((elm)->field.le_next)) \ + panic("Bad link elm %p next->prev != elm", (elm)); \ +} while (0) + +#define QMD_LIST_CHECK_PREV(elm, field) do { \ + if (*(elm)->field.le_prev != (elm)) \ + panic("Bad link elm %p prev->next != elm", (elm)); \ +} while (0) +#else +#define QMD_LIST_CHECK_HEAD(head, field) +#define QMD_LIST_CHECK_NEXT(elm, field) +#define QMD_LIST_CHECK_PREV(elm, field) +#endif /* (_KERNEL && INVARIANTS) */ + +#define LIST_CONCAT(head1, head2, type, field) do { \ + QUEUE_TYPEOF(type) *curelm = LIST_FIRST(head1); \ + if (curelm == NULL) { \ + if ((LIST_FIRST(head1) = LIST_FIRST(head2)) != NULL) { \ + LIST_FIRST(head2)->field.le_prev = \ + &LIST_FIRST((head1)); \ + LIST_INIT(head2); \ + } \ + } else if (LIST_FIRST(head2) != NULL) { \ + while (LIST_NEXT(curelm, field) != NULL) \ + curelm = LIST_NEXT(curelm, field); \ + LIST_NEXT(curelm, field) = LIST_FIRST(head2); \ + LIST_FIRST(head2)->field.le_prev = &LIST_NEXT(curelm, field); \ + LIST_INIT(head2); \ + } \ +} while (0) + +#define LIST_EMPTY(head) ((head)->lh_first == NULL) + +#define LIST_FIRST(head) ((head)->lh_first) + +#define LIST_FOREACH(var, head, field) \ + for ((var) = LIST_FIRST((head)); \ + (var); \ + (var) = LIST_NEXT((var), field)) + +#define LIST_FOREACH_FROM(var, head, field) \ + for ((var) = ((var) ? (var) : LIST_FIRST((head))); \ + (var); \ + (var) = LIST_NEXT((var), field)) + +#define LIST_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = LIST_FIRST((head)); \ + (var) && ((tvar) = LIST_NEXT((var), field), 1); \ + (var) = (tvar)) + +#define LIST_FOREACH_FROM_SAFE(var, head, field, tvar) \ + for ((var) = ((var) ? (var) : LIST_FIRST((head))); \ + (var) && ((tvar) = LIST_NEXT((var), field), 1); \ + (var) = (tvar)) + +#define LIST_INIT(head) do { \ + LIST_FIRST((head)) = NULL; \ +} while (0) + +#define LIST_INSERT_AFTER(listelm, elm, field) do { \ + QMD_LIST_CHECK_NEXT(listelm, field); \ + if ((LIST_NEXT((elm), field) = LIST_NEXT((listelm), field)) != NULL)\ + LIST_NEXT((listelm), field)->field.le_prev = \ + &LIST_NEXT((elm), field); \ + LIST_NEXT((listelm), field) = (elm); \ + (elm)->field.le_prev = &LIST_NEXT((listelm), field); \ +} while (0) + +#define LIST_INSERT_BEFORE(listelm, elm, field) do { \ + QMD_LIST_CHECK_PREV(listelm, field); \ + (elm)->field.le_prev = (listelm)->field.le_prev; \ + LIST_NEXT((elm), field) = (listelm); \ + *(listelm)->field.le_prev = (elm); \ + (listelm)->field.le_prev = &LIST_NEXT((elm), field); \ +} while (0) + +#define LIST_INSERT_HEAD(head, elm, field) do { \ + QMD_LIST_CHECK_HEAD((head), field); \ + if ((LIST_NEXT((elm), field) = LIST_FIRST((head))) != NULL) \ + LIST_FIRST((head))->field.le_prev = &LIST_NEXT((elm), field);\ + LIST_FIRST((head)) = (elm); \ + (elm)->field.le_prev = &LIST_FIRST((head)); \ +} while (0) + +#define LIST_NEXT(elm, field) ((elm)->field.le_next) + +#define LIST_PREV(elm, head, type, field) \ + ((elm)->field.le_prev == &LIST_FIRST((head)) ? 
NULL : \ + __containerof((elm)->field.le_prev, \ + QUEUE_TYPEOF(type), field.le_next)) + +#define LIST_REMOVE(elm, field) do { \ + QMD_SAVELINK(oldnext, (elm)->field.le_next); \ + QMD_SAVELINK(oldprev, (elm)->field.le_prev); \ + QMD_LIST_CHECK_NEXT(elm, field); \ + QMD_LIST_CHECK_PREV(elm, field); \ + if (LIST_NEXT((elm), field) != NULL) \ + LIST_NEXT((elm), field)->field.le_prev = \ + (elm)->field.le_prev; \ + *(elm)->field.le_prev = LIST_NEXT((elm), field); \ + TRASHIT(*oldnext); \ + TRASHIT(*oldprev); \ +} while (0) + +#define LIST_SWAP(head1, head2, type, field) do { \ + QUEUE_TYPEOF(type) *swap_tmp = LIST_FIRST(head1); \ + LIST_FIRST((head1)) = LIST_FIRST((head2)); \ + LIST_FIRST((head2)) = swap_tmp; \ + if ((swap_tmp = LIST_FIRST((head1))) != NULL) \ + swap_tmp->field.le_prev = &LIST_FIRST((head1)); \ + if ((swap_tmp = LIST_FIRST((head2))) != NULL) \ + swap_tmp->field.le_prev = &LIST_FIRST((head2)); \ +} while (0) + +/* + * Tail queue declarations. + */ +#define TAILQ_HEAD(name, type) \ +struct name { \ + struct type *tqh_first; /* first element */ \ + struct type **tqh_last; /* addr of last next element */ \ + TRACEBUF \ +} + +#define TAILQ_CLASS_HEAD(name, type) \ +struct name { \ + class type *tqh_first; /* first element */ \ + class type **tqh_last; /* addr of last next element */ \ + TRACEBUF \ +} + +#define TAILQ_HEAD_INITIALIZER(head) \ + { NULL, &(head).tqh_first, TRACEBUF_INITIALIZER } + +#define TAILQ_ENTRY(type) \ +struct { \ + struct type *tqe_next; /* next element */ \ + struct type **tqe_prev; /* address of previous next element */ \ + TRACEBUF \ +} + +#define TAILQ_CLASS_ENTRY(type) \ +struct { \ + class type *tqe_next; /* next element */ \ + class type **tqe_prev; /* address of previous next element */ \ + TRACEBUF \ +} + +/* + * Tail queue functions. + */ +#if (defined(_KERNEL) && defined(INVARIANTS)) +#define QMD_TAILQ_CHECK_HEAD(head, field) do { \ + if (!TAILQ_EMPTY(head) && \ + TAILQ_FIRST((head))->field.tqe_prev != \ + &TAILQ_FIRST((head))) \ + panic("Bad tailq head %p first->prev != head", (head)); \ +} while (0) + +#define QMD_TAILQ_CHECK_TAIL(head, field) do { \ + if (*(head)->tqh_last != NULL) \ + panic("Bad tailq NEXT(%p->tqh_last) != NULL", (head)); \ +} while (0) + +#define QMD_TAILQ_CHECK_NEXT(elm, field) do { \ + if (TAILQ_NEXT((elm), field) != NULL && \ + TAILQ_NEXT((elm), field)->field.tqe_prev != \ + &((elm)->field.tqe_next)) \ + panic("Bad link elm %p next->prev != elm", (elm)); \ +} while (0) + +#define QMD_TAILQ_CHECK_PREV(elm, field) do { \ + if (*(elm)->field.tqe_prev != (elm)) \ + panic("Bad link elm %p prev->next != elm", (elm)); \ +} while (0) +#else +#define QMD_TAILQ_CHECK_HEAD(head, field) +#define QMD_TAILQ_CHECK_TAIL(head, headname) +#define QMD_TAILQ_CHECK_NEXT(elm, field) +#define QMD_TAILQ_CHECK_PREV(elm, field) +#endif /* (_KERNEL && INVARIANTS) */ + +#define TAILQ_CONCAT(head1, head2, field) do { \ + if (!TAILQ_EMPTY(head2)) { \ + *(head1)->tqh_last = (head2)->tqh_first; \ + (head2)->tqh_first->field.tqe_prev = (head1)->tqh_last; \ + (head1)->tqh_last = (head2)->tqh_last; \ + TAILQ_INIT((head2)); \ + QMD_TRACE_HEAD(head1); \ + QMD_TRACE_HEAD(head2); \ + } \ +} while (0) + +#define TAILQ_EMPTY(head) ((head)->tqh_first == NULL) + +#define TAILQ_FIRST(head) ((head)->tqh_first) + +#define TAILQ_FOREACH(var, head, field) \ + for ((var) = TAILQ_FIRST((head)); \ + (var); \ + (var) = TAILQ_NEXT((var), field)) + +#define TAILQ_FOREACH_FROM(var, head, field) \ + for ((var) = ((var) ? 
(var) : TAILQ_FIRST((head))); \ + (var); \ + (var) = TAILQ_NEXT((var), field)) + +#define TAILQ_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = TAILQ_FIRST((head)); \ + (var) && ((tvar) = TAILQ_NEXT((var), field), 1); \ + (var) = (tvar)) + +#define TAILQ_FOREACH_FROM_SAFE(var, head, field, tvar) \ + for ((var) = ((var) ? (var) : TAILQ_FIRST((head))); \ + (var) && ((tvar) = TAILQ_NEXT((var), field), 1); \ + (var) = (tvar)) + +#define TAILQ_FOREACH_REVERSE(var, head, headname, field) \ + for ((var) = TAILQ_LAST((head), headname); \ + (var); \ + (var) = TAILQ_PREV((var), headname, field)) + +#define TAILQ_FOREACH_REVERSE_FROM(var, head, headname, field) \ + for ((var) = ((var) ? (var) : TAILQ_LAST((head), headname)); \ + (var); \ + (var) = TAILQ_PREV((var), headname, field)) + +#define TAILQ_FOREACH_REVERSE_SAFE(var, head, headname, field, tvar) \ + for ((var) = TAILQ_LAST((head), headname); \ + (var) && ((tvar) = TAILQ_PREV((var), headname, field), 1); \ + (var) = (tvar)) + +#define TAILQ_FOREACH_REVERSE_FROM_SAFE(var, head, headname, field, tvar) \ + for ((var) = ((var) ? (var) : TAILQ_LAST((head), headname)); \ + (var) && ((tvar) = TAILQ_PREV((var), headname, field), 1); \ + (var) = (tvar)) + +#define TAILQ_INIT(head) do { \ + TAILQ_FIRST((head)) = NULL; \ + (head)->tqh_last = &TAILQ_FIRST((head)); \ + QMD_TRACE_HEAD(head); \ +} while (0) + +#define TAILQ_INSERT_AFTER(head, listelm, elm, field) do { \ + QMD_TAILQ_CHECK_NEXT(listelm, field); \ + if ((TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field)) != NULL)\ + TAILQ_NEXT((elm), field)->field.tqe_prev = \ + &TAILQ_NEXT((elm), field); \ + else { \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ + QMD_TRACE_HEAD(head); \ + } \ + TAILQ_NEXT((listelm), field) = (elm); \ + (elm)->field.tqe_prev = &TAILQ_NEXT((listelm), field); \ + QMD_TRACE_ELEM(&(elm)->field); \ + QMD_TRACE_ELEM(&(listelm)->field); \ +} while (0) + +#define TAILQ_INSERT_BEFORE(listelm, elm, field) do { \ + QMD_TAILQ_CHECK_PREV(listelm, field); \ + (elm)->field.tqe_prev = (listelm)->field.tqe_prev; \ + TAILQ_NEXT((elm), field) = (listelm); \ + *(listelm)->field.tqe_prev = (elm); \ + (listelm)->field.tqe_prev = &TAILQ_NEXT((elm), field); \ + QMD_TRACE_ELEM(&(elm)->field); \ + QMD_TRACE_ELEM(&(listelm)->field); \ +} while (0) + +#define TAILQ_INSERT_HEAD(head, elm, field) do { \ + QMD_TAILQ_CHECK_HEAD(head, field); \ + if ((TAILQ_NEXT((elm), field) = TAILQ_FIRST((head))) != NULL) \ + TAILQ_FIRST((head))->field.tqe_prev = \ + &TAILQ_NEXT((elm), field); \ + else \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ + TAILQ_FIRST((head)) = (elm); \ + (elm)->field.tqe_prev = &TAILQ_FIRST((head)); \ + QMD_TRACE_HEAD(head); \ + QMD_TRACE_ELEM(&(elm)->field); \ +} while (0) + +#define TAILQ_INSERT_TAIL(head, elm, field) do { \ + QMD_TAILQ_CHECK_TAIL(head, field); \ + TAILQ_NEXT((elm), field) = NULL; \ + (elm)->field.tqe_prev = (head)->tqh_last; \ + *(head)->tqh_last = (elm); \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ + QMD_TRACE_HEAD(head); \ + QMD_TRACE_ELEM(&(elm)->field); \ +} while (0) + +#define TAILQ_LAST(head, headname) \ + (*(((struct headname *)((head)->tqh_last))->tqh_last)) + +#define TAILQ_NEXT(elm, field) ((elm)->field.tqe_next) + +#define TAILQ_PREV(elm, headname, field) \ + (*(((struct headname *)((elm)->field.tqe_prev))->tqh_last)) + +#define TAILQ_REMOVE(head, elm, field) do { \ + QMD_SAVELINK(oldnext, (elm)->field.tqe_next); \ + QMD_SAVELINK(oldprev, (elm)->field.tqe_prev); \ + QMD_TAILQ_CHECK_NEXT(elm, field); \ + QMD_TAILQ_CHECK_PREV(elm, field); 
\ + if ((TAILQ_NEXT((elm), field)) != NULL) \ + TAILQ_NEXT((elm), field)->field.tqe_prev = \ + (elm)->field.tqe_prev; \ + else { \ + (head)->tqh_last = (elm)->field.tqe_prev; \ + QMD_TRACE_HEAD(head); \ + } \ + *(elm)->field.tqe_prev = TAILQ_NEXT((elm), field); \ + TRASHIT(*oldnext); \ + TRASHIT(*oldprev); \ + QMD_TRACE_ELEM(&(elm)->field); \ +} while (0) + +#define TAILQ_SWAP(head1, head2, type, field) do { \ + QUEUE_TYPEOF(type) *swap_first = (head1)->tqh_first; \ + QUEUE_TYPEOF(type) **swap_last = (head1)->tqh_last; \ + (head1)->tqh_first = (head2)->tqh_first; \ + (head1)->tqh_last = (head2)->tqh_last; \ + (head2)->tqh_first = swap_first; \ + (head2)->tqh_last = swap_last; \ + if ((swap_first = (head1)->tqh_first) != NULL) \ + swap_first->field.tqe_prev = &(head1)->tqh_first; \ + else \ + (head1)->tqh_last = &(head1)->tqh_first; \ + if ((swap_first = (head2)->tqh_first) != NULL) \ + swap_first->field.tqe_prev = &(head2)->tqh_first; \ + else \ + (head2)->tqh_last = &(head2)->tqh_first; \ +} while (0) + +#endif /* !_SYS_QUEUE_H_ */ diff --git a/usr/contrib/freebsd/x86/segments.h b/usr/contrib/freebsd/x86/segments.h new file mode 100644 index 0000000000..1b8c4a3c1c --- /dev/null +++ b/usr/contrib/freebsd/x86/segments.h @@ -0,0 +1,274 @@ +/*- + * Copyright (c) 1989, 1990 William F. Jolitz + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * from: @(#)segments.h 7.1 (Berkeley) 5/9/91 + * $FreeBSD$ + */ + +#ifndef _X86_SEGMENTS_H_ +#define _X86_SEGMENTS_H_ + +/* + * X86 Segmentation Data Structures and definitions + */ + +/* + * Selectors + */ +#define SEL_RPL_MASK 3 /* requester priv level */ +#define ISPL(s) ((s)&3) /* priority level of a selector */ +#define SEL_KPL 0 /* kernel priority level */ +#define SEL_UPL 3 /* user priority level */ +#define ISLDT(s) ((s)&SEL_LDT) /* is it local or global */ +#define SEL_LDT 4 /* local descriptor table */ +#define IDXSEL(s) (((s)>>3) & 0x1fff) /* index of selector */ +#define LSEL(s,r) (((s)<<3) | SEL_LDT | r) /* a local selector */ +#define GSEL(s,r) (((s)<<3) | r) /* a global selector */ + +/* + * User segment descriptors (%cs, %ds etc for i386 apps. 64 bit wide) + * For long-mode apps, %cs only has the conforming bit in sd_type, the sd_dpl, + * sd_p, sd_l and sd_def32 which must be zero). %ds only has sd_p. + */ +struct segment_descriptor { + unsigned sd_lolimit:16; /* segment extent (lsb) */ + unsigned sd_lobase:24; /* segment base address (lsb) */ + unsigned sd_type:5; /* segment type */ + unsigned sd_dpl:2; /* segment descriptor priority level */ + unsigned sd_p:1; /* segment descriptor present */ + unsigned sd_hilimit:4; /* segment extent (msb) */ + unsigned sd_xx:2; /* unused */ + unsigned sd_def32:1; /* default 32 vs 16 bit size */ + unsigned sd_gran:1; /* limit granularity (byte/page units)*/ + unsigned sd_hibase:8; /* segment base address (msb) */ +} __packed; + +struct user_segment_descriptor { + unsigned sd_lolimit:16; /* segment extent (lsb) */ + unsigned sd_lobase:24; /* segment base address (lsb) */ + unsigned sd_type:5; /* segment type */ + unsigned sd_dpl:2; /* segment descriptor priority level */ + unsigned sd_p:1; /* segment descriptor present */ + unsigned sd_hilimit:4; /* segment extent (msb) */ + unsigned sd_xx:1; /* unused */ + unsigned sd_long:1; /* long mode (cs only) */ + unsigned sd_def32:1; /* default 32 vs 16 bit size */ + unsigned sd_gran:1; /* limit granularity (byte/page units)*/ + unsigned sd_hibase:8; /* segment base address (msb) */ +} __packed; + +#define USD_GETBASE(sd) (((sd)->sd_lobase) | (sd)->sd_hibase << 24) +#define USD_SETBASE(sd, b) (sd)->sd_lobase = (b); \ + (sd)->sd_hibase = ((b) >> 24); +#define USD_GETLIMIT(sd) (((sd)->sd_lolimit) | (sd)->sd_hilimit << 16) +#define USD_SETLIMIT(sd, l) (sd)->sd_lolimit = (l); \ + (sd)->sd_hilimit = ((l) >> 16); + +#ifdef __i386__ +/* + * Gate descriptors (e.g. indirect descriptors) + */ +struct gate_descriptor { + unsigned gd_looffset:16; /* gate offset (lsb) */ + unsigned gd_selector:16; /* gate segment selector */ + unsigned gd_stkcpy:5; /* number of stack wds to cpy */ + unsigned gd_xx:3; /* unused */ + unsigned gd_type:5; /* segment type */ + unsigned gd_dpl:2; /* segment descriptor priority level */ + unsigned gd_p:1; /* segment descriptor present */ + unsigned gd_hioffset:16; /* gate offset (msb) */ +} __packed; + +/* + * Generic descriptor + */ +union descriptor { + struct segment_descriptor sd; + struct gate_descriptor gd; +}; +#else +/* + * Gate descriptors (e.g. indirect descriptors, trap, interrupt etc. 128 bit) + * Only interrupt and trap gates have gd_ist. 
+ */ +struct gate_descriptor { + uint64_t gd_looffset:16; /* gate offset (lsb) */ + uint64_t gd_selector:16; /* gate segment selector */ + uint64_t gd_ist:3; /* IST table index */ + uint64_t gd_xx:5; /* unused */ + uint64_t gd_type:5; /* segment type */ + uint64_t gd_dpl:2; /* segment descriptor priority level */ + uint64_t gd_p:1; /* segment descriptor present */ + uint64_t gd_hioffset:48; /* gate offset (msb) */ + uint64_t sd_xx1:32; +} __packed; + +/* + * Generic descriptor + */ +union descriptor { + struct user_segment_descriptor sd; + struct gate_descriptor gd; +}; +#endif + + /* system segments and gate types */ +#define SDT_SYSNULL 0 /* system null */ +#define SDT_SYS286TSS 1 /* system 286 TSS available */ +#define SDT_SYSLDT 2 /* system local descriptor table */ +#define SDT_SYS286BSY 3 /* system 286 TSS busy */ +#define SDT_SYS286CGT 4 /* system 286 call gate */ +#define SDT_SYSTASKGT 5 /* system task gate */ +#define SDT_SYS286IGT 6 /* system 286 interrupt gate */ +#define SDT_SYS286TGT 7 /* system 286 trap gate */ +#define SDT_SYSNULL2 8 /* system null again */ +#define SDT_SYS386TSS 9 /* system 386 TSS available */ +#define SDT_SYSTSS 9 /* system available 64 bit TSS */ +#define SDT_SYSNULL3 10 /* system null again */ +#define SDT_SYS386BSY 11 /* system 386 TSS busy */ +#define SDT_SYSBSY 11 /* system busy 64 bit TSS */ +#define SDT_SYS386CGT 12 /* system 386 call gate */ +#define SDT_SYSCGT 12 /* system 64 bit call gate */ +#define SDT_SYSNULL4 13 /* system null again */ +#define SDT_SYS386IGT 14 /* system 386 interrupt gate */ +#define SDT_SYSIGT 14 /* system 64 bit interrupt gate */ +#define SDT_SYS386TGT 15 /* system 386 trap gate */ +#define SDT_SYSTGT 15 /* system 64 bit trap gate */ + + /* memory segment types */ +#define SDT_MEMRO 16 /* memory read only */ +#define SDT_MEMROA 17 /* memory read only accessed */ +#define SDT_MEMRW 18 /* memory read write */ +#define SDT_MEMRWA 19 /* memory read write accessed */ +#define SDT_MEMROD 20 /* memory read only expand dwn limit */ +#define SDT_MEMRODA 21 /* memory read only expand dwn limit accessed */ +#define SDT_MEMRWD 22 /* memory read write expand dwn limit */ +#define SDT_MEMRWDA 23 /* memory read write expand dwn limit accessed*/ +#define SDT_MEME 24 /* memory execute only */ +#define SDT_MEMEA 25 /* memory execute only accessed */ +#define SDT_MEMER 26 /* memory execute read */ +#define SDT_MEMERA 27 /* memory execute read accessed */ +#define SDT_MEMEC 28 /* memory execute only conforming */ +#define SDT_MEMEAC 29 /* memory execute only accessed conforming */ +#define SDT_MEMERC 30 /* memory execute read conforming */ +#define SDT_MEMERAC 31 /* memory execute read accessed conforming */ + +/* + * Size of IDT table + */ +#define NIDT 256 /* 32 reserved, 0x80 syscall, most are h/w */ +#define NRSVIDT 32 /* reserved entries for cpu exceptions */ + +/* + * Entries in the Interrupt Descriptor Table (IDT) + */ +#define IDT_DE 0 /* #DE: Divide Error */ +#define IDT_DB 1 /* #DB: Debug */ +#define IDT_NMI 2 /* Nonmaskable External Interrupt */ +#define IDT_BP 3 /* #BP: Breakpoint */ +#define IDT_OF 4 /* #OF: Overflow */ +#define IDT_BR 5 /* #BR: Bound Range Exceeded */ +#define IDT_UD 6 /* #UD: Undefined/Invalid Opcode */ +#define IDT_NM 7 /* #NM: No Math Coprocessor */ +#define IDT_DF 8 /* #DF: Double Fault */ +#define IDT_FPUGP 9 /* Coprocessor Segment Overrun */ +#define IDT_TS 10 /* #TS: Invalid TSS */ +#define IDT_NP 11 /* #NP: Segment Not Present */ +#define IDT_SS 12 /* #SS: Stack Segment Fault */ +#define IDT_GP 13 /* 
#GP: General Protection Fault */ +#define IDT_PF 14 /* #PF: Page Fault */ +#define IDT_MF 16 /* #MF: FPU Floating-Point Error */ +#define IDT_AC 17 /* #AC: Alignment Check */ +#define IDT_MC 18 /* #MC: Machine Check */ +#define IDT_XF 19 /* #XF: SIMD Floating-Point Exception */ +#define IDT_IO_INTS NRSVIDT /* Base of IDT entries for I/O interrupts. */ +#define IDT_SYSCALL 0x80 /* System Call Interrupt Vector */ +#define IDT_DTRACE_RET 0x92 /* DTrace pid provider Interrupt Vector */ +#define IDT_EVTCHN 0x93 /* Xen HVM Event Channel Interrupt Vector */ + +#if defined(__i386__) +/* + * Entries in the Global Descriptor Table (GDT) + * Note that each 4 entries share a single 32 byte L1 cache line. + * Some of the fast syscall instructions require a specific order here. + */ +#define GNULL_SEL 0 /* Null Descriptor */ +#define GPRIV_SEL 1 /* SMP Per-Processor Private Data */ +#define GUFS_SEL 2 /* User %fs Descriptor (order critical: 1) */ +#define GUGS_SEL 3 /* User %gs Descriptor (order critical: 2) */ +#define GCODE_SEL 4 /* Kernel Code Descriptor (order critical: 1) */ +#define GDATA_SEL 5 /* Kernel Data Descriptor (order critical: 2) */ +#define GUCODE_SEL 6 /* User Code Descriptor (order critical: 3) */ +#define GUDATA_SEL 7 /* User Data Descriptor (order critical: 4) */ +#define GBIOSLOWMEM_SEL 8 /* BIOS low memory access (must be entry 8) */ +#define GPROC0_SEL 9 /* Task state process slot zero and up */ +#define GLDT_SEL 10 /* Default User LDT */ +#define GUSERLDT_SEL 11 /* User LDT */ +#define GPANIC_SEL 12 /* Task state to consider panic from */ +#define GBIOSCODE32_SEL 13 /* BIOS interface (32bit Code) */ +#define GBIOSCODE16_SEL 14 /* BIOS interface (16bit Code) */ +#define GBIOSDATA_SEL 15 /* BIOS interface (Data) */ +#define GBIOSUTIL_SEL 16 /* BIOS interface (Utility) */ +#define GBIOSARGS_SEL 17 /* BIOS interface (Arguments) */ +#define GNDIS_SEL 18 /* For the NDIS layer */ +#define NGDT 19 + +/* + * Entries in the Local Descriptor Table (LDT) + */ +#define LSYS5CALLS_SEL 0 /* forced by intel BCS */ +#define LSYS5SIGR_SEL 1 +#define LUCODE_SEL 3 +#define LUDATA_SEL 5 +#define NLDT (LUDATA_SEL + 1) + +#else /* !__i386__ */ +/* + * Entries in the Global Descriptor Table (GDT) + */ +#define GNULL_SEL 0 /* Null Descriptor */ +#define GNULL2_SEL 1 /* Null Descriptor */ +#define GUFS32_SEL 2 /* User 32 bit %fs Descriptor */ +#define GUGS32_SEL 3 /* User 32 bit %gs Descriptor */ +#define GCODE_SEL 4 /* Kernel Code Descriptor */ +#define GDATA_SEL 5 /* Kernel Data Descriptor */ +#define GUCODE32_SEL 6 /* User 32 bit code Descriptor */ +#define GUDATA_SEL 7 /* User 32/64 bit Data Descriptor */ +#define GUCODE_SEL 8 /* User 64 bit Code Descriptor */ +#define GPROC0_SEL 9 /* TSS for entering kernel etc */ +/* slot 10 is second half of GPROC0_SEL */ +#define GUSERLDT_SEL 11 /* LDT */ +/* slot 12 is second half of GUSERLDT_SEL */ +#define NGDT 13 +#endif /* __i386__ */ + +#endif /* !_X86_SEGMENTS_H_ */ diff --git a/usr/contrib/freebsd/x86/specialreg.h b/usr/contrib/freebsd/x86/specialreg.h index bea3122423..f528bad55c 100644 --- a/usr/contrib/freebsd/x86/specialreg.h +++ b/usr/contrib/freebsd/x86/specialreg.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-3-Clause + * * Copyright (c) 1991 The Regents of the University of California. * All rights reserved. * @@ -10,7 +12,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
- * 4. Neither the name of the University nor the names of its contributors + * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * @@ -27,7 +29,7 @@ * SUCH DAMAGE. * * from: @(#)specialreg.h 7.1 (Berkeley) 5/9/91 - * $FreeBSD: head/sys/x86/include/specialreg.h 273338 2014-10-20 18:09:33Z neel $ + * $FreeBSD$ */ #ifndef _MACHINE_SPECIALREG_H_ @@ -53,6 +55,7 @@ #define CR0_CD 0x40000000 /* Cache Disable */ #define CR3_PCID_SAVE 0x8000000000000000 +#define CR3_PCID_MASK 0xfff /* * Bits in PPro special registers @@ -73,6 +76,8 @@ #define CR4_PCIDE 0x00020000 /* Enable Context ID */ #define CR4_XSAVE 0x00040000 /* XSETBV/XGETBV */ #define CR4_SMEP 0x00100000 /* Supervisor-Mode Execution Prevention */ +#define CR4_SMAP 0x00200000 /* Supervisor-Mode Access Prevention */ +#define CR4_PKE 0x00400000 /* Protection Keys Enable */ /* * Bits in AMD64 special registers. EFER is 64 bits wide. @@ -82,6 +87,9 @@ #define EFER_LMA 0x000000400 /* Long mode active (R) */ #define EFER_NXE 0x000000800 /* PTE No-Execute bit enable (R/W) */ #define EFER_SVM 0x000001000 /* SVM enable bit for AMD, reserved for Intel */ +#define EFER_LMSLE 0x000002000 /* Long Mode Segment Limit Enable */ +#define EFER_FFXSR 0x000004000 /* Fast FXSAVE/FSRSTOR */ +#define EFER_TCE 0x000008000 /* Translation Cache Extension */ /* * Intel Extended Features registers @@ -154,6 +162,7 @@ #define CPUID2_TM2 0x00000100 #define CPUID2_SSSE3 0x00000200 #define CPUID2_CNXTID 0x00000400 +#define CPUID2_SDBG 0x00000800 #define CPUID2_FMA 0x00001000 #define CPUID2_CX16 0x00002000 #define CPUID2_XTPR 0x00004000 @@ -181,8 +190,43 @@ #define CPUTPM1_SENSOR 0x00000001 #define CPUTPM1_TURBO 0x00000002 #define CPUTPM1_ARAT 0x00000004 +#define CPUTPM1_HWP 0x00000080 +#define CPUTPM1_HWP_NOTIFICATION 0x00000100 +#define CPUTPM1_HWP_ACTIVITY_WINDOW 0x00000200 +#define CPUTPM1_HWP_PERF_PREF 0x00000400 +#define CPUTPM1_HWP_PKG 0x00000800 +#define CPUTPM1_HWP_FLEXIBLE 0x00020000 #define CPUTPM2_EFFREQ 0x00000001 +/* Intel Processor Trace CPUID. */ + +/* Leaf 0 ebx. */ +#define CPUPT_CR3 (1 << 0) /* CR3 Filtering Support */ +#define CPUPT_PSB (1 << 1) /* Configurable PSB and Cycle-Accurate Mode Supported */ +#define CPUPT_IPF (1 << 2) /* IP Filtering and TraceStop supported */ +#define CPUPT_MTC (1 << 3) /* MTC Supported */ +#define CPUPT_PRW (1 << 4) /* PTWRITE Supported */ +#define CPUPT_PWR (1 << 5) /* Power Event Trace Supported */ + +/* Leaf 0 ecx. */ +#define CPUPT_TOPA (1 << 0) /* ToPA Output Supported */ +#define CPUPT_TOPA_MULTI (1 << 1) /* ToPA Tables Allow Multiple Output Entries */ +#define CPUPT_SINGLE (1 << 2) /* Single-Range Output Supported */ +#define CPUPT_TT_OUT (1 << 3) /* Output to Trace Transport Subsystem Supported */ +#define CPUPT_LINEAR_IP (1 << 31) /* IP Payloads are Linear IP, otherwise IP is effective */ + +/* Leaf 1 eax. */ +#define CPUPT_NADDR_S 0 /* Number of Address Ranges */ +#define CPUPT_NADDR_M (0x7 << CPUPT_NADDR_S) +#define CPUPT_MTC_BITMAP_S 16 /* Bitmap of supported MTC Period Encodings */ +#define CPUPT_MTC_BITMAP_M (0xffff << CPUPT_MTC_BITMAP_S) + +/* Leaf 1 ebx. 
*/ +#define CPUPT_CT_BITMAP_S 0 /* Bitmap of supported Cycle Threshold values */ +#define CPUPT_CT_BITMAP_M (0xffff << CPUPT_CT_BITMAP_S) +#define CPUPT_PFE_BITMAP_S 16 /* Bitmap of supported Configurable PSB Frequency encoding */ +#define CPUPT_PFE_BITMAP_M (0xffff << CPUPT_PFE_BITMAP_S) + /* * Important bits in the AMD extended cpuid flags */ @@ -190,7 +234,7 @@ #define AMDID_MP 0x00080000 #define AMDID_NX 0x00100000 #define AMDID_EXT_MMX 0x00400000 -#define AMDID_FFXSR 0x01000000 +#define AMDID_FFXSR 0x02000000 #define AMDID_PAGE1GB 0x04000000 #define AMDID_RDTSCP 0x08000000 #define AMDID_LM 0x20000000 @@ -222,6 +266,7 @@ #define AMDID2_DBE 0x04000000 #define AMDID2_PTSC 0x08000000 #define AMDID2_PTSCEL2I 0x10000000 +#define AMDID2_MWAITX 0x20000000 /* * CPUID instruction 1 eax info @@ -301,6 +346,15 @@ #define CPUID_EXTSTATE_XINUSE 0x00000004 #define CPUID_EXTSTATE_XSAVES 0x00000008 +/* + * AMD extended function 8000_0007h ebx info + */ +#define AMDRAS_MCA_OF_RECOV 0x00000001 +#define AMDRAS_SUCCOR 0x00000002 +#define AMDRAS_HW_ASSERT 0x00000004 +#define AMDRAS_SCALABLE_MCA 0x00000008 +#define AMDRAS_PFEH_SUPPORT 0x00000010 + /* * AMD extended function 8000_0007h edx info */ @@ -315,6 +369,24 @@ #define AMDPM_TSC_INVARIANT 0x00000100 #define AMDPM_CPB 0x00000200 +/* + * AMD extended function 8000_0008h ebx info (amd_extended_feature_extensions) + */ +#define AMDFEID_CLZERO 0x00000001 +#define AMDFEID_IRPERF 0x00000002 +#define AMDFEID_XSAVEERPTR 0x00000004 +#define AMDFEID_IBPB 0x00001000 +#define AMDFEID_IBRS 0x00004000 +#define AMDFEID_STIBP 0x00008000 +/* The below are only defined if the corresponding base feature above exists. */ +#define AMDFEID_IBRS_ALWAYSON 0x00010000 +#define AMDFEID_STIBP_ALWAYSON 0x00020000 +#define AMDFEID_PREFER_IBRS 0x00040000 +#define AMDFEID_SSBD 0x01000000 +/* SSBD via MSRC001_011F instead of MSR 0x48: */ +#define AMDFEID_VIRT_SSBD 0x02000000 +#define AMDFEID_SSB_NO 0x04000000 + /* * AMD extended function 8000_0008h ecx info */ @@ -327,25 +399,83 @@ */ #define CPUID_STDEXT_FSGSBASE 0x00000001 #define CPUID_STDEXT_TSC_ADJUST 0x00000002 +#define CPUID_STDEXT_SGX 0x00000004 #define CPUID_STDEXT_BMI1 0x00000008 #define CPUID_STDEXT_HLE 0x00000010 #define CPUID_STDEXT_AVX2 0x00000020 +#define CPUID_STDEXT_FDP_EXC 0x00000040 #define CPUID_STDEXT_SMEP 0x00000080 #define CPUID_STDEXT_BMI2 0x00000100 #define CPUID_STDEXT_ERMS 0x00000200 #define CPUID_STDEXT_INVPCID 0x00000400 #define CPUID_STDEXT_RTM 0x00000800 +#define CPUID_STDEXT_PQM 0x00001000 +#define CPUID_STDEXT_NFPUSG 0x00002000 #define CPUID_STDEXT_MPX 0x00004000 +#define CPUID_STDEXT_PQE 0x00008000 #define CPUID_STDEXT_AVX512F 0x00010000 +#define CPUID_STDEXT_AVX512DQ 0x00020000 #define CPUID_STDEXT_RDSEED 0x00040000 #define CPUID_STDEXT_ADX 0x00080000 #define CPUID_STDEXT_SMAP 0x00100000 +#define CPUID_STDEXT_AVX512IFMA 0x00200000 +#define CPUID_STDEXT_PCOMMIT 0x00400000 #define CPUID_STDEXT_CLFLUSHOPT 0x00800000 +#define CPUID_STDEXT_CLWB 0x01000000 #define CPUID_STDEXT_PROCTRACE 0x02000000 #define CPUID_STDEXT_AVX512PF 0x04000000 #define CPUID_STDEXT_AVX512ER 0x08000000 #define CPUID_STDEXT_AVX512CD 0x10000000 #define CPUID_STDEXT_SHA 0x20000000 +#define CPUID_STDEXT_AVX512BW 0x40000000 +#define CPUID_STDEXT_AVX512VL 0x80000000 + +/* + * CPUID instruction 7 Structured Extended Features, leaf 0 ecx info + */ +#define CPUID_STDEXT2_PREFETCHWT1 0x00000001 +#define CPUID_STDEXT2_AVX512VBMI 0x00000002 +#define CPUID_STDEXT2_UMIP 0x00000004 +#define CPUID_STDEXT2_PKU 0x00000008 +#define 
CPUID_STDEXT2_OSPKE 0x00000010 +#define CPUID_STDEXT2_WAITPKG 0x00000020 +#define CPUID_STDEXT2_AVX512VBMI2 0x00000040 +#define CPUID_STDEXT2_GFNI 0x00000100 +#define CPUID_STDEXT2_VAES 0x00000200 +#define CPUID_STDEXT2_VPCLMULQDQ 0x00000400 +#define CPUID_STDEXT2_AVX512VNNI 0x00000800 +#define CPUID_STDEXT2_AVX512BITALG 0x00001000 +#define CPUID_STDEXT2_AVX512VPOPCNTDQ 0x00004000 +#define CPUID_STDEXT2_RDPID 0x00400000 +#define CPUID_STDEXT2_CLDEMOTE 0x02000000 +#define CPUID_STDEXT2_MOVDIRI 0x08000000 +#define CPUID_STDEXT2_MOVDIRI64B 0x10000000 +#define CPUID_STDEXT2_ENQCMD 0x20000000 +#define CPUID_STDEXT2_SGXLC 0x40000000 + +/* + * CPUID instruction 7 Structured Extended Features, leaf 0 edx info + */ +#define CPUID_STDEXT3_AVX5124VNNIW 0x00000004 +#define CPUID_STDEXT3_AVX5124FMAPS 0x00000008 +#define CPUID_STDEXT3_AVX512VP2INTERSECT 0x00000100 +#define CPUID_STDEXT3_MD_CLEAR 0x00000400 +#define CPUID_STDEXT3_TSXFA 0x00002000 +#define CPUID_STDEXT3_PCONFIG 0x00040000 +#define CPUID_STDEXT3_IBPB 0x04000000 +#define CPUID_STDEXT3_STIBP 0x08000000 +#define CPUID_STDEXT3_L1D_FLUSH 0x10000000 +#define CPUID_STDEXT3_ARCH_CAP 0x20000000 +#define CPUID_STDEXT3_CORE_CAP 0x40000000 +#define CPUID_STDEXT3_SSBD 0x80000000 + +/* MSR IA32_ARCH_CAP(ABILITIES) bits */ +#define IA32_ARCH_CAP_RDCL_NO 0x00000001 +#define IA32_ARCH_CAP_IBRS_ALL 0x00000002 +#define IA32_ARCH_CAP_RSBA 0x00000004 +#define IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY 0x00000008 +#define IA32_ARCH_CAP_SSB_NO 0x00000010 +#define IA32_ARCH_CAP_MDS_NO 0x00000020 /* * CPUID manufacturers identifiers @@ -375,6 +505,8 @@ #define MSR_EBL_CR_POWERON 0x02a #define MSR_TEST_CTL 0x033 #define MSR_IA32_FEATURE_CONTROL 0x03a +#define MSR_IA32_SPEC_CTRL 0x048 +#define MSR_IA32_PRED_CMD 0x049 #define MSR_BIOS_UPDT_TRIG 0x079 #define MSR_BBL_CR_D0 0x088 #define MSR_BBL_CR_D1 0x089 @@ -387,6 +519,9 @@ #define MSR_APERF 0x0e8 #define MSR_IA32_EXT_CONFIG 0x0ee /* Undocumented. Core Solo/Duo only */ #define MSR_MTRRcap 0x0fe +#define MSR_IA32_ARCH_CAP 0x10a +#define MSR_IA32_FLUSH_CMD 0x10b +#define MSR_TSX_FORCE_ABORT 0x10f #define MSR_BBL_CR_ADDR 0x116 #define MSR_BBL_CR_DECC 0x118 #define MSR_BBL_CR_CTL 0x119 @@ -446,6 +581,14 @@ #define MSR_DRAM_ENERGY_STATUS 0x619 #define MSR_PP0_ENERGY_STATUS 0x639 #define MSR_PP1_ENERGY_STATUS 0x641 +#define MSR_PPERF 0x64e +#define MSR_TSC_DEADLINE 0x6e0 /* Writes are not serializing */ +#define MSR_IA32_PM_ENABLE 0x770 +#define MSR_IA32_HWP_CAPABILITIES 0x771 +#define MSR_IA32_HWP_REQUEST_PKG 0x772 +#define MSR_IA32_HWP_INTERRUPT 0x773 +#define MSR_IA32_HWP_REQUEST 0x774 +#define MSR_IA32_HWP_STATUS 0x777 /* * VMX MSRs @@ -467,8 +610,10 @@ #define MSR_VMX_TRUE_ENTRY_CTLS 0x490 /* - * X2APIC MSRs + * X2APIC MSRs. + * Writes are not serializing. */ +#define MSR_APIC_000 0x800 #define MSR_APIC_ID 0x802 #define MSR_APIC_VERSION 0x803 #define MSR_APIC_TPR 0x808 @@ -501,6 +646,85 @@ #define MSR_IA32_XSS 0xda0 +/* + * Intel Processor Trace (PT) MSRs. 
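+ *
+ * As a rough illustration only (the SDM describes the full enabling
+ * sequence): software points MSR_IA32_RTIT_OUTPUT_BASE at a ToPA table,
+ * resets MSR_IA32_RTIT_OUTPUT_MASK_PTRS, and then sets bits such as
+ * RTIT_CTL_TRACEEN | RTIT_CTL_TOPA | RTIT_CTL_BRANCHEN in
+ * MSR_IA32_RTIT_CTL to start tracing.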
+ */ +#define MSR_IA32_RTIT_OUTPUT_BASE 0x560 /* Trace Output Base Register (R/W) */ +#define MSR_IA32_RTIT_OUTPUT_MASK_PTRS 0x561 /* Trace Output Mask Pointers Register (R/W) */ +#define MSR_IA32_RTIT_CTL 0x570 /* Trace Control Register (R/W) */ +#define RTIT_CTL_TRACEEN (1 << 0) +#define RTIT_CTL_CYCEN (1 << 1) +#define RTIT_CTL_OS (1 << 2) +#define RTIT_CTL_USER (1 << 3) +#define RTIT_CTL_PWREVTEN (1 << 4) +#define RTIT_CTL_FUPONPTW (1 << 5) +#define RTIT_CTL_FABRICEN (1 << 6) +#define RTIT_CTL_CR3FILTER (1 << 7) +#define RTIT_CTL_TOPA (1 << 8) +#define RTIT_CTL_MTCEN (1 << 9) +#define RTIT_CTL_TSCEN (1 << 10) +#define RTIT_CTL_DISRETC (1 << 11) +#define RTIT_CTL_PTWEN (1 << 12) +#define RTIT_CTL_BRANCHEN (1 << 13) +#define RTIT_CTL_MTC_FREQ_S 14 +#define RTIT_CTL_MTC_FREQ(n) ((n) << RTIT_CTL_MTC_FREQ_S) +#define RTIT_CTL_MTC_FREQ_M (0xf << RTIT_CTL_MTC_FREQ_S) +#define RTIT_CTL_CYC_THRESH_S 19 +#define RTIT_CTL_CYC_THRESH_M (0xf << RTIT_CTL_CYC_THRESH_S) +#define RTIT_CTL_PSB_FREQ_S 24 +#define RTIT_CTL_PSB_FREQ_M (0xf << RTIT_CTL_PSB_FREQ_S) +#define RTIT_CTL_ADDR_CFG_S(n) (32 + (n) * 4) +#define RTIT_CTL_ADDR0_CFG_S 32 +#define RTIT_CTL_ADDR0_CFG_M (0xfULL << RTIT_CTL_ADDR0_CFG_S) +#define RTIT_CTL_ADDR1_CFG_S 36 +#define RTIT_CTL_ADDR1_CFG_M (0xfULL << RTIT_CTL_ADDR1_CFG_S) +#define RTIT_CTL_ADDR2_CFG_S 40 +#define RTIT_CTL_ADDR2_CFG_M (0xfULL << RTIT_CTL_ADDR2_CFG_S) +#define RTIT_CTL_ADDR3_CFG_S 44 +#define RTIT_CTL_ADDR3_CFG_M (0xfULL << RTIT_CTL_ADDR3_CFG_S) +#define MSR_IA32_RTIT_STATUS 0x571 /* Tracing Status Register (R/W) */ +#define RTIT_STATUS_FILTEREN (1 << 0) +#define RTIT_STATUS_CONTEXTEN (1 << 1) +#define RTIT_STATUS_TRIGGEREN (1 << 2) +#define RTIT_STATUS_ERROR (1 << 4) +#define RTIT_STATUS_STOPPED (1 << 5) +#define RTIT_STATUS_PACKETBYTECNT_S 32 +#define RTIT_STATUS_PACKETBYTECNT_M (0x1ffffULL << RTIT_STATUS_PACKETBYTECNT_S) +#define MSR_IA32_RTIT_CR3_MATCH 0x572 /* Trace Filter CR3 Match Register (R/W) */ +#define MSR_IA32_RTIT_ADDR_A(n) (0x580 + (n) * 2) +#define MSR_IA32_RTIT_ADDR_B(n) (0x581 + (n) * 2) +#define MSR_IA32_RTIT_ADDR0_A 0x580 /* Region 0 Start Address (R/W) */ +#define MSR_IA32_RTIT_ADDR0_B 0x581 /* Region 0 End Address (R/W) */ +#define MSR_IA32_RTIT_ADDR1_A 0x582 /* Region 1 Start Address (R/W) */ +#define MSR_IA32_RTIT_ADDR1_B 0x583 /* Region 1 End Address (R/W) */ +#define MSR_IA32_RTIT_ADDR2_A 0x584 /* Region 2 Start Address (R/W) */ +#define MSR_IA32_RTIT_ADDR2_B 0x585 /* Region 2 End Address (R/W) */ +#define MSR_IA32_RTIT_ADDR3_A 0x586 /* Region 3 Start Address (R/W) */ +#define MSR_IA32_RTIT_ADDR3_B 0x587 /* Region 3 End Address (R/W) */ + +/* Intel Processor Trace Table of Physical Addresses (ToPA). 
*/ +#define TOPA_SIZE_S 6 +#define TOPA_SIZE_M (0xf << TOPA_SIZE_S) +#define TOPA_SIZE_4K (0 << TOPA_SIZE_S) +#define TOPA_SIZE_8K (1 << TOPA_SIZE_S) +#define TOPA_SIZE_16K (2 << TOPA_SIZE_S) +#define TOPA_SIZE_32K (3 << TOPA_SIZE_S) +#define TOPA_SIZE_64K (4 << TOPA_SIZE_S) +#define TOPA_SIZE_128K (5 << TOPA_SIZE_S) +#define TOPA_SIZE_256K (6 << TOPA_SIZE_S) +#define TOPA_SIZE_512K (7 << TOPA_SIZE_S) +#define TOPA_SIZE_1M (8 << TOPA_SIZE_S) +#define TOPA_SIZE_2M (9 << TOPA_SIZE_S) +#define TOPA_SIZE_4M (10 << TOPA_SIZE_S) +#define TOPA_SIZE_8M (11 << TOPA_SIZE_S) +#define TOPA_SIZE_16M (12 << TOPA_SIZE_S) +#define TOPA_SIZE_32M (13 << TOPA_SIZE_S) +#define TOPA_SIZE_64M (14 << TOPA_SIZE_S) +#define TOPA_SIZE_128M (15 << TOPA_SIZE_S) +#define TOPA_STOP (1 << 4) +#define TOPA_INT (1 << 2) +#define TOPA_END (1 << 0) + /* * Constants related to MSR's. */ @@ -515,6 +739,55 @@ #define IA32_FEATURE_CONTROL_SMX_EN 0x02 /* enable VMX inside SMX */ #define IA32_FEATURE_CONTROL_VMX_EN 0x04 /* enable VMX outside SMX */ +/* MSR IA32_MISC_ENABLE */ +#define IA32_MISC_EN_FASTSTR 0x0000000000000001ULL +#define IA32_MISC_EN_ATCCE 0x0000000000000008ULL +#define IA32_MISC_EN_PERFMON 0x0000000000000080ULL +#define IA32_MISC_EN_PEBSU 0x0000000000001000ULL +#define IA32_MISC_EN_ESSTE 0x0000000000010000ULL +#define IA32_MISC_EN_MONE 0x0000000000040000ULL +#define IA32_MISC_EN_LIMCPUID 0x0000000000400000ULL +#define IA32_MISC_EN_xTPRD 0x0000000000800000ULL +#define IA32_MISC_EN_XDD 0x0000000400000000ULL + +/* + * IA32_SPEC_CTRL and IA32_PRED_CMD MSRs are described in the Intel' + * document 336996-001 Speculative Execution Side Channel Mitigations. + * + * AMD uses the same MSRs and bit definitions, as described in 111006-B + * "Indirect Branch Control Extension" and 124441 "Speculative Store Bypass + * Disable." + */ +/* MSR IA32_SPEC_CTRL */ +#define IA32_SPEC_CTRL_IBRS 0x00000001 +#define IA32_SPEC_CTRL_STIBP 0x00000002 +#define IA32_SPEC_CTRL_SSBD 0x00000004 + +/* MSR IA32_PRED_CMD */ +#define IA32_PRED_CMD_IBPB_BARRIER 0x0000000000000001ULL + +/* MSR IA32_FLUSH_CMD */ +#define IA32_FLUSH_CMD_L1D 0x00000001 + +/* MSR IA32_HWP_CAPABILITIES */ +#define IA32_HWP_CAPABILITIES_HIGHEST_PERFORMANCE(x) (((x) >> 0) & 0xff) +#define IA32_HWP_CAPABILITIES_GUARANTEED_PERFORMANCE(x) (((x) >> 8) & 0xff) +#define IA32_HWP_CAPABILITIES_EFFICIENT_PERFORMANCE(x) (((x) >> 16) & 0xff) +#define IA32_HWP_CAPABILITIES_LOWEST_PERFORMANCE(x) (((x) >> 24) & 0xff) + +/* MSR IA32_HWP_REQUEST */ +#define IA32_HWP_REQUEST_MINIMUM_VALID (1ULL << 63) +#define IA32_HWP_REQUEST_MAXIMUM_VALID (1ULL << 62) +#define IA32_HWP_REQUEST_DESIRED_VALID (1ULL << 61) +#define IA32_HWP_REQUEST_EPP_VALID (1ULL << 60) +#define IA32_HWP_REQUEST_ACTIVITY_WINDOW_VALID (1ULL << 59) +#define IA32_HWP_REQUEST_PACKAGE_CONTROL (1ULL << 42) +#define IA32_HWP_ACTIVITY_WINDOW (0x3ffULL << 32) +#define IA32_HWP_REQUEST_ENERGY_PERFORMANCE_PREFERENCE (0xffULL << 24) +#define IA32_HWP_DESIRED_PERFORMANCE (0xffULL << 16) +#define IA32_HWP_REQUEST_MAXIMUM_PERFORMANCE (0xffULL << 8) +#define IA32_HWP_MINIMUM_PERFORMANCE (0xffULL << 0) + /* * PAT modes. 
*/ @@ -665,6 +938,33 @@ #define MC_MISC_ADDRESS_MODE 0x00000000000001c0 /* If MCG_CAP_SER_P */ #define MC_CTL2_THRESHOLD 0x0000000000007fff #define MC_CTL2_CMCI_EN 0x0000000040000000 +#define MC_AMDNB_BANK 4 +#define MC_MISC_AMD_VAL 0x8000000000000000 /* Counter presence valid */ +#define MC_MISC_AMD_CNTP 0x4000000000000000 /* Counter present */ +#define MC_MISC_AMD_LOCK 0x2000000000000000 /* Register locked */ +#define MC_MISC_AMD_INTP 0x1000000000000000 /* Int. type can generate interrupts */ +#define MC_MISC_AMD_LVT_MASK 0x00f0000000000000 /* Extended LVT offset */ +#define MC_MISC_AMD_LVT_SHIFT 52 +#define MC_MISC_AMD_CNTEN 0x0008000000000000 /* Counter enabled */ +#define MC_MISC_AMD_INT_MASK 0x0006000000000000 /* Interrupt type */ +#define MC_MISC_AMD_INT_LVT 0x0002000000000000 /* Interrupt via Extended LVT */ +#define MC_MISC_AMD_INT_SMI 0x0004000000000000 /* SMI */ +#define MC_MISC_AMD_OVERFLOW 0x0001000000000000 /* Counter overflow */ +#define MC_MISC_AMD_CNT_MASK 0x00000fff00000000 /* Counter value */ +#define MC_MISC_AMD_CNT_SHIFT 32 +#define MC_MISC_AMD_CNT_MAX 0xfff +#define MC_MISC_AMD_PTR_MASK 0x00000000ff000000 /* Pointer to additional registers */ +#define MC_MISC_AMD_PTR_SHIFT 24 + +/* AMD Scalable MCA */ +#define MSR_SMCA_MC0_CTL 0xc0002000 +#define MSR_SMCA_MC0_STATUS 0xc0002001 +#define MSR_SMCA_MC0_ADDR 0xc0002002 +#define MSR_SMCA_MC0_MISC0 0xc0002003 +#define MSR_SMCA_MC_CTL(x) (MSR_SMCA_MC0_CTL + 0x10 * (x)) +#define MSR_SMCA_MC_STATUS(x) (MSR_SMCA_MC0_STATUS + 0x10 * (x)) +#define MSR_SMCA_MC_ADDR(x) (MSR_SMCA_MC0_ADDR + 0x10 * (x)) +#define MSR_SMCA_MC_MISC(x) (MSR_SMCA_MC0_MISC0 + 0x10 * (x)) /* * The following four 3-byte registers control the non-cacheable regions. @@ -768,6 +1068,7 @@ #define MSR_FSBASE 0xc0000100 /* base address of the %fs "segment" */ #define MSR_GSBASE 0xc0000101 /* base address of the %gs "segment" */ #define MSR_KGSBASE 0xc0000102 /* base address of the kernel %gs */ +#define MSR_TSC_AUX 0xc0000103 #define MSR_PERFEVSEL0 0xc0010000 #define MSR_PERFEVSEL1 0xc0010001 #define MSR_PERFEVSEL2 0xc0010002 @@ -785,17 +1086,20 @@ #define MSR_TOP_MEM 0xc001001a /* boundary for ram below 4G */ #define MSR_TOP_MEM2 0xc001001d /* boundary for ram above 4G */ #define MSR_NB_CFG1 0xc001001f /* NB configuration 1 */ +#define MSR_K8_UCODE_UPDATE 0xc0010020 /* update microcode */ +#define MSR_MC0_CTL_MASK 0xc0010044 #define MSR_P_STATE_LIMIT 0xc0010061 /* P-state Current Limit Register */ #define MSR_P_STATE_CONTROL 0xc0010062 /* P-state Control Register */ #define MSR_P_STATE_STATUS 0xc0010063 /* P-state Status Register */ #define MSR_P_STATE_CONFIG(n) (0xc0010064 + (n)) /* P-state Config */ #define MSR_SMM_ADDR 0xc0010112 /* SMM TSEG base address */ #define MSR_SMM_MASK 0xc0010113 /* SMM TSEG address mask */ +#define MSR_VM_CR 0xc0010114 /* SVM: feature control */ +#define MSR_VM_HSAVE_PA 0xc0010117 /* SVM: host save area address */ +#define MSR_AMD_CPUID07 0xc0011002 /* CPUID 07 %ebx override */ +#define MSR_EXTFEATURES 0xc0011005 /* Extended CPUID Features override */ +#define MSR_LS_CFG 0xc0011020 #define MSR_IC_CFG 0xc0011021 /* Instruction Cache Configuration */ -#define MSR_K8_UCODE_UPDATE 0xc0010020 /* update microcode */ -#define MSR_MC0_CTL_MASK 0xc0010044 -#define MSR_VM_CR 0xc0010114 /* SVM: feature control */ -#define MSR_VM_HSAVE_PA 0xc0010117 /* SVM: host save area address */ /* MSR_VM_CR related */ #define VM_CR_SVMDIS 0x10 /* SVM: disabled by BIOS */ diff --git a/usr/src/Makefile.master b/usr/src/Makefile.master index 
a6d4d763d6..aa7bd524bd 100644 --- a/usr/src/Makefile.master +++ b/usr/src/Makefile.master @@ -57,6 +57,12 @@ $(NO_ADJUNCT_PROTO)HAVE_ADJUNCT_PROTO=$(POUND_SIGN) # NATIVE_ADJUNCT= /usr +# +# Compatibility code for FreeBSD etc. +# +COMPAT= $(SRC)/compat +CONTRIB= $(SRC)/../contrib + # # RELEASE_BUILD should be cleared for final release builds. # NOT_RELEASE_BUILD is exactly what the name implies. diff --git a/usr/src/cmd/Makefile b/usr/src/cmd/Makefile index a4a744fe95..f20274bd35 100644 --- a/usr/src/cmd/Makefile +++ b/usr/src/cmd/Makefile @@ -488,6 +488,8 @@ i386_SUBDIRS= \ acpihpd \ addbadsec \ ahciem \ + bhyve \ + bhyvectl \ biosdev \ cxgbetool \ diskscan \ diff --git a/usr/src/cmd/bhyve/Makefile b/usr/src/cmd/bhyve/Makefile index f47daead31..e96868e006 100644 --- a/usr/src/cmd/bhyve/Makefile +++ b/usr/src/cmd/bhyve/Makefile @@ -11,13 +11,16 @@ # # Copyright 2014 Pluribus Networks Inc. +# Copyright 2019 Joyent, Inc. # PROG = bhyve include ../Makefile.cmd +include ../Makefile.cmd.64 +include ../Makefile.ctf -$(BUILD64)SUBDIRS += $(MACH64) +SUBDIRS = test all := TARGET = all install := TARGET = install @@ -25,17 +28,127 @@ clean := TARGET = clean clobber := TARGET = clobber lint := TARGET = lint +SRCS = acpi.c \ + atkbdc.c \ + bhyvegc.c \ + bhyverun.c \ + block_if.c \ + bootrom.c \ + console.c \ + consport.c \ + dbgport.c \ + fwctl.c \ + gdb.c \ + inout.c \ + ioapic.c \ + mem.c \ + mevent.c \ + mptbl.c \ + pci_ahci.c \ + pci_e82545.c \ + pci_emul.c \ + pci_fbuf.c \ + pci_hostbridge.c \ + pci_irq.c \ + pci_lpc.c \ + pci_nvme.c \ + pci_passthru.c \ + pci_uart.c \ + pci_virtio_block.c \ + pci_virtio_console.c \ + pci_virtio_net.c \ + pci_virtio_rnd.c \ + pci_xhci.c \ + pm.c \ + post.c \ + ps2kbd.c \ + ps2mouse.c \ + rfb.c \ + rtc.c \ + smbiostbl.c \ + sockstream.c \ + task_switch.c \ + uart_emul.c \ + usb_emul.c \ + usb_mouse.c \ + vga.c \ + virtio.c \ + vmm_instruction_emul.c \ + xmsr.c \ + spinup_ap.c \ + iov.c \ + bhyve_sol_glue.c + +# The virtio-scsi driver appears to include a slew of materials from FreeBSD's +# native SCSI implementation. We will omit that complexity for now. 
+ #ctl_util.c \ + #ctl_scsi_all.c \ + #pci_virtio_scsi.c \ + + +OBJS = $(SRCS:.c=.o) + +CLOBBERFILES = $(ROOTUSRSBINPROG) + +MEVENT_TEST_PROG = mevent_test +MEVENT_TEST_SRCS = mevent.c mevent_test.c +MEVENT_TEST_OBJS = $(MEVENT_TEST_SRCS:.c=.o) + +CLEANFILES = $(PROG) $(MEVENT_TEST_PROG) $(MEVENT_TEST_OBJS) + +CFLAGS += $(CCVERBOSE) -_gcc=-Wimplicit-function-declaration -_gcc=-Wno-parentheses +CPPFLAGS = -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd \ + -I$(COMPAT)/freebsd/amd64 -I$(CONTRIB)/freebsd/amd64 \ + -I$(CONTRIB)/freebsd/dev/usb/controller \ + -I$(CONTRIB)/freebsd/dev/mii \ + -I$(SRC)/uts/common/io/e1000api \ + $(CPPFLAGS.master) \ + -I$(SRC)/uts/i86pc/io/vmm \ + -I$(SRC)/uts/common \ + -I$(SRC)/uts/i86pc \ + -DWITHOUT_CAPSICUM + +# Disable the crypto code until it is wired up +CPPFLAGS += -DNO_OPENSSL + +pci_nvme.o := CERRWARN += -_gcc=-Wno-pointer-sign + +SMOFF += all_func_returns,leaks,no_if_block + +# Force c99 for everything +CSTD= $(CSTD_GNU99) +C99MODE= -xc99=%all +C99LMODE= -Xc99=%all + +$(PROG) := LDLIBS += -lsocket -lnsl -ldlpi -lmd -luuid -lvmmapi -lz +$(MEVENT_TEST_PROG) := LDLIBS += -lsocket + .KEEP_STATE: -all clean clobber lint: $(SUBDIRS) +all: $(PROG) $(MEVENT_TEST_PROG) $(SUBDIRS) + +$(PROG): $(OBJS) + $(LINK.c) -o $@ $(OBJS) $(LDFLAGS) $(LDLIBS) + $(POST_PROCESS) + +$(MEVENT_TEST_PROG): $(MEVENT_TEST_OBJS) + $(LINK.c) -o $@ $(MEVENT_TEST_OBJS) $(LDFLAGS) $(LDLIBS) + +install: all $(ROOTUSRSBINPROG) $(SUBDIRS) + +clean: $(SUBDIRS) + $(RM) $(OBJS) $(CLEANFILES) + +clobber: clean $(SUBDIRS) + $(RM) $(CLOBBERFILES) -install: $(SUBDIRS) - -$(RM) $(ROOTUSRSBINPROG) - -$(LN) $(ISAEXEC) $(ROOTUSRSBINPROG) +lint: lint_SRCS $(SUBDIRS) -$(SUBDIRS): FRC - @cd $@; pwd; $(MAKE) CW_NO_SHADOW=true __GNUC= $(TARGET) +$(SUBDIRS): FRC + @cd $@; pwd; $(MAKE) $(TARGET) FRC: -include ../Makefile.targ +%.o: $(SRC)/uts/i86pc/io/vmm/%.c + $(COMPILE.c) $< + $(POST_PROCESS_O) diff --git a/usr/src/cmd/bhyve/Makefile.com b/usr/src/cmd/bhyve/Makefile.com deleted file mode 100644 index 4a92b622ab..0000000000 --- a/usr/src/cmd/bhyve/Makefile.com +++ /dev/null @@ -1,94 +0,0 @@ -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -# -# Copyright 2015 Pluribus Networks Inc. 
-# - -PROG= bhyve - -SRCS = atkbdc.c \ - bhyvegc.c \ - bhyverun.c \ - block_if.c \ - console.c \ - consport.c \ - inout.c \ - ioapic.c \ - mem.c \ - mptbl.c \ - pci_ahci.c \ - pci_emul.c \ - pci_hostbridge.c \ - pci_irq.c \ - pci_lpc.c \ - pci_virtio_block.c \ - pci_virtio_net.c \ - pci_virtio_viona.c \ - pm.c \ - pmtmr.c \ - post.c \ - ps2kbd.c \ - ps2mouse.c \ - rfb.c \ - rtc.c \ - smbiostbl.c \ - uart_emul.c \ - vga.c \ - virtio.c \ - vmm_instruction_emul.c \ - xmsr.c \ - spinup_ap.c \ - bhyve_sol_glue.c - -OBJS = $(SRCS:.c=.o) - -include ../../Makefile.cmd - -.KEEP_STATE: - -CFLAGS += $(CCVERBOSE) -_gcc=-Wimplicit-function-declaration -CFLAGS64 += $(CCVERBOSE) -_gcc=-Wimplicit-function-declaration -CPPFLAGS = -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd $(CPPFLAGS.master) \ - -I$(ROOT)/usr/platform/i86pc/include \ - -I$(SRC)/uts/i86pc/io/vmm \ - -I$(SRC)/uts/common \ - -I$(SRC)/uts/i86pc \ - -I$(SRC)/lib/libdladm/common -LDLIBS += -lsocket -lnsl -ldlpi -ldladm -lkstat -lmd -luuid -lvmmapi - -POST_PROCESS += ; $(GENSETDEFS) $@ - -all: $(PROG) - -$(PROG): $(OBJS) - $(LINK.c) -o $@ $(OBJS) $(LDFLAGS) $(LDLIBS) - $(POST_PROCESS) - -install: all $(ROOTUSRSBINPROG) - -clean: - $(RM) $(OBJS) - -lint: lint_SRCS - -include ../../Makefile.targ - -%.o: ../%.c - $(COMPILE.c) $< - $(POST_PROCESS_O) - -%.o: $(SRC)/uts/i86pc/io/vmm/%.c - $(COMPILE.c) $< - $(POST_PROCESS_O) - -%.o: ../%.s - $(COMPILE.s) $< diff --git a/usr/src/cmd/bhyve/acpi.c b/usr/src/cmd/bhyve/acpi.c new file mode 100644 index 0000000000..862f4512f8 --- /dev/null +++ b/usr/src/cmd/bhyve/acpi.c @@ -0,0 +1,1007 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * bhyve ACPI table generator. + * + * Create the minimal set of ACPI tables required to boot FreeBSD (and + * hopefully other o/s's) by writing out ASL template files for each of + * the tables and the compiling them to AML with the Intel iasl compiler. + * The AML files are then read into guest memory. + * + * The tables are placed in the guest's ROM area just below 1MB physical, + * above the MPTable. 
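+ *
+ * (A check of the "21" cited in the layout note below: the fixed parts
+ * of the MADT come to 44 + 12 + 2*10 + 6 = 82 bytes and each CPU entry
+ * adds 8, so the original 0x100-byte slot between MADT at 0xf2500 and
+ * FADT at 0xf2600 holds at most 82 + 8*21 = 250 bytes; a 22nd CPU
+ * would need 258 and overflow it.)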
+ *
+ * Layout (No longer correct at FADT and beyond due to properly
+ * calculating the size of the MADT to allow for changes to
+ * VM_MAXCPU above 21 which overflows this layout.)
+ * ------
+ * RSDP -> 0xf2400 (36 bytes fixed)
+ * RSDT -> 0xf2440 (36 bytes + 4*7 table addrs, 4 used)
+ * XSDT -> 0xf2480 (36 bytes + 8*7 table addrs, 4 used)
+ * MADT -> 0xf2500 (depends on #CPUs)
+ * FADT -> 0xf2600 (268 bytes)
+ * HPET -> 0xf2740 (56 bytes)
+ * MCFG -> 0xf2780 (60 bytes)
+ * FACS -> 0xf27C0 (64 bytes)
+ * DSDT -> 0xf2800 (variable - can go up to 0x100000)
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include "bhyverun.h"
+#include "acpi.h"
+#include "pci_emul.h"
+
+/*
+ * Define the base address of the ACPI tables, the sizes of some tables,
+ * and the offsets to the individual tables.
+ */
+#define BHYVE_ACPI_BASE 0xf2400
+#define RSDT_OFFSET 0x040
+#define XSDT_OFFSET 0x080
+#define MADT_OFFSET 0x100
+/*
+ * The MADT consists of:
+ * 44 Fixed Header
+ * 8 * maxcpu Processor Local APIC entries
+ * 12 I/O APIC entry
+ * 2 * 10 Interrupt Source Override entries
+ * 6 Local APIC NMI entry
+ */
+#define MADT_SIZE (44 + VM_MAXCPU*8 + 12 + 2*10 + 6)
+#define FADT_OFFSET (MADT_OFFSET + MADT_SIZE)
+#define FADT_SIZE 0x140
+#define HPET_OFFSET (FADT_OFFSET + FADT_SIZE)
+#define HPET_SIZE 0x40
+#define MCFG_OFFSET (HPET_OFFSET + HPET_SIZE)
+#define MCFG_SIZE 0x40
+#define FACS_OFFSET (MCFG_OFFSET + MCFG_SIZE)
+#define FACS_SIZE 0x40
+#define DSDT_OFFSET (FACS_OFFSET + FACS_SIZE)
+
+#define BHYVE_ASL_TEMPLATE "bhyve.XXXXXXX"
+#define BHYVE_ASL_SUFFIX ".aml"
+#define BHYVE_ASL_COMPILER "/usr/sbin/iasl"
+
+static int basl_keep_temps;
+static int basl_verbose_iasl;
+static int basl_ncpu;
+static uint32_t basl_acpi_base = BHYVE_ACPI_BASE;
+static uint32_t hpet_capabilities;
+
+/*
+ * Contains the full pathname of the template to be passed
+ * to mkstemp/mkstemps(3)
+ */
+static char basl_template[MAXPATHLEN];
+static char basl_stemplate[MAXPATHLEN];
+
+/*
+ * State for dsdt_line(), dsdt_indent(), and dsdt_unindent().
+ */
+static FILE *dsdt_fp;
+static int dsdt_indent_level;
+static int dsdt_error;
+
+struct basl_fio {
+ int fd;
+ FILE *fp;
+ char f_name[MAXPATHLEN];
+};
+
+#define EFPRINTF(...) 
\ + if (fprintf(__VA_ARGS__) < 0) goto err_exit; + +#define EFFLUSH(x) \ + if (fflush(x) != 0) goto err_exit; + +static int +basl_fwrite_rsdp(FILE *fp) +{ + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve RSDP template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0008]\t\tSignature : \"RSD PTR \"\n"); + EFPRINTF(fp, "[0001]\t\tChecksum : 43\n"); + EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); + EFPRINTF(fp, "[0001]\t\tRevision : 02\n"); + EFPRINTF(fp, "[0004]\t\tRSDT Address : %08X\n", + basl_acpi_base + RSDT_OFFSET); + EFPRINTF(fp, "[0004]\t\tLength : 00000024\n"); + EFPRINTF(fp, "[0008]\t\tXSDT Address : 00000000%08X\n", + basl_acpi_base + XSDT_OFFSET); + EFPRINTF(fp, "[0001]\t\tExtended Checksum : 00\n"); + EFPRINTF(fp, "[0003]\t\tReserved : 000000\n"); + + EFFLUSH(fp); + + return (0); + +err_exit: + return (errno); +} + +static int +basl_fwrite_rsdt(FILE *fp) +{ + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve RSDT template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0004]\t\tSignature : \"RSDT\"\n"); + EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n"); + EFPRINTF(fp, "[0001]\t\tRevision : 01\n"); + EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); + EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); + EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVRSDT \"\n"); + EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); + /* iasl will fill in the compiler ID/revision fields */ + EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); + EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); + EFPRINTF(fp, "\n"); + + /* Add in pointers to the MADT, FADT and HPET */ + EFPRINTF(fp, "[0004]\t\tACPI Table Address 0 : %08X\n", + basl_acpi_base + MADT_OFFSET); + EFPRINTF(fp, "[0004]\t\tACPI Table Address 1 : %08X\n", + basl_acpi_base + FADT_OFFSET); + EFPRINTF(fp, "[0004]\t\tACPI Table Address 2 : %08X\n", + basl_acpi_base + HPET_OFFSET); + EFPRINTF(fp, "[0004]\t\tACPI Table Address 3 : %08X\n", + basl_acpi_base + MCFG_OFFSET); + + EFFLUSH(fp); + + return (0); + +err_exit: + return (errno); +} + +static int +basl_fwrite_xsdt(FILE *fp) +{ + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve XSDT template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0004]\t\tSignature : \"XSDT\"\n"); + EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n"); + EFPRINTF(fp, "[0001]\t\tRevision : 01\n"); + EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); + EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); + EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVXSDT \"\n"); + EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); + /* iasl will fill in the compiler ID/revision fields */ + EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); + EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); + EFPRINTF(fp, "\n"); + + /* Add in pointers to the MADT, FADT and HPET */ + EFPRINTF(fp, "[0004]\t\tACPI Table Address 0 : 00000000%08X\n", + basl_acpi_base + MADT_OFFSET); + EFPRINTF(fp, "[0004]\t\tACPI Table Address 1 : 00000000%08X\n", + basl_acpi_base + FADT_OFFSET); + EFPRINTF(fp, "[0004]\t\tACPI Table Address 2 : 00000000%08X\n", + basl_acpi_base + HPET_OFFSET); + EFPRINTF(fp, "[0004]\t\tACPI Table Address 3 : 00000000%08X\n", + basl_acpi_base + MCFG_OFFSET); + + EFFLUSH(fp); + + return (0); + +err_exit: + return (errno); +} + +static int +basl_fwrite_madt(FILE *fp) +{ + int i; + + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve MADT template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0004]\t\tSignature : \"APIC\"\n"); + EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n"); + EFPRINTF(fp, "[0001]\t\tRevision : 01\n"); + 
EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); + EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); + EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVMADT \"\n"); + EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); + + /* iasl will fill in the compiler ID/revision fields */ + EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); + EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0004]\t\tLocal Apic Address : FEE00000\n"); + EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000001\n"); + EFPRINTF(fp, "\t\t\tPC-AT Compatibility : 1\n"); + EFPRINTF(fp, "\n"); + + /* Add a Processor Local APIC entry for each CPU */ + for (i = 0; i < basl_ncpu; i++) { + EFPRINTF(fp, "[0001]\t\tSubtable Type : 00\n"); + EFPRINTF(fp, "[0001]\t\tLength : 08\n"); + /* iasl expects hex values for the proc and apic id's */ + EFPRINTF(fp, "[0001]\t\tProcessor ID : %02x\n", i); + EFPRINTF(fp, "[0001]\t\tLocal Apic ID : %02x\n", i); + EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000001\n"); + EFPRINTF(fp, "\t\t\tProcessor Enabled : 1\n"); +#ifdef __FreeBSD__ + EFPRINTF(fp, "\t\t\tRuntime Online Capable : 0\n"); +#else + /* + * Until iasl is updated to support the "Runtime Online + * Capable" entry, it must be omitted. This should be + * re-checked when illumos receives an acpica update. + */ +#endif /* __FreeBSD__ */ + EFPRINTF(fp, "\n"); + } + + /* Always a single IOAPIC entry, with ID 0 */ + EFPRINTF(fp, "[0001]\t\tSubtable Type : 01\n"); + EFPRINTF(fp, "[0001]\t\tLength : 0C\n"); + /* iasl expects a hex value for the i/o apic id */ + EFPRINTF(fp, "[0001]\t\tI/O Apic ID : %02x\n", 0); + EFPRINTF(fp, "[0001]\t\tReserved : 00\n"); + EFPRINTF(fp, "[0004]\t\tAddress : fec00000\n"); + EFPRINTF(fp, "[0004]\t\tInterrupt : 00000000\n"); + EFPRINTF(fp, "\n"); + + /* Legacy IRQ0 is connected to pin 2 of the IOAPIC */ + EFPRINTF(fp, "[0001]\t\tSubtable Type : 02\n"); + EFPRINTF(fp, "[0001]\t\tLength : 0A\n"); + EFPRINTF(fp, "[0001]\t\tBus : 00\n"); + EFPRINTF(fp, "[0001]\t\tSource : 00\n"); + EFPRINTF(fp, "[0004]\t\tInterrupt : 00000002\n"); + EFPRINTF(fp, "[0002]\t\tFlags (decoded below) : 0005\n"); + EFPRINTF(fp, "\t\t\tPolarity : 1\n"); + EFPRINTF(fp, "\t\t\tTrigger Mode : 1\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0001]\t\tSubtable Type : 02\n"); + EFPRINTF(fp, "[0001]\t\tLength : 0A\n"); + EFPRINTF(fp, "[0001]\t\tBus : 00\n"); + EFPRINTF(fp, "[0001]\t\tSource : %02X\n", SCI_INT); + EFPRINTF(fp, "[0004]\t\tInterrupt : %08X\n", SCI_INT); + EFPRINTF(fp, "[0002]\t\tFlags (decoded below) : 0000\n"); + EFPRINTF(fp, "\t\t\tPolarity : 3\n"); + EFPRINTF(fp, "\t\t\tTrigger Mode : 3\n"); + EFPRINTF(fp, "\n"); + + /* Local APIC NMI is connected to LINT 1 on all CPUs */ + EFPRINTF(fp, "[0001]\t\tSubtable Type : 04\n"); + EFPRINTF(fp, "[0001]\t\tLength : 06\n"); + EFPRINTF(fp, "[0001]\t\tProcessorId : FF\n"); + EFPRINTF(fp, "[0002]\t\tFlags (decoded below) : 0005\n"); + EFPRINTF(fp, "\t\t\tPolarity : 1\n"); + EFPRINTF(fp, "\t\t\tTrigger Mode : 1\n"); + EFPRINTF(fp, "[0001]\t\tInterrupt : 01\n"); + EFPRINTF(fp, "\n"); + + EFFLUSH(fp); + + return (0); + +err_exit: + return (errno); +} + +static int +basl_fwrite_fadt(FILE *fp) +{ + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve FADT template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0004]\t\tSignature : \"FACP\"\n"); + EFPRINTF(fp, "[0004]\t\tTable Length : 0000010C\n"); + EFPRINTF(fp, "[0001]\t\tRevision : 05\n"); + EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); + EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); + EFPRINTF(fp, 
"[0008]\t\tOem Table ID : \"BVFACP \"\n"); + EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); + /* iasl will fill in the compiler ID/revision fields */ + EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); + EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0004]\t\tFACS Address : %08X\n", + basl_acpi_base + FACS_OFFSET); + EFPRINTF(fp, "[0004]\t\tDSDT Address : %08X\n", + basl_acpi_base + DSDT_OFFSET); + EFPRINTF(fp, "[0001]\t\tModel : 01\n"); + EFPRINTF(fp, "[0001]\t\tPM Profile : 00 [Unspecified]\n"); + EFPRINTF(fp, "[0002]\t\tSCI Interrupt : %04X\n", + SCI_INT); + EFPRINTF(fp, "[0004]\t\tSMI Command Port : %08X\n", + SMI_CMD); + EFPRINTF(fp, "[0001]\t\tACPI Enable Value : %02X\n", + BHYVE_ACPI_ENABLE); + EFPRINTF(fp, "[0001]\t\tACPI Disable Value : %02X\n", + BHYVE_ACPI_DISABLE); + EFPRINTF(fp, "[0001]\t\tS4BIOS Command : 00\n"); + EFPRINTF(fp, "[0001]\t\tP-State Control : 00\n"); + EFPRINTF(fp, "[0004]\t\tPM1A Event Block Address : %08X\n", + PM1A_EVT_ADDR); + EFPRINTF(fp, "[0004]\t\tPM1B Event Block Address : 00000000\n"); + EFPRINTF(fp, "[0004]\t\tPM1A Control Block Address : %08X\n", + PM1A_CNT_ADDR); + EFPRINTF(fp, "[0004]\t\tPM1B Control Block Address : 00000000\n"); + EFPRINTF(fp, "[0004]\t\tPM2 Control Block Address : 00000000\n"); + EFPRINTF(fp, "[0004]\t\tPM Timer Block Address : %08X\n", + IO_PMTMR); + EFPRINTF(fp, "[0004]\t\tGPE0 Block Address : 00000000\n"); + EFPRINTF(fp, "[0004]\t\tGPE1 Block Address : 00000000\n"); + EFPRINTF(fp, "[0001]\t\tPM1 Event Block Length : 04\n"); + EFPRINTF(fp, "[0001]\t\tPM1 Control Block Length : 02\n"); + EFPRINTF(fp, "[0001]\t\tPM2 Control Block Length : 00\n"); + EFPRINTF(fp, "[0001]\t\tPM Timer Block Length : 04\n"); + EFPRINTF(fp, "[0001]\t\tGPE0 Block Length : 00\n"); + EFPRINTF(fp, "[0001]\t\tGPE1 Block Length : 00\n"); + EFPRINTF(fp, "[0001]\t\tGPE1 Base Offset : 00\n"); + EFPRINTF(fp, "[0001]\t\t_CST Support : 00\n"); + EFPRINTF(fp, "[0002]\t\tC2 Latency : 0000\n"); + EFPRINTF(fp, "[0002]\t\tC3 Latency : 0000\n"); + EFPRINTF(fp, "[0002]\t\tCPU Cache Size : 0000\n"); + EFPRINTF(fp, "[0002]\t\tCache Flush Stride : 0000\n"); + EFPRINTF(fp, "[0001]\t\tDuty Cycle Offset : 00\n"); + EFPRINTF(fp, "[0001]\t\tDuty Cycle Width : 00\n"); + EFPRINTF(fp, "[0001]\t\tRTC Day Alarm Index : 00\n"); + EFPRINTF(fp, "[0001]\t\tRTC Month Alarm Index : 00\n"); + EFPRINTF(fp, "[0001]\t\tRTC Century Index : 32\n"); + EFPRINTF(fp, "[0002]\t\tBoot Flags (decoded below) : 0000\n"); + EFPRINTF(fp, "\t\t\tLegacy Devices Supported (V2) : 0\n"); + EFPRINTF(fp, "\t\t\t8042 Present on ports 60/64 (V2) : 0\n"); + EFPRINTF(fp, "\t\t\tVGA Not Present (V4) : 1\n"); + EFPRINTF(fp, "\t\t\tMSI Not Supported (V4) : 0\n"); + EFPRINTF(fp, "\t\t\tPCIe ASPM Not Supported (V4) : 1\n"); + EFPRINTF(fp, "\t\t\tCMOS RTC Not Present (V5) : 0\n"); + EFPRINTF(fp, "[0001]\t\tReserved : 00\n"); + EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000000\n"); + EFPRINTF(fp, "\t\t\tWBINVD instruction is operational (V1) : 1\n"); + EFPRINTF(fp, "\t\t\tWBINVD flushes all caches (V1) : 0\n"); + EFPRINTF(fp, "\t\t\tAll CPUs support C1 (V1) : 1\n"); + EFPRINTF(fp, "\t\t\tC2 works on MP system (V1) : 0\n"); + EFPRINTF(fp, "\t\t\tControl Method Power Button (V1) : 0\n"); + EFPRINTF(fp, "\t\t\tControl Method Sleep Button (V1) : 1\n"); + EFPRINTF(fp, "\t\t\tRTC wake not in fixed reg space (V1) : 0\n"); + EFPRINTF(fp, "\t\t\tRTC can wake system from S4 (V1) : 0\n"); + EFPRINTF(fp, "\t\t\t32-bit PM Timer (V1) : 1\n"); + EFPRINTF(fp, 
"\t\t\tDocking Supported (V1) : 0\n"); + EFPRINTF(fp, "\t\t\tReset Register Supported (V2) : 1\n"); + EFPRINTF(fp, "\t\t\tSealed Case (V3) : 0\n"); + EFPRINTF(fp, "\t\t\tHeadless - No Video (V3) : 1\n"); + EFPRINTF(fp, "\t\t\tUse native instr after SLP_TYPx (V3) : 0\n"); + EFPRINTF(fp, "\t\t\tPCIEXP_WAK Bits Supported (V4) : 0\n"); + EFPRINTF(fp, "\t\t\tUse Platform Timer (V4) : 0\n"); + EFPRINTF(fp, "\t\t\tRTC_STS valid on S4 wake (V4) : 0\n"); + EFPRINTF(fp, "\t\t\tRemote Power-on capable (V4) : 0\n"); + EFPRINTF(fp, "\t\t\tUse APIC Cluster Model (V4) : 0\n"); + EFPRINTF(fp, "\t\t\tUse APIC Physical Destination Mode (V4) : 1\n"); + EFPRINTF(fp, "\t\t\tHardware Reduced (V5) : 0\n"); + EFPRINTF(fp, "\t\t\tLow Power S0 Idle (V5) : 0\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, + "[0012]\t\tReset Register : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 08\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000CF9\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0001]\t\tValue to cause reset : 06\n"); + EFPRINTF(fp, "[0002]\t\tARM Flags (decoded below): 0000\n"); + EFPRINTF(fp, "\t\t\tPSCI Compliant : 0\n"); + EFPRINTF(fp, "\t\t\tMust use HVC for PSCI : 0\n"); + EFPRINTF(fp, "[0001]\t\tFADT Minor Revision : 01\n"); + EFPRINTF(fp, "[0008]\t\tFACS Address : 00000000%08X\n", + basl_acpi_base + FACS_OFFSET); + EFPRINTF(fp, "[0008]\t\tDSDT Address : 00000000%08X\n", + basl_acpi_base + DSDT_OFFSET); + EFPRINTF(fp, + "[0012]\t\tPM1A Event Block : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 20\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 02 [Word Access:16]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 00000000%08X\n", + PM1A_EVT_ADDR); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, + "[0012]\t\tPM1B Event Block : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 00\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, + "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, + "[0012]\t\tPM1A Control Block : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 10\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 02 [Word Access:16]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 00000000%08X\n", + PM1A_CNT_ADDR); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, + "[0012]\t\tPM1B Control Block : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 00\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, + "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, + "[0012]\t\tPM2 Control Block : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 08\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, + "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); 
+ EFPRINTF(fp, "\n"); + + /* Valid for bhyve */ + EFPRINTF(fp, + "[0012]\t\tPM Timer Block : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 20\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, + "[0001]\t\tEncoded Access Width : 03 [DWord Access:32]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 00000000%08X\n", + IO_PMTMR); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0012]\t\tGPE0 Block : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 00\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0012]\t\tGPE1 Block : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 00\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, + "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, + "[0012]\t\tSleep Control Register : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 08\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, + "[0012]\t\tSleep Status Register : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 08\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); + + EFFLUSH(fp); + + return (0); + +err_exit: + return (errno); +} + +static int +basl_fwrite_hpet(FILE *fp) +{ + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve HPET template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0004]\t\tSignature : \"HPET\"\n"); + EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n"); + EFPRINTF(fp, "[0001]\t\tRevision : 01\n"); + EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); + EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); + EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVHPET \"\n"); + EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); + + /* iasl will fill in the compiler ID/revision fields */ + EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); + EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0004]\t\tTimer Block ID : %08X\n", hpet_capabilities); + EFPRINTF(fp, + "[0012]\t\tTimer Block Register : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 00 [SystemMemory]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 00\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, + "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 00000000FED00000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0001]\t\tHPET Number : 00\n"); + EFPRINTF(fp, "[0002]\t\tMinimum Clock Ticks : 0000\n"); + EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000001\n"); + EFPRINTF(fp, "\t\t\t4K Page Protect : 1\n"); + EFPRINTF(fp, "\t\t\t64K Page Protect : 0\n"); + EFPRINTF(fp, "\n"); + + EFFLUSH(fp); + + return (0); + +err_exit: + return (errno); +} + 
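+/*
+ * Each of the basl_fwrite_*() routines follows the same contract: emit
+ * the ASL template for one table and return 0, or return errno if any
+ * stdio call fails (the EFPRINTF/EFFLUSH macros branch to err_exit on
+ * failure). A minimal, purely hypothetical writer for some new table
+ * "FOO!" would take the shape:
+ *
+ * static int
+ * basl_fwrite_foo(FILE *fp)
+ * {
+ * 	EFPRINTF(fp, "[0004]\t\tSignature : \"FOO!\"\n");
+ * 	EFFLUSH(fp);
+ * 	return (0);
+ * err_exit:
+ * 	return (errno);
+ * }
+ */
+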
+static int +basl_fwrite_mcfg(FILE *fp) +{ + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve MCFG template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0004]\t\tSignature : \"MCFG\"\n"); + EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n"); + EFPRINTF(fp, "[0001]\t\tRevision : 01\n"); + EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); + EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); + EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVMCFG \"\n"); + EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); + + /* iasl will fill in the compiler ID/revision fields */ + EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); + EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); + EFPRINTF(fp, "[0008]\t\tReserved : 0\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0008]\t\tBase Address : %016lX\n", pci_ecfg_base()); + EFPRINTF(fp, "[0002]\t\tSegment Group: 0000\n"); + EFPRINTF(fp, "[0001]\t\tStart Bus: 00\n"); + EFPRINTF(fp, "[0001]\t\tEnd Bus: FF\n"); + EFPRINTF(fp, "[0004]\t\tReserved : 0\n"); + EFFLUSH(fp); + return (0); +err_exit: + return (errno); +} + +static int +basl_fwrite_facs(FILE *fp) +{ + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve FACS template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0004]\t\tSignature : \"FACS\"\n"); + EFPRINTF(fp, "[0004]\t\tLength : 00000040\n"); + EFPRINTF(fp, "[0004]\t\tHardware Signature : 00000000\n"); + EFPRINTF(fp, "[0004]\t\t32 Firmware Waking Vector : 00000000\n"); + EFPRINTF(fp, "[0004]\t\tGlobal Lock : 00000000\n"); + EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000000\n"); + EFPRINTF(fp, "\t\t\tS4BIOS Support Present : 0\n"); + EFPRINTF(fp, "\t\t\t64-bit Wake Supported (V2) : 0\n"); + EFPRINTF(fp, + "[0008]\t\t64 Firmware Waking Vector : 0000000000000000\n"); + EFPRINTF(fp, "[0001]\t\tVersion : 02\n"); + EFPRINTF(fp, "[0003]\t\tReserved : 000000\n"); + EFPRINTF(fp, "[0004]\t\tOspmFlags (decoded below) : 00000000\n"); + EFPRINTF(fp, "\t\t\t64-bit Wake Env Required (V2) : 0\n"); + + EFFLUSH(fp); + + return (0); + +err_exit: + return (errno); +} + +/* + * Helper routines for writing to the DSDT from other modules. + */ +void +dsdt_line(const char *fmt, ...) 
+{ + va_list ap; + + if (dsdt_error != 0) + return; + + if (strcmp(fmt, "") != 0) { + if (dsdt_indent_level != 0) + EFPRINTF(dsdt_fp, "%*c", dsdt_indent_level * 2, ' '); + va_start(ap, fmt); + if (vfprintf(dsdt_fp, fmt, ap) < 0) { + va_end(ap); + goto err_exit; + } + va_end(ap); + } + EFPRINTF(dsdt_fp, "\n"); + return; + +err_exit: + dsdt_error = errno; +} + +void +dsdt_indent(int levels) +{ + + dsdt_indent_level += levels; + assert(dsdt_indent_level >= 0); +} + +void +dsdt_unindent(int levels) +{ + + assert(dsdt_indent_level >= levels); + dsdt_indent_level -= levels; +} + +void +dsdt_fixed_ioport(uint16_t iobase, uint16_t length) +{ + + dsdt_line("IO (Decode16,"); + dsdt_line(" 0x%04X, // Range Minimum", iobase); + dsdt_line(" 0x%04X, // Range Maximum", iobase); + dsdt_line(" 0x01, // Alignment"); + dsdt_line(" 0x%02X, // Length", length); + dsdt_line(" )"); +} + +void +dsdt_fixed_irq(uint8_t irq) +{ + + dsdt_line("IRQNoFlags ()"); + dsdt_line(" {%d}", irq); +} + +void +dsdt_fixed_mem32(uint32_t base, uint32_t length) +{ + + dsdt_line("Memory32Fixed (ReadWrite,"); + dsdt_line(" 0x%08X, // Address Base", base); + dsdt_line(" 0x%08X, // Address Length", length); + dsdt_line(" )"); +} + +static int +basl_fwrite_dsdt(FILE *fp) +{ + dsdt_fp = fp; + dsdt_error = 0; + dsdt_indent_level = 0; + + dsdt_line("/*"); + dsdt_line(" * bhyve DSDT template"); + dsdt_line(" */"); + dsdt_line("DefinitionBlock (\"bhyve_dsdt.aml\", \"DSDT\", 2," + "\"BHYVE \", \"BVDSDT \", 0x00000001)"); + dsdt_line("{"); + dsdt_line(" Name (_S5, Package ()"); + dsdt_line(" {"); + dsdt_line(" 0x05,"); + dsdt_line(" Zero,"); + dsdt_line(" })"); + + pci_write_dsdt(); + + dsdt_line(""); + dsdt_line(" Scope (_SB.PC00)"); + dsdt_line(" {"); + dsdt_line(" Device (HPET)"); + dsdt_line(" {"); + dsdt_line(" Name (_HID, EISAID(\"PNP0103\"))"); + dsdt_line(" Name (_UID, 0)"); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_indent(4); + dsdt_fixed_mem32(0xFED00000, 0x400); + dsdt_unindent(4); + dsdt_line(" })"); + dsdt_line(" }"); + dsdt_line(" }"); + dsdt_line("}"); + + if (dsdt_error != 0) + return (dsdt_error); + + EFFLUSH(fp); + + return (0); + +err_exit: + return (errno); +} + +static int +basl_open(struct basl_fio *bf, int suffix) +{ + int err; + + err = 0; + + if (suffix) { + strlcpy(bf->f_name, basl_stemplate, MAXPATHLEN); + bf->fd = mkstemps(bf->f_name, strlen(BHYVE_ASL_SUFFIX)); + } else { + strlcpy(bf->f_name, basl_template, MAXPATHLEN); + bf->fd = mkstemp(bf->f_name); + } + + if (bf->fd > 0) { + bf->fp = fdopen(bf->fd, "w+"); + if (bf->fp == NULL) { + unlink(bf->f_name); + close(bf->fd); + } + } else { + err = 1; + } + + return (err); +} + +static void +basl_close(struct basl_fio *bf) +{ + + if (!basl_keep_temps) + unlink(bf->f_name); + fclose(bf->fp); +} + +static int +basl_start(struct basl_fio *in, struct basl_fio *out) +{ + int err; + + err = basl_open(in, 0); + if (!err) { + err = basl_open(out, 1); + if (err) { + basl_close(in); + } + } + + return (err); +} + +static void +basl_end(struct basl_fio *in, struct basl_fio *out) +{ + + basl_close(in); + basl_close(out); +} + +static int +basl_load(struct vmctx *ctx, int fd, uint64_t off) +{ + struct stat sb; + void *gaddr; + + if (fstat(fd, &sb) < 0) + return (errno); + + gaddr = paddr_guest2host(ctx, basl_acpi_base + off, sb.st_size); + if (gaddr == NULL) + return (EFAULT); + + if (read(fd, gaddr, sb.st_size) < 0) + return (errno); + + return (0); +} + +static int +basl_compile(struct vmctx *ctx, int (*fwrite_section)(FILE *), uint64_t offset) +{ 
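+ /*
+ * Overall flow: write the ASL template to a temporary file, run iasl
+ * on it to produce AML, then copy the AML into guest memory at the
+ * given table offset via basl_load().
+ */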
+ struct basl_fio io[2]; + static char iaslbuf[3*MAXPATHLEN + 10]; + char *fmt; + int err; + + err = basl_start(&io[0], &io[1]); + if (!err) { + err = (*fwrite_section)(io[0].fp); + + if (!err) { + /* + * iasl sends the results of the compilation to + * stdout. Shut this down by using the shell to + * redirect stdout to /dev/null, unless the user + * has requested verbose output for debugging + * purposes + */ + fmt = basl_verbose_iasl ? + "%s -p %s %s" : + "/bin/sh -c \"%s -p %s %s\" 1> /dev/null"; + + snprintf(iaslbuf, sizeof(iaslbuf), + fmt, + BHYVE_ASL_COMPILER, + io[1].f_name, io[0].f_name); + err = system(iaslbuf); + + if (!err) { + /* + * Copy the aml output file into guest + * memory at the specified location + */ + err = basl_load(ctx, io[1].fd, offset); + } + } + basl_end(&io[0], &io[1]); + } + + return (err); +} + +static int +basl_make_templates(void) +{ + const char *tmpdir; + int err; + int len; + + err = 0; + + /* + * + */ + if ((tmpdir = getenv("BHYVE_TMPDIR")) == NULL || *tmpdir == '\0' || + (tmpdir = getenv("TMPDIR")) == NULL || *tmpdir == '\0') { + tmpdir = _PATH_TMP; + } + + len = strlen(tmpdir); + + if ((len + sizeof(BHYVE_ASL_TEMPLATE) + 1) < MAXPATHLEN) { + strcpy(basl_template, tmpdir); + while (len > 0 && basl_template[len - 1] == '/') + len--; + basl_template[len] = '/'; + strcpy(&basl_template[len + 1], BHYVE_ASL_TEMPLATE); + } else + err = E2BIG; + + if (!err) { + /* + * len has been intialized (and maybe adjusted) above + */ + if ((len + sizeof(BHYVE_ASL_TEMPLATE) + 1 + + sizeof(BHYVE_ASL_SUFFIX)) < MAXPATHLEN) { + strcpy(basl_stemplate, tmpdir); + basl_stemplate[len] = '/'; + strcpy(&basl_stemplate[len + 1], BHYVE_ASL_TEMPLATE); + len = strlen(basl_stemplate); + strcpy(&basl_stemplate[len], BHYVE_ASL_SUFFIX); + } else + err = E2BIG; + } + + return (err); +} + +static struct { + int (*wsect)(FILE *fp); + uint64_t offset; +} basl_ftables[] = +{ + { basl_fwrite_rsdp, 0}, + { basl_fwrite_rsdt, RSDT_OFFSET }, + { basl_fwrite_xsdt, XSDT_OFFSET }, + { basl_fwrite_madt, MADT_OFFSET }, + { basl_fwrite_fadt, FADT_OFFSET }, + { basl_fwrite_hpet, HPET_OFFSET }, + { basl_fwrite_mcfg, MCFG_OFFSET }, + { basl_fwrite_facs, FACS_OFFSET }, + { basl_fwrite_dsdt, DSDT_OFFSET }, + { NULL } +}; + +int +acpi_build(struct vmctx *ctx, int ncpu) +{ + int err; + int i; + + basl_ncpu = ncpu; + + err = vm_get_hpet_capabilities(ctx, &hpet_capabilities); + if (err != 0) + return (err); + + /* + * For debug, allow the user to have iasl compiler output sent + * to stdout rather than /dev/null + */ + if (getenv("BHYVE_ACPI_VERBOSE_IASL")) + basl_verbose_iasl = 1; + + /* + * Allow the user to keep the generated ASL files for debugging + * instead of deleting them following use + */ + if (getenv("BHYVE_ACPI_KEEPTMPS")) + basl_keep_temps = 1; + + i = 0; + err = basl_make_templates(); + + /* + * Run through all the ASL files, compiling them and + * copying them into guest memory + */ + while (!err && basl_ftables[i].wsect != NULL) { + err = basl_compile(ctx, basl_ftables[i].wsect, + basl_ftables[i].offset); + i++; + } + + return (err); +} diff --git a/usr/src/cmd/bhyve/acpi.h b/usr/src/cmd/bhyve/acpi.h index 477f827286..4c6d86d091 100644 --- a/usr/src/cmd/bhyve/acpi.h +++ b/usr/src/cmd/bhyve/acpi.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $FreeBSD: head/usr.sbin/bhyve/acpi.h 266125 2014-05-15 14:16:55Z jhb $ + * $FreeBSD$ */ #ifndef _ACPI_H_ diff --git a/usr/src/cmd/bhyve/ahci.h b/usr/src/cmd/bhyve/ahci.h index 1cf09adcbf..691d4bd438 100644 --- a/usr/src/cmd/bhyve/ahci.h +++ b/usr/src/cmd/bhyve/ahci.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1998 - 2008 Søren Schmidt * Copyright (c) 2009-2012 Alexander Motin * All rights reserved. @@ -24,281 +26,299 @@ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/ahci.h 256056 2013-10-04 18:31:38Z grehan $ + * $FreeBSD$ */ #ifndef _AHCI_H_ #define _AHCI_H_ /* ATA register defines */ -#define ATA_DATA 0 /* (RW) data */ - -#define ATA_FEATURE 1 /* (W) feature */ -#define ATA_F_DMA 0x01 /* enable DMA */ -#define ATA_F_OVL 0x02 /* enable overlap */ - -#define ATA_COUNT 2 /* (W) sector count */ - -#define ATA_SECTOR 3 /* (RW) sector # */ -#define ATA_CYL_LSB 4 /* (RW) cylinder# LSB */ -#define ATA_CYL_MSB 5 /* (RW) cylinder# MSB */ -#define ATA_DRIVE 6 /* (W) Sector/Drive/Head */ -#define ATA_D_LBA 0x40 /* use LBA addressing */ -#define ATA_D_IBM 0xa0 /* 512 byte sectors, ECC */ - -#define ATA_COMMAND 7 /* (W) command */ - -#define ATA_ERROR 8 /* (R) error */ -#define ATA_E_ILI 0x01 /* illegal length */ -#define ATA_E_NM 0x02 /* no media */ -#define ATA_E_ABORT 0x04 /* command aborted */ -#define ATA_E_MCR 0x08 /* media change request */ -#define ATA_E_IDNF 0x10 /* ID not found */ -#define ATA_E_MC 0x20 /* media changed */ -#define ATA_E_UNC 0x40 /* uncorrectable data */ -#define ATA_E_ICRC 0x80 /* UDMA crc error */ -#define ATA_E_ATAPI_SENSE_MASK 0xf0 /* ATAPI sense key mask */ - -#define ATA_IREASON 9 /* (R) interrupt reason */ -#define ATA_I_CMD 0x01 /* cmd (1) | data (0) */ -#define ATA_I_IN 0x02 /* read (1) | write (0) */ -#define ATA_I_RELEASE 0x04 /* released bus (1) */ -#define ATA_I_TAGMASK 0xf8 /* tag mask */ - -#define ATA_STATUS 10 /* (R) status */ -#define ATA_ALTSTAT 11 /* (R) alternate status */ -#define ATA_S_ERROR 0x01 /* error */ -#define ATA_S_INDEX 0x02 /* index */ -#define ATA_S_CORR 0x04 /* data corrected */ -#define ATA_S_DRQ 0x08 /* data request */ -#define ATA_S_DSC 0x10 /* drive seek completed */ -#define ATA_S_SERVICE 0x10 /* drive needs service */ -#define ATA_S_DWF 0x20 /* drive write fault */ -#define ATA_S_DMA 0x20 /* DMA ready */ -#define ATA_S_READY 0x40 /* drive ready */ -#define ATA_S_BUSY 0x80 /* busy */ - -#define ATA_CONTROL 12 /* (W) control */ -#define ATA_A_IDS 0x02 /* disable interrupts */ -#define ATA_A_RESET 0x04 /* RESET controller */ -#define ATA_A_4BIT 0x08 /* 4 head bits */ -#define ATA_A_HOB 0x80 /* High Order Byte enable */ +#define ATA_DATA 0 /* (RW) data */ + +#define ATA_FEATURE 1 /* (W) feature */ +#define ATA_F_DMA 0x01 /* enable DMA */ +#define ATA_F_OVL 0x02 /* enable overlap */ + +#define ATA_COUNT 2 /* (W) sector count */ + +#define ATA_SECTOR 3 /* (RW) sector # */ +#define ATA_CYL_LSB 4 /* (RW) cylinder# LSB */ +#define ATA_CYL_MSB 5 /* (RW) cylinder# MSB */ +#define ATA_DRIVE 6 /* (W) Sector/Drive/Head */ +#define ATA_D_LBA 0x40 /* use LBA addressing */ +#define ATA_D_IBM 0xa0 /* 512 byte sectors, ECC */ + +#define ATA_COMMAND 7 /* (W) command */ + +#define ATA_ERROR 8 /* (R) error */ +#define ATA_E_ILI 0x01 /* illegal length */ +#define ATA_E_NM 0x02 /* no media */ +#define ATA_E_ABORT 0x04 /* command aborted */ +#define ATA_E_MCR 0x08 /* media 
change request */ +#define ATA_E_IDNF 0x10 /* ID not found */ +#define ATA_E_MC 0x20 /* media changed */ +#define ATA_E_UNC 0x40 /* uncorrectable data */ +#define ATA_E_ICRC 0x80 /* UDMA crc error */ +#define ATA_E_ATAPI_SENSE_MASK 0xf0 /* ATAPI sense key mask */ + +#define ATA_IREASON 9 /* (R) interrupt reason */ +#define ATA_I_CMD 0x01 /* cmd (1) | data (0) */ +#define ATA_I_IN 0x02 /* read (1) | write (0) */ +#define ATA_I_RELEASE 0x04 /* released bus (1) */ +#define ATA_I_TAGMASK 0xf8 /* tag mask */ + +#define ATA_STATUS 10 /* (R) status */ +#define ATA_ALTSTAT 11 /* (R) alternate status */ +#define ATA_S_ERROR 0x01 /* error */ +#define ATA_S_INDEX 0x02 /* index */ +#define ATA_S_CORR 0x04 /* data corrected */ +#define ATA_S_DRQ 0x08 /* data request */ +#define ATA_S_DSC 0x10 /* drive seek completed */ +#define ATA_S_SERVICE 0x10 /* drive needs service */ +#define ATA_S_DWF 0x20 /* drive write fault */ +#define ATA_S_DMA 0x20 /* DMA ready */ +#define ATA_S_READY 0x40 /* drive ready */ +#define ATA_S_BUSY 0x80 /* busy */ + +#define ATA_CONTROL 12 /* (W) control */ +#define ATA_A_IDS 0x02 /* disable interrupts */ +#define ATA_A_RESET 0x04 /* RESET controller */ +#define ATA_A_4BIT 0x08 /* 4 head bits */ +#define ATA_A_HOB 0x80 /* High Order Byte enable */ /* SATA register defines */ -#define ATA_SSTATUS 13 -#define ATA_SS_DET_MASK 0x0000000f -#define ATA_SS_DET_NO_DEVICE 0x00000000 -#define ATA_SS_DET_DEV_PRESENT 0x00000001 -#define ATA_SS_DET_PHY_ONLINE 0x00000003 -#define ATA_SS_DET_PHY_OFFLINE 0x00000004 - -#define ATA_SS_SPD_MASK 0x000000f0 -#define ATA_SS_SPD_NO_SPEED 0x00000000 -#define ATA_SS_SPD_GEN1 0x00000010 -#define ATA_SS_SPD_GEN2 0x00000020 -#define ATA_SS_SPD_GEN3 0x00000040 - -#define ATA_SS_IPM_MASK 0x00000f00 -#define ATA_SS_IPM_NO_DEVICE 0x00000000 -#define ATA_SS_IPM_ACTIVE 0x00000100 -#define ATA_SS_IPM_PARTIAL 0x00000200 -#define ATA_SS_IPM_SLUMBER 0x00000600 - -#define ATA_SERROR 14 -#define ATA_SE_DATA_CORRECTED 0x00000001 -#define ATA_SE_COMM_CORRECTED 0x00000002 -#define ATA_SE_DATA_ERR 0x00000100 -#define ATA_SE_COMM_ERR 0x00000200 -#define ATA_SE_PROT_ERR 0x00000400 -#define ATA_SE_HOST_ERR 0x00000800 -#define ATA_SE_PHY_CHANGED 0x00010000 -#define ATA_SE_PHY_IERROR 0x00020000 -#define ATA_SE_COMM_WAKE 0x00040000 -#define ATA_SE_DECODE_ERR 0x00080000 -#define ATA_SE_PARITY_ERR 0x00100000 -#define ATA_SE_CRC_ERR 0x00200000 -#define ATA_SE_HANDSHAKE_ERR 0x00400000 -#define ATA_SE_LINKSEQ_ERR 0x00800000 -#define ATA_SE_TRANSPORT_ERR 0x01000000 -#define ATA_SE_UNKNOWN_FIS 0x02000000 -#define ATA_SE_EXCHANGED 0x04000000 - -#define ATA_SCONTROL 15 -#define ATA_SC_DET_MASK 0x0000000f -#define ATA_SC_DET_IDLE 0x00000000 -#define ATA_SC_DET_RESET 0x00000001 -#define ATA_SC_DET_DISABLE 0x00000004 - -#define ATA_SC_SPD_MASK 0x000000f0 -#define ATA_SC_SPD_NO_SPEED 0x00000000 -#define ATA_SC_SPD_SPEED_GEN1 0x00000010 -#define ATA_SC_SPD_SPEED_GEN2 0x00000020 -#define ATA_SC_SPD_SPEED_GEN3 0x00000040 - -#define ATA_SC_IPM_MASK 0x00000f00 -#define ATA_SC_IPM_NONE 0x00000000 -#define ATA_SC_IPM_DIS_PARTIAL 0x00000100 -#define ATA_SC_IPM_DIS_SLUMBER 0x00000200 - -#define ATA_SACTIVE 16 - -#define AHCI_MAX_PORTS 32 -#define AHCI_MAX_SLOTS 32 +#define ATA_SSTATUS 13 +#define ATA_SS_DET_MASK 0x0000000f +#define ATA_SS_DET_NO_DEVICE 0x00000000 +#define ATA_SS_DET_DEV_PRESENT 0x00000001 +#define ATA_SS_DET_PHY_ONLINE 0x00000003 +#define ATA_SS_DET_PHY_OFFLINE 0x00000004 + +#define ATA_SS_SPD_MASK 0x000000f0 +#define ATA_SS_SPD_NO_SPEED 0x00000000 +#define ATA_SS_SPD_GEN1 0x00000010 
+#define ATA_SS_SPD_GEN2 0x00000020 +#define ATA_SS_SPD_GEN3 0x00000030 + +#define ATA_SS_IPM_MASK 0x00000f00 +#define ATA_SS_IPM_NO_DEVICE 0x00000000 +#define ATA_SS_IPM_ACTIVE 0x00000100 +#define ATA_SS_IPM_PARTIAL 0x00000200 +#define ATA_SS_IPM_SLUMBER 0x00000600 +#define ATA_SS_IPM_DEVSLEEP 0x00000800 + +#define ATA_SERROR 14 +#define ATA_SE_DATA_CORRECTED 0x00000001 +#define ATA_SE_COMM_CORRECTED 0x00000002 +#define ATA_SE_DATA_ERR 0x00000100 +#define ATA_SE_COMM_ERR 0x00000200 +#define ATA_SE_PROT_ERR 0x00000400 +#define ATA_SE_HOST_ERR 0x00000800 +#define ATA_SE_PHY_CHANGED 0x00010000 +#define ATA_SE_PHY_IERROR 0x00020000 +#define ATA_SE_COMM_WAKE 0x00040000 +#define ATA_SE_DECODE_ERR 0x00080000 +#define ATA_SE_PARITY_ERR 0x00100000 +#define ATA_SE_CRC_ERR 0x00200000 +#define ATA_SE_HANDSHAKE_ERR 0x00400000 +#define ATA_SE_LINKSEQ_ERR 0x00800000 +#define ATA_SE_TRANSPORT_ERR 0x01000000 +#define ATA_SE_UNKNOWN_FIS 0x02000000 +#define ATA_SE_EXCHANGED 0x04000000 + +#define ATA_SCONTROL 15 +#define ATA_SC_DET_MASK 0x0000000f +#define ATA_SC_DET_IDLE 0x00000000 +#define ATA_SC_DET_RESET 0x00000001 +#define ATA_SC_DET_DISABLE 0x00000004 + +#define ATA_SC_SPD_MASK 0x000000f0 +#define ATA_SC_SPD_NO_SPEED 0x00000000 +#define ATA_SC_SPD_SPEED_GEN1 0x00000010 +#define ATA_SC_SPD_SPEED_GEN2 0x00000020 +#define ATA_SC_SPD_SPEED_GEN3 0x00000030 + +#define ATA_SC_IPM_MASK 0x00000f00 +#define ATA_SC_IPM_NONE 0x00000000 +#define ATA_SC_IPM_DIS_PARTIAL 0x00000100 +#define ATA_SC_IPM_DIS_SLUMBER 0x00000200 +#define ATA_SC_IPM_DIS_DEVSLEEP 0x00000400 + +#define ATA_SACTIVE 16 + +#define AHCI_MAX_PORTS 32 +#define AHCI_MAX_SLOTS 32 +#define AHCI_MAX_IRQS 16 /* SATA AHCI v1.0 register defines */ -#define AHCI_CAP 0x00 -#define AHCI_CAP_NPMASK 0x0000001f -#define AHCI_CAP_SXS 0x00000020 -#define AHCI_CAP_EMS 0x00000040 -#define AHCI_CAP_CCCS 0x00000080 -#define AHCI_CAP_NCS 0x00001F00 -#define AHCI_CAP_NCS_SHIFT 8 -#define AHCI_CAP_PSC 0x00002000 -#define AHCI_CAP_SSC 0x00004000 -#define AHCI_CAP_PMD 0x00008000 -#define AHCI_CAP_FBSS 0x00010000 -#define AHCI_CAP_SPM 0x00020000 -#define AHCI_CAP_SAM 0x00080000 -#define AHCI_CAP_ISS 0x00F00000 -#define AHCI_CAP_ISS_SHIFT 20 -#define AHCI_CAP_SCLO 0x01000000 -#define AHCI_CAP_SAL 0x02000000 -#define AHCI_CAP_SALP 0x04000000 -#define AHCI_CAP_SSS 0x08000000 -#define AHCI_CAP_SMPS 0x10000000 -#define AHCI_CAP_SSNTF 0x20000000 -#define AHCI_CAP_SNCQ 0x40000000 -#define AHCI_CAP_64BIT 0x80000000 - -#define AHCI_GHC 0x04 -#define AHCI_GHC_AE 0x80000000 -#define AHCI_GHC_MRSM 0x00000004 -#define AHCI_GHC_IE 0x00000002 -#define AHCI_GHC_HR 0x00000001 - -#define AHCI_IS 0x08 -#define AHCI_PI 0x0c -#define AHCI_VS 0x10 - -#define AHCI_CCCC 0x14 -#define AHCI_CCCC_TV_MASK 0xffff0000 -#define AHCI_CCCC_TV_SHIFT 16 -#define AHCI_CCCC_CC_MASK 0x0000ff00 -#define AHCI_CCCC_CC_SHIFT 8 -#define AHCI_CCCC_INT_MASK 0x000000f8 -#define AHCI_CCCC_INT_SHIFT 3 -#define AHCI_CCCC_EN 0x00000001 -#define AHCI_CCCP 0x18 - -#define AHCI_EM_LOC 0x1C -#define AHCI_EM_CTL 0x20 -#define AHCI_EM_MR 0x00000001 -#define AHCI_EM_TM 0x00000100 -#define AHCI_EM_RST 0x00000200 -#define AHCI_EM_LED 0x00010000 -#define AHCI_EM_SAFTE 0x00020000 -#define AHCI_EM_SES2 0x00040000 -#define AHCI_EM_SGPIO 0x00080000 -#define AHCI_EM_SMB 0x01000000 -#define AHCI_EM_XMT 0x02000000 -#define AHCI_EM_ALHD 0x04000000 -#define AHCI_EM_PM 0x08000000 - -#define AHCI_CAP2 0x24 -#define AHCI_CAP2_BOH 0x00000001 -#define AHCI_CAP2_NVMP 0x00000002 -#define AHCI_CAP2_APST 0x00000004 - -#define AHCI_OFFSET 0x100 
-#define AHCI_STEP 0x80 - -#define AHCI_P_CLB 0x00 -#define AHCI_P_CLBU 0x04 -#define AHCI_P_FB 0x08 -#define AHCI_P_FBU 0x0c -#define AHCI_P_IS 0x10 -#define AHCI_P_IE 0x14 -#define AHCI_P_IX_DHR 0x00000001 -#define AHCI_P_IX_PS 0x00000002 -#define AHCI_P_IX_DS 0x00000004 -#define AHCI_P_IX_SDB 0x00000008 -#define AHCI_P_IX_UF 0x00000010 -#define AHCI_P_IX_DP 0x00000020 -#define AHCI_P_IX_PC 0x00000040 -#define AHCI_P_IX_MP 0x00000080 - -#define AHCI_P_IX_PRC 0x00400000 -#define AHCI_P_IX_IPM 0x00800000 -#define AHCI_P_IX_OF 0x01000000 -#define AHCI_P_IX_INF 0x04000000 -#define AHCI_P_IX_IF 0x08000000 -#define AHCI_P_IX_HBD 0x10000000 -#define AHCI_P_IX_HBF 0x20000000 -#define AHCI_P_IX_TFE 0x40000000 -#define AHCI_P_IX_CPD 0x80000000 - -#define AHCI_P_CMD 0x18 -#define AHCI_P_CMD_ST 0x00000001 -#define AHCI_P_CMD_SUD 0x00000002 -#define AHCI_P_CMD_POD 0x00000004 -#define AHCI_P_CMD_CLO 0x00000008 -#define AHCI_P_CMD_FRE 0x00000010 -#define AHCI_P_CMD_CCS_MASK 0x00001f00 -#define AHCI_P_CMD_CCS_SHIFT 8 -#define AHCI_P_CMD_ISS 0x00002000 -#define AHCI_P_CMD_FR 0x00004000 -#define AHCI_P_CMD_CR 0x00008000 -#define AHCI_P_CMD_CPS 0x00010000 -#define AHCI_P_CMD_PMA 0x00020000 -#define AHCI_P_CMD_HPCP 0x00040000 -#define AHCI_P_CMD_MPSP 0x00080000 -#define AHCI_P_CMD_CPD 0x00100000 -#define AHCI_P_CMD_ESP 0x00200000 -#define AHCI_P_CMD_FBSCP 0x00400000 -#define AHCI_P_CMD_APSTE 0x00800000 -#define AHCI_P_CMD_ATAPI 0x01000000 -#define AHCI_P_CMD_DLAE 0x02000000 -#define AHCI_P_CMD_ALPE 0x04000000 -#define AHCI_P_CMD_ASP 0x08000000 -#define AHCI_P_CMD_ICC_MASK 0xf0000000 -#define AHCI_P_CMD_NOOP 0x00000000 -#define AHCI_P_CMD_ACTIVE 0x10000000 -#define AHCI_P_CMD_PARTIAL 0x20000000 -#define AHCI_P_CMD_SLUMBER 0x60000000 - -#define AHCI_P_TFD 0x20 -#define AHCI_P_SIG 0x24 -#define AHCI_P_SSTS 0x28 -#define AHCI_P_SCTL 0x2c -#define AHCI_P_SERR 0x30 -#define AHCI_P_SACT 0x34 -#define AHCI_P_CI 0x38 -#define AHCI_P_SNTF 0x3C -#define AHCI_P_FBS 0x40 -#define AHCI_P_FBS_EN 0x00000001 -#define AHCI_P_FBS_DEC 0x00000002 -#define AHCI_P_FBS_SDE 0x00000004 -#define AHCI_P_FBS_DEV 0x00000f00 -#define AHCI_P_FBS_DEV_SHIFT 8 -#define AHCI_P_FBS_ADO 0x0000f000 -#define AHCI_P_FBS_ADO_SHIFT 12 -#define AHCI_P_FBS_DWE 0x000f0000 -#define AHCI_P_FBS_DWE_SHIFT 16 +#define AHCI_CAP 0x00 +#define AHCI_CAP_NPMASK 0x0000001f +#define AHCI_CAP_SXS 0x00000020 +#define AHCI_CAP_EMS 0x00000040 +#define AHCI_CAP_CCCS 0x00000080 +#define AHCI_CAP_NCS 0x00001F00 +#define AHCI_CAP_NCS_SHIFT 8 +#define AHCI_CAP_PSC 0x00002000 +#define AHCI_CAP_SSC 0x00004000 +#define AHCI_CAP_PMD 0x00008000 +#define AHCI_CAP_FBSS 0x00010000 +#define AHCI_CAP_SPM 0x00020000 +#define AHCI_CAP_SAM 0x00080000 +#define AHCI_CAP_ISS 0x00F00000 +#define AHCI_CAP_ISS_SHIFT 20 +#define AHCI_CAP_SCLO 0x01000000 +#define AHCI_CAP_SAL 0x02000000 +#define AHCI_CAP_SALP 0x04000000 +#define AHCI_CAP_SSS 0x08000000 +#define AHCI_CAP_SMPS 0x10000000 +#define AHCI_CAP_SSNTF 0x20000000 +#define AHCI_CAP_SNCQ 0x40000000 +#define AHCI_CAP_64BIT 0x80000000 + +#define AHCI_GHC 0x04 +#define AHCI_GHC_AE 0x80000000 +#define AHCI_GHC_MRSM 0x00000004 +#define AHCI_GHC_IE 0x00000002 +#define AHCI_GHC_HR 0x00000001 + +#define AHCI_IS 0x08 +#define AHCI_PI 0x0c +#define AHCI_VS 0x10 + +#define AHCI_CCCC 0x14 +#define AHCI_CCCC_TV_MASK 0xffff0000 +#define AHCI_CCCC_TV_SHIFT 16 +#define AHCI_CCCC_CC_MASK 0x0000ff00 +#define AHCI_CCCC_CC_SHIFT 8 +#define AHCI_CCCC_INT_MASK 0x000000f8 +#define AHCI_CCCC_INT_SHIFT 3 +#define AHCI_CCCC_EN 0x00000001 +#define AHCI_CCCP 0x18 + 
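+/*
+ * For example, the coalescing timeout (TV) and completion count (CC)
+ * fields are extracted from the CCC_CTL (AHCI_CCCC) register with the
+ * masks and shifts above:
+ *
+ *	tv = (cccc & AHCI_CCCC_TV_MASK) >> AHCI_CCCC_TV_SHIFT;
+ *	cc = (cccc & AHCI_CCCC_CC_MASK) >> AHCI_CCCC_CC_SHIFT;
+ */
+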
+#define AHCI_EM_LOC 0x1C +#define AHCI_EM_CTL 0x20 +#define AHCI_EM_MR 0x00000001 +#define AHCI_EM_TM 0x00000100 +#define AHCI_EM_RST 0x00000200 +#define AHCI_EM_LED 0x00010000 +#define AHCI_EM_SAFTE 0x00020000 +#define AHCI_EM_SES2 0x00040000 +#define AHCI_EM_SGPIO 0x00080000 +#define AHCI_EM_SMB 0x01000000 +#define AHCI_EM_XMT 0x02000000 +#define AHCI_EM_ALHD 0x04000000 +#define AHCI_EM_PM 0x08000000 + +#define AHCI_CAP2 0x24 +#define AHCI_CAP2_BOH 0x00000001 +#define AHCI_CAP2_NVMP 0x00000002 +#define AHCI_CAP2_APST 0x00000004 +#define AHCI_CAP2_SDS 0x00000008 +#define AHCI_CAP2_SADM 0x00000010 +#define AHCI_CAP2_DESO 0x00000020 + +#define AHCI_OFFSET 0x100 +#define AHCI_STEP 0x80 + +#define AHCI_P_CLB 0x00 +#define AHCI_P_CLBU 0x04 +#define AHCI_P_FB 0x08 +#define AHCI_P_FBU 0x0c +#define AHCI_P_IS 0x10 +#define AHCI_P_IE 0x14 +#define AHCI_P_IX_DHR 0x00000001 +#define AHCI_P_IX_PS 0x00000002 +#define AHCI_P_IX_DS 0x00000004 +#define AHCI_P_IX_SDB 0x00000008 +#define AHCI_P_IX_UF 0x00000010 +#define AHCI_P_IX_DP 0x00000020 +#define AHCI_P_IX_PC 0x00000040 +#define AHCI_P_IX_MP 0x00000080 + +#define AHCI_P_IX_PRC 0x00400000 +#define AHCI_P_IX_IPM 0x00800000 +#define AHCI_P_IX_OF 0x01000000 +#define AHCI_P_IX_INF 0x04000000 +#define AHCI_P_IX_IF 0x08000000 +#define AHCI_P_IX_HBD 0x10000000 +#define AHCI_P_IX_HBF 0x20000000 +#define AHCI_P_IX_TFE 0x40000000 +#define AHCI_P_IX_CPD 0x80000000 + +#define AHCI_P_CMD 0x18 +#define AHCI_P_CMD_ST 0x00000001 +#define AHCI_P_CMD_SUD 0x00000002 +#define AHCI_P_CMD_POD 0x00000004 +#define AHCI_P_CMD_CLO 0x00000008 +#define AHCI_P_CMD_FRE 0x00000010 +#define AHCI_P_CMD_CCS_MASK 0x00001f00 +#define AHCI_P_CMD_CCS_SHIFT 8 +#define AHCI_P_CMD_ISS 0x00002000 +#define AHCI_P_CMD_FR 0x00004000 +#define AHCI_P_CMD_CR 0x00008000 +#define AHCI_P_CMD_CPS 0x00010000 +#define AHCI_P_CMD_PMA 0x00020000 +#define AHCI_P_CMD_HPCP 0x00040000 +#define AHCI_P_CMD_MPSP 0x00080000 +#define AHCI_P_CMD_CPD 0x00100000 +#define AHCI_P_CMD_ESP 0x00200000 +#define AHCI_P_CMD_FBSCP 0x00400000 +#define AHCI_P_CMD_APSTE 0x00800000 +#define AHCI_P_CMD_ATAPI 0x01000000 +#define AHCI_P_CMD_DLAE 0x02000000 +#define AHCI_P_CMD_ALPE 0x04000000 +#define AHCI_P_CMD_ASP 0x08000000 +#define AHCI_P_CMD_ICC_MASK 0xf0000000 +#define AHCI_P_CMD_NOOP 0x00000000 +#define AHCI_P_CMD_ACTIVE 0x10000000 +#define AHCI_P_CMD_PARTIAL 0x20000000 +#define AHCI_P_CMD_SLUMBER 0x60000000 +#define AHCI_P_CMD_DEVSLEEP 0x80000000 + +#define AHCI_P_TFD 0x20 +#define AHCI_P_SIG 0x24 +#define AHCI_P_SSTS 0x28 +#define AHCI_P_SCTL 0x2c +#define AHCI_P_SERR 0x30 +#define AHCI_P_SACT 0x34 +#define AHCI_P_CI 0x38 +#define AHCI_P_SNTF 0x3C +#define AHCI_P_FBS 0x40 +#define AHCI_P_FBS_EN 0x00000001 +#define AHCI_P_FBS_DEC 0x00000002 +#define AHCI_P_FBS_SDE 0x00000004 +#define AHCI_P_FBS_DEV 0x00000f00 +#define AHCI_P_FBS_DEV_SHIFT 8 +#define AHCI_P_FBS_ADO 0x0000f000 +#define AHCI_P_FBS_ADO_SHIFT 12 +#define AHCI_P_FBS_DWE 0x000f0000 +#define AHCI_P_FBS_DWE_SHIFT 16 +#define AHCI_P_DEVSLP 0x44 +#define AHCI_P_DEVSLP_ADSE 0x00000001 +#define AHCI_P_DEVSLP_DSP 0x00000002 +#define AHCI_P_DEVSLP_DETO 0x000003fc +#define AHCI_P_DEVSLP_DETO_SHIFT 2 +#define AHCI_P_DEVSLP_MDAT 0x00007c00 +#define AHCI_P_DEVSLP_MDAT_SHIFT 10 +#define AHCI_P_DEVSLP_DITO 0x01ff8000 +#define AHCI_P_DEVSLP_DITO_SHIFT 15 +#define AHCI_P_DEVSLP_DM 0x0e000000 +#define AHCI_P_DEVSLP_DM_SHIFT 25 /* Just to be sure, if building as module. 
*/ #if MAXPHYS < 512 * 1024 #undef MAXPHYS -#define MAXPHYS 512 * 1024 +#define MAXPHYS 512 * 1024 #endif /* Pessimistic prognosis on number of required S/G entries */ -#define AHCI_SG_ENTRIES (roundup(btoc(MAXPHYS) + 1, 8)) +#define AHCI_SG_ENTRIES (roundup(btoc(MAXPHYS) + 1, 8)) /* Command list. 32 commands. First, 1Kbyte aligned. */ -#define AHCI_CL_OFFSET 0 -#define AHCI_CL_SIZE 32 +#define AHCI_CL_OFFSET 0 +#define AHCI_CL_SIZE 32 /* Command tables. Up to 32 commands, Each, 128byte aligned. */ -#define AHCI_CT_OFFSET (AHCI_CL_OFFSET + AHCI_CL_SIZE * AHCI_MAX_SLOTS) -#define AHCI_CT_SIZE (128 + AHCI_SG_ENTRIES * 16) +#define AHCI_CT_OFFSET (AHCI_CL_OFFSET + AHCI_CL_SIZE * AHCI_MAX_SLOTS) +#define AHCI_CT_SIZE (128 + AHCI_SG_ENTRIES * 16) /* Total main work area. */ -#define AHCI_WORK_SIZE (AHCI_CT_OFFSET + AHCI_CT_SIZE * ch->numslots) +#define AHCI_WORK_SIZE (AHCI_CT_OFFSET + AHCI_CT_SIZE * ch->numslots) #endif /* _AHCI_H_ */ diff --git a/usr/src/cmd/bhyve/amd64/Makefile b/usr/src/cmd/bhyve/amd64/Makefile deleted file mode 100644 index 13cdae6663..0000000000 --- a/usr/src/cmd/bhyve/amd64/Makefile +++ /dev/null @@ -1,21 +0,0 @@ -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -# -# Copyright 2015 Pluribus Networks Inc. -# - -include ../Makefile.com -include ../../Makefile.cmd.64 - -CPPFLAGS += -I$(COMPAT)/freebsd/amd64 -I$(CONTRIB)/freebsd/amd64 - -install: all $(ROOTUSRSBINPROG64) diff --git a/usr/src/cmd/bhyve/atkbdc.c b/usr/src/cmd/bhyve/atkbdc.c index 4d09d88266..1c1838c2e8 100644 --- a/usr/src/cmd/bhyve/atkbdc.c +++ b/usr/src/cmd/bhyve/atkbdc.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Tycho Nightingale * Copyright (c) 2015 Nahanni Systems Inc. * All rights reserved. 
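The atkbdc hunks below replace the controller's single-byte keyboard buffer
with a 15-entry FIFO (buffer[], brd, bwr, bcnt) and switch the keyboard and
aux interrupts from assert/deassert pairs to pulsed IRQs. A minimal sketch of
the ring-buffer discipline those fields imply (struct and helper names here
are illustrative, not part of the patch):

	#include <stdint.h>

	#define FIFOSZ	15

	struct kbd_fifo {
		uint8_t	buffer[FIFOSZ];
		int	brd;	/* read index */
		int	bwr;	/* write index */
		int	bcnt;	/* bytes currently queued */
	};

	/* Enqueue, dropping the byte when the FIFO is full. */
	static int
	fifo_put(struct kbd_fifo *f, uint8_t val)
	{
		if (f->bcnt < FIFOSZ) {
			f->buffer[f->bwr] = val;
			f->bwr = (f->bwr + 1) % FIFOSZ;
			f->bcnt++;
		}
		return (f->bcnt < FIFOSZ);	/* room left after insert? */
	}

	/* Dequeue; returns -1 when the FIFO is empty. */
	static int
	fifo_get(struct kbd_fifo *f, uint8_t *val)
	{
		if (f->bcnt == 0)
			return (-1);
		*val = f->buffer[f->brd];
		f->brd = (f->brd + 1) % FIFOSZ;
		f->bcnt--;
		return (0);
	}

In the patch itself, atkbdc_kbd_queue_data() returns the same room-left
indication, and atkbdc_kbd_read() drains the ps2kbd queue only while the
controller FIFO has space.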
@@ -26,7 +28,7 @@ */ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/atkbdc.c 267611 2014-06-18 17:20:02Z neel $"); +__FBSDID("$FreeBSD$"); #include @@ -45,6 +47,7 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/atkbdc.c 267611 2014-06-18 17:20:02Z nee #include #include "acpi.h" +#include "atkbdc.h" #include "inout.h" #include "pci_emul.h" #include "pci_irq.h" @@ -99,19 +102,21 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/atkbdc.c 267611 2014-06-18 17:20:02Z nee #define KBDO_AUX_OUTFULL 0x20 #define RAMSZ 32 +#define FIFOSZ 15 +#define CTRL_CMD_FLAG 0x8000 struct kbd_dev { bool irq_active; int irq; - uint8_t buffer; + uint8_t buffer[FIFOSZ]; + int brd, bwr; + int bcnt; }; struct aux_dev { bool irq_active; int irq; - - uint8_t buffer; }; struct atkbdc_softc { @@ -126,6 +131,7 @@ struct atkbdc_softc { uint8_t ram[RAMSZ]; /* byte0 = controller config */ uint32_t curcmd; /* current command for next byte */ + uint32_t ctrlbyte; struct kbd_dev kbd; struct aux_dev aux; @@ -134,72 +140,37 @@ struct atkbdc_softc { static void atkbdc_assert_kbd_intr(struct atkbdc_softc *sc) { - if (!sc->kbd.irq_active && - (sc->ram[0] & KBD_ENABLE_KBD_INT) != 0) { + if ((sc->ram[0] & KBD_ENABLE_KBD_INT) != 0) { sc->kbd.irq_active = true; - vm_isa_assert_irq(sc->ctx, sc->kbd.irq, sc->kbd.irq); - } -} - -static void -atkbdc_deassert_kbd_intr(struct atkbdc_softc *sc) -{ - if (sc->kbd.irq_active) { - vm_isa_deassert_irq(sc->ctx, sc->kbd.irq, sc->kbd.irq); - sc->kbd.irq_active = false; + vm_isa_pulse_irq(sc->ctx, sc->kbd.irq, sc->kbd.irq); } } static void atkbdc_assert_aux_intr(struct atkbdc_softc *sc) { - if (!sc->aux.irq_active && - (sc->ram[0] & KBD_ENABLE_AUX_INT) != 0) { + if ((sc->ram[0] & KBD_ENABLE_AUX_INT) != 0) { sc->aux.irq_active = true; - vm_isa_assert_irq(sc->ctx, sc->aux.irq, sc->aux.irq); + vm_isa_pulse_irq(sc->ctx, sc->aux.irq, sc->aux.irq); } } -static void -atkbdc_deassert_aux_intr(struct atkbdc_softc *sc) -{ - if (sc->aux.irq_active) { - vm_isa_deassert_irq(sc->ctx, sc->aux.irq, sc->aux.irq); - sc->aux.irq_active = false; - } -} - -static void -atkbdc_aux_queue_data(struct atkbdc_softc *sc, uint8_t val) -{ - assert(pthread_mutex_isowned_np(&sc->mtx)); - - sc->aux.buffer = val; - sc->status |= (KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL); - sc->outport |= KBDO_AUX_OUTFULL; - atkbdc_assert_aux_intr(sc); -} - -static void +static int atkbdc_kbd_queue_data(struct atkbdc_softc *sc, uint8_t val) { assert(pthread_mutex_isowned_np(&sc->mtx)); - sc->kbd.buffer = val; - sc->status |= KBDS_KBD_BUFFER_FULL; - sc->outport |= KBDO_KBD_OUTFULL; - atkbdc_assert_kbd_intr(sc); -} - -static void -atkbdc_aux_read(struct atkbdc_softc *sc) -{ - uint8_t val; - - assert(pthread_mutex_isowned_np(&sc->mtx)); + if (sc->kbd.bcnt < FIFOSZ) { + sc->kbd.buffer[sc->kbd.bwr] = val; + sc->kbd.bwr = (sc->kbd.bwr + 1) % FIFOSZ; + sc->kbd.bcnt++; + sc->status |= KBDS_KBD_BUFFER_FULL; + sc->outport |= KBDO_KBD_OUTFULL; + } else { + printf("atkbd data buffer full\n"); + } - if (ps2mouse_read(sc->ps2mouse_sc, &val) != -1) - atkbdc_aux_queue_data(sc, val); + return (sc->kbd.bcnt < FIFOSZ); } static void @@ -252,21 +223,31 @@ atkbdc_kbd_read(struct atkbdc_softc *sc) } else { val = translation[val] | release; } - atkbdc_kbd_queue_data(sc, val); break; } } else { - if (ps2kbd_read(sc->ps2kbd_sc, &val) != -1) - atkbdc_kbd_queue_data(sc, val); + while (sc->kbd.bcnt < FIFOSZ) { + if (ps2kbd_read(sc->ps2kbd_sc, &val) != -1) + atkbdc_kbd_queue_data(sc, val); + else + break; + } } + + if (((sc->ram[0] & KBD_DISABLE_AUX_PORT) || + 
ps2mouse_fifocnt(sc->ps2mouse_sc) == 0) && sc->kbd.bcnt > 0) + atkbdc_assert_kbd_intr(sc); } static void atkbdc_aux_poll(struct atkbdc_softc *sc) { - if ((sc->outport & KBDO_AUX_OUTFULL) == 0) - atkbdc_aux_read(sc); + if (ps2mouse_fifocnt(sc->ps2mouse_sc) > 0) { + sc->status |= KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL; + sc->outport |= KBDO_AUX_OUTFULL; + atkbdc_assert_aux_intr(sc); + } } static void @@ -274,8 +255,7 @@ atkbdc_kbd_poll(struct atkbdc_softc *sc) { assert(pthread_mutex_isowned_np(&sc->mtx)); - if ((sc->outport & KBDO_KBD_OUTFULL) == 0) - atkbdc_kbd_read(sc); + atkbdc_kbd_read(sc); } static void @@ -290,22 +270,35 @@ atkbdc_dequeue_data(struct atkbdc_softc *sc, uint8_t *buf) { assert(pthread_mutex_isowned_np(&sc->mtx)); - if (sc->outport & KBDO_AUX_OUTFULL) { - *buf = sc->aux.buffer; - sc->status &= ~(KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL); - sc->outport &= ~KBDO_AUX_OUTFULL; - atkbdc_deassert_aux_intr(sc); + if (ps2mouse_read(sc->ps2mouse_sc, buf) == 0) { + if (ps2mouse_fifocnt(sc->ps2mouse_sc) == 0) { + if (sc->kbd.bcnt == 0) + sc->status &= ~(KBDS_AUX_BUFFER_FULL | + KBDS_KBD_BUFFER_FULL); + else + sc->status &= ~(KBDS_AUX_BUFFER_FULL); + sc->outport &= ~KBDO_AUX_OUTFULL; + } atkbdc_poll(sc); return; } - *buf = sc->kbd.buffer; - sc->status &= ~KBDS_KBD_BUFFER_FULL; - sc->outport &= ~KBDO_KBD_OUTFULL; - atkbdc_deassert_kbd_intr(sc); + if (sc->kbd.bcnt > 0) { + *buf = sc->kbd.buffer[sc->kbd.brd]; + sc->kbd.brd = (sc->kbd.brd + 1) % FIFOSZ; + sc->kbd.bcnt--; + if (sc->kbd.bcnt == 0) { + sc->status &= ~KBDS_KBD_BUFFER_FULL; + sc->outport &= ~KBDO_KBD_OUTFULL; + } - atkbdc_poll(sc); + atkbdc_poll(sc); + } + + if (ps2mouse_fifocnt(sc->ps2mouse_sc) == 0 && sc->kbd.bcnt == 0) { + sc->status &= ~(KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL); + } } static int @@ -318,19 +311,22 @@ atkbdc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, if (bytes != 1) return (-1); - sc = arg; retval = 0; pthread_mutex_lock(&sc->mtx); if (in) { sc->curcmd = 0; - sc->status &= ~KBDS_CTRL_FLAG; - - /* read device buffer; includes kbd cmd responses */ - atkbdc_dequeue_data(sc, &buf); - *eax = buf; + if (sc->ctrlbyte != 0) { + *eax = sc->ctrlbyte & 0xff; + sc->ctrlbyte = 0; + } else { + /* read device buffer; includes kbd cmd responses */ + atkbdc_dequeue_data(sc, &buf); + *eax = buf; + } + sc->status &= ~KBDS_CTRL_FLAG; pthread_mutex_unlock(&sc->mtx); return (retval); } @@ -345,29 +341,22 @@ atkbdc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, if (sc->ram[0] & KBD_SYS_FLAG_BIT) sc->status |= KBDS_SYS_FLAG; else - sc->status &= KBDS_SYS_FLAG; - if (sc->outport & KBDO_AUX_OUTFULL) - atkbdc_assert_aux_intr(sc); - else if (sc->outport & KBDO_KBD_OUTFULL) - atkbdc_assert_kbd_intr(sc); + sc->status &= ~KBDS_SYS_FLAG; break; case KBDC_WRITE_OUTPORT: sc->outport = *eax; - if (sc->outport & KBDO_AUX_OUTFULL) - sc->status |= (KBDS_AUX_BUFFER_FULL | - KBDS_KBD_BUFFER_FULL); - if (sc->outport & KBDO_KBD_OUTFULL) - sc->status |= KBDS_KBD_BUFFER_FULL; break; case KBDC_WRITE_TO_AUX: - ps2mouse_write(sc->ps2mouse_sc, *eax); + ps2mouse_write(sc->ps2mouse_sc, *eax, 0); atkbdc_poll(sc); break; case KBDC_WRITE_KBD_OUTBUF: atkbdc_kbd_queue_data(sc, *eax); break; case KBDC_WRITE_AUX_OUTBUF: - atkbdc_aux_queue_data(sc, *eax); + ps2mouse_write(sc->ps2mouse_sc, *eax, 1); + sc->status |= (KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL); + atkbdc_aux_poll(sc); break; default: /* write to particular RAM byte */ @@ -398,7 +387,6 @@ atkbdc_data_handler(struct vmctx *ctx, int vcpu, 
int in, int port, int bytes, return (retval); } - static int atkbdc_sts_ctl_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) @@ -421,25 +409,27 @@ atkbdc_sts_ctl_handler(struct vmctx *ctx, int vcpu, int in, int port, return (retval); } + sc->curcmd = 0; sc->status |= KBDS_CTRL_FLAG; + sc->ctrlbyte = 0; switch (*eax) { case KBDC_GET_COMMAND_BYTE: - atkbdc_kbd_queue_data(sc, sc->ram[0]); + sc->ctrlbyte = CTRL_CMD_FLAG | sc->ram[0]; break; case KBDC_TEST_CTRL: - atkbdc_kbd_queue_data(sc, 0x55); + sc->ctrlbyte = CTRL_CMD_FLAG | 0x55; break; case KBDC_TEST_AUX_PORT: case KBDC_TEST_KBD_PORT: - atkbdc_kbd_queue_data(sc, 0); + sc->ctrlbyte = CTRL_CMD_FLAG | 0; break; case KBDC_READ_INPORT: - atkbdc_kbd_queue_data(sc, 0); + sc->ctrlbyte = CTRL_CMD_FLAG | 0; break; case KBDC_READ_OUTPORT: - atkbdc_kbd_queue_data(sc, sc->outport); + sc->ctrlbyte = CTRL_CMD_FLAG | sc->outport; break; case KBDC_SET_COMMAND_BYTE: case KBDC_WRITE_OUTPORT: @@ -452,6 +442,8 @@ atkbdc_sts_ctl_handler(struct vmctx *ctx, int vcpu, int in, int port, break; case KBDC_ENABLE_KBD_PORT: sc->ram[0] &= ~KBD_DISABLE_KBD_PORT; + if (sc->kbd.bcnt > 0) + sc->status |= KBDS_KBD_BUFFER_FULL; atkbdc_poll(sc); break; case KBDC_WRITE_TO_AUX: @@ -459,17 +451,19 @@ atkbdc_sts_ctl_handler(struct vmctx *ctx, int vcpu, int in, int port, break; case KBDC_DISABLE_AUX_PORT: sc->ram[0] |= KBD_DISABLE_AUX_PORT; + ps2mouse_toggle(sc->ps2mouse_sc, 0); + sc->status &= ~(KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL); + sc->outport &= ~KBDS_AUX_BUFFER_FULL; break; case KBDC_ENABLE_AUX_PORT: sc->ram[0] &= ~KBD_DISABLE_AUX_PORT; + ps2mouse_toggle(sc->ps2mouse_sc, 1); + if (ps2mouse_fifocnt(sc->ps2mouse_sc) > 0) + sc->status |= KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL; break; case KBDC_RESET: /* Pulse "reset" line */ -#ifdef __FreeBSD__ error = vm_suspend(ctx, VM_SUSPEND_RESET); assert(error == 0 || errno == EALREADY); -#else - exit(0); -#endif break; default: if (*eax >= 0x21 && *eax <= 0x3f) { @@ -477,21 +471,38 @@ atkbdc_sts_ctl_handler(struct vmctx *ctx, int vcpu, int in, int port, int byten; byten = (*eax - 0x20) & 0x1f; - atkbdc_kbd_queue_data(sc, sc->ram[byten]); + sc->ctrlbyte = CTRL_CMD_FLAG | sc->ram[byten]; } break; } pthread_mutex_unlock(&sc->mtx); + if (sc->ctrlbyte != 0) { + sc->status |= KBDS_KBD_BUFFER_FULL; + sc->status &= ~KBDS_AUX_BUFFER_FULL; + atkbdc_assert_kbd_intr(sc); + } else if (ps2mouse_fifocnt(sc->ps2mouse_sc) > 0 && + (sc->ram[0] & KBD_DISABLE_AUX_PORT) == 0) { + sc->status |= KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL; + atkbdc_assert_aux_intr(sc); + } else if (sc->kbd.bcnt > 0 && (sc->ram[0] & KBD_DISABLE_KBD_PORT) == 0) { + sc->status |= KBDS_KBD_BUFFER_FULL; + atkbdc_assert_kbd_intr(sc); + } + return (retval); } void -atkbdc_event(struct atkbdc_softc *sc) +atkbdc_event(struct atkbdc_softc *sc, int iskbd) { pthread_mutex_lock(&sc->mtx); - atkbdc_poll(sc); + + if (iskbd) + atkbdc_kbd_poll(sc); + else + atkbdc_aux_poll(sc); pthread_mutex_unlock(&sc->mtx); } @@ -539,7 +550,6 @@ atkbdc_init(struct vmctx *ctx) sc->ps2mouse_sc = ps2mouse_init(sc); } -#ifdef __FreeBSD__ static void atkbdc_dsdt(void) { @@ -573,4 +583,4 @@ atkbdc_dsdt(void) dsdt_line("}"); } LPC_DSDT(atkbdc_dsdt); -#endif + diff --git a/usr/src/cmd/bhyve/atkbdc.h b/usr/src/cmd/bhyve/atkbdc.h index 48b3a8b00c..85c8a7141e 100644 --- a/usr/src/cmd/bhyve/atkbdc.h +++ b/usr/src/cmd/bhyve/atkbdc.h @@ -33,6 +33,6 @@ struct atkbdc_softc; struct vmctx; void atkbdc_init(struct vmctx *ctx); -void atkbdc_event(struct atkbdc_softc *sc); 
+void atkbdc_event(struct atkbdc_softc *sc, int iskbd); #endif /* _ATKBDC_H_ */ diff --git a/usr/src/cmd/bhyve/bhyve_sol_glue.c b/usr/src/cmd/bhyve/bhyve_sol_glue.c index 633faacc5f..7b24ea7f5d 100644 --- a/usr/src/cmd/bhyve/bhyve_sol_glue.c +++ b/usr/src/cmd/bhyve/bhyve_sol_glue.c @@ -11,6 +11,7 @@ /* * Copyright 2013 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #include @@ -25,62 +26,14 @@ void cfmakeraw(struct termios *t) { - t->c_iflag &= ~(IMAXBEL|IXOFF|INPCK|BRKINT|PARMRK|ISTRIP|INLCR|IGNCR|ICRNL|IXON|IGNPAR); + t->c_iflag &= ~(IMAXBEL|IXOFF|INPCK|BRKINT|PARMRK|ISTRIP|INLCR|IGNCR| + ICRNL|IXON|IGNPAR); t->c_iflag |= IGNBRK; t->c_oflag &= ~OPOST; - t->c_lflag &= ~(ECHO|ECHOE|ECHOK|ECHONL|ICANON|ISIG|IEXTEN|NOFLSH|TOSTOP |PENDIN); + t->c_lflag &= ~(ECHO|ECHOE|ECHOK|ECHONL|ICANON|ISIG|IEXTEN|NOFLSH| + TOSTOP|PENDIN); t->c_cflag &= ~(CSIZE|PARENB); t->c_cflag |= CS8|CREAD; t->c_cc[VMIN] = 1; t->c_cc[VTIME] = 0; } - -ssize_t -preadv(int d, const struct iovec *iov, int iovcnt, off_t offset) -{ - off_t old_offset; - ssize_t n; - - old_offset = lseek(d, (off_t)0, SEEK_CUR); - if (old_offset == -1) - return (-1); - - offset = lseek(d, offset, SEEK_SET); - if (offset == -1) - return (-1); - - n = readv(d, iov, iovcnt); - if (n == -1) - return (-1); - - offset = lseek(d, old_offset, SEEK_SET); - if (offset == -1) - return (-1); - - return (n); -} - -ssize_t -pwritev(int d, const struct iovec *iov, int iovcnt, off_t offset) -{ - off_t old_offset; - ssize_t n; - - old_offset = lseek(d, (off_t)0, SEEK_CUR); - if (old_offset == -1) - return (-1); - - offset = lseek(d, offset, SEEK_SET); - if (offset == -1) - return (-1); - - n = writev(d, iov, iovcnt); - if (n == -1) - return (-1); - - offset = lseek(d, old_offset, SEEK_SET); - if (offset == -1) - return (-1); - - return (n); -} diff --git a/usr/src/cmd/bhyve/bhyvegc.c b/usr/src/cmd/bhyve/bhyvegc.c index 7a13c4c83f..4bd49ded79 100644 --- a/usr/src/cmd/bhyve/bhyvegc.c +++ b/usr/src/cmd/bhyve/bhyvegc.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale * All rights reserved. 
* @@ -37,10 +39,11 @@ __FBSDID("$FreeBSD$"); struct bhyvegc { struct bhyvegc_image *gc_image; + int raw; }; struct bhyvegc * -bhyvegc_init(int width, int height) +bhyvegc_init(int width, int height, void *fbaddr) { struct bhyvegc *gc; struct bhyvegc_image *gc_image; @@ -50,13 +53,28 @@ bhyvegc_init(int width, int height) gc_image = calloc(1, sizeof(struct bhyvegc_image)); gc_image->width = width; gc_image->height = height; - gc_image->data = calloc(width * height, sizeof (uint32_t)); + if (fbaddr) { + gc_image->data = fbaddr; + gc->raw = 1; + } else { + gc_image->data = calloc(width * height, sizeof (uint32_t)); + gc->raw = 0; + } gc->gc_image = gc_image; return (gc); } +void +bhyvegc_set_fbaddr(struct bhyvegc *gc, void *fbaddr) +{ + gc->raw = 1; + if (gc->gc_image->data && gc->gc_image->data != fbaddr) + free(gc->gc_image->data); + gc->gc_image->data = fbaddr; +} + void bhyvegc_resize(struct bhyvegc *gc, int width, int height) { @@ -66,13 +84,20 @@ bhyvegc_resize(struct bhyvegc *gc, int width, int height) gc_image->width = width; gc_image->height = height; - gc_image->data = realloc(gc_image->data, - sizeof (uint32_t) * width * height); - memset(gc_image->data, 0, width * height * sizeof (uint32_t)); + if (!gc->raw) { + gc_image->data = reallocarray(gc_image->data, width * height, + sizeof (uint32_t)); + if (gc_image->data != NULL) + memset(gc_image->data, 0, width * height * + sizeof (uint32_t)); + } } struct bhyvegc_image * bhyvegc_get_image(struct bhyvegc *gc) { + if (gc == NULL) + return (NULL); + return (gc->gc_image); } diff --git a/usr/src/cmd/bhyve/bhyvegc.h b/usr/src/cmd/bhyve/bhyvegc.h index 19648f98af..11323586df 100644 --- a/usr/src/cmd/bhyve/bhyvegc.h +++ b/usr/src/cmd/bhyve/bhyvegc.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale * All rights reserved. * @@ -32,12 +34,14 @@ struct bhyvegc; struct bhyvegc_image { + int vgamode; int width; int height; uint32_t *data; }; -struct bhyvegc *bhyvegc_init(int width, int height); +struct bhyvegc *bhyvegc_init(int width, int height, void *fbaddr); +void bhyvegc_set_fbaddr(struct bhyvegc *gc, void *fbaddr); void bhyvegc_resize(struct bhyvegc *gc, int width, int height); struct bhyvegc_image *bhyvegc_get_image(struct bhyvegc *gc); diff --git a/usr/src/cmd/bhyve/bhyverun.c b/usr/src/cmd/bhyve/bhyverun.c index b985a2286e..928d2dc811 100644 --- a/usr/src/cmd/bhyve/bhyverun.c +++ b/usr/src/cmd/bhyve/bhyverun.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/bhyverun.c 281611 2015-04-16 20:11:49Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,30 +38,50 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2015 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. 
*/ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/bhyverun.c 281611 2015-04-16 20:11:49Z neel $"); +__FBSDID("$FreeBSD$"); #include +#ifndef WITHOUT_CAPSICUM +#include +#endif #include #include +#include + +#ifdef __FreeBSD__ +#include +#else +#include +#endif +#include #include +#ifndef WITHOUT_CAPSICUM +#include +#endif #include #include #include #include +#include #include #include #include -#include #include #include #include +#include +#include #include +#ifndef WITHOUT_CAPSICUM +#include +#endif #include #include "bhyverun.h" @@ -68,11 +90,11 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/bhyverun.c 281611 2015-04-16 20:11:49Z n #include "console.h" #include "inout.h" #include "dbgport.h" +#include "fwctl.h" +#include "gdb.h" #include "ioapic.h" #include "mem.h" -#ifdef __FreeBSD__ #include "mevent.h" -#endif #include "mptbl.h" #include "pci_emul.h" #include "pci_irq.h" @@ -89,11 +111,81 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/bhyverun.c 281611 2015-04-16 20:11:49Z n #define MB (1024UL * 1024) #define GB (1024UL * MB) +static const char * const vmx_exit_reason_desc[] = { + [EXIT_REASON_EXCEPTION] = "Exception or non-maskable interrupt (NMI)", + [EXIT_REASON_EXT_INTR] = "External interrupt", + [EXIT_REASON_TRIPLE_FAULT] = "Triple fault", + [EXIT_REASON_INIT] = "INIT signal", + [EXIT_REASON_SIPI] = "Start-up IPI (SIPI)", + [EXIT_REASON_IO_SMI] = "I/O system-management interrupt (SMI)", + [EXIT_REASON_SMI] = "Other SMI", + [EXIT_REASON_INTR_WINDOW] = "Interrupt window", + [EXIT_REASON_NMI_WINDOW] = "NMI window", + [EXIT_REASON_TASK_SWITCH] = "Task switch", + [EXIT_REASON_CPUID] = "CPUID", + [EXIT_REASON_GETSEC] = "GETSEC", + [EXIT_REASON_HLT] = "HLT", + [EXIT_REASON_INVD] = "INVD", + [EXIT_REASON_INVLPG] = "INVLPG", + [EXIT_REASON_RDPMC] = "RDPMC", + [EXIT_REASON_RDTSC] = "RDTSC", + [EXIT_REASON_RSM] = "RSM", + [EXIT_REASON_VMCALL] = "VMCALL", + [EXIT_REASON_VMCLEAR] = "VMCLEAR", + [EXIT_REASON_VMLAUNCH] = "VMLAUNCH", + [EXIT_REASON_VMPTRLD] = "VMPTRLD", + [EXIT_REASON_VMPTRST] = "VMPTRST", + [EXIT_REASON_VMREAD] = "VMREAD", + [EXIT_REASON_VMRESUME] = "VMRESUME", + [EXIT_REASON_VMWRITE] = "VMWRITE", + [EXIT_REASON_VMXOFF] = "VMXOFF", + [EXIT_REASON_VMXON] = "VMXON", + [EXIT_REASON_CR_ACCESS] = "Control-register accesses", + [EXIT_REASON_DR_ACCESS] = "MOV DR", + [EXIT_REASON_INOUT] = "I/O instruction", + [EXIT_REASON_RDMSR] = "RDMSR", + [EXIT_REASON_WRMSR] = "WRMSR", + [EXIT_REASON_INVAL_VMCS] = + "VM-entry failure due to invalid guest state", + [EXIT_REASON_INVAL_MSR] = "VM-entry failure due to MSR loading", + [EXIT_REASON_MWAIT] = "MWAIT", + [EXIT_REASON_MTF] = "Monitor trap flag", + [EXIT_REASON_MONITOR] = "MONITOR", + [EXIT_REASON_PAUSE] = "PAUSE", + [EXIT_REASON_MCE_DURING_ENTRY] = + "VM-entry failure due to machine-check event", + [EXIT_REASON_TPR] = "TPR below threshold", + [EXIT_REASON_APIC_ACCESS] = "APIC access", + [EXIT_REASON_VIRTUALIZED_EOI] = "Virtualized EOI", + [EXIT_REASON_GDTR_IDTR] = "Access to GDTR or IDTR", + [EXIT_REASON_LDTR_TR] = "Access to LDTR or TR", + [EXIT_REASON_EPT_FAULT] = "EPT violation", + [EXIT_REASON_EPT_MISCONFIG] = "EPT misconfiguration", + [EXIT_REASON_INVEPT] = "INVEPT", + [EXIT_REASON_RDTSCP] = "RDTSCP", + [EXIT_REASON_VMX_PREEMPT] = "VMX-preemption timer expired", + [EXIT_REASON_INVVPID] = "INVVPID", + [EXIT_REASON_WBINVD] = "WBINVD", + [EXIT_REASON_XSETBV] = "XSETBV", + [EXIT_REASON_APIC_WRITE] = "APIC write", + [EXIT_REASON_RDRAND] = "RDRAND", + [EXIT_REASON_INVPCID] = "INVPCID", + [EXIT_REASON_VMFUNC] = "VMFUNC", + [EXIT_REASON_ENCLS] = 
"ENCLS", + [EXIT_REASON_RDSEED] = "RDSEED", + [EXIT_REASON_PM_LOG_FULL] = "Page-modification log full", + [EXIT_REASON_XSAVES] = "XSAVES", + [EXIT_REASON_XRSTORS] = "XRSTORS" +}; + typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu); +extern int vmexit_task_switch(struct vmctx *, struct vm_exit *, int *vcpu); char *vmname; int guest_ncpus; +uint16_t cores, maxcpus, sockets, threads; + char *guest_uuid_str; static int guest_vmexit_on_hlt, guest_vmexit_on_pause; @@ -103,9 +195,7 @@ static int x2apic_mode = 0; /* default is xAPIC */ static int strictio; static int strictmsr = 1; -#ifdef __FreeBSD__ static int acpi; -#endif static char *progname; static const int BSP = 0; @@ -124,15 +214,14 @@ static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip); static struct vm_exit vmexit[VM_MAXCPU]; struct bhyvestats { - uint64_t vmexit_bogus; - uint64_t vmexit_bogus_switch; - uint64_t vmexit_hlt; - uint64_t vmexit_pause; - uint64_t vmexit_mtrap; - uint64_t vmexit_inst_emul; - uint64_t cpu_switch_rotate; - uint64_t cpu_switch_direct; - int io_reset; + uint64_t vmexit_bogus; + uint64_t vmexit_reqidle; + uint64_t vmexit_hlt; + uint64_t vmexit_pause; + uint64_t vmexit_mtrap; + uint64_t vmexit_inst_emul; + uint64_t cpu_switch_rotate; + uint64_t cpu_switch_direct; } stats; struct mt_vmm_info { @@ -141,55 +230,211 @@ struct mt_vmm_info { int mt_vcpu; } mt_vmm_info[VM_MAXCPU]; +#ifdef __FreeBSD__ +static cpuset_t *vcpumap[VM_MAXCPU] = { NULL }; +#endif + static void usage(int code) { -#ifdef __FreeBSD__ fprintf(stderr, - "Usage: %s [-aehwAHIPW] [-g ] [-s ] [-c vcpus]\n" - " %*s [-p vcpu:hostcpu] [-m mem] [-l ] \n" + "Usage: %s [-abehuwxACHPSWY]\n" + " %*s [-c [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n" + " %*s [-g ] [-l ]\n" +#ifdef __FreeBSD__ + " %*s [-m mem] [-p vcpu:hostcpu] [-s ] [-U uuid] \n" +#else + " %*s [-m mem] [-s ] [-U uuid] \n" +#endif " -a: local apic is in xAPIC mode (deprecated)\n" - " -A: create an ACPI table\n" - " -g: gdb port\n" - " -c: # cpus (default 1)\n" + " -A: create ACPI tables\n" + " -c: number of cpus and/or topology specification\n" " -C: include guest memory in core file\n" - " -p: pin 'vcpu' to 'hostcpu'\n" - " -H: vmexit from the guest on hlt\n" - " -P: vmexit from the guest on pause\n" - " -W: force virtio to use single-vector MSI\n" " -e: exit on unhandled I/O access\n" + " -g: gdb port\n" " -h: help\n" - " -s: PCI slot config\n" - " -l: LPC device configuration\n" - " -m: memory size in MB\n" - " -w: ignore unimplemented MSRs\n" - " -x: local apic is in x2APIC mode\n" - " -Y: disable MPtable generation\n" - " -U: uuid\n", - progname, (int)strlen(progname), ""); -#else - fprintf(stderr, - "Usage: %s [-ehwHPW] [-s ] [-c vcpus]\n" - " %*s [-p vcpu:hostcpu] [-m mem] [-l ] \n" - " -c: # cpus (default 1)\n" " -H: vmexit from the guest on hlt\n" + " -l: LPC device configuration\n" + " -m: memory size\n" +#ifdef __FreeBSD__ + " -p: pin 'vcpu' to 'hostcpu'\n" +#endif " -P: vmexit from the guest on pause\n" - " -W: force virtio to use single-vector MSI\n" - " -e: exit on unhandled I/O access\n" - " -h: help\n" " -s: PCI slot config\n" - " -l: LPC device configuration\n" - " -m: memory size in MB\n" + " -S: guest memory cannot be swapped\n" + " -u: RTC keeps UTC time\n" + " -U: uuid\n" " -w: ignore unimplemented MSRs\n" - " -Y: disable MPtable generation\n" - " -U: uuid\n", - progname, (int)strlen(progname), ""); -#endif + " -W: force virtio to use single-vector MSI\n" + " -x: local apic is in x2APIC mode\n" + " -Y: disable MPtable 
generation\n",
+	    progname, (int)strlen(progname), "", (int)strlen(progname), "",
+	    (int)strlen(progname), "");
 	exit(code);
 }
 
+/*
+ * XXX This parser is known to have the following issues:
+ * 1. It accepts null key=value tokens ",,".
+ * 2. It accepts whitespace after = and before value.
+ * 3. Values out of range of INT are silently wrapped.
+ * 4. It doesn't check non-final values.
+ * 5. The apparently bogus limits of UINT16_MAX are for future expansion.
+ *
+ * The acceptance of a null specification ('-c ""') is by design to match the
+ * manual page syntax specification; this results in a topology of 1 vCPU.
+ */
+static int
+topology_parse(const char *opt)
+{
+	uint64_t ncpus;
+	int c, chk, n, s, t, tmp;
+	char *cp, *str;
+	bool ns, scts;
+
+	c = 1, n = 1, s = 1, t = 1;
+	ns = false, scts = false;
+	str = strdup(opt);
+	if (str == NULL)
+		goto out;
+
+	while ((cp = strsep(&str, ",")) != NULL) {
+		if (sscanf(cp, "%i%n", &tmp, &chk) == 1) {
+			n = tmp;
+			ns = true;
+		} else if (sscanf(cp, "cpus=%i%n", &tmp, &chk) == 1) {
+			n = tmp;
+			ns = true;
+		} else if (sscanf(cp, "sockets=%i%n", &tmp, &chk) == 1) {
+			s = tmp;
+			scts = true;
+		} else if (sscanf(cp, "cores=%i%n", &tmp, &chk) == 1) {
+			c = tmp;
+			scts = true;
+		} else if (sscanf(cp, "threads=%i%n", &tmp, &chk) == 1) {
+			t = tmp;
+			scts = true;
+#ifdef notyet /* Do not expose this until vmm.ko implements it */
+		} else if (sscanf(cp, "maxcpus=%i%n", &tmp, &chk) == 1) {
+			m = tmp;
+#endif
+		/* Skip the empty argument case from -c "" */
+		} else if (cp[0] == '\0')
+			continue;
+		else
+			goto out;
+		/* Any trailing garbage causes an error */
+		if (cp[chk] != '\0')
+			goto out;
+	}
+	free(str);
+	str = NULL;
+
+	/*
+	 * Range check all values: 1 <= value <= UINT16_MAX
+	 */
+	if (n < 1 || s < 1 || c < 1 || t < 1 ||
+	    n > UINT16_MAX || s > UINT16_MAX || c > UINT16_MAX ||
+	    t > UINT16_MAX)
+		return (-1);
+
+	/* If only the cpu count was specified, use that as sockets */
+	if (!scts)
+		s = n;
+	/*
+	 * Compute sockets * cores * threads, avoiding overflow.
+	 * The range check above ensures these are 16-bit values.
+	 * If n was specified, check it against the computed ncpus.
+	 */
+	ncpus = (uint64_t)s * c * t;
+	if (ncpus > UINT16_MAX || (ns && n != ncpus))
+		return (-1);
+
+	guest_ncpus = ncpus;
+	sockets = s;
+	cores = c;
+	threads = t;
+	return (0);
+
+out:
+	free(str);
+	return (-1);
+}
+
+#ifndef WITHOUT_CAPSICUM
+/*
+ * 11-stable capsicum helpers
+ */
+static void
+bhyve_caph_cache_catpages(void)
+{
+
+	(void)catopen("libc", NL_CAT_LOCALE);
+}
+
+static int
+bhyve_caph_limit_stdoe(void)
+{
+	cap_rights_t rights;
+	unsigned long cmds[] = { TIOCGETA, TIOCGWINSZ };
+	int i, fds[] = { STDOUT_FILENO, STDERR_FILENO };
+
+	cap_rights_init(&rights, CAP_FCNTL, CAP_FSTAT, CAP_IOCTL);
+	cap_rights_set(&rights, CAP_WRITE);
+
+	for (i = 0; i < nitems(fds); i++) {
+		if (cap_rights_limit(fds[i], &rights) < 0 && errno != ENOSYS)
+			return (-1);
+
+		if (cap_ioctls_limit(fds[i], cmds, nitems(cmds)) < 0 && errno != ENOSYS)
+			return (-1);
+
+		if (cap_fcntls_limit(fds[i], CAP_FCNTL_GETFL) < 0 && errno != ENOSYS)
+			return (-1);
+	}
+
+	return (0);
+}
+
+#endif
+
+#ifdef __FreeBSD__
+static int
+pincpu_parse(const char *opt)
+{
+	int vcpu, pcpu;
+
+	if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) {
+		fprintf(stderr, "invalid format: %s\n", opt);
+		return (-1);
+	}
+
+	if (vcpu < 0 || vcpu >= VM_MAXCPU) {
+		fprintf(stderr, "vcpu '%d' outside valid range from 0 to %d\n",
+		    vcpu, VM_MAXCPU - 1);
+		return (-1);
+	}
+
+	if (pcpu < 0 || pcpu >= CPU_SETSIZE) {
+		fprintf(stderr, "hostcpu '%d' 
outside valid range from " + "0 to %d\n", pcpu, CPU_SETSIZE - 1); + return (-1); + } + + if (vcpumap[vcpu] == NULL) { + if ((vcpumap[vcpu] = malloc(sizeof(cpuset_t))) == NULL) { + perror("malloc"); + return (-1); + } + CPU_ZERO(vcpumap[vcpu]); + } + CPU_SET(pcpu, vcpumap[vcpu]); + return (0); +} +#endif + void vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid, int errcode) @@ -246,6 +491,8 @@ fbsdrun_start_thread(void *param) snprintf(tname, sizeof(tname), "vcpu %d", vcpu); pthread_set_name_np(mtp->mt_thr, tname); + gdb_cpu_add(vcpu); + vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip); /* not reached */ @@ -267,7 +514,8 @@ fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip) * with vm_suspend(). */ error = vm_activate_cpu(ctx, newcpu); - assert(error == 0); + if (error != 0) + err(EX_OSERR, "could not activate CPU %d", newcpu); CPU_SET_ATOMIC(newcpu, &cpumask); @@ -286,6 +534,19 @@ fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip) assert(error == 0); } +static int +fbsdrun_deletecpu(struct vmctx *ctx, int vcpu) +{ + + if (!CPU_ISSET(vcpu, &cpumask)) { + fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu); + exit(4); + } + + CPU_CLR_ATOMIC(vcpu, &cpumask); + return (CPU_EMPTY(&cpumask)); +} + static int vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu, uint32_t eax) @@ -295,21 +556,20 @@ vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu, * put guest-driven debug here */ #endif - return (VMEXIT_CONTINUE); + return (VMEXIT_CONTINUE); } static int vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { int error; - int bytes, port, in, out, string; + int bytes, port, in, out; int vcpu; vcpu = *pvcpu; port = vme->u.inout.port; bytes = vme->u.inout.bytes; - string = vme->u.inout.string; in = vme->u.inout.in; out = !in; @@ -380,13 +640,29 @@ vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) static int vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { - int newcpu; - int retval = VMEXIT_CONTINUE; - newcpu = spinup_ap(ctx, *pvcpu, - vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip); + (void)spinup_ap(ctx, *pvcpu, + vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip); + + return (VMEXIT_CONTINUE); +} + +#define DEBUG_EPT_MISCONFIG +#ifdef DEBUG_EPT_MISCONFIG +#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 + +static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4]; +static int ept_misconfig_ptenum; +#endif + +static const char * +vmexit_vmx_desc(uint32_t exit_reason) +{ - return (retval); + if (exit_reason >= nitems(vmx_exit_reason_desc) || + vmx_exit_reason_desc[exit_reason] == NULL) + return ("Unknown"); + return (vmx_exit_reason_desc[exit_reason]); } static int @@ -398,12 +674,41 @@ vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip); fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length); fprintf(stderr, "\tstatus\t\t%d\n", vmexit->u.vmx.status); - fprintf(stderr, "\texit_reason\t%u\n", vmexit->u.vmx.exit_reason); + fprintf(stderr, "\texit_reason\t%u (%s)\n", vmexit->u.vmx.exit_reason, + vmexit_vmx_desc(vmexit->u.vmx.exit_reason)); fprintf(stderr, "\tqualification\t0x%016lx\n", vmexit->u.vmx.exit_qualification); fprintf(stderr, "\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type); fprintf(stderr, "\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error); +#ifdef DEBUG_EPT_MISCONFIG + if (vmexit->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) { + vm_get_register(ctx, *pvcpu, + 
VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS), + &ept_misconfig_gpa); + vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte, + &ept_misconfig_ptenum); + fprintf(stderr, "\tEPT misconfiguration:\n"); + fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa); + fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n", + ept_misconfig_ptenum, ept_misconfig_pte[0], + ept_misconfig_pte[1], ept_misconfig_pte[2], + ept_misconfig_pte[3]); + } +#endif /* DEBUG_EPT_MISCONFIG */ + return (VMEXIT_ABORT); +} +static int +vmexit_svm(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + + fprintf(stderr, "vm exit[%d]\n", *pvcpu); + fprintf(stderr, "\treason\t\tSVM\n"); + fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip); + fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length); + fprintf(stderr, "\texitcode\t%#lx\n", vmexit->u.svm.exitcode); + fprintf(stderr, "\texitinfo1\t%#lx\n", vmexit->u.svm.exitinfo1); + fprintf(stderr, "\texitinfo2\t%#lx\n", vmexit->u.svm.exitinfo2); return (VMEXIT_ABORT); } @@ -411,11 +716,24 @@ static int vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { + assert(vmexit->inst_length == 0); + stats.vmexit_bogus++; return (VMEXIT_CONTINUE); } +static int +vmexit_reqidle(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + + assert(vmexit->inst_length == 0); + + stats.vmexit_reqidle++; + + return (VMEXIT_CONTINUE); +} + static int vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { @@ -443,8 +761,12 @@ static int vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { + assert(vmexit->inst_length == 0); + stats.vmexit_mtrap++; + gdb_cpu_mtrap(*pvcpu); + return (VMEXIT_CONTINUE); } @@ -478,35 +800,88 @@ vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) return (VMEXIT_CONTINUE); } +static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER; + +static int +vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + enum vm_suspend_how how; + + how = vmexit->u.suspended.how; + + fbsdrun_deletecpu(ctx, *pvcpu); + + if (*pvcpu != BSP) { + pthread_mutex_lock(&resetcpu_mtx); + pthread_cond_signal(&resetcpu_cond); + pthread_mutex_unlock(&resetcpu_mtx); + pthread_exit(NULL); + } + + pthread_mutex_lock(&resetcpu_mtx); + while (!CPU_EMPTY(&cpumask)) { + pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx); + } + pthread_mutex_unlock(&resetcpu_mtx); + + switch (how) { + case VM_SUSPEND_RESET: + exit(0); + case VM_SUSPEND_POWEROFF: + exit(1); + case VM_SUSPEND_HALT: + exit(2); + case VM_SUSPEND_TRIPLEFAULT: + exit(3); + default: + fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how); + exit(100); + } + return (0); /* NOTREACHED */ +} + +static int +vmexit_debug(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + + gdb_cpu_suspend(*pvcpu); + return (VMEXIT_CONTINUE); +} + static vmexit_handler_t handler[VM_EXITCODE_MAX] = { [VM_EXITCODE_INOUT] = vmexit_inout, + [VM_EXITCODE_INOUT_STR] = vmexit_inout, [VM_EXITCODE_VMX] = vmexit_vmx, + [VM_EXITCODE_SVM] = vmexit_svm, [VM_EXITCODE_BOGUS] = vmexit_bogus, + [VM_EXITCODE_REQIDLE] = vmexit_reqidle, [VM_EXITCODE_RDMSR] = vmexit_rdmsr, [VM_EXITCODE_WRMSR] = vmexit_wrmsr, [VM_EXITCODE_MTRAP] = vmexit_mtrap, [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul, [VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap, + [VM_EXITCODE_SUSPENDED] = vmexit_suspend, + [VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch, + [VM_EXITCODE_DEBUG] = vmexit_debug, }; static void vm_loop(struct vmctx *ctx, int vcpu, 
uint64_t startrip) { -#ifdef __FreeBSD__ - cpuset_t mask; -#endif - int error, rc, prevcpu; + int error, rc; enum vm_exitcode exitcode; + cpuset_t active_cpus; #ifdef __FreeBSD__ - if (pincpu >= 0) { - CPU_ZERO(&mask); - CPU_SET(pincpu + vcpu, &mask); + if (vcpumap[vcpu] != NULL) { error = pthread_setaffinity_np(pthread_self(), - sizeof(mask), &mask); + sizeof(cpuset_t), vcpumap[vcpu]); assert(error == 0); } #endif + error = vm_active_cpus(ctx, &active_cpus); + assert(CPU_ISSET(vcpu, &active_cpus)); error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, startrip); assert(error == 0); @@ -516,16 +891,14 @@ vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip) if (error != 0) break; - prevcpu = vcpu; - exitcode = vmexit[vcpu].exitcode; if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) { fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n", exitcode); - exit(1); + exit(4); } - rc = (*handler[exitcode])(ctx, &vmexit[vcpu], &vcpu); + rc = (*handler[exitcode])(ctx, &vmexit[vcpu], &vcpu); switch (rc) { case VMEXIT_CONTINUE: @@ -533,7 +906,7 @@ vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip) case VMEXIT_ABORT: abort(); default: - exit(1); + exit(4); } } fprintf(stderr, "vm_run error %d, errno %d\n", error, errno); @@ -565,7 +938,7 @@ fbsdrun_set_capabilities(struct vmctx *ctx, int cpu) err = vm_get_capability(ctx, cpu, VM_CAP_HALT_EXIT, &tmp); if (err < 0) { fprintf(stderr, "VM exit on HLT not supported\n"); - exit(1); + exit(4); } vm_set_capability(ctx, cpu, VM_CAP_HALT_EXIT, 1); if (cpu == BSP) @@ -580,7 +953,7 @@ fbsdrun_set_capabilities(struct vmctx *ctx, int cpu) if (err < 0) { fprintf(stderr, "SMP mux requested, no pause support\n"); - exit(1); + exit(4); } vm_set_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, 1); if (cpu == BSP) @@ -594,7 +967,7 @@ fbsdrun_set_capabilities(struct vmctx *ctx, int cpu) if (err) { fprintf(stderr, "Unable to set x2apic state (%d)\n", err); - exit(1); + exit(4); } #ifdef __FreeBSD__ @@ -602,70 +975,175 @@ fbsdrun_set_capabilities(struct vmctx *ctx, int cpu) #endif } +static struct vmctx * +do_open(const char *vmname) +{ + struct vmctx *ctx; + int error; + bool reinit, romboot; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; + const cap_ioctl_t *cmds; + size_t ncmds; +#endif + + reinit = romboot = false; + + if (lpc_bootrom()) + romboot = true; + + error = vm_create(vmname); + if (error) { + if (errno == EEXIST) { + if (romboot) { + reinit = true; + } else { + /* + * The virtual machine has been setup by the + * userspace bootloader. + */ + } + } else { + perror("vm_create"); + exit(4); + } + } else { + if (!romboot) { + /* + * If the virtual machine was just created then a + * bootrom must be configured to boot it. 
+ */ + fprintf(stderr, "virtual machine cannot be booted\n"); + exit(4); + } + } + + ctx = vm_open(vmname); + if (ctx == NULL) { + perror("vm_open"); + exit(4); + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW); + if (caph_rights_limit(vm_get_device_fd(ctx), &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + vm_get_ioctls(&ncmds); + cmds = vm_get_ioctls(NULL); + if (cmds == NULL) + errx(EX_OSERR, "out of memory"); + if (caph_ioctls_limit(vm_get_device_fd(ctx), cmds, ncmds) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + free((cap_ioctl_t *)cmds); +#endif + + if (reinit) { + error = vm_reinit(ctx); + if (error) { + perror("vm_reinit"); + exit(4); + } + } + error = vm_set_topology(ctx, sockets, cores, threads, maxcpus); + if (error) + errx(EX_OSERR, "vm_set_topology"); + return (ctx); +} + int main(int argc, char *argv[]) { - int c, error, gdb_port, rfb_port, err, bvmcons; - int max_vcpus; + int c, error, dbg_port, gdb_port, err, bvmcons; + int max_vcpus, mptgen, memflags; + int rtc_localtime; + bool gdb_stop; struct vmctx *ctx; uint64_t rip; size_t memsize; + char *optstr; bvmcons = 0; progname = basename(argv[0]); + dbg_port = 0; gdb_port = 0; - rfb_port = -1; + gdb_stop = false; guest_ncpus = 1; + sockets = cores = threads = 1; + maxcpus = 0; memsize = 256 * MB; - + mptgen = 1; + rtc_localtime = 1; + memflags = 0; #ifdef __FreeBSD__ - while ((c = getopt(argc, argv, "abehwxACHIPWYp:r:g:c:s:m:l:U:")) != -1) { + optstr = "abehuwxACHIPSWYp:g:G:c:s:m:l:B:U:"; #else - while ((c = getopt(argc, argv, "abehwxHIPWYr:c:s:m:l:U:")) != -1) { + optstr = "abehuwxACHIPSWYg:G:c:s:m:l:B:U:"; #endif + while ((c = getopt(argc, argv, optstr)) != -1) { switch (c) { case 'a': x2apic_mode = 0; break; -#ifdef __FreeBSD__ case 'A': acpi = 1; break; -#endif case 'b': bvmcons = 1; break; + case 'B': + if (smbios_parse(optarg) != 0) { + errx(EX_USAGE, "invalid SMBIOS " + "configuration '%s'", optarg); + } + break; #ifdef __FreeBSD__ case 'p': - pincpu = atoi(optarg); + if (pincpu_parse(optarg) != 0) { + errx(EX_USAGE, "invalid vcpu pinning " + "configuration '%s'", optarg); + } break; #endif - case 'r': - if (optarg[0] == ':') - rfb_port = atoi(optarg + 1) + RFB_PORT; - else - rfb_port = atoi(optarg); - break; case 'c': - guest_ncpus = atoi(optarg); + if (topology_parse(optarg) != 0) { + errx(EX_USAGE, "invalid cpu topology " + "'%s'", optarg); + } + break; + case 'C': + memflags |= VM_MEM_F_INCORE; break; -#ifdef __FreeBSD__ case 'g': + dbg_port = atoi(optarg); + break; + case 'G': + if (optarg[0] == 'w') { + gdb_stop = true; + optarg++; + } gdb_port = atoi(optarg); break; -#endif case 'l': - if (lpc_device_parse(optarg) != 0) { + if (strncmp(optarg, "help", strlen(optarg)) == 0) { + lpc_print_supported_devices(); + exit(0); + } else if (lpc_device_parse(optarg) != 0) { errx(EX_USAGE, "invalid lpc device " "configuration '%s'", optarg); } break; case 's': - if (pci_parse_slot(optarg) != 0) - exit(1); + if (strncmp(optarg, "help", strlen(optarg)) == 0) { + pci_print_supported_devices(); + exit(0); + } else if (pci_parse_slot(optarg) != 0) + exit(4); else break; + case 'S': + memflags |= VM_MEM_F_WIRED; + break; case 'm': error = vm_parse_memsize(optarg, &memsize); if (error) @@ -689,15 +1167,24 @@ main(int argc, char *argv[]) case 'e': strictio = 1; break; + case 'u': + rtc_localtime = 0; + break; case 'U': guest_uuid_str = optarg; break; + case 'w': + strictmsr = 0; + break; case 'W': virtio_msix = 0; break; case 'x': x2apic_mode = 1; break; + 
case 'Y': + mptgen = 0; + break; case 'h': usage(0); default: @@ -711,32 +1198,41 @@ usage(1); vmname = argv[0]; - - ctx = vm_open(vmname); - if (ctx == NULL) { - perror("vm_open"); - exit(1); - } + ctx = do_open(vmname); max_vcpus = num_vcpus_allowed(ctx); if (guest_ncpus > max_vcpus) { fprintf(stderr, "%d vCPUs requested but only %d available\n", guest_ncpus, max_vcpus); - exit(1); + exit(4); } fbsdrun_set_capabilities(ctx, BSP); + vm_set_memflags(ctx, memflags); +#ifdef __FreeBSD__ err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); +#else + do { + errno = 0; + err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); + error = errno; + if (err != 0 && error == ENOMEM) { + (void) fprintf(stderr, "Unable to allocate memory " "(%zu), retrying in 1 second\n", memsize); + sleep(1); + } + } while (error == ENOMEM); +#endif if (err) { - fprintf(stderr, "Unable to setup memory (%d)\n", err); - exit(1); + fprintf(stderr, "Unable to set up memory (%d)\n", errno); + exit(4); } error = init_msr(); if (error) { fprintf(stderr, "init_msr error %d", error); - exit(1); + exit(4); } init_mem(); @@ -745,26 +1241,37 @@ pci_irq_init(ctx); ioapic_init(ctx); - rtc_init(ctx); + rtc_init(ctx, rtc_localtime); + sci_init(ctx); /* - * Exit if a device emulation finds an error in it's initilization + * Exit if a device emulation finds an error in its initialization */ - if (init_pci(ctx) != 0) - exit(1); + if (init_pci(ctx) != 0) { + perror("device emulation initialization error"); + exit(4); + } + + if (dbg_port != 0) + init_dbgport(dbg_port); -#ifdef __FreeBSD__ if (gdb_port != 0) - init_dbgport(gdb_port); -#endif + init_gdb(ctx, gdb_port, gdb_stop); if (bvmcons) init_bvmcons(); - console_init(); - vga_init(); - if (rfb_port != -1) - rfb_init(rfb_port); + vga_init(1); + + if (lpc_bootrom()) { + if (vm_set_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, 1)) { + fprintf(stderr, "ROM boot failed: unrestricted guest " "capability not available\n"); + exit(4); + } + error = vcpu_reset(ctx, BSP); + assert(error == 0); + } error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip); assert(error == 0); @@ -772,22 +1279,41 @@ /* * build the guest tables, MP etc. */ - mptable_build(ctx, guest_ncpus); + if (mptgen) { + error = mptable_build(ctx, guest_ncpus); + if (error) { + perror("error building the guest tables"); + exit(4); + } + } error = smbios_build(ctx); assert(error == 0); -#ifdef __FreeBSD__ if (acpi) { error = acpi_build(ctx, guest_ncpus); assert(error == 0); } + if (lpc_bootrom()) + fwctl_init(); + /* * Change the proc title to include the VM name. */ - setproctitle("%s", vmname); -#else + setproctitle("%s", vmname); + +#ifndef WITHOUT_CAPSICUM + caph_cache_catpages(); + + if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + + if (caph_enter() == -1) + errx(EX_OSERR, "cap_enter() failed"); +#endif + +#ifndef __FreeBSD__ /* * If applicable, wait for bhyveconsole */ @@ -810,11 +1336,7 @@ main(int argc, char *argv[]) /* * Head off to the main event dispatch loop */ -#ifdef __FreeBSD__ mevent_dispatch(); -#else - pthread_exit(NULL); -#endif - exit(1); + exit(4); } diff --git a/usr/src/cmd/bhyve/bhyverun.h b/usr/src/cmd/bhyve/bhyverun.h index be89314c09..78b3f1111f 100644 --- a/usr/src/cmd/bhyve/bhyverun.h +++ b/usr/src/cmd/bhyve/bhyverun.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. 
* @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/bhyverun.h 277310 2015-01-18 03:08:30Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -41,17 +43,12 @@ #ifndef _FBSDRUN_H_ #define _FBSDRUN_H_ -#ifndef CTASSERT /* Allow lint to override */ -#define CTASSERT(x) _CTASSERT(x, __LINE__) -#define _CTASSERT(x, y) __CTASSERT(x, y) -#define __CTASSERT(x, y) typedef char __assert ## y[(x) ? 1 : -1] -#endif - #define VMEXIT_CONTINUE (0) #define VMEXIT_ABORT (-1) struct vmctx; extern int guest_ncpus; +extern uint16_t cores, sockets, threads; extern char *guest_uuid_str; extern char *vmname; #ifndef __FreeBSD__ diff --git a/usr/src/cmd/bhyve/block_if.c b/usr/src/cmd/bhyve/block_if.c index 2da946d420..72c5b02a0d 100644 --- a/usr/src/cmd/bhyve/block_if.c +++ b/usr/src/cmd/bhyve/block_if.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Peter Grehan * All rights reserved. * @@ -23,20 +25,36 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/block_if.c 274330 2014-11-09 21:08:52Z tychon $ + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. */ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/block_if.c 274330 2014-11-09 21:08:52Z tychon $"); +__FBSDID("$FreeBSD$"); #include +#ifndef WITHOUT_CAPSICUM +#include +#endif #include #include #include #include #include +#include +#include +#ifndef __FreeBSD__ +#include +#endif #include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include #include #include #include @@ -44,6 +62,7 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/block_if.c 274330 2014-11-09 21:08:52Z t #include #include #include +#include #include #include @@ -56,16 +75,27 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/block_if.c 274330 2014-11-09 21:08:52Z t #define BLOCKIF_SIG 0xb109b109 -#define BLOCKIF_MAXREQ 33 +#ifdef __FreeBSD__ +#define BLOCKIF_NUMTHR 8 +#else +/* Enlarge to keep pace with the virtio-block ring size */ +#define BLOCKIF_NUMTHR 16 +#endif +#define BLOCKIF_MAXREQ (BLOCKIF_RING_MAX + BLOCKIF_NUMTHR) enum blockop { BOP_READ, BOP_WRITE, - BOP_FLUSH +#ifndef __FreeBSD__ + BOP_WRITE_SYNC, +#endif + BOP_FLUSH, + BOP_DELETE }; enum blockstat { BST_FREE, + BST_BLOCK, BST_PEND, BST_BUSY, BST_DONE @@ -77,24 +107,40 @@ struct blockif_elem { enum blockop be_op; enum blockstat be_status; pthread_t be_tid; + off_t be_block; }; +#ifndef __FreeBSD__ +enum blockif_wce { + WCE_NONE = 0, + WCE_IOCTL, + WCE_FCNTL +}; +#endif + struct blockif_ctxt { int bc_magic; int bc_fd; + int bc_ischr; + int bc_isgeom; + int bc_candelete; +#ifndef __FreeBSD__ + enum blockif_wce bc_wce; +#endif int bc_rdonly; off_t bc_size; int bc_sectsz; - pthread_t bc_btid; - pthread_mutex_t bc_mtx; - pthread_cond_t bc_cond; + int bc_psectsz; + int bc_psectoff; int bc_closing; + pthread_t bc_btid[BLOCKIF_NUMTHR]; + pthread_mutex_t bc_mtx; + pthread_cond_t bc_cond; /* Request elements and free/pending/busy queues */ TAILQ_HEAD(, blockif_elem) bc_freeq; TAILQ_HEAD(, blockif_elem) bc_pendq; TAILQ_HEAD(, blockif_elem) bc_busyq; - u_int bc_req_count; struct blockif_elem bc_reqs[BLOCKIF_MAXREQ]; }; @@ -113,83 +159,214 @@ static int blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq, enum blockop op) { - struct blockif_elem *be; - - assert(bc->bc_req_count < BLOCKIF_MAXREQ); + struct blockif_elem *be, *tbe; + off_t off; + int i; be = TAILQ_FIRST(&bc->bc_freeq); 
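+ /* blockif_request() checks that the free list is non-empty before calling in here, hence the asserts below. */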
assert(be != NULL); assert(be->be_status == BST_FREE); - TAILQ_REMOVE(&bc->bc_freeq, be, be_link); - be->be_status = BST_PEND; be->be_req = breq; be->be_op = op; + switch (op) { + case BOP_READ: + case BOP_WRITE: +#ifndef __FreeBSD__ + case BOP_WRITE_SYNC: +#endif + case BOP_DELETE: + off = breq->br_offset; + for (i = 0; i < breq->br_iovcnt; i++) + off += breq->br_iov[i].iov_len; + break; + default: + off = OFF_MAX; + } + be->be_block = off; + TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { + if (tbe->be_block == breq->br_offset) + break; + } + if (tbe == NULL) { + TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) { + if (tbe->be_block == breq->br_offset) + break; + } + } + if (tbe == NULL) + be->be_status = BST_PEND; + else + be->be_status = BST_BLOCK; TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link); - - bc->bc_req_count++; - - return (0); + return (be->be_status == BST_PEND); } static int -blockif_dequeue(struct blockif_ctxt *bc, struct blockif_elem **bep) +blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep) { struct blockif_elem *be; - if (bc->bc_req_count == 0) - return (ENOENT); - - be = TAILQ_FIRST(&bc->bc_pendq); - assert(be != NULL); - assert(be->be_status == BST_PEND); + TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { + if (be->be_status == BST_PEND) + break; + assert(be->be_status == BST_BLOCK); + } + if (be == NULL) + return (0); TAILQ_REMOVE(&bc->bc_pendq, be, be_link); be->be_status = BST_BUSY; - be->be_tid = bc->bc_btid; + be->be_tid = t; TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link); - *bep = be; - - return (0); + return (1); } static void blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be) { - assert(be->be_status == BST_DONE); + struct blockif_elem *tbe; - TAILQ_REMOVE(&bc->bc_busyq, be, be_link); + if (be->be_status == BST_DONE || be->be_status == BST_BUSY) + TAILQ_REMOVE(&bc->bc_busyq, be, be_link); + else + TAILQ_REMOVE(&bc->bc_pendq, be, be_link); + TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { + if (tbe->be_req->br_offset == be->be_block) + tbe->be_status = BST_PEND; + } be->be_tid = 0; be->be_status = BST_FREE; be->be_req = NULL; TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link); - - bc->bc_req_count--; } static void -blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be) +blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf) { struct blockif_req *br; - int err; +#ifdef __FreeBSD__ + off_t arg[2]; +#endif + ssize_t clen, len, off, boff, voff; + int i, err; br = be->be_req; + if (br->br_iovcnt <= 1) + buf = NULL; err = 0; - switch (be->be_op) { case BOP_READ: - if (preadv(bc->bc_fd, br->br_iov, br->br_iovcnt, - br->br_offset) < 0) - err = errno; + if (buf == NULL) { + if ((len = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt, + br->br_offset)) < 0) + err = errno; + else + br->br_resid -= len; + break; + } + i = 0; + off = voff = 0; + while (br->br_resid > 0) { + len = MIN(br->br_resid, MAXPHYS); + if (pread(bc->bc_fd, buf, len, br->br_offset + + off) < 0) { + err = errno; + break; + } + boff = 0; + do { + clen = MIN(len - boff, br->br_iov[i].iov_len - + voff); + memcpy(br->br_iov[i].iov_base + voff, + buf + boff, clen); + if (clen < br->br_iov[i].iov_len - voff) + voff += clen; + else { + i++; + voff = 0; + } + boff += clen; + } while (boff < len); + off += len; + br->br_resid -= len; + } break; case BOP_WRITE: - if (bc->bc_rdonly) + if (bc->bc_rdonly) { err = EROFS; - else if (pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt, - br->br_offset) < 0) - err = errno; + break; + } + if (buf == NULL) { + if ((len = 
pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt, + br->br_offset)) < 0) + err = errno; + else + br->br_resid -= len; + break; + } + i = 0; + off = voff = 0; + while (br->br_resid > 0) { + len = MIN(br->br_resid, MAXPHYS); + boff = 0; + do { + clen = MIN(len - boff, br->br_iov[i].iov_len - + voff); + memcpy(buf + boff, + br->br_iov[i].iov_base + voff, clen); + if (clen < br->br_iov[i].iov_len - voff) + voff += clen; + else { + i++; + voff = 0; + } + boff += clen; + } while (boff < len); + if (pwrite(bc->bc_fd, buf, len, br->br_offset + + off) < 0) { + err = errno; + break; + } + off += len; + br->br_resid -= len; + } break; case BOP_FLUSH: +#ifdef __FreeBSD__ + if (bc->bc_ischr) { + if (ioctl(bc->bc_fd, DIOCGFLUSH)) + err = errno; + } else if (fsync(bc->bc_fd)) + err = errno; +#else + /* + * This fsync() should be adequate to flush the cache of a file + * or device. In VFS, the VOP_SYNC operation is converted to + * the appropriate ioctl in both sdev (for real devices) and + * zfs (for zvols). + */ + if (fsync(bc->bc_fd)) + err = errno; +#endif + break; + case BOP_DELETE: + if (!bc->bc_candelete) + err = EOPNOTSUPP; + else if (bc->bc_rdonly) + err = EROFS; +#ifdef __FreeBSD__ + else if (bc->bc_ischr) { + arg[0] = br->br_offset; + arg[1] = br->br_resid; + if (ioctl(bc->bc_fd, DIOCGDELETE, arg)) + err = errno; + else + br->br_resid = 0; + } +#endif + else + err = EOPNOTSUPP; break; default: err = EINVAL; @@ -206,28 +383,34 @@ blockif_thr(void *arg) { struct blockif_ctxt *bc; struct blockif_elem *be; + pthread_t t; + uint8_t *buf; bc = arg; + if (bc->bc_isgeom) + buf = malloc(MAXPHYS); + else + buf = NULL; + t = pthread_self(); + pthread_mutex_lock(&bc->bc_mtx); for (;;) { - pthread_mutex_lock(&bc->bc_mtx); - while (!blockif_dequeue(bc, &be)) { + while (blockif_dequeue(bc, t, &be)) { pthread_mutex_unlock(&bc->bc_mtx); - blockif_proc(bc, be); + blockif_proc(bc, be, buf); pthread_mutex_lock(&bc->bc_mtx); blockif_complete(bc, be); } - pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx); - pthread_mutex_unlock(&bc->bc_mtx); - - /* - * Check ctxt status here to see if exit requested - */ + /* Check ctxt status here to see if exit requested */ if (bc->bc_closing) - pthread_exit(NULL); + break; + pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx); } + pthread_mutex_unlock(&bc->bc_mtx); - /* Not reached */ + if (buf) + free(buf); + pthread_exit(NULL); return (NULL); } @@ -276,15 +459,31 @@ struct blockif_ctxt * blockif_open(const char *optstr, const char *ident) { char tname[MAXCOMLEN + 1]; - char *nopt, *xopts; +#ifdef __FreeBSD__ + char name[MAXPATHLEN]; + char *nopt, *xopts, *cp; +#else + char *nopt, *xopts, *cp = NULL; +#endif struct blockif_ctxt *bc; struct stat sbuf; - off_t size; +#ifdef __FreeBSD__ + struct diocgattr_arg arg; +#else + enum blockif_wce wce = WCE_NONE; +#endif + off_t size, psectsz, psectoff; int extra, fd, i, sectsz; - int nocache, sync, ro; + int nocache, sync, ro, candelete, geom, ssopt, pssopt; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; + cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE }; +#endif pthread_once(&blockif_once, blockif_init); + fd = -1; + ssopt = 0; nocache = 0; sync = 0; ro = 0; @@ -293,16 +492,25 @@ blockif_open(const char *optstr, const char *ident) * The first element in the optstring is always a pathname. 
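+ * (For example, an illustrative optstring might look like + * "/dev/zvol/rdsk/tank/vm0,ro,sectorsize=4096"; the path here is + * invented.)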
* Optional elements follow */ - nopt = strdup(optstr); - for (xopts = strtok(nopt, ","); - xopts != NULL; - xopts = strtok(NULL, ",")) { - if (!strcmp(xopts, "nocache")) + nopt = xopts = strdup(optstr); + while (xopts != NULL) { + cp = strsep(&xopts, ","); + if (cp == nopt) /* file or device pathname */ + continue; + else if (!strcmp(cp, "nocache")) nocache = 1; - else if (!strcmp(xopts, "sync")) + else if (!strcmp(cp, "sync") || !strcmp(cp, "direct")) sync = 1; - else if (!strcmp(xopts, "ro")) + else if (!strcmp(cp, "ro")) ro = 1; + else if (sscanf(cp, "sectorsize=%d/%d", &ssopt, &pssopt) == 2) + ; + else if (sscanf(cp, "sectorsize=%d", &ssopt) == 1) + pssopt = ssopt; + else { + fprintf(stderr, "Invalid device option \"%s\"\n", cp); + goto err; + } } extra = 0; @@ -319,62 +527,185 @@ } if (fd < 0) { - perror("Could not open backing file"); - return (NULL); + warn("Could not open backing file: %s", nopt); + goto err; } if (fstat(fd, &sbuf) < 0) { - perror("Could not stat backing file"); - close(fd); - return (NULL); + warn("Could not stat backing file %s", nopt); + goto err; } +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK, + CAP_WRITE); + if (ro) + cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE); + + if (caph_rights_limit(fd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + /* * Deal with raw devices */ size = sbuf.st_size; sectsz = DEV_BSIZE; + psectsz = psectoff = 0; + candelete = geom = 0; #ifdef __FreeBSD__ if (S_ISCHR(sbuf.st_mode)) { if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 || ioctl(fd, DIOCGSECTORSIZE, &sectsz)) { perror("Could not fetch dev blk/sector size"); - close(fd); - return (NULL); + goto err; } assert(size != 0); assert(sectsz != 0); + if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0) + ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff); + strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name)); + arg.len = sizeof(arg.value.i); + if (ioctl(fd, DIOCGATTR, &arg) == 0) + candelete = arg.value.i; + if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0) + geom = 1; + } else { + psectsz = sbuf.st_blksize; } +#else + psectsz = sbuf.st_blksize; + if (S_ISCHR(sbuf.st_mode)) { + struct dk_minfo_ext dkmext; + int wce_val; + + /* Look for a more accurate physical blocksize */ + if (ioctl(fd, DKIOCGMEDIAINFOEXT, &dkmext) == 0) { + psectsz = dkmext.dki_pbsize; + } + /* See if a configurable write cache is present and working */ + if (ioctl(fd, DKIOCGETWCE, &wce_val) == 0) { + /* + * If WCE is already active, disable it until the + * specific device driver calls for its return. If it + * is not active, toggle it on and off to verify that + * such actions are possible. + */ + if (wce_val != 0) { + wce_val = 0; + /* + * Inability to disable the cache is a threat + * to data durability. + */ + assert(ioctl(fd, DKIOCSETWCE, &wce_val) == 0); + wce = WCE_IOCTL; + } else { + int r1, r2; + + wce_val = 1; + r1 = ioctl(fd, DKIOCSETWCE, &wce_val); + wce_val = 0; + r2 = ioctl(fd, DKIOCSETWCE, &wce_val); + + if (r1 == 0 && r2 == 0) { + wce = WCE_IOCTL; + } else { + /* + * If the cache toggle was not + * successful, ensure that the cache + * was not left enabled. 
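+ * The assert that follows encodes this: if the enable/disable + * pair did not both succeed, the enable itself must have + * failed, so the cache was never switched on.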
+ */ + assert(r1 != 0); + } + } + } + } else { + int flags; + + if ((flags = fcntl(fd, F_GETFL)) >= 0) { + flags |= O_DSYNC; + if (fcntl(fd, F_SETFL, flags) != -1) { + wce = WCE_FCNTL; + } + } + } +#endif + +#ifndef WITHOUT_CAPSICUM + if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif + if (ssopt != 0) { + if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 || + ssopt > pssopt) { + fprintf(stderr, "Invalid sector size %d/%d\n", + ssopt, pssopt); + goto err; + } + + /* + * Some backend drivers (e.g. cd0, ada0) require that the I/O + * size be a multiple of the device's sector size. + * + * Validate that the emulated sector size complies with this + * requirement. + */ + if (S_ISCHR(sbuf.st_mode)) { + if (ssopt < sectsz || (ssopt % sectsz) != 0) { + fprintf(stderr, "Sector size %d incompatible " + "with underlying device sector size %d\n", + ssopt, sectsz); + goto err; + } + } + + sectsz = ssopt; + psectsz = pssopt; + psectoff = 0; + } + bc = calloc(1, sizeof(struct blockif_ctxt)); if (bc == NULL) { - close(fd); - return (NULL); + perror("calloc"); + goto err; } bc->bc_magic = BLOCKIF_SIG; bc->bc_fd = fd; + bc->bc_ischr = S_ISCHR(sbuf.st_mode); + bc->bc_isgeom = geom; + bc->bc_candelete = candelete; +#ifndef __FreeBSD__ + bc->bc_wce = wce; +#endif bc->bc_rdonly = ro; bc->bc_size = size; bc->bc_sectsz = sectsz; + bc->bc_psectsz = psectsz; + bc->bc_psectoff = psectoff; pthread_mutex_init(&bc->bc_mtx, NULL); pthread_cond_init(&bc->bc_cond, NULL); TAILQ_INIT(&bc->bc_freeq); TAILQ_INIT(&bc->bc_pendq); TAILQ_INIT(&bc->bc_busyq); - bc->bc_req_count = 0; for (i = 0; i < BLOCKIF_MAXREQ; i++) { bc->bc_reqs[i].be_status = BST_FREE; TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link); } - pthread_create(&bc->bc_btid, NULL, blockif_thr, bc); - - snprintf(tname, sizeof(tname), "blk-%s", ident); - pthread_set_name_np(bc->bc_btid, tname); + for (i = 0; i < BLOCKIF_NUMTHR; i++) { + pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc); + snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i); + pthread_set_name_np(bc->bc_btid[i], tname); + } return (bc); +err: + if (fd >= 0) + close(fd); + free(nopt); + return (NULL); } static int @@ -386,13 +717,13 @@ blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq, err = 0; pthread_mutex_lock(&bc->bc_mtx); - if (bc->bc_req_count < BLOCKIF_MAXREQ) { + if (!TAILQ_EMPTY(&bc->bc_freeq)) { /* * Enqueue and inform the block i/o thread * that there is work available */ - blockif_enqueue(bc, breq, op); - pthread_cond_signal(&bc->bc_cond); + if (blockif_enqueue(bc, breq, op)) + pthread_cond_signal(&bc->bc_cond); } else { /* * Callers are not allowed to enqueue more than @@ -431,6 +762,14 @@ blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq) return (blockif_request(bc, breq, BOP_FLUSH)); } +int +blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + return (blockif_request(bc, breq, BOP_DELETE)); +} + int blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq) { @@ -450,11 +789,7 @@ blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq) /* * Found it. 
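+ * Release it through blockif_complete() so the free list and the + * blocked-request bookkeeping are updated consistently.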
*/ - TAILQ_REMOVE(&bc->bc_pendq, be, be_link); - be->be_status = BST_FREE; - be->be_req = NULL; - TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link); - bc->bc_req_count--; + blockif_complete(bc, be); pthread_mutex_unlock(&bc->bc_mtx); return (0); @@ -515,18 +850,19 @@ int blockif_close(struct blockif_ctxt *bc) { void *jval; - int err; - - err = 0; + int i; assert(bc->bc_magic == BLOCKIF_SIG); /* * Stop the block i/o thread */ + pthread_mutex_lock(&bc->bc_mtx); bc->bc_closing = 1; - pthread_cond_signal(&bc->bc_cond); - pthread_join(bc->bc_btid, &jval); + pthread_mutex_unlock(&bc->bc_mtx); + pthread_cond_broadcast(&bc->bc_cond); + for (i = 0; i < BLOCKIF_NUMTHR; i++) + pthread_join(bc->bc_btid[i], &jval); /* XXX Cancel queued i/o's ??? */ @@ -608,6 +944,15 @@ blockif_sectsz(struct blockif_ctxt *bc) return (bc->bc_sectsz); } +void +blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + *size = bc->bc_psectsz; + *off = bc->bc_psectoff; +} + int blockif_queuesz(struct blockif_ctxt *bc) { @@ -623,3 +968,54 @@ blockif_is_ro(struct blockif_ctxt *bc) assert(bc->bc_magic == BLOCKIF_SIG); return (bc->bc_rdonly); } + +int +blockif_candelete(struct blockif_ctxt *bc) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + return (bc->bc_candelete); +} + +#ifndef __FreeBSD__ +int +blockif_set_wce(struct blockif_ctxt *bc, int wc_enable) +{ + int res = 0, flags; + int clean_val = (wc_enable != 0) ? 1 : 0; + + (void) pthread_mutex_lock(&bc->bc_mtx); + switch (bc->bc_wce) { + case WCE_IOCTL: + res = ioctl(bc->bc_fd, DKIOCSETWCE, &clean_val); + break; + case WCE_FCNTL: + if ((flags = fcntl(bc->bc_fd, F_GETFL)) >= 0) { + if (wc_enable == 0) { + flags |= O_DSYNC; + } else { + flags &= ~O_DSYNC; + } + if (fcntl(bc->bc_fd, F_SETFL, flags) == -1) { + res = -1; + } + } else { + res = -1; + } + break; + default: + break; + } + + /* + * After a successful disable of the write cache, ensure that any + * lingering data in the cache is synced out. + */ + if (res == 0 && wc_enable == 0) { + res = fsync(bc->bc_fd); + } + (void) pthread_mutex_unlock(&bc->bc_mtx); + + return (res); +} +#endif /* __FreeBSD__ */ diff --git a/usr/src/cmd/bhyve/block_if.h b/usr/src/cmd/bhyve/block_if.h index 5ef120933c..bff2b42768 100644 --- a/usr/src/cmd/bhyve/block_if.h +++ b/usr/src/cmd/bhyve/block_if.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Peter Grehan * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/block_if.h 268638 2014-07-15 00:25:54Z grehan $ + * $FreeBSD$ */ /* @@ -39,18 +41,21 @@ #include #include -#ifdef __FreeBSD__ -#define BLOCKIF_IOV_MAX 32 /* not practical to be IOV_MAX */ -#else -#define BLOCKIF_IOV_MAX 16 /* not practical to be IOV_MAX */ -#endif +/* + * BLOCKIF_IOV_MAX is the maximum number of scatter/gather entries in + * a single request. BLOCKIF_RING_MAX is the maximum number of + * pending requests that can be queued. 
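+ * (In block_if.c, BLOCKIF_MAXREQ is sized as BLOCKIF_RING_MAX + + * BLOCKIF_NUMTHR so that a full ring can remain queued even while + * every worker thread holds one request in flight.)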
+ */ +#define BLOCKIF_IOV_MAX 128 /* not practical to be IOV_MAX */ +#define BLOCKIF_RING_MAX 128 struct blockif_req { - struct iovec br_iov[BLOCKIF_IOV_MAX]; int br_iovcnt; off_t br_offset; + ssize_t br_resid; void (*br_callback)(struct blockif_req *req, int err); void *br_param; + struct iovec br_iov[BLOCKIF_IOV_MAX]; }; struct blockif_ctxt; @@ -59,11 +64,17 @@ off_t blockif_size(struct blockif_ctxt *bc); void blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s); int blockif_sectsz(struct blockif_ctxt *bc); +void blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off); int blockif_queuesz(struct blockif_ctxt *bc); int blockif_is_ro(struct blockif_ctxt *bc); +int blockif_candelete(struct blockif_ctxt *bc); +#ifndef __FreeBSD__ +int blockif_set_wce(struct blockif_ctxt *bc, int enable); +#endif int blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq); int blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq); int blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq); +int blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq); int blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq); int blockif_close(struct blockif_ctxt *bc); diff --git a/usr/src/cmd/bhyve/bootrom.c b/usr/src/cmd/bhyve/bootrom.c new file mode 100644 index 0000000000..b8c63828c8 --- /dev/null +++ b/usr/src/cmd/bhyve/bootrom.c @@ -0,0 +1,113 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Neel Natu + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include "bhyverun.h" +#include "bootrom.h" + +#define MAX_BOOTROM_SIZE (16 * 1024 * 1024) /* 16 MB */ + +int +bootrom_init(struct vmctx *ctx, const char *romfile) +{ + struct stat sbuf; + vm_paddr_t gpa; + ssize_t rlen; + char *ptr; + int fd, i, rv, prot; + + rv = -1; + fd = open(romfile, O_RDONLY); + if (fd < 0) { + fprintf(stderr, "Error opening bootrom \"%s\": %s\n", + romfile, strerror(errno)); + goto done; + } + + if (fstat(fd, &sbuf) < 0) { + fprintf(stderr, "Could not fstat bootrom file \"%s\": %s\n", + romfile, strerror(errno)); + goto done; + } + + /* + * Limit bootrom size to 16MB so it doesn't encroach into reserved + * MMIO space (e.g. APIC, HPET, MSI). + */ + if (sbuf.st_size > MAX_BOOTROM_SIZE || sbuf.st_size < PAGE_SIZE) { + fprintf(stderr, "Invalid bootrom size %ld\n", sbuf.st_size); + goto done; + } + + if (sbuf.st_size & PAGE_MASK) { + fprintf(stderr, "Bootrom size %ld is not a multiple of the " + "page size\n", sbuf.st_size); + goto done; + } + + ptr = vm_create_devmem(ctx, VM_BOOTROM, "bootrom", sbuf.st_size); + if (ptr == MAP_FAILED) + goto done; + + /* Map the bootrom into the guest address space */ + prot = PROT_READ | PROT_EXEC; + gpa = (1ULL << 32) - sbuf.st_size; + if (vm_mmap_memseg(ctx, gpa, VM_BOOTROM, 0, sbuf.st_size, prot) != 0) + goto done; + + /* Read 'romfile' into the guest address space */ + for (i = 0; i < sbuf.st_size / PAGE_SIZE; i++) { + rlen = read(fd, ptr + i * PAGE_SIZE, PAGE_SIZE); + if (rlen != PAGE_SIZE) { + fprintf(stderr, "Incomplete read of page %d of bootrom " + "file %s: %ld bytes\n", i, romfile, rlen); + goto done; + } + } + rv = 0; +done: + if (fd >= 0) + close(fd); + return (rv); +} diff --git a/usr/src/cmd/bhyve/bootrom.h b/usr/src/cmd/bhyve/bootrom.h new file mode 100644 index 0000000000..7fb12181dd --- /dev/null +++ b/usr/src/cmd/bhyve/bootrom.h @@ -0,0 +1,40 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Neel Natu + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _BOOTROM_H_ +#define _BOOTROM_H_ + +#include + +struct vmctx; + +int bootrom_init(struct vmctx *ctx, const char *romfile); + +#endif diff --git a/usr/src/cmd/bhyve/console.c b/usr/src/cmd/bhyve/console.c index a8d07709be..2567f69959 100644 --- a/usr/src/cmd/bhyve/console.c +++ b/usr/src/cmd/bhyve/console.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale * All rights reserved. * @@ -40,15 +42,23 @@ static struct { kbd_event_func_t kbd_event_cb; void *kbd_arg; + int kbd_priority; ptr_event_func_t ptr_event_cb; void *ptr_arg; + int ptr_priority; } console; void -console_init(void) +console_init(int w, int h, void *fbaddr) +{ + console.gc = bhyvegc_init(w, h, fbaddr); +} + +void +console_set_fbaddr(void *fbaddr) { - console.gc = bhyvegc_init(640, 400); + bhyvegc_set_fbaddr(console.gc, fbaddr); } struct bhyvegc_image * @@ -71,31 +81,40 @@ console_fb_register(fb_render_func_t render_cb, void *arg) void console_refresh(void) { - (*console.fb_render_cb)(console.gc, console.fb_arg); + if (console.fb_render_cb) + (*console.fb_render_cb)(console.gc, console.fb_arg); } void -console_kbd_register(kbd_event_func_t event_cb, void *arg) +console_kbd_register(kbd_event_func_t event_cb, void *arg, int pri) { - console.kbd_event_cb = event_cb; - console.kbd_arg = arg; + if (pri > console.kbd_priority) { + console.kbd_event_cb = event_cb; + console.kbd_arg = arg; + console.kbd_priority = pri; + } } void -console_ptr_register(ptr_event_func_t event_cb, void *arg) +console_ptr_register(ptr_event_func_t event_cb, void *arg, int pri) { - console.ptr_event_cb = event_cb; - console.ptr_arg = arg; + if (pri > console.ptr_priority) { + console.ptr_event_cb = event_cb; + console.ptr_arg = arg; + console.ptr_priority = pri; + } } void console_key_event(int down, uint32_t keysym) { - (*console.kbd_event_cb)(down, keysym, console.kbd_arg); + if (console.kbd_event_cb) + (*console.kbd_event_cb)(down, keysym, console.kbd_arg); } void console_ptr_event(uint8_t button, int x, int y) { - (*console.ptr_event_cb)(button, x, y, console.ptr_arg); + if (console.ptr_event_cb) + (*console.ptr_event_cb)(button, x, y, console.ptr_arg); } diff --git a/usr/src/cmd/bhyve/console.h b/usr/src/cmd/bhyve/console.h index bffb7c2456..0d0a854866 100644 --- a/usr/src/cmd/bhyve/console.h +++ b/usr/src/cmd/bhyve/console.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale * All rights reserved. 
* @@ -35,16 +37,19 @@ typedef void (*fb_render_func_t)(struct bhyvegc *gc, void *arg); typedef void (*kbd_event_func_t)(int down, uint32_t keysym, void *arg); typedef void (*ptr_event_func_t)(uint8_t mask, int x, int y, void *arg); -void console_init(void); +void console_init(int w, int h, void *fbaddr); + +void console_set_fbaddr(void *fbaddr); + struct bhyvegc_image *console_get_image(void); -void console_fb_register(fb_render_func_t render_cb, void *arg); -void console_refresh(void); +void console_fb_register(fb_render_func_t render_cb, void *arg); +void console_refresh(void); -void console_kbd_register(kbd_event_func_t event_cb, void *arg); -void console_key_event(int down, uint32_t keysym); +void console_kbd_register(kbd_event_func_t event_cb, void *arg, int pri); +void console_key_event(int down, uint32_t keysym); -void console_ptr_register(ptr_event_func_t event_cb, void *arg); -void console_ptr_event(uint8_t button, int x, int y); +void console_ptr_register(ptr_event_func_t event_cb, void *arg, int pri); +void console_ptr_event(uint8_t button, int x, int y); #endif /* _CONSOLE_H_ */ diff --git a/usr/src/cmd/bhyve/consport.c b/usr/src/cmd/bhyve/consport.c index 69b6dfddf1..cda2df2414 100644 --- a/usr/src/cmd/bhyve/consport.c +++ b/usr/src/cmd/bhyve/consport.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,20 +25,29 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/consport.c 264277 2014-04-08 21:02:03Z jhb $ + * $FreeBSD$ */ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/consport.c 264277 2014-04-08 21:02:03Z jhb $"); +__FBSDID("$FreeBSD$"); #include +#ifndef WITHOUT_CAPSICUM +#include +#endif #include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include #include #include #include #include #include +#include #include "inout.h" #include "pci_lpc.h" @@ -44,6 +55,7 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/consport.c 264277 2014-04-08 21:02:03Z j #define BVM_CONSOLE_PORT 0x220 #define BVM_CONS_SIG ('b' << 8 | 'v') +#ifdef __FreeBSD__ static struct termios tio_orig, tio_new; static void @@ -51,6 +63,7 @@ ttyclose(void) { tcsetattr(STDIN_FILENO, TCSANOW, &tio_orig); } +#endif static void ttyopen(void) @@ -68,14 +81,14 @@ ttyopen(void) static bool tty_char_available(void) { - fd_set rfds; - struct timeval tv; - - FD_ZERO(&rfds); - FD_SET(STDIN_FILENO, &rfds); - tv.tv_sec = 0; - tv.tv_usec = 0; - if (select(STDIN_FILENO + 1, &rfds, NULL, NULL, &tv) > 0) { + fd_set rfds; + struct timeval tv; + + FD_ZERO(&rfds); + FD_SET(STDIN_FILENO, &rfds); + tv.tv_sec = 0; + tv.tv_usec = 0; + if (select(STDIN_FILENO + 1, &rfds, NULL, NULL, &tv) > 0) { return (true); } else { return (false); @@ -106,6 +119,10 @@ console_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { static int opened; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; + cap_ioctl_t cmds[] = { TIOCGETA, TIOCSETA, TIOCGWINSZ }; +#endif if (bytes == 2 && in) { *eax = BVM_CONS_SIG; @@ -125,6 +142,14 @@ console_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, return (-1); if (!opened) { +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_EVENT, CAP_IOCTL, CAP_READ, + CAP_WRITE); + if (caph_rights_limit(STDIN_FILENO, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + if (caph_ioctls_limit(STDIN_FILENO, cmds, nitems(cmds)) == -1) + errx(EX_OSERR, "Unable to apply rights for 
sandbox"); +#endif ttyopen(); opened = 1; } diff --git a/usr/src/cmd/bhyve/dbgport.c b/usr/src/cmd/bhyve/dbgport.c new file mode 100644 index 0000000000..88a616b50d --- /dev/null +++ b/usr/src/cmd/bhyve/dbgport.c @@ -0,0 +1,180 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include + +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include +#include +#include + +#include "inout.h" +#include "dbgport.h" +#include "pci_lpc.h" + +#define BVM_DBG_PORT 0x224 +#define BVM_DBG_SIG ('B' << 8 | 'V') + +static int listen_fd, conn_fd; + +static struct sockaddr_in sin; + +static int +dbg_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + int nwritten, nread, printonce; + int on = 1; + char ch; + + if (bytes == 2 && in) { + *eax = BVM_DBG_SIG; + return (0); + } + + if (bytes != 4) + return (-1); + +again: + printonce = 0; + while (conn_fd < 0) { + if (!printonce) { + printf("Waiting for connection from gdb\r\n"); + printonce = 1; + } + conn_fd = accept4(listen_fd, NULL, NULL, SOCK_NONBLOCK); + if (conn_fd >= 0) { + /* Avoid EPIPE after the client drops off. */ + (void)setsockopt(conn_fd, SOL_SOCKET, SO_NOSIGPIPE, + &on, sizeof(on)); + /* Improve latency for one byte at a time tranfers. 
*/ + (void)setsockopt(conn_fd, IPPROTO_TCP, TCP_NODELAY, + &on, sizeof(on)); + } else if (errno != EINTR) { + perror("accept"); + } + } + + if (in) { + nread = read(conn_fd, &ch, 1); + if (nread == -1 && errno == EAGAIN) + *eax = -1; + else if (nread == 1) + *eax = ch; + else { + close(conn_fd); + conn_fd = -1; + goto again; + } + } else { + ch = *eax; + nwritten = write(conn_fd, &ch, 1); + if (nwritten != 1) { + close(conn_fd); + conn_fd = -1; + goto again; + } + } + return (0); +} + +static struct inout_port dbgport = { + "bvmdbg", + BVM_DBG_PORT, + 1, + IOPORT_F_INOUT, + dbg_handler +}; + +SYSRES_IO(BVM_DBG_PORT, 4); + +void +init_dbgport(int sport) +{ + int reuse; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif + + conn_fd = -1; + + if ((listen_fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { + perror("cannot create socket"); + exit(4); + } + +#ifdef __FreeBSD__ + sin.sin_len = sizeof(sin); +#endif + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(sport); + + reuse = 1; + if (setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &reuse, + sizeof(reuse)) < 0) { + perror("cannot set socket options"); + exit(4); + } + + if (bind(listen_fd, (struct sockaddr *)&sin, sizeof(sin)) < 0) { + perror("cannot bind socket"); + exit(4); + } + + if (listen(listen_fd, 1) < 0) { + perror("cannot listen socket"); + exit(4); + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_ACCEPT, CAP_READ, CAP_WRITE); + if (caph_rights_limit(listen_fd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + register_inout(&dbgport); +} diff --git a/usr/src/cmd/bhyve/dbgport.h b/usr/src/cmd/bhyve/dbgport.h index b95df0bd31..407ff3ffbf 100644 --- a/usr/src/cmd/bhyve/dbgport.h +++ b/usr/src/cmd/bhyve/dbgport.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/dbgport.h 256156 2013-10-08 16:36:17Z neel $ + * $FreeBSD$ */ #ifndef _DBGPORT_H_ diff --git a/usr/src/cmd/bhyve/fwctl.c b/usr/src/cmd/bhyve/fwctl.c new file mode 100644 index 0000000000..0640bc28ba --- /dev/null +++ b/usr/src/cmd/bhyve/fwctl.c @@ -0,0 +1,552 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Peter Grehan + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Guest firmware interface. Uses i/o ports x510/x511 as Qemu does, + * but with a request/response messaging protocol. + */ +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "bhyverun.h" +#include "inout.h" +#include "fwctl.h" + +/* + * Messaging protocol base operations + */ +#define OP_NULL 1 +#define OP_ECHO 2 +#define OP_GET 3 +#define OP_GET_LEN 4 +#define OP_SET 5 +#define OP_MAX OP_SET + +/* I/O ports */ +#define FWCTL_OUT 0x510 +#define FWCTL_IN 0x511 + +/* + * Back-end state-machine + */ +enum state { + DORMANT, + IDENT_WAIT, + IDENT_SEND, + REQ, + RESP +} be_state = DORMANT; + +static uint8_t sig[] = { 'B', 'H', 'Y', 'V' }; +static u_int ident_idx; + +struct op_info { + int op; + int (*op_start)(uint32_t len); + void (*op_data)(uint32_t data, uint32_t len); + int (*op_result)(struct iovec **data); + void (*op_done)(struct iovec *data); +}; +static struct op_info *ops[OP_MAX+1]; + +/* Return 0-padded uint32_t */ +static uint32_t +fwctl_send_rest(uint32_t *data, size_t len) +{ + union { + uint8_t c[4]; + uint32_t w; + } u; + uint8_t *cdata; + int i; + + cdata = (uint8_t *) data; + u.w = 0; + + for (i = 0, u.w = 0; i < len; i++) + u.c[i] = *cdata++; + + return (u.w); +} + +/* + * error op dummy proto - drop all data sent and return an error +*/ +static int errop_code; + +static void +errop_set(int err) +{ + + errop_code = err; +} + +static int +errop_start(uint32_t len) +{ + errop_code = ENOENT; + + /* accept any length */ + return (errop_code); +} + +static void +errop_data(uint32_t data, uint32_t len) +{ + + /* ignore */ +} + +static int +errop_result(struct iovec **data) +{ + + /* no data to send back; always successful */ + *data = NULL; + return (errop_code); +} + +static void +errop_done(struct iovec *data) +{ + + /* assert data is NULL */ +} + +static struct op_info errop_info = { + .op_start = errop_start, + .op_data = errop_data, + .op_result = errop_result, + .op_done = errop_done +}; + +/* OID search */ +SET_DECLARE(ctl_set, struct ctl); + +CTL_NODE("hw.ncpu", &guest_ncpus, sizeof(guest_ncpus)); + +static struct ctl * +ctl_locate(const char *str, int maxlen) +{ + struct ctl *cp, **cpp; + + SET_FOREACH(cpp, ctl_set) { + cp = *cpp; + if (!strncmp(str, cp->c_oid, maxlen)) + return (cp); + } + + return (NULL); +} + +/* uefi-sysctl get-len */ +#define FGET_STRSZ 80 +static struct iovec fget_biov[2]; +static char fget_str[FGET_STRSZ]; +static struct { + size_t f_sz; + uint32_t f_data[1024]; +} fget_buf; +static int fget_cnt; +static size_t fget_size; + +static int +fget_start(uint32_t len) +{ + + if (len > FGET_STRSZ) + return(E2BIG); + + fget_cnt = 0; + + return (0); +} + +static void +fget_data(uint32_t data, uint32_t len) +{ + + *((uint32_t *) &fget_str[fget_cnt]) = data; + fget_cnt += sizeof(uint32_t); +} + +static int +fget_result(struct iovec **data, int val) +{ + struct ctl *cp; + int err; + + err = 0; + + /* Locate the OID */ + cp = 
ctl_locate(fget_str, fget_cnt); + if (cp == NULL) { + *data = NULL; + err = ENOENT; + } else { + if (val) { + /* For now, copy the len/data into a buffer */ + memset(&fget_buf, 0, sizeof(fget_buf)); + fget_buf.f_sz = cp->c_len; + memcpy(fget_buf.f_data, cp->c_data, cp->c_len); + fget_biov[0].iov_base = (char *)&fget_buf; + fget_biov[0].iov_len = sizeof(fget_buf.f_sz) + + cp->c_len; + } else { + fget_size = cp->c_len; + fget_biov[0].iov_base = (char *)&fget_size; + fget_biov[0].iov_len = sizeof(fget_size); + } + + fget_biov[1].iov_base = NULL; + fget_biov[1].iov_len = 0; + *data = fget_biov; + } + + return (err); +} + +static void +fget_done(struct iovec *data) +{ + + /* nothing needs to be freed */ +} + +static int +fget_len_result(struct iovec **data) +{ + return (fget_result(data, 0)); +} + +static int +fget_val_result(struct iovec **data) +{ + return (fget_result(data, 1)); +} + +static struct op_info fgetlen_info = { + .op_start = fget_start, + .op_data = fget_data, + .op_result = fget_len_result, + .op_done = fget_done +}; + +static struct op_info fgetval_info = { + .op_start = fget_start, + .op_data = fget_data, + .op_result = fget_val_result, + .op_done = fget_done +}; + +static struct req_info { + int req_error; + u_int req_count; + uint32_t req_size; + uint32_t req_type; + uint32_t req_txid; + struct op_info *req_op; + int resp_error; + int resp_count; + size_t resp_size; + size_t resp_off; + struct iovec *resp_biov; +} rinfo; + +static void +fwctl_response_done(void) +{ + + (*rinfo.req_op->op_done)(rinfo.resp_biov); + + /* reinit the req data struct */ + memset(&rinfo, 0, sizeof(rinfo)); +} + +static void +fwctl_request_done(void) +{ + + rinfo.resp_error = (*rinfo.req_op->op_result)(&rinfo.resp_biov); + + /* XXX only a single vector supported at the moment */ + rinfo.resp_off = 0; + if (rinfo.resp_biov == NULL) { + rinfo.resp_size = 0; + } else { + rinfo.resp_size = rinfo.resp_biov[0].iov_len; + } +} + +static int +fwctl_request_start(void) +{ + int err; + + /* Data size doesn't include header */ + rinfo.req_size -= 12; + + rinfo.req_op = &errop_info; + if (rinfo.req_type <= OP_MAX && ops[rinfo.req_type] != NULL) + rinfo.req_op = ops[rinfo.req_type]; + + err = (*rinfo.req_op->op_start)(rinfo.req_size); + + if (err) { + errop_set(err); + rinfo.req_op = &errop_info; + } + + /* Catch case of zero-length message here */ + if (rinfo.req_size == 0) { + fwctl_request_done(); + return (1); + } + + return (0); +} + +static int +fwctl_request_data(uint32_t value) +{ + + /* Make sure remaining size is >= 0 */ + if (rinfo.req_size <= sizeof(uint32_t)) + rinfo.req_size = 0; + else + rinfo.req_size -= sizeof(uint32_t); + + (*rinfo.req_op->op_data)(value, rinfo.req_size); + + if (rinfo.req_size < sizeof(uint32_t)) { + fwctl_request_done(); + return (1); + } + + return (0); +} + +static int +fwctl_request(uint32_t value) +{ + + int ret; + + ret = 0; + + switch (rinfo.req_count) { + case 0: + /* Verify size */ + if (value < 12) { + printf("msg size error"); + exit(4); + } + rinfo.req_size = value; + rinfo.req_count = 1; + break; + case 1: + rinfo.req_type = value; + rinfo.req_count++; + break; + case 2: + rinfo.req_txid = value; + rinfo.req_count++; + ret = fwctl_request_start(); + break; + default: + ret = fwctl_request_data(value); + break; + } + + return (ret); +} + +static int +fwctl_response(uint32_t *retval) +{ + uint32_t *dp; + ssize_t remlen; + + switch(rinfo.resp_count) { + case 0: + /* 4 x u32 header len + data */ + *retval = 4*sizeof(uint32_t) + + roundup(rinfo.resp_size, 
sizeof(uint32_t)); + rinfo.resp_count++; + break; + case 1: + *retval = rinfo.req_type; + rinfo.resp_count++; + break; + case 2: + *retval = rinfo.req_txid; + rinfo.resp_count++; + break; + case 3: + *retval = rinfo.resp_error; + rinfo.resp_count++; + break; + default: + remlen = rinfo.resp_size - rinfo.resp_off; + dp = (uint32_t *) + ((uint8_t *)rinfo.resp_biov->iov_base + rinfo.resp_off); + if (remlen >= sizeof(uint32_t)) { + *retval = *dp; + } else if (remlen > 0) { + *retval = fwctl_send_rest(dp, remlen); + } + rinfo.resp_off += sizeof(uint32_t); + break; + } + + if (rinfo.resp_count > 3 && + rinfo.resp_off >= rinfo.resp_size) { + fwctl_response_done(); + return (1); + } + + return (0); +} + + +/* + * i/o port handling. + */ +static uint8_t +fwctl_inb(void) +{ + uint8_t retval; + + retval = 0xff; + + switch (be_state) { + case IDENT_SEND: + retval = sig[ident_idx++]; + if (ident_idx >= sizeof(sig)) + be_state = REQ; + break; + default: + break; + } + + return (retval); +} + +static void +fwctl_outw(uint16_t val) +{ + switch (be_state) { + case IDENT_WAIT: + if (val == 0) { + be_state = IDENT_SEND; + ident_idx = 0; + } + break; + default: + /* ignore */ + break; + } +} + +static uint32_t +fwctl_inl(void) +{ + uint32_t retval; + + switch (be_state) { + case RESP: + if (fwctl_response(&retval)) + be_state = REQ; + break; + default: + retval = 0xffffffff; + break; + } + + return (retval); +} + +static void +fwctl_outl(uint32_t val) +{ + + switch (be_state) { + case REQ: + if (fwctl_request(val)) + be_state = RESP; + default: + break; + } + +} + +static int +fwctl_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + + if (in) { + if (bytes == 1) + *eax = fwctl_inb(); + else if (bytes == 4) + *eax = fwctl_inl(); + else + *eax = 0xffff; + } else { + if (bytes == 2) + fwctl_outw(*eax); + else if (bytes == 4) + fwctl_outl(*eax); + } + + return (0); +} +INOUT_PORT(fwctl_wreg, FWCTL_OUT, IOPORT_F_INOUT, fwctl_handler); +INOUT_PORT(fwctl_rreg, FWCTL_IN, IOPORT_F_IN, fwctl_handler); + +void +fwctl_init(void) +{ + + ops[OP_GET_LEN] = &fgetlen_info; + ops[OP_GET] = &fgetval_info; + + be_state = IDENT_WAIT; +} diff --git a/usr/src/cmd/bhyve/fwctl.h b/usr/src/cmd/bhyve/fwctl.h new file mode 100644 index 0000000000..6dad244811 --- /dev/null +++ b/usr/src/cmd/bhyve/fwctl.h @@ -0,0 +1,56 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Peter Grehan + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _FWCTL_H_ +#define _FWCTL_H_ + +#include + +/* + * Linker set api for export of information to guest firmware via + * a sysctl-like OID interface + */ +struct ctl { + const char *c_oid; + const void *c_data; + const int c_len; +}; + +#define CTL_NODE(oid, data, len) \ + static struct ctl __CONCAT(__ctl, __LINE__) = { \ + oid, \ + (data), \ + (len), \ + }; \ + DATA_SET(ctl_set, __CONCAT(__ctl, __LINE__)) + +void fwctl_init(void); + +#endif /* _FWCTL_H_ */ diff --git a/usr/src/cmd/bhyve/gdb.c b/usr/src/cmd/bhyve/gdb.c new file mode 100644 index 0000000000..20c2de1dec --- /dev/null +++ b/usr/src/cmd/bhyve/gdb.c @@ -0,0 +1,1523 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2017-2018 John H. Baldwin + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bhyverun.h" +#include "mem.h" +#include "mevent.h" + +/* + * GDB_SIGNAL_* numbers are part of the GDB remote protocol. Most stops + * use SIGTRAP. + */ +#define GDB_SIGNAL_TRAP 5 + +static void gdb_resume_vcpus(void); +static void check_command(int fd); + +static struct mevent *read_event, *write_event; + +static cpuset_t vcpus_active, vcpus_suspended, vcpus_waiting; +static pthread_mutex_t gdb_lock; +static pthread_cond_t idle_vcpus; +static bool stop_pending, first_stop; +static int stepping_vcpu, stopped_vcpu; + +/* + * An I/O buffer contains 'capacity' bytes of room at 'data'. 
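+ * The 'start'/'len' pair tracks the region of the buffer that is + * currently in use.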
For a + * read buffer, 'start' is unused and 'len' contains the number of + * valid bytes in the buffer. For a write buffer, 'start' is set to + * the index of the next byte in 'data' to send, and 'len' contains + * the remaining number of valid bytes to send. + */ +struct io_buffer { + uint8_t *data; + size_t capacity; + size_t start; + size_t len; +}; + +static struct io_buffer cur_comm, cur_resp; +static uint8_t cur_csum; +static int cur_vcpu; +static struct vmctx *ctx; +static int cur_fd = -1; + +const int gdb_regset[] = { + VM_REG_GUEST_RAX, + VM_REG_GUEST_RBX, + VM_REG_GUEST_RCX, + VM_REG_GUEST_RDX, + VM_REG_GUEST_RSI, + VM_REG_GUEST_RDI, + VM_REG_GUEST_RBP, + VM_REG_GUEST_RSP, + VM_REG_GUEST_R8, + VM_REG_GUEST_R9, + VM_REG_GUEST_R10, + VM_REG_GUEST_R11, + VM_REG_GUEST_R12, + VM_REG_GUEST_R13, + VM_REG_GUEST_R14, + VM_REG_GUEST_R15, + VM_REG_GUEST_RIP, + VM_REG_GUEST_RFLAGS, + VM_REG_GUEST_CS, + VM_REG_GUEST_SS, + VM_REG_GUEST_DS, + VM_REG_GUEST_ES, + VM_REG_GUEST_FS, + VM_REG_GUEST_GS +}; + +const int gdb_regsize[] = { + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 4, + 4, + 4, + 4, + 4, + 4, + 4 +}; + +#ifdef GDB_LOG +#include +#include + +static void __printflike(1, 2) +debug(const char *fmt, ...) +{ + static FILE *logfile; + va_list ap; + + if (logfile == NULL) { + logfile = fopen("/tmp/bhyve_gdb.log", "w"); + if (logfile == NULL) + return; +#ifndef WITHOUT_CAPSICUM + if (caph_limit_stream(fileno(logfile), CAPH_WRITE) == -1) { + fclose(logfile); + logfile = NULL; + return; + } +#endif + setlinebuf(logfile); + } + va_start(ap, fmt); + vfprintf(logfile, fmt, ap); + va_end(ap); +} +#else +#define debug(...) +#endif + +static int +guest_paging_info(int vcpu, struct vm_guest_paging *paging) +{ + uint64_t regs[4]; + const int regset[4] = { + VM_REG_GUEST_CR0, + VM_REG_GUEST_CR3, + VM_REG_GUEST_CR4, + VM_REG_GUEST_EFER + }; + + if (vm_get_register_set(ctx, vcpu, nitems(regset), regset, regs) == -1) + return (-1); + + /* + * For the debugger, always pretend to be the kernel (CPL 0), + * and if long-mode is enabled, always parse addresses as if + * in 64-bit mode. + */ + paging->cr3 = regs[1]; + paging->cpl = 0; + if (regs[3] & EFER_LMA) + paging->cpu_mode = CPU_MODE_64BIT; + else if (regs[0] & CR0_PE) + paging->cpu_mode = CPU_MODE_PROTECTED; + else + paging->cpu_mode = CPU_MODE_REAL; + if (!(regs[0] & CR0_PG)) + paging->paging_mode = PAGING_MODE_FLAT; + else if (!(regs[2] & CR4_PAE)) + paging->paging_mode = PAGING_MODE_32; + else if (regs[3] & EFER_LME) + paging->paging_mode = PAGING_MODE_64; + else + paging->paging_mode = PAGING_MODE_PAE; + return (0); +} + +/* + * Map a guest virtual address to a physical address (for a given vcpu). + * If a guest virtual address is valid, return 1. If the address is + * not valid, return 0. If an error occurs obtaining the mapping, + * return -1. + */ +static int +guest_vaddr2paddr(int vcpu, uint64_t vaddr, uint64_t *paddr) +{ + struct vm_guest_paging paging; + int fault; + + if (guest_paging_info(vcpu, &paging) == -1) + return (-1); + + /* + * Always use PROT_READ. We really care if the VA is + * accessible, not if the current vCPU can write. + */ + if (vm_gla2gpa_nofault(ctx, vcpu, &paging, vaddr, PROT_READ, paddr, + &fault) == -1) + return (-1); + if (fault) + return (0); + return (1); +} + +static void +io_buffer_reset(struct io_buffer *io) +{ + + io->start = 0; + io->len = 0; +} + +/* Available room for adding data. 
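The backing store is grown on demand by io_buffer_grow().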
*/ +static size_t +io_buffer_avail(struct io_buffer *io) +{ + + return (io->capacity - (io->start + io->len)); +} + +static uint8_t * +io_buffer_head(struct io_buffer *io) +{ + + return (io->data + io->start); +} + +static uint8_t * +io_buffer_tail(struct io_buffer *io) +{ + + return (io->data + io->start + io->len); +} + +static void +io_buffer_advance(struct io_buffer *io, size_t amount) +{ + + assert(amount <= io->len); + io->start += amount; + io->len -= amount; +} + +static void +io_buffer_consume(struct io_buffer *io, size_t amount) +{ + + io_buffer_advance(io, amount); + if (io->len == 0) { + io->start = 0; + return; + } + + /* + * XXX: Consider making this move optional and compacting on a + * future read() before realloc(). + */ + memmove(io->data, io_buffer_head(io), io->len); + io->start = 0; +} + +static void +io_buffer_grow(struct io_buffer *io, size_t newsize) +{ + uint8_t *new_data; + size_t avail, new_cap; + + avail = io_buffer_avail(io); + if (newsize <= avail) + return; + + new_cap = io->capacity + (newsize - avail); + new_data = realloc(io->data, new_cap); + if (new_data == NULL) + err(1, "Failed to grow GDB I/O buffer"); + io->data = new_data; + io->capacity = new_cap; +} + +static bool +response_pending(void) +{ + + if (cur_resp.start == 0 && cur_resp.len == 0) + return (false); + if (cur_resp.start + cur_resp.len == 1 && cur_resp.data[0] == '+') + return (false); + return (true); +} + +static void +close_connection(void) +{ + + /* + * XXX: This triggers a warning because mevent does the close + * before the EV_DELETE. + */ + pthread_mutex_lock(&gdb_lock); + mevent_delete(write_event); + mevent_delete_close(read_event); + write_event = NULL; + read_event = NULL; + io_buffer_reset(&cur_comm); + io_buffer_reset(&cur_resp); + cur_fd = -1; + + /* Resume any stopped vCPUs. */ + gdb_resume_vcpus(); + pthread_mutex_unlock(&gdb_lock); +} + +static uint8_t +hex_digit(uint8_t nibble) +{ + + if (nibble <= 9) + return (nibble + '0'); + else + return (nibble + 'a' - 10); +} + +static uint8_t +parse_digit(uint8_t v) +{ + + if (v >= '0' && v <= '9') + return (v - '0'); + if (v >= 'a' && v <= 'f') + return (v - 'a' + 10); + if (v >= 'A' && v <= 'F') + return (v - 'A' + 10); + return (0xF); +} + +/* Parses big-endian hexadecimal. */ +static uintmax_t +parse_integer(const uint8_t *p, size_t len) +{ + uintmax_t v; + + v = 0; + while (len > 0) { + v <<= 4; + v |= parse_digit(*p); + p++; + len--; + } + return (v); +} + +static uint8_t +parse_byte(const uint8_t *p) +{ + + return (parse_digit(p[0]) << 4 | parse_digit(p[1])); +} + +static void +send_pending_data(int fd) +{ + ssize_t nwritten; + + if (cur_resp.len == 0) { + mevent_disable(write_event); + return; + } + nwritten = write(fd, io_buffer_head(&cur_resp), cur_resp.len); + if (nwritten == -1) { + warn("Write to GDB socket failed"); + close_connection(); + } else { + io_buffer_advance(&cur_resp, nwritten); + if (cur_resp.len == 0) + mevent_disable(write_event); + else + mevent_enable(write_event); + } +} + +/* Append a single character to the output buffer. */ +static void +send_char(uint8_t data) +{ + io_buffer_grow(&cur_resp, 1); + *io_buffer_tail(&cur_resp) = data; + cur_resp.len++; +} + +/* Append an array of bytes to the output buffer. 
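Unlike + * append_packet_data(), this does not fold the bytes into the running + * packet checksum.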
*/ +static void +send_data(const uint8_t *data, size_t len) +{ + + io_buffer_grow(&cur_resp, len); + memcpy(io_buffer_tail(&cur_resp), data, len); + cur_resp.len += len; +} + +static void +format_byte(uint8_t v, uint8_t *buf) +{ + + buf[0] = hex_digit(v >> 4); + buf[1] = hex_digit(v & 0xf); +} + +/* + * Append a single byte (formatted as two hex characters) to the + * output buffer. + */ +static void +send_byte(uint8_t v) +{ + uint8_t buf[2]; + + format_byte(v, buf); + send_data(buf, sizeof(buf)); +} + +static void +start_packet(void) +{ + + send_char('$'); + cur_csum = 0; +} + +static void +finish_packet(void) +{ + + send_char('#'); + send_byte(cur_csum); + debug("-> %.*s\n", (int)cur_resp.len, io_buffer_head(&cur_resp)); +} + +/* + * Append a single character (for the packet payload) and update the + * checksum. + */ +static void +append_char(uint8_t v) +{ + + send_char(v); + cur_csum += v; +} + +/* + * Append an array of bytes (for the packet payload) and update the + * checksum. + */ +static void +append_packet_data(const uint8_t *data, size_t len) +{ + + send_data(data, len); + while (len > 0) { + cur_csum += *data; + data++; + len--; + } +} + +static void +append_string(const char *str) +{ + +#ifdef __FreeBSD__ + append_packet_data(str, strlen(str)); +#else + append_packet_data((const uint8_t *)str, strlen(str)); +#endif +} + +static void +append_byte(uint8_t v) +{ + uint8_t buf[2]; + + format_byte(v, buf); + append_packet_data(buf, sizeof(buf)); +} + +static void +append_unsigned_native(uintmax_t value, size_t len) +{ + size_t i; + + for (i = 0; i < len; i++) { + append_byte(value); + value >>= 8; + } +} + +static void +append_unsigned_be(uintmax_t value, size_t len) +{ + char buf[len * 2]; + size_t i; + + for (i = 0; i < len; i++) { +#ifdef __FreeBSD__ + format_byte(value, buf + (len - i - 1) * 2); +#else + format_byte(value, (uint8_t *)(buf + (len - i - 1) * 2)); +#endif + value >>= 8; + } +#ifdef __FreeBSD__ + append_packet_data(buf, sizeof(buf)); +#else + append_packet_data((const uint8_t *)buf, sizeof(buf)); +#endif +} + +static void +append_integer(unsigned int value) +{ + + if (value == 0) + append_char('0'); + else + /* Round the bit count up to a whole number of bytes. */ + append_unsigned_be(value, (fls(value) + 7) / 8); +} + +static void +append_asciihex(const char *str) +{ + + while (*str != '\0') { + append_byte(*str); + str++; + } +} + +static void +send_empty_response(void) +{ + + start_packet(); + finish_packet(); +} + +static void +send_error(int error) +{ + + start_packet(); + append_char('E'); + append_byte(error); + finish_packet(); +} + +static void +send_ok(void) +{ + + start_packet(); + append_string("OK"); + finish_packet(); +} + +static int +parse_threadid(const uint8_t *data, size_t len) +{ + + if (len == 1 && *data == '0') + return (0); + if (len == 2 && memcmp(data, "-1", 2) == 0) + return (-1); + if (len == 0) + return (-2); + return (parse_integer(data, len)); +} + +static void +report_stop(void) +{ + + start_packet(); + if (stopped_vcpu == -1) + append_char('S'); + else + append_char('T'); + append_byte(GDB_SIGNAL_TRAP); + if (stopped_vcpu != -1) { + append_string("thread:"); + append_integer(stopped_vcpu + 1); + append_char(';'); + } + stopped_vcpu = -1; + finish_packet(); +} + +static void +gdb_finish_suspend_vcpus(void) +{ + + if (first_stop) { + first_stop = false; + stopped_vcpu = -1; + } else if (response_pending()) + stop_pending = true; + else { + report_stop(); + send_pending_data(cur_fd); + } +} + +static void +_gdb_cpu_suspend(int vcpu, bool report_stop) +{ + + debug("$vCPU %d suspending\n", vcpu); + 
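/* + * Mark this vCPU as waiting. When the set of waiting vCPUs matches + * the set being suspended (and a stop report was requested), the stop + * is reported to the debugger; each vCPU then sleeps on the condition + * variable until it is resumed or selected for single-stepping. + */ + 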
CPU_SET(vcpu, &vcpus_waiting); + if (report_stop && CPU_CMP(&vcpus_waiting, &vcpus_suspended) == 0) + gdb_finish_suspend_vcpus(); + while (CPU_ISSET(vcpu, &vcpus_suspended) && vcpu != stepping_vcpu) + pthread_cond_wait(&idle_vcpus, &gdb_lock); + CPU_CLR(vcpu, &vcpus_waiting); + debug("$vCPU %d resuming\n", vcpu); +} + +void +gdb_cpu_add(int vcpu) +{ + + debug("$vCPU %d starting\n", vcpu); + pthread_mutex_lock(&gdb_lock); + CPU_SET(vcpu, &vcpus_active); + + /* + * If a vcpu is added while vcpus are stopped, suspend the new + * vcpu so that it will pop back out with a debug exit before + * executing the first instruction. + */ + if (!CPU_EMPTY(&vcpus_suspended)) { + CPU_SET(vcpu, &vcpus_suspended); + _gdb_cpu_suspend(vcpu, false); + } + pthread_mutex_unlock(&gdb_lock); +} + +void +gdb_cpu_suspend(int vcpu) +{ + + pthread_mutex_lock(&gdb_lock); + _gdb_cpu_suspend(vcpu, true); + pthread_mutex_unlock(&gdb_lock); +} + +void +gdb_cpu_mtrap(int vcpu) +{ + + debug("$vCPU %d MTRAP\n", vcpu); + pthread_mutex_lock(&gdb_lock); + if (vcpu == stepping_vcpu) { + stepping_vcpu = -1; + vm_set_capability(ctx, vcpu, VM_CAP_MTRAP_EXIT, 0); + vm_suspend_cpu(ctx, vcpu); + assert(stopped_vcpu == -1); + stopped_vcpu = vcpu; + _gdb_cpu_suspend(vcpu, true); + } + pthread_mutex_unlock(&gdb_lock); +} + +static void +gdb_suspend_vcpus(void) +{ + + assert(pthread_mutex_isowned_np(&gdb_lock)); + debug("suspending all CPUs\n"); + vcpus_suspended = vcpus_active; + vm_suspend_cpu(ctx, -1); + if (CPU_CMP(&vcpus_waiting, &vcpus_suspended) == 0) + gdb_finish_suspend_vcpus(); +} + +static bool +gdb_step_vcpu(int vcpu) +{ + int error, val; + + debug("$vCPU %d step\n", vcpu); + error = vm_get_capability(ctx, vcpu, VM_CAP_MTRAP_EXIT, &val); + if (error < 0) + return (false); + error = vm_set_capability(ctx, vcpu, VM_CAP_MTRAP_EXIT, 1); + vm_resume_cpu(ctx, vcpu); + stepping_vcpu = vcpu; + pthread_cond_broadcast(&idle_vcpus); + return (true); +} + +static void +gdb_resume_vcpus(void) +{ + + assert(pthread_mutex_isowned_np(&gdb_lock)); + vm_resume_cpu(ctx, -1); + debug("resuming all CPUs\n"); + CPU_ZERO(&vcpus_suspended); + pthread_cond_broadcast(&idle_vcpus); +} + +static void +gdb_read_regs(void) +{ + uint64_t regvals[nitems(gdb_regset)]; + int i; + + if (vm_get_register_set(ctx, cur_vcpu, nitems(gdb_regset), + gdb_regset, regvals) == -1) { + send_error(errno); + return; + } + start_packet(); + for (i = 0; i < nitems(regvals); i++) + append_unsigned_native(regvals[i], gdb_regsize[i]); + finish_packet(); +} + +static void +gdb_read_mem(const uint8_t *data, size_t len) +{ + uint64_t gpa, gva, val; + uint8_t *cp; + size_t resid, todo, bytes; + bool started; + int error; + + /* Skip 'm' */ + data += 1; + len -= 1; + + /* Parse and consume address. */ + cp = memchr(data, ',', len); + if (cp == NULL || cp == data) { + send_error(EINVAL); + return; + } + gva = parse_integer(data, cp - data); + len -= (cp - data) + 1; + data += (cp - data) + 1; + + /* Parse length. */ + resid = parse_integer(data, len); + + started = false; + while (resid > 0) { + error = guest_vaddr2paddr(cur_vcpu, gva, &gpa); + if (error == -1) { + if (started) + finish_packet(); + else + send_error(errno); + return; + } + if (error == 0) { + if (started) + finish_packet(); + else + send_error(EFAULT); + return; + } + + /* Read bytes from current page. */ + todo = getpagesize() - gpa % getpagesize(); + if (todo > resid) + todo = resid; + + cp = paddr_guest2host(ctx, gpa, todo); + if (cp != NULL) { + /* + * If this page is guest RAM, read it a byte + * at a time. 
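Each byte is + * appended as two hex digits, the encoding GDB expects in the + * reply to an 'm' (read memory) packet.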
+ */ + if (!started) { + start_packet(); + started = true; + } + while (todo > 0) { + append_byte(*cp); + cp++; + gpa++; + gva++; + resid--; + todo--; + } + } else { + /* + * If this page isn't guest RAM, try to handle + * it via MMIO. For MMIO requests, use + * aligned reads of words when possible. + */ + while (todo > 0) { + if (gpa & 1 || todo == 1) + bytes = 1; + else if (gpa & 2 || todo == 2) + bytes = 2; + else + bytes = 4; + error = read_mem(ctx, cur_vcpu, gpa, &val, + bytes); + if (error == 0) { + if (!started) { + start_packet(); + started = true; + } + gpa += bytes; + gva += bytes; + resid -= bytes; + todo -= bytes; + while (bytes > 0) { + append_byte(val); + val >>= 8; + bytes--; + } + } else { + if (started) + finish_packet(); + else + send_error(EFAULT); + return; + } + } + } + assert(resid == 0 || gpa % getpagesize() == 0); + } + if (!started) + start_packet(); + finish_packet(); +} + +static void +gdb_write_mem(const uint8_t *data, size_t len) +{ + uint64_t gpa, gva, val; + uint8_t *cp; + size_t resid, todo, bytes; + int error; + + /* Skip 'M' */ + data += 1; + len -= 1; + + /* Parse and consume address. */ + cp = memchr(data, ',', len); + if (cp == NULL || cp == data) { + send_error(EINVAL); + return; + } + gva = parse_integer(data, cp - data); + len -= (cp - data) + 1; + data += (cp - data) + 1; + + /* Parse and consume length. */ + cp = memchr(data, ':', len); + if (cp == NULL || cp == data) { + send_error(EINVAL); + return; + } + resid = parse_integer(data, cp - data); + len -= (cp - data) + 1; + data += (cp - data) + 1; + + /* Verify the available bytes match the length. */ + if (len != resid * 2) { + send_error(EINVAL); + return; + } + + while (resid > 0) { + error = guest_vaddr2paddr(cur_vcpu, gva, &gpa); + if (error == -1) { + send_error(errno); + return; + } + if (error == 0) { + send_error(EFAULT); + return; + } + + /* Write bytes to current page. */ + todo = getpagesize() - gpa % getpagesize(); + if (todo > resid) + todo = resid; + + cp = paddr_guest2host(ctx, gpa, todo); + if (cp != NULL) { + /* + * If this page is guest RAM, write it a byte + * at a time. + */ + while (todo > 0) { + assert(len >= 2); + *cp = parse_byte(data); + data += 2; + len -= 2; + cp++; + gpa++; + gva++; + resid--; + todo--; + } + } else { + /* + * If this page isn't guest RAM, try to handle + * it via MMIO. For MMIO requests, use + * aligned writes of words when possible. 
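The 'M' packet + * carries its payload as hex digit pairs in target byte order, so + * multi-byte values are assembled little-endian before being + * passed to write_mem().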
+ */ + while (todo > 0) { + if (gpa & 1 || todo == 1) { + bytes = 1; + val = parse_byte(data); + } else if (gpa & 2 || todo == 2) { + bytes = 2; + val = parse_byte(data) | + (parse_byte(data + 2) << 8); + } else { + bytes = 4; + val = parse_byte(data) | + (parse_byte(data + 2) << 8) | + (parse_byte(data + 4) << 16) | + (parse_byte(data + 6) << 24); + } + error = write_mem(ctx, cur_vcpu, gpa, val, + bytes); + if (error == 0) { + gpa += bytes; + gva += bytes; + resid -= bytes; + todo -= bytes; + data += 2 * bytes; + len -= 2 * bytes; + } else { + send_error(EFAULT); + return; + } + } + } + assert(resid == 0 || gpa % getpagesize() == 0); + } + assert(len == 0); + send_ok(); +} + +static bool +command_equals(const uint8_t *data, size_t len, const char *cmd) +{ + + if (strlen(cmd) > len) + return (false); + return (memcmp(data, cmd, strlen(cmd)) == 0); +} + +static void +check_features(const uint8_t *data, size_t len) +{ + char *feature, *next_feature, *str, *value; + bool supported; + + str = malloc(len + 1); + memcpy(str, data, len); + str[len] = '\0'; + next_feature = str; + + while ((feature = strsep(&next_feature, ";")) != NULL) { + /* + * Null features shouldn't exist, but skip if they + * do. + */ + if (strcmp(feature, "") == 0) + continue; + + /* + * Look for the value or supported / not supported + * flag. + */ + value = strchr(feature, '='); + if (value != NULL) { + *value = '\0'; + value++; + supported = true; + } else { + value = feature + strlen(feature) - 1; + switch (*value) { + case '+': + supported = true; + break; + case '-': + supported = false; + break; + default: + /* + * This is really a protocol error, + * but we just ignore malformed + * features for ease of + * implementation. + */ + continue; + } + value = NULL; + } + + /* No currently supported features. */ +#ifndef __FreeBSD__ + /* + * The compiler dislikes 'supported' being set but never used. + * Make it happy here. + */ + if (supported) { + debug("feature '%s' supported\n", feature); + } +#endif /* __FreeBSD__ */ + } + free(str); + + start_packet(); + + /* This is an arbitrary limit. 
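It only bounds a single + * remote protocol packet; the debugger splits larger transfers + * across multiple packets.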
*/ + append_string("PacketSize=4096"); + finish_packet(); +} + +static void +gdb_query(const uint8_t *data, size_t len) +{ + + /* + * TODO: + * - qSearch + */ + if (command_equals(data, len, "qAttached")) { + start_packet(); + append_char('1'); + finish_packet(); + } else if (command_equals(data, len, "qC")) { + start_packet(); + append_string("QC"); + append_integer(cur_vcpu + 1); + finish_packet(); + } else if (command_equals(data, len, "qfThreadInfo")) { + cpuset_t mask; + bool first; + int vcpu; + + if (CPU_EMPTY(&vcpus_active)) { + send_error(EINVAL); + return; + } + mask = vcpus_active; + start_packet(); + append_char('m'); + first = true; + while (!CPU_EMPTY(&mask)) { + vcpu = CPU_FFS(&mask) - 1; + CPU_CLR(vcpu, &mask); + if (first) + first = false; + else + append_char(','); + append_integer(vcpu + 1); + } + finish_packet(); + } else if (command_equals(data, len, "qsThreadInfo")) { + start_packet(); + append_char('l'); + finish_packet(); + } else if (command_equals(data, len, "qSupported")) { + data += strlen("qSupported"); + len -= strlen("qSupported"); + check_features(data, len); + } else if (command_equals(data, len, "qThreadExtraInfo")) { + char buf[16]; + int tid; + + data += strlen("qThreadExtraInfo"); + len -= strlen("qThreadExtraInfo"); + if (*data != ',') { + send_error(EINVAL); + return; + } + tid = parse_threadid(data + 1, len - 1); + if (tid <= 0 || !CPU_ISSET(tid - 1, &vcpus_active)) { + send_error(EINVAL); + return; + } + + snprintf(buf, sizeof(buf), "vCPU %d", tid - 1); + start_packet(); + append_asciihex(buf); + finish_packet(); + } else + send_empty_response(); +} + +static void +handle_command(const uint8_t *data, size_t len) +{ + + /* Reject packets with a sequence-id. */ + if (len >= 3 && data[0] >= '0' && data[0] <= '9' && + data[0] >= '0' && data[0] <= '9' && data[2] == ':') { + send_empty_response(); + return; + } + + switch (*data) { + case 'c': + if (len != 1) { + send_error(EINVAL); + break; + } + + /* Don't send a reply until a stop occurs. */ + gdb_resume_vcpus(); + break; + case 'D': + send_ok(); + + /* TODO: Resume any stopped CPUs. */ + break; + case 'g': { + gdb_read_regs(); + break; + } + case 'H': { + int tid; + + if (data[1] != 'g' && data[1] != 'c') { + send_error(EINVAL); + break; + } + tid = parse_threadid(data + 2, len - 2); + if (tid == -2) { + send_error(EINVAL); + break; + } + + if (CPU_EMPTY(&vcpus_active)) { + send_error(EINVAL); + break; + } + if (tid == -1 || tid == 0) + cur_vcpu = CPU_FFS(&vcpus_active) - 1; + else if (CPU_ISSET(tid - 1, &vcpus_active)) + cur_vcpu = tid - 1; + else { + send_error(EINVAL); + break; + } + send_ok(); + break; + } + case 'm': + gdb_read_mem(data, len); + break; + case 'M': + gdb_write_mem(data, len); + break; + case 'T': { + int tid; + + tid = parse_threadid(data + 1, len - 1); + if (tid <= 0 || !CPU_ISSET(tid - 1, &vcpus_active)) { + send_error(EINVAL); + return; + } + send_ok(); + break; + } + case 'q': + gdb_query(data, len); + break; + case 's': + if (len != 1) { + send_error(EINVAL); + break; + } + + /* Don't send a reply until a stop occurs. */ + if (!gdb_step_vcpu(cur_vcpu)) { + send_error(EOPNOTSUPP); + break; + } + break; + case '?': + /* XXX: Only if stopped? */ + /* For now, just report that we are always stopped. 
*/ + start_packet(); + append_char('S'); + append_byte(GDB_SIGNAL_TRAP); + finish_packet(); + break; + case 'G': /* TODO */ + case 'v': + /* Handle 'vCont' */ + /* 'vCtrlC' */ + case 'p': /* TODO */ + case 'P': /* TODO */ + case 'Q': /* TODO */ + case 't': /* TODO */ + case 'X': /* TODO */ + case 'z': /* TODO */ + case 'Z': /* TODO */ + default: + send_empty_response(); + } +} + +/* Check for a valid packet in the command buffer. */ +static void +check_command(int fd) +{ + uint8_t *head, *hash, *p, sum; + size_t avail, plen; + + for (;;) { + avail = cur_comm.len; + if (avail == 0) + return; + head = io_buffer_head(&cur_comm); + switch (*head) { + case 0x03: + debug("<- Ctrl-C\n"); + io_buffer_consume(&cur_comm, 1); + + gdb_suspend_vcpus(); + break; + case '+': + /* ACK of previous response. */ + debug("<- +\n"); + if (response_pending()) + io_buffer_reset(&cur_resp); + io_buffer_consume(&cur_comm, 1); + if (stop_pending) { + stop_pending = false; + report_stop(); + send_pending_data(fd); + } + break; + case '-': + /* NACK of previous response. */ + debug("<- -\n"); + if (response_pending()) { + cur_resp.len += cur_resp.start; + cur_resp.start = 0; + if (cur_resp.data[0] == '+') + io_buffer_advance(&cur_resp, 1); + debug("-> %.*s\n", (int)cur_resp.len, + io_buffer_head(&cur_resp)); + } + io_buffer_consume(&cur_comm, 1); + send_pending_data(fd); + break; + case '$': + /* Packet. */ + + if (response_pending()) { + warnx("New GDB command while response in " + "progress"); + io_buffer_reset(&cur_resp); + } + + /* Is packet complete? */ + hash = memchr(head, '#', avail); + if (hash == NULL) + return; + plen = (hash - head + 1) + 2; + if (avail < plen) + return; + debug("<- %.*s\n", (int)plen, head); + + /* Verify checksum. */ + for (sum = 0, p = head + 1; p < hash; p++) + sum += *p; + if (sum != parse_byte(hash + 1)) { + io_buffer_consume(&cur_comm, plen); + debug("-> -\n"); + send_char('-'); + send_pending_data(fd); + break; + } + send_char('+'); + + handle_command(head + 1, hash - (head + 1)); + io_buffer_consume(&cur_comm, plen); + if (!response_pending()) { + debug("-> +\n"); + } + send_pending_data(fd); + break; + default: + /* XXX: Possibly drop connection instead. */ + debug("-> %02x\n", *head); + io_buffer_consume(&cur_comm, 1); + break; + } + } +} + +static void +gdb_readable(int fd, enum ev_type event, void *arg) +{ + ssize_t nread; + int pending; + + if (ioctl(fd, FIONREAD, &pending) == -1) { + warn("FIONREAD on GDB socket"); + return; + } + + /* + * 'pending' might be zero due to EOF. We need to call read + * with a non-zero length to detect EOF. + */ + if (pending == 0) + pending = 1; + + /* Ensure there is room in the command buffer. */ + io_buffer_grow(&cur_comm, pending); + assert(io_buffer_avail(&cur_comm) >= pending); + + nread = read(fd, io_buffer_tail(&cur_comm), io_buffer_avail(&cur_comm)); + if (nread == 0) { + close_connection(); + } else if (nread == -1) { + if (errno == EAGAIN) + return; + + warn("Read from GDB socket"); + close_connection(); + } else { + cur_comm.len += nread; + pthread_mutex_lock(&gdb_lock); + check_command(fd); + pthread_mutex_unlock(&gdb_lock); + } +} + +static void +gdb_writable(int fd, enum ev_type event, void *arg) +{ + + send_pending_data(fd); +} + +static void +new_connection(int fd, enum ev_type event, void *arg) +{ + int optval, s; + + s = accept4(fd, NULL, NULL, SOCK_NONBLOCK); + if (s == -1) { + if (arg != NULL) + err(1, "Failed accepting initial GDB connection"); + + /* Silently ignore errors post-startup. 
*/ + return; + } + + optval = 1; + if (setsockopt(s, SOL_SOCKET, SO_NOSIGPIPE, &optval, sizeof(optval)) == + -1) { + warn("Failed to disable SIGPIPE for GDB connection"); + close(s); + return; + } + + pthread_mutex_lock(&gdb_lock); + if (cur_fd != -1) { + close(s); + warnx("Ignoring additional GDB connection."); + } + + read_event = mevent_add(s, EVF_READ, gdb_readable, NULL); + if (read_event == NULL) { + if (arg != NULL) + err(1, "Failed to setup initial GDB connection"); + pthread_mutex_unlock(&gdb_lock); + return; + } + write_event = mevent_add(s, EVF_WRITE, gdb_writable, NULL); + if (write_event == NULL) { + if (arg != NULL) + err(1, "Failed to setup initial GDB connection"); + mevent_delete_close(read_event); + read_event = NULL; + } + + cur_fd = s; + cur_vcpu = 0; + stepping_vcpu = -1; + stopped_vcpu = -1; + stop_pending = false; + + /* Break on attach. */ + first_stop = true; + gdb_suspend_vcpus(); + pthread_mutex_unlock(&gdb_lock); +} + +#ifndef WITHOUT_CAPSICUM +void +limit_gdb_socket(int s) +{ + cap_rights_t rights; + unsigned long ioctls[] = { FIONREAD }; + + cap_rights_init(&rights, CAP_ACCEPT, CAP_EVENT, CAP_READ, CAP_WRITE, + CAP_SETSOCKOPT, CAP_IOCTL); + if (caph_rights_limit(s, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + if (caph_ioctls_limit(s, ioctls, nitems(ioctls)) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +} +#endif + +void +init_gdb(struct vmctx *_ctx, int sport, bool wait) +{ + struct sockaddr_in sin; + int error, flags, s; + + debug("==> starting on %d, %swaiting\n", sport, wait ? "" : "not "); + + error = pthread_mutex_init(&gdb_lock, NULL); + if (error != 0) + errc(1, error, "gdb mutex init"); + error = pthread_cond_init(&idle_vcpus, NULL); + if (error != 0) + errc(1, error, "gdb cv init"); + + ctx = _ctx; + s = socket(PF_INET, SOCK_STREAM, 0); + if (s < 0) + err(1, "gdb socket create"); + +#ifdef __FreeBSD__ + sin.sin_len = sizeof(sin); +#endif + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(sport); + + if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) < 0) + err(1, "gdb socket bind"); + + if (listen(s, 1) < 0) + err(1, "gdb socket listen"); + + if (wait) { + /* + * Set vcpu 0 in vcpus_suspended. This will trigger the + * logic in gdb_cpu_add() to suspend the first vcpu before + * it starts execution. The vcpu will remain suspended + * until a debugger connects. + */ + stepping_vcpu = -1; + stopped_vcpu = -1; + CPU_SET(0, &vcpus_suspended); + } + + flags = fcntl(s, F_GETFL); + if (fcntl(s, F_SETFL, flags | O_NONBLOCK) == -1) + err(1, "Failed to mark gdb socket non-blocking"); + +#ifndef WITHOUT_CAPSICUM + limit_gdb_socket(s); +#endif + mevent_add(s, EVF_READ, new_connection, NULL); +} diff --git a/usr/src/cmd/bhyve/gdb.h b/usr/src/cmd/bhyve/gdb.h new file mode 100644 index 0000000000..09ebc34f24 --- /dev/null +++ b/usr/src/cmd/bhyve/gdb.h @@ -0,0 +1,38 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2017 John H. Baldwin + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef __GDB_H__ +#define __GDB_H__ + +void gdb_cpu_add(int vcpu); +void gdb_cpu_mtrap(int vcpu); +void gdb_cpu_suspend(int vcpu); +void init_gdb(struct vmctx *ctx, int sport, bool wait); + +#endif /* !__GDB_H__ */ diff --git a/usr/src/cmd/bhyve/inout.c b/usr/src/cmd/bhyve/inout.c index 510649893a..b460ee2988 100644 --- a/usr/src/cmd/bhyve/inout.c +++ b/usr/src/cmd/bhyve/inout.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,11 +25,11 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/inout.c 277310 2015-01-18 03:08:30Z neel $ + * $FreeBSD$ */ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/inout.c 277310 2015-01-18 03:08:30Z neel $"); +__FBSDID("$FreeBSD$"); #include #include @@ -66,21 +68,21 @@ static int default_inout(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { - if (in) { - switch (bytes) { - case 4: - *eax = 0xffffffff; - break; - case 2: - *eax = 0xffff; - break; - case 1: - *eax = 0xff; - break; - } - } - - return (0); + if (in) { + switch (bytes) { + case 4: + *eax = 0xffffffff; + break; + case 2: + *eax = 0xffff; + break; + case 1: + *eax = 0xff; + break; + } + } + + return (0); } static void @@ -107,7 +109,7 @@ emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict) uint32_t eax, val; inout_func_t handler; void *arg; - int error, retval; + int error, fault, retval; enum vm_reg_name idxreg; uint64_t gla, index, iterations, count; struct vm_inout_str *vis; @@ -163,11 +165,11 @@ emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict) } error = vm_copy_setup(ctx, vcpu, &vis->paging, gla, - bytes, prot, iov, nitems(iov)); - if (error == -1) { + bytes, prot, iov, nitems(iov), &fault); + if (error) { retval = -1; /* Unrecoverable error */ break; - } else if (error == 1) { + } else if (fault) { retval = 0; /* Resume guest to handle fault */ break; } diff --git a/usr/src/cmd/bhyve/inout.h b/usr/src/cmd/bhyve/inout.h index 0d4046bd61..b72ee5d93e 100644 --- a/usr/src/cmd/bhyve/inout.h +++ b/usr/src/cmd/bhyve/inout.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $FreeBSD: head/usr.sbin/bhyve/inout.h 269094 2014-07-25 20:18:35Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the diff --git a/usr/src/cmd/bhyve/ioapic.c b/usr/src/cmd/bhyve/ioapic.c index 86ff5c6580..acdbb5111b 100644 --- a/usr/src/cmd/bhyve/ioapic.c +++ b/usr/src/cmd/bhyve/ioapic.c @@ -1,5 +1,7 @@ /*- - * Copyright (c) 2014 Advanced Computing Technologies LLC + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Hudson River Trading LLC * Written by: John H. Baldwin * All rights reserved. * @@ -26,14 +28,17 @@ */ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/ioapic.c 261268 2014-01-29 14:56:48Z jhb $"); +__FBSDID("$FreeBSD$"); #include +#include #include #include #include "ioapic.h" +#include "pci_emul.h" +#include "pci_lpc.h" /* * Assign PCI INTx interrupts to I/O APIC pins in a round-robin @@ -64,11 +69,15 @@ ioapic_init(struct vmctx *ctx) } int -ioapic_pci_alloc_irq(void) +ioapic_pci_alloc_irq(struct pci_devinst *pi) { static int last_pin; if (pci_pins == 0) return (-1); + if (lpc_bootrom()) { + /* For external bootrom use fixed mapping. */ + return (16 + (4 + pi->pi_slot + pi->pi_lintr.pin) % 8); + } return (16 + (last_pin++ % pci_pins)); } diff --git a/usr/src/cmd/bhyve/ioapic.h b/usr/src/cmd/bhyve/ioapic.h index 789f90fea9..3a7fa76192 100644 --- a/usr/src/cmd/bhyve/ioapic.h +++ b/usr/src/cmd/bhyve/ioapic.h @@ -1,5 +1,7 @@ /*- - * Copyright (c) 2014 Advanced Computing Technologies LLC + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Hudson River Trading LLC * Written by: John H. Baldwin * All rights reserved. * @@ -24,16 +26,18 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/ioapic.h 261268 2014-01-29 14:56:48Z jhb $ + * $FreeBSD$ */ #ifndef _IOAPIC_H_ #define _IOAPIC_H_ +struct pci_devinst; + /* * Allocate a PCI IRQ from the I/O APIC. */ void ioapic_init(struct vmctx *ctx); -int ioapic_pci_alloc_irq(void); +int ioapic_pci_alloc_irq(struct pci_devinst *pi); #endif diff --git a/usr/src/cmd/bhyve/iov.c b/usr/src/cmd/bhyve/iov.c new file mode 100644 index 0000000000..54ea22aa94 --- /dev/null +++ b/usr/src/cmd/bhyve/iov.c @@ -0,0 +1,148 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 Jakub Klama . + * Copyright (c) 2018 Alexander Motin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include +#include +#include "iov.h" + +void +seek_iov(const struct iovec *iov1, int niov1, struct iovec *iov2, int *niov2, + size_t seek) +{ + size_t remainder = 0; + size_t left = seek; + int i, j; + + for (i = 0; i < niov1; i++) { + size_t toseek = MIN(left, iov1[i].iov_len); + left -= toseek; + + if (toseek == iov1[i].iov_len) + continue; + + if (left == 0) { + remainder = toseek; + break; + } + } + + for (j = i; j < niov1; j++) { + iov2[j - i].iov_base = (char *)iov1[j].iov_base + remainder; + iov2[j - i].iov_len = iov1[j].iov_len - remainder; + remainder = 0; + } + + *niov2 = j - i; +} + +size_t +count_iov(const struct iovec *iov, int niov) +{ + size_t total = 0; + int i; + + for (i = 0; i < niov; i++) + total += iov[i].iov_len; + + return (total); +} + +void +truncate_iov(struct iovec *iov, int *niov, size_t length) +{ + size_t done = 0; + int i; + + for (i = 0; i < *niov; i++) { + size_t toseek = MIN(length - done, iov[i].iov_len); + done += toseek; + + if (toseek <= iov[i].iov_len) { + iov[i].iov_len = toseek; + *niov = i + 1; + return; + } + } +} + +ssize_t +iov_to_buf(const struct iovec *iov, int niov, void **buf) +{ + size_t ptr, total; + int i; + + total = count_iov(iov, niov); + *buf = realloc(*buf, total); + if (*buf == NULL) + return (-1); + + for (i = 0, ptr = 0; i < niov; i++) { + memcpy(*buf + ptr, iov[i].iov_base, iov[i].iov_len); + ptr += iov[i].iov_len; + } + + return (total); +} + +ssize_t +buf_to_iov(const void *buf, size_t buflen, struct iovec *iov, int niov, + size_t seek) +{ + struct iovec *diov; + int ndiov, i; + size_t off = 0, len; + + if (seek > 0) { + diov = malloc(sizeof(struct iovec) * niov); + seek_iov(iov, niov, diov, &ndiov, seek); + } else { + diov = iov; + ndiov = niov; + } + + for (i = 0; i < ndiov && off < buflen; i++) { + len = MIN(diov[i].iov_len, buflen - off); + memcpy(diov[i].iov_base, buf + off, len); + off += len; + } + + if (seek > 0) + free(diov); + + return ((ssize_t)off); +} + diff --git a/usr/src/cmd/bhyve/iov.h b/usr/src/cmd/bhyve/iov.h new file mode 100644 index 0000000000..e3b5916edb --- /dev/null +++ b/usr/src/cmd/bhyve/iov.h @@ -0,0 +1,44 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 Jakub Klama . + * Copyright (c) 2018 Alexander Motin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IOV_H_ +#define _IOV_H_ + +void seek_iov(const struct iovec *iov1, int niov1, struct iovec *iov2, + int *niov2, size_t seek); +void truncate_iov(struct iovec *iov, int *niov, size_t length); +size_t count_iov(const struct iovec *iov, int niov); +ssize_t iov_to_buf(const struct iovec *iov, int niov, void **buf); +ssize_t buf_to_iov(const void *buf, size_t buflen, struct iovec *iov, int niov, + size_t seek); + +#endif /* _IOV_H_ */ diff --git a/usr/src/cmd/bhyve/mem.c b/usr/src/cmd/bhyve/mem.c index a153a8e960..90aefe45c8 100644 --- a/usr/src/cmd/bhyve/mem.c +++ b/usr/src/cmd/bhyve/mem.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/mem.c 269700 2014-08-08 03:49:01Z neel $ + * $FreeBSD$ */ /* @@ -33,18 +35,19 @@ */ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/mem.c 269700 2014-08-08 03:49:01Z neel $"); +__FBSDID("$FreeBSD$"); #include -#include #include +#include #include #include -#include -#include #include +#include #include +#include +#include #include "mem.h" @@ -121,6 +124,7 @@ mmio_rb_add(struct mmio_rb_tree *rbt, struct mmio_rb_range *new) static void mmio_rb_dump(struct mmio_rb_tree *rbt) { + int perror; struct mmio_rb_range *np; pthread_rwlock_rdlock(&mmio_rwlock); @@ -128,12 +132,16 @@ mmio_rb_dump(struct mmio_rb_tree *rbt) printf(" %lx:%lx, %s\n", np->mr_base, np->mr_end, np->mr_param.name); } - pthread_rwlock_unlock(&mmio_rwlock); + perror = pthread_rwlock_unlock(&mmio_rwlock); + assert(perror == 0); } #endif RB_GENERATE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare); +typedef int (mem_cb_t)(struct vmctx *ctx, int vcpu, uint64_t gpa, + struct mem_range *mr, void *arg); + static int mem_read(void *ctx, int vcpu, uint64_t gpa, uint64_t *rval, int size, void *arg) { @@ -156,13 +164,12 @@ mem_write(void *ctx, int vcpu, uint64_t gpa, uint64_t wval, int size, void *arg) return (error); } -int -emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie, - struct vm_guest_paging *paging) - +static int +access_memory(struct vmctx *ctx, int vcpu, uint64_t paddr, mem_cb_t *cb, + void *arg) { struct mmio_rb_range *entry; - int err, immutable; + int err, perror, immutable; pthread_rwlock_rdlock(&mmio_rwlock); /* @@ -180,7 +187,8 @@ emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie, /* Update the per-vCPU cache */ mmio_hint[vcpu] = entry; } else if (mmio_rb_lookup(&mmio_rb_fallback, paddr, &entry)) { - pthread_rwlock_unlock(&mmio_rwlock); + perror = pthread_rwlock_unlock(&mmio_rwlock); + assert(perror == 0); 
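+ /* No registered memory range covers this address. */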
return (ESRCH); } } @@ -199,40 +207,114 @@ emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie, * config space window as 'immutable' the deadlock can be avoided. */ immutable = (entry->mr_param.flags & MEM_F_IMMUTABLE); - if (immutable) - pthread_rwlock_unlock(&mmio_rwlock); + if (immutable) { + perror = pthread_rwlock_unlock(&mmio_rwlock); + assert(perror == 0); + } - err = vmm_emulate_instruction(ctx, vcpu, paddr, vie, paging, - mem_read, mem_write, &entry->mr_param); + err = cb(ctx, vcpu, paddr, &entry->mr_param, arg); + + if (!immutable) { + perror = pthread_rwlock_unlock(&mmio_rwlock); + assert(perror == 0); + } - if (!immutable) - pthread_rwlock_unlock(&mmio_rwlock); return (err); } +struct emulate_mem_args { + struct vie *vie; + struct vm_guest_paging *paging; +}; + +static int +emulate_mem_cb(struct vmctx *ctx, int vcpu, uint64_t paddr, struct mem_range *mr, + void *arg) +{ + struct emulate_mem_args *ema; + + ema = arg; + return (vmm_emulate_instruction(ctx, vcpu, paddr, ema->vie, ema->paging, + mem_read, mem_write, mr)); +} + +int +emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie, + struct vm_guest_paging *paging) + +{ + struct emulate_mem_args ema; + + ema.vie = vie; + ema.paging = paging; + return (access_memory(ctx, vcpu, paddr, emulate_mem_cb, &ema)); +} + +struct rw_mem_args { + uint64_t *val; + int size; + int operation; +}; + +static int +rw_mem_cb(struct vmctx *ctx, int vcpu, uint64_t paddr, struct mem_range *mr, + void *arg) +{ + struct rw_mem_args *rma; + + rma = arg; + return (mr->handler(ctx, vcpu, rma->operation, paddr, rma->size, + rma->val, mr->arg1, mr->arg2)); +} + +int +read_mem(struct vmctx *ctx, int vcpu, uint64_t gpa, uint64_t *rval, int size) +{ + struct rw_mem_args rma; + + rma.val = rval; + rma.size = size; + rma.operation = MEM_F_READ; + return (access_memory(ctx, vcpu, gpa, rw_mem_cb, &rma)); +} + +int +write_mem(struct vmctx *ctx, int vcpu, uint64_t gpa, uint64_t wval, int size) +{ + struct rw_mem_args rma; + + rma.val = &wval; + rma.size = size; + rma.operation = MEM_F_WRITE; + return (access_memory(ctx, vcpu, gpa, rw_mem_cb, &rma)); +} + static int register_mem_int(struct mmio_rb_tree *rbt, struct mem_range *memp) { struct mmio_rb_range *entry, *mrp; - int err; + int err, perror; err = 0; mrp = malloc(sizeof(struct mmio_rb_range)); - - if (mrp != NULL) { + if (mrp == NULL) { + warn("%s: couldn't allocate memory for mrp\n", + __func__); + err = ENOMEM; + } else { mrp->mr_param = *memp; mrp->mr_base = memp->base; mrp->mr_end = memp->base + memp->size - 1; pthread_rwlock_wrlock(&mmio_rwlock); if (mmio_rb_lookup(rbt, memp->base, &entry) != 0) err = mmio_rb_add(rbt, mrp); - pthread_rwlock_unlock(&mmio_rwlock); + perror = pthread_rwlock_unlock(&mmio_rwlock); + assert(perror == 0); if (err) free(mrp); - } else - err = ENOMEM; + } return (err); } @@ -256,7 +338,7 @@ unregister_mem(struct mem_range *memp) { struct mem_range *mr; struct mmio_rb_range *entry = NULL; - int err, i; + int err, perror, i; pthread_rwlock_wrlock(&mmio_rwlock); err = mmio_rb_lookup(&mmio_rb_root, memp->base, &entry); @@ -273,7 +355,8 @@ unregister_mem(struct mem_range *memp) mmio_hint[i] = NULL; } } - pthread_rwlock_unlock(&mmio_rwlock); + perror = pthread_rwlock_unlock(&mmio_rwlock); + assert(perror == 0); if (entry) free(entry); diff --git a/usr/src/cmd/bhyve/mem.h b/usr/src/cmd/bhyve/mem.h index 09cf56b72e..38d773c43f 100644 --- a/usr/src/cmd/bhyve/mem.h +++ b/usr/src/cmd/bhyve/mem.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: 
BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/mem.h 269700 2014-08-08 03:49:01Z neel $ + * $FreeBSD$ */ #ifndef _MEM_H_ @@ -53,9 +55,13 @@ struct mem_range { void init_mem(void); int emulate_mem(struct vmctx *, int vcpu, uint64_t paddr, struct vie *vie, struct vm_guest_paging *paging); - + +int read_mem(struct vmctx *ctx, int vcpu, uint64_t gpa, uint64_t *rval, + int size); int register_mem(struct mem_range *memp); int register_mem_fallback(struct mem_range *memp); int unregister_mem(struct mem_range *memp); +int write_mem(struct vmctx *ctx, int vcpu, uint64_t gpa, uint64_t wval, + int size); #endif /* _MEM_H_ */ diff --git a/usr/src/cmd/bhyve/mevent.c b/usr/src/cmd/bhyve/mevent.c new file mode 100644 index 0000000000..a258fd3047 --- /dev/null +++ b/usr/src/cmd/bhyve/mevent.c @@ -0,0 +1,680 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +/* + * Micro event library for FreeBSD, designed for a single i/o thread + * using kqueue, and having events be persistent by default. 
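On illumos the + * same interface is implemented with event ports and POSIX timers + * rather than kqueue; see the !__FreeBSD__ paths below.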
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include +#include +#include + +#include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#ifdef __FreeBSD__ +#include +#else +#include +#include +#include +#include +#endif +#include + +#include +#include + +#include "mevent.h" + +#define MEVENT_MAX 64 + +#define MEV_ADD 1 +#define MEV_ENABLE 2 +#define MEV_DISABLE 3 +#define MEV_DEL_PENDING 4 + +extern char *vmname; + +static pthread_t mevent_tid; +static int mevent_timid = 43; +static int mevent_pipefd[2]; +static pthread_mutex_t mevent_lmutex = PTHREAD_MUTEX_INITIALIZER; + +struct mevent { + void (*me_func)(int, enum ev_type, void *); +#define me_msecs me_fd + int me_fd; +#ifdef __FreeBSD__ + int me_timid; +#else + timer_t me_timid; +#endif + enum ev_type me_type; + void *me_param; + int me_cq; + int me_state; + int me_closefd; +#ifndef __FreeBSD__ + port_notify_t me_notify; + struct sigevent me_sigev; + boolean_t me_auto_requeue; +#endif + LIST_ENTRY(mevent) me_list; +}; + +static LIST_HEAD(listhead, mevent) global_head, change_head; + +static void +mevent_qlock(void) +{ + pthread_mutex_lock(&mevent_lmutex); +} + +static void +mevent_qunlock(void) +{ + pthread_mutex_unlock(&mevent_lmutex); +} + +static void +mevent_pipe_read(int fd, enum ev_type type, void *param) +{ + char buf[MEVENT_MAX]; + int status; + + /* + * Drain the pipe read side. The fd is non-blocking so this is + * safe to do. + */ + do { + status = read(fd, buf, sizeof(buf)); + } while (status == MEVENT_MAX); +} + +static void +mevent_notify(void) +{ + char c; + + /* + * If calling from outside the i/o thread, write a byte on the + * pipe to force the i/o thread to exit the blocking kevent call. + */ + if (mevent_pipefd[1] != 0 && pthread_self() != mevent_tid) { + write(mevent_pipefd[1], &c, 1); + } +} +#ifdef __FreeBSD__ +static int +mevent_kq_filter(struct mevent *mevp) +{ + int retval; + + retval = 0; + + if (mevp->me_type == EVF_READ) + retval = EVFILT_READ; + + if (mevp->me_type == EVF_WRITE) + retval = EVFILT_WRITE; + + if (mevp->me_type == EVF_TIMER) + retval = EVFILT_TIMER; + + if (mevp->me_type == EVF_SIGNAL) + retval = EVFILT_SIGNAL; + + return (retval); +} + +static int +mevent_kq_flags(struct mevent *mevp) +{ + int ret; + + switch (mevp->me_state) { + case MEV_ADD: + ret = EV_ADD; /* implicitly enabled */ + break; + case MEV_ENABLE: + ret = EV_ENABLE; + break; + case MEV_DISABLE: + ret = EV_DISABLE; + break; + case MEV_DEL_PENDING: + ret = EV_DELETE; + break; + default: + assert(0); + break; + } + + return (ret); +} + +static int +mevent_kq_fflags(struct mevent *mevp) +{ + /* XXX nothing yet, perhaps EV_EOF for reads ? 
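Returning 0 keeps each filter's default behavior.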
*/ + return (0); +} + +static int +mevent_build(int mfd, struct kevent *kev) +{ + struct mevent *mevp, *tmpp; + int i; + + i = 0; + + mevent_qlock(); + + LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) { + if (mevp->me_closefd) { + /* + * A close of the file descriptor will remove the + * event + */ + close(mevp->me_fd); + } else { + if (mevp->me_type == EVF_TIMER) { + kev[i].ident = mevp->me_timid; + kev[i].data = mevp->me_msecs; + } else { + kev[i].ident = mevp->me_fd; + kev[i].data = 0; + } + kev[i].filter = mevent_kq_filter(mevp); + kev[i].flags = mevent_kq_flags(mevp); + kev[i].fflags = mevent_kq_fflags(mevp); + kev[i].udata = mevp; + i++; + } + + mevp->me_cq = 0; + LIST_REMOVE(mevp, me_list); + + if (mevp->me_state == MEV_DEL_PENDING) { + free(mevp); + } else { + LIST_INSERT_HEAD(&global_head, mevp, me_list); + } + + assert(i < MEVENT_MAX); + } + + mevent_qunlock(); + + return (i); +} + +static void +mevent_handle(struct kevent *kev, int numev) +{ + struct mevent *mevp; + int i; + + for (i = 0; i < numev; i++) { + mevp = kev[i].udata; + + /* XXX check for EV_ERROR ? */ + + (*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param); + } +} + +#else /* __FreeBSD__ */ + +static void +mevent_update_one(struct mevent *mevp) +{ + int portfd = mevp->me_notify.portnfy_port; + + switch (mevp->me_type) { + case EVF_READ: + case EVF_WRITE: + mevp->me_auto_requeue = B_FALSE; + + switch (mevp->me_state) { + case MEV_ADD: + case MEV_ENABLE: + { + int events; + + events = (mevp->me_type == EVF_READ) ? POLLIN : POLLOUT; + + if (port_associate(portfd, PORT_SOURCE_FD, mevp->me_fd, + events, mevp) != 0) { + (void) fprintf(stderr, + "port_associate fd %d %p failed: %s\n", + mevp->me_fd, mevp, strerror(errno)); + } + return; + } + case MEV_DISABLE: + case MEV_DEL_PENDING: + /* + * A disable that comes in while an event is being + * handled will result in an ENOENT. + */ + if (port_dissociate(portfd, PORT_SOURCE_FD, + mevp->me_fd) != 0 && errno != ENOENT) { + (void) fprintf(stderr, "port_dissociate " + "portfd %d fd %d mevp %p failed: %s\n", + portfd, mevp->me_fd, mevp, strerror(errno)); + } + return; + default: + goto abort; + } + + case EVF_TIMER: + mevp->me_auto_requeue = B_TRUE; + + switch (mevp->me_state) { + case MEV_ADD: + case MEV_ENABLE: + { + struct itimerspec it = { 0 }; + + mevp->me_sigev.sigev_notify = SIGEV_PORT; + mevp->me_sigev.sigev_value.sival_ptr = &mevp->me_notify; + + if (timer_create(CLOCK_REALTIME, &mevp->me_sigev, + &mevp->me_timid) != 0) { + (void) fprintf(stderr, + "timer_create failed: %s", strerror(errno)); + return; + } + + /* The first timeout */ + it.it_value.tv_sec = mevp->me_msecs / MILLISEC; + it.it_value.tv_nsec = + MSEC2NSEC(mevp->me_msecs % MILLISEC); + /* Repeat at the same interval */ + it.it_interval = it.it_value; + + if (timer_settime(mevp->me_timid, 0, &it, NULL) != 0) { + (void) fprintf(stderr, "timer_settime failed: " + "%s", strerror(errno)); + } + return; + } + case MEV_DISABLE: + case MEV_DEL_PENDING: + if (timer_delete(mevp->me_timid) != 0) { + (void) fprintf(stderr, "timer_delete failed: " + "%s", strerror(errno)); + } + return; + default: + goto abort; + } + default: + /* EVF_SIGNAL not yet implemented. 
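Event ports have no + * signal source; one could be emulated with a sigaction() handler + * that forwards into the port via port_send().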
*/ + goto abort; + } + +abort: + (void) fprintf(stderr, "%s: unhandled type %d state %d\n", __func__, + mevp->me_type, mevp->me_state); + abort(); +} + +static void +mevent_update_pending(int portfd) +{ + struct mevent *mevp, *tmpp; + + mevent_qlock(); + + LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) { + mevp->me_notify.portnfy_port = portfd; + mevp->me_notify.portnfy_user = mevp; + if (mevp->me_closefd) { + /* + * A close of the file descriptor will remove the + * event + */ + (void) close(mevp->me_fd); + mevp->me_fd = -1; + } else { + mevent_update_one(mevp); + } + + mevp->me_cq = 0; + LIST_REMOVE(mevp, me_list); + + if (mevp->me_state == MEV_DEL_PENDING) { + free(mevp); + } else { + LIST_INSERT_HEAD(&global_head, mevp, me_list); + } + } + + mevent_qunlock(); +} + +static void +mevent_handle_pe(port_event_t *pe) +{ + struct mevent *mevp = pe->portev_user; + + mevent_qunlock(); + + (*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param); + + mevent_qlock(); + if (!mevp->me_cq && !mevp->me_auto_requeue) { + mevent_update_one(mevp); + } + mevent_qunlock(); +} +#endif + +struct mevent * +mevent_add(int tfd, enum ev_type type, + void (*func)(int, enum ev_type, void *), void *param) +{ + struct mevent *lp, *mevp; + + if (tfd < 0 || func == NULL) { + return (NULL); + } + + mevp = NULL; + + mevent_qlock(); + + /* + * Verify that the fd/type tuple is not present in any list + */ + LIST_FOREACH(lp, &global_head, me_list) { + if (type != EVF_TIMER && lp->me_fd == tfd && + lp->me_type == type) { + goto exit; + } + } + + LIST_FOREACH(lp, &change_head, me_list) { + if (type != EVF_TIMER && lp->me_fd == tfd && + lp->me_type == type) { + goto exit; + } + } + + /* + * Allocate an entry, populate it, and add it to the change list. + */ + mevp = calloc(1, sizeof(struct mevent)); + if (mevp == NULL) { + goto exit; + } + + if (type == EVF_TIMER) { + mevp->me_msecs = tfd; + mevp->me_timid = mevent_timid++; + } else + mevp->me_fd = tfd; + mevp->me_type = type; + mevp->me_func = func; + mevp->me_param = param; + + LIST_INSERT_HEAD(&change_head, mevp, me_list); + mevp->me_cq = 1; + mevp->me_state = MEV_ADD; + mevent_notify(); + +exit: + mevent_qunlock(); + + return (mevp); +} + +static int +mevent_update(struct mevent *evp, int newstate) +{ + /* + * It's not possible to enable/disable a deleted event + */ + if (evp->me_state == MEV_DEL_PENDING) + return (EINVAL); + + /* + * No update needed if state isn't changing + */ + if (evp->me_state == newstate) + return (0); + + mevent_qlock(); + + evp->me_state = newstate; + + /* + * Place the entry onto the changed list if not already there. + */ + if (evp->me_cq == 0) { + evp->me_cq = 1; + LIST_REMOVE(evp, me_list); + LIST_INSERT_HEAD(&change_head, evp, me_list); + mevent_notify(); + } + + mevent_qunlock(); + + return (0); +} + +int +mevent_enable(struct mevent *evp) +{ + + return (mevent_update(evp, MEV_ENABLE)); +} + +int +mevent_disable(struct mevent *evp) +{ + + return (mevent_update(evp, MEV_DISABLE)); +} + +static int +mevent_delete_event(struct mevent *evp, int closefd) +{ + mevent_qlock(); + + /* + * Place the entry onto the changed list if not already there, and + * mark as to be deleted. 
+ */ + if (evp->me_cq == 0) { + evp->me_cq = 1; + LIST_REMOVE(evp, me_list); + LIST_INSERT_HEAD(&change_head, evp, me_list); + mevent_notify(); + } + evp->me_state = MEV_DEL_PENDING; + + if (closefd) + evp->me_closefd = 1; + + mevent_qunlock(); + + return (0); +} + +int +mevent_delete(struct mevent *evp) +{ + + return (mevent_delete_event(evp, 0)); +} + +int +mevent_delete_close(struct mevent *evp) +{ + + return (mevent_delete_event(evp, 1)); +} + +static void +mevent_set_name(void) +{ + + pthread_set_name_np(mevent_tid, "mevent"); +} + +void +mevent_dispatch(void) +{ +#ifdef __FreeBSD__ + struct kevent changelist[MEVENT_MAX]; + struct kevent eventlist[MEVENT_MAX]; + struct mevent *pipev; + int mfd; + int numev; +#else + struct mevent *pipev; + int portfd; +#endif + int ret; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif + + mevent_tid = pthread_self(); + mevent_set_name(); + +#ifdef __FreeBSD__ + mfd = kqueue(); + assert(mfd > 0); +#else + portfd = port_create(); + assert(portfd >= 0); +#endif + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_KQUEUE); + if (caph_rights_limit(mfd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + /* + * Open the pipe that will be used for other threads to force + * the blocking kqueue call to exit by writing to it. Set the + * descriptor to non-blocking. + */ + ret = pipe(mevent_pipefd); + if (ret < 0) { + perror("pipe"); + exit(0); + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); + if (caph_rights_limit(mevent_pipefd[0], &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + if (caph_rights_limit(mevent_pipefd[1], &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + /* + * Add internal event handler for the pipe write fd + */ + pipev = mevent_add(mevent_pipefd[0], EVF_READ, mevent_pipe_read, NULL); + assert(pipev != NULL); + + for (;;) { +#ifdef __FreeBSD__ + /* + * Build changelist if required. + * XXX the changelist can be put into the blocking call + * to eliminate the extra syscall. Currently better for + * debug. + */ + numev = mevent_build(mfd, changelist); + if (numev) { + ret = kevent(mfd, changelist, numev, NULL, 0, NULL); + if (ret == -1) { + perror("Error return from kevent change"); + } + } + + /* + * Block awaiting events + */ + ret = kevent(mfd, NULL, 0, eventlist, MEVENT_MAX, NULL); + if (ret == -1 && errno != EINTR) { + perror("Error return from kevent monitor"); + } + + /* + * Handle reported events + */ + mevent_handle(eventlist, ret); + +#else /* __FreeBSD__ */ + port_event_t pev; + + /* Handle any pending updates */ + mevent_update_pending(portfd); + + /* Block awaiting events */ + ret = port_get(portfd, &pev, NULL); + if (ret != 0 && errno != EINTR) { + perror("Error return from port_get"); + continue; + } + + /* Handle reported event */ + mevent_handle_pe(&pev); +#endif /* __FreeBSD__ */ + } +} diff --git a/usr/src/cmd/bhyve/mevent.h b/usr/src/cmd/bhyve/mevent.h new file mode 100644 index 0000000000..e6b96f0a7c --- /dev/null +++ b/usr/src/cmd/bhyve/mevent.h @@ -0,0 +1,53 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _MEVENT_H_ +#define _MEVENT_H_ + +enum ev_type { + EVF_READ, + EVF_WRITE, + EVF_TIMER, + EVF_SIGNAL +}; + +struct mevent; + +struct mevent *mevent_add(int fd, enum ev_type type, + void (*func)(int, enum ev_type, void *), + void *param); +int mevent_enable(struct mevent *evp); +int mevent_disable(struct mevent *evp); +int mevent_delete(struct mevent *evp); +int mevent_delete_close(struct mevent *evp); + +void mevent_dispatch(void); + +#endif /* _MEVENT_H_ */ diff --git a/usr/src/cmd/bhyve/mevent_test.c b/usr/src/cmd/bhyve/mevent_test.c new file mode 100644 index 0000000000..4da3adb5ae --- /dev/null +++ b/usr/src/cmd/bhyve/mevent_test.c @@ -0,0 +1,282 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +/* + * Test program for the micro event library. Set up a simple TCP echo + * service. 
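+ * Once built, point a TCP client at TEST_PORT (4321 below) to exercise
+ * the echo path.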
+ * + * cc mevent_test.c mevent.c -lpthread + */ + +#include +#include +#ifdef __FreeBSD__ +#include +#endif +#include +#include +#ifdef __FreeBSD__ +#include +#endif + +#include +#include +#include +#include + +#include "mevent.h" + +#define TEST_PORT 4321 + +static pthread_mutex_t accept_mutex = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t accept_condvar = PTHREAD_COND_INITIALIZER; + +static struct mevent *tevp; + +char *vmname = "test vm"; + + +#define MEVENT_ECHO + +/* Number of timer events to capture */ +#define TEVSZ 4096 +uint64_t tevbuf[TEVSZ]; + +static void +timer_print(void) +{ + uint64_t min, max, diff, sum; +#ifdef __FreeBSD__ + uint64_t tsc_freq; + size_t len; +#endif + int j; + + min = UINT64_MAX; + max = 0; + sum = 0; + +#ifdef __FreeBSD__ + len = sizeof(tsc_freq); + sysctlbyname("machdep.tsc_freq", &tsc_freq, &len, NULL, 0); +#endif + + for (j = 1; j < TEVSZ; j++) { +#ifdef __FreeBSD__ + /* Convert a tsc diff into microseconds */ + diff = (tevbuf[j] - tevbuf[j-1]) * 1000000 / tsc_freq; +#else + diff = (tevbuf[j] - tevbuf[j-1]) / 1000; +#endif + sum += diff; + if (min > diff) + min = diff; + if (max < diff) + max = diff; + } + + printf("timers done: usecs, min %ld, max %ld, mean %ld\n", min, max, + sum/(TEVSZ - 1)); +} + +static void +timer_callback(int fd, enum ev_type type, void *param) +{ + static int i; + + if (i >= TEVSZ) + abort(); + +#ifdef __FreeBSD__ + tevbuf[i++] = rdtsc(); +#else + tevbuf[i++] = gethrtime(); +#endif + + if (i == TEVSZ) { + mevent_delete(tevp); + timer_print(); + } +} + + +#ifdef MEVENT_ECHO +struct esync { + pthread_mutex_t e_mt; + pthread_cond_t e_cond; +}; + +static void +echoer_callback(int fd, enum ev_type type, void *param) +{ + struct esync *sync = param; + + pthread_mutex_lock(&sync->e_mt); + pthread_cond_signal(&sync->e_cond); + pthread_mutex_unlock(&sync->e_mt); +} + +static void * +echoer(void *param) +{ + struct esync sync; + struct mevent *mev; + char buf[128]; + int fd = (int)(uintptr_t) param; + int len; + + pthread_mutex_init(&sync.e_mt, NULL); + pthread_cond_init(&sync.e_cond, NULL); + + pthread_mutex_lock(&sync.e_mt); + + mev = mevent_add(fd, EVF_READ, echoer_callback, &sync); + if (mev == NULL) { + printf("Could not allocate echoer event\n"); + exit(4); + } + + while (!pthread_cond_wait(&sync.e_cond, &sync.e_mt)) { + len = read(fd, buf, sizeof(buf)); + if (len > 0) { + write(fd, buf, len); + write(0, buf, len); + } else { + break; + } + } + + mevent_delete_close(mev); + + pthread_mutex_unlock(&sync.e_mt); + pthread_mutex_destroy(&sync.e_mt); + pthread_cond_destroy(&sync.e_cond); + + return (NULL); +} + +#else + +static void * +echoer(void *param) +{ + char buf[128]; + int fd = (int)(uintptr_t) param; + int len; + + while ((len = read(fd, buf, sizeof(buf))) > 0) { + write(1, buf, len); + } + + return (NULL); +} +#endif /* MEVENT_ECHO */ + +static void +acceptor_callback(int fd, enum ev_type type, void *param) +{ + pthread_mutex_lock(&accept_mutex); + pthread_cond_signal(&accept_condvar); + pthread_mutex_unlock(&accept_mutex); +} + +static void * +acceptor(void *param) +{ + struct sockaddr_in sin; + pthread_t tid; + int news; + int s; + + if ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0) { + perror("cannot create socket"); + exit(4); + } + +#ifdef __FreeBSD__ + sin.sin_len = sizeof(sin); +#endif + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(TEST_PORT); + + if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) < 0) { + perror("cannot bind socket"); + exit(4); + } + + if (listen(s, 1) < 
0) { + perror("cannot listen socket"); + exit(4); + } + + (void) mevent_add(s, EVF_READ, acceptor_callback, NULL); + + pthread_mutex_lock(&accept_mutex); + + while (!pthread_cond_wait(&accept_condvar, &accept_mutex)) { + news = accept(s, NULL, NULL); + if (news < 0) { + perror("accept error"); + } else { + static int first = 1; + + if (first) { + /* + * Start a timer + */ + first = 0; + tevp = mevent_add(1, EVF_TIMER, timer_callback, + NULL); + } + + printf("incoming connection, spawning thread\n"); + pthread_create(&tid, NULL, echoer, + (void *)(uintptr_t)news); + } + } + + return (NULL); +} + +int +main() +{ + pthread_t tid; + + pthread_create(&tid, NULL, acceptor, NULL); + + mevent_dispatch(); + return (0); +} diff --git a/usr/src/cmd/bhyve/mptbl.c b/usr/src/cmd/bhyve/mptbl.c index 9d03765c7a..e78f88f074 100644 --- a/usr/src/cmd/bhyve/mptbl.c +++ b/usr/src/cmd/bhyve/mptbl.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * @@ -23,11 +25,11 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/mptbl.c 266125 2014-05-15 14:16:55Z jhb $ + * $FreeBSD$ */ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/mptbl.c 266125 2014-05-15 14:16:55Z jhb $"); +__FBSDID("$FreeBSD$"); #include #include diff --git a/usr/src/cmd/bhyve/mptbl.h b/usr/src/cmd/bhyve/mptbl.h index d78ea6da09..ebc8d85ea8 100644 --- a/usr/src/cmd/bhyve/mptbl.h +++ b/usr/src/cmd/bhyve/mptbl.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/mptbl.h 257423 2013-10-31 05:44:45Z neel $ + * $FreeBSD$ */ #ifndef _MPTBL_H_ diff --git a/usr/src/cmd/bhyve/pci_ahci.c b/usr/src/cmd/bhyve/pci_ahci.c index b68c977c1f..1e3feffcc2 100644 --- a/usr/src/cmd/bhyve/pci_ahci.c +++ b/usr/src/cmd/bhyve/pci_ahci.c @@ -1,5 +1,8 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Zhixiang Yu + * Copyright (c) 2015-2016 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -23,11 +26,11 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $FreeBSD: head/usr.sbin/bhyve/pci_ahci.c 274045 2014-11-03 12:55:31Z tychon $ + * $FreeBSD$ */ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_ahci.c 274045 2014-11-03 12:55:31Z tychon $"); +__FBSDID("$FreeBSD$"); #include #include @@ -50,13 +53,15 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_ahci.c 274045 2014-11-03 12:55:31Z t #include #include #include +#include #include "bhyverun.h" #include "pci_emul.h" #include "ahci.h" #include "block_if.h" -#define MAX_PORTS 6 /* Intel ICH8 AHCI supports 6 ports */ +#define DEF_PORTS 6 /* Intel ICH8 AHCI supports 6 ports */ +#define MAX_PORTS 32 /* AHCI supports 32 ports */ #define PxSIG_ATA 0x00000101 /* ATA drive */ #define PxSIG_ATAPI 0xeb140101 /* ATAPI drive */ @@ -86,6 +91,7 @@ enum sata_fis_type { #define READ_TOC 0x43 #define GET_EVENT_STATUS_NOTIFICATION 0x4A #define MODE_SENSE_10 0x5A +#define REPORT_LUNS 0xA0 #define READ_12 0xA8 #define READ_CD 0xBE @@ -99,7 +105,7 @@ enum sata_fis_type { * ATA commands */ #define ATA_SF_ENAB_SATA_SF 0x10 -#define ATA_SATA_SF_AN 0x05 +#define ATA_SATA_SF_AN 0x05 #define ATA_SF_DIS_SATA_SF 0x90 /* @@ -113,6 +119,8 @@ static FILE *dbg; #endif #define WPRINTF(format, arg...) printf(format, ##arg) +#define AHCI_PORT_IDENT 20 + 1 + struct ahci_ioreq { struct blockif_req io_req; struct ahci_port *io_pr; @@ -122,7 +130,7 @@ struct ahci_ioreq { uint32_t len; uint32_t done; int slot; - int prdtl; + int more; }; struct ahci_port { @@ -130,12 +138,17 @@ struct ahci_port { struct pci_ahci_softc *pr_sc; uint8_t *cmd_lst; uint8_t *rfis; + char ident[AHCI_PORT_IDENT]; + int port; int atapi; int reset; + int waitforclear; int mult_sectors; uint8_t xfermode; + uint8_t err_cfis[20]; uint8_t sense_key; uint8_t asc; + u_int ccs; uint32_t pending; uint32_t clb; @@ -200,6 +213,8 @@ struct pci_ahci_softc { }; #define ahci_ctx(sc) ((sc)->asc_pi->pi_vmctx) +static void ahci_handle_port(struct ahci_port *p); + static inline void lba_to_msf(uint8_t *buf, int lba) { lba += 150; @@ -209,47 +224,95 @@ static inline void lba_to_msf(uint8_t *buf, int lba) } /* - * generate HBA intr depending on whether or not ports within - * the controller have an interrupt pending. + * Generate HBA interrupts on global IS register write. */ static void -ahci_generate_intr(struct pci_ahci_softc *sc) +ahci_generate_intr(struct pci_ahci_softc *sc, uint32_t mask) { - struct pci_devinst *pi; - int i; - - pi = sc->asc_pi; + struct pci_devinst *pi = sc->asc_pi; + struct ahci_port *p; + int i, nmsg; + uint32_t mmask; + /* Update global IS from PxIS/PxIE. */ for (i = 0; i < sc->ports; i++) { - struct ahci_port *pr; - pr = &sc->port[i]; - if (pr->is & pr->ie) + p = &sc->port[i]; + if (p->is & p->ie) sc->is |= (1 << i); } + DPRINTF("%s(%08x) %08x\n", __func__, mask, sc->is); + + /* If there is nothing enabled -- clear legacy interrupt and exit. */ + if (sc->is == 0 || (sc->ghc & AHCI_GHC_IE) == 0) { + if (sc->lintr) { + pci_lintr_deassert(pi); + sc->lintr = 0; + } + return; + } - DPRINTF("%s %x\n", __func__, sc->is); - - if (sc->is && (sc->ghc & AHCI_GHC_IE)) { - if (pci_msi_enabled(pi)) { - /* - * Generate an MSI interrupt on every edge - */ - pci_generate_msi(pi, 0); - } else if (!sc->lintr) { - /* - * Only generate a pin-based interrupt if one wasn't - * in progress - */ + /* If there is anything and no MSI -- assert legacy interrupt. 
*/ + nmsg = pci_msi_maxmsgnum(pi); + if (nmsg == 0) { + if (!sc->lintr) { sc->lintr = 1; pci_lintr_assert(pi); } - } else if (sc->lintr) { - /* - * No interrupts: deassert pin-based signal if it had - * been asserted - */ - pci_lintr_deassert(pi); - sc->lintr = 0; + return; + } + + /* Assert respective MSIs for ports that were touched. */ + for (i = 0; i < nmsg; i++) { + if (sc->ports <= nmsg || i < nmsg - 1) + mmask = 1 << i; + else + mmask = 0xffffffff << i; + if (sc->is & mask && mmask & mask) + pci_generate_msi(pi, i); + } +} + +/* + * Generate HBA interrupt on specific port event. + */ +static void +ahci_port_intr(struct ahci_port *p) +{ + struct pci_ahci_softc *sc = p->pr_sc; + struct pci_devinst *pi = sc->asc_pi; + int nmsg; + + DPRINTF("%s(%d) %08x/%08x %08x\n", __func__, + p->port, p->is, p->ie, sc->is); + + /* If there is nothing enabled -- we are done. */ + if ((p->is & p->ie) == 0) + return; + + /* In case of non-shared MSI always generate interrupt. */ + nmsg = pci_msi_maxmsgnum(pi); + if (sc->ports <= nmsg || p->port < nmsg - 1) { + sc->is |= (1 << p->port); + if ((sc->ghc & AHCI_GHC_IE) == 0) + return; + pci_generate_msi(pi, p->port); + return; + } + + /* If IS for this port is already set -- do nothing. */ + if (sc->is & (1 << p->port)) + return; + + sc->is |= (1 << p->port); + + /* If interrupts are enabled -- generate one. */ + if ((sc->ghc & AHCI_GHC_IE) == 0) + return; + if (nmsg > 0) { + pci_generate_msi(pi, nmsg - 1); + } else if (!sc->lintr) { + sc->lintr = 1; + pci_lintr_assert(pi); } } @@ -265,26 +328,32 @@ ahci_write_fis(struct ahci_port *p, enum sata_fis_type ft, uint8_t *fis) case FIS_TYPE_REGD2H: offset = 0x40; len = 20; - irq = AHCI_P_IX_DHR; + irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_DHR : 0; break; case FIS_TYPE_SETDEVBITS: offset = 0x58; len = 8; - irq = AHCI_P_IX_SDB; + irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_SDB : 0; break; case FIS_TYPE_PIOSETUP: offset = 0x20; len = 20; - irq = 0; + irq = (fis[1] & (1 << 6)) ? 
AHCI_P_IX_PS : 0; break; default: WPRINTF("unsupported fis type %d\n", ft); return; } + if (fis[2] & ATA_S_ERROR) { + p->waitforclear = 1; + irq |= AHCI_P_IX_TFE; + } memcpy(p->rfis + offset, fis, len); if (irq) { - p->is |= irq; - ahci_generate_intr(p->pr_sc); + if (~p->is & irq) { + p->is |= irq; + ahci_port_intr(p); + } } } @@ -299,19 +368,29 @@ ahci_write_fis_piosetup(struct ahci_port *p) } static void -ahci_write_fis_sdb(struct ahci_port *p, int slot, uint32_t tfd) +ahci_write_fis_sdb(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd) { uint8_t fis[8]; uint8_t error; error = (tfd >> 8) & 0xff; + tfd &= 0x77; memset(fis, 0, sizeof(fis)); - fis[0] = error; - fis[2] = tfd & 0x77; - *(uint32_t *)(fis + 4) = (1 << slot); - if (fis[2] & ATA_S_ERROR) - p->is |= AHCI_P_IX_TFE; - p->tfd = tfd; + fis[0] = FIS_TYPE_SETDEVBITS; + fis[1] = (1 << 6); + fis[2] = tfd; + fis[3] = error; + if (fis[2] & ATA_S_ERROR) { + p->err_cfis[0] = slot; + p->err_cfis[2] = tfd; + p->err_cfis[3] = error; + memcpy(&p->err_cfis[4], cfis + 4, 16); + } else { + *(uint32_t *)(fis + 4) = (1 << slot); + p->sact &= ~(1 << slot); + } + p->tfd &= ~0x77; + p->tfd |= tfd; ahci_write_fis(p, FIS_TYPE_SETDEVBITS, fis); } @@ -337,14 +416,32 @@ ahci_write_fis_d2h(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd) fis[11] = cfis[11]; fis[12] = cfis[12]; fis[13] = cfis[13]; - if (fis[2] & ATA_S_ERROR) - p->is |= AHCI_P_IX_TFE; - else + if (fis[2] & ATA_S_ERROR) { + p->err_cfis[0] = 0x80; + p->err_cfis[2] = tfd & 0xff; + p->err_cfis[3] = error; + memcpy(&p->err_cfis[4], cfis + 4, 16); + } else p->ci &= ~(1 << slot); p->tfd = tfd; ahci_write_fis(p, FIS_TYPE_REGD2H, fis); } +static void +ahci_write_fis_d2h_ncq(struct ahci_port *p, int slot) +{ + uint8_t fis[20]; + + p->tfd = ATA_S_READY | ATA_S_DSC; + memset(fis, 0, sizeof(fis)); + fis[0] = FIS_TYPE_REGD2H; + fis[1] = 0; /* No interrupt */ + fis[2] = p->tfd; /* Status */ + fis[3] = 0; /* No error */ + p->ci &= ~(1 << slot); + ahci_write_fis(p, FIS_TYPE_REGD2H, fis); +} + static void ahci_write_reset_fis_d2h(struct ahci_port *p) { @@ -372,9 +469,11 @@ ahci_check_stopped(struct ahci_port *p) */ if (!(p->cmd & AHCI_P_CMD_ST)) { if (p->pending == 0) { + p->ccs = 0; p->cmd &= ~(AHCI_P_CMD_CR | AHCI_P_CMD_CCS_MASK); p->ci = 0; p->sact = 0; + p->waitforclear = 0; } } } @@ -385,7 +484,6 @@ ahci_port_stop(struct ahci_port *p) struct ahci_ioreq *aior; uint8_t *cfis; int slot; - int ncq; int error; assert(pthread_mutex_isowned_np(&p->pr_sc->mtx)); @@ -401,11 +499,9 @@ ahci_port_stop(struct ahci_port *p) slot = aior->slot; cfis = aior->cfis; if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || - cfis[2] == ATA_READ_FPDMA_QUEUED) - ncq = 1; - - if (ncq) - p->sact &= ~(1 << slot); + cfis[2] == ATA_READ_FPDMA_QUEUED || + cfis[2] == ATA_SEND_FPDMA_QUEUED) + p->sact &= ~(1 << slot); /* NCQ */ else p->ci &= ~(1 << slot); @@ -431,7 +527,6 @@ ahci_port_stop(struct ahci_port *p) static void ahci_port_reset(struct ahci_port *pr) { - pr->sctl = 0; pr->serr = 0; pr->sact = 0; pr->xfermode = ATA_UDMA6; @@ -443,8 +538,11 @@ ahci_port_reset(struct ahci_port *pr) pr->tfd = 0x7F; return; } - pr->ssts = ATA_SS_DET_PHY_ONLINE | ATA_SS_SPD_GEN2 | - ATA_SS_IPM_ACTIVE; + pr->ssts = ATA_SS_DET_PHY_ONLINE | ATA_SS_IPM_ACTIVE; + if (pr->sctl & ATA_SC_SPD_MASK) + pr->ssts |= (pr->sctl & ATA_SC_SPD_MASK); + else + pr->ssts |= ATA_SS_SPD_GEN3; pr->tfd = (1 << 8) | ATA_S_DSC | ATA_S_DMA; if (!pr->atapi) { pr->sig = PxSIG_ATA; @@ -470,6 +568,10 @@ ahci_reset(struct pci_ahci_softc *sc) for (i = 0; i < sc->ports; i++) { 
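+		/*
+		 * Return each port to power-on defaults: interrupt state
+		 * cleared, SUD/POD (plus CPS when a backing device is
+		 * attached) set in PxCMD, and SControl zeroed.
+		 */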
sc->port[i].ie = 0; sc->port[i].is = 0; + sc->port[i].cmd = (AHCI_P_CMD_SUD | AHCI_P_CMD_POD); + if (sc->port[i].bctx) + sc->port[i].cmd |= AHCI_P_CMD_CPS; + sc->port[i].sctl = 0; ahci_port_reset(&sc->port[i]); } } @@ -500,32 +602,87 @@ atapi_string(uint8_t *dest, const char *src, int len) } } +/* + * Build up the iovec based on the PRDT, 'done' and 'len'. + */ static void -ahci_handle_dma(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done, - int seek) +ahci_build_iov(struct ahci_port *p, struct ahci_ioreq *aior, + struct ahci_prdt_entry *prdt, uint16_t prdtl) +{ + struct blockif_req *breq = &aior->io_req; + int i, j, skip, todo, left, extra; + uint32_t dbcsz; + + /* Copy part of PRDT between 'done' and 'len' bytes into the iov. */ + skip = aior->done; + left = aior->len - aior->done; + todo = 0; + for (i = 0, j = 0; i < prdtl && j < BLOCKIF_IOV_MAX && left > 0; + i++, prdt++) { + dbcsz = (prdt->dbc & DBCMASK) + 1; + /* Skip already done part of the PRDT */ + if (dbcsz <= skip) { + skip -= dbcsz; + continue; + } + dbcsz -= skip; + if (dbcsz > left) + dbcsz = left; + breq->br_iov[j].iov_base = paddr_guest2host(ahci_ctx(p->pr_sc), + prdt->dba + skip, dbcsz); + breq->br_iov[j].iov_len = dbcsz; + todo += dbcsz; + left -= dbcsz; + skip = 0; + j++; + } + + /* If we got limited by IOV length, round I/O down to sector size. */ + if (j == BLOCKIF_IOV_MAX) { + extra = todo % blockif_sectsz(p->bctx); + todo -= extra; + assert(todo > 0); + while (extra > 0) { + if (breq->br_iov[j - 1].iov_len > extra) { + breq->br_iov[j - 1].iov_len -= extra; + break; + } + extra -= breq->br_iov[j - 1].iov_len; + j--; + } + } + + breq->br_iovcnt = j; + breq->br_resid = todo; + aior->done += todo; + aior->more = (aior->done < aior->len && i < prdtl); +} + +static void +ahci_handle_rw(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) { struct ahci_ioreq *aior; struct blockif_req *breq; - struct pci_ahci_softc *sc; struct ahci_prdt_entry *prdt; struct ahci_cmd_hdr *hdr; uint64_t lba; uint32_t len; - int i, err, iovcnt, ncq, readop; + int err, first, ncq, readop; - sc = p->pr_sc; prdt = (struct ahci_prdt_entry *)(cfis + 0x80); hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); ncq = 0; readop = 1; + first = (done == 0); - prdt += seek; - if (cfis[2] == ATA_WRITE_DMA || cfis[2] == ATA_WRITE_DMA48 || - cfis[2] == ATA_WRITE_FPDMA_QUEUED) + if (cfis[2] == ATA_WRITE || cfis[2] == ATA_WRITE48 || + cfis[2] == ATA_WRITE_MUL || cfis[2] == ATA_WRITE_MUL48 || + cfis[2] == ATA_WRITE_DMA || cfis[2] == ATA_WRITE_DMA48 || + cfis[2] == ATA_WRITE_FPDMA_QUEUED) readop = 0; if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || - cfis[2] == ATA_READ_FPDMA_QUEUED) { + cfis[2] == ATA_READ_FPDMA_QUEUED) { lba = ((uint64_t)cfis[10] << 40) | ((uint64_t)cfis[9] << 32) | ((uint64_t)cfis[8] << 24) | @@ -536,7 +693,9 @@ ahci_handle_dma(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done, if (!len) len = 65536; ncq = 1; - } else if (cfis[2] == ATA_READ_DMA48 || cfis[2] == ATA_WRITE_DMA48) { + } else if (cfis[2] == ATA_READ48 || cfis[2] == ATA_WRITE48 || + cfis[2] == ATA_READ_MUL48 || cfis[2] == ATA_WRITE_MUL48 || + cfis[2] == ATA_READ_DMA48 || cfis[2] == ATA_WRITE_DMA48) { lba = ((uint64_t)cfis[10] << 40) | ((uint64_t)cfis[9] << 32) | ((uint64_t)cfis[8] << 24) | @@ -556,57 +715,33 @@ ahci_handle_dma(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done, lba *= blockif_sectsz(p->bctx); len *= blockif_sectsz(p->bctx); - /* - * Pull request off free list - */ + /* Pull request off free list */ aior = 
STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); + aior->cfis = cfis; aior->slot = slot; aior->len = len; aior->done = done; breq = &aior->io_req; breq->br_offset = lba + done; - iovcnt = hdr->prdtl - seek; - if (iovcnt > BLOCKIF_IOV_MAX) { - aior->prdtl = iovcnt - BLOCKIF_IOV_MAX; - iovcnt = BLOCKIF_IOV_MAX; - } else - aior->prdtl = 0; - breq->br_iovcnt = iovcnt; + ahci_build_iov(p, aior, prdt, hdr->prdtl); - /* - * Mark this command in-flight. - */ + /* Mark this command in-flight. */ p->pending |= 1 << slot; - /* - * Stuff request onto busy list - */ + /* Stuff request onto busy list. */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); - /* - * Build up the iovec based on the prdt - */ - for (i = 0; i < iovcnt; i++) { - uint32_t dbcsz; + if (ncq && first) + ahci_write_fis_d2h_ncq(p, slot); - dbcsz = (prdt->dbc & DBCMASK) + 1; - breq->br_iov[i].iov_base = paddr_guest2host(ahci_ctx(sc), - prdt->dba, dbcsz); - breq->br_iov[i].iov_len = dbcsz; - aior->done += dbcsz; - prdt++; - } if (readop) err = blockif_read(p->bctx, breq); else err = blockif_write(p->bctx, breq); assert(err == 0); - - if (ncq) - p->ci &= ~(1 << slot); } static void @@ -626,7 +761,7 @@ ahci_handle_flush(struct ahci_port *p, int slot, uint8_t *cfis) aior->slot = slot; aior->len = 0; aior->done = 0; - aior->prdtl = 0; + aior->more = 0; breq = &aior->io_req; /* @@ -643,6 +778,120 @@ ahci_handle_flush(struct ahci_port *p, int slot, uint8_t *cfis) assert(err == 0); } +static inline void +read_prdt(struct ahci_port *p, int slot, uint8_t *cfis, + void *buf, int size) +{ + struct ahci_cmd_hdr *hdr; + struct ahci_prdt_entry *prdt; + void *to; + int i, len; + + hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); + len = size; + to = buf; + prdt = (struct ahci_prdt_entry *)(cfis + 0x80); + for (i = 0; i < hdr->prdtl && len; i++) { + uint8_t *ptr; + uint32_t dbcsz; + int sublen; + + dbcsz = (prdt->dbc & DBCMASK) + 1; + ptr = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba, dbcsz); + sublen = MIN(len, dbcsz); + memcpy(to, ptr, sublen); + len -= sublen; + to += sublen; + prdt++; + } +} + +static void +ahci_handle_dsm_trim(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) +{ + struct ahci_ioreq *aior; + struct blockif_req *breq; + uint8_t *entry; + uint64_t elba; + uint32_t len, elen; + int err, first, ncq; + uint8_t buf[512]; + + first = (done == 0); + if (cfis[2] == ATA_DATA_SET_MANAGEMENT) { + len = (uint16_t)cfis[13] << 8 | cfis[12]; + len *= 512; + ncq = 0; + } else { /* ATA_SEND_FPDMA_QUEUED */ + len = (uint16_t)cfis[11] << 8 | cfis[3]; + len *= 512; + ncq = 1; + } + read_prdt(p, slot, cfis, buf, sizeof(buf)); + +next: + entry = &buf[done]; + elba = ((uint64_t)entry[5] << 40) | + ((uint64_t)entry[4] << 32) | + ((uint64_t)entry[3] << 24) | + ((uint64_t)entry[2] << 16) | + ((uint64_t)entry[1] << 8) | + entry[0]; + elen = (uint16_t)entry[7] << 8 | entry[6]; + done += 8; + if (elen == 0) { + if (done >= len) { + if (ncq) { + if (first) + ahci_write_fis_d2h_ncq(p, slot); + ahci_write_fis_sdb(p, slot, cfis, + ATA_S_READY | ATA_S_DSC); + } else { + ahci_write_fis_d2h(p, slot, cfis, + ATA_S_READY | ATA_S_DSC); + } + p->pending &= ~(1 << slot); + ahci_check_stopped(p); + if (!first) + ahci_handle_port(p); + return; + } + goto next; + } + + /* + * Pull request off free list + */ + aior = STAILQ_FIRST(&p->iofhd); + assert(aior != NULL); + STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); + aior->cfis = cfis; + aior->slot = slot; + aior->len = len; + aior->done = done; + aior->more = (len != 
done); + + breq = &aior->io_req; + breq->br_offset = elba * blockif_sectsz(p->bctx); + breq->br_resid = elen * blockif_sectsz(p->bctx); + + /* + * Mark this command in-flight. + */ + p->pending |= 1 << slot; + + /* + * Stuff request onto busy list + */ + TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); + + if (ncq && first) + ahci_write_fis_d2h_ncq(p, slot); + + err = blockif_delete(p->bctx, breq); + assert(err == 0); +} + static inline void write_prdt(struct ahci_port *p, int slot, uint8_t *cfis, void *buf, int size) @@ -663,7 +912,7 @@ write_prdt(struct ahci_port *p, int slot, uint8_t *cfis, dbcsz = (prdt->dbc & DBCMASK) + 1; ptr = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba, dbcsz); - sublen = len < dbcsz ? len : dbcsz; + sublen = MIN(len, dbcsz); memcpy(ptr, from, sublen); len -= sublen; from += sublen; @@ -672,6 +921,58 @@ write_prdt(struct ahci_port *p, int slot, uint8_t *cfis, hdr->prdbc = size - len; } +static void +ahci_checksum(uint8_t *buf, int size) +{ + int i; + uint8_t sum = 0; + + for (i = 0; i < size - 1; i++) + sum += buf[i]; + buf[size - 1] = 0x100 - sum; +} + +static void +ahci_handle_read_log(struct ahci_port *p, int slot, uint8_t *cfis) +{ + struct ahci_cmd_hdr *hdr; + uint32_t buf[128]; + uint8_t *buf8 = (uint8_t *)buf; + uint16_t *buf16 = (uint16_t *)buf; + + hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); + if (p->atapi || hdr->prdtl == 0 || cfis[5] != 0 || + cfis[9] != 0 || cfis[12] != 1 || cfis[13] != 0) { + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); + return; + } + + memset(buf, 0, sizeof(buf)); + if (cfis[4] == 0x00) { /* Log directory */ + buf16[0x00] = 1; /* Version -- 1 */ + buf16[0x10] = 1; /* NCQ Command Error Log -- 1 page */ + buf16[0x13] = 1; /* SATA NCQ Send and Receive Log -- 1 page */ + } else if (cfis[4] == 0x10) { /* NCQ Command Error Log */ + memcpy(buf8, p->err_cfis, sizeof(p->err_cfis)); + ahci_checksum(buf8, sizeof(buf)); + } else if (cfis[4] == 0x13) { /* SATA NCQ Send and Receive Log */ + if (blockif_candelete(p->bctx) && !blockif_is_ro(p->bctx)) { + buf[0x00] = 1; /* SFQ DSM supported */ + buf[0x01] = 1; /* SFQ DSM TRIM supported */ + } + } else { + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); + return; + } + + if (cfis[2] == ATA_READ_LOG_EXT) + ahci_write_fis_piosetup(p); + write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); + ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); +} + static void handle_identify(struct ahci_port *p, int slot, uint8_t *cfis) { @@ -679,82 +980,116 @@ handle_identify(struct ahci_port *p, int slot, uint8_t *cfis) hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); if (p->atapi || hdr->prdtl == 0) { - p->tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR; - p->is |= AHCI_P_IX_TFE; + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); } else { uint16_t buf[256]; uint64_t sectors; + int sectsz, psectsz, psectoff, candelete, ro; uint16_t cyl; uint8_t sech, heads; - sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx); + ro = blockif_is_ro(p->bctx); + candelete = blockif_candelete(p->bctx); + sectsz = blockif_sectsz(p->bctx); + sectors = blockif_size(p->bctx) / sectsz; blockif_chs(p->bctx, &cyl, &heads, &sech); + blockif_psectsz(p->bctx, &psectsz, &psectoff); memset(buf, 0, sizeof(buf)); buf[0] = 0x0040; buf[1] = cyl; buf[3] = heads; buf[6] = sech; - /* TODO emulate different serial? 
*/ - ata_string((uint8_t *)(buf+10), "123456", 20); + ata_string((uint8_t *)(buf+10), p->ident, 20); ata_string((uint8_t *)(buf+23), "001", 8); ata_string((uint8_t *)(buf+27), "BHYVE SATA DISK", 40); buf[47] = (0x8000 | 128); - buf[48] = 0x1; + buf[48] = 0; buf[49] = (1 << 8 | 1 << 9 | 1 << 11); buf[50] = (1 << 14); buf[53] = (1 << 1 | 1 << 2); if (p->mult_sectors) buf[59] = (0x100 | p->mult_sectors); - buf[60] = sectors; - buf[61] = (sectors >> 16); + if (sectors <= 0x0fffffff) { + buf[60] = sectors; + buf[61] = (sectors >> 16); + } else { + buf[60] = 0xffff; + buf[61] = 0x0fff; + } buf[63] = 0x7; if (p->xfermode & ATA_WDMA0) buf[63] |= (1 << ((p->xfermode & 7) + 8)); buf[64] = 0x3; - buf[65] = 100; - buf[66] = 100; - buf[67] = 100; - buf[68] = 100; + buf[65] = 120; + buf[66] = 120; + buf[67] = 120; + buf[68] = 120; + buf[69] = 0; buf[75] = 31; - buf[76] = (1 << 8 | 1 << 2); - buf[80] = 0x1f0; + buf[76] = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3 | + ATA_SUPPORT_NCQ); + buf[77] = (ATA_SUPPORT_RCVSND_FPDMA_QUEUED | + (p->ssts & ATA_SS_SPD_MASK) >> 3); + buf[80] = 0x3f0; buf[81] = 0x28; - buf[82] = (1 << 5 | 1 << 14); - buf[83] = (1 << 10 | 1 << 12 | 1 << 13 | 1 << 14); + buf[82] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE| + ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP); + buf[83] = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE | + ATA_SUPPORT_FLUSHCACHE48 | 1 << 14); buf[84] = (1 << 14); - buf[85] = (1 << 5 | 1 << 14); - buf[86] = (1 << 10 | 1 << 12 | 1 << 13); + buf[85] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE| + ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP); + buf[86] = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE | + ATA_SUPPORT_FLUSHCACHE48 | 1 << 15); buf[87] = (1 << 14); buf[88] = 0x7f; if (p->xfermode & ATA_UDMA0) buf[88] |= (1 << ((p->xfermode & 7) + 8)); - buf[93] = (1 | 1 <<14); buf[100] = sectors; buf[101] = (sectors >> 16); buf[102] = (sectors >> 32); buf[103] = (sectors >> 48); + if (candelete && !ro) { + buf[69] |= ATA_SUPPORT_RZAT | ATA_SUPPORT_DRAT; + buf[105] = 1; + buf[169] = ATA_SUPPORT_DSM_TRIM; + } + buf[106] = 0x4000; + buf[209] = 0x4000; + if (psectsz > sectsz) { + buf[106] |= 0x2000; + buf[106] |= ffsl(psectsz / sectsz) - 1; + buf[209] |= (psectoff / sectsz); + } + if (sectsz > 512) { + buf[106] |= 0x1000; + buf[117] = sectsz / 2; + buf[118] = ((sectsz / 2) >> 16); + } + buf[119] = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14); + buf[120] = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14); + buf[222] = 0x1020; + buf[255] = 0x00a5; + ahci_checksum((uint8_t *)buf, sizeof(buf)); ahci_write_fis_piosetup(p); write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); - p->tfd = ATA_S_DSC | ATA_S_READY; - p->is |= AHCI_P_IX_DP; - p->ci &= ~(1 << slot); + ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); } - ahci_generate_intr(p->pr_sc); } static void handle_atapi_identify(struct ahci_port *p, int slot, uint8_t *cfis) { if (!p->atapi) { - p->tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR; - p->is |= AHCI_P_IX_TFE; + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); } else { uint16_t buf[256]; memset(buf, 0, sizeof(buf)); buf[0] = (2 << 14 | 5 << 8 | 1 << 7 | 2 << 5); - /* TODO emulate different serial? 
*/ - ata_string((uint8_t *)(buf+10), "123456", 20); + ata_string((uint8_t *)(buf+10), p->ident, 20); ata_string((uint8_t *)(buf+23), "001", 8); ata_string((uint8_t *)(buf+27), "BHYVE SATA DVD ROM", 40); buf[49] = (1 << 9 | 1 << 8); @@ -762,27 +1097,34 @@ handle_atapi_identify(struct ahci_port *p, int slot, uint8_t *cfis) buf[53] = (1 << 2 | 1 << 1); buf[62] = 0x3f; buf[63] = 7; + if (p->xfermode & ATA_WDMA0) + buf[63] |= (1 << ((p->xfermode & 7) + 8)); buf[64] = 3; - buf[65] = 100; - buf[66] = 100; - buf[67] = 100; - buf[68] = 100; - buf[76] = (1 << 2 | 1 << 1); + buf[65] = 120; + buf[66] = 120; + buf[67] = 120; + buf[68] = 120; + buf[76] = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3); + buf[77] = ((p->ssts & ATA_SS_SPD_MASK) >> 3); buf[78] = (1 << 5); - buf[80] = (0x1f << 4); - buf[82] = (1 << 4); + buf[80] = 0x3f0; + buf[82] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET | + ATA_SUPPORT_RESET | ATA_SUPPORT_NOP); buf[83] = (1 << 14); buf[84] = (1 << 14); - buf[85] = (1 << 4); + buf[85] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET | + ATA_SUPPORT_RESET | ATA_SUPPORT_NOP); buf[87] = (1 << 14); - buf[88] = (1 << 14 | 0x7f); + buf[88] = 0x7f; + if (p->xfermode & ATA_UDMA0) + buf[88] |= (1 << ((p->xfermode & 7) + 8)); + buf[222] = 0x1020; + buf[255] = 0x00a5; + ahci_checksum((uint8_t *)buf, sizeof(buf)); ahci_write_fis_piosetup(p); write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); - p->tfd = ATA_S_DSC | ATA_S_READY; - p->is |= AHCI_P_IX_DHR; - p->ci &= ~(1 << slot); + ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); } - ahci_generate_intr(p->pr_sc); } static void @@ -791,22 +1133,41 @@ atapi_inquiry(struct ahci_port *p, int slot, uint8_t *cfis) uint8_t buf[36]; uint8_t *acmd; int len; + uint32_t tfd; acmd = cfis + 0x40; - buf[0] = 0x05; - buf[1] = 0x80; - buf[2] = 0x00; - buf[3] = 0x21; - buf[4] = 31; - buf[5] = 0; - buf[6] = 0; - buf[7] = 0; - atapi_string(buf + 8, "BHYVE", 8); - atapi_string(buf + 16, "BHYVE DVD-ROM", 16); - atapi_string(buf + 32, "001", 4); - - len = sizeof(buf); + if (acmd[1] & 1) { /* VPD */ + if (acmd[2] == 0) { /* Supported VPD pages */ + buf[0] = 0x05; + buf[1] = 0; + buf[2] = 0; + buf[3] = 1; + buf[4] = 0; + len = 4 + buf[3]; + } else { + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x24; + tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, tfd); + return; + } + } else { + buf[0] = 0x05; + buf[1] = 0x80; + buf[2] = 0x00; + buf[3] = 0x21; + buf[4] = 31; + buf[5] = 0; + buf[6] = 0; + buf[7] = 0; + atapi_string(buf + 8, "BHYVE", 8); + atapi_string(buf + 16, "BHYVE DVD-ROM", 16); + atapi_string(buf + 32, "001", 4); + len = sizeof(buf); + } + if (len > acmd[4]) len = acmd[4]; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; @@ -918,10 +1279,9 @@ atapi_read_toc(struct ahci_port *p, int slot, uint8_t *cfis) { int msf, size; uint64_t sectors; - uint8_t start_track, *bp, buf[50]; + uint8_t *bp, buf[50]; msf = (acmd[1] >> 1) & 1; - start_track = acmd[6]; bp = buf + 2; *bp++ = 1; *bp++ = 1; @@ -1010,25 +1370,34 @@ atapi_read_toc(struct ahci_port *p, int slot, uint8_t *cfis) } static void -atapi_read(struct ahci_port *p, int slot, uint8_t *cfis, - uint32_t done, int seek) +atapi_report_luns(struct ahci_port *p, int slot, uint8_t *cfis) +{ + uint8_t buf[16]; + + memset(buf, 0, sizeof(buf)); + buf[3] = 8; + + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + write_prdt(p, slot, cfis, buf, sizeof(buf)); + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); +} + 
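+/*
+ * Note: atapi_read() below, like ahci_handle_rw() above, no longer walks
+ * the PRDT itself.  ahci_build_iov() assembles at most BLOCKIF_IOV_MAX
+ * entries per blockif request, rounds a clipped transfer down to a
+ * sector multiple, and advances aior->done; when aior->more is set, the
+ * completion callback re-enters here with the new offset to issue the
+ * remainder.
+ */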
+static void +atapi_read(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) { struct ahci_ioreq *aior; struct ahci_cmd_hdr *hdr; struct ahci_prdt_entry *prdt; struct blockif_req *breq; - struct pci_ahci_softc *sc; uint8_t *acmd; uint64_t lba; uint32_t len; - int i, err, iovcnt; + int err; - sc = p->pr_sc; acmd = cfis + 0x40; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); prdt = (struct ahci_prdt_entry *)(cfis + 0x80); - prdt += seek; lba = be32dec(acmd + 2); if (acmd[0] == READ_10) len = be16dec(acmd + 7); @@ -1053,37 +1422,14 @@ atapi_read(struct ahci_port *p, int slot, uint8_t *cfis, aior->done = done; breq = &aior->io_req; breq->br_offset = lba + done; - iovcnt = hdr->prdtl - seek; - if (iovcnt > BLOCKIF_IOV_MAX) { - aior->prdtl = iovcnt - BLOCKIF_IOV_MAX; - iovcnt = BLOCKIF_IOV_MAX; - } else - aior->prdtl = 0; - breq->br_iovcnt = iovcnt; + ahci_build_iov(p, aior, prdt, hdr->prdtl); - /* - * Mark this command in-flight. - */ + /* Mark this command in-flight. */ p->pending |= 1 << slot; - /* - * Stuff request onto busy list - */ + /* Stuff request onto busy list. */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); - /* - * Build up the iovec based on the prdt - */ - for (i = 0; i < iovcnt; i++) { - uint32_t dbcsz; - - dbcsz = (prdt->dbc & DBCMASK) + 1; - breq->br_iov[i].iov_base = paddr_guest2host(ahci_ctx(sc), - prdt->dba, dbcsz); - breq->br_iov[i].iov_len = dbcsz; - aior->done += dbcsz; - prdt++; - } err = blockif_read(p->bctx, breq); assert(err == 0); } @@ -1137,7 +1483,7 @@ static void atapi_mode_sense(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd; - uint32_t tfd; + uint32_t tfd = 0; uint8_t pc, code; int len; @@ -1278,9 +1624,12 @@ handle_packet_cmd(struct ahci_port *p, int slot, uint8_t *cfis) case READ_TOC: atapi_read_toc(p, slot, cfis); break; + case REPORT_LUNS: + atapi_report_luns(p, slot, cfis); + break; case READ_10: case READ_12: - atapi_read(p, slot, cfis, 0, 0); + atapi_read(p, slot, cfis, 0); break; case REQUEST_SENSE: atapi_request_sense(p, slot, cfis); @@ -1308,6 +1657,7 @@ static void ahci_handle_cmd(struct ahci_port *p, int slot, uint8_t *cfis) { + p->tfd |= ATA_S_BUSY; switch (cfis[2]) { case ATA_ATA_IDENTIFY: handle_identify(p, slot, cfis); @@ -1363,28 +1713,68 @@ ahci_handle_cmd(struct ahci_port *p, int slot, uint8_t *cfis) p->mult_sectors = cfis[12]; p->tfd = ATA_S_DSC | ATA_S_READY; } - p->is |= AHCI_P_IX_DP; - p->ci &= ~(1 << slot); - ahci_generate_intr(p->pr_sc); + ahci_write_fis_d2h(p, slot, cfis, p->tfd); break; + case ATA_READ: + case ATA_WRITE: + case ATA_READ48: + case ATA_WRITE48: + case ATA_READ_MUL: + case ATA_WRITE_MUL: + case ATA_READ_MUL48: + case ATA_WRITE_MUL48: case ATA_READ_DMA: case ATA_WRITE_DMA: case ATA_READ_DMA48: case ATA_WRITE_DMA48: case ATA_READ_FPDMA_QUEUED: case ATA_WRITE_FPDMA_QUEUED: - ahci_handle_dma(p, slot, cfis, 0, 0); + ahci_handle_rw(p, slot, cfis, 0); break; case ATA_FLUSHCACHE: case ATA_FLUSHCACHE48: ahci_handle_flush(p, slot, cfis); break; - case ATA_STANDBY_CMD: + case ATA_DATA_SET_MANAGEMENT: + if (cfis[11] == 0 && cfis[3] == ATA_DSM_TRIM && + cfis[13] == 0 && cfis[12] == 1) { + ahci_handle_dsm_trim(p, slot, cfis, 0); + break; + } + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); + break; + case ATA_SEND_FPDMA_QUEUED: + if ((cfis[13] & 0x1f) == ATA_SFPDMA_DSM && + cfis[17] == 0 && cfis[16] == ATA_DSM_TRIM && + cfis[11] == 0 && cfis[3] == 1) { + ahci_handle_dsm_trim(p, slot, cfis, 0); + break; + } + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT 
<< 8) | ATA_S_READY | ATA_S_ERROR); + break; + case ATA_READ_LOG_EXT: + case ATA_READ_LOG_DMA_EXT: + ahci_handle_read_log(p, slot, cfis); break; + case ATA_SECURITY_FREEZE_LOCK: + case ATA_SMART_CMD: case ATA_NOP: + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); + break; + case ATA_CHECK_POWER_MODE: + cfis[12] = 0xff; /* always on */ + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); + break; + case ATA_STANDBY_CMD: case ATA_STANDBY_IMMEDIATE: + case ATA_IDLE_CMD: case ATA_IDLE_IMMEDIATE: case ATA_SLEEP: + case ATA_READ_VERIFY: + case ATA_READ_VERIFY48: ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; case ATA_ATAPI_IDENTIFY: @@ -1392,17 +1782,15 @@ ahci_handle_cmd(struct ahci_port *p, int slot, uint8_t *cfis) break; case ATA_PACKET_CMD: if (!p->atapi) { - p->tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR; - p->is |= AHCI_P_IX_TFE; - ahci_generate_intr(p->pr_sc); + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); } else handle_packet_cmd(p, slot, cfis); break; default: WPRINTF("Unsupported cmd:%02x\n", cfis[2]); - p->tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR; - p->is |= AHCI_P_IX_TFE; - ahci_generate_intr(p->pr_sc); + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); break; } } @@ -1411,19 +1799,25 @@ static void ahci_handle_slot(struct ahci_port *p, int slot) { struct ahci_cmd_hdr *hdr; +#ifdef AHCI_DEBUG struct ahci_prdt_entry *prdt; +#endif struct pci_ahci_softc *sc; uint8_t *cfis; - int cfl; +#ifdef AHCI_DEBUG + int cfl, i; +#endif sc = p->pr_sc; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); +#ifdef AHCI_DEBUG cfl = (hdr->flags & 0x1f) * 4; +#endif cfis = paddr_guest2host(ahci_ctx(sc), hdr->ctba, 0x80 + hdr->prdtl * sizeof(struct ahci_prdt_entry)); +#ifdef AHCI_DEBUG prdt = (struct ahci_prdt_entry *)(cfis + 0x80); -#ifdef AHCI_DEBUG DPRINTF("\ncfis:"); for (i = 0; i < cfl; i++) { if (i % 10 == 0) @@ -1459,20 +1853,23 @@ ahci_handle_slot(struct ahci_port *p, int slot) static void ahci_handle_port(struct ahci_port *p) { - int i; if (!(p->cmd & AHCI_P_CMD_ST)) return; /* * Search for any new commands to issue ignoring those that - * are already in-flight. + * are already in-flight. Stop if device is busy or in error. 
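+	 * The scan resumes from the saved CCS value, so outstanding slots
+	 * are serviced round-robin rather than always from slot 0.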
*/ - for (i = 0; (i < 32) && p->ci; i++) { - if ((p->ci & (1 << i)) && !(p->pending & (1 << i))) { + for (; (p->ci & ~p->pending) != 0; p->ccs = ((p->ccs + 1) & 31)) { + if ((p->tfd & (ATA_S_BUSY | ATA_S_DRQ)) != 0) + break; + if (p->waitforclear) + break; + if ((p->ci & ~p->pending & (1 << p->ccs)) != 0) { p->cmd &= ~AHCI_P_CMD_CCS_MASK; - p->cmd |= i << AHCI_P_CMD_CCS_SHIFT; - ahci_handle_slot(p, i); + p->cmd |= p->ccs << AHCI_P_CMD_CCS_SHIFT; + ahci_handle_slot(p, p->ccs); } } } @@ -1490,22 +1887,26 @@ ata_ioreq_cb(struct blockif_req *br, int err) struct pci_ahci_softc *sc; uint32_t tfd; uint8_t *cfis; - int pending, slot, ncq; + int slot, ncq, dsm; DPRINTF("%s %d\n", __func__, err); - ncq = 0; + ncq = dsm = 0; aior = br->br_param; p = aior->io_pr; cfis = aior->cfis; slot = aior->slot; - pending = aior->prdtl; sc = p->pr_sc; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || - cfis[2] == ATA_READ_FPDMA_QUEUED) + cfis[2] == ATA_READ_FPDMA_QUEUED || + cfis[2] == ATA_SEND_FPDMA_QUEUED) ncq = 1; + if (cfis[2] == ATA_DATA_SET_MANAGEMENT || + (cfis[2] == ATA_SEND_FPDMA_QUEUED && + (cfis[13] & 0x1f) == ATA_SFPDMA_DSM)) + dsm = 1; pthread_mutex_lock(&sc->mtx); @@ -1519,29 +1920,24 @@ ata_ioreq_cb(struct blockif_req *br, int err) */ STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); - if (pending && !err) { - ahci_handle_dma(p, slot, cfis, aior->done, - hdr->prdtl - pending); + if (!err) + hdr->prdbc = aior->done; + + if (!err && aior->more) { + if (dsm) + ahci_handle_dsm_trim(p, slot, cfis, aior->done); + else + ahci_handle_rw(p, slot, cfis, aior->done); goto out; } - if (!err && aior->done == aior->len) { + if (!err) tfd = ATA_S_READY | ATA_S_DSC; - if (ncq) - hdr->prdbc = 0; - else - hdr->prdbc = aior->len; - } else { + else tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR; - hdr->prdbc = 0; - if (ncq) - p->serr |= (1 << slot); - } - - if (ncq) { - p->sact &= ~(1 << slot); - ahci_write_fis_sdb(p, slot, tfd); - } else + if (ncq) + ahci_write_fis_sdb(p, slot, cfis, tfd); + else ahci_write_fis_d2h(p, slot, cfis, tfd); /* @@ -1550,6 +1946,7 @@ ata_ioreq_cb(struct blockif_req *br, int err) p->pending &= ~(1 << slot); ahci_check_stopped(p); + ahci_handle_port(p); out: pthread_mutex_unlock(&sc->mtx); DPRINTF("%s exit\n", __func__); @@ -1564,7 +1961,7 @@ atapi_ioreq_cb(struct blockif_req *br, int err) struct pci_ahci_softc *sc; uint8_t *cfis; uint32_t tfd; - int pending, slot; + int slot; DPRINTF("%s %d\n", __func__, err); @@ -1572,7 +1969,6 @@ atapi_ioreq_cb(struct blockif_req *br, int err) p = aior->io_pr; cfis = aior->cfis; slot = aior->slot; - pending = aior->prdtl; sc = p->pr_sc; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + aior->slot * AHCI_CL_SIZE); @@ -1588,21 +1984,21 @@ atapi_ioreq_cb(struct blockif_req *br, int err) */ STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); - if (pending && !err) { - atapi_read(p, slot, cfis, aior->done, hdr->prdtl - pending); + if (!err) + hdr->prdbc = aior->done; + + if (!err && aior->more) { + atapi_read(p, slot, cfis, aior->done); goto out; } - if (!err && aior->done == aior->len) { + if (!err) { tfd = ATA_S_READY | ATA_S_DSC; - hdr->prdbc = aior->len; } else { p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x21; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; - hdr->prdbc = 0; } - cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); @@ -1612,6 +2008,7 @@ atapi_ioreq_cb(struct blockif_req *br, int err) p->pending &= ~(1 << slot); ahci_check_stopped(p); + 
ahci_handle_port(p); out: pthread_mutex_unlock(&sc->mtx); DPRINTF("%s exit\n", __func__); @@ -1669,15 +2066,23 @@ pci_ahci_port_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value) break; case AHCI_P_IS: p->is &= ~value; + ahci_port_intr(p); break; case AHCI_P_IE: p->ie = value & 0xFDC000FF; - ahci_generate_intr(sc); + ahci_port_intr(p); break; case AHCI_P_CMD: { - p->cmd = value; - + p->cmd &= ~(AHCI_P_CMD_ST | AHCI_P_CMD_SUD | AHCI_P_CMD_POD | + AHCI_P_CMD_CLO | AHCI_P_CMD_FRE | AHCI_P_CMD_APSTE | + AHCI_P_CMD_ATAPI | AHCI_P_CMD_DLAE | AHCI_P_CMD_ALPE | + AHCI_P_CMD_ASP | AHCI_P_CMD_ICC_MASK); + p->cmd |= (AHCI_P_CMD_ST | AHCI_P_CMD_SUD | AHCI_P_CMD_POD | + AHCI_P_CMD_CLO | AHCI_P_CMD_FRE | AHCI_P_CMD_APSTE | + AHCI_P_CMD_ATAPI | AHCI_P_CMD_DLAE | AHCI_P_CMD_ALPE | + AHCI_P_CMD_ASP | AHCI_P_CMD_ICC_MASK) & value; + if (!(value & AHCI_P_CMD_ST)) { ahci_port_stop(p); } else { @@ -1701,10 +2106,14 @@ pci_ahci_port_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value) } if (value & AHCI_P_CMD_CLO) { - p->tfd = 0; + p->tfd &= ~(ATA_S_BUSY | ATA_S_DRQ); p->cmd &= ~AHCI_P_CMD_CLO; } + if (value & AHCI_P_CMD_ICC_MASK) { + p->cmd &= ~AHCI_P_CMD_ICC_MASK; + } + ahci_handle_port(p); break; } @@ -1714,10 +2123,10 @@ pci_ahci_port_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value) WPRINTF("pci_ahci_port: read only registers 0x%"PRIx64"\n", offset); break; case AHCI_P_SCTL: + p->sctl = value; if (!(p->cmd & AHCI_P_CMD_ST)) { if (value & ATA_SC_DET_RESET) ahci_port_reset(p); - p->sctl = value; } break; case AHCI_P_SERR: @@ -1751,16 +2160,19 @@ pci_ahci_host_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value) DPRINTF("pci_ahci_host: read only registers 0x%"PRIx64"\n", offset); break; case AHCI_GHC: - if (value & AHCI_GHC_HR) + if (value & AHCI_GHC_HR) { ahci_reset(sc); - else if (value & AHCI_GHC_IE) { - sc->ghc |= AHCI_GHC_IE; - ahci_generate_intr(sc); + break; } + if (value & AHCI_GHC_IE) + sc->ghc |= AHCI_GHC_IE; + else + sc->ghc &= ~AHCI_GHC_IE; + ahci_generate_intr(sc, 0xffffffff); break; case AHCI_IS: sc->is &= ~value; - ahci_generate_intr(sc); + ahci_generate_intr(sc, value); break; default: break; @@ -1774,7 +2186,7 @@ pci_ahci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, struct pci_ahci_softc *sc = pi->pi_arg; assert(baridx == 5); - assert(size == 4); + assert((offset % 4) == 0 && size == 4); pthread_mutex_lock(&sc->mtx); @@ -1863,24 +2275,29 @@ pci_ahci_port_read(struct pci_ahci_softc *sc, uint64_t offset) static uint64_t pci_ahci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, - uint64_t offset, int size) + uint64_t regoff, int size) { struct pci_ahci_softc *sc = pi->pi_arg; + uint64_t offset; uint32_t value; assert(baridx == 5); - assert(size == 4); + assert(size == 1 || size == 2 || size == 4); + assert((regoff & (size - 1)) == 0); pthread_mutex_lock(&sc->mtx); + offset = regoff & ~0x3; /* round down to a multiple of 4 bytes */ if (offset < AHCI_OFFSET) value = pci_ahci_host_read(sc, offset); else if (offset < AHCI_OFFSET + sc->ports * AHCI_STEP) value = pci_ahci_port_read(sc, offset); else { value = 0; - WPRINTF("pci_ahci: unknown i/o read offset 0x%"PRIx64"\n", offset); + WPRINTF("pci_ahci: unknown i/o read offset 0x%"PRIx64"\n", + regoff); } + value >>= 8 * (regoff & 0x3); pthread_mutex_unlock(&sc->mtx); @@ -1890,18 +2307,16 @@ pci_ahci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, static int pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts, int atapi) { - 
char bident[sizeof("XX:X:X")]; + char bident[sizeof("XX:XX:XX")]; struct blockif_ctxt *bctxt; struct pci_ahci_softc *sc; - int ret, slots; + int ret, slots, p; + MD5_CTX mdctx; + u_char digest[16]; + char *next, *next2; ret = 0; - if (opts == NULL) { - fprintf(stderr, "pci_ahci: backing device required\n"); - return (1); - } - #ifdef AHCI_DEBUG dbg = fopen("/tmp/log", "w+"); #endif @@ -1909,48 +2324,96 @@ pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts, int atapi) sc = calloc(1, sizeof(struct pci_ahci_softc)); pi->pi_arg = sc; sc->asc_pi = pi; - sc->ports = MAX_PORTS; + pthread_mutex_init(&sc->mtx, NULL); + sc->ports = 0; + sc->pi = 0; + slots = 32; + + for (p = 0; p < MAX_PORTS && opts != NULL; p++, opts = next) { + /* Identify and cut off type of present port. */ + if (strncmp(opts, "hd:", 3) == 0) { + atapi = 0; + opts += 3; + } else if (strncmp(opts, "cd:", 3) == 0) { + atapi = 1; + opts += 3; + } - /* - * Only use port 0 for a backing device. All other ports will be - * marked as unused - */ - sc->port[0].atapi = atapi; + /* Find and cut off the next port options. */ + next = strstr(opts, ",hd:"); + next2 = strstr(opts, ",cd:"); + if (next == NULL || (next2 != NULL && next2 < next)) + next = next2; + if (next != NULL) { + next[0] = 0; + next++; + } - /* - * Attempt to open the backing image. Use the PCI - * slot/func for the identifier string. - */ - snprintf(bident, sizeof(bident), "%d:%d", pi->pi_slot, pi->pi_func); - bctxt = blockif_open(opts, bident); - if (bctxt == NULL) { - ret = 1; - goto open_fail; - } - sc->port[0].bctx = bctxt; - sc->port[0].pr_sc = sc; + if (opts[0] == 0) + continue; - /* - * Allocate blockif request structures and add them - * to the free list - */ - pci_ahci_ioreq_init(&sc->port[0]); + /* + * Attempt to open the backing image. Use the PCI slot/func + * and the port number for the identifier string. + */ + snprintf(bident, sizeof(bident), "%d:%d:%d", pi->pi_slot, + pi->pi_func, p); + bctxt = blockif_open(opts, bident); + if (bctxt == NULL) { + sc->ports = p; + ret = 1; + goto open_fail; + } + sc->port[p].bctx = bctxt; + sc->port[p].pr_sc = sc; + sc->port[p].port = p; + sc->port[p].atapi = atapi; + +#ifndef __FreeBSD__ + /* + * Attempt to enable the write cache for this device, as the + * guest will issue FLUSH commands when it requires durability. + * + * Failure here is fine, since an always-sync device will not + * have an impact on correctness. + */ + (void) blockif_set_wce(bctxt, 1); +#endif - pthread_mutex_init(&sc->mtx, NULL); + /* + * Create an identifier for the backing file. 
+ * Use parts of the md5 sum of the filename + */ + MD5Init(&mdctx); + MD5Update(&mdctx, opts, strlen(opts)); + MD5Final(digest, &mdctx); + snprintf(sc->port[p].ident, AHCI_PORT_IDENT, + "BHYVE-%02X%02X-%02X%02X-%02X%02X", + digest[0], digest[1], digest[2], digest[3], digest[4], + digest[5]); + + /* + * Allocate blockif request structures and add them + * to the free list + */ + pci_ahci_ioreq_init(&sc->port[p]); + + sc->pi |= (1 << p); + if (sc->port[p].ioqsz < slots) + slots = sc->port[p].ioqsz; + } + sc->ports = p; /* Intel ICH8 AHCI */ - slots = sc->port[0].ioqsz; - if (slots > 32) - slots = 32; --slots; + if (sc->ports < DEF_PORTS) + sc->ports = DEF_PORTS; sc->cap = AHCI_CAP_64BIT | AHCI_CAP_SNCQ | AHCI_CAP_SSNTF | AHCI_CAP_SMPS | AHCI_CAP_SSS | AHCI_CAP_SALP | AHCI_CAP_SAL | AHCI_CAP_SCLO | (0x3 << AHCI_CAP_ISS_SHIFT)| AHCI_CAP_PMD | AHCI_CAP_SSC | AHCI_CAP_PSC | (slots << AHCI_CAP_NCS_SHIFT) | AHCI_CAP_SXS | (sc->ports - 1); - /* Only port 0 implemented */ - sc->pi = 1; sc->vs = 0x10300; sc->cap2 = AHCI_CAP2_APST; ahci_reset(sc); @@ -1960,7 +2423,9 @@ pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts, int atapi) pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_SATA); pci_set_cfgdata8(pi, PCIR_PROGIF, PCIP_STORAGE_SATA_AHCI_1_0); - pci_emul_add_msicap(pi, 1); + p = MIN(sc->ports, 16); + p = flsl(p) - ((p & (p - 1)) ? 0 : 1); + pci_emul_add_msicap(pi, 1 << p); pci_emul_alloc_bar(pi, 5, PCIBAR_MEM32, AHCI_OFFSET + sc->ports * AHCI_STEP); @@ -1968,7 +2433,10 @@ pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts, int atapi) open_fail: if (ret) { - blockif_close(sc->port[0].bctx); + for (p = 0; p < sc->ports; p++) { + if (sc->port[p].bctx != NULL) + blockif_close(sc->port[p].bctx); + } free(sc); } @@ -1992,6 +2460,14 @@ pci_ahci_atapi_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) /* * Use separate emulation names to distinguish drive and atapi devices */ +struct pci_devemu pci_de_ahci = { + .pe_emu = "ahci", + .pe_init = pci_ahci_hd_init, + .pe_barwrite = pci_ahci_write, + .pe_barread = pci_ahci_read +}; +PCI_EMUL_SET(pci_de_ahci); + struct pci_devemu pci_de_ahci_hd = { .pe_emu = "ahci-hd", .pe_init = pci_ahci_hd_init, diff --git a/usr/src/cmd/bhyve/pci_e82545.c b/usr/src/cmd/bhyve/pci_e82545.c new file mode 100644 index 0000000000..e211b5cf9c --- /dev/null +++ b/usr/src/cmd/bhyve/pci_e82545.c @@ -0,0 +1,2418 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 Alexander Motin + * Copyright (c) 2015 Peter Grehan + * Copyright (c) 2013 Jeremiah Lott, Avere Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#ifndef WITHOUT_CAPSICUM +#include <sys/capsicum.h> +#endif +#include <sys/limits.h> +#include <sys/ioctl.h> +#include <sys/uio.h> +#include <net/ethernet.h> +#include <netinet/in.h> +#include <netinet/tcp.h> +#ifndef __FreeBSD__ +#include <sys/filio.h> +#endif + +#ifndef WITHOUT_CAPSICUM +#include <capsicum_helpers.h> +#endif +#include <err.h> +#include <errno.h> +#include <fcntl.h> +#include <md5.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sysexits.h> +#include <unistd.h> +#include <pthread.h> +#include <pthread_np.h> + +#include "e1000_regs.h" +#include "e1000_defines.h" +#include "mii.h" + +#include "bhyverun.h" +#include "pci_emul.h" +#include "mevent.h" + +/* Hardware/register definitions XXX: move some to common code. */ +#define E82545_VENDOR_ID_INTEL 0x8086 +#define E82545_DEV_ID_82545EM_COPPER 0x100F +#define E82545_SUBDEV_ID 0x1008 + +#define E82545_REVISION_4 4 + +#define E82545_MDIC_DATA_MASK 0x0000FFFF +#define E82545_MDIC_OP_MASK 0x0c000000 +#define E82545_MDIC_IE 0x20000000 + +#define E82545_EECD_FWE_DIS 0x00000010 /* Flash writes disabled */ +#define E82545_EECD_FWE_EN 0x00000020 /* Flash writes enabled */ +#define E82545_EECD_FWE_MASK 0x00000030 /* Flash writes mask */ + +#define E82545_BAR_REGISTER 0 +#define E82545_BAR_REGISTER_LEN (128*1024) +#define E82545_BAR_FLASH 1 +#define E82545_BAR_FLASH_LEN (64*1024) +#define E82545_BAR_IO 2 +#define E82545_BAR_IO_LEN 8 + +#define E82545_IOADDR 0x00000000 +#define E82545_IODATA 0x00000004 +#define E82545_IO_REGISTER_MAX 0x0001FFFF +#define E82545_IO_FLASH_BASE 0x00080000 +#define E82545_IO_FLASH_MAX 0x000FFFFF + +#define E82545_ARRAY_ENTRY(reg, offset) (reg + (offset<<2)) +#define E82545_RAR_MAX 15 +#define E82545_MTA_MAX 127 +#define E82545_VFTA_MAX 127 + +/* Slightly modified from the driver versions, hardcoded for 3 opcode bits, + * followed by 6 address bits. + * TODO: make opcode bits and addr bits configurable? + * NVM Commands - Microwire */ +#define E82545_NVM_OPCODE_BITS 3 +#define E82545_NVM_ADDR_BITS 6 +#define E82545_NVM_DATA_BITS 16 +#define E82545_NVM_OPADDR_BITS (E82545_NVM_OPCODE_BITS + E82545_NVM_ADDR_BITS) +#define E82545_NVM_ADDR_MASK ((1 << E82545_NVM_ADDR_BITS)-1) +#define E82545_NVM_OPCODE_MASK \ + (((1 << E82545_NVM_OPCODE_BITS) - 1) << E82545_NVM_ADDR_BITS) +#define E82545_NVM_OPCODE_READ (0x6 << E82545_NVM_ADDR_BITS) /* read */ +#define E82545_NVM_OPCODE_WRITE (0x5 << E82545_NVM_ADDR_BITS) /* write */ +#define E82545_NVM_OPCODE_ERASE (0x7 << E82545_NVM_ADDR_BITS) /* erase */ +#define E82545_NVM_OPCODE_EWEN (0x4 << E82545_NVM_ADDR_BITS) /* wr-enable */ + +#define E82545_NVM_EEPROM_SIZE 64 /* 64 * 16-bit values == 128 bytes */ + +#define E1000_ICR_SRPD 0x00010000 + +/* This is an arbitrary number. There is no hard limit on the chip.
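+ * In this emulation it only bounds the iovec arrays in the transmit
+ * path, which are sized I82545_MAX_TXSEGS + 2; longer descriptor chains
+ * are dropped with a warning. Rough worked sizing, assuming the guest
+ * posts 2KB buffers: a full 64KB TSO payload maps to about 33 segments,
+ * well under the limit of 64.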
*/ +#define I82545_MAX_TXSEGS 64 + +/* Legacy receive descriptor */ +struct e1000_rx_desc { + uint64_t buffer_addr; /* Address of the descriptor's data buffer */ + uint16_t length; /* Length of data DMAed into data buffer */ + uint16_t csum; /* Packet checksum */ + uint8_t status; /* Descriptor status */ + uint8_t errors; /* Descriptor Errors */ + uint16_t special; +}; + +/* Transmit descriptor types */ +#define E1000_TXD_MASK (E1000_TXD_CMD_DEXT | 0x00F00000) +#define E1000_TXD_TYP_L (0) +#define E1000_TXD_TYP_C (E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_C) +#define E1000_TXD_TYP_D (E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D) + +/* Legacy transmit descriptor */ +struct e1000_tx_desc { + uint64_t buffer_addr; /* Address of the descriptor's data buffer */ + union { + uint32_t data; + struct { + uint16_t length; /* Data buffer length */ + uint8_t cso; /* Checksum offset */ + uint8_t cmd; /* Descriptor control */ + } flags; + } lower; + union { + uint32_t data; + struct { + uint8_t status; /* Descriptor status */ + uint8_t css; /* Checksum start */ + uint16_t special; + } fields; + } upper; +}; + +/* Context descriptor */ +struct e1000_context_desc { + union { + uint32_t ip_config; + struct { + uint8_t ipcss; /* IP checksum start */ + uint8_t ipcso; /* IP checksum offset */ + uint16_t ipcse; /* IP checksum end */ + } ip_fields; + } lower_setup; + union { + uint32_t tcp_config; + struct { + uint8_t tucss; /* TCP checksum start */ + uint8_t tucso; /* TCP checksum offset */ + uint16_t tucse; /* TCP checksum end */ + } tcp_fields; + } upper_setup; + uint32_t cmd_and_length; + union { + uint32_t data; + struct { + uint8_t status; /* Descriptor status */ + uint8_t hdr_len; /* Header length */ + uint16_t mss; /* Maximum segment size */ + } fields; + } tcp_seg_setup; +}; + +/* Data descriptor */ +struct e1000_data_desc { + uint64_t buffer_addr; /* Address of the descriptor's data buffer */ + union { + uint32_t data; + struct { + uint16_t length; /* Data buffer length */ + uint8_t typ_len_ext; + uint8_t cmd; + } flags; + } lower; + union { + uint32_t data; + struct { + uint8_t status; /* Descriptor status */ + uint8_t popts; /* Packet Options */ + uint16_t special; + } fields; + } upper; +}; + +union e1000_tx_udesc { + struct e1000_tx_desc td; + struct e1000_context_desc cd; + struct e1000_data_desc dd; +}; + +/* Tx checksum info for a packet. */ +struct ck_info { + int ck_valid; /* ck_info is valid */ + uint8_t ck_start; /* start byte of cksum calculation */ + uint8_t ck_off; /* offset of cksum insertion */ + uint16_t ck_len; /* length of cksum calc: 0 is to packet-end */ +}; + +/* + * Debug printf + */ +static int e82545_debug = 0; +#define DPRINTF(msg,params...) if (e82545_debug) fprintf(stderr, "e82545: " msg, params) +#define WPRINTF(msg,params...)
fprintf(stderr, "e82545: " msg, params) + +#define MIN(a,b) (((a)<(b))?(a):(b)) +#define MAX(a,b) (((a)>(b))?(a):(b)) + +/* s/w representation of the RAL/RAH regs */ +struct eth_uni { + int eu_valid; + int eu_addrsel; + struct ether_addr eu_eth; +}; + + +struct e82545_softc { + struct pci_devinst *esc_pi; + struct vmctx *esc_ctx; + struct mevent *esc_mevp; + struct mevent *esc_mevpitr; + pthread_mutex_t esc_mtx; + struct ether_addr esc_mac; + int esc_tapfd; + + /* General */ + uint32_t esc_CTRL; /* x0000 device ctl */ + uint32_t esc_FCAL; /* x0028 flow ctl addr lo */ + uint32_t esc_FCAH; /* x002C flow ctl addr hi */ + uint32_t esc_FCT; /* x0030 flow ctl type */ + uint32_t esc_VET; /* x0038 VLAN eth type */ + uint32_t esc_FCTTV; /* x0170 flow ctl tx timer */ + uint32_t esc_LEDCTL; /* x0E00 LED control */ + uint32_t esc_PBA; /* x1000 pkt buffer allocation */ + + /* Interrupt control */ + int esc_irq_asserted; + uint32_t esc_ICR; /* x00C0 cause read/clear */ + uint32_t esc_ITR; /* x00C4 intr throttling */ + uint32_t esc_ICS; /* x00C8 cause set */ + uint32_t esc_IMS; /* x00D0 mask set/read */ + uint32_t esc_IMC; /* x00D8 mask clear */ + + /* Transmit */ + union e1000_tx_udesc *esc_txdesc; + struct e1000_context_desc esc_txctx; + pthread_t esc_tx_tid; + pthread_cond_t esc_tx_cond; + int esc_tx_enabled; + int esc_tx_active; + uint32_t esc_TXCW; /* x0178 transmit config */ + uint32_t esc_TCTL; /* x0400 transmit ctl */ + uint32_t esc_TIPG; /* x0410 inter-packet gap */ + uint16_t esc_AIT; /* x0458 Adaptive Interframe Throttle */ + uint64_t esc_tdba; /* verified 64-bit desc table addr */ + uint32_t esc_TDBAL; /* x3800 desc table addr, low bits */ + uint32_t esc_TDBAH; /* x3804 desc table addr, hi 32-bits */ + uint32_t esc_TDLEN; /* x3808 # descriptors in bytes */ + uint16_t esc_TDH; /* x3810 desc table head idx */ + uint16_t esc_TDHr; /* internal read version of TDH */ + uint16_t esc_TDT; /* x3818 desc table tail idx */ + uint32_t esc_TIDV; /* x3820 intr delay */ + uint32_t esc_TXDCTL; /* x3828 desc control */ + uint32_t esc_TADV; /* x382C intr absolute delay */ + + /* L2 frame acceptance */ + struct eth_uni esc_uni[16]; /* 16 x unicast MAC addresses */ + uint32_t esc_fmcast[128]; /* Multicast filter bit-match */ + uint32_t esc_fvlan[128]; /* VLAN 4096-bit filter */ + + /* Receive */ + struct e1000_rx_desc *esc_rxdesc; + pthread_cond_t esc_rx_cond; + int esc_rx_enabled; + int esc_rx_active; + int esc_rx_loopback; + uint32_t esc_RCTL; /* x0100 receive ctl */ + uint32_t esc_FCRTL; /* x2160 flow cntl thresh, low */ + uint32_t esc_FCRTH; /* x2168 flow cntl thresh, hi */ + uint64_t esc_rdba; /* verified 64-bit desc table addr */ + uint32_t esc_RDBAL; /* x2800 desc table addr, low bits */ + uint32_t esc_RDBAH; /* x2804 desc table addr, hi 32-bits*/ + uint32_t esc_RDLEN; /* x2808 #descriptors */ + uint16_t esc_RDH; /* x2810 desc table head idx */ + uint16_t esc_RDT; /* x2818 desc table tail idx */ + uint32_t esc_RDTR; /* x2820 intr delay */ + uint32_t esc_RXDCTL; /* x2828 desc control */ + uint32_t esc_RADV; /* x282C intr absolute delay */ + uint32_t esc_RSRPD; /* x2C00 recv small packet detect */ + uint32_t esc_RXCSUM; /* x5000 receive cksum ctl */ + + /* IO Port register access */ + uint32_t io_addr; + + /* Shadow copy of MDIC */ + uint32_t mdi_control; + /* Shadow copy of EECD */ + uint32_t eeprom_control; + /* Latest NVM in/out */ + uint16_t nvm_data; + uint16_t nvm_opaddr; + /* stats */ + uint32_t missed_pkt_count; /* dropped for no room in rx queue */ + uint32_t pkt_rx_by_size[6]; + uint32_t 
pkt_tx_by_size[6]; + uint32_t good_pkt_rx_count; + uint32_t bcast_pkt_rx_count; + uint32_t mcast_pkt_rx_count; + uint32_t good_pkt_tx_count; + uint32_t bcast_pkt_tx_count; + uint32_t mcast_pkt_tx_count; + uint32_t oversize_rx_count; + uint32_t tso_tx_count; + uint64_t good_octets_rx; + uint64_t good_octets_tx; + uint64_t missed_octets; /* counts missed and oversized */ + + uint8_t nvm_bits:6; /* number of bits remaining in/out */ + uint8_t nvm_mode:2; +#define E82545_NVM_MODE_OPADDR 0x0 +#define E82545_NVM_MODE_DATAIN 0x1 +#define E82545_NVM_MODE_DATAOUT 0x2 + /* EEPROM data */ + uint16_t eeprom_data[E82545_NVM_EEPROM_SIZE]; +}; + +static void e82545_reset(struct e82545_softc *sc, int dev); +static void e82545_rx_enable(struct e82545_softc *sc); +static void e82545_rx_disable(struct e82545_softc *sc); +#ifdef __FreeBSD__ +static void e82545_tap_callback(int fd, enum ev_type type, void *param); +#endif +static void e82545_tx_start(struct e82545_softc *sc); +static void e82545_tx_enable(struct e82545_softc *sc); +static void e82545_tx_disable(struct e82545_softc *sc); + +static inline int +e82545_size_stat_index(uint32_t size) +{ + if (size <= 64) { + return 0; + } else if (size >= 1024) { + return 5; + } else { + /* should be 1-4 */ + return (ffs(size) - 6); + } +} + +static void +e82545_init_eeprom(struct e82545_softc *sc) +{ + uint16_t checksum, i; + + /* mac addr */ + sc->eeprom_data[NVM_MAC_ADDR] = ((uint16_t)sc->esc_mac.octet[0]) | + (((uint16_t)sc->esc_mac.octet[1]) << 8); + sc->eeprom_data[NVM_MAC_ADDR+1] = ((uint16_t)sc->esc_mac.octet[2]) | + (((uint16_t)sc->esc_mac.octet[3]) << 8); + sc->eeprom_data[NVM_MAC_ADDR+2] = ((uint16_t)sc->esc_mac.octet[4]) | + (((uint16_t)sc->esc_mac.octet[5]) << 8); + + /* pci ids */ + sc->eeprom_data[NVM_SUB_DEV_ID] = E82545_SUBDEV_ID; + sc->eeprom_data[NVM_SUB_VEN_ID] = E82545_VENDOR_ID_INTEL; + sc->eeprom_data[NVM_DEV_ID] = E82545_DEV_ID_82545EM_COPPER; + sc->eeprom_data[NVM_VEN_ID] = E82545_VENDOR_ID_INTEL; + + /* fill in the checksum */ + checksum = 0; + for (i = 0; i < NVM_CHECKSUM_REG; i++) { + checksum += sc->eeprom_data[i]; + } + checksum = NVM_SUM - checksum; + sc->eeprom_data[NVM_CHECKSUM_REG] = checksum; + DPRINTF("eeprom checksum: 0x%x\r\n", checksum); +} + +static void +e82545_write_mdi(struct e82545_softc *sc, uint8_t reg_addr, + uint8_t phy_addr, uint32_t data) +{ + DPRINTF("Write mdi reg:0x%x phy:0x%x data: 0x%x\r\n", reg_addr, phy_addr, data); +} + +static uint32_t +e82545_read_mdi(struct e82545_softc *sc, uint8_t reg_addr, + uint8_t phy_addr) +{ + //DPRINTF("Read mdi reg:0x%x phy:0x%x\r\n", reg_addr, phy_addr); + switch (reg_addr) { + case PHY_STATUS: + return (MII_SR_LINK_STATUS | MII_SR_AUTONEG_CAPS | + MII_SR_AUTONEG_COMPLETE); + case PHY_AUTONEG_ADV: + return NWAY_AR_SELECTOR_FIELD; + case PHY_LP_ABILITY: + return 0; + case PHY_1000T_STATUS: + return (SR_1000T_LP_FD_CAPS | SR_1000T_REMOTE_RX_STATUS | + SR_1000T_LOCAL_RX_STATUS); + case PHY_ID1: + return (M88E1011_I_PHY_ID >> 16) & 0xFFFF; + case PHY_ID2: + return (M88E1011_I_PHY_ID | E82545_REVISION_4) & 0xFFFF; + default: + DPRINTF("Unknown mdi read reg:0x%x phy:0x%x\r\n", reg_addr, phy_addr); + return 0; + } + /* not reached */ +} + +static void +e82545_eecd_strobe(struct e82545_softc *sc) +{ + /* Microwire state machine */ + /* + DPRINTF("eeprom state machine strobe " + "0x%x 0x%x 0x%x 0x%x\r\n", + sc->nvm_mode, sc->nvm_bits, + sc->nvm_opaddr, sc->nvm_data);*/ + + if (sc->nvm_bits == 0) { + DPRINTF("eeprom state machine not expecting data!
" + "0x%x 0x%x 0x%x 0x%x\r\n", + sc->nvm_mode, sc->nvm_bits, + sc->nvm_opaddr, sc->nvm_data); + return; + } + sc->nvm_bits--; + if (sc->nvm_mode == E82545_NVM_MODE_DATAOUT) { + /* shifting out */ + if (sc->nvm_data & 0x8000) { + sc->eeprom_control |= E1000_EECD_DO; + } else { + sc->eeprom_control &= ~E1000_EECD_DO; + } + sc->nvm_data <<= 1; + if (sc->nvm_bits == 0) { + /* read done, back to opcode mode. */ + sc->nvm_opaddr = 0; + sc->nvm_mode = E82545_NVM_MODE_OPADDR; + sc->nvm_bits = E82545_NVM_OPADDR_BITS; + } + } else if (sc->nvm_mode == E82545_NVM_MODE_DATAIN) { + /* shifting in */ + sc->nvm_data <<= 1; + if (sc->eeprom_control & E1000_EECD_DI) { + sc->nvm_data |= 1; + } + if (sc->nvm_bits == 0) { + /* eeprom write */ + uint16_t op = sc->nvm_opaddr & E82545_NVM_OPCODE_MASK; + uint16_t addr = sc->nvm_opaddr & E82545_NVM_ADDR_MASK; + if (op != E82545_NVM_OPCODE_WRITE) { + DPRINTF("Illegal eeprom write op 0x%x\r\n", + sc->nvm_opaddr); + } else if (addr >= E82545_NVM_EEPROM_SIZE) { + DPRINTF("Illegal eeprom write addr 0x%x\r\n", + sc->nvm_opaddr); + } else { + DPRINTF("eeprom write eeprom[0x%x] = 0x%x\r\n", + addr, sc->nvm_data); + sc->eeprom_data[addr] = sc->nvm_data; + } + /* back to opcode mode */ + sc->nvm_opaddr = 0; + sc->nvm_mode = E82545_NVM_MODE_OPADDR; + sc->nvm_bits = E82545_NVM_OPADDR_BITS; + } + } else if (sc->nvm_mode == E82545_NVM_MODE_OPADDR) { + sc->nvm_opaddr <<= 1; + if (sc->eeprom_control & E1000_EECD_DI) { + sc->nvm_opaddr |= 1; + } + if (sc->nvm_bits == 0) { + uint16_t op = sc->nvm_opaddr & E82545_NVM_OPCODE_MASK; + switch (op) { + case E82545_NVM_OPCODE_EWEN: + DPRINTF("eeprom write enable: 0x%x\r\n", + sc->nvm_opaddr); + /* back to opcode mode */ + sc->nvm_opaddr = 0; + sc->nvm_mode = E82545_NVM_MODE_OPADDR; + sc->nvm_bits = E82545_NVM_OPADDR_BITS; + break; + case E82545_NVM_OPCODE_READ: + { + uint16_t addr = sc->nvm_opaddr & + E82545_NVM_ADDR_MASK; + sc->nvm_mode = E82545_NVM_MODE_DATAOUT; + sc->nvm_bits = E82545_NVM_DATA_BITS; + if (addr < E82545_NVM_EEPROM_SIZE) { + sc->nvm_data = sc->eeprom_data[addr]; + DPRINTF("eeprom read: eeprom[0x%x] = 0x%x\r\n", + addr, sc->nvm_data); + } else { + DPRINTF("eeprom illegal read: 0x%x\r\n", + sc->nvm_opaddr); + sc->nvm_data = 0; + } + break; + } + case E82545_NVM_OPCODE_WRITE: + sc->nvm_mode = E82545_NVM_MODE_DATAIN; + sc->nvm_bits = E82545_NVM_DATA_BITS; + sc->nvm_data = 0; + break; + default: + DPRINTF("eeprom unknown op: 0x%x\r\r", + sc->nvm_opaddr); + /* back to opcode mode */ + sc->nvm_opaddr = 0; + sc->nvm_mode = E82545_NVM_MODE_OPADDR; + sc->nvm_bits = E82545_NVM_OPADDR_BITS; + } + } + } else { + DPRINTF("eeprom state machine wrong state! 
" + "0x%x 0x%x 0x%x 0x%x\r\n", + sc->nvm_mode, sc->nvm_bits, + sc->nvm_opaddr, sc->nvm_data); + } +} + +#ifdef __FreeBSD__ +static void +e82545_itr_callback(int fd, enum ev_type type, void *param) +{ + uint32_t new; + struct e82545_softc *sc = param; + + pthread_mutex_lock(&sc->esc_mtx); + new = sc->esc_ICR & sc->esc_IMS; + if (new && !sc->esc_irq_asserted) { + DPRINTF("itr callback: lintr assert %x\r\n", new); + sc->esc_irq_asserted = 1; + pci_lintr_assert(sc->esc_pi); + } else { + mevent_delete(sc->esc_mevpitr); + sc->esc_mevpitr = NULL; + } + pthread_mutex_unlock(&sc->esc_mtx); +} +#endif + +static void +e82545_icr_assert(struct e82545_softc *sc, uint32_t bits) +{ + uint32_t new; + + DPRINTF("icr assert: 0x%x\r\n", bits); + + /* + * An interrupt is only generated if bits are set that + * aren't already in the ICR, these bits are unmasked, + * and there isn't an interrupt already pending. + */ + new = bits & ~sc->esc_ICR & sc->esc_IMS; + sc->esc_ICR |= bits; + + if (new == 0) { + DPRINTF("icr assert: masked %x, ims %x\r\n", new, sc->esc_IMS); + } else if (sc->esc_mevpitr != NULL) { + DPRINTF("icr assert: throttled %x, ims %x\r\n", new, sc->esc_IMS); + } else if (!sc->esc_irq_asserted) { + DPRINTF("icr assert: lintr assert %x\r\n", new); + sc->esc_irq_asserted = 1; + pci_lintr_assert(sc->esc_pi); + if (sc->esc_ITR != 0) { +#ifdef __FreeBSD__ + sc->esc_mevpitr = mevent_add( + (sc->esc_ITR + 3905) / 3906, /* 256ns -> 1ms */ + EVF_TIMER, e82545_itr_callback, sc); +#endif + } + } +} + +static void +e82545_ims_change(struct e82545_softc *sc, uint32_t bits) +{ + uint32_t new; + + /* + * Changing the mask may allow previously asserted + * but masked interrupt requests to generate an interrupt. + */ + new = bits & sc->esc_ICR & ~sc->esc_IMS; + sc->esc_IMS |= bits; + + if (new == 0) { + DPRINTF("ims change: masked %x, ims %x\r\n", new, sc->esc_IMS); + } else if (sc->esc_mevpitr != NULL) { + DPRINTF("ims change: throttled %x, ims %x\r\n", new, sc->esc_IMS); + } else if (!sc->esc_irq_asserted) { + DPRINTF("ims change: lintr assert %x\n\r", new); + sc->esc_irq_asserted = 1; + pci_lintr_assert(sc->esc_pi); + if (sc->esc_ITR != 0) { +#ifdef __FreeBSD__ + sc->esc_mevpitr = mevent_add( + (sc->esc_ITR + 3905) / 3906, /* 256ns -> 1ms */ + EVF_TIMER, e82545_itr_callback, sc); +#endif + } + } +} + +static void +e82545_icr_deassert(struct e82545_softc *sc, uint32_t bits) +{ + + DPRINTF("icr deassert: 0x%x\r\n", bits); + sc->esc_ICR &= ~bits; + + /* + * If there are no longer any interrupt sources and there + * was an asserted interrupt, clear it + */ + if (sc->esc_irq_asserted && !(sc->esc_ICR & sc->esc_IMS)) { + DPRINTF("icr deassert: lintr deassert %x\r\n", bits); + pci_lintr_deassert(sc->esc_pi); + sc->esc_irq_asserted = 0; + } +} + +static void +e82545_intr_write(struct e82545_softc *sc, uint32_t offset, uint32_t value) +{ + + DPRINTF("intr_write: off %x, val %x\n\r", offset, value); + + switch (offset) { + case E1000_ICR: + e82545_icr_deassert(sc, value); + break; + case E1000_ITR: + sc->esc_ITR = value; + break; + case E1000_ICS: + sc->esc_ICS = value; /* not used: store for debug */ + e82545_icr_assert(sc, value); + break; + case E1000_IMS: + e82545_ims_change(sc, value); + break; + case E1000_IMC: + sc->esc_IMC = value; /* for debug */ + sc->esc_IMS &= ~value; + // XXX clear interrupts if all ICR bits now masked + // and interrupt was pending ? 
+ break; + default: + break; + } +} + +static uint32_t +e82545_intr_read(struct e82545_softc *sc, uint32_t offset) +{ + uint32_t retval; + + retval = 0; + + DPRINTF("intr_read: off %x\n\r", offset); + + switch (offset) { + case E1000_ICR: + retval = sc->esc_ICR; + sc->esc_ICR = 0; + e82545_icr_deassert(sc, ~0); + break; + case E1000_ITR: + retval = sc->esc_ITR; + break; + case E1000_ICS: + /* write-only register */ + break; + case E1000_IMS: + retval = sc->esc_IMS; + break; + case E1000_IMC: + /* write-only register */ + break; + default: + break; + } + + return (retval); +} + +static void +e82545_devctl(struct e82545_softc *sc, uint32_t val) +{ + + sc->esc_CTRL = val & ~E1000_CTRL_RST; + + if (val & E1000_CTRL_RST) { + DPRINTF("e1k: s/w reset, ctl %x\n", val); + e82545_reset(sc, 1); + } + /* XXX check for phy reset ? */ +} + +static void +e82545_rx_update_rdba(struct e82545_softc *sc) +{ + + /* XXX verify desc base/len within phys mem range */ + sc->esc_rdba = (uint64_t)sc->esc_RDBAH << 32 | + sc->esc_RDBAL; + + /* Cache host mapping of guest descriptor array */ + sc->esc_rxdesc = paddr_guest2host(sc->esc_ctx, + sc->esc_rdba, sc->esc_RDLEN); +} + +static void +e82545_rx_ctl(struct e82545_softc *sc, uint32_t val) +{ + int on; + + on = ((val & E1000_RCTL_EN) == E1000_RCTL_EN); + + /* Save RCTL after stripping reserved bits 31:27,24,21,14,11:10,0 */ + sc->esc_RCTL = val & ~0xF9204c01; + + DPRINTF("rx_ctl - %s RCTL %x, val %x\n", + on ? "on" : "off", sc->esc_RCTL, val); + + /* state change requested */ + if (on != sc->esc_rx_enabled) { + if (on) { + /* Catch disallowed/unimplemented settings */ + //assert(!(val & E1000_RCTL_LBM_TCVR)); + + if (sc->esc_RCTL & E1000_RCTL_LBM_TCVR) { + sc->esc_rx_loopback = 1; + } else { + sc->esc_rx_loopback = 0; + } + + e82545_rx_update_rdba(sc); + e82545_rx_enable(sc); + } else { + e82545_rx_disable(sc); + sc->esc_rx_loopback = 0; + sc->esc_rdba = 0; + sc->esc_rxdesc = NULL; + } + } +} + +static void +e82545_tx_update_tdba(struct e82545_softc *sc) +{ + + /* XXX verify desc base/len within phys mem range */ + sc->esc_tdba = (uint64_t)sc->esc_TDBAH << 32 | sc->esc_TDBAL; + + /* Cache host mapping of guest descriptor array */ + sc->esc_txdesc = paddr_guest2host(sc->esc_ctx, sc->esc_tdba, + sc->esc_TDLEN); +} + +static void +e82545_tx_ctl(struct e82545_softc *sc, uint32_t val) +{ + int on; + + on = ((val & E1000_TCTL_EN) == E1000_TCTL_EN); + + /* ignore TCTL_EN settings that don't change state */ + if (on == sc->esc_tx_enabled) + return; + + if (on) { + e82545_tx_update_tdba(sc); + e82545_tx_enable(sc); + } else { + e82545_tx_disable(sc); + sc->esc_tdba = 0; + sc->esc_txdesc = NULL; + } + + /* Save TCTL value after stripping reserved bits 31:25,23,2,0 */ + sc->esc_TCTL = val & ~0xFE800005; +} + +int +e82545_bufsz(uint32_t rctl) +{ + + switch (rctl & (E1000_RCTL_BSEX | E1000_RCTL_SZ_256)) { + case (E1000_RCTL_SZ_2048): return (2048); + case (E1000_RCTL_SZ_1024): return (1024); + case (E1000_RCTL_SZ_512): return (512); + case (E1000_RCTL_SZ_256): return (256); + case (E1000_RCTL_BSEX|E1000_RCTL_SZ_16384): return (16384); + case (E1000_RCTL_BSEX|E1000_RCTL_SZ_8192): return (8192); + case (E1000_RCTL_BSEX|E1000_RCTL_SZ_4096): return (4096); + } + return (256); /* Forbidden value. 
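+ * Exactly one encoding falls through to here: BSEX set together with
+ * the 2048-byte size code, a combination the real NIC reserves. Note
+ * that the two size bits are reused between ranges, e.g.
+ * E1000_RCTL_SZ_512 alone decodes to 512 bytes while the same bits with
+ * E1000_RCTL_BSEX (E1000_RCTL_SZ_8192) decode to 8192.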
*/ +} + +#ifdef __FreeBSD__ +static uint8_t dummybuf[2048]; + +/* XXX one packet at a time until this is debugged */ +static void +e82545_tap_callback(int fd, enum ev_type type, void *param) +{ + struct e82545_softc *sc = param; + struct e1000_rx_desc *rxd; + struct iovec vec[64]; + int left, len, lim, maxpktsz, maxpktdesc, bufsz, i, n, size; + uint32_t cause = 0; + uint16_t *tp, tag, head; + + pthread_mutex_lock(&sc->esc_mtx); + DPRINTF("rx_run: head %x, tail %x\r\n", sc->esc_RDH, sc->esc_RDT); + + if (!sc->esc_rx_enabled || sc->esc_rx_loopback) { + DPRINTF("rx disabled (!%d || %d) -- packet(s) dropped\r\n", + sc->esc_rx_enabled, sc->esc_rx_loopback); + while (read(sc->esc_tapfd, dummybuf, sizeof(dummybuf)) > 0) { + } + goto done1; + } + bufsz = e82545_bufsz(sc->esc_RCTL); + maxpktsz = (sc->esc_RCTL & E1000_RCTL_LPE) ? 16384 : 1522; + maxpktdesc = (maxpktsz + bufsz - 1) / bufsz; + size = sc->esc_RDLEN / 16; + head = sc->esc_RDH; + left = (size + sc->esc_RDT - head) % size; + if (left < maxpktdesc) { + DPRINTF("rx overflow (%d < %d) -- packet(s) dropped\r\n", + left, maxpktdesc); + while (read(sc->esc_tapfd, dummybuf, sizeof(dummybuf)) > 0) { + } + goto done1; + } + + sc->esc_rx_active = 1; + pthread_mutex_unlock(&sc->esc_mtx); + + for (lim = size / 4; lim > 0 && left >= maxpktdesc; lim -= n) { + + /* Grab rx descriptor pointed to by the head pointer */ + for (i = 0; i < maxpktdesc; i++) { + rxd = &sc->esc_rxdesc[(head + i) % size]; + vec[i].iov_base = paddr_guest2host(sc->esc_ctx, + rxd->buffer_addr, bufsz); + vec[i].iov_len = bufsz; + } + len = readv(sc->esc_tapfd, vec, maxpktdesc); + if (len <= 0) { + DPRINTF("tap: readv() returned %d\n", len); + goto done; + } + + /* + * Adjust the packet length based on whether the CRC needs + * to be stripped or if the packet is less than the minimum + * eth packet size. + */ + if (len < ETHER_MIN_LEN - ETHER_CRC_LEN) + len = ETHER_MIN_LEN - ETHER_CRC_LEN; + if (!(sc->esc_RCTL & E1000_RCTL_SECRC)) + len += ETHER_CRC_LEN; + n = (len + bufsz - 1) / bufsz; + + DPRINTF("packet read %d bytes, %d segs, head %d\r\n", + len, n, head); + + /* Apply VLAN filter. */ + tp = (uint16_t *)vec[0].iov_base + 6; + if ((sc->esc_RCTL & E1000_RCTL_VFE) && + (ntohs(tp[0]) == sc->esc_VET)) { + tag = ntohs(tp[1]) & 0x0fff; + if ((sc->esc_fvlan[tag >> 5] & + (1 << (tag & 0x1f))) != 0) { + DPRINTF("known VLAN %d\r\n", tag); + } else { + DPRINTF("unknown VLAN %d\r\n", tag); + n = 0; + continue; + } + } + + /* Update all consumed descriptors. */ + for (i = 0; i < n - 1; i++) { + rxd = &sc->esc_rxdesc[(head + i) % size]; + rxd->length = bufsz; + rxd->csum = 0; + rxd->errors = 0; + rxd->special = 0; + rxd->status = E1000_RXD_STAT_DD; + } + rxd = &sc->esc_rxdesc[(head + i) % size]; + rxd->length = len % bufsz; + rxd->csum = 0; + rxd->errors = 0; + rxd->special = 0; + /* XXX signal no checksum for now */ + rxd->status = E1000_RXD_STAT_PIF | E1000_RXD_STAT_IXSM | + E1000_RXD_STAT_EOP | E1000_RXD_STAT_DD; + + /* Schedule receive interrupts. */ + if (len <= sc->esc_RSRPD) { + cause |= E1000_ICR_SRPD | E1000_ICR_RXT0; + } else { + /* XXX: RDRT and RADV timers should be here. 
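+ * (RDTR would postpone this RXT0 by its delay, counted in 1.024us
+ * units, after the last received packet, and RADV would cap the total
+ * postponement; as written, every frame interrupts immediately, with
+ * small frames additionally raising SRPD above. A minimal sketch with
+ * hypothetical names, reusing the EVF_TIMER machinery the ITR path
+ * uses:
+ *
+ *	if (sc->esc_RDTR != 0 && sc->esc_mevp_rdtr == NULL)
+ *		sc->esc_mevp_rdtr = mevent_add((sc->esc_RDTR + 976) / 977,
+ *		    EVF_TIMER, e82545_rdtr_callback, sc);
+ *
+ * where the callback would assert the deferred cause bits.)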
*/ + cause |= E1000_ICR_RXT0; + } + + head = (head + n) % size; + left -= n; + } + +done: + pthread_mutex_lock(&sc->esc_mtx); + sc->esc_rx_active = 0; + if (sc->esc_rx_enabled == 0) + pthread_cond_signal(&sc->esc_rx_cond); + + sc->esc_RDH = head; + /* Respect E1000_RCTL_RDMTS */ + left = (size + sc->esc_RDT - head) % size; + if (left < (size >> (((sc->esc_RCTL >> 8) & 3) + 1))) + cause |= E1000_ICR_RXDMT0; + /* Assert all accumulated interrupts. */ + if (cause != 0) + e82545_icr_assert(sc, cause); +done1: + DPRINTF("rx_run done: head %x, tail %x\r\n", sc->esc_RDH, sc->esc_RDT); + pthread_mutex_unlock(&sc->esc_mtx); +} +#endif + +static uint16_t +e82545_carry(uint32_t sum) +{ + + sum = (sum & 0xFFFF) + (sum >> 16); + if (sum > 0xFFFF) + sum -= 0xFFFF; + return (sum); +} + +static uint16_t +#ifdef __FreeBSD__ +e82545_buf_checksum(uint8_t *buf, int len) +#else +e82545_buf_checksum(caddr_t buf, int len) +#endif +{ + int i; + uint32_t sum = 0; + + /* Checksum all the pairs of bytes first... */ + for (i = 0; i < (len & ~1U); i += 2) + sum += *((u_int16_t *)(buf + i)); + + /* + * If there's a single byte left over, checksum it, too. + * Network byte order is big-endian, so the remaining byte is + * the high byte. + */ + if (i < len) + sum += htons(buf[i] << 8); + + return (e82545_carry(sum)); +} + +static uint16_t +e82545_iov_checksum(struct iovec *iov, int iovcnt, int off, int len) +{ + int now, odd; + uint32_t sum = 0, s; + + /* Skip completely unneeded vectors. */ + while (iovcnt > 0 && iov->iov_len <= off && off > 0) { + off -= iov->iov_len; + iov++; + iovcnt--; + } + + /* Calculate checksum of requested range. */ + odd = 0; + while (len > 0 && iovcnt > 0) { + now = MIN(len, iov->iov_len - off); + s = e82545_buf_checksum(iov->iov_base + off, now); + sum += odd ? (s << 8) : s; + odd ^= (now & 1); + len -= now; + off = 0; + iov++; + iovcnt--; + } + + return (e82545_carry(sum)); +} + +/* + * Return the transmit descriptor type. + */ +int +e82545_txdesc_type(uint32_t lower) +{ + int type; + + type = 0; + + if (lower & E1000_TXD_CMD_DEXT) + type = lower & E1000_TXD_MASK; + + return (type); +} + +static void +e82545_transmit_checksum(struct iovec *iov, int iovcnt, struct ck_info *ck) +{ + uint16_t cksum; + int cklen; + + DPRINTF("tx cksum: iovcnt/s/off/len %d/%d/%d/%d\r\n", + iovcnt, ck->ck_start, ck->ck_off, ck->ck_len); + cklen = ck->ck_len ? 
ck->ck_len - ck->ck_start + 1 : INT_MAX; + cksum = e82545_iov_checksum(iov, iovcnt, ck->ck_start, cklen); + *(uint16_t *)((uint8_t *)iov[0].iov_base + ck->ck_off) = ~cksum; +} + +static void +e82545_transmit_backend(struct e82545_softc *sc, struct iovec *iov, int iovcnt) +{ + + if (sc->esc_tapfd == -1) + return; + + (void) writev(sc->esc_tapfd, iov, iovcnt); +} + +static void +e82545_transmit_done(struct e82545_softc *sc, uint16_t head, uint16_t tail, + uint16_t dsize, int *tdwb) +{ + union e1000_tx_udesc *dsc; + + for ( ; head != tail; head = (head + 1) % dsize) { + dsc = &sc->esc_txdesc[head]; + if (dsc->td.lower.data & E1000_TXD_CMD_RS) { + dsc->td.upper.data |= E1000_TXD_STAT_DD; + *tdwb = 1; + } + } +} + +static int +e82545_transmit(struct e82545_softc *sc, uint16_t head, uint16_t tail, + uint16_t dsize, uint16_t *rhead, int *tdwb) +{ +#ifdef __FreeBSD__ + uint8_t *hdr, *hdrp; +#else + caddr_t hdr, hdrp; +#endif + struct iovec iovb[I82545_MAX_TXSEGS + 2]; + struct iovec tiov[I82545_MAX_TXSEGS + 2]; + struct e1000_context_desc *cd; + struct ck_info ckinfo[2]; + struct iovec *iov; + union e1000_tx_udesc *dsc; + int desc, dtype, len, ntype, iovcnt, tlen, hdrlen, vlen, tcp, tso; + int mss, paylen, seg, tiovcnt, left, now, nleft, nnow, pv, pvoff; + uint32_t tcpsum, tcpseq; + uint16_t ipcs, tcpcs, ipid, ohead; + + ckinfo[0].ck_valid = ckinfo[1].ck_valid = 0; + iovcnt = 0; + tlen = 0; + ntype = 0; + tso = 0; + ohead = head; + hdr = NULL; + + /* iovb[0/1] may be used for writable copy of headers. */ + iov = &iovb[2]; + + for (desc = 0; ; desc++, head = (head + 1) % dsize) { + if (head == tail) { + *rhead = head; + return (0); + } + dsc = &sc->esc_txdesc[head]; + dtype = e82545_txdesc_type(dsc->td.lower.data); + + if (desc == 0) { + switch (dtype) { + case E1000_TXD_TYP_C: + DPRINTF("tx ctxt desc idx %d: %016jx " + "%08x%08x\r\n", + head, dsc->td.buffer_addr, + dsc->td.upper.data, dsc->td.lower.data); + /* Save context and return */ + sc->esc_txctx = dsc->cd; + goto done; + case E1000_TXD_TYP_L: + DPRINTF("tx legacy desc idx %d: %08x%08x\r\n", + head, dsc->td.upper.data, dsc->td.lower.data); + /* + * legacy cksum start valid in first descriptor + */ + ntype = dtype; + ckinfo[0].ck_start = dsc->td.upper.fields.css; + break; + case E1000_TXD_TYP_D: + DPRINTF("tx data desc idx %d: %08x%08x\r\n", + head, dsc->td.upper.data, dsc->td.lower.data); + ntype = dtype; + break; + default: + break; + } + } else { + /* Descriptor type must be consistent */ + assert(dtype == ntype); + DPRINTF("tx next desc idx %d: %08x%08x\r\n", + head, dsc->td.upper.data, dsc->td.lower.data); + } + + len = (dtype == E1000_TXD_TYP_L) ? dsc->td.lower.flags.length : + dsc->dd.lower.data & 0xFFFFF; + + if (len > 0) { + /* Strip checksum supplied by guest. */ + if ((dsc->td.lower.data & E1000_TXD_CMD_EOP) != 0 && + (dsc->td.lower.data & E1000_TXD_CMD_IFCS) == 0) + len -= 2; + tlen += len; + if (iovcnt < I82545_MAX_TXSEGS) { + iov[iovcnt].iov_base = paddr_guest2host( + sc->esc_ctx, dsc->td.buffer_addr, len); + iov[iovcnt].iov_len = len; + } + iovcnt++; + } + + /* + * Pull out info that is valid in the final descriptor + * and exit descriptor loop. 
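+ * A typical offloaded send arrives as one context descriptor (saved
+ * into esc_txctx above), followed by one or more data descriptors; the
+ * final one carries EOP plus the POPTS bits that decide which of the
+ * two ck_info slots get filled in below.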
+ */ + if (dsc->td.lower.data & E1000_TXD_CMD_EOP) { + if (dtype == E1000_TXD_TYP_L) { + if (dsc->td.lower.data & E1000_TXD_CMD_IC) { + ckinfo[0].ck_valid = 1; + ckinfo[0].ck_off = + dsc->td.lower.flags.cso; + ckinfo[0].ck_len = 0; + } + } else { + cd = &sc->esc_txctx; + if (dsc->dd.lower.data & E1000_TXD_CMD_TSE) + tso = 1; + if (dsc->dd.upper.fields.popts & + E1000_TXD_POPTS_IXSM) + ckinfo[0].ck_valid = 1; + if (dsc->dd.upper.fields.popts & + E1000_TXD_POPTS_IXSM || tso) { + ckinfo[0].ck_start = + cd->lower_setup.ip_fields.ipcss; + ckinfo[0].ck_off = + cd->lower_setup.ip_fields.ipcso; + ckinfo[0].ck_len = + cd->lower_setup.ip_fields.ipcse; + } + if (dsc->dd.upper.fields.popts & + E1000_TXD_POPTS_TXSM) + ckinfo[1].ck_valid = 1; + if (dsc->dd.upper.fields.popts & + E1000_TXD_POPTS_TXSM || tso) { + ckinfo[1].ck_start = + cd->upper_setup.tcp_fields.tucss; + ckinfo[1].ck_off = + cd->upper_setup.tcp_fields.tucso; + ckinfo[1].ck_len = + cd->upper_setup.tcp_fields.tucse; + } + } + break; + } + } + + if (iovcnt > I82545_MAX_TXSEGS) { + WPRINTF("tx too many descriptors (%d > %d) -- dropped\r\n", + iovcnt, I82545_MAX_TXSEGS); + goto done; + } + + hdrlen = vlen = 0; + /* Estimate writable space for VLAN header insertion. */ + if ((sc->esc_CTRL & E1000_CTRL_VME) && + (dsc->td.lower.data & E1000_TXD_CMD_VLE)) { + hdrlen = ETHER_ADDR_LEN*2; + vlen = ETHER_VLAN_ENCAP_LEN; + } + if (!tso) { + /* Estimate required writable space for checksums. */ + if (ckinfo[0].ck_valid) + hdrlen = MAX(hdrlen, ckinfo[0].ck_off + 2); + if (ckinfo[1].ck_valid) + hdrlen = MAX(hdrlen, ckinfo[1].ck_off + 2); + /* Round up writable space to the first vector. */ + if (hdrlen != 0 && iov[0].iov_len > hdrlen && + iov[0].iov_len < hdrlen + 100) + hdrlen = iov[0].iov_len; + } else { + /* In case of TSO header length provided by software. */ + hdrlen = sc->esc_txctx.tcp_seg_setup.fields.hdr_len; + } + + /* Allocate, fill and prepend writable header vector. */ + if (hdrlen != 0) { + hdr = __builtin_alloca(hdrlen + vlen); + hdr += vlen; + for (left = hdrlen, hdrp = hdr; left > 0; + left -= now, hdrp += now) { + now = MIN(left, iov->iov_len); + memcpy(hdrp, iov->iov_base, now); + iov->iov_base += now; + iov->iov_len -= now; + if (iov->iov_len == 0) { + iov++; + iovcnt--; + } + } + iov--; + iovcnt++; + iov->iov_base = hdr; + iov->iov_len = hdrlen; + } + + /* Insert VLAN tag. */ + if (vlen != 0) { + hdr -= ETHER_VLAN_ENCAP_LEN; + memmove(hdr, hdr + ETHER_VLAN_ENCAP_LEN, ETHER_ADDR_LEN*2); + hdrlen += ETHER_VLAN_ENCAP_LEN; + hdr[ETHER_ADDR_LEN*2 + 0] = sc->esc_VET >> 8; + hdr[ETHER_ADDR_LEN*2 + 1] = sc->esc_VET & 0xff; + hdr[ETHER_ADDR_LEN*2 + 2] = dsc->td.upper.fields.special >> 8; + hdr[ETHER_ADDR_LEN*2 + 3] = dsc->td.upper.fields.special & 0xff; + iov->iov_base = hdr; + iov->iov_len += ETHER_VLAN_ENCAP_LEN; + /* Correct checksum offsets after VLAN tag insertion. */ + ckinfo[0].ck_start += ETHER_VLAN_ENCAP_LEN; + ckinfo[0].ck_off += ETHER_VLAN_ENCAP_LEN; + if (ckinfo[0].ck_len != 0) + ckinfo[0].ck_len += ETHER_VLAN_ENCAP_LEN; + ckinfo[1].ck_start += ETHER_VLAN_ENCAP_LEN; + ckinfo[1].ck_off += ETHER_VLAN_ENCAP_LEN; + if (ckinfo[1].ck_len != 0) + ckinfo[1].ck_len += ETHER_VLAN_ENCAP_LEN; + } + + /* Simple non-TSO case. */ + if (!tso) { + /* Calculate checksums and transmit. */ + if (ckinfo[0].ck_valid) + e82545_transmit_checksum(iov, iovcnt, &ckinfo[0]); + if (ckinfo[1].ck_valid) + e82545_transmit_checksum(iov, iovcnt, &ckinfo[1]); + e82545_transmit_backend(sc, iov, iovcnt); + goto done; + } + + /* Doing TSO. 
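+ * Illustrative numbers: with paylen 4500 and mss 1448, the loop below
+ * emits segments of 1448, 1448, 1448 and 156 bytes, each behind a
+ * fresh copy of the headers; the IPv4 ID advances by one per segment
+ * and the TCP sequence number by the payload bytes already sent.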
*/ + tcp = (sc->esc_txctx.cmd_and_length & E1000_TXD_CMD_TCP) != 0; + mss = sc->esc_txctx.tcp_seg_setup.fields.mss; + paylen = (sc->esc_txctx.cmd_and_length & 0x000fffff); + DPRINTF("tx %s segmentation offload %d+%d/%d bytes %d iovs\r\n", + tcp ? "TCP" : "UDP", hdrlen, paylen, mss, iovcnt); + ipid = ntohs(*(uint16_t *)&hdr[ckinfo[0].ck_start + 4]); + tcpseq = ntohl(*(uint32_t *)&hdr[ckinfo[1].ck_start + 4]); + ipcs = *(uint16_t *)&hdr[ckinfo[0].ck_off]; + tcpcs = 0; + if (ckinfo[1].ck_valid) /* Save partial pseudo-header checksum. */ + tcpcs = *(uint16_t *)&hdr[ckinfo[1].ck_off]; + pv = 1; + pvoff = 0; + for (seg = 0, left = paylen; left > 0; seg++, left -= now) { + now = MIN(left, mss); + + /* Construct IOVs for the segment. */ + /* Include whole original header. */ + tiov[0].iov_base = hdr; + tiov[0].iov_len = hdrlen; + tiovcnt = 1; + /* Include respective part of payload IOV. */ + for (nleft = now; pv < iovcnt && nleft > 0; nleft -= nnow) { + nnow = MIN(nleft, iov[pv].iov_len - pvoff); + tiov[tiovcnt].iov_base = iov[pv].iov_base + pvoff; + tiov[tiovcnt++].iov_len = nnow; + if (pvoff + nnow == iov[pv].iov_len) { + pv++; + pvoff = 0; + } else + pvoff += nnow; + } + DPRINTF("tx segment %d %d+%d bytes %d iovs\r\n", + seg, hdrlen, now, tiovcnt); + + /* Update IP header. */ + if (sc->esc_txctx.cmd_and_length & E1000_TXD_CMD_IP) { + /* IPv4 -- set length and ID */ + *(uint16_t *)&hdr[ckinfo[0].ck_start + 2] = + htons(hdrlen - ckinfo[0].ck_start + now); + *(uint16_t *)&hdr[ckinfo[0].ck_start + 4] = + htons(ipid + seg); + } else { + /* IPv6 -- set length */ + *(uint16_t *)&hdr[ckinfo[0].ck_start + 4] = + htons(hdrlen - ckinfo[0].ck_start - 40 + + now); + } + + /* Update pseudo-header checksum. */ + tcpsum = tcpcs; + tcpsum += htons(hdrlen - ckinfo[1].ck_start + now); + + /* Update TCP/UDP headers. */ + if (tcp) { + /* Update sequence number and FIN/PUSH flags. */ + *(uint32_t *)&hdr[ckinfo[1].ck_start + 4] = + htonl(tcpseq + paylen - left); + if (now < left) { + hdr[ckinfo[1].ck_start + 13] &= + ~(TH_FIN | TH_PUSH); + } + } else { + /* Update payload length. */ + *(uint32_t *)&hdr[ckinfo[1].ck_start + 4] = + hdrlen - ckinfo[1].ck_start + now; + } + + /* Calculate checksums and transmit. 
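+ * The IP header checksum is rebuilt from the template value ipcs saved
+ * before the loop, while tcpsum folds the guest's partial pseudo-header
+ * checksum together with this segment's TCP length. The fold in
+ * e82545_carry() is ones-complement, e.g. e82545_carry(0x2FFFE) is
+ * (0xFFFE + 0x2) - 0xFFFF = 1.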
*/ + if (ckinfo[0].ck_valid) { + *(uint16_t *)&hdr[ckinfo[0].ck_off] = ipcs; + e82545_transmit_checksum(tiov, tiovcnt, &ckinfo[0]); + } + if (ckinfo[1].ck_valid) { + *(uint16_t *)&hdr[ckinfo[1].ck_off] = + e82545_carry(tcpsum); + e82545_transmit_checksum(tiov, tiovcnt, &ckinfo[1]); + } + e82545_transmit_backend(sc, tiov, tiovcnt); + } + +done: + head = (head + 1) % dsize; + e82545_transmit_done(sc, ohead, head, dsize, tdwb); + + *rhead = head; + return (desc + 1); +} + +static void +e82545_tx_run(struct e82545_softc *sc) +{ + uint32_t cause; + uint16_t head, rhead, tail, size; + int lim, tdwb, sent; + + head = sc->esc_TDH; + tail = sc->esc_TDT; + size = sc->esc_TDLEN / 16; + DPRINTF("tx_run: head %x, rhead %x, tail %x\r\n", + sc->esc_TDH, sc->esc_TDHr, sc->esc_TDT); + + pthread_mutex_unlock(&sc->esc_mtx); + rhead = head; + tdwb = 0; + for (lim = size / 4; sc->esc_tx_enabled && lim > 0; lim -= sent) { + sent = e82545_transmit(sc, head, tail, size, &rhead, &tdwb); + if (sent == 0) + break; + head = rhead; + } + pthread_mutex_lock(&sc->esc_mtx); + + sc->esc_TDH = head; + sc->esc_TDHr = rhead; + cause = 0; + if (tdwb) + cause |= E1000_ICR_TXDW; + if (lim != size / 4 && sc->esc_TDH == sc->esc_TDT) + cause |= E1000_ICR_TXQE; + if (cause) + e82545_icr_assert(sc, cause); + + DPRINTF("tx_run done: head %x, rhead %x, tail %x\r\n", + sc->esc_TDH, sc->esc_TDHr, sc->esc_TDT); +} + +static void * +e82545_tx_thread(void *param) +{ + struct e82545_softc *sc = param; + + pthread_mutex_lock(&sc->esc_mtx); + for (;;) { + while (!sc->esc_tx_enabled || sc->esc_TDHr == sc->esc_TDT) { + if (sc->esc_tx_enabled && sc->esc_TDHr != sc->esc_TDT) + break; + sc->esc_tx_active = 0; + if (sc->esc_tx_enabled == 0) + pthread_cond_signal(&sc->esc_tx_cond); + pthread_cond_wait(&sc->esc_tx_cond, &sc->esc_mtx); + } + sc->esc_tx_active = 1; + + /* Process some tx descriptors. Lock dropped inside. 
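+ * (esc_tx_cond carries both directions of the handshake: this thread
+ * sleeps on it waiting for work, and e82545_tx_disable() sleeps on it
+ * waiting for esc_tx_active to drop, so a disable never observes a
+ * half-processed ring.)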
*/ + e82545_tx_run(sc); + } +#ifndef __FreeBSD__ + return (NULL); +#endif +} + +static void +e82545_tx_start(struct e82545_softc *sc) +{ + + if (sc->esc_tx_active == 0) + pthread_cond_signal(&sc->esc_tx_cond); +} + +static void +e82545_tx_enable(struct e82545_softc *sc) +{ + + sc->esc_tx_enabled = 1; +} + +static void +e82545_tx_disable(struct e82545_softc *sc) +{ + + sc->esc_tx_enabled = 0; + while (sc->esc_tx_active) + pthread_cond_wait(&sc->esc_tx_cond, &sc->esc_mtx); +} + +static void +e82545_rx_enable(struct e82545_softc *sc) +{ + + sc->esc_rx_enabled = 1; +} + +static void +e82545_rx_disable(struct e82545_softc *sc) +{ + + sc->esc_rx_enabled = 0; + while (sc->esc_rx_active) + pthread_cond_wait(&sc->esc_rx_cond, &sc->esc_mtx); +} + +static void +e82545_write_ra(struct e82545_softc *sc, int reg, uint32_t wval) +{ + struct eth_uni *eu; + int idx; + + idx = reg >> 1; + assert(idx < 15); + + eu = &sc->esc_uni[idx]; + + if (reg & 0x1) { + /* RAH */ + eu->eu_valid = ((wval & E1000_RAH_AV) == E1000_RAH_AV); + eu->eu_addrsel = (wval >> 16) & 0x3; + eu->eu_eth.octet[5] = wval >> 8; + eu->eu_eth.octet[4] = wval; + } else { + /* RAL */ + eu->eu_eth.octet[3] = wval >> 24; + eu->eu_eth.octet[2] = wval >> 16; + eu->eu_eth.octet[1] = wval >> 8; + eu->eu_eth.octet[0] = wval; + } +} + +static uint32_t +e82545_read_ra(struct e82545_softc *sc, int reg) +{ + struct eth_uni *eu; + uint32_t retval; + int idx; + + idx = reg >> 1; + assert(idx < 15); + + eu = &sc->esc_uni[idx]; + + if (reg & 0x1) { + /* RAH */ + retval = (eu->eu_valid << 31) | + (eu->eu_addrsel << 16) | + (eu->eu_eth.octet[5] << 8) | + eu->eu_eth.octet[4]; + } else { + /* RAL */ + retval = (eu->eu_eth.octet[3] << 24) | + (eu->eu_eth.octet[2] << 16) | + (eu->eu_eth.octet[1] << 8) | + eu->eu_eth.octet[0]; + } + + return (retval); +} + +static void +e82545_write_register(struct e82545_softc *sc, uint32_t offset, uint32_t value) +{ + int ridx; + + if (offset & 0x3) { + DPRINTF("Unaligned register write offset:0x%x value:0x%x\r\n", offset, value); + return; + } + DPRINTF("Register write: 0x%x value: 0x%x\r\n", offset, value); + + switch (offset) { + case E1000_CTRL: + case E1000_CTRL_DUP: + e82545_devctl(sc, value); + break; + case E1000_FCAL: + sc->esc_FCAL = value; + break; + case E1000_FCAH: + sc->esc_FCAH = value & ~0xFFFF0000; + break; + case E1000_FCT: + sc->esc_FCT = value & ~0xFFFF0000; + break; + case E1000_VET: + sc->esc_VET = value & ~0xFFFF0000; + break; + case E1000_FCTTV: + sc->esc_FCTTV = value & ~0xFFFF0000; + break; + case E1000_LEDCTL: + sc->esc_LEDCTL = value & ~0x30303000; + break; + case E1000_PBA: + sc->esc_PBA = value & 0x0000FF80; + break; + case E1000_ICR: + case E1000_ITR: + case E1000_ICS: + case E1000_IMS: + case E1000_IMC: + e82545_intr_write(sc, offset, value); + break; + case E1000_RCTL: + e82545_rx_ctl(sc, value); + break; + case E1000_FCRTL: + sc->esc_FCRTL = value & ~0xFFFF0007; + break; + case E1000_FCRTH: + sc->esc_FCRTH = value & ~0xFFFF0007; + break; + case E1000_RDBAL(0): + sc->esc_RDBAL = value & ~0xF; + if (sc->esc_rx_enabled) { + /* Apparently legal: update cached address */ + e82545_rx_update_rdba(sc); + } + break; + case E1000_RDBAH(0): + assert(!sc->esc_rx_enabled); + sc->esc_RDBAH = value; + break; + case E1000_RDLEN(0): + assert(!sc->esc_rx_enabled); + sc->esc_RDLEN = value & ~0xFFF0007F; + break; + case E1000_RDH(0): + /* XXX should only ever be zero ? Range check ? */ + sc->esc_RDH = value; + break; + case E1000_RDT(0): + /* XXX if this opens up the rx ring, do something ? 
*/ + sc->esc_RDT = value; + break; + case E1000_RDTR: + /* ignore FPD bit 31 */ + sc->esc_RDTR = value & ~0xFFFF0000; + break; + case E1000_RXDCTL(0): + sc->esc_RXDCTL = value & ~0xFEC0C0C0; + break; + case E1000_RADV: + sc->esc_RADV = value & ~0xFFFF0000; + break; + case E1000_RSRPD: + sc->esc_RSRPD = value & ~0xFFFFF000; + break; + case E1000_RXCSUM: + sc->esc_RXCSUM = value & ~0xFFFFF800; + break; + case E1000_TXCW: + sc->esc_TXCW = value & ~0x3FFF0000; + break; + case E1000_TCTL: + e82545_tx_ctl(sc, value); + break; + case E1000_TIPG: + sc->esc_TIPG = value; + break; + case E1000_AIT: + sc->esc_AIT = value; + break; + case E1000_TDBAL(0): + sc->esc_TDBAL = value & ~0xF; + if (sc->esc_tx_enabled) { + /* Apparently legal */ + e82545_tx_update_tdba(sc); + } + break; + case E1000_TDBAH(0): + //assert(!sc->esc_tx_enabled); + sc->esc_TDBAH = value; + break; + case E1000_TDLEN(0): + //assert(!sc->esc_tx_enabled); + sc->esc_TDLEN = value & ~0xFFF0007F; + break; + case E1000_TDH(0): + //assert(!sc->esc_tx_enabled); + /* XXX should only ever be zero ? Range check ? */ + sc->esc_TDHr = sc->esc_TDH = value; + break; + case E1000_TDT(0): + /* XXX range check ? */ + sc->esc_TDT = value; + if (sc->esc_tx_enabled) + e82545_tx_start(sc); + break; + case E1000_TIDV: + sc->esc_TIDV = value & ~0xFFFF0000; + break; + case E1000_TXDCTL(0): + //assert(!sc->esc_tx_enabled); + sc->esc_TXDCTL = value & ~0xC0C0C0; + break; + case E1000_TADV: + sc->esc_TADV = value & ~0xFFFF0000; + break; + case E1000_RAL(0) ... E1000_RAH(15): + /* convert to u32 offset */ + ridx = (offset - E1000_RAL(0)) >> 2; + e82545_write_ra(sc, ridx, value); + break; + case E1000_MTA ... (E1000_MTA + (127*4)): + sc->esc_fmcast[(offset - E1000_MTA) >> 2] = value; + break; + case E1000_VFTA ... (E1000_VFTA + (127*4)): + sc->esc_fvlan[(offset - E1000_VFTA) >> 2] = value; + break; + case E1000_EECD: + { + //DPRINTF("EECD write 0x%x -> 0x%x\r\n", sc->eeprom_control, value); + /* edge triggered low->high */ + uint32_t eecd_strobe = ((sc->eeprom_control & E1000_EECD_SK) ? + 0 : (value & E1000_EECD_SK)); + uint32_t eecd_mask = (E1000_EECD_SK|E1000_EECD_CS| + E1000_EECD_DI|E1000_EECD_REQ); + sc->eeprom_control &= ~eecd_mask; + sc->eeprom_control |= (value & eecd_mask); + /* grant/revoke immediately */ + if (value & E1000_EECD_REQ) { + sc->eeprom_control |= E1000_EECD_GNT; + } else { + sc->eeprom_control &= ~E1000_EECD_GNT; + } + if (eecd_strobe && (sc->eeprom_control & E1000_EECD_CS)) { + e82545_eecd_strobe(sc); + } + return; + } + case E1000_MDIC: + { + uint8_t reg_addr = (uint8_t)((value & E1000_MDIC_REG_MASK) >> + E1000_MDIC_REG_SHIFT); + uint8_t phy_addr = (uint8_t)((value & E1000_MDIC_PHY_MASK) >> + E1000_MDIC_PHY_SHIFT); + sc->mdi_control = + (value & ~(E1000_MDIC_ERROR|E1000_MDIC_DEST)); + if ((value & E1000_MDIC_READY) != 0) { + DPRINTF("Incorrect MDIC ready bit: 0x%x\r\n", value); + return; + } + switch (value & E82545_MDIC_OP_MASK) { + case E1000_MDIC_OP_READ: + sc->mdi_control &= ~E82545_MDIC_DATA_MASK; + sc->mdi_control |= e82545_read_mdi(sc, reg_addr, phy_addr); + break; + case E1000_MDIC_OP_WRITE: + e82545_write_mdi(sc, reg_addr, phy_addr, + value & E82545_MDIC_DATA_MASK); + break; + default: + DPRINTF("Unknown MDIC op: 0x%x\r\n", value); + return; + } + /* TODO: barrier? 
*/ + sc->mdi_control |= E1000_MDIC_READY; + if (value & E82545_MDIC_IE) { + // TODO: generate interrupt + } + return; + } + case E1000_MANC: + case E1000_STATUS: + return; + default: + DPRINTF("Unknown write register: 0x%x value:%x\r\n", offset, value); + return; + } +} + +static uint32_t +e82545_read_register(struct e82545_softc *sc, uint32_t offset) +{ + uint32_t retval; + int ridx; + + if (offset & 0x3) { + DPRINTF("Unaligned register read offset:0x%x\r\n", offset); + return 0; + } + + DPRINTF("Register read: 0x%x\r\n", offset); + + switch (offset) { + case E1000_CTRL: + retval = sc->esc_CTRL; + break; + case E1000_STATUS: + retval = E1000_STATUS_FD | E1000_STATUS_LU | + E1000_STATUS_SPEED_1000; + break; + case E1000_FCAL: + retval = sc->esc_FCAL; + break; + case E1000_FCAH: + retval = sc->esc_FCAH; + break; + case E1000_FCT: + retval = sc->esc_FCT; + break; + case E1000_VET: + retval = sc->esc_VET; + break; + case E1000_FCTTV: + retval = sc->esc_FCTTV; + break; + case E1000_LEDCTL: + retval = sc->esc_LEDCTL; + break; + case E1000_PBA: + retval = sc->esc_PBA; + break; + case E1000_ICR: + case E1000_ITR: + case E1000_ICS: + case E1000_IMS: + case E1000_IMC: + retval = e82545_intr_read(sc, offset); + break; + case E1000_RCTL: + retval = sc->esc_RCTL; + break; + case E1000_FCRTL: + retval = sc->esc_FCRTL; + break; + case E1000_FCRTH: + retval = sc->esc_FCRTH; + break; + case E1000_RDBAL(0): + retval = sc->esc_RDBAL; + break; + case E1000_RDBAH(0): + retval = sc->esc_RDBAH; + break; + case E1000_RDLEN(0): + retval = sc->esc_RDLEN; + break; + case E1000_RDH(0): + retval = sc->esc_RDH; + break; + case E1000_RDT(0): + retval = sc->esc_RDT; + break; + case E1000_RDTR: + retval = sc->esc_RDTR; + break; + case E1000_RXDCTL(0): + retval = sc->esc_RXDCTL; + break; + case E1000_RADV: + retval = sc->esc_RADV; + break; + case E1000_RSRPD: + retval = sc->esc_RSRPD; + break; + case E1000_RXCSUM: + retval = sc->esc_RXCSUM; + break; + case E1000_TXCW: + retval = sc->esc_TXCW; + break; + case E1000_TCTL: + retval = sc->esc_TCTL; + break; + case E1000_TIPG: + retval = sc->esc_TIPG; + break; + case E1000_AIT: + retval = sc->esc_AIT; + break; + case E1000_TDBAL(0): + retval = sc->esc_TDBAL; + break; + case E1000_TDBAH(0): + retval = sc->esc_TDBAH; + break; + case E1000_TDLEN(0): + retval = sc->esc_TDLEN; + break; + case E1000_TDH(0): + retval = sc->esc_TDH; + break; + case E1000_TDT(0): + retval = sc->esc_TDT; + break; + case E1000_TIDV: + retval = sc->esc_TIDV; + break; + case E1000_TXDCTL(0): + retval = sc->esc_TXDCTL; + break; + case E1000_TADV: + retval = sc->esc_TADV; + break; + case E1000_RAL(0) ... E1000_RAH(15): + /* convert to u32 offset */ + ridx = (offset - E1000_RAL(0)) >> 2; + retval = e82545_read_ra(sc, ridx); + break; + case E1000_MTA ... (E1000_MTA + (127*4)): + retval = sc->esc_fmcast[(offset - E1000_MTA) >> 2]; + break; + case E1000_VFTA ... (E1000_VFTA + (127*4)): + retval = sc->esc_fvlan[(offset - E1000_VFTA) >> 2]; + break; + case E1000_EECD: + //DPRINTF("EECD read %x\r\n", sc->eeprom_control); + retval = sc->eeprom_control; + break; + case E1000_MDIC: + retval = sc->mdi_control; + break; + case E1000_MANC: + retval = 0; + break; + /* stats that we emulate. 
*/ + case E1000_MPC: + retval = sc->missed_pkt_count; + break; + case E1000_PRC64: + retval = sc->pkt_rx_by_size[0]; + break; + case E1000_PRC127: + retval = sc->pkt_rx_by_size[1]; + break; + case E1000_PRC255: + retval = sc->pkt_rx_by_size[2]; + break; + case E1000_PRC511: + retval = sc->pkt_rx_by_size[3]; + break; + case E1000_PRC1023: + retval = sc->pkt_rx_by_size[4]; + break; + case E1000_PRC1522: + retval = sc->pkt_rx_by_size[5]; + break; + case E1000_GPRC: + retval = sc->good_pkt_rx_count; + break; + case E1000_BPRC: + retval = sc->bcast_pkt_rx_count; + break; + case E1000_MPRC: + retval = sc->mcast_pkt_rx_count; + break; + case E1000_GPTC: + case E1000_TPT: + retval = sc->good_pkt_tx_count; + break; + case E1000_GORCL: + retval = (uint32_t)sc->good_octets_rx; + break; + case E1000_GORCH: + retval = (uint32_t)(sc->good_octets_rx >> 32); + break; + case E1000_TOTL: + case E1000_GOTCL: + retval = (uint32_t)sc->good_octets_tx; + break; + case E1000_TOTH: + case E1000_GOTCH: + retval = (uint32_t)(sc->good_octets_tx >> 32); + break; + case E1000_ROC: + retval = sc->oversize_rx_count; + break; + case E1000_TORL: + retval = (uint32_t)(sc->good_octets_rx + sc->missed_octets); + break; + case E1000_TORH: + retval = (uint32_t)((sc->good_octets_rx + + sc->missed_octets) >> 32); + break; + case E1000_TPR: + retval = sc->good_pkt_rx_count + sc->missed_pkt_count + + sc->oversize_rx_count; + break; + case E1000_PTC64: + retval = sc->pkt_tx_by_size[0]; + break; + case E1000_PTC127: + retval = sc->pkt_tx_by_size[1]; + break; + case E1000_PTC255: + retval = sc->pkt_tx_by_size[2]; + break; + case E1000_PTC511: + retval = sc->pkt_tx_by_size[3]; + break; + case E1000_PTC1023: + retval = sc->pkt_tx_by_size[4]; + break; + case E1000_PTC1522: + retval = sc->pkt_tx_by_size[5]; + break; + case E1000_MPTC: + retval = sc->mcast_pkt_tx_count; + break; + case E1000_BPTC: + retval = sc->bcast_pkt_tx_count; + break; + case E1000_TSCTC: + retval = sc->tso_tx_count; + break; + /* stats that are always 0. 
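+ * Most of these count physical-layer or flow-control events (CRC and
+ * alignment errors, collisions, XON/XOFF) that a tap-backed link never
+ * generates; the few that could be meaningful, like the no-buffer
+ * count E1000_RNBC, are simply not tracked by this emulation.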
*/ + case E1000_CRCERRS: + case E1000_ALGNERRC: + case E1000_SYMERRS: + case E1000_RXERRC: + case E1000_SCC: + case E1000_ECOL: + case E1000_MCC: + case E1000_LATECOL: + case E1000_COLC: + case E1000_DC: + case E1000_TNCRS: + case E1000_SEC: + case E1000_CEXTERR: + case E1000_RLEC: + case E1000_XONRXC: + case E1000_XONTXC: + case E1000_XOFFRXC: + case E1000_XOFFTXC: + case E1000_FCRUC: + case E1000_RNBC: + case E1000_RUC: + case E1000_RFC: + case E1000_RJC: + case E1000_MGTPRC: + case E1000_MGTPDC: + case E1000_MGTPTC: + case E1000_TSCTFC: + retval = 0; + break; + default: + DPRINTF("Unknown read register: 0x%x\r\n", offset); + retval = 0; + break; + } + + return (retval); +} + +static void +e82545_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size, uint64_t value) +{ + struct e82545_softc *sc; + + //DPRINTF("Write bar:%d offset:0x%lx value:0x%lx size:%d\r\n", baridx, offset, value, size); + + sc = pi->pi_arg; + + pthread_mutex_lock(&sc->esc_mtx); + + switch (baridx) { + case E82545_BAR_IO: + switch (offset) { + case E82545_IOADDR: + if (size != 4) { + DPRINTF("Wrong io addr write sz:%d value:0x%lx\r\n", size, value); + } else + sc->io_addr = (uint32_t)value; + break; + case E82545_IODATA: + if (size != 4) { + DPRINTF("Wrong io data write size:%d value:0x%lx\r\n", size, value); + } else if (sc->io_addr > E82545_IO_REGISTER_MAX) { + DPRINTF("Non-register io write addr:0x%x value:0x%lx\r\n", sc->io_addr, value); + } else + e82545_write_register(sc, sc->io_addr, + (uint32_t)value); + break; + default: + DPRINTF("Unknown io bar write offset:0x%lx value:0x%lx size:%d\r\n", offset, value, size); + break; + } + break; + case E82545_BAR_REGISTER: + if (size != 4) { + DPRINTF("Wrong register write size:%d offset:0x%lx value:0x%lx\r\n", size, offset, value); + } else + e82545_write_register(sc, (uint32_t)offset, + (uint32_t)value); + break; + default: + DPRINTF("Unknown write bar:%d off:0x%lx val:0x%lx size:%d\r\n", + baridx, offset, value, size); + } + + pthread_mutex_unlock(&sc->esc_mtx); +} + +static uint64_t +e82545_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size) +{ + struct e82545_softc *sc; + uint64_t retval; + + //DPRINTF("Read bar:%d offset:0x%lx size:%d\r\n", baridx, offset, size); + sc = pi->pi_arg; + retval = 0; + + pthread_mutex_lock(&sc->esc_mtx); + + switch (baridx) { + case E82545_BAR_IO: + switch (offset) { + case E82545_IOADDR: + if (size != 4) { + DPRINTF("Wrong io addr read sz:%d\r\n", size); + } else + retval = sc->io_addr; + break; + case E82545_IODATA: + if (size != 4) { + DPRINTF("Wrong io data read sz:%d\r\n", size); + } + if (sc->io_addr > E82545_IO_REGISTER_MAX) { + DPRINTF("Non-register io read addr:0x%x\r\n", + sc->io_addr); + } else + retval = e82545_read_register(sc, sc->io_addr); + break; + default: + DPRINTF("Unknown io bar read offset:0x%lx size:%d\r\n", + offset, size); + break; + } + break; + case E82545_BAR_REGISTER: + if (size != 4) { + DPRINTF("Wrong register read size:%d offset:0x%lx\r\n", + size, offset); + } else + retval = e82545_read_register(sc, (uint32_t)offset); + break; + default: + DPRINTF("Unknown read bar:%d offset:0x%lx size:%d\r\n", + baridx, offset, size); + break; + } + + pthread_mutex_unlock(&sc->esc_mtx); + + return (retval); +} + +static void +e82545_reset(struct e82545_softc *sc, int drvr) +{ + int i; + + e82545_rx_disable(sc); + e82545_tx_disable(sc); + + /* clear outstanding interrupts */ + if (sc->esc_irq_asserted) + 
pci_lintr_deassert(sc->esc_pi); + + /* misc */ + if (!drvr) { + sc->esc_FCAL = 0; + sc->esc_FCAH = 0; + sc->esc_FCT = 0; + sc->esc_VET = 0; + sc->esc_FCTTV = 0; + } + sc->esc_LEDCTL = 0x07061302; + sc->esc_PBA = 0x00100030; + + /* start nvm in opcode mode. */ + sc->nvm_opaddr = 0; + sc->nvm_mode = E82545_NVM_MODE_OPADDR; + sc->nvm_bits = E82545_NVM_OPADDR_BITS; + sc->eeprom_control = E1000_EECD_PRES | E82545_EECD_FWE_EN; + e82545_init_eeprom(sc); + + /* interrupt */ + sc->esc_ICR = 0; + sc->esc_ITR = 250; + sc->esc_ICS = 0; + sc->esc_IMS = 0; + sc->esc_IMC = 0; + + /* L2 filters */ + if (!drvr) { + memset(sc->esc_fvlan, 0, sizeof(sc->esc_fvlan)); + memset(sc->esc_fmcast, 0, sizeof(sc->esc_fmcast)); + memset(sc->esc_uni, 0, sizeof(sc->esc_uni)); + + /* XXX not necessary on 82545 ?? */ + sc->esc_uni[0].eu_valid = 1; + memcpy(sc->esc_uni[0].eu_eth.octet, sc->esc_mac.octet, + ETHER_ADDR_LEN); + } else { + /* Clear RAH valid bits */ + for (i = 0; i < 16; i++) + sc->esc_uni[i].eu_valid = 0; + } + + /* receive */ + if (!drvr) { + sc->esc_RDBAL = 0; + sc->esc_RDBAH = 0; + } + sc->esc_RCTL = 0; + sc->esc_FCRTL = 0; + sc->esc_FCRTH = 0; + sc->esc_RDLEN = 0; + sc->esc_RDH = 0; + sc->esc_RDT = 0; + sc->esc_RDTR = 0; + sc->esc_RXDCTL = (1 << 24) | (1 << 16); /* default GRAN/WTHRESH */ + sc->esc_RADV = 0; + sc->esc_RXCSUM = 0; + + /* transmit */ + if (!drvr) { + sc->esc_TDBAL = 0; + sc->esc_TDBAH = 0; + sc->esc_TIPG = 0; + sc->esc_AIT = 0; + sc->esc_TIDV = 0; + sc->esc_TADV = 0; + } + sc->esc_tdba = 0; + sc->esc_txdesc = NULL; + sc->esc_TXCW = 0; + sc->esc_TCTL = 0; + sc->esc_TDLEN = 0; + sc->esc_TDT = 0; + sc->esc_TDHr = sc->esc_TDH = 0; + sc->esc_TXDCTL = 0; +} + +static void +e82545_open_tap(struct e82545_softc *sc, char *opts) +{ + char tbuf[80]; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif + + if (opts == NULL) { + sc->esc_tapfd = -1; + return; + } + + strcpy(tbuf, "/dev/"); + strlcat(tbuf, opts, sizeof(tbuf)); + + sc->esc_tapfd = open(tbuf, O_RDWR); + if (sc->esc_tapfd == -1) { + DPRINTF("unable to open tap device %s\n", opts); + exit(4); + } + + /* + * Set non-blocking and register for read + * notifications with the event loop + */ + int opt = 1; + if (ioctl(sc->esc_tapfd, FIONBIO, &opt) < 0) { + WPRINTF("tap device O_NONBLOCK failed: %d\n", errno); + close(sc->esc_tapfd); + sc->esc_tapfd = -1; + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); + if (caph_rights_limit(sc->esc_tapfd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + +#ifdef __FreeBSD__ + sc->esc_mevp = mevent_add(sc->esc_tapfd, + EVF_READ, + e82545_tap_callback, + sc); + if (sc->esc_mevp == NULL) { + DPRINTF("Could not register mevent %d\n", EVF_READ); + close(sc->esc_tapfd); + sc->esc_tapfd = -1; + } +#endif +} + +static int +e82545_parsemac(char *mac_str, uint8_t *mac_addr) +{ + struct ether_addr *ea; + char *tmpstr; + char zero_addr[ETHER_ADDR_LEN] = { 0, 0, 0, 0, 0, 0 }; + + tmpstr = strsep(&mac_str,"="); + if ((mac_str != NULL) && (!strcmp(tmpstr,"mac"))) { + ea = ether_aton(mac_str); + if (ea == NULL || ETHER_IS_MULTICAST(ea->octet) || + memcmp(ea->octet, zero_addr, ETHER_ADDR_LEN) == 0) { + fprintf(stderr, "Invalid MAC %s\n", mac_str); + return (1); + } else + memcpy(mac_addr, ea->octet, ETHER_ADDR_LEN); + } + return (0); +} + +static int +e82545_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + DPRINTF("Loading with options: %s\r\n", opts); + + MD5_CTX mdctx; + unsigned char digest[16]; + char nstr[80]; + struct 
e82545_softc *sc; + char *devname; + char *vtopts; + int mac_provided; + + /* Setup our softc */ + sc = calloc(1, sizeof(*sc)); + + pi->pi_arg = sc; + sc->esc_pi = pi; + sc->esc_ctx = ctx; + + pthread_mutex_init(&sc->esc_mtx, NULL); + pthread_cond_init(&sc->esc_rx_cond, NULL); + pthread_cond_init(&sc->esc_tx_cond, NULL); + pthread_create(&sc->esc_tx_tid, NULL, e82545_tx_thread, sc); + snprintf(nstr, sizeof(nstr), "e82545-%d:%d tx", pi->pi_slot, + pi->pi_func); + pthread_set_name_np(sc->esc_tx_tid, nstr); + + pci_set_cfgdata16(pi, PCIR_DEVICE, E82545_DEV_ID_82545EM_COPPER); + pci_set_cfgdata16(pi, PCIR_VENDOR, E82545_VENDOR_ID_INTEL); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK); + pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_NETWORK_ETHERNET); + pci_set_cfgdata16(pi, PCIR_SUBDEV_0, E82545_SUBDEV_ID); + pci_set_cfgdata16(pi, PCIR_SUBVEND_0, E82545_VENDOR_ID_INTEL); + + pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_NORMAL); + pci_set_cfgdata8(pi, PCIR_INTPIN, 0x1); + + /* TODO: this card also supports msi, but the freebsd driver for it + * does not, so I have not implemented it. */ + pci_lintr_request(pi); + + pci_emul_alloc_bar(pi, E82545_BAR_REGISTER, PCIBAR_MEM32, + E82545_BAR_REGISTER_LEN); + pci_emul_alloc_bar(pi, E82545_BAR_FLASH, PCIBAR_MEM32, + E82545_BAR_FLASH_LEN); + pci_emul_alloc_bar(pi, E82545_BAR_IO, PCIBAR_IO, + E82545_BAR_IO_LEN); + + /* + * Attempt to open the tap device and read the MAC address + * if specified. Copied from virtio-net, slightly modified. + */ + mac_provided = 0; + sc->esc_tapfd = -1; + if (opts != NULL) { + int err; + + devname = vtopts = strdup(opts); + (void) strsep(&vtopts, ","); + + if (vtopts != NULL) { + err = e82545_parsemac(vtopts, sc->esc_mac.octet); + if (err != 0) { + free(devname); + return (err); + } + mac_provided = 1; + } + + if (strncmp(devname, "tap", 3) == 0 || + strncmp(devname, "vmnet", 5) == 0) + e82545_open_tap(sc, devname); + + free(devname); + } + + /* + * The default MAC address is the standard NetApp OUI of 00-a0-98, + * followed by an MD5 of the PCI slot/func number and dev name + */ + if (!mac_provided) { + snprintf(nstr, sizeof(nstr), "%d-%d-%s", pi->pi_slot, + pi->pi_func, vmname); + + MD5Init(&mdctx); + MD5Update(&mdctx, nstr, strlen(nstr)); + MD5Final(digest, &mdctx); + + sc->esc_mac.octet[0] = 0x00; + sc->esc_mac.octet[1] = 0xa0; + sc->esc_mac.octet[2] = 0x98; + sc->esc_mac.octet[3] = digest[0]; + sc->esc_mac.octet[4] = digest[1]; + sc->esc_mac.octet[5] = digest[2]; + } + + /* H/w initiated reset */ + e82545_reset(sc, 0); + + return (0); +} + +struct pci_devemu pci_de_e82545 = { + .pe_emu = "e1000", + .pe_init = e82545_init, + .pe_barwrite = e82545_write, + .pe_barread = e82545_read +}; +PCI_EMUL_SET(pci_de_e82545); + diff --git a/usr/src/cmd/bhyve/pci_emul.c b/usr/src/cmd/bhyve/pci_emul.c index 3b4ca805cc..5118b31534 100644 --- a/usr/src/cmd/bhyve/pci_emul.c +++ b/usr/src/cmd/bhyve/pci_emul.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/pci_emul.c 269700 2014-08-08 03:49:01Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,16 +38,17 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2014 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. 
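 *
 * [Editorial aside, not part of the copyright text above: the e82545
 * init code in the previous file derives its default MAC from the
 * NetApp OUI plus an MD5 digest. A minimal sketch of that derivation,
 * using the same MD5 API it already calls:
 *
 *   char key[80];
 *   unsigned char d[16];
 *   MD5_CTX c;
 *   snprintf(key, sizeof (key), "%d-%d-%s", slot, func, vmname);
 *   MD5Init(&c);
 *   MD5Update(&c, key, strlen(key));
 *   MD5Final(d, &c);
 *   mac bytes: 00, a0, 98, d[0], d[1], d[2]
 *
 * so the address stays stable across restarts of the same VM config.]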
*/ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_emul.c 269700 2014-08-08 03:49:01Z neel $"); +__FBSDID("$FreeBSD$"); #include #include -#include #include +#include #include #include #include @@ -66,22 +69,11 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_emul.c 269700 2014-08-08 03:49:01Z n #include "pci_irq.h" #include "pci_lpc.h" -#define CONF1_ADDR_PORT 0x0cf8 -#define CONF1_DATA_PORT 0x0cfc +#define CONF1_ADDR_PORT 0x0cf8 +#define CONF1_DATA_PORT 0x0cfc #define CONF1_ENABLE 0x80000000ul -#define CFGWRITE(pi,off,val,b) \ -do { \ - if ((b) == 1) { \ - pci_set_cfgdata8((pi),(off),(val)); \ - } else if ((b) == 2) { \ - pci_set_cfgdata16((pi),(off),(val)); \ - } else { \ - pci_set_cfgdata32((pi),(off),(val)); \ - } \ -} while (0) - #define MAXBUSES (PCI_BUSMAX + 1) #define MAXSLOTS (PCI_SLOTMAX + 1) #define MAXFUNCS (PCI_FUNCMAX + 1) @@ -136,6 +128,30 @@ static void pci_lintr_update(struct pci_devinst *pi); static void pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func, int coff, int bytes, uint32_t *val); +static __inline void +CFGWRITE(struct pci_devinst *pi, int coff, uint32_t val, int bytes) +{ + + if (bytes == 1) + pci_set_cfgdata8(pi, coff, val); + else if (bytes == 2) + pci_set_cfgdata16(pi, coff, val); + else + pci_set_cfgdata32(pi, coff, val); +} + +static __inline uint32_t +CFGREAD(struct pci_devinst *pi, int coff, int bytes) +{ + + if (bytes == 1) + return (pci_get_cfgdata8(pi, coff)); + else if (bytes == 2) + return (pci_get_cfgdata16(pi, coff)); + else + return (pci_get_cfgdata32(pi, coff)); +} + /* * I/O access */ @@ -234,6 +250,17 @@ done: return (error); } +void +pci_print_supported_devices() +{ + struct pci_devemu **pdpp, *pdp; + + SET_FOREACH(pdpp, pci_devemu_set) { + pdp = *pdpp; + printf("%s\n", pdp->pe_emu); + } +} + static int pci_valid_pba_offset(struct pci_devinst *pi, uint64_t offset) { @@ -294,7 +321,7 @@ pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size) /* * The PCI standard only allows 4 and 8 byte accesses to the MSI-X - * table but we also allow 1 byte access to accomodate reads from + * table but we also allow 1 byte access to accommodate reads from * ddb. */ if (size != 1 && size != 4 && size != 8) @@ -465,7 +492,7 @@ modify_bar_registration(struct pci_devinst *pi, int idx, int registration) iop.handler = pci_emul_io_handler; iop.arg = pi; error = register_inout(&iop); - } else + } else error = unregister_inout(&iop); break; case PCIBAR_MEM32: @@ -533,7 +560,7 @@ memen(struct pci_devinst *pi) * the address range decoded by the BAR register. 
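 *
 * [Added note: the CFGREAD()/CFGWRITE() inline helpers introduced
 * earlier in this file centralize the width dispatch that used to be
 * open-coded at each call site, e.g.:
 *
 *   uint32_t cmd = CFGREAD(pi, PCIR_COMMAND, 2);         16-bit read
 *   CFGWRITE(pi, PCIR_COMMAND, cmd | PCIM_CMD_MEMEN, 2);
 *
 * Any size other than 1 or 2 falls through to the 32-bit accessors.]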
*/ static void -update_bar_address(struct pci_devinst *pi, uint64_t addr, int idx, int type) +update_bar_address(struct pci_devinst *pi, uint64_t addr, int idx, int type) { int decode; @@ -570,8 +597,10 @@ int pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, uint64_t hostbase, enum pcibar_type type, uint64_t size) { + uint64_t *baseptr = NULL; + uint64_t limit = 0, lobits = 0; + uint64_t addr, mask, bar; int error; - uint64_t *baseptr, limit, addr, mask, lobits, bar; assert(idx >= 0 && idx <= PCI_BARMAX); @@ -634,7 +663,11 @@ pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, uint64_t hostbase, break; default: printf("pci_emul_alloc_base: invalid bar type %d\n", type); +#ifdef FreeBSD assert(0); +#else + abort(); +#endif } if (baseptr != NULL) { @@ -656,7 +689,7 @@ pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, uint64_t hostbase, pdi->pi_bar[idx + 1].type = PCIBAR_MEMHI64; pci_set_cfgdata32(pdi, PCIR_BAR(idx + 1), bar >> 32); } - + register_bar(pdi, idx); return (0); @@ -759,8 +792,6 @@ pci_populate_msicap(struct msicap *msicap, int msgnum, int nextptr) { int mmc; - CTASSERT(sizeof(struct msicap) == 14); - /* Number of msi messages must be a power of 2 between 1 and 32 */ assert((msgnum & (msgnum - 1)) == 0 && msgnum >= 1 && msgnum <= 32); mmc = ffs(msgnum) - 1; @@ -785,7 +816,6 @@ static void pci_populate_msixcap(struct msixcap *msixcap, int msgnum, int barnum, uint32_t msix_tab_size) { - CTASSERT(sizeof(struct msixcap) == 12); assert(msix_tab_size % 4096 == 0); @@ -832,7 +862,7 @@ pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum) assert(msgnum >= 1 && msgnum <= MAX_MSIX_TABLE_ENTRIES); assert(barnum >= 0 && barnum <= PCIR_MAX_BAR_0); - + tab_size = msgnum * MSIX_TABLE_ENTRY_SIZE; /* Align table size to nearest 4K */ @@ -862,10 +892,9 @@ msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val) { uint16_t msgctrl, rwmask; - int off, table_bar; - + int off; + off = offset - capoff; - table_bar = pi->pi_msix.table_bar; /* Message Control Register */ if (off == 2 && bytes == 2) { rwmask = PCIM_MSIXCTRL_MSIX_ENABLE | PCIM_MSIXCTRL_FUNCTION_MASK; @@ -877,8 +906,8 @@ msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, pi->pi_msix.enabled = val & PCIM_MSIXCTRL_MSIX_ENABLE; pi->pi_msix.function_mask = val & PCIM_MSIXCTRL_FUNCTION_MASK; pci_lintr_update(pi); - } - + } + CFGWRITE(pi, offset, val, bytes); } @@ -937,8 +966,6 @@ pci_emul_add_pciecap(struct pci_devinst *pi, int type) int err; struct pciecap pciecap; - CTASSERT(sizeof(struct pciecap) == 60); - if (type != PCIEM_TYPE_ROOT_PORT) return (-1); @@ -1085,7 +1112,7 @@ init_pci(struct vmctx *ctx) for (bus = 0; bus < MAXBUSES; bus++) { if ((bi = pci_businfo[bus]) == NULL) continue; - /* + /* * Keep track of the i/o and memory resources allocated to * this bus. 
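 *
 * [Worked example, added for clarity: pci_populate_msicap() above
 * encodes the message count as a power-of-two exponent (MMC). For
 * msgnum = 8:
 *
 *   (8 & (8 - 1)) == 0 and 1 <= 8 <= 32, so the assert passes;
 *   mmc = ffs(8) - 1 = 4 - 1 = 3
 *
 * i.e. the MSI capability advertises 2^3 = 8 messages.]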
*/ @@ -1186,7 +1213,6 @@ init_pci(struct vmctx *ctx) return (0); } -#ifdef __FreeBSD__ static void pci_apic_prt_entry(int bus, int slot, int pin, int pirq_pin, int ioapic_irq, void *arg) @@ -1340,11 +1366,11 @@ pci_bus_write_dsdt(int bus) dsdt_line("Name (PPRT, Package ()"); dsdt_line("{"); pci_walk_lintr(bus, pci_pirq_prt_entry, NULL); - dsdt_line("})"); + dsdt_line("})"); dsdt_line("Name (APRT, Package ()"); dsdt_line("{"); pci_walk_lintr(bus, pci_apic_prt_entry, NULL); - dsdt_line("})"); + dsdt_line("})"); dsdt_line("Method (_PRT, 0, NotSerialized)"); dsdt_line("{"); dsdt_line(" If (PICM)"); @@ -1392,7 +1418,6 @@ pci_write_dsdt(void) dsdt_line("}"); dsdt_unindent(1); } -#endif int pci_bus_configured(int bus) @@ -1511,7 +1536,7 @@ pci_lintr_route(struct pci_devinst *pi) * is not yet assigned. */ if (ii->ii_ioapic_irq == 0) - ii->ii_ioapic_irq = ioapic_pci_alloc_irq(); + ii->ii_ioapic_irq = ioapic_pci_alloc_irq(pi); assert(ii->ii_ioapic_irq > 0); /* @@ -1519,7 +1544,7 @@ pci_lintr_route(struct pci_devinst *pi) * not yet assigned. */ if (ii->ii_pirq_pin == 0) - ii->ii_pirq_pin = pirq_alloc_pin(pi->pi_vmctx); + ii->ii_pirq_pin = pirq_alloc_pin(pi); assert(ii->ii_pirq_pin > 0); pi->pi_lintr.ioapic_irq = ii->ii_ioapic_irq; @@ -1667,27 +1692,31 @@ pci_emul_hdrtype_fixup(int bus, int slot, int off, int bytes, uint32_t *rv) } } -static uint32_t -bits_changed(uint32_t old, uint32_t new, uint32_t mask) -{ - - return ((old ^ new) & mask); -} - static void -pci_emul_cmdwrite(struct pci_devinst *pi, uint32_t new, int bytes) +pci_emul_cmdsts_write(struct pci_devinst *pi, int coff, uint32_t new, int bytes) { - int i; - uint16_t old; + int i, rshift; + uint32_t cmd, cmd2, changed, old, readonly; + + cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); /* stash old value */ /* - * The command register is at an offset of 4 bytes and thus the - * guest could write 1, 2 or 4 bytes starting at this offset. + * From PCI Local Bus Specification 3.0 sections 6.2.2 and 6.2.3. + * + * XXX Bits 8, 11, 12, 13, 14 and 15 in the status register are + * 'write 1 to clear'. However these bits are not set to '1' by + * any device emulation so it is simpler to treat them as readonly. */ + rshift = (coff & 0x3) * 8; + readonly = 0xFFFFF880 >> rshift; + + old = CFGREAD(pi, coff, bytes); + new &= ~readonly; + new |= (old & readonly); + CFGWRITE(pi, coff, new, bytes); /* update config */ - old = pci_get_cfgdata16(pi, PCIR_COMMAND); /* stash old value */ - CFGWRITE(pi, PCIR_COMMAND, new, bytes); /* update config */ - new = pci_get_cfgdata16(pi, PCIR_COMMAND); /* get updated value */ + cmd2 = pci_get_cfgdata16(pi, PCIR_COMMAND); /* get updated value */ + changed = cmd ^ cmd2; /* * If the MMIO or I/O address space decoding has changed then @@ -1700,7 +1729,7 @@ pci_emul_cmdwrite(struct pci_devinst *pi, uint32_t new, int bytes) break; case PCIBAR_IO: /* I/O address space decoding changed? */ - if (bits_changed(old, new, PCIM_CMD_PORTEN)) { + if (changed & PCIM_CMD_PORTEN) { if (porten(pi)) register_bar(pi, i); else @@ -1710,15 +1739,15 @@ pci_emul_cmdwrite(struct pci_devinst *pi, uint32_t new, int bytes) case PCIBAR_MEM32: case PCIBAR_MEM64: /* MMIO address space decoding changed? */ - if (bits_changed(old, new, PCIM_CMD_MEMEN)) { + if (changed & PCIM_CMD_MEMEN) { if (memen(pi)) register_bar(pi, i); else unregister_bar(pi, i); } - break; + break; default: - assert(0); + assert(0); } } @@ -1727,7 +1756,7 @@ pci_emul_cmdwrite(struct pci_devinst *pi, uint32_t new, int bytes) * interrupt. 
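 *
 * [Added worked example for pci_emul_cmdsts_write() above: the
 * read-only mask 0xFFFFF880 is defined relative to PCIR_COMMAND
 * (coff 0x04) and shifted by 8 bits per byte of offset. For a 2-byte
 * guest write landing on the status register at coff 0x06:
 *
 *   rshift   = (0x06 & 0x3) * 8 = 16
 *   readonly = 0xFFFFF880 >> 16 = 0xFFFF
 *
 * so new &= ~readonly discards every incoming bit and the status
 * register is effectively immutable, while a write at coff 0x04
 * still reaches writable command bits such as PORTEN and MEMEN.]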
*/ pci_lintr_update(pi); -} +} static void pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func, @@ -1738,7 +1767,8 @@ pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func, struct pci_devinst *pi; struct pci_devemu *pe; int idx, needcfg; - uint64_t addr, bar, mask; + uint64_t addr, mask; + uint64_t bar = 0; if ((bi = pci_businfo[bus]) != NULL) { si = &bi->slotinfo[slot]; @@ -1790,14 +1820,8 @@ pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func, needcfg = 1; } - if (needcfg) { - if (bytes == 1) - *eax = pci_get_cfgdata8(pi, coff); - else if (bytes == 2) - *eax = pci_get_cfgdata16(pi, coff); - else - *eax = pci_get_cfgdata32(pi, coff); - } + if (needcfg) + *eax = CFGREAD(pi, coff, bytes); pci_emul_hdrtype_fixup(bus, slot, coff, bytes, eax); } else { @@ -1867,8 +1891,8 @@ pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func, } else if (pci_emul_iscap(pi, coff)) { pci_emul_capwrite(pi, coff, bytes, *eax); - } else if (coff == PCIR_COMMAND) { - pci_emul_cmdwrite(pi, *eax, bytes); + } else if (coff >= PCIR_COMMAND && coff < PCIR_REVID) { + pci_emul_cmdsts_write(pi, coff, *eax, bytes); } else { CFGWRITE(pi, coff, *eax, bytes); } @@ -1940,8 +1964,8 @@ INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+3, IOPORT_F_INOUT, pci_emul_cfgdata); #define DIOSZ 8 #define DMEMSZ 4096 struct pci_emul_dsoftc { - uint8_t ioregs[DIOSZ]; - uint8_t memregs[DMEMSZ]; + uint8_t ioregs[DIOSZ]; + uint8_t memregs[2][DMEMSZ]; }; #define PCI_EMUL_MSI_MSGS 4 @@ -1970,6 +1994,9 @@ pci_emul_dinit(struct vmctx *ctx, struct pci_devinst *pi, char *opts) error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, DMEMSZ); assert(error == 0); + error = pci_emul_alloc_bar(pi, 2, PCIBAR_MEM32, DMEMSZ); + assert(error == 0); + return (0); } @@ -2009,31 +2036,33 @@ pci_emul_diow(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, } } - if (baridx == 1) { + if (baridx == 1 || baridx == 2) { if (offset + size > DMEMSZ) { printf("diow: memw too large, offset %ld size %d\n", offset, size); return; } + i = baridx - 1; /* 'memregs' index */ + if (size == 1) { - sc->memregs[offset] = value; + sc->memregs[i][offset] = value; } else if (size == 2) { - *(uint16_t *)&sc->memregs[offset] = value; + *(uint16_t *)&sc->memregs[i][offset] = value; } else if (size == 4) { - *(uint32_t *)&sc->memregs[offset] = value; + *(uint32_t *)&sc->memregs[i][offset] = value; } else if (size == 8) { - *(uint64_t *)&sc->memregs[offset] = value; + *(uint64_t *)&sc->memregs[i][offset] = value; } else { printf("diow: memw unknown size %d\n", size); } - + /* * magic interrupt ?? 
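 *
 * [Added note: with the second memory BAR, the test device backs
 * BAR 1 and BAR 2 with memregs[0] and memregs[1] respectively:
 *
 *   i = baridx - 1;   BAR 1 -> memregs[0], BAR 2 -> memregs[1]
 *
 * hence the widened bounds check (baridx > 2 || baridx < 0) below.]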
*/ } - if (baridx > 1) { + if (baridx > 2 || baridx < 0) { printf("diow: unknown bar idx %d\n", baridx); } } @@ -2044,14 +2073,17 @@ pci_emul_dior(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, { struct pci_emul_dsoftc *sc = pi->pi_arg; uint32_t value; + int i; + value = 0; if (baridx == 0) { if (offset + size > DIOSZ) { printf("dior: ior too large, offset %ld size %d\n", offset, size); return (0); } - + + value = 0; if (size == 1) { value = sc->ioregs[offset]; } else if (size == 2) { @@ -2062,29 +2094,31 @@ pci_emul_dior(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, printf("dior: ior unknown size %d\n", size); } } - - if (baridx == 1) { + + if (baridx == 1 || baridx == 2) { if (offset + size > DMEMSZ) { printf("dior: memr too large, offset %ld size %d\n", offset, size); return (0); } - + + i = baridx - 1; /* 'memregs' index */ + if (size == 1) { - value = sc->memregs[offset]; + value = sc->memregs[i][offset]; } else if (size == 2) { - value = *(uint16_t *) &sc->memregs[offset]; + value = *(uint16_t *) &sc->memregs[i][offset]; } else if (size == 4) { - value = *(uint32_t *) &sc->memregs[offset]; + value = *(uint32_t *) &sc->memregs[i][offset]; } else if (size == 8) { - value = *(uint64_t *) &sc->memregs[offset]; + value = *(uint64_t *) &sc->memregs[i][offset]; } else { printf("dior: ior unknown size %d\n", size); } } - if (baridx > 1) { + if (baridx > 2 || baridx < 0) { printf("dior: unknown bar idx %d\n", baridx); return (0); } diff --git a/usr/src/cmd/bhyve/pci_emul.h b/usr/src/cmd/bhyve/pci_emul.h index 6af01c4c3c..853badaadb 100644 --- a/usr/src/cmd/bhyve/pci_emul.h +++ b/usr/src/cmd/bhyve/pci_emul.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $FreeBSD: head/usr.sbin/bhyve/pci_emul.h 269700 2014-08-08 03:49:01Z neel $ + * $FreeBSD$ */ #ifndef _PCI_EMUL_H_ @@ -142,6 +144,8 @@ struct pci_devinst { int pba_size; int function_mask; struct msix_table_entry *table; /* allocated at runtime */ + void *pba_page; + int pba_page_offset; } pi_msix; void *pi_arg; /* devemu-private data */ @@ -158,6 +162,7 @@ struct msicap { uint32_t addrhi; uint16_t msgdata; } __packed; +static_assert(sizeof(struct msicap) == 14, "compile-time assertion failed"); struct msixcap { uint8_t capid; @@ -166,6 +171,7 @@ struct msixcap { uint32_t table_info; /* bar index and offset within it */ uint32_t pba_info; /* bar index and offset within it */ } __packed; +static_assert(sizeof(struct msixcap) == 12, "compile-time assertion failed"); struct pciecap { uint8_t capid; @@ -200,6 +206,7 @@ struct pciecap { uint16_t slot_control2; uint16_t slot_status2; } __packed; +static_assert(sizeof(struct pciecap) == 60, "compile-time assertion failed"); typedef void (*pci_lintr_cb)(int b, int s, int pin, int pirq_pin, int ioapic_irq, void *arg); @@ -225,8 +232,9 @@ int pci_msi_enabled(struct pci_devinst *pi); int pci_msix_enabled(struct pci_devinst *pi); int pci_msix_table_bar(struct pci_devinst *pi); int pci_msix_pba_bar(struct pci_devinst *pi); -int pci_msi_msgnum(struct pci_devinst *pi); +int pci_msi_maxmsgnum(struct pci_devinst *pi); int pci_parse_slot(char *opt); +void pci_print_supported_devices(); void pci_populate_msicap(struct msicap *cap, int msgs, int nextptr); int pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum); int pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size, diff --git a/usr/src/cmd/bhyve/pci_fbuf.c b/usr/src/cmd/bhyve/pci_fbuf.c new file mode 100644 index 0000000000..8d24dde9da --- /dev/null +++ b/usr/src/cmd/bhyve/pci_fbuf.c @@ -0,0 +1,467 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Nahanni Systems, Inc. + * Copyright 2018 Joyent, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
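 *
 * [Editorial aside on the pci_emul.h hunk above, not part of this
 * notice: the CTASSERTs that used to live inside functions were
 * replaced by C11 static_assert declarations placed next to the
 * layouts they guard, e.g.:
 *
 *   struct msicap { ... } __packed;
 *   static_assert(sizeof(struct msicap) == 14,
 *       "compile-time assertion failed");
 *
 * which makes a capability-size mismatch fail at the definition site.]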
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include + +#include +#include +#include + +#include +#include + +#include "bhyvegc.h" +#include "bhyverun.h" +#include "console.h" +#include "inout.h" +#include "pci_emul.h" +#include "rfb.h" +#include "vga.h" + +/* + * bhyve Framebuffer device emulation. + * BAR0 points to the current mode information. + * BAR1 is the 32-bit framebuffer address. + * + * -s ,fbuf,wait,vga=on|io|off,rfb=:port,w=width,h=height + */ + +static int fbuf_debug = 1; +#define DEBUG_INFO 1 +#define DEBUG_VERBOSE 4 +#define DPRINTF(level, params) if (level <= fbuf_debug) printf params + + +#define KB (1024UL) +#define MB (1024 * 1024UL) + +#define DMEMSZ 128 + +#define FB_SIZE (16*MB) + +#define COLS_MAX 1920 +#define ROWS_MAX 1200 + +#define COLS_DEFAULT 1024 +#define ROWS_DEFAULT 768 + +#define COLS_MIN 640 +#define ROWS_MIN 480 + +struct pci_fbuf_softc { + struct pci_devinst *fsc_pi; + struct { + uint32_t fbsize; + uint16_t width; + uint16_t height; + uint16_t depth; + uint16_t refreshrate; + uint8_t reserved[116]; + } __packed memregs; + + /* rfb server */ + char *rfb_host; + char *rfb_password; + int rfb_port; +#ifndef __FreeBSD__ + char *rfb_unix; +#endif + int rfb_wait; + int vga_enabled; + int vga_full; + + uint32_t fbaddr; + char *fb_base; + uint16_t gc_width; + uint16_t gc_height; + void *vgasc; + struct bhyvegc_image *gc_image; +}; + +static struct pci_fbuf_softc *fbuf_sc; + +#define PCI_FBUF_MSI_MSGS 4 + +static void +pci_fbuf_usage(char *opt) +{ + + fprintf(stderr, "Invalid fbuf emulation option \"%s\"\r\n", opt); + fprintf(stderr, "fbuf: {wait,}{vga=on|io|off,}rfb=:port" + "{,w=width}{,h=height}\r\n"); +} + +static void +pci_fbuf_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value) +{ + struct pci_fbuf_softc *sc; + uint8_t *p; + + assert(baridx == 0); + + sc = pi->pi_arg; + + DPRINTF(DEBUG_VERBOSE, + ("fbuf wr: offset 0x%lx, size: %d, value: 0x%lx\n", + offset, size, value)); + + if (offset + size > DMEMSZ) { + printf("fbuf: write too large, offset %ld size %d\n", + offset, size); + return; + } + + p = (uint8_t *)&sc->memregs + offset; + + switch (size) { + case 1: + *p = value; + break; + case 2: + *(uint16_t *)p = value; + break; + case 4: + *(uint32_t *)p = value; + break; + case 8: + *(uint64_t *)p = value; + break; + default: + printf("fbuf: write unknown size %d\n", size); + break; + } + + if (!sc->gc_image->vgamode && sc->memregs.width == 0 && + sc->memregs.height == 0) { + DPRINTF(DEBUG_INFO, ("switching to VGA mode\r\n")); + sc->gc_image->vgamode = 1; + sc->gc_width = 0; + sc->gc_height = 0; + } else if (sc->gc_image->vgamode && sc->memregs.width != 0 && + sc->memregs.height != 0) { + DPRINTF(DEBUG_INFO, ("switching to VESA mode\r\n")); + sc->gc_image->vgamode = 0; + } +} + +uint64_t +pci_fbuf_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size) +{ + struct pci_fbuf_softc *sc; + uint8_t *p; + uint64_t value; + + assert(baridx == 0); + + sc = pi->pi_arg; + + + if (offset + size > DMEMSZ) { + printf("fbuf: read too large, offset %ld size %d\n", + offset, size); + return (0); + } + + p = (uint8_t *)&sc->memregs + offset; + value = 0; + switch (size) { + case 1: + value = *p; + break; + case 2: + value = *(uint16_t *)p; + break; + case 4: + value = *(uint32_t *)p; + break; + case 8: + value = *(uint64_t *)p; + break; + default: + printf("fbuf: read unknown size %d\n", size); + break; + } + + 
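	/*
	 * Added layout note: BAR 0 is a small (DMEMSZ = 128 byte) register
	 * file laid directly over the packed memregs struct, so guest
	 * offsets are simply field offsets:
	 *
	 *   0x00 fbsize (u32), 0x04 width (u16), 0x06 height (u16),
	 *   0x08 depth (u16), 0x0a refreshrate (u16), rest reserved
	 *
	 * The switch above just loads the requested width from that
	 * struct.
	 */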
DPRINTF(DEBUG_VERBOSE, + ("fbuf rd: offset 0x%lx, size: %d, value: 0x%lx\n", + offset, size, value)); + + return (value); +} + +static int +pci_fbuf_parse_opts(struct pci_fbuf_softc *sc, char *opts) +{ + char *uopts, *xopts, *config; + char *tmpstr; + int ret; + + ret = 0; + uopts = strdup(opts); + for (xopts = strtok(uopts, ","); + xopts != NULL; + xopts = strtok(NULL, ",")) { + if (strcmp(xopts, "wait") == 0) { + sc->rfb_wait = 1; + continue; + } + + if ((config = strchr(xopts, '=')) == NULL) { + pci_fbuf_usage(xopts); + ret = -1; + goto done; + } + + *config++ = '\0'; + + DPRINTF(DEBUG_VERBOSE, ("pci_fbuf option %s = %s\r\n", + xopts, config)); + + if (!strcmp(xopts, "tcp") || !strcmp(xopts, "rfb")) { + /* + * IPv4 -- host-ip:port + * IPv6 -- [host-ip%zone]:port + * XXX for now port is mandatory. + */ + tmpstr = strsep(&config, "]"); + if (config) { + if (tmpstr[0] == '[') + tmpstr++; + sc->rfb_host = tmpstr; + if (config[0] == ':') + config++; + else { + pci_fbuf_usage(xopts); + ret = -1; + goto done; + } + sc->rfb_port = atoi(config); + } else { + config = tmpstr; + tmpstr = strsep(&config, ":"); + if (!config) + sc->rfb_port = atoi(tmpstr); + else { + sc->rfb_port = atoi(config); + sc->rfb_host = tmpstr; + } + } +#ifndef __FreeBSD__ + } else if (!strcmp(xopts, "unix")) { + sc->rfb_unix = config; +#endif + } else if (!strcmp(xopts, "vga")) { + if (!strcmp(config, "off")) { + sc->vga_enabled = 0; + } else if (!strcmp(config, "io")) { + sc->vga_enabled = 1; + sc->vga_full = 0; + } else if (!strcmp(config, "on")) { + sc->vga_enabled = 1; + sc->vga_full = 1; + } else { + pci_fbuf_usage(xopts); + ret = -1; + goto done; + } + } else if (!strcmp(xopts, "w")) { + sc->memregs.width = atoi(config); + if (sc->memregs.width > COLS_MAX) { + pci_fbuf_usage(xopts); + ret = -1; + goto done; + } else if (sc->memregs.width == 0) + sc->memregs.width = 1920; + } else if (!strcmp(xopts, "h")) { + sc->memregs.height = atoi(config); + if (sc->memregs.height > ROWS_MAX) { + pci_fbuf_usage(xopts); + ret = -1; + goto done; + } else if (sc->memregs.height == 0) + sc->memregs.height = 1080; + } else if (!strcmp(xopts, "password")) { + sc->rfb_password = config; + } else { + pci_fbuf_usage(xopts); + ret = -1; + goto done; + } + } + +done: + return (ret); +} + + +extern void vga_render(struct bhyvegc *gc, void *arg); + +void +pci_fbuf_render(struct bhyvegc *gc, void *arg) +{ + struct pci_fbuf_softc *sc; + + sc = arg; + + if (sc->vga_full && sc->gc_image->vgamode) { + /* TODO: mode switching to vga and vesa should use the special + * EFI-bhyve protocol port. 
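 *
 * [Added usage note for pci_fbuf_parse_opts() above; the accepted rfb
 * forms (values illustrative) are:
 *
 *   rfb=5900                 port only
 *   rfb=127.0.0.1:5900       IPv4 host and port
 *   rfb=[fe80::1%2]:5900     bracketed IPv6 (optionally zoned), then port
 *
 * and on illumos a separate unix=/path/to/socket option routes the
 * server through rfb_init_unix() instead.]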
+ */ + vga_render(gc, sc->vgasc); + return; + } + if (sc->gc_width != sc->memregs.width || + sc->gc_height != sc->memregs.height) { + bhyvegc_resize(gc, sc->memregs.width, sc->memregs.height); + sc->gc_width = sc->memregs.width; + sc->gc_height = sc->memregs.height; + } + + return; +} + +static int +pci_fbuf_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + int error, prot; + struct pci_fbuf_softc *sc; + + if (fbuf_sc != NULL) { + fprintf(stderr, "Only one frame buffer device is allowed.\n"); + return (-1); + } + + sc = calloc(1, sizeof(struct pci_fbuf_softc)); + + pi->pi_arg = sc; + + /* initialize config space */ + pci_set_cfgdata16(pi, PCIR_DEVICE, 0x40FB); + pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_DISPLAY); + pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_DISPLAY_VGA); + + error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM32, DMEMSZ); + assert(error == 0); + + error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, FB_SIZE); + assert(error == 0); + + error = pci_emul_add_msicap(pi, PCI_FBUF_MSI_MSGS); + assert(error == 0); + + sc->fbaddr = pi->pi_bar[1].addr; + sc->memregs.fbsize = FB_SIZE; + sc->memregs.width = COLS_DEFAULT; + sc->memregs.height = ROWS_DEFAULT; + sc->memregs.depth = 32; + + sc->vga_enabled = 1; + sc->vga_full = 0; + + sc->fsc_pi = pi; + + error = pci_fbuf_parse_opts(sc, opts); + if (error != 0) + goto done; + + /* XXX until VGA rendering is enabled */ + if (sc->vga_full != 0) { + fprintf(stderr, "pci_fbuf: VGA rendering not enabled"); + goto done; + } + + sc->fb_base = vm_create_devmem(ctx, VM_FRAMEBUFFER, "framebuffer", FB_SIZE); + if (sc->fb_base == MAP_FAILED) { + error = -1; + goto done; + } + DPRINTF(DEBUG_INFO, ("fbuf frame buffer base: %p [sz %lu]\r\n", + sc->fb_base, FB_SIZE)); + + /* + * Map the framebuffer into the guest address space. + * XXX This may fail if the BAR is different than a prior + * run. In this case flag the error. This will be fixed + * when a change_memseg api is available. + */ + prot = PROT_READ | PROT_WRITE; + if (vm_mmap_memseg(ctx, sc->fbaddr, VM_FRAMEBUFFER, 0, FB_SIZE, prot) != 0) { + fprintf(stderr, "pci_fbuf: mapseg failed - try deleting VM and restarting\n"); + error = -1; + goto done; + } + + console_init(sc->memregs.width, sc->memregs.height, sc->fb_base); + console_fb_register(pci_fbuf_render, sc); + + if (sc->vga_enabled) + sc->vgasc = vga_init(!sc->vga_full); + sc->gc_image = console_get_image(); + + fbuf_sc = sc; + + memset((void *)sc->fb_base, 0, FB_SIZE); + +#ifdef __FreeBSD__ + error = rfb_init(sc->rfb_host, sc->rfb_port, sc->rfb_wait, sc->rfb_password); +#else + if (sc->rfb_unix != NULL) { + error = rfb_init_unix(sc->rfb_unix, sc->rfb_wait, + sc->rfb_password); + } else { + error = rfb_init(sc->rfb_host, sc->rfb_port, sc->rfb_wait, + sc->rfb_password); + } +#endif +done: + if (error) + free(sc); + + return (error); +} + +struct pci_devemu pci_fbuf = { + .pe_emu = "fbuf", + .pe_init = pci_fbuf_init, + .pe_barwrite = pci_fbuf_write, + .pe_barread = pci_fbuf_read +}; +PCI_EMUL_SET(pci_fbuf); diff --git a/usr/src/cmd/bhyve/pci_hostbridge.c b/usr/src/cmd/bhyve/pci_hostbridge.c index 08956d082e..b926c7817e 100644 --- a/usr/src/cmd/bhyve/pci_hostbridge.c +++ b/usr/src/cmd/bhyve/pci_hostbridge.c @@ -1,5 +1,8 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. + * Copyright (c) 2018 Joyent, Inc. * All rights reserved. 
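 *
 * [Editorial aside on the pci_fbuf hunk above, not part of this
 * notice: the frame buffer is a 16 MB devmem segment created once via
 * vm_create_devmem(ctx, VM_FRAMEBUFFER, "framebuffer", FB_SIZE) and
 * then mapped read/write at the BAR 1 guest address with
 * vm_mmap_memseg(ctx, sc->fbaddr, VM_FRAMEBUFFER, 0, FB_SIZE, prot).
 * The host-side pointer sc->fb_base is handed to the console and RFB
 * code, so guest stores become directly visible to the VNC server.]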
* * Redistribution and use in source and binary forms, with or without @@ -23,14 +26,21 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/pci_hostbridge.c 283264 2015-05-21 20:11:52Z tychon $ + * $FreeBSD$ */ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_hostbridge.c 283264 2015-05-21 20:11:52Z tychon $"); +#ifndef __FreeBSD__ +#include +#include +#include +#include +#endif +__FBSDID("$FreeBSD$"); #include "pci_emul.h" +#ifdef __FreeBSD__ static int pci_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { @@ -56,6 +66,162 @@ pci_amd_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) return (0); } +#else +static void +pci_hostbridge_setup(struct pci_devinst *pi, uint16_t vendor, uint16_t device) +{ + /* config space */ + pci_set_cfgdata16(pi, PCIR_VENDOR, vendor); + pci_set_cfgdata16(pi, PCIR_DEVICE, device); + pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_NORMAL); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE); + pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_HOST); + + pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_PORT); +} + + +static int +pci_hostbridge_parse_pci_val(const char *in, uint16_t *val) +{ + long num; + char *endp = NULL; + + errno = 0; + num = strtol(in, &endp, 0); + if (errno != 0 || endp == NULL || *endp != '\0') { + fprintf(stderr, "pci_hostbridge: invalid num '%s'", in); + return (-1); + } else if (num < 1 || num > UINT16_MAX) { + fprintf(stderr, "pci_hostbridge: 0x%04lx out of range", num); + return (-1); + } + *val = num; + return (0); +} + +static struct pci_hostbridge_model { + const char *phm_model; + uint16_t phm_vendor; + uint16_t phm_device; +} pci_hb_models[] = { + { "amd", 0x1022, 0x7432 }, /* AMD/made-up */ + { "netapp", 0x1275, 0x1275 }, /* NetApp/NetApp */ + { "i440fx", 0x8086, 0x1237 }, /* Intel/82441 */ + { "q35", 0x8086, 0x29b0 }, /* Intel/Q35 HB */ +}; + +#define NUM_HB_MODELS (sizeof (pci_hb_models) / sizeof (pci_hb_models[0])) + +static int +pci_hostbridge_parse_args(char *opts, uint16_t *vendorp, uint16_t *devicep) +{ + const char *model = NULL; + char *next; + uint16_t vendor = 0, device = 0; + int err = 0; + + for (; opts != NULL && *opts != '\0'; opts = next) { + char *val, *cp; + + if ((cp = strchr(opts, ',')) != NULL) { + *cp = '\0'; + next = cp + 1; + } else { + next = NULL; + } + + if ((cp = strchr(opts, '=')) == NULL) { + fprintf(stderr, + "pci_hostbridge: expected value for param" + " (%s=VAL)", opts); + err = -1; + continue; + } + + /* = handling */ + val = cp + 1; + *cp = '\0'; + if (strcmp(opts, "model") == 0) { + model = val; + } else if (strcmp(opts, "vendor") == 0) { + if (pci_hostbridge_parse_pci_val(val, &vendor) != 0) { + err = -1; + continue; + } + } else if (strcmp(opts, "device") == 0) { + if (pci_hostbridge_parse_pci_val(val, &device) != 0) { + err = -1; + continue; + } + } else { + fprintf(stderr, + "pci_hostbridge: unrecognized option '%s'", opts); + err = -1; + continue; + } + } + if (err != 0) { + return (err); + } + + if (model != NULL && (vendor != 0 || device != 0)) { + fprintf(stderr, "pci_hostbridge: cannot specify model " + "and vendor/device"); + return (-1); + } else if ((vendor != 0 && device == 0) || + (vendor == 0 && device != 0)) { + fprintf(stderr, "pci_hostbridge: must specify both vendor and" + "device for custom hostbridge"); + return (-1); + } + if (model != NULL) { + uint_t i; + + for (i = 0; i < NUM_HB_MODELS; i++) { + if (strcmp(model, pci_hb_models[i].phm_model) != 0) + continue; + + 
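			/*
			 * Added usage note: the options parsed here are
			 * the hostbridge device's -s payload, e.g. (slot
			 * number illustrative):
			 *
			 *   -s 0,hostbridge,model=i440fx
			 *   -s 0,hostbridge,vendor=0x1275,device=0x1275
			 *
			 * model= and an explicit vendor/device pair are
			 * mutually exclusive, and vendor/device must be
			 * given together.
			 */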
/* found a model match */ + *vendorp = pci_hb_models[i].phm_vendor; + *devicep = pci_hb_models[i].phm_device; + return (0); + } + fprintf(stderr, "pci_hostbridge: invalid model '%s'", model); + return (-1); + } + + /* custom hostbridge ID was specified */ + *vendorp = vendor; + *devicep = device; + return (0); +} + +static int +pci_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + uint16_t vendor, device; + + if (opts == NULL) { + /* Fall back to NetApp default if no options are specified */ + vendor = 0x1275; + device = 0x1275; + } else if (pci_hostbridge_parse_args(opts, &vendor, &device) != 0) { + return (-1); + } + + pci_hostbridge_setup(pi, vendor, device); + return (0); +} + +static int +pci_amd_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + pci_hostbridge_setup(pi, 0x1022, 0x7432); + return (0); +} + +#endif /* __FreeBSD__ */ struct pci_devemu pci_de_amd_hostbridge = { .pe_emu = "amd_hostbridge", diff --git a/usr/src/cmd/bhyve/pci_irq.c b/usr/src/cmd/bhyve/pci_irq.c index 97ee330c65..4ecb3eddb0 100644 --- a/usr/src/cmd/bhyve/pci_irq.c +++ b/usr/src/cmd/bhyve/pci_irq.c @@ -1,5 +1,7 @@ /*- - * Copyright (c) 2014 Advanced Computing Technologies LLC + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Hudson River Trading LLC * Written by: John H. Baldwin * All rights reserved. * @@ -27,7 +29,7 @@ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_irq.c 266125 2014-05-15 14:16:55Z jhb $"); +__FBSDID("$FreeBSD$"); #include #include @@ -115,7 +117,7 @@ void pci_irq_reserve(int irq) { - assert(irq < nitems(irq_counts)); + assert(irq >= 0 && irq < nitems(irq_counts)); assert(pirq_cold); assert(irq_counts[irq] == 0 || irq_counts[irq] == IRQ_DISABLED); irq_counts[irq] = IRQ_DISABLED; @@ -125,10 +127,10 @@ void pci_irq_use(int irq) { - assert(irq < nitems(irq_counts)); + assert(irq >= 0 && irq < nitems(irq_counts)); assert(pirq_cold); - if (irq_counts[irq] != IRQ_DISABLED) - irq_counts[irq]++; + assert(irq_counts[irq] != IRQ_DISABLED); + irq_counts[irq]++; } void @@ -193,19 +195,25 @@ pci_irq_deassert(struct pci_devinst *pi) } int -pirq_alloc_pin(struct vmctx *ctx) +pirq_alloc_pin(struct pci_devinst *pi) { + struct vmctx *ctx = pi->pi_vmctx; int best_count, best_irq, best_pin, irq, pin; - pirq_cold = 1; - - /* First, find the least-used PIRQ pin. */ - best_pin = 0; - best_count = pirqs[0].use_count; - for (pin = 1; pin < nitems(pirqs); pin++) { - if (pirqs[pin].use_count < best_count) { - best_pin = pin; - best_count = pirqs[pin].use_count; + pirq_cold = 0; + + if (lpc_bootrom()) { + /* For external bootrom use fixed mapping. */ + best_pin = (4 + pi->pi_slot + pi->pi_lintr.pin) % 8; + } else { + /* Find the least-used PIRQ pin. */ + best_pin = 0; + best_count = pirqs[0].use_count; + for (pin = 1; pin < nitems(pirqs); pin++) { + if (pirqs[pin].use_count < best_count) { + best_pin = pin; + best_count = pirqs[pin].use_count; + } } } pirqs[best_pin].use_count++; @@ -222,7 +230,7 @@ pirq_alloc_pin(struct vmctx *ctx) best_count = irq_counts[irq]; } } - assert(best_irq != 0); + assert(best_irq >= 0); irq_counts[best_irq]++; pirqs[best_pin].reg = best_irq; vm_isa_set_irq_trigger(ctx, best_irq, LEVEL_TRIGGER); @@ -234,16 +242,12 @@ pirq_alloc_pin(struct vmctx *ctx) int pirq_irq(int pin) { - - if (pin == -1) - return (255); assert(pin > 0 && pin <= nitems(pirqs)); return (pirqs[pin - 1].reg & PIRQ_IRQ); } /* XXX: Generate $PIR table. 
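 *
 * [Added worked example for pirq_alloc_pin() above: with an external
 * bootrom the pin is a fixed function of slot and INTx pin,
 *
 *   best_pin = (4 + pi->pi_slot + pi->pi_lintr.pin) % 8
 *
 * so slot 3, INTA (pin 1) always maps to PIRQ pin (4 + 3 + 1) % 8 = 0
 * regardless of allocation order, letting fixed firmware tables
 * describe the routing.]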
*/ -#ifdef __FreeBSD__ static void pirq_dsdt(void) { @@ -348,4 +352,3 @@ pirq_dsdt(void) free(irq_prs); } LPC_DSDT(pirq_dsdt); -#endif diff --git a/usr/src/cmd/bhyve/pci_irq.h b/usr/src/cmd/bhyve/pci_irq.h index 483f12b61e..1ae56efc8f 100644 --- a/usr/src/cmd/bhyve/pci_irq.h +++ b/usr/src/cmd/bhyve/pci_irq.h @@ -1,5 +1,7 @@ /*- - * Copyright (c) 2014 Advanced Computing Technologies LLC + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Hudson River Trading LLC * Written by: John H. Baldwin * All rights reserved. * @@ -24,7 +26,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/pci_irq.h 266125 2014-05-15 14:16:55Z jhb $ + * $FreeBSD$ */ #ifndef __PCI_IRQ_H__ @@ -37,7 +39,7 @@ void pci_irq_deassert(struct pci_devinst *pi); void pci_irq_init(struct vmctx *ctx); void pci_irq_reserve(int irq); void pci_irq_use(int irq); -int pirq_alloc_pin(struct vmctx *ctx); +int pirq_alloc_pin(struct pci_devinst *pi); int pirq_irq(int pin); uint8_t pirq_read(int pin); void pirq_write(struct vmctx *ctx, int pin, uint8_t val); diff --git a/usr/src/cmd/bhyve/pci_lpc.c b/usr/src/cmd/bhyve/pci_lpc.c index 8c060150dc..b7ddb772a1 100644 --- a/usr/src/cmd/bhyve/pci_lpc.c +++ b/usr/src/cmd/bhyve/pci_lpc.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Neel Natu * Copyright (c) 2013 Tycho Nightingale * All rights reserved. @@ -24,11 +26,15 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/pci_lpc.c 266933 2014-05-31 23:37:34Z neel $ + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. */ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_lpc.c 266933 2014-05-31 23:37:34Z neel $"); +__FBSDID("$FreeBSD$"); #include #include @@ -40,6 +46,7 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_lpc.c 266933 2014-05-31 23:37:34Z ne #include #include "acpi.h" +#include "bootrom.h" #include "inout.h" #include "pci_emul.h" #include "pci_irq.h" @@ -62,6 +69,8 @@ SYSRES_IO(NMISC_PORT, 1); static struct pci_devinst *lpc_bridge; +static const char *romfile; + #define LPC_UART_NUM 2 static struct lpc_uart_softc { struct uart_softc *uart_softc; @@ -76,7 +85,7 @@ static const char *lpc_uart_names[LPC_UART_NUM] = { "COM1", "COM2" }; /* * LPC device configuration is in the following form: * [,] - * For e.g. "com1,stdio" + * For e.g. 
"com1,stdio" or "bootrom,/var/romfile" */ int lpc_device_parse(const char *opts) @@ -88,6 +97,11 @@ lpc_device_parse(const char *opts) str = cpy = strdup(opts); lpcdev = strsep(&str, ","); if (lpcdev != NULL) { + if (strcasecmp(lpcdev, "bootrom") == 0) { + romfile = str; + error = 0; + goto done; + } for (unit = 0; unit < LPC_UART_NUM; unit++) { if (strcasecmp(lpcdev, lpc_uart_names[unit]) == 0) { lpc_uart_softc[unit].opts = str; @@ -104,6 +118,23 @@ done: return (error); } +void +lpc_print_supported_devices() +{ + size_t i; + + printf("bootrom\n"); + for (i = 0; i < LPC_UART_NUM; i++) + printf("%s\n", lpc_uart_names[i]); +} + +const char * +lpc_bootrom(void) +{ + + return (romfile); +} + static void lpc_uart_intr_assert(void *arg) { @@ -148,6 +179,21 @@ lpc_uart_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uart_write(sc->uart_softc, offset + 1, *eax >> 8); } break; +#ifndef __FreeBSD__ + case 4: + if (in) { + *eax = uart_read(sc->uart_softc, offset); + *eax |= uart_read(sc->uart_softc, offset + 1) << 8; + *eax |= uart_read(sc->uart_softc, offset + 2) << 16; + *eax |= uart_read(sc->uart_softc, offset + 3) << 24; + } else { + uart_write(sc->uart_softc, offset, *eax); + uart_write(sc->uart_softc, offset + 1, *eax >> 8); + uart_write(sc->uart_softc, offset + 2, *eax >> 16); + uart_write(sc->uart_softc, offset + 3, *eax >> 24); + } + break; +#endif default: return (-1); } @@ -156,13 +202,19 @@ lpc_uart_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, } static int -lpc_init(void) +lpc_init(struct vmctx *ctx) { struct lpc_uart_softc *sc; struct inout_port iop; const char *name; int unit, error; + if (romfile != NULL) { + error = bootrom_init(ctx, romfile); + if (error) + return (error); + } + /* COM1 and COM2 */ for (unit = 0; unit < LPC_UART_NUM; unit++) { sc = &lpc_uart_softc[unit]; @@ -200,7 +252,6 @@ lpc_init(void) return (0); } -#ifdef __FreeBSD__ static void pci_lpc_write_dsdt(struct pci_devinst *pi) { @@ -320,7 +371,6 @@ pci_lpc_uart_dsdt(void) } } LPC_DSDT(pci_lpc_uart_dsdt); -#endif static int pci_lpc_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, @@ -381,7 +431,7 @@ pci_lpc_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) return (-1); } - if (lpc_init() != 0) + if (lpc_init(ctx) != 0) return (-1); /* initialize config space */ @@ -423,9 +473,7 @@ lpc_pirq_routed(void) struct pci_devemu pci_de_lpc = { .pe_emu = "lpc", .pe_init = pci_lpc_init, -#ifdef __FreeBSD__ .pe_write_dsdt = pci_lpc_write_dsdt, -#endif .pe_cfgwrite = pci_lpc_cfgwrite, .pe_barwrite = pci_lpc_write, .pe_barread = pci_lpc_read diff --git a/usr/src/cmd/bhyve/pci_lpc.h b/usr/src/cmd/bhyve/pci_lpc.h index 4f725b1dd3..9041f79c50 100644 --- a/usr/src/cmd/bhyve/pci_lpc.h +++ b/usr/src/cmd/bhyve/pci_lpc.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Neel Natu * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $FreeBSD: head/usr.sbin/bhyve/pci_lpc.h 266125 2014-05-15 14:16:55Z jhb $ + * $FreeBSD$ */ #ifndef _LPC_H_ @@ -66,7 +68,9 @@ struct lpc_sysres { #define SYSRES_MEM(base, length) LPC_SYSRES(LPC_SYSRES_MEM, base, length) int lpc_device_parse(const char *opt); +void lpc_print_supported_devices(); char *lpc_pirq_name(int pin); void lpc_pirq_routed(void); +const char *lpc_bootrom(void); #endif diff --git a/usr/src/cmd/bhyve/pci_nvme.c b/usr/src/cmd/bhyve/pci_nvme.c new file mode 100644 index 0000000000..a56c1d6959 --- /dev/null +++ b/usr/src/cmd/bhyve/pci_nvme.c @@ -0,0 +1,1953 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2017 Shunsuke Mie + * Copyright (c) 2018 Leon Dang + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * bhyve PCIe-NVMe device emulation. 
+ * + * options: + * -s ,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z + * + * accepted devpath: + * /dev/blockdev + * /path/to/image + * ram=size_in_MiB + * + * maxq = max number of queues + * qsz = max elements in each queue + * ioslots = max number of concurrent io requests + * sectsz = sector size (defaults to blockif sector size) + * ser = serial number (20-chars max) + * + */ + +/* TODO: + - create async event for smart and log + - intr coalesce + */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include "bhyverun.h" +#include "block_if.h" +#include "pci_emul.h" + + +static int nvme_debug = 0; +#define DPRINTF(params) if (nvme_debug) printf params +#define WPRINTF(params) printf params + +/* defaults; can be overridden */ +#define NVME_MSIX_BAR 4 + +#define NVME_IOSLOTS 8 + +/* The NVMe spec defines bits 13:4 in BAR0 as reserved */ +#define NVME_MMIO_SPACE_MIN (1 << 14) + +#define NVME_QUEUES 16 +#define NVME_MAX_QENTRIES 2048 + +#define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t)) +#define NVME_MAX_BLOCKIOVS 512 + +/* helpers */ + +/* Convert a zero-based value into a one-based value */ +#define ONE_BASED(zero) ((zero) + 1) +/* Convert a one-based value into a zero-based value */ +#define ZERO_BASED(one) ((one) - 1) + +/* Encode number of SQ's and CQ's for Set/Get Features */ +#define NVME_FEATURE_NUM_QUEUES(sc) \ + (ZERO_BASED((sc)->num_squeues) & 0xffff) | \ + (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16; + +#define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell) + +enum nvme_controller_register_offsets { + NVME_CR_CAP_LOW = 0x00, + NVME_CR_CAP_HI = 0x04, + NVME_CR_VS = 0x08, + NVME_CR_INTMS = 0x0c, + NVME_CR_INTMC = 0x10, + NVME_CR_CC = 0x14, + NVME_CR_CSTS = 0x1c, + NVME_CR_NSSR = 0x20, + NVME_CR_AQA = 0x24, + NVME_CR_ASQ_LOW = 0x28, + NVME_CR_ASQ_HI = 0x2c, + NVME_CR_ACQ_LOW = 0x30, + NVME_CR_ACQ_HI = 0x34, +}; + +enum nvme_cmd_cdw11 { + NVME_CMD_CDW11_PC = 0x0001, + NVME_CMD_CDW11_IEN = 0x0002, + NVME_CMD_CDW11_IV = 0xFFFF0000, +}; + +#define NVME_CQ_INTEN 0x01 +#define NVME_CQ_INTCOAL 0x02 + +struct nvme_completion_queue { + struct nvme_completion *qbase; + uint32_t size; + uint16_t tail; /* nvme progress */ + uint16_t head; /* guest progress */ + uint16_t intr_vec; + uint32_t intr_en; + pthread_mutex_t mtx; +}; + +struct nvme_submission_queue { + struct nvme_command *qbase; + uint32_t size; + uint16_t head; /* nvme progress */ + uint16_t tail; /* guest progress */ + uint16_t cqid; /* completion queue id */ + int busy; /* queue is being processed */ + int qpriority; +}; + +enum nvme_storage_type { + NVME_STOR_BLOCKIF = 0, + NVME_STOR_RAM = 1, +}; + +struct pci_nvme_blockstore { + enum nvme_storage_type type; + void *ctx; + uint64_t size; + uint32_t sectsz; + uint32_t sectsz_bits; +}; + +struct pci_nvme_ioreq { + struct pci_nvme_softc *sc; + struct pci_nvme_ioreq *next; + struct nvme_submission_queue *nvme_sq; + uint16_t sqid; + + /* command information */ + uint16_t opc; + uint16_t cid; + uint32_t nsid; + + uint64_t prev_gpaddr; + size_t prev_size; + + /* + * lock if all iovs consumed (big IO); + * complete transaction before continuing + */ + pthread_mutex_t mtx; + pthread_cond_t cv; + + struct blockif_req io_req; + + /* pad to fit up to 512 page descriptors from guest IO request */ + struct iovec iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX]; +}; + +struct pci_nvme_softc { + struct pci_devinst *nsc_pi; + + 
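	/*
	 * Added worked example for the 0's-based helpers above: NVMe
	 * encodes queue counts as (n - 1), so for 16 I/O submission and
	 * 16 completion queues,
	 *
	 *   ZERO_BASED(16) = 15
	 *   NVME_FEATURE_NUM_QUEUES() = (15 & 0xffff) | ((15 & 0xffff) << 16)
	 *                             = 0x000f000f
	 *
	 * which is the dword returned for Get/Set Features (Number of
	 * Queues).
	 */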
pthread_mutex_t mtx; + + struct nvme_registers regs; + + struct nvme_namespace_data nsdata; + struct nvme_controller_data ctrldata; + struct nvme_error_information_entry err_log; + struct nvme_health_information_page health_log; + struct nvme_firmware_page fw_log; + + struct pci_nvme_blockstore nvstore; + + uint16_t max_qentries; /* max entries per queue */ + uint32_t max_queues; /* max number of IO SQ's or CQ's */ + uint32_t num_cqueues; + uint32_t num_squeues; + + struct pci_nvme_ioreq *ioreqs; + struct pci_nvme_ioreq *ioreqs_free; /* free list of ioreqs */ + uint32_t pending_ios; + uint32_t ioslots; + sem_t iosemlock; + + /* + * Memory mapped Submission and Completion queues + * Each array includes both Admin and IO queues + */ + struct nvme_completion_queue *compl_queues; + struct nvme_submission_queue *submit_queues; + + /* controller features */ + uint32_t intr_coales_aggr_time; /* 0x08: uS to delay intr */ + uint32_t intr_coales_aggr_thresh; /* 0x08: compl-Q entries */ + uint32_t async_ev_config; /* 0x0B: async event config */ +}; + + +static void pci_nvme_io_partial(struct blockif_req *br, int err); + +/* Controller Configuration utils */ +#define NVME_CC_GET_EN(cc) \ + ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK) +#define NVME_CC_GET_CSS(cc) \ + ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK) +#define NVME_CC_GET_SHN(cc) \ + ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK) +#define NVME_CC_GET_IOSQES(cc) \ + ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK) +#define NVME_CC_GET_IOCQES(cc) \ + ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK) + +#define NVME_CC_WRITE_MASK \ + ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \ + (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \ + (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT)) + +#define NVME_CC_NEN_WRITE_MASK \ + ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \ + (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \ + (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT)) + +/* Controller Status utils */ +#define NVME_CSTS_GET_RDY(sts) \ + ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK) + +#define NVME_CSTS_RDY (1 << NVME_CSTS_REG_RDY_SHIFT) + +/* Completion Queue status word utils */ +#define NVME_STATUS_P (1 << NVME_STATUS_P_SHIFT) +#define NVME_STATUS_MASK \ + ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\ + (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT)) + +static __inline void +cpywithpad(char *dst, size_t dst_size, const char *src, char pad) +{ + size_t len; + + len = strnlen(src, dst_size); + memset(dst, pad, dst_size); + memcpy(dst, src, len); +} + +static __inline void +pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code) +{ + + *status &= ~NVME_STATUS_MASK; + *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT | + (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT; +} + +static __inline void +pci_nvme_status_genc(uint16_t *status, uint16_t code) +{ + + pci_nvme_status_tc(status, NVME_SCT_GENERIC, code); +} + +static __inline void +pci_nvme_toggle_phase(uint16_t *status, int prev) +{ + + if (prev) + *status &= ~NVME_STATUS_P; + else + *status |= NVME_STATUS_P; +} + +static void +pci_nvme_init_ctrldata(struct pci_nvme_softc *sc) +{ + struct nvme_controller_data *cd = &sc->ctrldata; + + cd->vid = 0xFB5D; + cd->ssvid = 0x0000; + + cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' '); + cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' '); + + /* Num of submission commands that we can handle at a time (2^rab) */ + 
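	/*
	 * Added note: NVMe identify strings are space-padded ASCII with
	 * no NUL terminator, which is exactly what cpywithpad() above
	 * implements. For the 40-byte model number:
	 *
	 *   cpywithpad((char *)cd->mn, sizeof (cd->mn), "bhyve-NVMe", ' ');
	 *   -> "bhyve-NVMe" followed by 30 space characters
	 */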
cd->rab = 4; + + /* FreeBSD OUI */ + cd->ieee[0] = 0x58; + cd->ieee[1] = 0x9c; + cd->ieee[2] = 0xfc; + + cd->mic = 0; + + cd->mdts = 9; /* max data transfer size (2^mdts * CAP.MPSMIN) */ + + cd->ver = 0x00010300; + + cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT; + cd->acl = 2; + cd->aerl = 4; + + cd->lpa = 0; /* TODO: support some simple things like SMART */ + cd->elpe = 0; /* max error log page entries */ + cd->npss = 1; /* number of power states support */ + + /* Warning Composite Temperature Threshold */ + cd->wctemp = 0x0157; + + cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) | + (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT); + cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) | + (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT); + cd->nn = 1; /* number of namespaces */ + + cd->fna = 0x03; + + cd->power_state[0].mp = 10; +} + +static void +pci_nvme_init_nsdata(struct pci_nvme_softc *sc) +{ + struct nvme_namespace_data *nd; + + nd = &sc->nsdata; + + nd->nsze = sc->nvstore.size / sc->nvstore.sectsz; + nd->ncap = nd->nsze; + nd->nuse = nd->nsze; + + /* Get LBA and backstore information from backing store */ + nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */ + /* LBA data-sz = 2^lbads */ + nd->lbaf[0] = sc->nvstore.sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT; + + nd->flbas = 0; +} + +static void +pci_nvme_init_logpages(struct pci_nvme_softc *sc) +{ + + memset(&sc->err_log, 0, sizeof(sc->err_log)); + memset(&sc->health_log, 0, sizeof(sc->health_log)); + memset(&sc->fw_log, 0, sizeof(sc->fw_log)); +} + +static void +pci_nvme_reset_locked(struct pci_nvme_softc *sc) +{ + DPRINTF(("%s\r\n", __func__)); + + sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) | + (1 << NVME_CAP_LO_REG_CQR_SHIFT) | + (60 << NVME_CAP_LO_REG_TO_SHIFT); + + sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT; + + sc->regs.vs = 0x00010300; /* NVMe v1.3 */ + + sc->regs.cc = 0; + sc->regs.csts = 0; + + sc->num_cqueues = sc->num_squeues = sc->max_queues; + if (sc->submit_queues != NULL) { + for (int i = 0; i < sc->num_squeues + 1; i++) { + /* + * The Admin Submission Queue is at index 0. + * It must not be changed at reset otherwise the + * emulation will be out of sync with the guest. 
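 *
 * [Added worked example for the CAP value composed in
 * pci_nvme_reset_locked() above: with max_qentries = 2048,
 *
 *   MQES = ZERO_BASED(2048) = 2047 = 0x7ff, CQR = 1, TO = 60
 *   cap_lo = 0x7ff | (1 << 16) | (60 << 24) = 0x3c0107ff
 *
 * advertising 2048-entry queues, contiguous queues required, and a
 * 60 x 500 ms = 30 s worst-case ready timeout.]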
+ */ + if (i != 0) { + sc->submit_queues[i].qbase = NULL; + sc->submit_queues[i].size = 0; + sc->submit_queues[i].cqid = 0; + } + sc->submit_queues[i].tail = 0; + sc->submit_queues[i].head = 0; + sc->submit_queues[i].busy = 0; + } + } else + sc->submit_queues = calloc(sc->num_squeues + 1, + sizeof(struct nvme_submission_queue)); + + if (sc->compl_queues != NULL) { + for (int i = 0; i < sc->num_cqueues + 1; i++) { + /* See Admin Submission Queue note above */ + if (i != 0) { + sc->compl_queues[i].qbase = NULL; + sc->compl_queues[i].size = 0; + } + + sc->compl_queues[i].tail = 0; + sc->compl_queues[i].head = 0; + } + } else { + sc->compl_queues = calloc(sc->num_cqueues + 1, + sizeof(struct nvme_completion_queue)); + + for (int i = 0; i < sc->num_cqueues + 1; i++) + pthread_mutex_init(&sc->compl_queues[i].mtx, NULL); + } +} + +static void +pci_nvme_reset(struct pci_nvme_softc *sc) +{ + pthread_mutex_lock(&sc->mtx); + pci_nvme_reset_locked(sc); + pthread_mutex_unlock(&sc->mtx); +} + +static void +pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc) +{ + uint16_t acqs, asqs; + + DPRINTF(("%s\r\n", __func__)); + + asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1; + sc->submit_queues[0].size = asqs; + sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq, + sizeof(struct nvme_command) * asqs); + + DPRINTF(("%s mapping Admin-SQ guest 0x%lx, host: %p\r\n", + __func__, sc->regs.asq, sc->submit_queues[0].qbase)); + + acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & + NVME_AQA_REG_ACQS_MASK) + 1; + sc->compl_queues[0].size = acqs; + sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq, + sizeof(struct nvme_completion) * acqs); + DPRINTF(("%s mapping Admin-CQ guest 0x%lx, host: %p\r\n", + __func__, sc->regs.acq, sc->compl_queues[0].qbase)); +} + +static int +nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *src, + size_t len) +{ + uint8_t *dst; + size_t bytes; + + if (len > (8 * 1024)) { + return (-1); + } + + /* Copy from the start of prp1 to the end of the physical page */ + bytes = PAGE_SIZE - (prp1 & PAGE_MASK); + bytes = MIN(bytes, len); + + dst = vm_map_gpa(ctx, prp1, bytes); + if (dst == NULL) { + return (-1); + } + + memcpy(dst, src, bytes); + + src += bytes; + + len -= bytes; + if (len == 0) { + return (0); + } + + len = MIN(len, PAGE_SIZE); + + dst = vm_map_gpa(ctx, prp2, len); + if (dst == NULL) { + return (-1); + } + + memcpy(dst, src, len); + + return (0); +} + +static int +nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + uint16_t qid = command->cdw10 & 0xffff; + + DPRINTF(("%s DELETE_IO_SQ %u\r\n", __func__, qid)); + if (qid == 0 || qid > sc->num_squeues) { + WPRINTF(("%s NOT PERMITTED queue id %u / num_squeues %u\r\n", + __func__, qid, sc->num_squeues)); + pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, + NVME_SC_INVALID_QUEUE_IDENTIFIER); + return (1); + } + + sc->submit_queues[qid].qbase = NULL; + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + return (1); +} + +static int +nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + if (command->cdw11 & NVME_CMD_CDW11_PC) { + uint16_t qid = command->cdw10 & 0xffff; + struct nvme_submission_queue *nsq; + + if ((qid == 0) || (qid > sc->num_squeues)) { + WPRINTF(("%s queue index %u > num_squeues %u\r\n", + __func__, qid, sc->num_squeues)); + pci_nvme_status_tc(&compl->status, + NVME_SCT_COMMAND_SPECIFIC, + 
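		    /*
		     * Added note on nvme_prp_memcpy() above: it handles at
		     * most two pages (PRP1 plus PRP2 as a second page,
		     * never a PRP list). Copying 6000 bytes with a
		     * page-aligned prp1, for example:
		     *
		     *   first chunk  = PAGE_SIZE - (prp1 & PAGE_MASK) = 4096
		     *   second chunk = MIN(6000 - 4096, PAGE_SIZE)    = 1904
		     *
		     * and any request over 8 KB is rejected up front.
		     */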
NVME_SC_INVALID_QUEUE_IDENTIFIER); + return (1); + } + + nsq = &sc->submit_queues[qid]; + nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); + + nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, + sizeof(struct nvme_command) * (size_t)nsq->size); + nsq->cqid = (command->cdw11 >> 16) & 0xffff; + nsq->qpriority = (command->cdw11 >> 1) & 0x03; + + DPRINTF(("%s sq %u size %u gaddr %p cqid %u\r\n", __func__, + qid, nsq->size, nsq->qbase, nsq->cqid)); + + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + + DPRINTF(("%s completed creating IOSQ qid %u\r\n", + __func__, qid)); + } else { + /* + * Guest sent non-cont submission queue request. + * This setting is unsupported by this emulation. + */ + WPRINTF(("%s unsupported non-contig (list-based) " + "create i/o submission queue\r\n", __func__)); + + pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); + } + return (1); +} + +static int +nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + uint16_t qid = command->cdw10 & 0xffff; + + DPRINTF(("%s DELETE_IO_CQ %u\r\n", __func__, qid)); + if (qid == 0 || qid > sc->num_cqueues) { + WPRINTF(("%s queue index %u / num_cqueues %u\r\n", + __func__, qid, sc->num_cqueues)); + pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, + NVME_SC_INVALID_QUEUE_IDENTIFIER); + return (1); + } + + sc->compl_queues[qid].qbase = NULL; + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + return (1); +} + +static int +nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + if (command->cdw11 & NVME_CMD_CDW11_PC) { + uint16_t qid = command->cdw10 & 0xffff; + struct nvme_completion_queue *ncq; + + if ((qid == 0) || (qid > sc->num_cqueues)) { + WPRINTF(("%s queue index %u > num_cqueues %u\r\n", + __func__, qid, sc->num_cqueues)); + pci_nvme_status_tc(&compl->status, + NVME_SCT_COMMAND_SPECIFIC, + NVME_SC_INVALID_QUEUE_IDENTIFIER); + return (1); + } + + ncq = &sc->compl_queues[qid]; + ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1; + ncq->intr_vec = (command->cdw11 >> 16) & 0xffff; + ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); + + ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, + command->prp1, + sizeof(struct nvme_command) * (size_t)ncq->size); + + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + } else { + /* + * Non-contig completion queue unsupported. 
+ */ + WPRINTF(("%s unsupported non-contig (list-based) " + "create i/o completion queue\r\n", + __func__)); + + /* 0x12 = Invalid Use of Controller Memory Buffer */ + pci_nvme_status_genc(&compl->status, 0x12); + } + + return (1); +} + +static int +nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) * 2; + uint8_t logpage = command->cdw10 & 0xFF; + + DPRINTF(("%s log page %u len %u\r\n", __func__, logpage, logsize)); + + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + + switch (logpage) { + case NVME_LOG_ERROR: + nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, + command->prp2, (uint8_t *)&sc->err_log, logsize); + break; + case NVME_LOG_HEALTH_INFORMATION: + /* TODO: present some smart info */ + nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, + command->prp2, (uint8_t *)&sc->health_log, logsize); + break; + case NVME_LOG_FIRMWARE_SLOT: + nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, + command->prp2, (uint8_t *)&sc->fw_log, logsize); + break; + default: + WPRINTF(("%s get log page %x command not supported\r\n", + __func__, logpage)); + + pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, + NVME_SC_INVALID_LOG_PAGE); + } + + return (1); +} + +static int +nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + void *dest; + + DPRINTF(("%s identify 0x%x nsid 0x%x\r\n", __func__, + command->cdw10 & 0xFF, command->nsid)); + + switch (command->cdw10 & 0xFF) { + case 0x00: /* return Identify Namespace data structure */ + nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, + command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata)); + break; + case 0x01: /* return Identify Controller data structure */ + nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, + command->prp2, (uint8_t *)&sc->ctrldata, + sizeof(sc->ctrldata)); + break; + case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */ + dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, + sizeof(uint32_t) * 1024); + ((uint32_t *)dest)[0] = 1; + ((uint32_t *)dest)[1] = 0; + break; + case 0x11: + pci_nvme_status_genc(&compl->status, + NVME_SC_INVALID_NAMESPACE_OR_FORMAT); + return (1); + case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */ + case 0x10: + case 0x12: + case 0x13: + case 0x14: + case 0x15: + default: + DPRINTF(("%s unsupported identify command requested 0x%x\r\n", + __func__, command->cdw10 & 0xFF)); + pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); + return (1); + } + + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + return (1); +} + +static int +nvme_set_feature_queues(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + uint16_t nqr; /* Number of Queues Requested */ + + nqr = command->cdw11 & 0xFFFF; + if (nqr == 0xffff) { + WPRINTF(("%s: Illegal NSQR value %#x\n", __func__, nqr)); + pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); + return (-1); + } + + sc->num_squeues = ONE_BASED(nqr); + if (sc->num_squeues > sc->max_queues) { + DPRINTF(("NSQR=%u is greater than max %u\n", sc->num_squeues, + sc->max_queues)); + sc->num_squeues = sc->max_queues; + } + + nqr = (command->cdw11 >> 16) & 0xFFFF; + if (nqr == 0xffff) { + WPRINTF(("%s: Illegal NCQR value %#x\n", __func__, nqr)); + pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); + return (-1); + } + + sc->num_cqueues = ONE_BASED(nqr); + if (sc->num_cqueues > 
sc->max_queues) { + DPRINTF(("NCQR=%u is greater than max %u\n", sc->num_cqueues, + sc->max_queues)); + sc->num_cqueues = sc->max_queues; + } + + compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); + + return (0); +} + +static int +nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + int feature = command->cdw10 & 0xFF; + uint32_t iv; + + DPRINTF(("%s feature 0x%x\r\n", __func__, feature)); + compl->cdw0 = 0; + + switch (feature) { + case NVME_FEAT_ARBITRATION: + DPRINTF((" arbitration 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_POWER_MANAGEMENT: + DPRINTF((" power management 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_LBA_RANGE_TYPE: + DPRINTF((" lba range 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_TEMPERATURE_THRESHOLD: + DPRINTF((" temperature threshold 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_ERROR_RECOVERY: + DPRINTF((" error recovery 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_VOLATILE_WRITE_CACHE: + DPRINTF((" volatile write cache 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_NUMBER_OF_QUEUES: + nvme_set_feature_queues(sc, command, compl); + break; + case NVME_FEAT_INTERRUPT_COALESCING: + DPRINTF((" interrupt coalescing 0x%x\r\n", command->cdw11)); + + /* in uS */ + sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100; + + sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF; + break; + case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: + iv = command->cdw11 & 0xFFFF; + + DPRINTF((" interrupt vector configuration 0x%x\r\n", + command->cdw11)); + + for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) { + if (sc->compl_queues[i].intr_vec == iv) { + if (command->cdw11 & (1 << 16)) + sc->compl_queues[i].intr_en |= + NVME_CQ_INTCOAL; + else + sc->compl_queues[i].intr_en &= + ~NVME_CQ_INTCOAL; + } + } + break; + case NVME_FEAT_WRITE_ATOMICITY: + DPRINTF((" write atomicity 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: + DPRINTF((" async event configuration 0x%x\r\n", + command->cdw11)); + sc->async_ev_config = command->cdw11; + break; + case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: + DPRINTF((" software progress marker 0x%x\r\n", + command->cdw11)); + break; + case 0x0C: + DPRINTF((" autonomous power state transition 0x%x\r\n", + command->cdw11)); + break; + default: + WPRINTF(("%s invalid feature\r\n", __func__)); + pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); + return (1); + } + + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + return (1); +} + +static int +nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + int feature = command->cdw10 & 0xFF; + + DPRINTF(("%s feature 0x%x\r\n", __func__, feature)); + + compl->cdw0 = 0; + + switch (feature) { + case NVME_FEAT_ARBITRATION: + DPRINTF((" arbitration\r\n")); + break; + case NVME_FEAT_POWER_MANAGEMENT: + DPRINTF((" power management\r\n")); + break; + case NVME_FEAT_LBA_RANGE_TYPE: + DPRINTF((" lba range\r\n")); + break; + case NVME_FEAT_TEMPERATURE_THRESHOLD: + DPRINTF((" temperature threshold\r\n")); + switch ((command->cdw11 >> 20) & 0x3) { + case 0: + /* Over temp threshold */ + compl->cdw0 = 0xFFFF; + break; + case 1: + /* Under temp threshold */ + compl->cdw0 = 0; + break; + default: + WPRINTF((" invalid threshold type select\r\n")); + pci_nvme_status_genc(&compl->status, + NVME_SC_INVALID_FIELD); + return (1); + } + break; + case NVME_FEAT_ERROR_RECOVERY: + DPRINTF((" error recovery\r\n")); + 
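		/*
		 * Like most feature cases in this switch, error recovery
		 * is not backed by any real controller state; the request
		 * is simply logged and completed with the compl->cdw0 = 0
		 * default set before the switch.
		 */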
break; + case NVME_FEAT_VOLATILE_WRITE_CACHE: + DPRINTF((" volatile write cache\r\n")); + break; + case NVME_FEAT_NUMBER_OF_QUEUES: + compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); + + DPRINTF((" number of queues (submit %u, completion %u)\r\n", + compl->cdw0 & 0xFFFF, + (compl->cdw0 >> 16) & 0xFFFF)); + + break; + case NVME_FEAT_INTERRUPT_COALESCING: + DPRINTF((" interrupt coalescing\r\n")); + break; + case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: + DPRINTF((" interrupt vector configuration\r\n")); + break; + case NVME_FEAT_WRITE_ATOMICITY: + DPRINTF((" write atomicity\r\n")); + break; + case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: + DPRINTF((" async event configuration\r\n")); + sc->async_ev_config = command->cdw11; + break; + case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: + DPRINTF((" software progress marker\r\n")); + break; + case 0x0C: + DPRINTF((" autonomous power state transition\r\n")); + break; + default: + WPRINTF(("%s invalid feature 0x%x\r\n", __func__, feature)); + pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); + return (1); + } + + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + return (1); +} + +static int +nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + DPRINTF(("%s submission queue %u, command ID 0x%x\r\n", __func__, + command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF)); + + /* TODO: search for the command ID and abort it */ + + compl->cdw0 = 1; + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + return (1); +} + +#ifdef __FreeBSD__ +static int +nvme_opc_async_event_req(struct pci_nvme_softc* sc, + struct nvme_command* command, struct nvme_completion* compl) +{ + DPRINTF(("%s async event request 0x%x\r\n", __func__, command->cdw11)); + + /* + * TODO: raise events when they happen based on the Set Features cmd. + * These events happen async, so only set completion successful if + * there is an event reflective of the request to get event. + */ + pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, + NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); + return (0); +} +#else +/* This is kept behind an ifdef while it's unused to appease the compiler. 
*/ +#endif /* __FreeBSD__ */ + +static void +pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) +{ + struct nvme_completion compl; + struct nvme_command *cmd; + struct nvme_submission_queue *sq; + struct nvme_completion_queue *cq; + int do_intr = 0; + uint16_t sqhead; + + DPRINTF(("%s index %u\r\n", __func__, (uint32_t)value)); + + sq = &sc->submit_queues[0]; + + sqhead = atomic_load_acq_short(&sq->head); + + if (atomic_testandset_int(&sq->busy, 1)) { + DPRINTF(("%s SQ busy, head %u, tail %u\r\n", + __func__, sqhead, sq->tail)); + return; + } + + DPRINTF(("sqhead %u, tail %u\r\n", sqhead, sq->tail)); + + while (sqhead != atomic_load_acq_short(&sq->tail)) { + cmd = &(sq->qbase)[sqhead]; + compl.status = 0; + + switch (cmd->opc) { + case NVME_OPC_DELETE_IO_SQ: + DPRINTF(("%s command DELETE_IO_SQ\r\n", __func__)); + do_intr |= nvme_opc_delete_io_sq(sc, cmd, &compl); + break; + case NVME_OPC_CREATE_IO_SQ: + DPRINTF(("%s command CREATE_IO_SQ\r\n", __func__)); + do_intr |= nvme_opc_create_io_sq(sc, cmd, &compl); + break; + case NVME_OPC_DELETE_IO_CQ: + DPRINTF(("%s command DELETE_IO_CQ\r\n", __func__)); + do_intr |= nvme_opc_delete_io_cq(sc, cmd, &compl); + break; + case NVME_OPC_CREATE_IO_CQ: + DPRINTF(("%s command CREATE_IO_CQ\r\n", __func__)); + do_intr |= nvme_opc_create_io_cq(sc, cmd, &compl); + break; + case NVME_OPC_GET_LOG_PAGE: + DPRINTF(("%s command GET_LOG_PAGE\r\n", __func__)); + do_intr |= nvme_opc_get_log_page(sc, cmd, &compl); + break; + case NVME_OPC_IDENTIFY: + DPRINTF(("%s command IDENTIFY\r\n", __func__)); + do_intr |= nvme_opc_identify(sc, cmd, &compl); + break; + case NVME_OPC_ABORT: + DPRINTF(("%s command ABORT\r\n", __func__)); + do_intr |= nvme_opc_abort(sc, cmd, &compl); + break; + case NVME_OPC_SET_FEATURES: + DPRINTF(("%s command SET_FEATURES\r\n", __func__)); + do_intr |= nvme_opc_set_features(sc, cmd, &compl); + break; + case NVME_OPC_GET_FEATURES: + DPRINTF(("%s command GET_FEATURES\r\n", __func__)); + do_intr |= nvme_opc_get_features(sc, cmd, &compl); + break; + case NVME_OPC_ASYNC_EVENT_REQUEST: + DPRINTF(("%s command ASYNC_EVENT_REQ\r\n", __func__)); + /* XXX dont care, unhandled for now + do_intr |= nvme_opc_async_event_req(sc, cmd, &compl); + */ + break; + default: + WPRINTF(("0x%x command is not implemented\r\n", + cmd->opc)); + } + + /* for now skip async event generation */ + if (cmd->opc != NVME_OPC_ASYNC_EVENT_REQUEST) { + struct nvme_completion *cp; + int phase; + + cq = &sc->compl_queues[0]; + + cp = &(cq->qbase)[cq->tail]; + cp->cdw0 = compl.cdw0; + cp->sqid = 0; + cp->sqhd = sqhead; + cp->cid = cmd->cid; + + phase = NVME_STATUS_GET_P(cp->status); + cp->status = compl.status; + pci_nvme_toggle_phase(&cp->status, phase); + + cq->tail = (cq->tail + 1) % cq->size; + } + sqhead = (sqhead + 1) % sq->size; + } + + DPRINTF(("setting sqhead %u\r\n", sqhead)); + atomic_store_short(&sq->head, sqhead); + atomic_store_int(&sq->busy, 0); + + if (do_intr) + pci_generate_msix(sc->nsc_pi, 0); + +} + +static int +pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req, + uint64_t gpaddr, size_t size, int do_write, uint64_t lba) +{ + int iovidx; + + if (req != NULL) { + /* concatenate contig block-iovs to minimize number of iovs */ + if ((req->prev_gpaddr + req->prev_size) == gpaddr) { + iovidx = req->io_req.br_iovcnt - 1; + + req->io_req.br_iov[iovidx].iov_base = + paddr_guest2host(req->sc->nsc_pi->pi_vmctx, + req->prev_gpaddr, size); + + req->prev_size += size; + req->io_req.br_resid += size; + + 
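			/*
			 * The merged iov entry now spans both the previous
			 * guest-physical range and this contiguous one; its
			 * length is refreshed to the combined size below.
			 */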
req->io_req.br_iov[iovidx].iov_len = req->prev_size; + } else { + pthread_mutex_lock(&req->mtx); + + iovidx = req->io_req.br_iovcnt; + if (iovidx == NVME_MAX_BLOCKIOVS) { + int err = 0; + + DPRINTF(("large I/O, doing partial req\r\n")); + + iovidx = 0; + req->io_req.br_iovcnt = 0; + + req->io_req.br_callback = pci_nvme_io_partial; + + if (!do_write) + err = blockif_read(sc->nvstore.ctx, + &req->io_req); + else + err = blockif_write(sc->nvstore.ctx, + &req->io_req); + + /* wait until req completes before cont */ + if (err == 0) + pthread_cond_wait(&req->cv, &req->mtx); + } + if (iovidx == 0) { + req->io_req.br_offset = lba; + req->io_req.br_resid = 0; + req->io_req.br_param = req; + } + + req->io_req.br_iov[iovidx].iov_base = + paddr_guest2host(req->sc->nsc_pi->pi_vmctx, + gpaddr, size); + + req->io_req.br_iov[iovidx].iov_len = size; + + req->prev_gpaddr = gpaddr; + req->prev_size = size; + req->io_req.br_resid += size; + + req->io_req.br_iovcnt++; + + pthread_mutex_unlock(&req->mtx); + } + } else { + /* RAM buffer: read/write directly */ + void *p = sc->nvstore.ctx; + void *gptr; + + if ((lba + size) > sc->nvstore.size) { + WPRINTF(("%s write would overflow RAM\r\n", __func__)); + return (-1); + } + + p = (void *)((uintptr_t)p + (uintptr_t)lba); + gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size); + if (do_write) + memcpy(p, gptr, size); + else + memcpy(gptr, p, size); + } + return (0); +} + +static void +pci_nvme_set_completion(struct pci_nvme_softc *sc, + struct nvme_submission_queue *sq, int sqid, uint16_t cid, + uint32_t cdw0, uint16_t status, int ignore_busy) +{ + struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid]; + struct nvme_completion *compl; + int do_intr = 0; + int phase; + + DPRINTF(("%s sqid %d cqid %u cid %u status: 0x%x 0x%x\r\n", + __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status), + NVME_STATUS_GET_SC(status))); + + pthread_mutex_lock(&cq->mtx); + + assert(cq->qbase != NULL); + + compl = &cq->qbase[cq->tail]; + + compl->sqhd = atomic_load_acq_short(&sq->head); + compl->sqid = sqid; + compl->cid = cid; + + // toggle phase + phase = NVME_STATUS_GET_P(compl->status); + compl->status = status; + pci_nvme_toggle_phase(&compl->status, phase); + + cq->tail = (cq->tail + 1) % cq->size; + + if (cq->intr_en & NVME_CQ_INTEN) + do_intr = 1; + + pthread_mutex_unlock(&cq->mtx); + + if (ignore_busy || !atomic_load_acq_int(&sq->busy)) + if (do_intr) + pci_generate_msix(sc->nsc_pi, cq->intr_vec); +} + +static void +pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req) +{ + req->sc = NULL; + req->nvme_sq = NULL; + req->sqid = 0; + + pthread_mutex_lock(&sc->mtx); + + req->next = sc->ioreqs_free; + sc->ioreqs_free = req; + sc->pending_ios--; + + /* when no more IO pending, can set to ready if device reset/enabled */ + if (sc->pending_ios == 0 && + NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts))) + sc->regs.csts |= NVME_CSTS_RDY; + + pthread_mutex_unlock(&sc->mtx); + + sem_post(&sc->iosemlock); +} + +static struct pci_nvme_ioreq * +pci_nvme_get_ioreq(struct pci_nvme_softc *sc) +{ + struct pci_nvme_ioreq *req = NULL;; + + sem_wait(&sc->iosemlock); + pthread_mutex_lock(&sc->mtx); + + req = sc->ioreqs_free; + assert(req != NULL); + + sc->ioreqs_free = req->next; + + req->next = NULL; + req->sc = sc; + + sc->pending_ios++; + + pthread_mutex_unlock(&sc->mtx); + + req->io_req.br_iovcnt = 0; + req->io_req.br_offset = 0; + req->io_req.br_resid = 0; + req->io_req.br_param = req; + req->prev_gpaddr = 0; + req->prev_size = 0; + + 
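	/*
	 * The sem_wait() above caps in-flight requests at the configured
	 * "ioslots"; pci_nvme_release_ioreq() returns a slot to the free
	 * list and posts the semaphore when an I/O completes.
	 */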
return req; +} + +static void +pci_nvme_io_done(struct blockif_req *br, int err) +{ + struct pci_nvme_ioreq *req = br->br_param; + struct nvme_submission_queue *sq = req->nvme_sq; + uint16_t code, status = 0; + + DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err))); + + /* TODO return correct error */ + code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS; + pci_nvme_status_genc(&status, code); + + pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0); + pci_nvme_release_ioreq(req->sc, req); +} + +static void +pci_nvme_io_partial(struct blockif_req *br, int err) +{ + struct pci_nvme_ioreq *req = br->br_param; + + DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err))); + + pthread_cond_signal(&req->cv); +} + + +static void +pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx) +{ + struct nvme_submission_queue *sq; + uint16_t status = 0; + uint16_t sqhead; + int err; + + /* handle all submissions up to sq->tail index */ + sq = &sc->submit_queues[idx]; + + if (atomic_testandset_int(&sq->busy, 1)) { + DPRINTF(("%s sqid %u busy\r\n", __func__, idx)); + return; + } + + sqhead = atomic_load_acq_short(&sq->head); + + DPRINTF(("nvme_handle_io qid %u head %u tail %u cmdlist %p\r\n", + idx, sqhead, sq->tail, sq->qbase)); + + while (sqhead != atomic_load_acq_short(&sq->tail)) { + struct nvme_command *cmd; + struct pci_nvme_ioreq *req = NULL; + uint64_t lba; + uint64_t nblocks, bytes, size, cpsz; + + /* TODO: support scatter gather list handling */ + + cmd = &sq->qbase[sqhead]; + sqhead = (sqhead + 1) % sq->size; + + lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; + + if (cmd->opc == NVME_OPC_FLUSH) { + pci_nvme_status_genc(&status, NVME_SC_SUCCESS); + pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, + status, 1); + + continue; + } else if (cmd->opc == 0x08) { + /* TODO: write zeroes */ + WPRINTF(("%s write zeroes lba 0x%lx blocks %u\r\n", + __func__, lba, cmd->cdw12 & 0xFFFF)); + pci_nvme_status_genc(&status, NVME_SC_SUCCESS); + pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, + status, 1); + + continue; + } + + nblocks = (cmd->cdw12 & 0xFFFF) + 1; + + bytes = nblocks * sc->nvstore.sectsz; + + if (sc->nvstore.type == NVME_STOR_BLOCKIF) { + req = pci_nvme_get_ioreq(sc); + req->nvme_sq = sq; + req->sqid = idx; + } + + /* + * If data starts mid-page and flows into the next page, then + * increase page count + */ + + DPRINTF(("[h%u:t%u:n%u] %s starting LBA 0x%lx blocks %lu " + "(%lu-bytes)\r\n", + sqhead==0 ? sq->size-1 : sqhead-1, sq->tail, sq->size, + cmd->opc == NVME_OPC_WRITE ? 
+ "WRITE" : "READ", + lba, nblocks, bytes)); + + cmd->prp1 &= ~(0x03UL); + cmd->prp2 &= ~(0x03UL); + + DPRINTF((" prp1 0x%lx prp2 0x%lx\r\n", cmd->prp1, cmd->prp2)); + + size = bytes; + lba *= sc->nvstore.sectsz; + + cpsz = PAGE_SIZE - (cmd->prp1 % PAGE_SIZE); + + if (cpsz > bytes) + cpsz = bytes; + + if (req != NULL) { + req->io_req.br_offset = ((uint64_t)cmd->cdw11 << 32) | + cmd->cdw10; + req->opc = cmd->opc; + req->cid = cmd->cid; + req->nsid = cmd->nsid; + } + + err = pci_nvme_append_iov_req(sc, req, cmd->prp1, cpsz, + cmd->opc == NVME_OPC_WRITE, lba); + lba += cpsz; + size -= cpsz; + + if (size == 0) + goto iodone; + + if (size <= PAGE_SIZE) { + /* prp2 is second (and final) page in transfer */ + + err = pci_nvme_append_iov_req(sc, req, cmd->prp2, + size, + cmd->opc == NVME_OPC_WRITE, + lba); + } else { + uint64_t *prp_list; + int i; + + /* prp2 is pointer to a physical region page list */ + prp_list = paddr_guest2host(sc->nsc_pi->pi_vmctx, + cmd->prp2, PAGE_SIZE); + + i = 0; + while (size != 0) { + cpsz = MIN(size, PAGE_SIZE); + + /* + * Move to linked physical region page list + * in last item. + */ + if (i == (NVME_PRP2_ITEMS-1) && + size > PAGE_SIZE) { + assert((prp_list[i] & (PAGE_SIZE-1)) == 0); + prp_list = paddr_guest2host( + sc->nsc_pi->pi_vmctx, + prp_list[i], PAGE_SIZE); + i = 0; + } + if (prp_list[i] == 0) { + WPRINTF(("PRP2[%d] = 0 !!!\r\n", i)); + err = 1; + break; + } + + err = pci_nvme_append_iov_req(sc, req, + prp_list[i], cpsz, + cmd->opc == NVME_OPC_WRITE, lba); + if (err) + break; + + lba += cpsz; + size -= cpsz; + i++; + } + } + +iodone: + if (sc->nvstore.type == NVME_STOR_RAM) { + uint16_t code, status = 0; + + code = err ? NVME_SC_LBA_OUT_OF_RANGE : + NVME_SC_SUCCESS; + pci_nvme_status_genc(&status, code); + + pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, + status, 1); + + continue; + } + + + if (err) + goto do_error; + + req->io_req.br_callback = pci_nvme_io_done; + + err = 0; + switch (cmd->opc) { + case NVME_OPC_READ: + err = blockif_read(sc->nvstore.ctx, &req->io_req); + break; + case NVME_OPC_WRITE: + err = blockif_write(sc->nvstore.ctx, &req->io_req); + break; + default: + WPRINTF(("%s unhandled io command 0x%x\r\n", + __func__, cmd->opc)); + err = 1; + } + +do_error: + if (err) { + uint16_t status = 0; + + pci_nvme_status_genc(&status, + NVME_SC_DATA_TRANSFER_ERROR); + + pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, + status, 1); + pci_nvme_release_ioreq(sc, req); + } + } + + atomic_store_short(&sq->head, sqhead); + atomic_store_int(&sq->busy, 0); +} + +static void +pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc, + uint64_t idx, int is_sq, uint64_t value) +{ + DPRINTF(("nvme doorbell %lu, %s, val 0x%lx\r\n", + idx, is_sq ? "SQ" : "CQ", value & 0xFFFF)); + + if (is_sq) { + atomic_store_short(&sc->submit_queues[idx].tail, + (uint16_t)value); + + if (idx == 0) { + pci_nvme_handle_admin_cmd(sc, value); + } else { + /* submission queue; handle new entries in SQ */ + if (idx > sc->num_squeues) { + WPRINTF(("%s SQ index %lu overflow from " + "guest (max %u)\r\n", + __func__, idx, sc->num_squeues)); + return; + } + pci_nvme_handle_io_cmd(sc, (uint16_t)idx); + } + } else { + if (idx > sc->num_cqueues) { + WPRINTF(("%s queue index %lu overflow from " + "guest (max %u)\r\n", + __func__, idx, sc->num_cqueues)); + return; + } + + sc->compl_queues[idx].head = (uint16_t)value; + } +} + +static void +pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) +{ + const char *s = iswrite ? 
"WRITE" : "READ"; + + switch (offset) { + case NVME_CR_CAP_LOW: + DPRINTF(("%s %s NVME_CR_CAP_LOW\r\n", func, s)); + break; + case NVME_CR_CAP_HI: + DPRINTF(("%s %s NVME_CR_CAP_HI\r\n", func, s)); + break; + case NVME_CR_VS: + DPRINTF(("%s %s NVME_CR_VS\r\n", func, s)); + break; + case NVME_CR_INTMS: + DPRINTF(("%s %s NVME_CR_INTMS\r\n", func, s)); + break; + case NVME_CR_INTMC: + DPRINTF(("%s %s NVME_CR_INTMC\r\n", func, s)); + break; + case NVME_CR_CC: + DPRINTF(("%s %s NVME_CR_CC\r\n", func, s)); + break; + case NVME_CR_CSTS: + DPRINTF(("%s %s NVME_CR_CSTS\r\n", func, s)); + break; + case NVME_CR_NSSR: + DPRINTF(("%s %s NVME_CR_NSSR\r\n", func, s)); + break; + case NVME_CR_AQA: + DPRINTF(("%s %s NVME_CR_AQA\r\n", func, s)); + break; + case NVME_CR_ASQ_LOW: + DPRINTF(("%s %s NVME_CR_ASQ_LOW\r\n", func, s)); + break; + case NVME_CR_ASQ_HI: + DPRINTF(("%s %s NVME_CR_ASQ_HI\r\n", func, s)); + break; + case NVME_CR_ACQ_LOW: + DPRINTF(("%s %s NVME_CR_ACQ_LOW\r\n", func, s)); + break; + case NVME_CR_ACQ_HI: + DPRINTF(("%s %s NVME_CR_ACQ_HI\r\n", func, s)); + break; + default: + DPRINTF(("unknown nvme bar-0 offset 0x%lx\r\n", offset)); + } + +} + +static void +pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc, + uint64_t offset, int size, uint64_t value) +{ + uint32_t ccreg; + + if (offset >= NVME_DOORBELL_OFFSET) { + uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; + uint64_t idx = belloffset / 8; /* door bell size = 2*int */ + int is_sq = (belloffset % 8) < 4; + + if (belloffset > ((sc->max_queues+1) * 8 - 4)) { + WPRINTF(("guest attempted an overflow write offset " + "0x%lx, val 0x%lx in %s", + offset, value, __func__)); + return; + } + + pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value); + return; + } + + DPRINTF(("nvme-write offset 0x%lx, size %d, value 0x%lx\r\n", + offset, size, value)); + + if (size != 4) { + WPRINTF(("guest wrote invalid size %d (offset 0x%lx, " + "val 0x%lx) to bar0 in %s", + size, offset, value, __func__)); + /* TODO: shutdown device */ + return; + } + + pci_nvme_bar0_reg_dumps(__func__, offset, 1); + + pthread_mutex_lock(&sc->mtx); + + switch (offset) { + case NVME_CR_CAP_LOW: + case NVME_CR_CAP_HI: + /* readonly */ + break; + case NVME_CR_VS: + /* readonly */ + break; + case NVME_CR_INTMS: + /* MSI-X, so ignore */ + break; + case NVME_CR_INTMC: + /* MSI-X, so ignore */ + break; + case NVME_CR_CC: + ccreg = (uint32_t)value; + + DPRINTF(("%s NVME_CR_CC en %x css %x shn %x iosqes %u " + "iocqes %u\r\n", + __func__, + NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg), + NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg), + NVME_CC_GET_IOCQES(ccreg))); + + if (NVME_CC_GET_SHN(ccreg)) { + /* perform shutdown - flush out data to backend */ + sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK << + NVME_CSTS_REG_SHST_SHIFT); + sc->regs.csts |= NVME_SHST_COMPLETE << + NVME_CSTS_REG_SHST_SHIFT; + } + if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) { + if (NVME_CC_GET_EN(ccreg) == 0) + /* transition 1-> causes controller reset */ + pci_nvme_reset_locked(sc); + else + pci_nvme_init_controller(ctx, sc); + } + + /* Insert the iocqes, iosqes and en bits from the write */ + sc->regs.cc &= ~NVME_CC_WRITE_MASK; + sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK; + if (NVME_CC_GET_EN(ccreg) == 0) { + /* Insert the ams, mps and css bit fields */ + sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK; + sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK; + sc->regs.csts &= ~NVME_CSTS_RDY; + } else if (sc->pending_ios == 0) { + sc->regs.csts |= NVME_CSTS_RDY; + } + break; + case NVME_CR_CSTS: 
+ break; + case NVME_CR_NSSR: + /* ignore writes; don't support subsystem reset */ + break; + case NVME_CR_AQA: + sc->regs.aqa = (uint32_t)value; + break; + case NVME_CR_ASQ_LOW: + sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) | + (0xFFFFF000 & value); + break; + case NVME_CR_ASQ_HI: + sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) | + (value << 32); + break; + case NVME_CR_ACQ_LOW: + sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) | + (0xFFFFF000 & value); + break; + case NVME_CR_ACQ_HI: + sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) | + (value << 32); + break; + default: + DPRINTF(("%s unknown offset 0x%lx, value 0x%lx size %d\r\n", + __func__, offset, value, size)); + } + pthread_mutex_unlock(&sc->mtx); +} + +static void +pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value) +{ + struct pci_nvme_softc* sc = pi->pi_arg; + + if (baridx == pci_msix_table_bar(pi) || + baridx == pci_msix_pba_bar(pi)) { + DPRINTF(("nvme-write baridx %d, msix: off 0x%lx, size %d, " + " value 0x%lx\r\n", baridx, offset, size, value)); + + pci_emul_msix_twrite(pi, offset, size, value); + return; + } + + switch (baridx) { + case 0: + pci_nvme_write_bar_0(ctx, sc, offset, size, value); + break; + + default: + DPRINTF(("%s unknown baridx %d, val 0x%lx\r\n", + __func__, baridx, value)); + } +} + +static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, + uint64_t offset, int size) +{ + uint64_t value; + + pci_nvme_bar0_reg_dumps(__func__, offset, 0); + + if (offset < NVME_DOORBELL_OFFSET) { + void *p = &(sc->regs); + pthread_mutex_lock(&sc->mtx); + memcpy(&value, (void *)((uintptr_t)p + offset), size); + pthread_mutex_unlock(&sc->mtx); + } else { + value = 0; + WPRINTF(("pci_nvme: read invalid offset %ld\r\n", offset)); + } + + switch (size) { + case 1: + value &= 0xFF; + break; + case 2: + value &= 0xFFFF; + break; + case 4: + value &= 0xFFFFFFFF; + break; + } + + DPRINTF((" nvme-read offset 0x%lx, size %d -> value 0x%x\r\n", + offset, size, (uint32_t)value)); + + return (value); +} + + + +static uint64_t +pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size) +{ + struct pci_nvme_softc* sc = pi->pi_arg; + + if (baridx == pci_msix_table_bar(pi) || + baridx == pci_msix_pba_bar(pi)) { + DPRINTF(("nvme-read bar: %d, msix: regoff 0x%lx, size %d\r\n", + baridx, offset, size)); + + return pci_emul_msix_tread(pi, offset, size); + } + + switch (baridx) { + case 0: + return pci_nvme_read_bar_0(sc, offset, size); + + default: + DPRINTF(("unknown bar %d, 0x%lx\r\n", baridx, offset)); + } + + return (0); +} + + +static int +pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts) +{ + char bident[sizeof("XX:X:X")]; + char *uopt, *xopts, *config; + uint32_t sectsz; + int optidx; + + sc->max_queues = NVME_QUEUES; + sc->max_qentries = NVME_MAX_QENTRIES; + sc->ioslots = NVME_IOSLOTS; + sc->num_squeues = sc->max_queues; + sc->num_cqueues = sc->max_queues; + sectsz = 0; + + uopt = strdup(opts); + optidx = 0; + snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn), + "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); + for (xopts = strtok(uopt, ","); + xopts != NULL; + xopts = strtok(NULL, ",")) { + + if ((config = strchr(xopts, '=')) != NULL) + *config++ = '\0'; + + if (!strcmp("maxq", xopts)) { + sc->max_queues = atoi(config); + } else if (!strcmp("qsz", xopts)) { + sc->max_qentries = atoi(config); + } else if (!strcmp("ioslots", xopts)) { + sc->ioslots = atoi(config); + } 
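	/*
	 * The remaining options are handled below: "sectsz" (512, 4096 or
	 * 8192 bytes), "ser" (serial number, space-padded per NVMe v1.3c)
	 * and "ram" (megabytes of zeroed host memory via calloc); a bare
	 * first token is treated as a blockif backing-store path.
	 */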
else if (!strcmp("sectsz", xopts)) { + sectsz = atoi(config); + } else if (!strcmp("ser", xopts)) { + /* + * This field indicates the Product Serial Number in + * 7-bit ASCII, unused bytes should be space characters. + * Ref: NVMe v1.3c. + */ + cpywithpad((char *)sc->ctrldata.sn, + sizeof(sc->ctrldata.sn), config, ' '); + } else if (!strcmp("ram", xopts)) { + uint64_t sz = strtoull(&xopts[4], NULL, 10); + + sc->nvstore.type = NVME_STOR_RAM; + sc->nvstore.size = sz * 1024 * 1024; + sc->nvstore.ctx = calloc(1, sc->nvstore.size); + sc->nvstore.sectsz = 4096; + sc->nvstore.sectsz_bits = 12; + if (sc->nvstore.ctx == NULL) { + perror("Unable to allocate RAM"); + free(uopt); + return (-1); + } + } else if (optidx == 0) { + snprintf(bident, sizeof(bident), "%d:%d", + sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); + sc->nvstore.ctx = blockif_open(xopts, bident); + if (sc->nvstore.ctx == NULL) { + perror("Could not open backing file"); + free(uopt); + return (-1); + } + sc->nvstore.type = NVME_STOR_BLOCKIF; + sc->nvstore.size = blockif_size(sc->nvstore.ctx); + } else { + fprintf(stderr, "Invalid option %s\n", xopts); + free(uopt); + return (-1); + } + + optidx++; + } + free(uopt); + + if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) { + fprintf(stderr, "backing store not specified\n"); + return (-1); + } + if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) + sc->nvstore.sectsz = sectsz; + else if (sc->nvstore.type != NVME_STOR_RAM) + sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); + for (sc->nvstore.sectsz_bits = 9; + (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; + sc->nvstore.sectsz_bits++); + + if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES) + sc->max_queues = NVME_QUEUES; + + if (sc->max_qentries <= 0) { + fprintf(stderr, "Invalid qsz option\n"); + return (-1); + } + if (sc->ioslots <= 0) { + fprintf(stderr, "Invalid ioslots option\n"); + return (-1); + } + + return (0); +} + +static int +pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + struct pci_nvme_softc *sc; + uint32_t pci_membar_sz; + int error; + + error = 0; + + sc = calloc(1, sizeof(struct pci_nvme_softc)); + pi->pi_arg = sc; + sc->nsc_pi = pi; + + error = pci_nvme_parse_opts(sc, opts); + if (error < 0) + goto done; + else + error = 0; + + sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); + for (int i = 0; i < sc->ioslots; i++) { + if (i < (sc->ioslots-1)) + sc->ioreqs[i].next = &sc->ioreqs[i+1]; + pthread_mutex_init(&sc->ioreqs[i].mtx, NULL); + pthread_cond_init(&sc->ioreqs[i].cv, NULL); + } + sc->ioreqs_free = sc->ioreqs; + sc->intr_coales_aggr_thresh = 1; + + pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); + pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); + pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); + pci_set_cfgdata8(pi, PCIR_PROGIF, + PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); + + /* + * Allocate size of NVMe registers + doorbell space for all queues. + * + * The specification requires a minimum memory I/O window size of 16K. + * The Windows driver will refuse to start a device with a smaller + * window. 
+ */ + pci_membar_sz = sizeof(struct nvme_registers) + + 2 * sizeof(uint32_t) * (sc->max_queues + 1); + pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN); + + DPRINTF(("nvme membar size: %u\r\n", pci_membar_sz)); + + error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); + if (error) { + WPRINTF(("%s pci alloc mem bar failed\r\n", __func__)); + goto done; + } + + error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR); + if (error) { + WPRINTF(("%s pci add msixcap failed\r\n", __func__)); + goto done; + } + + pthread_mutex_init(&sc->mtx, NULL); + sem_init(&sc->iosemlock, 0, sc->ioslots); + + pci_nvme_reset(sc); + pci_nvme_init_ctrldata(sc); + pci_nvme_init_nsdata(sc); + pci_nvme_init_logpages(sc); + + pci_lintr_request(pi); + +done: + return (error); +} + + +struct pci_devemu pci_de_nvme = { + .pe_emu = "nvme", + .pe_init = pci_nvme_init, + .pe_barwrite = pci_nvme_write, + .pe_barread = pci_nvme_read +}; +PCI_EMUL_SET(pci_de_nvme); diff --git a/usr/src/cmd/bhyve/pci_passthru.c b/usr/src/cmd/bhyve/pci_passthru.c new file mode 100644 index 0000000000..d2c69e795c --- /dev/null +++ b/usr/src/cmd/bhyve/pci_passthru.c @@ -0,0 +1,937 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include + +#include +#include + +#include + +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "pci_emul.h" +#include "mem.h" + +#ifndef _PATH_DEVPCI +#define _PATH_DEVPCI "/dev/pci" +#endif + +#ifndef _PATH_DEVIO +#define _PATH_DEVIO "/dev/io" +#endif + +#ifndef _PATH_MEM +#define _PATH_MEM "/dev/mem" +#endif + +#define LEGACY_SUPPORT 1 + +#define MSIX_TABLE_COUNT(ctrl) (((ctrl) & PCIM_MSIXCTRL_TABLE_SIZE) + 1) +#define MSIX_CAPLEN 12 + +static int pcifd = -1; +static int iofd = -1; +static int memfd = -1; + +struct passthru_softc { + struct pci_devinst *psc_pi; + struct pcibar psc_bar[PCI_BARMAX + 1]; + struct { + int capoff; + int msgctrl; + int emulated; + } psc_msi; + struct { + int capoff; + } psc_msix; + struct pcisel psc_sel; +}; + +static int +msi_caplen(int msgctrl) +{ + int len; + + len = 10; /* minimum length of msi capability */ + + if (msgctrl & PCIM_MSICTRL_64BIT) + len += 4; + +#if 0 + /* + * Ignore the 'mask' and 'pending' bits in the MSI capability. + * We'll let the guest manipulate them directly. + */ + if (msgctrl & PCIM_MSICTRL_VECTOR) + len += 10; +#endif + + return (len); +} + +static uint32_t +read_config(const struct pcisel *sel, long reg, int width) +{ + struct pci_io pi; + + bzero(&pi, sizeof(pi)); + pi.pi_sel = *sel; + pi.pi_reg = reg; + pi.pi_width = width; + + if (ioctl(pcifd, PCIOCREAD, &pi) < 0) + return (0); /* XXX */ + else + return (pi.pi_data); +} + +static void +write_config(const struct pcisel *sel, long reg, int width, uint32_t data) +{ + struct pci_io pi; + + bzero(&pi, sizeof(pi)); + pi.pi_sel = *sel; + pi.pi_reg = reg; + pi.pi_width = width; + pi.pi_data = data; + + (void)ioctl(pcifd, PCIOCWRITE, &pi); /* XXX */ +} + +#ifdef LEGACY_SUPPORT +static int +passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr) +{ + int capoff, i; + struct msicap msicap; + u_char *capdata; + + pci_populate_msicap(&msicap, msgnum, nextptr); + + /* + * XXX + * Copy the msi capability structure in the last 16 bytes of the + * config space. This is wrong because it could shadow something + * useful to the device. + */ + capoff = 256 - roundup(sizeof(msicap), 4); + capdata = (u_char *)&msicap; + for (i = 0; i < sizeof(msicap); i++) + pci_set_cfgdata8(pi, capoff + i, capdata[i]); + + return (capoff); +} +#endif /* LEGACY_SUPPORT */ + +static int +cfginitmsi(struct passthru_softc *sc) +{ + int i, ptr, capptr, cap, sts, caplen, table_size; + uint32_t u32; + struct pcisel sel; + struct pci_devinst *pi; + struct msixcap msixcap; + uint32_t *msixcap_ptr; + + pi = sc->psc_pi; + sel = sc->psc_sel; + + /* + * Parse the capabilities and cache the location of the MSI + * and MSI-X capabilities. 
+ */ + sts = read_config(&sel, PCIR_STATUS, 2); + if (sts & PCIM_STATUS_CAPPRESENT) { + ptr = read_config(&sel, PCIR_CAP_PTR, 1); + while (ptr != 0 && ptr != 0xff) { + cap = read_config(&sel, ptr + PCICAP_ID, 1); + if (cap == PCIY_MSI) { + /* + * Copy the MSI capability into the config + * space of the emulated pci device + */ + sc->psc_msi.capoff = ptr; + sc->psc_msi.msgctrl = read_config(&sel, + ptr + 2, 2); + sc->psc_msi.emulated = 0; + caplen = msi_caplen(sc->psc_msi.msgctrl); + capptr = ptr; + while (caplen > 0) { + u32 = read_config(&sel, capptr, 4); + pci_set_cfgdata32(pi, capptr, u32); + caplen -= 4; + capptr += 4; + } + } else if (cap == PCIY_MSIX) { + /* + * Copy the MSI-X capability + */ + sc->psc_msix.capoff = ptr; + caplen = 12; + msixcap_ptr = (uint32_t*) &msixcap; + capptr = ptr; + while (caplen > 0) { + u32 = read_config(&sel, capptr, 4); + *msixcap_ptr = u32; + pci_set_cfgdata32(pi, capptr, u32); + caplen -= 4; + capptr += 4; + msixcap_ptr++; + } + } + ptr = read_config(&sel, ptr + PCICAP_NEXTPTR, 1); + } + } + + if (sc->psc_msix.capoff != 0) { + pi->pi_msix.pba_bar = + msixcap.pba_info & PCIM_MSIX_BIR_MASK; + pi->pi_msix.pba_offset = + msixcap.pba_info & ~PCIM_MSIX_BIR_MASK; + pi->pi_msix.table_bar = + msixcap.table_info & PCIM_MSIX_BIR_MASK; + pi->pi_msix.table_offset = + msixcap.table_info & ~PCIM_MSIX_BIR_MASK; + pi->pi_msix.table_count = MSIX_TABLE_COUNT(msixcap.msgctrl); + pi->pi_msix.pba_size = PBA_SIZE(pi->pi_msix.table_count); + + /* Allocate the emulated MSI-X table array */ + table_size = pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE; + pi->pi_msix.table = calloc(1, table_size); + + /* Mask all table entries */ + for (i = 0; i < pi->pi_msix.table_count; i++) { + pi->pi_msix.table[i].vector_control |= + PCIM_MSIX_VCTRL_MASK; + } + } + +#ifdef LEGACY_SUPPORT + /* + * If the passthrough device does not support MSI then craft a + * MSI capability for it. We link the new MSI capability at the + * head of the list of capabilities. 
+ */ + if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) { + int origptr, msiptr; + origptr = read_config(&sel, PCIR_CAP_PTR, 1); + msiptr = passthru_add_msicap(pi, 1, origptr); + sc->psc_msi.capoff = msiptr; + sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2); + sc->psc_msi.emulated = 1; + pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr); + } +#endif + + /* Make sure one of the capabilities is present */ + if (sc->psc_msi.capoff == 0 && sc->psc_msix.capoff == 0) + return (-1); + else + return (0); +} + +static uint64_t +msix_table_read(struct passthru_softc *sc, uint64_t offset, int size) +{ + struct pci_devinst *pi; + struct msix_table_entry *entry; + uint8_t *src8; + uint16_t *src16; + uint32_t *src32; + uint64_t *src64; + uint64_t data; + size_t entry_offset; + int index; + + pi = sc->psc_pi; + if (offset >= pi->pi_msix.pba_offset && + offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) { + switch(size) { + case 1: + src8 = (uint8_t *)(pi->pi_msix.pba_page + offset - + pi->pi_msix.pba_page_offset); + data = *src8; + break; + case 2: + src16 = (uint16_t *)(pi->pi_msix.pba_page + offset - + pi->pi_msix.pba_page_offset); + data = *src16; + break; + case 4: + src32 = (uint32_t *)(pi->pi_msix.pba_page + offset - + pi->pi_msix.pba_page_offset); + data = *src32; + break; + case 8: + src64 = (uint64_t *)(pi->pi_msix.pba_page + offset - + pi->pi_msix.pba_page_offset); + data = *src64; + break; + default: + return (-1); + } + return (data); + } + + if (offset < pi->pi_msix.table_offset) + return (-1); + + offset -= pi->pi_msix.table_offset; + index = offset / MSIX_TABLE_ENTRY_SIZE; + if (index >= pi->pi_msix.table_count) + return (-1); + + entry = &pi->pi_msix.table[index]; + entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; + + switch(size) { + case 1: + src8 = (uint8_t *)((void *)entry + entry_offset); + data = *src8; + break; + case 2: + src16 = (uint16_t *)((void *)entry + entry_offset); + data = *src16; + break; + case 4: + src32 = (uint32_t *)((void *)entry + entry_offset); + data = *src32; + break; + case 8: + src64 = (uint64_t *)((void *)entry + entry_offset); + data = *src64; + break; + default: + return (-1); + } + + return (data); +} + +static void +msix_table_write(struct vmctx *ctx, int vcpu, struct passthru_softc *sc, + uint64_t offset, int size, uint64_t data) +{ + struct pci_devinst *pi; + struct msix_table_entry *entry; + uint8_t *dest8; + uint16_t *dest16; + uint32_t *dest32; + uint64_t *dest64; + size_t entry_offset; + uint32_t vector_control; + int index; + + pi = sc->psc_pi; + if (offset >= pi->pi_msix.pba_offset && + offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) { + switch(size) { + case 1: + dest8 = (uint8_t *)(pi->pi_msix.pba_page + offset - + pi->pi_msix.pba_page_offset); + *dest8 = data; + break; + case 2: + dest16 = (uint16_t *)(pi->pi_msix.pba_page + offset - + pi->pi_msix.pba_page_offset); + *dest16 = data; + break; + case 4: + dest32 = (uint32_t *)(pi->pi_msix.pba_page + offset - + pi->pi_msix.pba_page_offset); + *dest32 = data; + break; + case 8: + dest64 = (uint64_t *)(pi->pi_msix.pba_page + offset - + pi->pi_msix.pba_page_offset); + *dest64 = data; + break; + default: + break; + } + return; + } + + if (offset < pi->pi_msix.table_offset) + return; + + offset -= pi->pi_msix.table_offset; + index = offset / MSIX_TABLE_ENTRY_SIZE; + if (index >= pi->pi_msix.table_count) + return; + + entry = &pi->pi_msix.table[index]; + entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; + + /* Only 4 byte naturally-aligned writes are supported */ + 
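	/*
	 * MSI-X table entries are defined to be DWORD-addressable, so
	 * anything else from the guest trips the asserts below rather
	 * than being emulated.
	 */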
assert(size == 4); + assert(entry_offset % 4 == 0); + + vector_control = entry->vector_control; + dest32 = (uint32_t *)((void *)entry + entry_offset); + *dest32 = data; + /* If MSI-X hasn't been enabled, do nothing */ + if (pi->pi_msix.enabled) { + /* If the entry is masked, don't set it up */ + if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0 || + (vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { + (void)vm_setup_pptdev_msix(ctx, vcpu, + sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, + sc->psc_sel.pc_func, index, entry->addr, + entry->msg_data, entry->vector_control); + } + } +} + +static int +init_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base) +{ + int b, s, f; + int error, idx; + size_t len, remaining; + uint32_t table_size, table_offset; + uint32_t pba_size, pba_offset; + vm_paddr_t start; + struct pci_devinst *pi = sc->psc_pi; + + assert(pci_msix_table_bar(pi) >= 0 && pci_msix_pba_bar(pi) >= 0); + + b = sc->psc_sel.pc_bus; + s = sc->psc_sel.pc_dev; + f = sc->psc_sel.pc_func; + + /* + * If the MSI-X table BAR maps memory intended for + * other uses, it is at least assured that the table + * either resides in its own page within the region, + * or it resides in a page shared with only the PBA. + */ + table_offset = rounddown2(pi->pi_msix.table_offset, 4096); + + table_size = pi->pi_msix.table_offset - table_offset; + table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE; + table_size = roundup2(table_size, 4096); + + idx = pi->pi_msix.table_bar; + start = pi->pi_bar[idx].addr; + remaining = pi->pi_bar[idx].size; + + if (pi->pi_msix.pba_bar == pi->pi_msix.table_bar) { + pba_offset = pi->pi_msix.pba_offset; + pba_size = pi->pi_msix.pba_size; + if (pba_offset >= table_offset + table_size || + table_offset >= pba_offset + pba_size) { + /* + * If the PBA does not share a page with the MSI-x + * tables, no PBA emulation is required. + */ + pi->pi_msix.pba_page = NULL; + pi->pi_msix.pba_page_offset = 0; + } else { + /* + * The PBA overlaps with either the first or last + * page of the MSI-X table region. Map the + * appropriate page. 
+ */ + if (pba_offset <= table_offset) + pi->pi_msix.pba_page_offset = table_offset; + else + pi->pi_msix.pba_page_offset = table_offset + + table_size - 4096; + pi->pi_msix.pba_page = mmap(NULL, 4096, PROT_READ | + PROT_WRITE, MAP_SHARED, memfd, start + + pi->pi_msix.pba_page_offset); + if (pi->pi_msix.pba_page == MAP_FAILED) { + warn( + "Failed to map PBA page for MSI-X on %d/%d/%d", + b, s, f); + return (-1); + } + } + } + + /* Map everything before the MSI-X table */ + if (table_offset > 0) { + len = table_offset; + error = vm_map_pptdev_mmio(ctx, b, s, f, start, len, base); + if (error) + return (error); + + base += len; + start += len; + remaining -= len; + } + + /* Skip the MSI-X table */ + base += table_size; + start += table_size; + remaining -= table_size; + + /* Map everything beyond the end of the MSI-X table */ + if (remaining > 0) { + len = remaining; + error = vm_map_pptdev_mmio(ctx, b, s, f, start, len, base); + if (error) + return (error); + } + + return (0); +} + +static int +cfginitbar(struct vmctx *ctx, struct passthru_softc *sc) +{ + int i, error; + struct pci_devinst *pi; + struct pci_bar_io bar; + enum pcibar_type bartype; + uint64_t base, size; + + pi = sc->psc_pi; + + /* + * Initialize BAR registers + */ + for (i = 0; i <= PCI_BARMAX; i++) { + bzero(&bar, sizeof(bar)); + bar.pbi_sel = sc->psc_sel; + bar.pbi_reg = PCIR_BAR(i); + + if (ioctl(pcifd, PCIOCGETBAR, &bar) < 0) + continue; + + if (PCI_BAR_IO(bar.pbi_base)) { + bartype = PCIBAR_IO; + base = bar.pbi_base & PCIM_BAR_IO_BASE; + } else { + switch (bar.pbi_base & PCIM_BAR_MEM_TYPE) { + case PCIM_BAR_MEM_64: + bartype = PCIBAR_MEM64; + break; + default: + bartype = PCIBAR_MEM32; + break; + } + base = bar.pbi_base & PCIM_BAR_MEM_BASE; + } + size = bar.pbi_length; + + if (bartype != PCIBAR_IO) { + if (((base | size) & PAGE_MASK) != 0) { + warnx("passthru device %d/%d/%d BAR %d: " + "base %#lx or size %#lx not page aligned\n", + sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, + sc->psc_sel.pc_func, i, base, size); + return (-1); + } + } + + /* Cache information about the "real" BAR */ + sc->psc_bar[i].type = bartype; + sc->psc_bar[i].size = size; + sc->psc_bar[i].addr = base; + + /* Allocate the BAR in the guest I/O or MMIO space */ + error = pci_emul_alloc_pbar(pi, i, base, bartype, size); + if (error) + return (-1); + + /* The MSI-X table needs special handling */ + if (i == pci_msix_table_bar(pi)) { + error = init_msix_table(ctx, sc, base); + if (error) + return (-1); + } else if (bartype != PCIBAR_IO) { + /* Map the physical BAR in the guest MMIO space */ + error = vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus, + sc->psc_sel.pc_dev, sc->psc_sel.pc_func, + pi->pi_bar[i].addr, pi->pi_bar[i].size, base); + if (error) + return (-1); + } + + /* + * 64-bit BAR takes up two slots so skip the next one. 
+ */ + if (bartype == PCIBAR_MEM64) { + i++; + assert(i <= PCI_BARMAX); + sc->psc_bar[i].type = PCIBAR_MEMHI64; + } + } + return (0); +} + +static int +cfginit(struct vmctx *ctx, struct pci_devinst *pi, int bus, int slot, int func) +{ + int error; + struct passthru_softc *sc; + + error = 1; + sc = pi->pi_arg; + + bzero(&sc->psc_sel, sizeof(struct pcisel)); + sc->psc_sel.pc_bus = bus; + sc->psc_sel.pc_dev = slot; + sc->psc_sel.pc_func = func; + + if (cfginitmsi(sc) != 0) { + warnx("failed to initialize MSI for PCI %d/%d/%d", + bus, slot, func); + goto done; + } + + if (cfginitbar(ctx, sc) != 0) { + warnx("failed to initialize BARs for PCI %d/%d/%d", + bus, slot, func); + goto done; + } + + error = 0; /* success */ +done: + return (error); +} + +static int +passthru_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + int bus, slot, func, error, memflags; + struct passthru_softc *sc; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; + cap_ioctl_t pci_ioctls[] = { PCIOCREAD, PCIOCWRITE, PCIOCGETBAR }; + cap_ioctl_t io_ioctls[] = { IODEV_PIO }; +#endif + + sc = NULL; + error = 1; + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_IOCTL, CAP_READ, CAP_WRITE); +#endif + + memflags = vm_get_memflags(ctx); + if (!(memflags & VM_MEM_F_WIRED)) { + warnx("passthru requires guest memory to be wired"); + goto done; + } + + if (pcifd < 0) { + pcifd = open(_PATH_DEVPCI, O_RDWR, 0); + if (pcifd < 0) { + warn("failed to open %s", _PATH_DEVPCI); + goto done; + } + } + +#ifndef WITHOUT_CAPSICUM + if (cap_rights_limit(pcifd, &rights) == -1 && errno != ENOSYS) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + if (cap_ioctls_limit(pcifd, pci_ioctls, nitems(pci_ioctls)) == -1 && errno != ENOSYS) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + if (iofd < 0) { + iofd = open(_PATH_DEVIO, O_RDWR, 0); + if (iofd < 0) { + warn("failed to open %s", _PATH_DEVIO); + goto done; + } + } + +#ifndef WITHOUT_CAPSICUM + if (cap_rights_limit(iofd, &rights) == -1 && errno != ENOSYS) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + if (cap_ioctls_limit(iofd, io_ioctls, nitems(io_ioctls)) == -1 && errno != ENOSYS) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + if (memfd < 0) { + memfd = open(_PATH_MEM, O_RDWR, 0); + if (memfd < 0) { + warn("failed to open %s", _PATH_MEM); + goto done; + } + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_clear(&rights, CAP_IOCTL); + cap_rights_set(&rights, CAP_MMAP_RW); + if (cap_rights_limit(memfd, &rights) == -1 && errno != ENOSYS) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + if (opts == NULL || + sscanf(opts, "%d/%d/%d", &bus, &slot, &func) != 3) { + warnx("invalid passthru options"); + goto done; + } + + if (vm_assign_pptdev(ctx, bus, slot, func) != 0) { + warnx("PCI device at %d/%d/%d is not using the ppt(4) driver", + bus, slot, func); + goto done; + } + + sc = calloc(1, sizeof(struct passthru_softc)); + + pi->pi_arg = sc; + sc->psc_pi = pi; + + /* initialize config space */ + if ((error = cfginit(ctx, pi, bus, slot, func)) != 0) + goto done; + + error = 0; /* success */ +done: + if (error) { + free(sc); + vm_unassign_pptdev(ctx, bus, slot, func); + } + return (error); +} + +static int +bar_access(int coff) +{ + if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) + return (1); + else + return (0); +} + +static int +msicap_access(struct passthru_softc *sc, int coff) +{ + int caplen; + + if (sc->psc_msi.capoff == 0) + return (0); + + caplen = msi_caplen(sc->psc_msi.msgctrl); + + if (coff >= 
sc->psc_msi.capoff && coff < sc->psc_msi.capoff + caplen) + return (1); + else + return (0); +} + +static int +msixcap_access(struct passthru_softc *sc, int coff) +{ + if (sc->psc_msix.capoff == 0) + return (0); + + return (coff >= sc->psc_msix.capoff && + coff < sc->psc_msix.capoff + MSIX_CAPLEN); +} + +static int +passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int coff, int bytes, uint32_t *rv) +{ + struct passthru_softc *sc; + + sc = pi->pi_arg; + + /* + * PCI BARs and MSI capability is emulated. + */ + if (bar_access(coff) || msicap_access(sc, coff)) + return (-1); + +#ifdef LEGACY_SUPPORT + /* + * Emulate PCIR_CAP_PTR if this device does not support MSI capability + * natively. + */ + if (sc->psc_msi.emulated) { + if (coff >= PCIR_CAP_PTR && coff < PCIR_CAP_PTR + 4) + return (-1); + } +#endif + + /* Everything else just read from the device's config space */ + *rv = read_config(&sc->psc_sel, coff, bytes); + + return (0); +} + +static int +passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int coff, int bytes, uint32_t val) +{ + int error, msix_table_entries, i; + struct passthru_softc *sc; + + sc = pi->pi_arg; + + /* + * PCI BARs are emulated + */ + if (bar_access(coff)) + return (-1); + + /* + * MSI capability is emulated + */ + if (msicap_access(sc, coff)) { + msicap_cfgwrite(pi, sc->psc_msi.capoff, coff, bytes, val); + + error = vm_setup_pptdev_msi(ctx, vcpu, sc->psc_sel.pc_bus, + sc->psc_sel.pc_dev, sc->psc_sel.pc_func, + pi->pi_msi.addr, pi->pi_msi.msg_data, + pi->pi_msi.maxmsgnum); + if (error != 0) + err(1, "vm_setup_pptdev_msi"); + return (0); + } + + if (msixcap_access(sc, coff)) { + msixcap_cfgwrite(pi, sc->psc_msix.capoff, coff, bytes, val); + if (pi->pi_msix.enabled) { + msix_table_entries = pi->pi_msix.table_count; + for (i = 0; i < msix_table_entries; i++) { + error = vm_setup_pptdev_msix(ctx, vcpu, + sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, + sc->psc_sel.pc_func, i, + pi->pi_msix.table[i].addr, + pi->pi_msix.table[i].msg_data, + pi->pi_msix.table[i].vector_control); + + if (error) + err(1, "vm_setup_pptdev_msix"); + } + } + return (0); + } + +#ifdef LEGACY_SUPPORT + /* + * If this device does not support MSI natively then we cannot let + * the guest disable legacy interrupts from the device. It is the + * legacy interrupt that is triggering the virtual MSI to the guest. 
+ */ + if (sc->psc_msi.emulated && pci_msi_enabled(pi)) { + if (coff == PCIR_COMMAND && bytes == 2) + val &= ~PCIM_CMD_INTxDIS; + } +#endif + + write_config(&sc->psc_sel, coff, bytes, val); + + return (0); +} + +static void +passthru_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size, uint64_t value) +{ + struct passthru_softc *sc; + struct iodev_pio_req pio; + + sc = pi->pi_arg; + + if (baridx == pci_msix_table_bar(pi)) { + msix_table_write(ctx, vcpu, sc, offset, size, value); + } else { + assert(pi->pi_bar[baridx].type == PCIBAR_IO); + bzero(&pio, sizeof(struct iodev_pio_req)); + pio.access = IODEV_PIO_WRITE; + pio.port = sc->psc_bar[baridx].addr + offset; + pio.width = size; + pio.val = value; + + (void)ioctl(iofd, IODEV_PIO, &pio); + } +} + +static uint64_t +passthru_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size) +{ + struct passthru_softc *sc; + struct iodev_pio_req pio; + uint64_t val; + + sc = pi->pi_arg; + + if (baridx == pci_msix_table_bar(pi)) { + val = msix_table_read(sc, offset, size); + } else { + assert(pi->pi_bar[baridx].type == PCIBAR_IO); + bzero(&pio, sizeof(struct iodev_pio_req)); + pio.access = IODEV_PIO_READ; + pio.port = sc->psc_bar[baridx].addr + offset; + pio.width = size; + pio.val = 0; + + (void)ioctl(iofd, IODEV_PIO, &pio); + + val = pio.val; + } + + return (val); +} + +struct pci_devemu passthru = { + .pe_emu = "passthru", + .pe_init = passthru_init, + .pe_cfgwrite = passthru_cfgwrite, + .pe_cfgread = passthru_cfgread, + .pe_barwrite = passthru_write, + .pe_barread = passthru_read, +}; +PCI_EMUL_SET(passthru); diff --git a/usr/src/cmd/bhyve/pci_uart.c b/usr/src/cmd/bhyve/pci_uart.c new file mode 100644 index 0000000000..093d0cb361 --- /dev/null +++ b/usr/src/cmd/bhyve/pci_uart.c @@ -0,0 +1,121 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#include <stdio.h>
+
+#include "bhyverun.h"
+#include "pci_emul.h"
+#include "uart_emul.h"
+
+/*
+ * Pick a PCI vid/did of a chip with a single uart at
+ * BAR0, that most versions of FreeBSD can understand:
+ * Siig CyberSerial 1-port.
+ */
+#define	COM_VENDOR	0x131f
+#define	COM_DEV		0x2000
+
+static void
+pci_uart_intr_assert(void *arg)
+{
+	struct pci_devinst *pi = arg;
+
+	pci_lintr_assert(pi);
+}
+
+static void
+pci_uart_intr_deassert(void *arg)
+{
+	struct pci_devinst *pi = arg;
+
+	pci_lintr_deassert(pi);
+}
+
+static void
+pci_uart_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+    int baridx, uint64_t offset, int size, uint64_t value)
+{
+
+	assert(baridx == 0);
+	assert(size == 1);
+
+	uart_write(pi->pi_arg, offset, value);
+}
+
+static uint64_t
+pci_uart_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+    int baridx, uint64_t offset, int size)
+{
+	uint8_t val;
+
+	assert(baridx == 0);
+	assert(size == 1);
+
+	val = uart_read(pi->pi_arg, offset);
+	return (val);
+}
+
+static int
+pci_uart_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+	struct uart_softc *sc;
+
+	pci_emul_alloc_bar(pi, 0, PCIBAR_IO, UART_IO_BAR_SIZE);
+	pci_lintr_request(pi);
+
+	/* initialize config space */
+	pci_set_cfgdata16(pi, PCIR_DEVICE, COM_DEV);
+	pci_set_cfgdata16(pi, PCIR_VENDOR, COM_VENDOR);
+	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_SIMPLECOMM);
+
+	sc = uart_init(pci_uart_intr_assert, pci_uart_intr_deassert, pi);
+	pi->pi_arg = sc;
+
+	if (uart_set_backend(sc, opts) != 0) {
+		fprintf(stderr, "Unable to initialize backend '%s' for "
+		    "pci uart at %d:%d\n", opts, pi->pi_slot, pi->pi_func);
+		return (-1);
+	}
+
+	return (0);
+}
+
+struct pci_devemu pci_de_com = {
+	.pe_emu =	"uart",
+	.pe_init =	pci_uart_init,
+	.pe_barwrite =	pci_uart_write,
+	.pe_barread =	pci_uart_read
+};
+PCI_EMUL_SET(pci_de_com);
diff --git a/usr/src/cmd/bhyve/pci_virtio_block.c b/usr/src/cmd/bhyve/pci_virtio_block.c
index 65e2d9c57d..5a7ecbfe9e 100644
--- a/usr/src/cmd/bhyve/pci_virtio_block.c
+++ b/usr/src/cmd/bhyve/pci_virtio_block.c
@@ -1,6 +1,9 @@
 /*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
+ * Copyright (c) 2019 Joyent, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -23,7 +26,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/usr.sbin/bhyve/pci_virtio_block.c 266935 2014-06-01 02:47:09Z neel $
+ * $FreeBSD$
 */
 /*
 * This file and its contents are supplied under the terms of the
@@ -36,10 +39,11 @@
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2014 Pluribus Networks Inc.
+ * Copyright 2018 Joyent, Inc.
*/ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_virtio_block.c 266935 2014-06-01 02:47:09Z neel $"); +__FBSDID("$FreeBSD$"); #include #include @@ -63,24 +67,23 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_virtio_block.c 266935 2014-06-01 02: #include "bhyverun.h" #include "pci_emul.h" #include "virtio.h" +#include "block_if.h" -#define VTBLK_RINGSZ 64 +#define VTBLK_RINGSZ 128 -#ifdef __FreeBSD__ -#define VTBLK_MAXSEGS 32 -#else -#define VTBLK_MAXSEGS 16 -#endif +_Static_assert(VTBLK_RINGSZ <= BLOCKIF_RING_MAX, "Each ring entry must be able to queue a request"); #define VTBLK_S_OK 0 #define VTBLK_S_IOERR 1 #define VTBLK_S_UNSUPP 2 -#define VTBLK_BLK_ID_BYTES 20 +#define VTBLK_BLK_ID_BYTES 20 + 1 /* Capability bits */ #define VTBLK_F_SEG_MAX (1 << 2) /* Maximum request segments */ -#define VTBLK_F_BLK_SIZE (1 << 6) /* cfg block size valid */ +#define VTBLK_F_BLK_SIZE (1 << 6) /* cfg block size valid */ +#define VTBLK_F_FLUSH (1 << 9) /* Cache flush support */ +#define VTBLK_F_TOPOLOGY (1 << 10) /* Optimal I/O alignment */ /* * Host capabilities @@ -88,6 +91,8 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_virtio_block.c 266935 2014-06-01 02: #define VTBLK_S_HOSTCAPS \ ( VTBLK_F_SEG_MAX | \ VTBLK_F_BLK_SIZE | \ + VTBLK_F_FLUSH | \ + VTBLK_F_TOPOLOGY | \ VIRTIO_RING_F_INDIRECT_DESC ) /* indirect descriptors */ /* @@ -97,11 +102,19 @@ struct vtblk_config { uint64_t vbc_capacity; uint32_t vbc_size_max; uint32_t vbc_seg_max; - uint16_t vbc_geom_c; - uint8_t vbc_geom_h; - uint8_t vbc_geom_s; + struct { + uint16_t cylinders; + uint8_t heads; + uint8_t sectors; + } vbc_geometry; uint32_t vbc_blk_size; - uint32_t vbc_sectors_max; + struct { + uint8_t physical_block_exp; + uint8_t alignment_offset; + uint16_t min_io_size; + uint32_t opt_io_size; + } vbc_topology; + uint8_t vbc_writeback; } __packed; /* @@ -110,9 +123,11 @@ struct vtblk_config { struct virtio_blk_hdr { #define VBH_OP_READ 0 #define VBH_OP_WRITE 1 -#define VBH_OP_IDENT 8 +#define VBH_OP_FLUSH 4 +#define VBH_OP_FLUSH_OUT 5 +#define VBH_OP_IDENT 8 #define VBH_FLAG_BARRIER 0x80000000 /* OR'ed into vbh_type */ - uint32_t vbh_type; + uint32_t vbh_type; uint32_t vbh_ioprio; uint64_t vbh_sector; } __packed; @@ -124,6 +139,13 @@ static int pci_vtblk_debug; #define DPRINTF(params) if (pci_vtblk_debug) printf params #define WPRINTF(params) printf params +struct pci_vtblk_ioreq { + struct blockif_req io_req; + struct pci_vtblk_softc *io_sc; + uint8_t *io_status; + uint16_t io_idx; +}; + /* * Per-device softc */ @@ -131,24 +153,36 @@ struct pci_vtblk_softc { struct virtio_softc vbsc_vs; pthread_mutex_t vsc_mtx; struct vqueue_info vbsc_vq; - int vbsc_fd; - struct vtblk_config vbsc_cfg; + struct vtblk_config vbsc_cfg; + struct blockif_ctxt *bc; +#ifndef __FreeBSD__ + int vbsc_wce; +#endif char vbsc_ident[VTBLK_BLK_ID_BYTES]; + struct pci_vtblk_ioreq vbsc_ios[VTBLK_RINGSZ]; }; static void pci_vtblk_reset(void *); static void pci_vtblk_notify(void *, struct vqueue_info *); static int pci_vtblk_cfgread(void *, int, int, uint32_t *); static int pci_vtblk_cfgwrite(void *, int, int, uint32_t); +#ifndef __FreeBSD__ +static void pci_vtblk_apply_feats(void *, uint64_t); +#endif static struct virtio_consts vtblk_vi_consts = { "vtblk", /* our name */ 1, /* we support 1 virtqueue */ - sizeof(struct vtblk_config), /* config reg size */ + sizeof(struct vtblk_config), /* config reg size */ pci_vtblk_reset, /* reset */ pci_vtblk_notify, /* device-wide qnotify */ pci_vtblk_cfgread, /* read PCI config */ pci_vtblk_cfgwrite, /* write PCI config */ +#ifndef 
__FreeBSD__ + pci_vtblk_apply_feats, /* apply negotiated features */ +#else + NULL, /* apply negotiated features */ +#endif VTBLK_S_HOSTCAPS, /* our capabilities */ }; @@ -159,22 +193,58 @@ pci_vtblk_reset(void *vsc) DPRINTF(("vtblk: device reset requested !\n")); vi_reset_dev(&sc->vbsc_vs); +#ifndef __FreeBSD__ + /* Disable write cache until FLUSH feature is negotiated */ + (void) blockif_set_wce(sc->bc, 0); + sc->vbsc_wce = 0; +#endif +} + +static void +pci_vtblk_done_locked(struct pci_vtblk_ioreq *io, int err) +{ + struct pci_vtblk_softc *sc = io->io_sc; + + /* convert errno into a virtio block error return */ + if (err == EOPNOTSUPP || err == ENOSYS) + *io->io_status = VTBLK_S_UNSUPP; + else if (err != 0) + *io->io_status = VTBLK_S_IOERR; + else + *io->io_status = VTBLK_S_OK; + + /* + * Return the descriptor back to the host. + * We wrote 1 byte (our status) to host. + */ + vq_relchain(&sc->vbsc_vq, io->io_idx, 1); + vq_endchains(&sc->vbsc_vq, 0); +} + +static void +pci_vtblk_done(struct blockif_req *br, int err) +{ + struct pci_vtblk_ioreq *io = br->br_param; + struct pci_vtblk_softc *sc = io->io_sc; + + pthread_mutex_lock(&sc->vsc_mtx); + pci_vtblk_done_locked(io, err); + pthread_mutex_unlock(&sc->vsc_mtx); } static void pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq) { struct virtio_blk_hdr *vbh; - uint8_t *status; + struct pci_vtblk_ioreq *io; int i, n; int err; - int iolen; + ssize_t iolen; int writeop, type; - off_t offset; - struct iovec iov[VTBLK_MAXSEGS + 2]; - uint16_t flags[VTBLK_MAXSEGS + 2]; + struct iovec iov[BLOCKIF_IOV_MAX + 2]; + uint16_t idx, flags[BLOCKIF_IOV_MAX + 2]; - n = vq_getchain(vq, iov, VTBLK_MAXSEGS + 2, flags); + n = vq_getchain(vq, &idx, iov, BLOCKIF_IOV_MAX + 2, flags); /* * The first descriptor will be the read-only fixed header, @@ -184,13 +254,16 @@ pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq) * XXX - note - this fails on crash dump, which does a * VIRTIO_BLK_T_FLUSH with a zero transfer length */ - assert(n >= 2 && n <= VTBLK_MAXSEGS + 2); + assert(n >= 2 && n <= BLOCKIF_IOV_MAX + 2); + io = &sc->vbsc_ios[idx]; assert((flags[0] & VRING_DESC_F_WRITE) == 0); assert(iov[0].iov_len == sizeof(struct virtio_blk_hdr)); - vbh = (struct virtio_block_hdr *)iov[0].iov_base; - - status = iov[--n].iov_base; + vbh = (struct virtio_blk_hdr *)iov[0].iov_base; + memcpy(&io->io_req.br_iov, &iov[1], sizeof(struct iovec) * (n - 2)); + io->io_req.br_iovcnt = n - 2; + io->io_req.br_offset = vbh->vbh_sector * DEV_BSIZE; + io->io_status = (uint8_t *)iov[--n].iov_base; assert(iov[n].iov_len == 1); assert(flags[n] & VRING_DESC_F_WRITE); @@ -202,8 +275,6 @@ pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq) type = vbh->vbh_type & ~VBH_FLAG_BARRIER; writeop = (type == VBH_OP_WRITE); - offset = vbh->vbh_sector * DEV_BSIZE; - iolen = 0; for (i = 1; i < n; i++) { /* @@ -215,42 +286,36 @@ pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq) assert(((flags[i] & VRING_DESC_F_WRITE) == 0) == writeop); iolen += iov[i].iov_len; } + io->io_req.br_resid = iolen; - DPRINTF(("virtio-block: %s op, %d bytes, %d segs, offset %ld\n\r", - writeop ? "write" : "read/ident", iolen, i - 1, offset)); + DPRINTF(("virtio-block: %s op, %zd bytes, %d segs, offset %ld\n\r", + writeop ? 
"write" : "read/ident", iolen, i - 1, + io->io_req.br_offset)); switch (type) { + case VBH_OP_READ: + err = blockif_read(sc->bc, &io->io_req); + break; case VBH_OP_WRITE: - err = pwritev(sc->vbsc_fd, iov + 1, i - 1, offset); + err = blockif_write(sc->bc, &io->io_req); break; - case VBH_OP_READ: - err = preadv(sc->vbsc_fd, iov + 1, i - 1, offset); + case VBH_OP_FLUSH: + case VBH_OP_FLUSH_OUT: + err = blockif_flush(sc->bc, &io->io_req); break; case VBH_OP_IDENT: /* Assume a single buffer */ - strlcpy(iov[1].iov_base, sc->vbsc_ident, + /* S/n equal to buffer is not zero-terminated. */ + memset(iov[1].iov_base, 0, iov[1].iov_len); + strncpy(iov[1].iov_base, sc->vbsc_ident, MIN(iov[1].iov_len, sizeof(sc->vbsc_ident))); - err = 0; - break; + pci_vtblk_done_locked(io, 0); + return; default: - err = -ENOSYS; - break; + pci_vtblk_done_locked(io, EOPNOTSUPP); + return; } - - /* convert errno into a virtio block error return */ - if (err < 0) { - if (err == -ENOSYS) - *status = VTBLK_S_UNSUPP; - else - *status = VTBLK_S_IOERR; - } else - *status = VTBLK_S_OK; - - /* - * Return the descriptor back to the host. - * We wrote 1 byte (our status) to host. - */ - vq_relchain(vq, 1); + assert(err == 0); } static void @@ -258,22 +323,20 @@ pci_vtblk_notify(void *vsc, struct vqueue_info *vq) { struct pci_vtblk_softc *sc = vsc; - vq_startchains(vq); while (vq_has_descs(vq)) pci_vtblk_proc(sc, vq); - vq_endchains(vq, 1); /* Generate interrupt if appropriate. */ } static int pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { - struct stat sbuf; + char bident[sizeof("XX:X:X")]; + struct blockif_ctxt *bctxt; MD5_CTX mdctx; u_char digest[16]; struct pci_vtblk_softc *sc; - off_t size; - int fd; - int sectsz; + off_t size; + int i, sectsz, sts, sto; if (opts == NULL) { printf("virtio-block: backing device required\n"); @@ -283,40 +346,32 @@ pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) /* * The supplied backing file has to exist */ - fd = open(opts, O_RDWR); - if (fd < 0) { + snprintf(bident, sizeof(bident), "%d:%d", pi->pi_slot, pi->pi_func); + bctxt = blockif_open(opts, bident); + if (bctxt == NULL) { perror("Could not open backing file"); return (1); } - if (fstat(fd, &sbuf) < 0) { - perror("Could not stat backing file"); - close(fd); - return (1); - } - - /* - * Deal with raw devices - */ - size = sbuf.st_size; - sectsz = DEV_BSIZE; -#ifdef __FreeBSD__ - if (S_ISCHR(sbuf.st_mode)) { - if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 || - ioctl(fd, DIOCGSECTORSIZE, §sz)) { - perror("Could not fetch dev blk/sector size"); - close(fd); - return (1); - } - assert(size != 0); - assert(sectsz != 0); - } -#endif + size = blockif_size(bctxt); + sectsz = blockif_sectsz(bctxt); + blockif_psectsz(bctxt, &sts, &sto); sc = calloc(1, sizeof(struct pci_vtblk_softc)); + sc->bc = bctxt; + for (i = 0; i < VTBLK_RINGSZ; i++) { + struct pci_vtblk_ioreq *io = &sc->vbsc_ios[i]; + io->io_req.br_callback = pci_vtblk_done; + io->io_req.br_param = io; + io->io_sc = sc; + io->io_idx = i; + } - /* record fd of storage device/file */ - sc->vbsc_fd = fd; +#ifndef __FreeBSD__ + /* Disable write cache until FLUSH feature is negotiated */ + (void) blockif_set_wce(sc->bc, 0); + sc->vbsc_wce = 0; +#endif pthread_mutex_init(&sc->vsc_mtx, NULL); @@ -333,19 +388,34 @@ pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) */ MD5Init(&mdctx); MD5Update(&mdctx, opts, strlen(opts)); - MD5Final(digest, &mdctx); - sprintf(sc->vbsc_ident, "BHYVE-%02X%02X-%02X%02X-%02X%02X", + MD5Final(digest, &mdctx); + 
snprintf(sc->vbsc_ident, VTBLK_BLK_ID_BYTES, + "BHYVE-%02X%02X-%02X%02X-%02X%02X", digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]); /* setup virtio block config space */ sc->vbsc_cfg.vbc_capacity = size / DEV_BSIZE; /* 512-byte units */ - sc->vbsc_cfg.vbc_seg_max = VTBLK_MAXSEGS; - sc->vbsc_cfg.vbc_blk_size = sectsz; sc->vbsc_cfg.vbc_size_max = 0; /* not negotiated */ - sc->vbsc_cfg.vbc_geom_c = 0; /* no geometry */ - sc->vbsc_cfg.vbc_geom_h = 0; - sc->vbsc_cfg.vbc_geom_s = 0; - sc->vbsc_cfg.vbc_sectors_max = 0; + + /* + * If Linux is presented with a seg_max greater than the virtio queue + * size, it can stumble into situations where it violates its own + * invariants and panics. For safety, we keep seg_max clamped, paying + * heed to the two extra descriptors needed for the header and status + * of a request. + */ + sc->vbsc_cfg.vbc_seg_max = MIN(VTBLK_RINGSZ - 2, BLOCKIF_IOV_MAX); + sc->vbsc_cfg.vbc_geometry.cylinders = 0; /* no geometry */ + sc->vbsc_cfg.vbc_geometry.heads = 0; + sc->vbsc_cfg.vbc_geometry.sectors = 0; + sc->vbsc_cfg.vbc_blk_size = sectsz; + sc->vbsc_cfg.vbc_topology.physical_block_exp = + (sts > sectsz) ? (ffsll(sts / sectsz) - 1) : 0; + sc->vbsc_cfg.vbc_topology.alignment_offset = + (sto != 0) ? ((sts - sto) / sectsz) : 0; + sc->vbsc_cfg.vbc_topology.min_io_size = 0; + sc->vbsc_cfg.vbc_topology.opt_io_size = 0; + sc->vbsc_cfg.vbc_writeback = 0; /* * Should we move some of this into virtio.c? Could @@ -356,9 +426,13 @@ pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_BLOCK); + pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); - if (vi_intr_init(&sc->vbsc_vs, 1, fbsdrun_virtio_msix())) + if (vi_intr_init(&sc->vbsc_vs, 1, fbsdrun_virtio_msix())) { + blockif_close(sc->bc); + free(sc); return (1); + } vi_set_io_bar(&sc->vbsc_vs, 0); return (0); } @@ -383,6 +457,20 @@ pci_vtblk_cfgread(void *vsc, int offset, int size, uint32_t *retval) return (0); } +#ifndef __FreeBSD__ +void +pci_vtblk_apply_feats(void *vsc, uint64_t caps) +{ + struct pci_vtblk_softc *sc = vsc; + const int wce_next = ((caps & VTBLK_F_FLUSH) != 0) ? 1 : 0; + + if (sc->vbsc_wce != wce_next) { + (void) blockif_set_wce(sc->bc, wce_next); + sc->vbsc_wce = wce_next; + } +} +#endif /* __FreeBSD__ */ + struct pci_devemu pci_de_vblk = { .pe_emu = "virtio-blk", .pe_init = pci_vtblk_init, diff --git a/usr/src/cmd/bhyve/pci_virtio_console.c b/usr/src/cmd/bhyve/pci_virtio_console.c new file mode 100644 index 0000000000..90437662df --- /dev/null +++ b/usr/src/cmd/bhyve/pci_virtio_console.c @@ -0,0 +1,701 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 iXsystems Inc. + * All rights reserved. + * + * This software was developed by Jakub Klama + * under sponsorship from iXsystems Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include + +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bhyverun.h" +#include "pci_emul.h" +#include "virtio.h" +#include "mevent.h" +#include "sockstream.h" + +#define VTCON_RINGSZ 64 +#define VTCON_MAXPORTS 16 +#define VTCON_MAXQ (VTCON_MAXPORTS * 2 + 2) + +#define VTCON_DEVICE_READY 0 +#define VTCON_DEVICE_ADD 1 +#define VTCON_DEVICE_REMOVE 2 +#define VTCON_PORT_READY 3 +#define VTCON_CONSOLE_PORT 4 +#define VTCON_CONSOLE_RESIZE 5 +#define VTCON_PORT_OPEN 6 +#define VTCON_PORT_NAME 7 + +#define VTCON_F_SIZE 0 +#define VTCON_F_MULTIPORT 1 +#define VTCON_F_EMERG_WRITE 2 +#define VTCON_S_HOSTCAPS \ + (VTCON_F_SIZE | VTCON_F_MULTIPORT | VTCON_F_EMERG_WRITE) + +static int pci_vtcon_debug; +#define DPRINTF(params) if (pci_vtcon_debug) printf params +#define WPRINTF(params) printf params + +struct pci_vtcon_softc; +struct pci_vtcon_port; +struct pci_vtcon_config; +typedef void (pci_vtcon_cb_t)(struct pci_vtcon_port *, void *, struct iovec *, + int); + +struct pci_vtcon_port { + struct pci_vtcon_softc * vsp_sc; + int vsp_id; + const char * vsp_name; + bool vsp_enabled; + bool vsp_console; + bool vsp_rx_ready; + bool vsp_open; + int vsp_rxq; + int vsp_txq; + void * vsp_arg; + pci_vtcon_cb_t * vsp_cb; +}; + +struct pci_vtcon_sock +{ + struct pci_vtcon_port * vss_port; + const char * vss_path; + struct mevent * vss_server_evp; + struct mevent * vss_conn_evp; + int vss_server_fd; + int vss_conn_fd; + bool vss_open; +}; + +struct pci_vtcon_softc { + struct virtio_softc vsc_vs; + struct vqueue_info vsc_queues[VTCON_MAXQ]; + pthread_mutex_t vsc_mtx; + uint64_t vsc_cfg; + uint64_t vsc_features; + char * vsc_rootdir; + int vsc_kq; + int vsc_nports; + bool vsc_ready; + struct pci_vtcon_port vsc_control_port; + struct pci_vtcon_port vsc_ports[VTCON_MAXPORTS]; + struct pci_vtcon_config *vsc_config; +}; + +struct pci_vtcon_config { + uint16_t cols; + uint16_t rows; + uint32_t max_nr_ports; + uint32_t emerg_wr; +} __attribute__((packed)); + +struct pci_vtcon_control { + uint32_t id; + uint16_t event; + uint16_t value; +} __attribute__((packed)); + +struct pci_vtcon_console_resize { + uint16_t cols; + uint16_t rows; +} __attribute__((packed)); + +static void pci_vtcon_reset(void *); +static void pci_vtcon_notify_rx(void *, struct vqueue_info *); +static void pci_vtcon_notify_tx(void *, struct vqueue_info *); +static int pci_vtcon_cfgread(void *, int, int, uint32_t *); +static int pci_vtcon_cfgwrite(void *, int, int, uint32_t); 
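+
+/*
+ * For orientation (an illustrative summary of the code below, not a
+ * separate mechanism): the virtqueue layout is fixed.  With
+ * VTCON_MAXQ = VTCON_MAXPORTS * 2 + 2,
+ *
+ *	queues 0/1		<-> port 0
+ *	queues 2/3		<-> the control port
+ *	queues 2p+2/2p+3	<-> port p, for p >= 1
+ *
+ * pci_vtcon_vq_to_port() below computes the inverse, (num / 2) - 1,
+ * for queue numbers >= 4.
+ */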
+static void pci_vtcon_neg_features(void *, uint64_t); +static void pci_vtcon_sock_accept(int, enum ev_type, void *); +static void pci_vtcon_sock_rx(int, enum ev_type, void *); +static void pci_vtcon_sock_tx(struct pci_vtcon_port *, void *, struct iovec *, + int); +static void pci_vtcon_control_send(struct pci_vtcon_softc *, + struct pci_vtcon_control *, const void *, size_t); +static void pci_vtcon_announce_port(struct pci_vtcon_port *); +static void pci_vtcon_open_port(struct pci_vtcon_port *, bool); + +static struct virtio_consts vtcon_vi_consts = { + "vtcon", /* our name */ + VTCON_MAXQ, /* we support VTCON_MAXQ virtqueues */ + sizeof(struct pci_vtcon_config), /* config reg size */ + pci_vtcon_reset, /* reset */ + NULL, /* device-wide qnotify */ + pci_vtcon_cfgread, /* read virtio config */ + pci_vtcon_cfgwrite, /* write virtio config */ + pci_vtcon_neg_features, /* apply negotiated features */ + VTCON_S_HOSTCAPS, /* our capabilities */ +}; + + +static void +pci_vtcon_reset(void *vsc) +{ + struct pci_vtcon_softc *sc; + + sc = vsc; + + DPRINTF(("vtcon: device reset requested!\n")); + vi_reset_dev(&sc->vsc_vs); +} + +static void +pci_vtcon_neg_features(void *vsc, uint64_t negotiated_features) +{ + struct pci_vtcon_softc *sc = vsc; + + sc->vsc_features = negotiated_features; +} + +static int +pci_vtcon_cfgread(void *vsc, int offset, int size, uint32_t *retval) +{ + struct pci_vtcon_softc *sc = vsc; + void *ptr; + + ptr = (uint8_t *)sc->vsc_config + offset; + memcpy(retval, ptr, size); + return (0); +} + +static int +pci_vtcon_cfgwrite(void *vsc, int offset, int size, uint32_t val) +{ + + return (0); +} + +static inline struct pci_vtcon_port * +pci_vtcon_vq_to_port(struct pci_vtcon_softc *sc, struct vqueue_info *vq) +{ + uint16_t num = vq->vq_num; + + if (num == 0 || num == 1) + return (&sc->vsc_ports[0]); + + if (num == 2 || num == 3) + return (&sc->vsc_control_port); + + return (&sc->vsc_ports[(num / 2) - 1]); +} + +static inline struct vqueue_info * +pci_vtcon_port_to_vq(struct pci_vtcon_port *port, bool tx_queue) +{ + int qnum; + + qnum = tx_queue ? port->vsp_txq : port->vsp_rxq; + return (&port->vsp_sc->vsc_queues[qnum]); +} + +static struct pci_vtcon_port * +pci_vtcon_port_add(struct pci_vtcon_softc *sc, const char *name, + pci_vtcon_cb_t *cb, void *arg) +{ + struct pci_vtcon_port *port; + + if (sc->vsc_nports == VTCON_MAXPORTS) { + errno = EBUSY; + return (NULL); + } + + port = &sc->vsc_ports[sc->vsc_nports++]; + port->vsp_id = sc->vsc_nports - 1; + port->vsp_sc = sc; + port->vsp_name = name; + port->vsp_cb = cb; + port->vsp_arg = arg; + + if (port->vsp_id == 0) { + /* port0 */ + port->vsp_txq = 0; + port->vsp_rxq = 1; + } else { + port->vsp_txq = sc->vsc_nports * 2; + port->vsp_rxq = port->vsp_txq + 1; + } + + port->vsp_enabled = true; + return (port); +} + +static int +pci_vtcon_sock_add(struct pci_vtcon_softc *sc, const char *name, + const char *path) +{ + struct pci_vtcon_sock *sock; +#ifdef __FreeBSD__ + struct sockaddr_un sun; + char *pathcopy; +#else + /* Our compiler #defines 'sun' as '1'. Awesome. 
*/ + struct sockaddr_un addr; +#endif + int s = -1, fd = -1, error = 0; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif + + sock = calloc(1, sizeof(struct pci_vtcon_sock)); + if (sock == NULL) { + error = -1; + goto out; + } + + s = socket(AF_UNIX, SOCK_STREAM, 0); + if (s < 0) { + error = -1; + goto out; + } + +#ifdef __FreeBSD__ + pathcopy = strdup(path); + if (pathcopy == NULL) { + error = -1; + goto out; + } + + fd = open(dirname(pathcopy), O_RDONLY | O_DIRECTORY); + if (fd < 0) { + free(pathcopy); + error = -1; + goto out; + } + + sun.sun_family = AF_UNIX; + sun.sun_len = sizeof(struct sockaddr_un); + strcpy(pathcopy, path); + strlcpy(sun.sun_path, basename(pathcopy), sizeof(sun.sun_path)); + free(pathcopy); + + if (bindat(fd, s, (struct sockaddr *)&sun, sun.sun_len) < 0) { + error = -1; + goto out; + } +#else /* __FreeBSD__ */ + /* Do a simple bind rather than the FreeBSD bindat() */ + addr.sun_family = AF_UNIX; + (void) strlcpy(addr.sun_path, path, sizeof (addr.sun_path)); + if (bind(fd, (struct sockaddr *)&addr, sizeof (addr)) < 0) { + error = -1; + goto out; + } +#endif /* __FreeBSD__ */ + + if (fcntl(s, F_SETFL, O_NONBLOCK) < 0) { + error = -1; + goto out; + } + + if (listen(s, 1) < 0) { + error = -1; + goto out; + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_ACCEPT, CAP_EVENT, CAP_READ, CAP_WRITE); + if (caph_rights_limit(s, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + sock->vss_port = pci_vtcon_port_add(sc, name, pci_vtcon_sock_tx, sock); + if (sock->vss_port == NULL) { + error = -1; + goto out; + } + + sock->vss_open = false; + sock->vss_conn_fd = -1; + sock->vss_server_fd = s; + sock->vss_server_evp = mevent_add(s, EVF_READ, pci_vtcon_sock_accept, + sock); + + if (sock->vss_server_evp == NULL) { + error = -1; + goto out; + } + +out: + if (fd != -1) + close(fd); + + if (error != 0 && s != -1) + close(s); + + return (error); +} + +static void +pci_vtcon_sock_accept(int fd __unused, enum ev_type t __unused, void *arg) +{ + struct pci_vtcon_sock *sock = (struct pci_vtcon_sock *)arg; + int s; + + s = accept(sock->vss_server_fd, NULL, NULL); + if (s < 0) + return; + + if (sock->vss_open) { + close(s); + return; + } + + sock->vss_open = true; + sock->vss_conn_fd = s; + sock->vss_conn_evp = mevent_add(s, EVF_READ, pci_vtcon_sock_rx, sock); + + pci_vtcon_open_port(sock->vss_port, true); +} + +static void +pci_vtcon_sock_rx(int fd __unused, enum ev_type t __unused, void *arg) +{ + struct pci_vtcon_port *port; + struct pci_vtcon_sock *sock = (struct pci_vtcon_sock *)arg; + struct vqueue_info *vq; + struct iovec iov; + static char dummybuf[2048]; + int len, n; + uint16_t idx; + + port = sock->vss_port; + vq = pci_vtcon_port_to_vq(port, true); + + if (!sock->vss_open || !port->vsp_rx_ready) { + len = read(sock->vss_conn_fd, dummybuf, sizeof(dummybuf)); + if (len == 0) + goto close; + + return; + } + + if (!vq_has_descs(vq)) { + len = read(sock->vss_conn_fd, dummybuf, sizeof(dummybuf)); + vq_endchains(vq, 1); + if (len == 0) + goto close; + + return; + } + + do { + n = vq_getchain(vq, &idx, &iov, 1, NULL); + len = readv(sock->vss_conn_fd, &iov, n); + + if (len == 0 || (len < 0 && errno == EWOULDBLOCK)) { + vq_retchain(vq); + vq_endchains(vq, 0); + if (len == 0) + goto close; + + return; + } + + vq_relchain(vq, idx, len); + } while (vq_has_descs(vq)); + + vq_endchains(vq, 1); + +close: + mevent_delete_close(sock->vss_conn_evp); + sock->vss_conn_fd = -1; + sock->vss_open = false; +} + +static void +pci_vtcon_sock_tx(struct 
pci_vtcon_port *port, void *arg, struct iovec *iov, + int niov) +{ + struct pci_vtcon_sock *sock; +#ifdef __FreeBSD__ + int i, ret; +#else + int i, ret = 0; +#endif + + sock = (struct pci_vtcon_sock *)arg; + + if (sock->vss_conn_fd == -1) + return; + + for (i = 0; i < niov; i++) { + ret = stream_write(sock->vss_conn_fd, iov[i].iov_base, + iov[i].iov_len); + if (ret <= 0) + break; + } + + if (ret <= 0) { + mevent_delete_close(sock->vss_conn_evp); + sock->vss_conn_fd = -1; + sock->vss_open = false; + } +} + +static void +pci_vtcon_control_tx(struct pci_vtcon_port *port, void *arg, struct iovec *iov, + int niov) +{ + struct pci_vtcon_softc *sc; + struct pci_vtcon_port *tmp; + struct pci_vtcon_control resp, *ctrl; + int i; + + assert(niov == 1); + + sc = port->vsp_sc; + ctrl = (struct pci_vtcon_control *)iov->iov_base; + + switch (ctrl->event) { + case VTCON_DEVICE_READY: + sc->vsc_ready = true; + /* set port ready events for registered ports */ + for (i = 0; i < VTCON_MAXPORTS; i++) { + tmp = &sc->vsc_ports[i]; + if (tmp->vsp_enabled) + pci_vtcon_announce_port(tmp); + + if (tmp->vsp_open) + pci_vtcon_open_port(tmp, true); + } + break; + + case VTCON_PORT_READY: + if (ctrl->id >= sc->vsc_nports) { + WPRINTF(("VTCON_PORT_READY event for unknown port %d\n", + ctrl->id)); + return; + } + + tmp = &sc->vsc_ports[ctrl->id]; + if (tmp->vsp_console) { + resp.event = VTCON_CONSOLE_PORT; + resp.id = ctrl->id; + resp.value = 1; + pci_vtcon_control_send(sc, &resp, NULL, 0); + } + break; + } +} + +static void +pci_vtcon_announce_port(struct pci_vtcon_port *port) +{ + struct pci_vtcon_control event; + + event.id = port->vsp_id; + event.event = VTCON_DEVICE_ADD; + event.value = 1; + pci_vtcon_control_send(port->vsp_sc, &event, NULL, 0); + + event.event = VTCON_PORT_NAME; + pci_vtcon_control_send(port->vsp_sc, &event, port->vsp_name, + strlen(port->vsp_name)); +} + +static void +pci_vtcon_open_port(struct pci_vtcon_port *port, bool open) +{ + struct pci_vtcon_control event; + + if (!port->vsp_sc->vsc_ready) { + port->vsp_open = true; + return; + } + + event.id = port->vsp_id; + event.event = VTCON_PORT_OPEN; + event.value = (int)open; + pci_vtcon_control_send(port->vsp_sc, &event, NULL, 0); +} + +static void +pci_vtcon_control_send(struct pci_vtcon_softc *sc, + struct pci_vtcon_control *ctrl, const void *payload, size_t len) +{ + struct vqueue_info *vq; + struct iovec iov; + uint16_t idx; + int n; + + vq = pci_vtcon_port_to_vq(&sc->vsc_control_port, true); + + if (!vq_has_descs(vq)) + return; + + n = vq_getchain(vq, &idx, &iov, 1, NULL); + + assert(n == 1); + + memcpy(iov.iov_base, ctrl, sizeof(struct pci_vtcon_control)); + if (payload != NULL && len > 0) + memcpy(iov.iov_base + sizeof(struct pci_vtcon_control), + payload, len); + + vq_relchain(vq, idx, sizeof(struct pci_vtcon_control) + len); + vq_endchains(vq, 1); +} + + +static void +pci_vtcon_notify_tx(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtcon_softc *sc; + struct pci_vtcon_port *port; + struct iovec iov[1]; + uint16_t idx, n; + uint16_t flags[8]; + + sc = vsc; + port = pci_vtcon_vq_to_port(sc, vq); + + while (vq_has_descs(vq)) { + n = vq_getchain(vq, &idx, iov, 1, flags); + assert(n >= 1); + if (port != NULL) + port->vsp_cb(port, port->vsp_arg, iov, 1); + + /* + * Release this chain and handle more + */ + vq_relchain(vq, idx, 0); + } + vq_endchains(vq, 1); /* Generate interrupt if appropriate. 
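+	 * (Here the second argument to vq_endchains() indicates that
+	 * every available chain was consumed, which it factors in when
+	 * deciding whether the guest needs an interrupt at all.)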
*/ +} + +static void +pci_vtcon_notify_rx(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtcon_softc *sc; + struct pci_vtcon_port *port; + + sc = vsc; + port = pci_vtcon_vq_to_port(sc, vq); + + if (!port->vsp_rx_ready) { + port->vsp_rx_ready = 1; + vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; + } +} + +static int +pci_vtcon_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + struct pci_vtcon_softc *sc; + char *portname = NULL; + char *portpath = NULL; + char *opt; + int i; + + sc = calloc(1, sizeof(struct pci_vtcon_softc)); + sc->vsc_config = calloc(1, sizeof(struct pci_vtcon_config)); + sc->vsc_config->max_nr_ports = VTCON_MAXPORTS; + sc->vsc_config->cols = 80; + sc->vsc_config->rows = 25; + + vi_softc_linkup(&sc->vsc_vs, &vtcon_vi_consts, sc, pi, sc->vsc_queues); + sc->vsc_vs.vs_mtx = &sc->vsc_mtx; + + for (i = 0; i < VTCON_MAXQ; i++) { + sc->vsc_queues[i].vq_qsize = VTCON_RINGSZ; + sc->vsc_queues[i].vq_notify = i % 2 == 0 + ? pci_vtcon_notify_rx + : pci_vtcon_notify_tx; + } + + /* initialize config space */ + pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_CONSOLE); + pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_SIMPLECOMM); + pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_CONSOLE); + pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); + + if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix())) + return (1); + vi_set_io_bar(&sc->vsc_vs, 0); + + /* create control port */ + sc->vsc_control_port.vsp_sc = sc; + sc->vsc_control_port.vsp_txq = 2; + sc->vsc_control_port.vsp_rxq = 3; + sc->vsc_control_port.vsp_cb = pci_vtcon_control_tx; + sc->vsc_control_port.vsp_enabled = true; + + while ((opt = strsep(&opts, ",")) != NULL) { + portname = strsep(&opt, "="); + portpath = opt; + + /* create port */ + if (pci_vtcon_sock_add(sc, portname, portpath) < 0) { + fprintf(stderr, "cannot create port %s: %s\n", + portname, strerror(errno)); + return (1); + } + } + + return (0); +} + +struct pci_devemu pci_de_vcon = { + .pe_emu = "virtio-console", + .pe_init = pci_vtcon_init, + .pe_barwrite = vi_pci_write, + .pe_barread = vi_pci_read +}; +PCI_EMUL_SET(pci_de_vcon); diff --git a/usr/src/cmd/bhyve/pci_virtio_net.c b/usr/src/cmd/bhyve/pci_virtio_net.c index e58bdd0115..aa188a3e59 100644 --- a/usr/src/cmd/bhyve/pci_virtio_net.c +++ b/usr/src/cmd/bhyve/pci_virtio_net.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/pci_virtio_net.c 253440 2013-07-17 23:37:33Z grehan $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,18 +38,33 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2013 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. 
*/ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_virtio_net.c 253440 2013-07-17 23:37:33Z grehan $"); +__FBSDID("$FreeBSD$"); #include +#ifndef WITHOUT_CAPSICUM +#include +#endif #include #include #include #include +#include #include +#ifdef __FreeBSD__ +#ifndef NETMAP_WITH_LIBS +#define NETMAP_WITH_LIBS +#endif +#include +#endif +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include #include #include #include @@ -60,21 +77,22 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_virtio_net.c 253440 2013-07-17 23:37 #include #include #include -#ifndef __FreeBSD__ +#include +#ifndef __FreeBSD__ #include #include #endif #include "bhyverun.h" #include "pci_emul.h" -#ifdef __FreeBSD__ +#ifdef __FreeBSD__ #include "mevent.h" #endif #include "virtio.h" #define VTNET_RINGSZ 1024 -#define VTNET_MAXSEGS 32 +#define VTNET_MAXSEGS 256 /* * Host capabilities. Note that we only offer a few of these. @@ -101,7 +119,7 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_virtio_net.c 253440 2013-07-17 23:37 #define VTNET_S_HOSTCAPS \ ( VIRTIO_NET_F_MAC | VIRTIO_NET_F_MRG_RXBUF | VIRTIO_NET_F_STATUS | \ - VIRTIO_F_NOTIFY_ON_EMPTY) + VIRTIO_F_NOTIFY_ON_EMPTY | VIRTIO_RING_F_INDIRECT_DESC) /* * PCI config-space "registers" @@ -155,25 +173,35 @@ struct pci_vtnet_softc { dlpi_handle_t vsc_dhp; int vsc_dlpifd; #endif + struct nm_desc *vsc_nmd; + int vsc_rx_ready; volatile int resetting; /* set and checked outside lock */ - uint32_t vsc_features; + uint64_t vsc_features; /* negotiated features */ + struct virtio_net_config vsc_config; pthread_mutex_t rx_mtx; int rx_in_progress; + int rx_vhdrlen; + int rx_merge; /* merged rx bufs in use */ pthread_t tx_tid; pthread_mutex_t tx_mtx; pthread_cond_t tx_cond; int tx_in_progress; + + void (*pci_vtnet_rx)(struct pci_vtnet_softc *sc); + void (*pci_vtnet_tx)(struct pci_vtnet_softc *sc, struct iovec *iov, + int iovcnt, int len); }; static void pci_vtnet_reset(void *); /* static void pci_vtnet_notify(void *, struct vqueue_info *); */ static int pci_vtnet_cfgread(void *, int, int, uint32_t *); static int pci_vtnet_cfgwrite(void *, int, int, uint32_t); +static void pci_vtnet_neg_features(void *, uint64_t); static struct virtio_consts vtnet_vi_consts = { "vtnet", /* our name */ @@ -183,6 +211,7 @@ static struct virtio_consts vtnet_vi_consts = { NULL, /* device-wide qnotify -- not used */ pci_vtnet_cfgread, /* read PCI config */ pci_vtnet_cfgwrite, /* write PCI config */ + pci_vtnet_neg_features, /* apply negotiated features */ VTNET_S_HOSTCAPS, /* our capabilities */ }; @@ -235,6 +264,8 @@ pci_vtnet_reset(void *vsc) pci_vtnet_rxwait(sc); sc->vsc_rx_ready = 0; + sc->rx_merge = 1; + sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr); /* now reset rings, MSI-X vectors, and negotiated capabilities */ vi_reset_dev(&sc->vsc_vs); @@ -245,7 +276,7 @@ pci_vtnet_reset(void *vsc) /* * Called to send a buffer chain out to the tap device */ -#ifdef __FreeBSD__ +#ifdef __FreeBSD__ static void pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt, int len) @@ -275,13 +306,13 @@ pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt, int i; for (i = 0; i < iovcnt; i++) { - (void) dlpi_send(sc->vsc_dhp, NULL, NULL, - iov[i].iov_base, iov[i].iov_len, NULL); + (void) dlpi_send(sc->vsc_dhp, NULL, 0, + iov[i].iov_base, iov[i].iov_len, NULL); } } -#endif +#endif /* __FreeBSD__ */ -#ifdef __FreeBSD__ +#ifdef __FreeBSD__ /* * Called when there is read activity on the tap file descriptor. 
* Each buffer posted by the guest is assumed to be able to contain @@ -290,23 +321,43 @@ pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt, * is no need for it to be per-vtnet or locked. */ static uint8_t dummybuf[2048]; -#endif +#endif /* __FreeBSD__ */ + +static __inline struct iovec * +rx_iov_trim(struct iovec *iov, int *niov, int tlen) +{ + struct iovec *riov; + + /* XXX short-cut: assume first segment is >= tlen */ + assert(iov[0].iov_len >= tlen); + + iov[0].iov_len -= tlen; + if (iov[0].iov_len == 0) { + assert(*niov > 1); + *niov -= 1; + riov = &iov[1]; + } else { + iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + tlen); + riov = &iov[0]; + } + + return (riov); +} static void pci_vtnet_tap_rx(struct pci_vtnet_softc *sc) { + struct iovec iov[VTNET_MAXSEGS], *riov; struct vqueue_info *vq; - struct virtio_net_rxhdr *vrx; - uint8_t *buf; + void *vrx; + int n; #ifdef __FreeBSD__ int len; -#endif - struct iovec iov[VTNET_MAXSEGS]; -#ifndef __FreeBSD__ +#else size_t len; int ret; #endif - int total_len = 0; + uint16_t idx; /* * Should never be called without a valid tap fd @@ -335,7 +386,6 @@ pci_vtnet_tap_rx(struct pci_vtnet_softc *sc) * Check for available rx buffers */ vq = &sc->vsc_queues[VTNET_RXQ]; - vq_startchains(vq); if (!vq_has_descs(vq)) { /* * Drop the packet and try later. Interrupt on @@ -352,109 +402,267 @@ pci_vtnet_tap_rx(struct pci_vtnet_softc *sc) /* * Get descriptor chain */ - if (sc->vsc_vs.vs_negotiated_caps & VIRTIO_NET_F_MRG_RXBUF) { - assert(vq_getchain(vq, iov, 1, NULL) == 1); + n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL); + assert(n >= 1 && n <= VTNET_MAXSEGS); - /* - * Get a pointer to the rx header, and use the - * data immediately following it for the packet buffer. - */ - vrx = (struct virtio_net_rxhdr *)iov[0].iov_base; - buf = (uint8_t *)(vrx + 1); - total_len = iov[0].iov_len; + /* + * Get a pointer to the rx header, and use the + * data immediately following it for the packet buffer. + */ + vrx = iov[0].iov_base; + riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen); #ifdef __FreeBSD__ - len = read(sc->vsc_tapfd, buf, - iov[0].iov_len - sizeof(struct virtio_net_rxhdr)); - - if (len < 0 && errno == EWOULDBLOCK) { - /* - * No more packets, but still some avail ring - * entries. Interrupt if needed/appropriate. - */ - vq_endchains(vq, 0); - return; - } + len = readv(sc->vsc_tapfd, riov, n); #else - len = iov[0].iov_len - sizeof(struct virtio_net_rxhdr); - ret = dlpi_recv(sc->vsc_dhp, NULL, NULL, buf, - &len, 0, NULL); - if (ret != DLPI_SUCCESS) { - /* - * No more packets, but still some avail ring - * entries. Interrupt if needed/appropriate. - */ - vq_endchains(vq, 0); - return; - } + len = riov[0].iov_len; + ret = dlpi_recv(sc->vsc_dhp, NULL, NULL, + (uint8_t *)riov[0].iov_base, &len, 0, NULL); + if (ret != DLPI_SUCCESS) { + errno = EWOULDBLOCK; + len = 0; + } #endif - } else { - int i; - int num_segs; - num_segs = vq_getchain(vq, iov, - VTNET_MAXSEGS, NULL); - vrx = (struct virtio_net_rxhrd *)iov[0].iov_base; - total_len = iov[0].iov_len; - for (i = 1; i < num_segs; i++) { - buf = (uint8_t *)iov[i].iov_base; - total_len += iov[i].iov_len; + if (len <= 0 && errno == EWOULDBLOCK) { + /* + * No more packets, but still some avail ring + * entries. Interrupt if needed/appropriate. + */ + vq_retchain(vq); + vq_endchains(vq, 0); + return; + } + + /* + * The only valid field in the rx packet header is the + * number of buffers if merged rx bufs were negotiated. 
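+	 *
+	 * (With merged rx buffers this header is the full 12-byte
+	 * virtio_net_rxhdr, whose trailing 16-bit vrh_bufs field is the
+	 * buffer count; without them rx_vhdrlen is 10, the same header
+	 * minus that field.)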
+ */ + memset(vrx, 0, sc->rx_vhdrlen); + + if (sc->rx_merge) { + struct virtio_net_rxhdr *vrxh; + + vrxh = vrx; + vrxh->vrh_bufs = 1; + } + + /* + * Release this chain and handle more chains. + */ + vq_relchain(vq, idx, len + sc->rx_vhdrlen); + } while (vq_has_descs(vq)); + + /* Interrupt if needed, including for NOTIFY_ON_EMPTY. */ + vq_endchains(vq, 1); +} + #ifdef __FreeBSD__ - len = read(sc->vsc_tapfd, buf, iov[i].iov_len); - if (len < 0 && errno == EWOULDBLOCK) { - /* - * No more packets, - * but still some avail ring entries. - * Interrupt if needed/appropriate. - */ - break; - } -#else - len = iov[i].iov_len; - ret = dlpi_recv(sc->vsc_dhp, NULL, NULL, buf, - &len, 0, NULL); - if (ret != DLPI_SUCCESS) { - /* - * No more packets, - * but still some avail ring entries. - * Interrupt if needed/appropriate. - */ - total_len = 0; - break; - } -#endif - } - if (total_len == 0) { - vq_endchains(vq, 0); - return; - } +static __inline int +pci_vtnet_netmap_writev(struct nm_desc *nmd, struct iovec *iov, int iovcnt) +{ + int r, i; + int len = 0; + + for (r = nmd->cur_tx_ring; ; ) { + struct netmap_ring *ring = NETMAP_TXRING(nmd->nifp, r); + uint32_t cur, idx; + char *buf; + + if (nm_ring_empty(ring)) { + r++; + if (r > nmd->last_tx_ring) + r = nmd->first_tx_ring; + if (r == nmd->cur_tx_ring) + break; + continue; + } + cur = ring->cur; + idx = ring->slot[cur].buf_idx; + buf = NETMAP_BUF(ring, idx); + + for (i = 0; i < iovcnt; i++) { + if (len + iov[i].iov_len > 2048) + break; + memcpy(&buf[len], iov[i].iov_base, iov[i].iov_len); + len += iov[i].iov_len; + } + ring->slot[cur].len = len; + ring->head = ring->cur = nm_ring_next(ring, cur); + nmd->cur_tx_ring = r; + ioctl(nmd->fd, NIOCTXSYNC, NULL); + break; + } + + return (len); +} + +static __inline int +pci_vtnet_netmap_readv(struct nm_desc *nmd, struct iovec *iov, int iovcnt) +{ + int len = 0; + int i = 0; + int r; + + for (r = nmd->cur_rx_ring; ; ) { + struct netmap_ring *ring = NETMAP_RXRING(nmd->nifp, r); + uint32_t cur, idx; + char *buf; + size_t left; + + if (nm_ring_empty(ring)) { + r++; + if (r > nmd->last_rx_ring) + r = nmd->first_rx_ring; + if (r == nmd->cur_rx_ring) + break; + continue; + } + cur = ring->cur; + idx = ring->slot[cur].buf_idx; + buf = NETMAP_BUF(ring, idx); + left = ring->slot[cur].len; + + for (i = 0; i < iovcnt && left > 0; i++) { + if (iov[i].iov_len > left) + iov[i].iov_len = left; + memcpy(iov[i].iov_base, &buf[len], iov[i].iov_len); + len += iov[i].iov_len; + left -= iov[i].iov_len; + } + ring->head = ring->cur = nm_ring_next(ring, cur); + nmd->cur_rx_ring = r; + ioctl(nmd->fd, NIOCRXSYNC, NULL); + break; + } + for (; i < iovcnt; i++) + iov[i].iov_len = 0; + + return (len); +} + +/* + * Called to send a buffer chain out to the vale port + */ +static void +pci_vtnet_netmap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt, + int len) +{ + static char pad[60]; /* all zero bytes */ + + if (sc->vsc_nmd == NULL) + return; + + /* + * If the length is < 60, pad out to that and add the + * extra zero'd segment to the iov. It is guaranteed that + * there is always an extra iov available by the caller. 
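+	 *
+	 * A 42-byte ARP request, for instance, gets an 18-byte zeroed
+	 * segment appended so the frame meets the 60-byte Ethernet
+	 * minimum that NIC hardware would otherwise pad out.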
+ */ + if (len < 60) { + iov[iovcnt].iov_base = pad; + iov[iovcnt].iov_len = 60 - len; + iovcnt++; + } + (void) pci_vtnet_netmap_writev(sc->vsc_nmd, iov, iovcnt); +} + +static void +pci_vtnet_netmap_rx(struct pci_vtnet_softc *sc) +{ + struct iovec iov[VTNET_MAXSEGS], *riov; + struct vqueue_info *vq; + void *vrx; + int len, n; + uint16_t idx; + + /* + * Should never be called without a valid netmap descriptor + */ + assert(sc->vsc_nmd != NULL); + + /* + * But, will be called when the rx ring hasn't yet + * been set up or the guest is resetting the device. + */ + if (!sc->vsc_rx_ready || sc->resetting) { + /* + * Drop the packet and try later. + */ + (void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf); + return; + } + + /* + * Check for available rx buffers + */ + vq = &sc->vsc_queues[VTNET_RXQ]; + if (!vq_has_descs(vq)) { + /* + * Drop the packet and try later. Interrupt on + * empty, if that's negotiated. + */ + (void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf); + vq_endchains(vq, 1); + return; + } + + do { + /* + * Get descriptor chain. + */ + n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL); + assert(n >= 1 && n <= VTNET_MAXSEGS); + + /* + * Get a pointer to the rx header, and use the + * data immediately following it for the packet buffer. + */ + vrx = iov[0].iov_base; + riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen); + + len = pci_vtnet_netmap_readv(sc->vsc_nmd, riov, n); + + if (len == 0) { + /* + * No more packets, but still some avail ring + * entries. Interrupt if needed/appropriate. + */ + vq_retchain(vq); + vq_endchains(vq, 0); + return; } /* * The only valid field in the rx packet header is the - * number of buffers, which is always 1 without TSO - * support. + * number of buffers if merged rx bufs were negotiated. */ - memset(vrx, 0, sizeof(struct virtio_net_rxhdr)); - vrx->vrh_bufs = 1; + memset(vrx, 0, sc->rx_vhdrlen); + + if (sc->rx_merge) { + struct virtio_net_rxhdr *vrxh; + + vrxh = vrx; + vrxh->vrh_bufs = 1; + } /* * Release this chain and handle more chains. */ - vq_relchain(vq, total_len); + vq_relchain(vq, idx, len + sc->rx_vhdrlen); } while (vq_has_descs(vq)); /* Interrupt if needed, including for NOTIFY_ON_EMPTY. */ vq_endchains(vq, 1); } +#endif /* __FreeBSD__ */ -#ifdef __FreeBSD__ +#ifdef __FreeBSD__ static void -pci_vtnet_tap_callback(int fd, enum ev_type type, void *param) +pci_vtnet_rx_callback(int fd, enum ev_type type, void *param) { struct pci_vtnet_softc *sc = param; pthread_mutex_lock(&sc->rx_mtx); sc->rx_in_progress = 1; - pci_vtnet_tap_rx(sc); + sc->pci_vtnet_rx(sc); sc->rx_in_progress = 0; pthread_mutex_unlock(&sc->rx_mtx); @@ -477,11 +685,15 @@ pci_vtnet_poll_thread(void *param) continue; } pthread_mutex_lock(&sc->vsc_mtx); + sc->rx_in_progress = 1; pci_vtnet_tap_rx(sc); + sc->rx_in_progress = 0; pthread_mutex_unlock(&sc->vsc_mtx); } + + return (NULL); } -#endif +#endif /* __FreeBSD__ */ static void pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq) @@ -493,6 +705,7 @@ pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq) */ if (sc->vsc_rx_ready == 0) { sc->vsc_rx_ready = 1; + vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; } } @@ -502,13 +715,14 @@ pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq) struct iovec iov[VTNET_MAXSEGS + 1]; int i, n; int plen, tlen; + uint16_t idx; /* * Obtain chain of descriptors. The first one is * really the header descriptor, so we need to sum * up two lengths: packet length and transfer length. 
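 	 *
 	 * As a worked example: a 10-byte virtio-net header plus data
 	 * segments of 1500 and 14 bytes gives plen = 1514 (handed to
 	 * the backend) and tlen = 1524 (reported back via vq_relchain()).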
*/ - n = vq_getchain(vq, iov, VTNET_MAXSEGS, NULL); + n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL); assert(n >= 1 && n <= VTNET_MAXSEGS); plen = 0; tlen = iov[0].iov_len; @@ -518,10 +732,10 @@ pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq) } DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, n)); - pci_vtnet_tap_tx(sc, &iov[1], n - 1, plen); + sc->pci_vtnet_tx(sc, &iov[1], n - 1, plen); /* chain is processed, release it and set tlen */ - vq_relchain(vq, tlen); + vq_relchain(vq, idx, tlen); } static void @@ -537,6 +751,7 @@ pci_vtnet_ping_txq(void *vsc, struct vqueue_info *vq) /* Signal the tx thread for processing */ pthread_mutex_lock(&sc->tx_mtx); + vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; if (sc->tx_in_progress == 0) pthread_cond_signal(&sc->tx_cond); pthread_mutex_unlock(&sc->tx_mtx); @@ -550,7 +765,7 @@ pci_vtnet_tx_thread(void *param) { struct pci_vtnet_softc *sc = param; struct vqueue_info *vq; - int have_work, error; + int error; vq = &sc->vsc_queues[VTNET_TXQ]; @@ -564,23 +779,20 @@ pci_vtnet_tx_thread(void *param) for (;;) { /* note - tx mutex is locked here */ - do { - if (sc->resetting) - have_work = 0; - else - have_work = vq_has_descs(vq); - - if (!have_work) { - sc->tx_in_progress = 0; - error = pthread_cond_wait(&sc->tx_cond, - &sc->tx_mtx); - assert(error == 0); - } - } while (!have_work); + while (sc->resetting || !vq_has_descs(vq)) { + vq->vq_used->vu_flags &= ~VRING_USED_F_NO_NOTIFY; + mb(); + if (!sc->resetting && vq_has_descs(vq)) + break; + + sc->tx_in_progress = 0; + error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx); + assert(error == 0); + } + vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; sc->tx_in_progress = 1; pthread_mutex_unlock(&sc->tx_mtx); - vq_startchains(vq); do { /* * Run through entries, placing them into @@ -597,42 +809,161 @@ pci_vtnet_tx_thread(void *param) pthread_mutex_lock(&sc->tx_mtx); } + return (NULL); } -#ifdef notyet +#ifdef __FreeBSD__ static void pci_vtnet_ping_ctlq(void *vsc, struct vqueue_info *vq) { DPRINTF(("vtnet: control qnotify!\n\r")); } -#endif +#endif /* __FreeBSD__ */ -#ifdef __FreeBSD__ +#ifdef __FreeBSD__ static int pci_vtnet_parsemac(char *mac_str, uint8_t *mac_addr) { - struct ether_addr *ea; - char *tmpstr; - char zero_addr[ETHER_ADDR_LEN] = { 0, 0, 0, 0, 0, 0 }; + struct ether_addr *ea; + char *tmpstr; + char zero_addr[ETHER_ADDR_LEN] = { 0, 0, 0, 0, 0, 0 }; + + tmpstr = strsep(&mac_str,"="); - tmpstr = strsep(&mac_str,"="); - - if ((mac_str != NULL) && (!strcmp(tmpstr,"mac"))) { - ea = ether_aton(mac_str); + if ((mac_str != NULL) && (!strcmp(tmpstr,"mac"))) { + ea = ether_aton(mac_str); - if (ea == NULL || ETHER_IS_MULTICAST(ea->octet) || - memcmp(ea->octet, zero_addr, ETHER_ADDR_LEN) == 0) { + if (ea == NULL || ETHER_IS_MULTICAST(ea->octet) || + memcmp(ea->octet, zero_addr, ETHER_ADDR_LEN) == 0) { fprintf(stderr, "Invalid MAC %s\n", mac_str); - return (EINVAL); - } else - memcpy(mac_addr, ea->octet, ETHER_ADDR_LEN); - } + return (EINVAL); + } else + memcpy(mac_addr, ea->octet, ETHER_ADDR_LEN); + } - return (0); + return (0); } +#endif /* __FreeBSD__ */ + +static void +pci_vtnet_tap_setup(struct pci_vtnet_softc *sc, char *devname) +{ + char tbuf[80]; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif +#ifndef __FreeBSD__ + uchar_t physaddr[DLPI_PHYSADDR_MAX]; + size_t physaddrlen = DLPI_PHYSADDR_MAX; + int error; +#endif + + strcpy(tbuf, "/dev/"); + strlcat(tbuf, devname, sizeof(tbuf)); + + sc->pci_vtnet_rx = pci_vtnet_tap_rx; + sc->pci_vtnet_tx = pci_vtnet_tap_tx; 
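+
+	/*
+	 * These hooks are the backend dispatch point: on the FreeBSD
+	 * side the rx callback, and pci_vtnet_proctx() on both, call
+	 * through sc->pci_vtnet_rx/_tx, so the (FreeBSD-only) netmap
+	 * setup below can swap in its own handlers without touching
+	 * the virtqueue logic.
+	 */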
+#ifdef __FreeBSD__ + sc->vsc_tapfd = open(tbuf, O_RDWR); + if (sc->vsc_tapfd == -1) { + WPRINTF(("open of tap device %s failed\n", tbuf)); + return; + } + + /* + * Set non-blocking and register for read + * notifications with the event loop + */ + int opt = 1; + if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) { + WPRINTF(("tap device O_NONBLOCK failed\n")); + close(sc->vsc_tapfd); + sc->vsc_tapfd = -1; + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); + if (caph_rights_limit(sc->vsc_tapfd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + sc->vsc_mevp = mevent_add(sc->vsc_tapfd, + EVF_READ, + pci_vtnet_rx_callback, + sc); + if (sc->vsc_mevp == NULL) { + WPRINTF(("Could not register event\n")); + close(sc->vsc_tapfd); + sc->vsc_tapfd = -1; + } +#else + if (dlpi_open(devname, &sc->vsc_dhp, DLPI_RAW) != DLPI_SUCCESS) { + WPRINTF(("open of vnic device %s failed\n", devname)); + } + + if (dlpi_get_physaddr(sc->vsc_dhp, DL_CURR_PHYS_ADDR, physaddr, + &physaddrlen) != DLPI_SUCCESS) { + WPRINTF(("read MAC address of vnic device %s failed\n", + devname)); + } + if (physaddrlen != ETHERADDRL) { + WPRINTF(("bad MAC address len %d on vnic device %s\n", + physaddrlen, devname)); + } + memcpy(sc->vsc_config.mac, physaddr, ETHERADDRL); + + if (dlpi_bind(sc->vsc_dhp, DLPI_ANY_SAP, NULL) != DLPI_SUCCESS) { + WPRINTF(("bind of vnic device %s failed\n", devname)); + } + + if (dlpi_promiscon(sc->vsc_dhp, DL_PROMISC_PHYS) != DLPI_SUCCESS) { + WPRINTF(("enable promiscous mode(physical) of vnic device %s " + "failed\n", devname)); + } + if (dlpi_promiscon(sc->vsc_dhp, DL_PROMISC_SAP) != DLPI_SUCCESS) { + WPRINTF(("enable promiscous mode(SAP) of vnic device %s " + "failed\n", devname)); + } + + sc->vsc_dlpifd = dlpi_fd(sc->vsc_dhp); + + if (fcntl(sc->vsc_dlpifd, F_SETFL, O_NONBLOCK) < 0) { + WPRINTF(("enable O_NONBLOCK of vnic device %s failed\n", + devname)); + dlpi_close(sc->vsc_dhp); + sc->vsc_dlpifd = -1; + } + + error = pthread_create(NULL, NULL, pci_vtnet_poll_thread, sc); + assert(error == 0); #endif +} + +#ifdef __FreeBSD__ +static void +pci_vtnet_netmap_setup(struct pci_vtnet_softc *sc, char *ifname) +{ + sc->pci_vtnet_rx = pci_vtnet_netmap_rx; + sc->pci_vtnet_tx = pci_vtnet_netmap_tx; + + sc->vsc_nmd = nm_open(ifname, NULL, 0, 0); + if (sc->vsc_nmd == NULL) { + WPRINTF(("open of netmap device %s failed\n", ifname)); + return; + } + sc->vsc_mevp = mevent_add(sc->vsc_nmd->fd, + EVF_READ, + pci_vtnet_rx_callback, + sc); + if (sc->vsc_mevp == NULL) { + WPRINTF(("Could not register event\n")); + nm_close(sc->vsc_nmd); + sc->vsc_nmd = NULL; + } +} +#endif /* __FreeBSD__ */ static int pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) @@ -640,31 +971,30 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) #ifdef __FreeBSD__ MD5_CTX mdctx; unsigned char digest[16]; -#else - uchar_t physaddr[DLPI_PHYSADDR_MAX]; - size_t physaddrlen = DLPI_PHYSADDR_MAX; - int error; -#endif char nstr[80]; +#endif char tname[MAXCOMLEN + 1]; struct pci_vtnet_softc *sc; const char *env_msi; char *devname; char *vtopts; +#ifdef __FreeBSD__ int mac_provided; +#endif int use_msix; - sc = malloc(sizeof(struct pci_vtnet_softc)); - memset(sc, 0, sizeof(struct pci_vtnet_softc)); + sc = calloc(1, sizeof(struct pci_vtnet_softc)); pthread_mutex_init(&sc->vsc_mtx, NULL); vi_softc_linkup(&sc->vsc_vs, &vtnet_vi_consts, sc, pi, sc->vsc_queues); + sc->vsc_vs.vs_mtx = &sc->vsc_mtx; + sc->vsc_queues[VTNET_RXQ].vq_qsize = VTNET_RINGSZ; 
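 	/*
 	 * Each queue advertises VTNET_RINGSZ (1024) descriptors; with
 	 * VIRTIO_RING_F_INDIRECT_DESC now offered, a request can spill
 	 * into an indirect table, presumably why VTNET_MAXSEGS grew
 	 * from 32 to 256 in this change.
 	 */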
sc->vsc_queues[VTNET_RXQ].vq_notify = pci_vtnet_ping_rxq; sc->vsc_queues[VTNET_TXQ].vq_qsize = VTNET_RINGSZ; sc->vsc_queues[VTNET_TXQ].vq_notify = pci_vtnet_ping_txq; -#ifdef notyet +#ifdef __FreeBSD__ sc->vsc_queues[VTNET_CTLQ].vq_qsize = VTNET_RINGSZ; sc->vsc_queues[VTNET_CTLQ].vq_notify = pci_vtnet_ping_ctlq; #endif @@ -682,13 +1012,15 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) * Attempt to open the tap device and read the MAC address * if specified */ - mac_provided = 0; #ifdef __FreeBSD__ + mac_provided = 0; sc->vsc_tapfd = -1; #endif + sc->vsc_nmd = NULL; if (opts != NULL) { - char tbuf[80]; +#ifdef __FreeBSD__ int err; +#endif devname = vtopts = strdup(opts); (void) strsep(&vtopts, ","); @@ -704,72 +1036,15 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) } #endif - strcpy(tbuf, "/dev/"); - strlcat(tbuf, devname, sizeof(tbuf)); +#ifdef __FreeBSD__ + if (strncmp(devname, "vale", 4) == 0) + pci_vtnet_netmap_setup(sc, devname); +#endif + if (strncmp(devname, "tap", 3) == 0 || + strncmp(devname, "vmnet", 5) == 0) + pci_vtnet_tap_setup(sc, devname); free(devname); - -#ifdef __FreeBSD__ - sc->vsc_tapfd = open(tbuf, O_RDWR); - if (sc->vsc_tapfd == -1) { - WPRINTF(("open of tap device %s failed\n", tbuf)); - } else { - /* - * Set non-blocking and register for read - * notifications with the event loop - */ - int opt = 1; - if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) { - WPRINTF(("tap device O_NONBLOCK failed\n")); - close(sc->vsc_tapfd); - sc->vsc_tapfd = -1; - } - - sc->vsc_mevp = mevent_add(sc->vsc_tapfd, - EVF_READ, - pci_vtnet_tap_callback, - sc); - if (sc->vsc_mevp == NULL) { - WPRINTF(("Could not register event\n")); - close(sc->vsc_tapfd); - sc->vsc_tapfd = -1; - } - } -#else - if (dlpi_open(opts, &sc->vsc_dhp, DLPI_RAW) != DLPI_SUCCESS) { - WPRINTF(("open of vnic device %s failed\n", opts)); - } - - if (dlpi_get_physaddr(sc->vsc_dhp, DL_CURR_PHYS_ADDR, physaddr, &physaddrlen) != DLPI_SUCCESS) { - WPRINTF(("read MAC address of vnic device %s failed\n", opts)); - } - if (physaddrlen != ETHERADDRL) { - WPRINTF(("bad MAC address len %d on vnic device %s\n", physaddrlen, opts)); - } - memcpy(sc->vsc_config.mac, physaddr, ETHERADDRL); - - if (dlpi_bind(sc->vsc_dhp, DLPI_ANY_SAP, NULL) != DLPI_SUCCESS) { - WPRINTF(("bind of vnic device %s failed\n", opts)); - } - - if (dlpi_promiscon(sc->vsc_dhp, DL_PROMISC_PHYS) != DLPI_SUCCESS) { - WPRINTF(("enable promiscous mode(physical) of vnic device %s failed\n", opts)); - } - if (dlpi_promiscon(sc->vsc_dhp, DL_PROMISC_SAP) != DLPI_SUCCESS) { - WPRINTF(("enable promiscous mode(SAP) of vnic device %s failed\n", opts)); - } - - sc->vsc_dlpifd = dlpi_fd(sc->vsc_dhp); - - if (fcntl(sc->vsc_dlpifd, F_SETFL, O_NONBLOCK) < 0) { - WPRINTF(("enable O_NONBLOCK of vnic device %s failed\n", opts)); - dlpi_close(sc->vsc_dhp); - sc->vsc_dlpifd = -1; - } - - error = pthread_create(NULL, NULL, pci_vtnet_poll_thread, sc); - assert(error == 0); -#endif } #ifdef __FreeBSD__ @@ -799,9 +1074,15 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET); + pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); - /* link always up */ - sc->vsc_config.status = 1; + /* Link is up if we managed to open tap device or vale port. 
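+	 * (On illumos the corresponding check is against the dlpi fd
+	 * opened onto the vnic.)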
*/ +#ifdef __FreeBSD__ + sc->vsc_config.status = (opts == NULL || sc->vsc_tapfd >= 0 || +#else + sc->vsc_config.status = (opts == NULL || sc->vsc_dlpifd >= 0 || +#endif + sc->vsc_nmd != NULL); /* use BAR 1 to map MSI-X table and PBA, if we're using MSI-X */ if (vi_intr_init(&sc->vsc_vs, 1, use_msix)) @@ -812,6 +1093,8 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) sc->resetting = 0; + sc->rx_merge = 1; + sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr); sc->rx_in_progress = 0; pthread_mutex_init(&sc->rx_mtx, NULL); @@ -824,8 +1107,9 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) pthread_mutex_init(&sc->tx_mtx, NULL); pthread_cond_init(&sc->tx_cond, NULL); pthread_create(&sc->tx_tid, NULL, pci_vtnet_tx_thread, (void *)sc); - snprintf(tname, sizeof(tname), "%s vtnet%d tx", vmname, pi->pi_slot); - pthread_set_name_np(sc->tx_tid, tname); + snprintf(tname, sizeof(tname), "vtnet-%d:%d tx", pi->pi_slot, + pi->pi_func); + pthread_set_name_np(sc->tx_tid, tname); return (0); } @@ -844,9 +1128,10 @@ pci_vtnet_cfgwrite(void *vsc, int offset, int size, uint32_t value) ptr = &sc->vsc_config.mac[offset]; memcpy(ptr, &value, size); } else { + /* silently ignore other writes */ DPRINTF(("vtnet: write to readonly reg %d\n\r", offset)); - return (1); } + return (0); } @@ -861,6 +1146,20 @@ pci_vtnet_cfgread(void *vsc, int offset, int size, uint32_t *retval) return (0); } +static void +pci_vtnet_neg_features(void *vsc, uint64_t negotiated_features) +{ + struct pci_vtnet_softc *sc = vsc; + + sc->vsc_features = negotiated_features; + + if (!(sc->vsc_features & VIRTIO_NET_F_MRG_RXBUF)) { + sc->rx_merge = 0; + /* non-merge rx header is 2 bytes shorter */ + sc->rx_vhdrlen -= 2; + } +} + struct pci_devemu pci_de_vnet = { .pe_emu = "virtio-net", .pe_init = pci_vtnet_init, diff --git a/usr/src/cmd/bhyve/pci_virtio_rnd.c b/usr/src/cmd/bhyve/pci_virtio_rnd.c new file mode 100644 index 0000000000..5f470c03a6 --- /dev/null +++ b/usr/src/cmd/bhyve/pci_virtio_rnd.c @@ -0,0 +1,209 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Nahanni Systems Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * virtio entropy device emulation. 
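+ * The device exposes a single virtqueue; every buffer the guest posts
+ * to it is filled with host-provided entropy.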
+ * Randomness is sourced from /dev/random which does not block + * once it has been seeded at bootup. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include + +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bhyverun.h" +#include "pci_emul.h" +#include "virtio.h" + +#define VTRND_RINGSZ 64 + + +static int pci_vtrnd_debug; +#define DPRINTF(params) if (pci_vtrnd_debug) printf params +#define WPRINTF(params) printf params + +/* + * Per-device softc + */ +struct pci_vtrnd_softc { + struct virtio_softc vrsc_vs; + struct vqueue_info vrsc_vq; + pthread_mutex_t vrsc_mtx; + uint64_t vrsc_cfg; + int vrsc_fd; +}; + +static void pci_vtrnd_reset(void *); +static void pci_vtrnd_notify(void *, struct vqueue_info *); + +static struct virtio_consts vtrnd_vi_consts = { + "vtrnd", /* our name */ + 1, /* we support 1 virtqueue */ + 0, /* config reg size */ + pci_vtrnd_reset, /* reset */ + pci_vtrnd_notify, /* device-wide qnotify */ + NULL, /* read virtio config */ + NULL, /* write virtio config */ + NULL, /* apply negotiated features */ + 0, /* our capabilities */ +}; + + +static void +pci_vtrnd_reset(void *vsc) +{ + struct pci_vtrnd_softc *sc; + + sc = vsc; + + DPRINTF(("vtrnd: device reset requested !\n")); + vi_reset_dev(&sc->vrsc_vs); +} + + +static void +pci_vtrnd_notify(void *vsc, struct vqueue_info *vq) +{ + struct iovec iov; + struct pci_vtrnd_softc *sc; + int len; + uint16_t idx; + + sc = vsc; + + if (sc->vrsc_fd < 0) { + vq_endchains(vq, 0); + return; + } + + while (vq_has_descs(vq)) { + vq_getchain(vq, &idx, &iov, 1, NULL); + + len = read(sc->vrsc_fd, iov.iov_base, iov.iov_len); + + DPRINTF(("vtrnd: vtrnd_notify(): %d\r\n", len)); + + /* Catastrophe if unable to read from /dev/random */ + assert(len > 0); + + /* + * Release this chain and handle more + */ + vq_relchain(vq, idx, len); + } + vq_endchains(vq, 1); /* Generate interrupt if appropriate. */ +} + + +static int +pci_vtrnd_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + struct pci_vtrnd_softc *sc; + int fd; + int len; + uint8_t v; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif + + /* + * Should always be able to open /dev/random. + */ + fd = open("/dev/random", O_RDONLY | O_NONBLOCK); + + assert(fd >= 0); + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_READ); + if (caph_rights_limit(fd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + /* + * Check that device is seeded and non-blocking. 
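+	 * A single one-byte read suffices: it only succeeds once the
+	 * entropy pool has been seeded, and O_NONBLOCK turns a would-block
+	 * condition into an error return instead of a hang.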
+ */ + len = read(fd, &v, sizeof(v)); + if (len <= 0) { + WPRINTF(("vtrnd: /dev/random not ready, read(): %d", len)); + close(fd); + return (1); + } + + sc = calloc(1, sizeof(struct pci_vtrnd_softc)); + + vi_softc_linkup(&sc->vrsc_vs, &vtrnd_vi_consts, sc, pi, &sc->vrsc_vq); + sc->vrsc_vs.vs_mtx = &sc->vrsc_mtx; + + sc->vrsc_vq.vq_qsize = VTRND_RINGSZ; + + /* keep /dev/random opened while emulating */ + sc->vrsc_fd = fd; + + /* initialize config space */ + pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_RANDOM); + pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_CRYPTO); + pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_ENTROPY); + pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); + + if (vi_intr_init(&sc->vrsc_vs, 1, fbsdrun_virtio_msix())) + return (1); + vi_set_io_bar(&sc->vrsc_vs, 0); + + return (0); +} + + +struct pci_devemu pci_de_vrnd = { + .pe_emu = "virtio-rnd", + .pe_init = pci_vtrnd_init, + .pe_barwrite = vi_pci_write, + .pe_barread = vi_pci_read +}; +PCI_EMUL_SET(pci_de_vrnd); diff --git a/usr/src/cmd/bhyve/pci_virtio_scsi.c b/usr/src/cmd/bhyve/pci_virtio_scsi.c new file mode 100644 index 0000000000..38e7d918a0 --- /dev/null +++ b/usr/src/cmd/bhyve/pci_virtio_scsi.c @@ -0,0 +1,737 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 Jakub Klama . + * Copyright (c) 2018 Marcelo Araujo . + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bhyverun.h" +#include "pci_emul.h" +#include "virtio.h" +#include "iov.h" + +#define VTSCSI_RINGSZ 64 +#define VTSCSI_REQUESTQ 1 +#define VTSCSI_THR_PER_Q 16 +#define VTSCSI_MAXQ (VTSCSI_REQUESTQ + 2) +#define VTSCSI_MAXSEG 64 + +#define VTSCSI_IN_HEADER_LEN(_sc) \ + (sizeof(struct pci_vtscsi_req_cmd_rd) + _sc->vss_config.cdb_size) + +#define VTSCSI_OUT_HEADER_LEN(_sc) \ + (sizeof(struct pci_vtscsi_req_cmd_wr) + _sc->vss_config.sense_size) + +#define VIRTIO_SCSI_MAX_CHANNEL 0 +#define VIRTIO_SCSI_MAX_TARGET 0 +#define VIRTIO_SCSI_MAX_LUN 16383 + +#define VIRTIO_SCSI_F_INOUT (1 << 0) +#define VIRTIO_SCSI_F_HOTPLUG (1 << 1) +#define VIRTIO_SCSI_F_CHANGE (1 << 2) + +static int pci_vtscsi_debug = 0; +#define DPRINTF(params) if (pci_vtscsi_debug) printf params +#define WPRINTF(params) printf params + +struct pci_vtscsi_config { + uint32_t num_queues; + uint32_t seg_max; + uint32_t max_sectors; + uint32_t cmd_per_lun; + uint32_t event_info_size; + uint32_t sense_size; + uint32_t cdb_size; + uint16_t max_channel; + uint16_t max_target; + uint32_t max_lun; +} __attribute__((packed)); + +struct pci_vtscsi_queue { + struct pci_vtscsi_softc * vsq_sc; + struct vqueue_info * vsq_vq; + pthread_mutex_t vsq_mtx; + pthread_mutex_t vsq_qmtx; + pthread_cond_t vsq_cv; + STAILQ_HEAD(, pci_vtscsi_request) vsq_requests; + LIST_HEAD(, pci_vtscsi_worker) vsq_workers; +}; + +struct pci_vtscsi_worker { + struct pci_vtscsi_queue * vsw_queue; + pthread_t vsw_thread; + bool vsw_exiting; + LIST_ENTRY(pci_vtscsi_worker) vsw_link; +}; + +struct pci_vtscsi_request { + struct pci_vtscsi_queue * vsr_queue; + struct iovec vsr_iov_in[VTSCSI_MAXSEG]; + int vsr_niov_in; + struct iovec vsr_iov_out[VTSCSI_MAXSEG]; + int vsr_niov_out; + uint32_t vsr_idx; + STAILQ_ENTRY(pci_vtscsi_request) vsr_link; +}; + +/* + * Per-device softc + */ +struct pci_vtscsi_softc { + struct virtio_softc vss_vs; + struct vqueue_info vss_vq[VTSCSI_MAXQ]; + struct pci_vtscsi_queue vss_queues[VTSCSI_REQUESTQ]; + pthread_mutex_t vss_mtx; + int vss_iid; + int vss_ctl_fd; + uint32_t vss_features; + struct pci_vtscsi_config vss_config; +}; + +#define VIRTIO_SCSI_T_TMF 0 +#define VIRTIO_SCSI_T_TMF_ABORT_TASK 0 +#define VIRTIO_SCSI_T_TMF_ABORT_TASK_SET 1 +#define VIRTIO_SCSI_T_TMF_CLEAR_ACA 2 +#define VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET 3 +#define VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET 4 +#define VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET 5 +#define VIRTIO_SCSI_T_TMF_QUERY_TASK 6 +#define VIRTIO_SCSI_T_TMF_QUERY_TASK_SET 7 + +/* command-specific response values */ +#define VIRTIO_SCSI_S_FUNCTION_COMPLETE 0 +#define VIRTIO_SCSI_S_FUNCTION_SUCCEEDED 10 +#define VIRTIO_SCSI_S_FUNCTION_REJECTED 11 + +struct pci_vtscsi_ctrl_tmf { + uint32_t type; + uint32_t subtype; + uint8_t lun[8]; + uint64_t id; + uint8_t response; +} __attribute__((packed)); + +#define VIRTIO_SCSI_T_AN_QUERY 1 +#define VIRTIO_SCSI_EVT_ASYNC_OPERATIONAL_CHANGE 2 +#define VIRTIO_SCSI_EVT_ASYNC_POWER_MGMT 4 +#define VIRTIO_SCSI_EVT_ASYNC_EXTERNAL_REQUEST 8 +#define VIRTIO_SCSI_EVT_ASYNC_MEDIA_CHANGE 16 +#define VIRTIO_SCSI_EVT_ASYNC_MULTI_HOST 32 +#define VIRTIO_SCSI_EVT_ASYNC_DEVICE_BUSY 64 + +struct pci_vtscsi_ctrl_an { + uint32_t type; + uint8_t lun[8]; + uint32_t event_requested; + uint32_t 
event_actual; + uint8_t response; +} __attribute__((packed)); + +/* command-specific response values */ +#define VIRTIO_SCSI_S_OK 0 +#define VIRTIO_SCSI_S_OVERRUN 1 +#define VIRTIO_SCSI_S_ABORTED 2 +#define VIRTIO_SCSI_S_BAD_TARGET 3 +#define VIRTIO_SCSI_S_RESET 4 +#define VIRTIO_SCSI_S_BUSY 5 +#define VIRTIO_SCSI_S_TRANSPORT_FAILURE 6 +#define VIRTIO_SCSI_S_TARGET_FAILURE 7 +#define VIRTIO_SCSI_S_NEXUS_FAILURE 8 +#define VIRTIO_SCSI_S_FAILURE 9 +#define VIRTIO_SCSI_S_INCORRECT_LUN 12 + +/* task_attr */ +#define VIRTIO_SCSI_S_SIMPLE 0 +#define VIRTIO_SCSI_S_ORDERED 1 +#define VIRTIO_SCSI_S_HEAD 2 +#define VIRTIO_SCSI_S_ACA 3 + +struct pci_vtscsi_event { + uint32_t event; + uint8_t lun[8]; + uint32_t reason; +} __attribute__((packed)); + +struct pci_vtscsi_req_cmd_rd { + uint8_t lun[8]; + uint64_t id; + uint8_t task_attr; + uint8_t prio; + uint8_t crn; + uint8_t cdb[]; +} __attribute__((packed)); + +struct pci_vtscsi_req_cmd_wr { + uint32_t sense_len; + uint32_t residual; + uint16_t status_qualifier; + uint8_t status; + uint8_t response; + uint8_t sense[]; +} __attribute__((packed)); + +static void *pci_vtscsi_proc(void *); +static void pci_vtscsi_reset(void *); +static void pci_vtscsi_neg_features(void *, uint64_t); +static int pci_vtscsi_cfgread(void *, int, int, uint32_t *); +static int pci_vtscsi_cfgwrite(void *, int, int, uint32_t); +static inline int pci_vtscsi_get_lun(uint8_t *); +static int pci_vtscsi_control_handle(struct pci_vtscsi_softc *, void *, size_t); +static int pci_vtscsi_tmf_handle(struct pci_vtscsi_softc *, + struct pci_vtscsi_ctrl_tmf *); +static int pci_vtscsi_an_handle(struct pci_vtscsi_softc *, + struct pci_vtscsi_ctrl_an *); +static int pci_vtscsi_request_handle(struct pci_vtscsi_queue *, struct iovec *, + int, struct iovec *, int); +static void pci_vtscsi_controlq_notify(void *, struct vqueue_info *); +static void pci_vtscsi_eventq_notify(void *, struct vqueue_info *); +static void pci_vtscsi_requestq_notify(void *, struct vqueue_info *); +static int pci_vtscsi_init_queue(struct pci_vtscsi_softc *, + struct pci_vtscsi_queue *, int); +static int pci_vtscsi_init(struct vmctx *, struct pci_devinst *, char *); + +static struct virtio_consts vtscsi_vi_consts = { + "vtscsi", /* our name */ + VTSCSI_MAXQ, /* we support 2+n virtqueues */ + sizeof(struct pci_vtscsi_config), /* config reg size */ + pci_vtscsi_reset, /* reset */ + NULL, /* device-wide qnotify */ + pci_vtscsi_cfgread, /* read virtio config */ + pci_vtscsi_cfgwrite, /* write virtio config */ + pci_vtscsi_neg_features, /* apply negotiated features */ + 0, /* our capabilities */ +}; + +static void * +pci_vtscsi_proc(void *arg) +{ + struct pci_vtscsi_worker *worker = (struct pci_vtscsi_worker *)arg; + struct pci_vtscsi_queue *q = worker->vsw_queue; + struct pci_vtscsi_request *req; + int iolen; + + for (;;) { + pthread_mutex_lock(&q->vsq_mtx); + + while (STAILQ_EMPTY(&q->vsq_requests) + && !worker->vsw_exiting) + pthread_cond_wait(&q->vsq_cv, &q->vsq_mtx); + + if (worker->vsw_exiting) + break; + + req = STAILQ_FIRST(&q->vsq_requests); + STAILQ_REMOVE_HEAD(&q->vsq_requests, vsr_link); + + pthread_mutex_unlock(&q->vsq_mtx); + iolen = pci_vtscsi_request_handle(q, req->vsr_iov_in, + req->vsr_niov_in, req->vsr_iov_out, req->vsr_niov_out); + + pthread_mutex_lock(&q->vsq_qmtx); + vq_relchain(q->vsq_vq, req->vsr_idx, iolen); + vq_endchains(q->vsq_vq, 0); + pthread_mutex_unlock(&q->vsq_qmtx); + + DPRINTF(("virtio-scsi: request completed\n", + req->vsr_idx)); + free(req); + } + + pthread_mutex_unlock(&q->vsq_mtx); + return 
(NULL); +} + +static void +pci_vtscsi_reset(void *vsc) +{ + struct pci_vtscsi_softc *sc; + + sc = vsc; + + DPRINTF(("vtscsi: device reset requested\n")); + vi_reset_dev(&sc->vss_vs); + + /* initialize config structure */ + sc->vss_config = (struct pci_vtscsi_config){ + .num_queues = VTSCSI_REQUESTQ, + .seg_max = VTSCSI_MAXSEG, + .max_sectors = 2, + .cmd_per_lun = 1, + .event_info_size = sizeof(struct pci_vtscsi_event), + .sense_size = 96, + .cdb_size = 32, + .max_channel = VIRTIO_SCSI_MAX_CHANNEL, + .max_target = VIRTIO_SCSI_MAX_TARGET, + .max_lun = VIRTIO_SCSI_MAX_LUN + }; +} + +static void +pci_vtscsi_neg_features(void *vsc, uint64_t negotiated_features) +{ + struct pci_vtscsi_softc *sc = vsc; + + sc->vss_features = negotiated_features; +} + +static int +pci_vtscsi_cfgread(void *vsc, int offset, int size, uint32_t *retval) +{ + struct pci_vtscsi_softc *sc = vsc; + void *ptr; + + ptr = (uint8_t *)&sc->vss_config + offset; + memcpy(retval, ptr, size); + return (0); +} + +static int +pci_vtscsi_cfgwrite(void *vsc, int offset, int size, uint32_t val) +{ + + return (0); +} + +static inline int +pci_vtscsi_get_lun(uint8_t *lun) +{ + + return (((lun[2] << 8) | lun[3]) & 0x3fff); +} + +static int +pci_vtscsi_control_handle(struct pci_vtscsi_softc *sc, void *buf, + size_t bufsize) +{ + struct pci_vtscsi_ctrl_tmf *tmf; + struct pci_vtscsi_ctrl_an *an; + uint32_t type; + + type = *(uint32_t *)buf; + + if (type == VIRTIO_SCSI_T_TMF) { + tmf = (struct pci_vtscsi_ctrl_tmf *)buf; + return (pci_vtscsi_tmf_handle(sc, tmf)); + } + + if (type == VIRTIO_SCSI_T_AN_QUERY) { + an = (struct pci_vtscsi_ctrl_an *)buf; + return (pci_vtscsi_an_handle(sc, an)); + } + + return (0); +} + +static int +pci_vtscsi_tmf_handle(struct pci_vtscsi_softc *sc, + struct pci_vtscsi_ctrl_tmf *tmf) +{ + union ctl_io *io; + int err; + + io = ctl_scsi_alloc_io(sc->vss_iid); + ctl_scsi_zero_io(io); + + io->io_hdr.io_type = CTL_IO_TASK; + io->io_hdr.nexus.initid = sc->vss_iid; + io->io_hdr.nexus.targ_lun = pci_vtscsi_get_lun(tmf->lun); + io->taskio.tag_type = CTL_TAG_SIMPLE; + io->taskio.tag_num = (uint32_t)tmf->id; + + switch (tmf->subtype) { + case VIRTIO_SCSI_T_TMF_ABORT_TASK: + io->taskio.task_action = CTL_TASK_ABORT_TASK; + break; + + case VIRTIO_SCSI_T_TMF_ABORT_TASK_SET: + io->taskio.task_action = CTL_TASK_ABORT_TASK_SET; + break; + + case VIRTIO_SCSI_T_TMF_CLEAR_ACA: + io->taskio.task_action = CTL_TASK_CLEAR_ACA; + break; + + case VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET: + io->taskio.task_action = CTL_TASK_CLEAR_TASK_SET; + break; + + case VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET: + io->taskio.task_action = CTL_TASK_I_T_NEXUS_RESET; + break; + + case VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET: + io->taskio.task_action = CTL_TASK_LUN_RESET; + break; + + case VIRTIO_SCSI_T_TMF_QUERY_TASK: + io->taskio.task_action = CTL_TASK_QUERY_TASK; + break; + + case VIRTIO_SCSI_T_TMF_QUERY_TASK_SET: + io->taskio.task_action = CTL_TASK_QUERY_TASK_SET; + break; + } + + if (pci_vtscsi_debug) { + struct sbuf *sb = sbuf_new_auto(); + ctl_io_sbuf(io, sb); + sbuf_finish(sb); + DPRINTF(("pci_virtio_scsi: %s", sbuf_data(sb))); + sbuf_delete(sb); + } + + err = ioctl(sc->vss_ctl_fd, CTL_IO, io); + if (err != 0) + WPRINTF(("CTL_IO: err=%d (%s)\n", errno, strerror(errno))); + + tmf->response = io->taskio.task_status; + ctl_scsi_free_io(io); + return (1); +} + +static int +pci_vtscsi_an_handle(struct pci_vtscsi_softc *sc, + struct pci_vtscsi_ctrl_an *an) +{ + + return (0); +} + +static int +pci_vtscsi_request_handle(struct pci_vtscsi_queue *q, struct iovec *iov_in, + int 
niov_in, struct iovec *iov_out, int niov_out)
+{
+	struct pci_vtscsi_softc *sc = q->vsq_sc;
+	struct pci_vtscsi_req_cmd_rd *cmd_rd = NULL;
+	struct pci_vtscsi_req_cmd_wr *cmd_wr;
+	struct iovec data_iov_in[VTSCSI_MAXSEG], data_iov_out[VTSCSI_MAXSEG];
+	union ctl_io *io;
+	int data_niov_in, data_niov_out;
+	void *ext_data_ptr = NULL;
+	uint32_t ext_data_len = 0, ext_sg_entries = 0;
+	int err;
+
+	seek_iov(iov_in, niov_in, data_iov_in, &data_niov_in,
+	    VTSCSI_IN_HEADER_LEN(sc));
+	seek_iov(iov_out, niov_out, data_iov_out, &data_niov_out,
+	    VTSCSI_OUT_HEADER_LEN(sc));
+
+	truncate_iov(iov_in, &niov_in, VTSCSI_IN_HEADER_LEN(sc));
+	truncate_iov(iov_out, &niov_out, VTSCSI_OUT_HEADER_LEN(sc));
+	iov_to_buf(iov_in, niov_in, (void **)&cmd_rd);
+
+	cmd_wr = malloc(VTSCSI_OUT_HEADER_LEN(sc));
+	io = ctl_scsi_alloc_io(sc->vss_iid);
+	ctl_scsi_zero_io(io);
+
+	io->io_hdr.nexus.initid = sc->vss_iid;
+	io->io_hdr.nexus.targ_lun = pci_vtscsi_get_lun(cmd_rd->lun);
+
+	io->io_hdr.io_type = CTL_IO_SCSI;
+
+	if (data_niov_in > 0) {
+		ext_data_ptr = (void *)data_iov_in;
+		ext_sg_entries = data_niov_in;
+		ext_data_len = count_iov(data_iov_in, data_niov_in);
+		io->io_hdr.flags |= CTL_FLAG_DATA_OUT;
+	} else if (data_niov_out > 0) {
+		ext_data_ptr = (void *)data_iov_out;
+		ext_sg_entries = data_niov_out;
+		ext_data_len = count_iov(data_iov_out, data_niov_out);
+		io->io_hdr.flags |= CTL_FLAG_DATA_IN;
+	}
+
+	io->scsiio.sense_len = sc->vss_config.sense_size;
+	io->scsiio.tag_num = (uint32_t)cmd_rd->id;
+	switch (cmd_rd->task_attr) {
+	case VIRTIO_SCSI_S_ORDERED:
+		io->scsiio.tag_type = CTL_TAG_ORDERED;
+		break;
+	case VIRTIO_SCSI_S_HEAD:
+		io->scsiio.tag_type = CTL_TAG_HEAD_OF_QUEUE;
+		break;
+	case VIRTIO_SCSI_S_ACA:
+		io->scsiio.tag_type = CTL_TAG_ACA;
+		break;
+	case VIRTIO_SCSI_S_SIMPLE:
+	default:
+		io->scsiio.tag_type = CTL_TAG_SIMPLE;
+		break;
+	}
+	io->scsiio.ext_sg_entries = ext_sg_entries;
+	io->scsiio.ext_data_ptr = ext_data_ptr;
+	io->scsiio.ext_data_len = ext_data_len;
+	io->scsiio.ext_data_filled = 0;
+	io->scsiio.cdb_len = sc->vss_config.cdb_size;
+	memcpy(io->scsiio.cdb, cmd_rd->cdb, sc->vss_config.cdb_size);
+
+	if (pci_vtscsi_debug) {
+		struct sbuf *sb = sbuf_new_auto();
+		ctl_io_sbuf(io, sb);
+		sbuf_finish(sb);
+		DPRINTF(("pci_virtio_scsi: %s", sbuf_data(sb)));
+		sbuf_delete(sb);
+	}
+
+	err = ioctl(sc->vss_ctl_fd, CTL_IO, io);
+	if (err != 0) {
+		WPRINTF(("CTL_IO: err=%d (%s)\n", errno, strerror(errno)));
+		cmd_wr->response = VIRTIO_SCSI_S_FAILURE;
+	} else {
+		cmd_wr->sense_len = MIN(io->scsiio.sense_len,
+		    sc->vss_config.sense_size);
+		cmd_wr->residual = io->scsiio.residual;
+		cmd_wr->status = io->scsiio.scsi_status;
+		cmd_wr->response = VIRTIO_SCSI_S_OK;
+		memcpy(&cmd_wr->sense, &io->scsiio.sense_data,
+		    cmd_wr->sense_len);
+	}
+
+	buf_to_iov(cmd_wr, VTSCSI_OUT_HEADER_LEN(sc), iov_out, niov_out, 0);
+	free(cmd_rd);
+	free(cmd_wr);
+	/* compute the returned length before the ctl_io is freed */
+	err = VTSCSI_OUT_HEADER_LEN(sc) + io->scsiio.ext_data_filled;
+	ctl_scsi_free_io(io);
+	return (err);
+}
+
+static void
+pci_vtscsi_controlq_notify(void *vsc, struct vqueue_info *vq)
+{
+	struct pci_vtscsi_softc *sc;
+	struct iovec iov[VTSCSI_MAXSEG];
+	uint16_t idx, n;
+	void *buf = NULL;
+	size_t bufsize;
+	int iolen;
+
+	sc = vsc;
+
+	while (vq_has_descs(vq)) {
+		n = vq_getchain(vq, &idx, iov, VTSCSI_MAXSEG, NULL);
+		bufsize = iov_to_buf(iov, n, &buf);
+		iolen = pci_vtscsi_control_handle(sc, buf, bufsize);
+		buf_to_iov(buf + bufsize - iolen, iolen, iov, n,
+		    bufsize - iolen);
+
+		/*
+		 * Release this chain and handle more
+		 */
+		vq_relchain(vq, idx, iolen);
+	}
+	
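+	/*
+	 * Interrupt generation is deferred until every queued control
+	 * chain has been returned to the guest.
+	 */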
vq_endchains(vq, 1); /* Generate interrupt if appropriate. */ + free(buf); +} + +static void +pci_vtscsi_eventq_notify(void *vsc, struct vqueue_info *vq) +{ + + vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; +} + +static void +pci_vtscsi_requestq_notify(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtscsi_softc *sc; + struct pci_vtscsi_queue *q; + struct pci_vtscsi_request *req; + struct iovec iov[VTSCSI_MAXSEG]; + uint16_t flags[VTSCSI_MAXSEG]; + uint16_t idx, n, i; + int readable; + + sc = vsc; + q = &sc->vss_queues[vq->vq_num - 2]; + + while (vq_has_descs(vq)) { + readable = 0; + n = vq_getchain(vq, &idx, iov, VTSCSI_MAXSEG, flags); + + /* Count readable descriptors */ + for (i = 0; i < n; i++) { + if (flags[i] & VRING_DESC_F_WRITE) + break; + + readable++; + } + + req = calloc(1, sizeof(struct pci_vtscsi_request)); + req->vsr_idx = idx; + req->vsr_queue = q; + req->vsr_niov_in = readable; + req->vsr_niov_out = n - readable; + memcpy(req->vsr_iov_in, iov, + req->vsr_niov_in * sizeof(struct iovec)); + memcpy(req->vsr_iov_out, iov + readable, + req->vsr_niov_out * sizeof(struct iovec)); + + pthread_mutex_lock(&q->vsq_mtx); + STAILQ_INSERT_TAIL(&q->vsq_requests, req, vsr_link); + pthread_cond_signal(&q->vsq_cv); + pthread_mutex_unlock(&q->vsq_mtx); + + DPRINTF(("virtio-scsi: request enqueued\n", idx)); + } +} + +static int +pci_vtscsi_init_queue(struct pci_vtscsi_softc *sc, + struct pci_vtscsi_queue *queue, int num) +{ + struct pci_vtscsi_worker *worker; + char tname[MAXCOMLEN + 1]; + int i; + + queue->vsq_sc = sc; + queue->vsq_vq = &sc->vss_vq[num + 2]; + + pthread_mutex_init(&queue->vsq_mtx, NULL); + pthread_mutex_init(&queue->vsq_qmtx, NULL); + pthread_cond_init(&queue->vsq_cv, NULL); + STAILQ_INIT(&queue->vsq_requests); + LIST_INIT(&queue->vsq_workers); + + for (i = 0; i < VTSCSI_THR_PER_Q; i++) { + worker = calloc(1, sizeof(struct pci_vtscsi_worker)); + worker->vsw_queue = queue; + + pthread_create(&worker->vsw_thread, NULL, &pci_vtscsi_proc, + (void *)worker); + + snprintf(tname, sizeof(tname), "vtscsi:%d-%d", num, i); + pthread_set_name_np(worker->vsw_thread, tname); + LIST_INSERT_HEAD(&queue->vsq_workers, worker, vsw_link); + } + + return (0); +} + +static int +pci_vtscsi_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + struct pci_vtscsi_softc *sc; + char *opt, *optname; + const char *devname; + int i, optidx = 0; + + sc = calloc(1, sizeof(struct pci_vtscsi_softc)); + devname = "/dev/cam/ctl"; + while ((opt = strsep(&opts, ",")) != NULL) { + optname = strsep(&opt, "="); + if (opt == NULL && optidx == 0) { + if (optname[0] != 0) + devname = optname; + } else if (strcmp(optname, "dev") == 0 && opt != NULL) { + devname = opt; + } else if (strcmp(optname, "iid") == 0 && opt != NULL) { + sc->vss_iid = strtoul(opt, NULL, 10); + } else { + fprintf(stderr, "Invalid option %s\n", optname); + free(sc); + return (1); + } + optidx++; + } + + sc->vss_ctl_fd = open(devname, O_RDWR); + if (sc->vss_ctl_fd < 0) { + WPRINTF(("cannot open %s: %s\n", devname, strerror(errno))); + free(sc); + return (1); + } + + vi_softc_linkup(&sc->vss_vs, &vtscsi_vi_consts, sc, pi, sc->vss_vq); + sc->vss_vs.vs_mtx = &sc->vss_mtx; + + /* controlq */ + sc->vss_vq[0].vq_qsize = VTSCSI_RINGSZ; + sc->vss_vq[0].vq_notify = pci_vtscsi_controlq_notify; + + /* eventq */ + sc->vss_vq[1].vq_qsize = VTSCSI_RINGSZ; + sc->vss_vq[1].vq_notify = pci_vtscsi_eventq_notify; + + /* request queues */ + for (i = 2; i < VTSCSI_MAXQ; i++) { + sc->vss_vq[i].vq_qsize = VTSCSI_RINGSZ; + sc->vss_vq[i].vq_notify = 
pci_vtscsi_requestq_notify; + pci_vtscsi_init_queue(sc, &sc->vss_queues[i - 2], i - 2); + } + + /* initialize config space */ + pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_SCSI); + pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); + pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_SCSI); + pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); + + if (vi_intr_init(&sc->vss_vs, 1, fbsdrun_virtio_msix())) + return (1); + vi_set_io_bar(&sc->vss_vs, 0); + + return (0); +} + + +struct pci_devemu pci_de_vscsi = { + .pe_emu = "virtio-scsi", + .pe_init = pci_vtscsi_init, + .pe_barwrite = vi_pci_write, + .pe_barread = vi_pci_read +}; +PCI_EMUL_SET(pci_de_vscsi); diff --git a/usr/src/cmd/bhyve/pci_virtio_viona.c b/usr/src/cmd/bhyve/pci_virtio_viona.c index f4d5d528be..e5a5cb584f 100644 --- a/usr/src/cmd/bhyve/pci_virtio_viona.c +++ b/usr/src/cmd/bhyve/pci_virtio_viona.c @@ -34,6 +34,7 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2015 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #include @@ -289,6 +290,8 @@ pci_viona_tx_thread(void *param) sc->vsc_tx_kick_lock_held = B_FALSE; } pthread_mutex_unlock(&sc->tx_mtx); + + return (NULL); } static void @@ -347,8 +350,10 @@ static int pci_viona_viona_init(struct vmctx *ctx, struct pci_viona_softc *sc) { vioc_create_t vna_create; +#if notyet char devname[MAXNAMELEN]; int ctlfd; +#endif int error; sc->vsc_vnafd = open("/devices/pseudo/viona@0:ctl", O_RDWR | O_EXCL); @@ -360,10 +365,12 @@ pci_viona_viona_init(struct vmctx *ctx, struct pci_viona_softc *sc) vna_create.c_linkid = sc->vsc_linkid; strlcpy(vna_create.c_vmname, vmname, sizeof (vna_create.c_vmname)); +#if notyet vm_get_memory_seg(ctx, 1 * (1024 * 1024UL), &vna_create.c_lomem_size, NULL); vm_get_memory_seg(ctx, 4 * (1024 * 1024 * 1024UL), &vna_create.c_himem_size, NULL); +#endif error = ioctl(sc->vsc_vnafd, VNA_IOC_CREATE, &vna_create); if (error != 0) { WPRINTF(("ioctl viona create failed %d\n", error)); @@ -495,7 +502,7 @@ viona_adjust_offset(struct pci_devinst *pi, uint64_t offset) static void pci_viona_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, - int baridx, uint64_t offset, int size, uint64_t value) + int baridx, uint64_t offset, int size, uint64_t value) { struct pci_viona_softc *sc = pi->pi_arg; void *ptr; diff --git a/usr/src/cmd/bhyve/pci_xhci.c b/usr/src/cmd/bhyve/pci_xhci.c new file mode 100644 index 0000000000..29d56ec32c --- /dev/null +++ b/usr/src/cmd/bhyve/pci_xhci.c @@ -0,0 +1,2855 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Leon Dang + * Copyright 2018 Joyent, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + XHCI options: + -s ,xhci,{devices} + + devices: + tablet USB tablet mouse + */ +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "bhyverun.h" +#include "pci_emul.h" +#include "pci_xhci.h" +#include "usb_emul.h" + + +static int xhci_debug = 0; +#define DPRINTF(params) if (xhci_debug) printf params +#define WPRINTF(params) printf params + + +#define XHCI_NAME "xhci" +#define XHCI_MAX_DEVS 8 /* 4 USB3 + 4 USB2 devs */ + +#define XHCI_MAX_SLOTS 64 /* min allowed by Windows drivers */ + +/* + * XHCI data structures can be up to 64k, but limit paddr_guest2host mapping + * to 4k to avoid going over the guest physical memory barrier. + */ +#define XHCI_PADDR_SZ 4096 /* paddr_guest2host max size */ + +#define XHCI_ERST_MAX 0 /* max 2^entries event ring seg tbl */ + +#define XHCI_CAPLEN (4*8) /* offset of op register space */ +#define XHCI_HCCPRAMS2 0x1C /* offset of HCCPARAMS2 register */ +#define XHCI_PORTREGS_START 0x400 +#define XHCI_DOORBELL_MAX 256 + +#define XHCI_STREAMS_MAX 1 /* 4-15 in XHCI spec */ + +/* caplength and hci-version registers */ +#define XHCI_SET_CAPLEN(x) ((x) & 0xFF) +#define XHCI_SET_HCIVERSION(x) (((x) & 0xFFFF) << 16) +#define XHCI_GET_HCIVERSION(x) (((x) >> 16) & 0xFFFF) + +/* hcsparams1 register */ +#define XHCI_SET_HCSP1_MAXSLOTS(x) ((x) & 0xFF) +#define XHCI_SET_HCSP1_MAXINTR(x) (((x) & 0x7FF) << 8) +#define XHCI_SET_HCSP1_MAXPORTS(x) (((x) & 0xFF) << 24) + +/* hcsparams2 register */ +#define XHCI_SET_HCSP2_IST(x) ((x) & 0x0F) +#define XHCI_SET_HCSP2_ERSTMAX(x) (((x) & 0x0F) << 4) +#define XHCI_SET_HCSP2_MAXSCRATCH_HI(x) (((x) & 0x1F) << 21) +#define XHCI_SET_HCSP2_MAXSCRATCH_LO(x) (((x) & 0x1F) << 27) + +/* hcsparams3 register */ +#define XHCI_SET_HCSP3_U1EXITLATENCY(x) ((x) & 0xFF) +#define XHCI_SET_HCSP3_U2EXITLATENCY(x) (((x) & 0xFFFF) << 16) + +/* hccparams1 register */ +#define XHCI_SET_HCCP1_AC64(x) ((x) & 0x01) +#define XHCI_SET_HCCP1_BNC(x) (((x) & 0x01) << 1) +#define XHCI_SET_HCCP1_CSZ(x) (((x) & 0x01) << 2) +#define XHCI_SET_HCCP1_PPC(x) (((x) & 0x01) << 3) +#define XHCI_SET_HCCP1_PIND(x) (((x) & 0x01) << 4) +#define XHCI_SET_HCCP1_LHRC(x) (((x) & 0x01) << 5) +#define XHCI_SET_HCCP1_LTC(x) (((x) & 0x01) << 6) +#define XHCI_SET_HCCP1_NSS(x) (((x) & 0x01) << 7) +#define XHCI_SET_HCCP1_PAE(x) (((x) & 0x01) << 8) +#define XHCI_SET_HCCP1_SPC(x) (((x) & 0x01) << 9) +#define XHCI_SET_HCCP1_SEC(x) (((x) & 0x01) << 10) +#define XHCI_SET_HCCP1_CFC(x) (((x) & 0x01) << 11) +#define XHCI_SET_HCCP1_MAXPSA(x) (((x) & 0x0F) << 12) +#define XHCI_SET_HCCP1_XECP(x) (((x) & 0xFFFF) << 16) + +/* hccparams2 register */ +#define XHCI_SET_HCCP2_U3C(x) ((x) & 0x01) +#define XHCI_SET_HCCP2_CMC(x) (((x) & 0x01) << 1) +#define XHCI_SET_HCCP2_FSC(x) (((x) & 0x01) << 2) +#define XHCI_SET_HCCP2_CTC(x) (((x) & 0x01) << 3) +#define XHCI_SET_HCCP2_LEC(x) (((x) & 0x01) << 4) +#define XHCI_SET_HCCP2_CIC(x) 
(((x) & 0x01) << 5) + +/* other registers */ +#define XHCI_SET_DOORBELL(x) ((x) & ~0x03) +#define XHCI_SET_RTSOFFSET(x) ((x) & ~0x0F) + +/* register masks */ +#define XHCI_PS_PLS_MASK (0xF << 5) /* port link state */ +#define XHCI_PS_SPEED_MASK (0xF << 10) /* port speed */ +#define XHCI_PS_PIC_MASK (0x3 << 14) /* port indicator */ + +/* port register set */ +#define XHCI_PORTREGS_BASE 0x400 /* base offset */ +#define XHCI_PORTREGS_PORT0 0x3F0 +#define XHCI_PORTREGS_SETSZ 0x10 /* size of a set */ + +#define MASK_64_HI(x) ((x) & ~0xFFFFFFFFULL) +#define MASK_64_LO(x) ((x) & 0xFFFFFFFFULL) + +#define FIELD_REPLACE(a,b,m,s) (((a) & ~((m) << (s))) | \ + (((b) & (m)) << (s))) +#define FIELD_COPY(a,b,m,s) (((a) & ~((m) << (s))) | \ + (((b) & ((m) << (s))))) + +struct pci_xhci_trb_ring { + uint64_t ringaddr; /* current dequeue guest address */ + uint32_t ccs; /* consumer cycle state */ +}; + +/* device endpoint transfer/stream rings */ +struct pci_xhci_dev_ep { + union { + struct xhci_trb *_epu_tr; + struct xhci_stream_ctx *_epu_sctx; + } _ep_trbsctx; +#define ep_tr _ep_trbsctx._epu_tr +#define ep_sctx _ep_trbsctx._epu_sctx + + union { + struct pci_xhci_trb_ring _epu_trb; + struct pci_xhci_trb_ring *_epu_sctx_trbs; + } _ep_trb_rings; +#define ep_ringaddr _ep_trb_rings._epu_trb.ringaddr +#define ep_ccs _ep_trb_rings._epu_trb.ccs +#define ep_sctx_trbs _ep_trb_rings._epu_sctx_trbs + + struct usb_data_xfer *ep_xfer; /* transfer chain */ +}; + +/* device context base address array: maps slot->device context */ +struct xhci_dcbaa { + uint64_t dcba[USB_MAX_DEVICES+1]; /* xhci_dev_ctx ptrs */ +}; + +/* port status registers */ +struct pci_xhci_portregs { + uint32_t portsc; /* port status and control */ + uint32_t portpmsc; /* port pwr mgmt status & control */ + uint32_t portli; /* port link info */ + uint32_t porthlpmc; /* port hardware LPM control */ +} __packed; +#define XHCI_PS_SPEED_SET(x) (((x) & 0xF) << 10) + +/* xHC operational registers */ +struct pci_xhci_opregs { + uint32_t usbcmd; /* usb command */ + uint32_t usbsts; /* usb status */ + uint32_t pgsz; /* page size */ + uint32_t dnctrl; /* device notification control */ + uint64_t crcr; /* command ring control */ + uint64_t dcbaap; /* device ctx base addr array ptr */ + uint32_t config; /* configure */ + + /* guest mapped addresses: */ + struct xhci_trb *cr_p; /* crcr dequeue */ + struct xhci_dcbaa *dcbaa_p; /* dev ctx array ptr */ +}; + +/* xHC runtime registers */ +struct pci_xhci_rtsregs { + uint32_t mfindex; /* microframe index */ + struct { /* interrupter register set */ + uint32_t iman; /* interrupter management */ + uint32_t imod; /* interrupter moderation */ + uint32_t erstsz; /* event ring segment table size */ + uint32_t rsvd; + uint64_t erstba; /* event ring seg-tbl base addr */ + uint64_t erdp; /* event ring dequeue ptr */ + } intrreg __packed; + + /* guest mapped addresses */ + struct xhci_event_ring_seg *erstba_p; + struct xhci_trb *erst_p; /* event ring segment tbl */ + int er_deq_seg; /* event ring dequeue segment */ + int er_enq_idx; /* event ring enqueue index - xHCI */ + int er_enq_seg; /* event ring enqueue segment */ + uint32_t er_events_cnt; /* number of events in ER */ + uint32_t event_pcs; /* producer cycle state flag */ +}; + + +struct pci_xhci_softc; + + +/* + * USB device emulation container. + * This is referenced from usb_hci->hci_sc; 1 pci_xhci_dev_emu for each + * emulated device instance. 
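+ * It ties the xHCI view of the device (slot and endpoint contexts) to
+ * the USB-level emulation (usb_devemu and its softc).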
+ */ +struct pci_xhci_dev_emu { + struct pci_xhci_softc *xsc; + + /* XHCI contexts */ + struct xhci_dev_ctx *dev_ctx; + struct pci_xhci_dev_ep eps[XHCI_MAX_ENDPOINTS]; + int dev_slotstate; + + struct usb_devemu *dev_ue; /* USB emulated dev */ + void *dev_sc; /* device's softc */ + + struct usb_hci hci; +}; + +struct pci_xhci_softc { + struct pci_devinst *xsc_pi; + + pthread_mutex_t mtx; + + uint32_t caplength; /* caplen & hciversion */ + uint32_t hcsparams1; /* structural parameters 1 */ + uint32_t hcsparams2; /* structural parameters 2 */ + uint32_t hcsparams3; /* structural parameters 3 */ + uint32_t hccparams1; /* capability parameters 1 */ + uint32_t dboff; /* doorbell offset */ + uint32_t rtsoff; /* runtime register space offset */ + uint32_t hccparams2; /* capability parameters 2 */ + + uint32_t regsend; /* end of configuration registers */ + + struct pci_xhci_opregs opregs; + struct pci_xhci_rtsregs rtsregs; + + struct pci_xhci_portregs *portregs; + struct pci_xhci_dev_emu **devices; /* XHCI[port] = device */ + struct pci_xhci_dev_emu **slots; /* slots assigned from 1 */ + int ndevices; + + int usb2_port_start; + int usb3_port_start; +}; + + +/* portregs and devices arrays are set up to start from idx=1 */ +#define XHCI_PORTREG_PTR(x,n) &(x)->portregs[(n)] +#define XHCI_DEVINST_PTR(x,n) (x)->devices[(n)] +#define XHCI_SLOTDEV_PTR(x,n) (x)->slots[(n)] + +#define XHCI_HALTED(sc) ((sc)->opregs.usbsts & XHCI_STS_HCH) + +#define XHCI_GADDR(sc,a) paddr_guest2host((sc)->xsc_pi->pi_vmctx, \ + (a), \ + XHCI_PADDR_SZ - ((a) & (XHCI_PADDR_SZ-1))) + +static int xhci_in_use; + +/* map USB errors to XHCI */ +static const int xhci_usb_errors[USB_ERR_MAX] = { + [USB_ERR_NORMAL_COMPLETION] = XHCI_TRB_ERROR_SUCCESS, + [USB_ERR_PENDING_REQUESTS] = XHCI_TRB_ERROR_RESOURCE, + [USB_ERR_NOT_STARTED] = XHCI_TRB_ERROR_ENDP_NOT_ON, + [USB_ERR_INVAL] = XHCI_TRB_ERROR_INVALID, + [USB_ERR_NOMEM] = XHCI_TRB_ERROR_RESOURCE, + [USB_ERR_CANCELLED] = XHCI_TRB_ERROR_STOPPED, + [USB_ERR_BAD_ADDRESS] = XHCI_TRB_ERROR_PARAMETER, + [USB_ERR_BAD_BUFSIZE] = XHCI_TRB_ERROR_PARAMETER, + [USB_ERR_BAD_FLAG] = XHCI_TRB_ERROR_PARAMETER, + [USB_ERR_NO_CALLBACK] = XHCI_TRB_ERROR_STALL, + [USB_ERR_IN_USE] = XHCI_TRB_ERROR_RESOURCE, + [USB_ERR_NO_ADDR] = XHCI_TRB_ERROR_RESOURCE, + [USB_ERR_NO_PIPE] = XHCI_TRB_ERROR_RESOURCE, + [USB_ERR_ZERO_NFRAMES] = XHCI_TRB_ERROR_UNDEFINED, + [USB_ERR_ZERO_MAXP] = XHCI_TRB_ERROR_UNDEFINED, + [USB_ERR_SET_ADDR_FAILED] = XHCI_TRB_ERROR_RESOURCE, + [USB_ERR_NO_POWER] = XHCI_TRB_ERROR_ENDP_NOT_ON, + [USB_ERR_TOO_DEEP] = XHCI_TRB_ERROR_RESOURCE, + [USB_ERR_IOERROR] = XHCI_TRB_ERROR_TRB, + [USB_ERR_NOT_CONFIGURED] = XHCI_TRB_ERROR_ENDP_NOT_ON, + [USB_ERR_TIMEOUT] = XHCI_TRB_ERROR_CMD_ABORTED, + [USB_ERR_SHORT_XFER] = XHCI_TRB_ERROR_SHORT_PKT, + [USB_ERR_STALLED] = XHCI_TRB_ERROR_STALL, + [USB_ERR_INTERRUPTED] = XHCI_TRB_ERROR_CMD_ABORTED, + [USB_ERR_DMA_LOAD_FAILED] = XHCI_TRB_ERROR_DATA_BUF, + [USB_ERR_BAD_CONTEXT] = XHCI_TRB_ERROR_TRB, + [USB_ERR_NO_ROOT_HUB] = XHCI_TRB_ERROR_UNDEFINED, + [USB_ERR_NO_INTR_THREAD] = XHCI_TRB_ERROR_UNDEFINED, + [USB_ERR_NOT_LOCKED] = XHCI_TRB_ERROR_UNDEFINED, +}; +#define USB_TO_XHCI_ERR(e) ((e) < USB_ERR_MAX ? 
xhci_usb_errors[(e)] : \ + XHCI_TRB_ERROR_INVALID) + +static int pci_xhci_insert_event(struct pci_xhci_softc *sc, + struct xhci_trb *evtrb, int do_intr); +static void pci_xhci_dump_trb(struct xhci_trb *trb); +static void pci_xhci_assert_interrupt(struct pci_xhci_softc *sc); +static void pci_xhci_reset_slot(struct pci_xhci_softc *sc, int slot); +static void pci_xhci_reset_port(struct pci_xhci_softc *sc, int portn, int warm); +static void pci_xhci_update_ep_ring(struct pci_xhci_softc *sc, + struct pci_xhci_dev_emu *dev, struct pci_xhci_dev_ep *devep, + struct xhci_endp_ctx *ep_ctx, uint32_t streamid, + uint64_t ringaddr, int ccs); + +static void +pci_xhci_set_evtrb(struct xhci_trb *evtrb, uint64_t port, uint32_t errcode, + uint32_t evtype) +{ + evtrb->qwTrb0 = port << 24; + evtrb->dwTrb2 = XHCI_TRB_2_ERROR_SET(errcode); + evtrb->dwTrb3 = XHCI_TRB_3_TYPE_SET(evtype); +} + + +/* controller reset */ +static void +pci_xhci_reset(struct pci_xhci_softc *sc) +{ + int i; + + sc->rtsregs.er_enq_idx = 0; + sc->rtsregs.er_events_cnt = 0; + sc->rtsregs.event_pcs = 1; + + for (i = 1; i <= XHCI_MAX_SLOTS; i++) { + pci_xhci_reset_slot(sc, i); + } +} + +static uint32_t +pci_xhci_usbcmd_write(struct pci_xhci_softc *sc, uint32_t cmd) +{ + int do_intr = 0; + int i; + + if (cmd & XHCI_CMD_RS) { + do_intr = (sc->opregs.usbcmd & XHCI_CMD_RS) == 0; + + sc->opregs.usbcmd |= XHCI_CMD_RS; + sc->opregs.usbsts &= ~XHCI_STS_HCH; + sc->opregs.usbsts |= XHCI_STS_PCD; + + /* Queue port change event on controller run from stop */ + if (do_intr) + for (i = 1; i <= XHCI_MAX_DEVS; i++) { + struct pci_xhci_dev_emu *dev; + struct pci_xhci_portregs *port; + struct xhci_trb evtrb; + + if ((dev = XHCI_DEVINST_PTR(sc, i)) == NULL) + continue; + + port = XHCI_PORTREG_PTR(sc, i); + port->portsc |= XHCI_PS_CSC | XHCI_PS_CCS; + port->portsc &= ~XHCI_PS_PLS_MASK; + + /* + * XHCI 4.19.3 USB2 RxDetect->Polling, + * USB3 Polling->U0 + */ + if (dev->dev_ue->ue_usbver == 2) + port->portsc |= + XHCI_PS_PLS_SET(UPS_PORT_LS_POLL); + else + port->portsc |= + XHCI_PS_PLS_SET(UPS_PORT_LS_U0); + + pci_xhci_set_evtrb(&evtrb, i, + XHCI_TRB_ERROR_SUCCESS, + XHCI_TRB_EVENT_PORT_STS_CHANGE); + + if (pci_xhci_insert_event(sc, &evtrb, 0) != + XHCI_TRB_ERROR_SUCCESS) + break; + } + } else { + sc->opregs.usbcmd &= ~XHCI_CMD_RS; + sc->opregs.usbsts |= XHCI_STS_HCH; + sc->opregs.usbsts &= ~XHCI_STS_PCD; + } + + /* start execution of schedule; stop when set to 0 */ + cmd |= sc->opregs.usbcmd & XHCI_CMD_RS; + + if (cmd & XHCI_CMD_HCRST) { + /* reset controller */ + pci_xhci_reset(sc); + cmd &= ~XHCI_CMD_HCRST; + } + + cmd &= ~(XHCI_CMD_CSS | XHCI_CMD_CRS); + + if (do_intr) + pci_xhci_assert_interrupt(sc); + + return (cmd); +} + +static void +pci_xhci_portregs_write(struct pci_xhci_softc *sc, uint64_t offset, + uint64_t value) +{ + struct xhci_trb evtrb; + struct pci_xhci_portregs *p; + int port; + uint32_t oldpls, newpls; + + if (sc->portregs == NULL) + return; + + port = (offset - XHCI_PORTREGS_PORT0) / XHCI_PORTREGS_SETSZ; + offset = (offset - XHCI_PORTREGS_PORT0) % XHCI_PORTREGS_SETSZ; + + DPRINTF(("pci_xhci: portregs wr offset 0x%lx, port %u: 0x%lx\r\n", + offset, port, value)); + + assert(port >= 0); + + if (port > XHCI_MAX_DEVS) { + DPRINTF(("pci_xhci: portregs_write port %d > ndevices\r\n", + port)); + return; + } + + if (XHCI_DEVINST_PTR(sc, port) == NULL) { + DPRINTF(("pci_xhci: portregs_write to unattached port %d\r\n", + port)); + } + + p = XHCI_PORTREG_PTR(sc, port); + switch (offset) { + case 0: + /* port reset or warm reset */ + if (value & 
(XHCI_PS_PR | XHCI_PS_WPR)) { + pci_xhci_reset_port(sc, port, value & XHCI_PS_WPR); + break; + } + + if ((p->portsc & XHCI_PS_PP) == 0) { + WPRINTF(("pci_xhci: portregs_write to unpowered " + "port %d\r\n", port)); + break; + } + + /* Port status and control register */ + oldpls = XHCI_PS_PLS_GET(p->portsc); + newpls = XHCI_PS_PLS_GET(value); + + p->portsc &= XHCI_PS_PED | XHCI_PS_PLS_MASK | + XHCI_PS_SPEED_MASK | XHCI_PS_PIC_MASK; + + if (XHCI_DEVINST_PTR(sc, port)) + p->portsc |= XHCI_PS_CCS; + + p->portsc |= (value & + ~(XHCI_PS_OCA | + XHCI_PS_PR | + XHCI_PS_PED | + XHCI_PS_PLS_MASK | /* link state */ + XHCI_PS_SPEED_MASK | + XHCI_PS_PIC_MASK | /* port indicator */ + XHCI_PS_LWS | XHCI_PS_DR | XHCI_PS_WPR)); + + /* clear control bits */ + p->portsc &= ~(value & + (XHCI_PS_CSC | + XHCI_PS_PEC | + XHCI_PS_WRC | + XHCI_PS_OCC | + XHCI_PS_PRC | + XHCI_PS_PLC | + XHCI_PS_CEC | + XHCI_PS_CAS)); + + /* port disable request; for USB3, don't care */ + if (value & XHCI_PS_PED) + DPRINTF(("Disable port %d request\r\n", port)); + + if (!(value & XHCI_PS_LWS)) + break; + + DPRINTF(("Port new PLS: %d\r\n", newpls)); + switch (newpls) { + case 0: /* U0 */ + case 3: /* U3 */ + if (oldpls != newpls) { + p->portsc &= ~XHCI_PS_PLS_MASK; + p->portsc |= XHCI_PS_PLS_SET(newpls) | + XHCI_PS_PLC; + + if (oldpls != 0 && newpls == 0) { + pci_xhci_set_evtrb(&evtrb, port, + XHCI_TRB_ERROR_SUCCESS, + XHCI_TRB_EVENT_PORT_STS_CHANGE); + + pci_xhci_insert_event(sc, &evtrb, 1); + } + } + break; + + default: + DPRINTF(("Unhandled change port %d PLS %u\r\n", + port, newpls)); + break; + } + break; + case 4: + /* Port power management status and control register */ + p->portpmsc = value; + break; + case 8: + /* Port link information register */ + DPRINTF(("pci_xhci attempted write to PORTLI, port %d\r\n", + port)); + break; + case 12: + /* + * Port hardware LPM control register. + * For USB3, this register is reserved. 
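+		 * The emulation simply latches whatever the guest writes.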
+		 */
+		p->porthlpmc = value;
+		break;
+	}
+}
+
+struct xhci_dev_ctx *
+pci_xhci_get_dev_ctx(struct pci_xhci_softc *sc, uint32_t slot)
+{
+	uint64_t devctx_addr;
+	struct xhci_dev_ctx *devctx;
+
+	assert(slot > 0 && slot <= sc->ndevices);
+	assert(sc->opregs.dcbaa_p != NULL);
+
+	devctx_addr = sc->opregs.dcbaa_p->dcba[slot];
+
+	if (devctx_addr == 0) {
+		DPRINTF(("get_dev_ctx devctx_addr == 0\r\n"));
+		return (NULL);
+	}
+
+	DPRINTF(("pci_xhci: get dev ctx, slot %u devctx addr %016lx\r\n",
+	    slot, devctx_addr));
+	devctx = XHCI_GADDR(sc, devctx_addr & ~0x3FUL);
+
+	return (devctx);
+}
+
+struct xhci_trb *
+pci_xhci_trb_next(struct pci_xhci_softc *sc, struct xhci_trb *curtrb,
+    uint64_t *guestaddr)
+{
+	struct xhci_trb *next;
+
+	assert(curtrb != NULL);
+
+	if (XHCI_TRB_3_TYPE_GET(curtrb->dwTrb3) == XHCI_TRB_TYPE_LINK) {
+		if (guestaddr)
+			*guestaddr = curtrb->qwTrb0 & ~0xFUL;
+
+		next = XHCI_GADDR(sc, curtrb->qwTrb0 & ~0xFUL);
+	} else {
+		if (guestaddr)
+			*guestaddr += sizeof(struct xhci_trb) & ~0xFUL;
+
+		next = curtrb + 1;
+	}
+
+	return (next);
+}
+
+static void
+pci_xhci_assert_interrupt(struct pci_xhci_softc *sc)
+{
+
+	sc->rtsregs.intrreg.erdp |= XHCI_ERDP_LO_BUSY;
+	sc->rtsregs.intrreg.iman |= XHCI_IMAN_INTR_PEND;
+	sc->opregs.usbsts |= XHCI_STS_EINT;
+
+	/* only trigger interrupt if permitted */
+	if ((sc->opregs.usbcmd & XHCI_CMD_INTE) &&
+	    (sc->rtsregs.intrreg.iman & XHCI_IMAN_INTR_ENA)) {
+		if (pci_msi_enabled(sc->xsc_pi))
+			pci_generate_msi(sc->xsc_pi, 0);
+		else
+			pci_lintr_assert(sc->xsc_pi);
+	}
+}
+
+static void
+pci_xhci_deassert_interrupt(struct pci_xhci_softc *sc)
+{
+
+	if (!pci_msi_enabled(sc->xsc_pi))
+		pci_lintr_deassert(sc->xsc_pi);
+}
+
+static void
+pci_xhci_init_ep(struct pci_xhci_dev_emu *dev, int epid)
+{
+	struct xhci_dev_ctx *dev_ctx;
+	struct pci_xhci_dev_ep *devep;
+	struct xhci_endp_ctx *ep_ctx;
+	uint32_t pstreams;
+	int i;
+
+	dev_ctx = dev->dev_ctx;
+	ep_ctx = &dev_ctx->ctx_ep[epid];
+	devep = &dev->eps[epid];
+	pstreams = XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0);
+	if (pstreams > 0) {
+		DPRINTF(("init_ep %d with pstreams %d\r\n", epid, pstreams));
+		assert(devep->ep_sctx_trbs == NULL);
+
+		devep->ep_sctx = XHCI_GADDR(dev->xsc, ep_ctx->qwEpCtx2 &
+		    XHCI_EPCTX_2_TR_DQ_PTR_MASK);
+		devep->ep_sctx_trbs = calloc(pstreams,
+		    sizeof(struct pci_xhci_trb_ring));
+		for (i = 0; i < pstreams; i++) {
+			devep->ep_sctx_trbs[i].ringaddr =
+			    devep->ep_sctx[i].qwSctx0 &
+			    XHCI_SCTX_0_TR_DQ_PTR_MASK;
+			devep->ep_sctx_trbs[i].ccs =
+			    XHCI_SCTX_0_DCS_GET(devep->ep_sctx[i].qwSctx0);
+		}
+	} else {
+		DPRINTF(("init_ep %d with no pstreams\r\n", epid));
+		devep->ep_ringaddr = ep_ctx->qwEpCtx2 &
+		    XHCI_EPCTX_2_TR_DQ_PTR_MASK;
+		devep->ep_ccs = XHCI_EPCTX_2_DCS_GET(ep_ctx->qwEpCtx2);
+		devep->ep_tr = XHCI_GADDR(dev->xsc, devep->ep_ringaddr);
+		DPRINTF(("init_ep tr DCS %x\r\n", devep->ep_ccs));
+	}
+
+	if (devep->ep_xfer == NULL) {
+		devep->ep_xfer = malloc(sizeof(struct usb_data_xfer));
+		USB_DATA_XFER_INIT(devep->ep_xfer);
+	}
+}
+
+static void
+pci_xhci_disable_ep(struct pci_xhci_dev_emu *dev, int epid)
+{
+	struct xhci_dev_ctx *dev_ctx;
+	struct pci_xhci_dev_ep *devep;
+	struct xhci_endp_ctx *ep_ctx;
+
+	DPRINTF(("pci_xhci disable_ep %d\r\n", epid));
+
+	dev_ctx = dev->dev_ctx;
+	ep_ctx = &dev_ctx->ctx_ep[epid];
+	ep_ctx->dwEpCtx0 = (ep_ctx->dwEpCtx0 & ~0x7) | XHCI_ST_EPCTX_DISABLED;
+
+	devep = &dev->eps[epid];
+	if (XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0) > 0 &&
+	    devep->ep_sctx_trbs != NULL)
+		free(devep->ep_sctx_trbs);
+
+	if (devep->ep_xfer != NULL) {
+		
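+		/* free the transfer block allocated in pci_xhci_init_ep() */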
free(devep->ep_xfer); + devep->ep_xfer = NULL; + } + + memset(devep, 0, sizeof(struct pci_xhci_dev_ep)); +} + + +/* reset device at slot and data structures related to it */ +static void +pci_xhci_reset_slot(struct pci_xhci_softc *sc, int slot) +{ + struct pci_xhci_dev_emu *dev; + + dev = XHCI_SLOTDEV_PTR(sc, slot); + + if (!dev) { + DPRINTF(("xhci reset unassigned slot (%d)?\r\n", slot)); + } else { + dev->dev_slotstate = XHCI_ST_DISABLED; + } + + /* TODO: reset ring buffer pointers */ +} + +static int +pci_xhci_insert_event(struct pci_xhci_softc *sc, struct xhci_trb *evtrb, + int do_intr) +{ + struct pci_xhci_rtsregs *rts; + uint64_t erdp; + int erdp_idx; + int err; + struct xhci_trb *evtrbptr; + + err = XHCI_TRB_ERROR_SUCCESS; + + rts = &sc->rtsregs; + + erdp = rts->intrreg.erdp & ~0xF; + erdp_idx = (erdp - rts->erstba_p[rts->er_deq_seg].qwEvrsTablePtr) / + sizeof(struct xhci_trb); + + DPRINTF(("pci_xhci: insert event 0[%lx] 2[%x] 3[%x]\r\n" + "\terdp idx %d/seg %d, enq idx %d/seg %d, pcs %u\r\n" + "\t(erdp=0x%lx, erst=0x%lx, tblsz=%u, do_intr %d)\r\n", + evtrb->qwTrb0, evtrb->dwTrb2, evtrb->dwTrb3, + erdp_idx, rts->er_deq_seg, rts->er_enq_idx, + rts->er_enq_seg, + rts->event_pcs, erdp, rts->erstba_p->qwEvrsTablePtr, + rts->erstba_p->dwEvrsTableSize, do_intr)); + + evtrbptr = &rts->erst_p[rts->er_enq_idx]; + + /* TODO: multi-segment table */ + if (rts->er_events_cnt >= rts->erstba_p->dwEvrsTableSize) { + DPRINTF(("pci_xhci[%d] cannot insert event; ring full\r\n", + __LINE__)); + err = XHCI_TRB_ERROR_EV_RING_FULL; + goto done; + } + + if (rts->er_events_cnt == rts->erstba_p->dwEvrsTableSize - 1) { + struct xhci_trb errev; + + if ((evtrbptr->dwTrb3 & 0x1) == (rts->event_pcs & 0x1)) { + + DPRINTF(("pci_xhci[%d] insert evt err: ring full\r\n", + __LINE__)); + + errev.qwTrb0 = 0; + errev.dwTrb2 = XHCI_TRB_2_ERROR_SET( + XHCI_TRB_ERROR_EV_RING_FULL); + errev.dwTrb3 = XHCI_TRB_3_TYPE_SET( + XHCI_TRB_EVENT_HOST_CTRL) | + rts->event_pcs; + rts->er_events_cnt++; + memcpy(&rts->erst_p[rts->er_enq_idx], &errev, + sizeof(struct xhci_trb)); + rts->er_enq_idx = (rts->er_enq_idx + 1) % + rts->erstba_p->dwEvrsTableSize; + err = XHCI_TRB_ERROR_EV_RING_FULL; + do_intr = 1; + + goto done; + } + } else { + rts->er_events_cnt++; + } + + evtrb->dwTrb3 &= ~XHCI_TRB_3_CYCLE_BIT; + evtrb->dwTrb3 |= rts->event_pcs; + + memcpy(&rts->erst_p[rts->er_enq_idx], evtrb, sizeof(struct xhci_trb)); + rts->er_enq_idx = (rts->er_enq_idx + 1) % + rts->erstba_p->dwEvrsTableSize; + + if (rts->er_enq_idx == 0) + rts->event_pcs ^= 1; + +done: + if (do_intr) + pci_xhci_assert_interrupt(sc); + + return (err); +} + +static uint32_t +pci_xhci_cmd_enable_slot(struct pci_xhci_softc *sc, uint32_t *slot) +{ + struct pci_xhci_dev_emu *dev; + uint32_t cmderr; + int i; + + cmderr = XHCI_TRB_ERROR_NO_SLOTS; + if (sc->portregs != NULL) + for (i = 1; i <= XHCI_MAX_SLOTS; i++) { + dev = XHCI_SLOTDEV_PTR(sc, i); + if (dev && dev->dev_slotstate == XHCI_ST_DISABLED) { + *slot = i; + dev->dev_slotstate = XHCI_ST_ENABLED; + cmderr = XHCI_TRB_ERROR_SUCCESS; + dev->hci.hci_address = i; + break; + } + } + + DPRINTF(("pci_xhci enable slot (error=%d) slot %u\r\n", + cmderr != XHCI_TRB_ERROR_SUCCESS, *slot)); + + return (cmderr); +} + +static uint32_t +pci_xhci_cmd_disable_slot(struct pci_xhci_softc *sc, uint32_t slot) +{ + struct pci_xhci_dev_emu *dev; + uint32_t cmderr; + + DPRINTF(("pci_xhci disable slot %u\r\n", slot)); + + cmderr = XHCI_TRB_ERROR_NO_SLOTS; + if (sc->portregs == NULL) + goto done; + + if (slot > sc->ndevices) { + cmderr = 
XHCI_TRB_ERROR_SLOT_NOT_ON; + goto done; + } + + dev = XHCI_SLOTDEV_PTR(sc, slot); + if (dev) { + if (dev->dev_slotstate == XHCI_ST_DISABLED) { + cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; + } else { + dev->dev_slotstate = XHCI_ST_DISABLED; + cmderr = XHCI_TRB_ERROR_SUCCESS; + /* TODO: reset events and endpoints */ + } + } + +done: + return (cmderr); +} + +static uint32_t +pci_xhci_cmd_reset_device(struct pci_xhci_softc *sc, uint32_t slot) +{ + struct pci_xhci_dev_emu *dev; + struct xhci_dev_ctx *dev_ctx; + struct xhci_endp_ctx *ep_ctx; + uint32_t cmderr; + int i; + + cmderr = XHCI_TRB_ERROR_NO_SLOTS; + if (sc->portregs == NULL) + goto done; + + DPRINTF(("pci_xhci reset device slot %u\r\n", slot)); + + dev = XHCI_SLOTDEV_PTR(sc, slot); + if (!dev || dev->dev_slotstate == XHCI_ST_DISABLED) + cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; + else { + dev->dev_slotstate = XHCI_ST_DEFAULT; + + dev->hci.hci_address = 0; + dev_ctx = pci_xhci_get_dev_ctx(sc, slot); + + /* slot state */ + dev_ctx->ctx_slot.dwSctx3 = FIELD_REPLACE( + dev_ctx->ctx_slot.dwSctx3, XHCI_ST_SLCTX_DEFAULT, + 0x1F, 27); + + /* number of contexts */ + dev_ctx->ctx_slot.dwSctx0 = FIELD_REPLACE( + dev_ctx->ctx_slot.dwSctx0, 1, 0x1F, 27); + + /* reset all eps other than ep-0 */ + for (i = 2; i <= 31; i++) { + ep_ctx = &dev_ctx->ctx_ep[i]; + ep_ctx->dwEpCtx0 = FIELD_REPLACE( ep_ctx->dwEpCtx0, + XHCI_ST_EPCTX_DISABLED, 0x7, 0); + } + + cmderr = XHCI_TRB_ERROR_SUCCESS; + } + + pci_xhci_reset_slot(sc, slot); + +done: + return (cmderr); +} + +static uint32_t +pci_xhci_cmd_address_device(struct pci_xhci_softc *sc, uint32_t slot, + struct xhci_trb *trb) +{ + struct pci_xhci_dev_emu *dev; + struct xhci_input_dev_ctx *input_ctx; + struct xhci_slot_ctx *islot_ctx; + struct xhci_dev_ctx *dev_ctx; + struct xhci_endp_ctx *ep0_ctx; + uint32_t cmderr; + + input_ctx = XHCI_GADDR(sc, trb->qwTrb0 & ~0xFUL); + islot_ctx = &input_ctx->ctx_slot; + ep0_ctx = &input_ctx->ctx_ep[1]; + + cmderr = XHCI_TRB_ERROR_SUCCESS; + + DPRINTF(("pci_xhci: address device, input ctl: D 0x%08x A 0x%08x,\r\n" + " slot %08x %08x %08x %08x\r\n" + " ep0 %08x %08x %016lx %08x\r\n", + input_ctx->ctx_input.dwInCtx0, input_ctx->ctx_input.dwInCtx1, + islot_ctx->dwSctx0, islot_ctx->dwSctx1, + islot_ctx->dwSctx2, islot_ctx->dwSctx3, + ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2, + ep0_ctx->dwEpCtx4)); + + /* when setting address: drop-ctx=0, add-ctx=slot+ep0 */ + if ((input_ctx->ctx_input.dwInCtx0 != 0) || + (input_ctx->ctx_input.dwInCtx1 & 0x03) != 0x03) { + DPRINTF(("pci_xhci: address device, input ctl invalid\r\n")); + cmderr = XHCI_TRB_ERROR_TRB; + goto done; + } + + /* assign address to slot */ + dev_ctx = pci_xhci_get_dev_ctx(sc, slot); + + DPRINTF(("pci_xhci: address device, dev ctx\r\n" + " slot %08x %08x %08x %08x\r\n", + dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, + dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); + + dev = XHCI_SLOTDEV_PTR(sc, slot); + assert(dev != NULL); + + dev->hci.hci_address = slot; + dev->dev_ctx = dev_ctx; + + if (dev->dev_ue->ue_reset == NULL || + dev->dev_ue->ue_reset(dev->dev_sc) < 0) { + cmderr = XHCI_TRB_ERROR_ENDP_NOT_ON; + goto done; + } + + memcpy(&dev_ctx->ctx_slot, islot_ctx, sizeof(struct xhci_slot_ctx)); + + dev_ctx->ctx_slot.dwSctx3 = + XHCI_SCTX_3_SLOT_STATE_SET(XHCI_ST_SLCTX_ADDRESSED) | + XHCI_SCTX_3_DEV_ADDR_SET(slot); + + memcpy(&dev_ctx->ctx_ep[1], ep0_ctx, sizeof(struct xhci_endp_ctx)); + ep0_ctx = &dev_ctx->ctx_ep[1]; + ep0_ctx->dwEpCtx0 = (ep0_ctx->dwEpCtx0 & ~0x7) | + 
XHCI_EPCTX_0_EPSTATE_SET(XHCI_ST_EPCTX_RUNNING); + + pci_xhci_init_ep(dev, 1); + + dev->dev_slotstate = XHCI_ST_ADDRESSED; + + DPRINTF(("pci_xhci: address device, output ctx\r\n" + " slot %08x %08x %08x %08x\r\n" + " ep0 %08x %08x %016lx %08x\r\n", + dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, + dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3, + ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2, + ep0_ctx->dwEpCtx4)); + +done: + return (cmderr); +} + +static uint32_t +pci_xhci_cmd_config_ep(struct pci_xhci_softc *sc, uint32_t slot, + struct xhci_trb *trb) +{ + struct xhci_input_dev_ctx *input_ctx; + struct pci_xhci_dev_emu *dev; + struct xhci_dev_ctx *dev_ctx; + struct xhci_endp_ctx *ep_ctx, *iep_ctx; + uint32_t cmderr; + int i; + + cmderr = XHCI_TRB_ERROR_SUCCESS; + + DPRINTF(("pci_xhci config_ep slot %u\r\n", slot)); + + dev = XHCI_SLOTDEV_PTR(sc, slot); + assert(dev != NULL); + + if ((trb->dwTrb3 & XHCI_TRB_3_DCEP_BIT) != 0) { + DPRINTF(("pci_xhci config_ep - deconfigure ep slot %u\r\n", + slot)); + if (dev->dev_ue->ue_stop != NULL) + dev->dev_ue->ue_stop(dev->dev_sc); + + dev->dev_slotstate = XHCI_ST_ADDRESSED; + + dev->hci.hci_address = 0; + dev_ctx = pci_xhci_get_dev_ctx(sc, slot); + + /* number of contexts */ + dev_ctx->ctx_slot.dwSctx0 = FIELD_REPLACE( + dev_ctx->ctx_slot.dwSctx0, 1, 0x1F, 27); + + /* slot state */ + dev_ctx->ctx_slot.dwSctx3 = FIELD_REPLACE( + dev_ctx->ctx_slot.dwSctx3, XHCI_ST_SLCTX_ADDRESSED, + 0x1F, 27); + + /* disable endpoints */ + for (i = 2; i < 32; i++) + pci_xhci_disable_ep(dev, i); + + cmderr = XHCI_TRB_ERROR_SUCCESS; + + goto done; + } + + if (dev->dev_slotstate < XHCI_ST_ADDRESSED) { + DPRINTF(("pci_xhci: config_ep slotstate x%x != addressed\r\n", + dev->dev_slotstate)); + cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; + goto done; + } + + /* In addressed/configured state; + * for each drop endpoint ctx flag: + * ep->state = DISABLED + * for each add endpoint ctx flag: + * cp(ep-in, ep-out) + * ep->state = RUNNING + * for each drop+add endpoint flag: + * reset ep resources + * cp(ep-in, ep-out) + * ep->state = RUNNING + * if input->DisabledCtx[2-31] < 30: (at least 1 ep not disabled) + * slot->state = configured + */ + + input_ctx = XHCI_GADDR(sc, trb->qwTrb0 & ~0xFUL); + dev_ctx = dev->dev_ctx; + DPRINTF(("pci_xhci: config_ep inputctx: D:x%08x A:x%08x 7:x%08x\r\n", + input_ctx->ctx_input.dwInCtx0, input_ctx->ctx_input.dwInCtx1, + input_ctx->ctx_input.dwInCtx7)); + + for (i = 2; i <= 31; i++) { + ep_ctx = &dev_ctx->ctx_ep[i]; + + if (input_ctx->ctx_input.dwInCtx0 & + XHCI_INCTX_0_DROP_MASK(i)) { + DPRINTF((" config ep - dropping ep %d\r\n", i)); + pci_xhci_disable_ep(dev, i); + } + + if (input_ctx->ctx_input.dwInCtx1 & + XHCI_INCTX_1_ADD_MASK(i)) { + iep_ctx = &input_ctx->ctx_ep[i]; + + DPRINTF((" enable ep[%d] %08x %08x %016lx %08x\r\n", + i, iep_ctx->dwEpCtx0, iep_ctx->dwEpCtx1, + iep_ctx->qwEpCtx2, iep_ctx->dwEpCtx4)); + + memcpy(ep_ctx, iep_ctx, sizeof(struct xhci_endp_ctx)); + + pci_xhci_init_ep(dev, i); + + /* ep state */ + ep_ctx->dwEpCtx0 = FIELD_REPLACE( + ep_ctx->dwEpCtx0, XHCI_ST_EPCTX_RUNNING, 0x7, 0); + } + } + + /* slot state to configured */ + dev_ctx->ctx_slot.dwSctx3 = FIELD_REPLACE( + dev_ctx->ctx_slot.dwSctx3, XHCI_ST_SLCTX_CONFIGURED, 0x1F, 27); + dev_ctx->ctx_slot.dwSctx0 = FIELD_COPY( + dev_ctx->ctx_slot.dwSctx0, input_ctx->ctx_slot.dwSctx0, 0x1F, 27); + dev->dev_slotstate = XHCI_ST_CONFIGURED; + + DPRINTF(("EP configured; slot %u [0]=0x%08x [1]=0x%08x [2]=0x%08x " + "[3]=0x%08x\r\n", + slot, dev_ctx->ctx_slot.dwSctx0, 
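+	    /*
+	     * Editorial assumption, not from the original commit:
+	     * FIELD_REPLACE and FIELD_COPY used above are masked
+	     * read-modify-write helpers defined earlier in this file;
+	     * their presumed shape is:
+	     *
+	     *	FIELD_REPLACE(a, x, m, off)
+	     *	    == ((a) & ~((m) << (off))) | (((x) & (m)) << (off))
+	     *	FIELD_COPY(a, b, m, off)
+	     *	    == ((a) & ~((m) << (off))) | ((b) & ((m) << (off)))
+	     *
+	     * i.e. replace an m-masked field at bit offset off with a
+	     * literal value, or with the matching field taken from b.
+	     */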
dev_ctx->ctx_slot.dwSctx1, + dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); + +done: + return (cmderr); +} + +static uint32_t +pci_xhci_cmd_reset_ep(struct pci_xhci_softc *sc, uint32_t slot, + struct xhci_trb *trb) +{ + struct pci_xhci_dev_emu *dev; + struct pci_xhci_dev_ep *devep; + struct xhci_dev_ctx *dev_ctx; + struct xhci_endp_ctx *ep_ctx; + uint32_t cmderr, epid; + uint32_t type; + + epid = XHCI_TRB_3_EP_GET(trb->dwTrb3); + + DPRINTF(("pci_xhci: reset ep %u: slot %u\r\n", epid, slot)); + + cmderr = XHCI_TRB_ERROR_SUCCESS; + + type = XHCI_TRB_3_TYPE_GET(trb->dwTrb3); + + dev = XHCI_SLOTDEV_PTR(sc, slot); + assert(dev != NULL); + + if (type == XHCI_TRB_TYPE_STOP_EP && + (trb->dwTrb3 & XHCI_TRB_3_SUSP_EP_BIT) != 0) { + /* XXX suspend endpoint for 10ms */ + } + + if (epid < 1 || epid > 31) { + DPRINTF(("pci_xhci: reset ep: invalid epid %u\r\n", epid)); + cmderr = XHCI_TRB_ERROR_TRB; + goto done; + } + + devep = &dev->eps[epid]; + if (devep->ep_xfer != NULL) + USB_DATA_XFER_RESET(devep->ep_xfer); + + dev_ctx = dev->dev_ctx; + assert(dev_ctx != NULL); + + ep_ctx = &dev_ctx->ctx_ep[epid]; + + ep_ctx->dwEpCtx0 = (ep_ctx->dwEpCtx0 & ~0x7) | XHCI_ST_EPCTX_STOPPED; + + if (XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0) == 0) + ep_ctx->qwEpCtx2 = devep->ep_ringaddr | devep->ep_ccs; + + DPRINTF(("pci_xhci: reset ep[%u] %08x %08x %016lx %08x\r\n", + epid, ep_ctx->dwEpCtx0, ep_ctx->dwEpCtx1, ep_ctx->qwEpCtx2, + ep_ctx->dwEpCtx4)); + + if (type == XHCI_TRB_TYPE_RESET_EP && + (dev->dev_ue->ue_reset == NULL || + dev->dev_ue->ue_reset(dev->dev_sc) < 0)) { + cmderr = XHCI_TRB_ERROR_ENDP_NOT_ON; + goto done; + } + +done: + return (cmderr); +} + + +static uint32_t +pci_xhci_find_stream(struct pci_xhci_softc *sc, struct xhci_endp_ctx *ep, + uint32_t streamid, struct xhci_stream_ctx **osctx) +{ + struct xhci_stream_ctx *sctx; + uint32_t maxpstreams; + + maxpstreams = XHCI_EPCTX_0_MAXP_STREAMS_GET(ep->dwEpCtx0); + if (maxpstreams == 0) + return (XHCI_TRB_ERROR_TRB); + + if (maxpstreams > XHCI_STREAMS_MAX) + return (XHCI_TRB_ERROR_INVALID_SID); + + if (XHCI_EPCTX_0_LSA_GET(ep->dwEpCtx0) == 0) { + DPRINTF(("pci_xhci: find_stream; LSA bit not set\r\n")); + return (XHCI_TRB_ERROR_INVALID_SID); + } + + /* only support primary stream */ + if (streamid > maxpstreams) + return (XHCI_TRB_ERROR_STREAM_TYPE); + + sctx = XHCI_GADDR(sc, ep->qwEpCtx2 & ~0xFUL) + streamid; + if (!XHCI_SCTX_0_SCT_GET(sctx->qwSctx0)) + return (XHCI_TRB_ERROR_STREAM_TYPE); + + *osctx = sctx; + + return (XHCI_TRB_ERROR_SUCCESS); +} + + +static uint32_t +pci_xhci_cmd_set_tr(struct pci_xhci_softc *sc, uint32_t slot, + struct xhci_trb *trb) +{ + struct pci_xhci_dev_emu *dev; + struct pci_xhci_dev_ep *devep; + struct xhci_dev_ctx *dev_ctx; + struct xhci_endp_ctx *ep_ctx; + uint32_t cmderr, epid; + uint32_t streamid; + + cmderr = XHCI_TRB_ERROR_SUCCESS; + + dev = XHCI_SLOTDEV_PTR(sc, slot); + assert(dev != NULL); + + DPRINTF(("pci_xhci set_tr: new-tr x%016lx, SCT %u DCS %u\r\n" + " stream-id %u, slot %u, epid %u, C %u\r\n", + (trb->qwTrb0 & ~0xF), (uint32_t)((trb->qwTrb0 >> 1) & 0x7), + (uint32_t)(trb->qwTrb0 & 0x1), (trb->dwTrb2 >> 16) & 0xFFFF, + XHCI_TRB_3_SLOT_GET(trb->dwTrb3), + XHCI_TRB_3_EP_GET(trb->dwTrb3), trb->dwTrb3 & 0x1)); + + epid = XHCI_TRB_3_EP_GET(trb->dwTrb3); + if (epid < 1 || epid > 31) { + DPRINTF(("pci_xhci: set_tr_deq: invalid epid %u\r\n", epid)); + cmderr = XHCI_TRB_ERROR_TRB; + goto done; + } + + dev_ctx = dev->dev_ctx; + assert(dev_ctx != NULL); + + ep_ctx = &dev_ctx->ctx_ep[epid]; + devep = &dev->eps[epid]; + + 
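+	/*
+	 * Illustrative note, not part of the original commit: per xHCI
+	 * 4.6.10 a Set TR Dequeue Pointer command is only valid while
+	 * the endpoint is Stopped or in Error, which the switch below
+	 * enforces.  qwTrb0 of the command TRB packs three fields:
+	 *
+	 *	bit 0		DCS, the new dequeue cycle state
+	 *	bits 3:1	SCT, stream context type
+	 *	bits 63:4	new dequeue pointer, 16-byte aligned
+	 *
+	 *	uint64_t deq = trb->qwTrb0 & ~0xFUL;
+	 *	int dcs = trb->qwTrb0 & 0x1;
+	 */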
switch (XHCI_EPCTX_0_EPSTATE_GET(ep_ctx->dwEpCtx0)) { + case XHCI_ST_EPCTX_STOPPED: + case XHCI_ST_EPCTX_ERROR: + break; + default: + DPRINTF(("pci_xhci cmd set_tr invalid state %x\r\n", + XHCI_EPCTX_0_EPSTATE_GET(ep_ctx->dwEpCtx0))); + cmderr = XHCI_TRB_ERROR_CONTEXT_STATE; + goto done; + } + + streamid = XHCI_TRB_2_STREAM_GET(trb->dwTrb2); + if (XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0) > 0) { + struct xhci_stream_ctx *sctx; + + sctx = NULL; + cmderr = pci_xhci_find_stream(sc, ep_ctx, streamid, &sctx); + if (sctx != NULL) { + assert(devep->ep_sctx != NULL); + + devep->ep_sctx[streamid].qwSctx0 = trb->qwTrb0; + devep->ep_sctx_trbs[streamid].ringaddr = + trb->qwTrb0 & ~0xF; + devep->ep_sctx_trbs[streamid].ccs = + XHCI_EPCTX_2_DCS_GET(trb->qwTrb0); + } + } else { + if (streamid != 0) { + DPRINTF(("pci_xhci cmd set_tr streamid %x != 0\r\n", + streamid)); + } + ep_ctx->qwEpCtx2 = trb->qwTrb0 & ~0xFUL; + devep->ep_ringaddr = ep_ctx->qwEpCtx2 & ~0xFUL; + devep->ep_ccs = trb->qwTrb0 & 0x1; + devep->ep_tr = XHCI_GADDR(sc, devep->ep_ringaddr); + + DPRINTF(("pci_xhci set_tr first TRB:\r\n")); + pci_xhci_dump_trb(devep->ep_tr); + } + ep_ctx->dwEpCtx0 = (ep_ctx->dwEpCtx0 & ~0x7) | XHCI_ST_EPCTX_STOPPED; + +done: + return (cmderr); +} + +static uint32_t +pci_xhci_cmd_eval_ctx(struct pci_xhci_softc *sc, uint32_t slot, + struct xhci_trb *trb) +{ + struct xhci_input_dev_ctx *input_ctx; + struct xhci_slot_ctx *islot_ctx; + struct xhci_dev_ctx *dev_ctx; + struct xhci_endp_ctx *ep0_ctx; + uint32_t cmderr; + + input_ctx = XHCI_GADDR(sc, trb->qwTrb0 & ~0xFUL); + islot_ctx = &input_ctx->ctx_slot; + ep0_ctx = &input_ctx->ctx_ep[1]; + + cmderr = XHCI_TRB_ERROR_SUCCESS; + DPRINTF(("pci_xhci: eval ctx, input ctl: D 0x%08x A 0x%08x,\r\n" + " slot %08x %08x %08x %08x\r\n" + " ep0 %08x %08x %016lx %08x\r\n", + input_ctx->ctx_input.dwInCtx0, input_ctx->ctx_input.dwInCtx1, + islot_ctx->dwSctx0, islot_ctx->dwSctx1, + islot_ctx->dwSctx2, islot_ctx->dwSctx3, + ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2, + ep0_ctx->dwEpCtx4)); + + /* this command expects drop-ctx=0 & add-ctx=slot+ep0 */ + if ((input_ctx->ctx_input.dwInCtx0 != 0) || + (input_ctx->ctx_input.dwInCtx1 & 0x03) == 0) { + DPRINTF(("pci_xhci: eval ctx, input ctl invalid\r\n")); + cmderr = XHCI_TRB_ERROR_TRB; + goto done; + } + + /* assign address to slot; in this emulation, slot_id = address */ + dev_ctx = pci_xhci_get_dev_ctx(sc, slot); + + DPRINTF(("pci_xhci: eval ctx, dev ctx\r\n" + " slot %08x %08x %08x %08x\r\n", + dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, + dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); + + if (input_ctx->ctx_input.dwInCtx1 & 0x01) { /* slot ctx */ + /* set max exit latency */ + dev_ctx->ctx_slot.dwSctx1 = FIELD_COPY( + dev_ctx->ctx_slot.dwSctx1, input_ctx->ctx_slot.dwSctx1, + 0xFFFF, 0); + + /* set interrupter target */ + dev_ctx->ctx_slot.dwSctx2 = FIELD_COPY( + dev_ctx->ctx_slot.dwSctx2, input_ctx->ctx_slot.dwSctx2, + 0x3FF, 22); + } + if (input_ctx->ctx_input.dwInCtx1 & 0x02) { /* control ctx */ + /* set max packet size */ + dev_ctx->ctx_ep[1].dwEpCtx1 = FIELD_COPY( + dev_ctx->ctx_ep[1].dwEpCtx1, ep0_ctx->dwEpCtx1, + 0xFFFF, 16); + + ep0_ctx = &dev_ctx->ctx_ep[1]; + } + + DPRINTF(("pci_xhci: eval ctx, output ctx\r\n" + " slot %08x %08x %08x %08x\r\n" + " ep0 %08x %08x %016lx %08x\r\n", + dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, + dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3, + ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2, + ep0_ctx->dwEpCtx4)); + +done: + return 
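+	/*
+	 * Illustrative note, not from the original commit: Evaluate
+	 * Context honors exactly two Add flags here, decoded from the
+	 * input control context:
+	 *
+	 *	int a0 = input_ctx->ctx_input.dwInCtx1 & 0x01;	(slot)
+	 *	int a1 = input_ctx->ctx_input.dwInCtx1 & 0x02;	(EP0)
+	 *
+	 * A0 updates max exit latency and the interrupter target in the
+	 * slot context; A1 updates the control endpoint's max packet
+	 * size.  All other contexts are left untouched.
+	 */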
(cmderr); +} + +static int +pci_xhci_complete_commands(struct pci_xhci_softc *sc) +{ + struct xhci_trb evtrb; + struct xhci_trb *trb; + uint64_t crcr; + uint32_t ccs; /* cycle state (XHCI 4.9.2) */ + uint32_t type; + uint32_t slot; + uint32_t cmderr; + int error; + + error = 0; + sc->opregs.crcr |= XHCI_CRCR_LO_CRR; + + trb = sc->opregs.cr_p; + ccs = sc->opregs.crcr & XHCI_CRCR_LO_RCS; + crcr = sc->opregs.crcr & ~0xF; + + while (1) { + sc->opregs.cr_p = trb; + + type = XHCI_TRB_3_TYPE_GET(trb->dwTrb3); + + if ((trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT) != + (ccs & XHCI_TRB_3_CYCLE_BIT)) + break; + + DPRINTF(("pci_xhci: cmd type 0x%x, Trb0 x%016lx dwTrb2 x%08x" + " dwTrb3 x%08x, TRB_CYCLE %u/ccs %u\r\n", + type, trb->qwTrb0, trb->dwTrb2, trb->dwTrb3, + trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT, ccs)); + + cmderr = XHCI_TRB_ERROR_SUCCESS; + evtrb.dwTrb2 = 0; + evtrb.dwTrb3 = (ccs & XHCI_TRB_3_CYCLE_BIT) | + XHCI_TRB_3_TYPE_SET(XHCI_TRB_EVENT_CMD_COMPLETE); + slot = 0; + + switch (type) { + case XHCI_TRB_TYPE_LINK: /* 0x06 */ + if (trb->dwTrb3 & XHCI_TRB_3_TC_BIT) + ccs ^= XHCI_CRCR_LO_RCS; + break; + + case XHCI_TRB_TYPE_ENABLE_SLOT: /* 0x09 */ + cmderr = pci_xhci_cmd_enable_slot(sc, &slot); + break; + + case XHCI_TRB_TYPE_DISABLE_SLOT: /* 0x0A */ + slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); + cmderr = pci_xhci_cmd_disable_slot(sc, slot); + break; + + case XHCI_TRB_TYPE_ADDRESS_DEVICE: /* 0x0B */ + slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); + cmderr = pci_xhci_cmd_address_device(sc, slot, trb); + break; + + case XHCI_TRB_TYPE_CONFIGURE_EP: /* 0x0C */ + slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); + cmderr = pci_xhci_cmd_config_ep(sc, slot, trb); + break; + + case XHCI_TRB_TYPE_EVALUATE_CTX: /* 0x0D */ + slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); + cmderr = pci_xhci_cmd_eval_ctx(sc, slot, trb); + break; + + case XHCI_TRB_TYPE_RESET_EP: /* 0x0E */ + DPRINTF(("Reset Endpoint on slot %d\r\n", slot)); + slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); + cmderr = pci_xhci_cmd_reset_ep(sc, slot, trb); + break; + + case XHCI_TRB_TYPE_STOP_EP: /* 0x0F */ + DPRINTF(("Stop Endpoint on slot %d\r\n", slot)); + slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); + cmderr = pci_xhci_cmd_reset_ep(sc, slot, trb); + break; + + case XHCI_TRB_TYPE_SET_TR_DEQUEUE: /* 0x10 */ + slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); + cmderr = pci_xhci_cmd_set_tr(sc, slot, trb); + break; + + case XHCI_TRB_TYPE_RESET_DEVICE: /* 0x11 */ + slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); + cmderr = pci_xhci_cmd_reset_device(sc, slot); + break; + + case XHCI_TRB_TYPE_FORCE_EVENT: /* 0x12 */ + /* TODO: */ + break; + + case XHCI_TRB_TYPE_NEGOTIATE_BW: /* 0x13 */ + break; + + case XHCI_TRB_TYPE_SET_LATENCY_TOL: /* 0x14 */ + break; + + case XHCI_TRB_TYPE_GET_PORT_BW: /* 0x15 */ + break; + + case XHCI_TRB_TYPE_FORCE_HEADER: /* 0x16 */ + break; + + case XHCI_TRB_TYPE_NOOP_CMD: /* 0x17 */ + break; + + default: + DPRINTF(("pci_xhci: unsupported cmd %x\r\n", type)); + break; + } + + if (type != XHCI_TRB_TYPE_LINK) { + /* + * insert command completion event and assert intr + */ + evtrb.qwTrb0 = crcr; + evtrb.dwTrb2 |= XHCI_TRB_2_ERROR_SET(cmderr); + evtrb.dwTrb3 |= XHCI_TRB_3_SLOT_SET(slot); + DPRINTF(("pci_xhci: command 0x%x result: 0x%x\r\n", + type, cmderr)); + pci_xhci_insert_event(sc, &evtrb, 1); + } + + trb = pci_xhci_trb_next(sc, trb, &crcr); + } + + sc->opregs.crcr = crcr | (sc->opregs.crcr & XHCI_CRCR_LO_CA) | ccs; + sc->opregs.crcr &= ~XHCI_CRCR_LO_CRR; + return (error); +} + +static void +pci_xhci_dump_trb(struct xhci_trb *trb) +{ + static const char *trbtypes[] = { + "RESERVED", 
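+	    /*
+	     * Editorial note, not part of the original commit: these
+	     * strings are ordered so that the array index equals the
+	     * TRB type code (XHCI_TRB_TYPE_NORMAL == 0x01 through
+	     * XHCI_TRB_TYPE_NOOP_CMD == 0x17), letting the dump below
+	     * index the table directly with
+	     * XHCI_TRB_3_TYPE_GET(trb->dwTrb3).
+	     */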
+ "NORMAL", + "SETUP_STAGE", + "DATA_STAGE", + "STATUS_STAGE", + "ISOCH", + "LINK", + "EVENT_DATA", + "NOOP", + "ENABLE_SLOT", + "DISABLE_SLOT", + "ADDRESS_DEVICE", + "CONFIGURE_EP", + "EVALUATE_CTX", + "RESET_EP", + "STOP_EP", + "SET_TR_DEQUEUE", + "RESET_DEVICE", + "FORCE_EVENT", + "NEGOTIATE_BW", + "SET_LATENCY_TOL", + "GET_PORT_BW", + "FORCE_HEADER", + "NOOP_CMD" + }; + uint32_t type; + + type = XHCI_TRB_3_TYPE_GET(trb->dwTrb3); + DPRINTF(("pci_xhci: trb[@%p] type x%02x %s 0:x%016lx 2:x%08x 3:x%08x\r\n", + trb, type, + type <= XHCI_TRB_TYPE_NOOP_CMD ? trbtypes[type] : "INVALID", + trb->qwTrb0, trb->dwTrb2, trb->dwTrb3)); +} + +static int +pci_xhci_xfer_complete(struct pci_xhci_softc *sc, struct usb_data_xfer *xfer, + uint32_t slot, uint32_t epid, int *do_intr) +{ + struct pci_xhci_dev_emu *dev; + struct pci_xhci_dev_ep *devep; + struct xhci_dev_ctx *dev_ctx; + struct xhci_endp_ctx *ep_ctx; + struct xhci_trb *trb; + struct xhci_trb evtrb; + uint32_t trbflags; + uint32_t edtla; + int i, err; + + dev = XHCI_SLOTDEV_PTR(sc, slot); + devep = &dev->eps[epid]; + dev_ctx = pci_xhci_get_dev_ctx(sc, slot); + + assert(dev_ctx != NULL); + + ep_ctx = &dev_ctx->ctx_ep[epid]; + + err = XHCI_TRB_ERROR_SUCCESS; + *do_intr = 0; + edtla = 0; + + /* go through list of TRBs and insert event(s) */ + for (i = xfer->head; xfer->ndata > 0; ) { + evtrb.qwTrb0 = (uint64_t)xfer->data[i].hci_data; + trb = XHCI_GADDR(sc, evtrb.qwTrb0); + trbflags = trb->dwTrb3; + + DPRINTF(("pci_xhci: xfer[%d] done?%u:%d trb %x %016lx %x " + "(err %d) IOC?%d\r\n", + i, xfer->data[i].processed, xfer->data[i].blen, + XHCI_TRB_3_TYPE_GET(trbflags), evtrb.qwTrb0, + trbflags, err, + trb->dwTrb3 & XHCI_TRB_3_IOC_BIT ? 1 : 0)); + + if (!xfer->data[i].processed) { + xfer->head = i; + break; + } + + xfer->ndata--; + edtla += xfer->data[i].bdone; + + trb->dwTrb3 = (trb->dwTrb3 & ~0x1) | (xfer->data[i].ccs); + + pci_xhci_update_ep_ring(sc, dev, devep, ep_ctx, + xfer->data[i].streamid, xfer->data[i].trbnext, + xfer->data[i].ccs); + + /* Only interrupt if IOC or short packet */ + if (!(trb->dwTrb3 & XHCI_TRB_3_IOC_BIT) && + !((err == XHCI_TRB_ERROR_SHORT_PKT) && + (trb->dwTrb3 & XHCI_TRB_3_ISP_BIT))) { + + i = (i + 1) % USB_MAX_XFER_BLOCKS; + continue; + } + + evtrb.dwTrb2 = XHCI_TRB_2_ERROR_SET(err) | + XHCI_TRB_2_REM_SET(xfer->data[i].blen); + + evtrb.dwTrb3 = XHCI_TRB_3_TYPE_SET(XHCI_TRB_EVENT_TRANSFER) | + XHCI_TRB_3_SLOT_SET(slot) | XHCI_TRB_3_EP_SET(epid); + + if (XHCI_TRB_3_TYPE_GET(trbflags) == XHCI_TRB_TYPE_EVENT_DATA) { + DPRINTF(("pci_xhci EVENT_DATA edtla %u\r\n", edtla)); + evtrb.qwTrb0 = trb->qwTrb0; + evtrb.dwTrb2 = (edtla & 0xFFFFF) | + XHCI_TRB_2_ERROR_SET(err); + evtrb.dwTrb3 |= XHCI_TRB_3_ED_BIT; + edtla = 0; + } + + *do_intr = 1; + + err = pci_xhci_insert_event(sc, &evtrb, 0); + if (err != XHCI_TRB_ERROR_SUCCESS) { + break; + } + + i = (i + 1) % USB_MAX_XFER_BLOCKS; + } + + return (err); +} + +static void +pci_xhci_update_ep_ring(struct pci_xhci_softc *sc, struct pci_xhci_dev_emu *dev, + struct pci_xhci_dev_ep *devep, struct xhci_endp_ctx *ep_ctx, + uint32_t streamid, uint64_t ringaddr, int ccs) +{ + + if (XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0) != 0) { + devep->ep_sctx[streamid].qwSctx0 = (ringaddr & ~0xFUL) | + (ccs & 0x1); + + devep->ep_sctx_trbs[streamid].ringaddr = ringaddr & ~0xFUL; + devep->ep_sctx_trbs[streamid].ccs = ccs & 0x1; + ep_ctx->qwEpCtx2 = (ep_ctx->qwEpCtx2 & ~0x1) | (ccs & 0x1); + + DPRINTF(("xhci update ep-ring stream %d, addr %lx\r\n", + streamid, devep->ep_sctx[streamid].qwSctx0)); + } else { + 
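+		/*
+		 * Illustrative note, not from the original commit: with
+		 * no streams the ring cursor is mirrored in two places
+		 * that must agree, the emulation's private state and
+		 * the guest-visible TR Dequeue Pointer, whose low bit
+		 * is the dequeue cycle state:
+		 *
+		 *	devep->ep_ringaddr = ringaddr & ~0xFUL;
+		 *	devep->ep_ccs = ccs & 0x1;
+		 *	ep_ctx->qwEpCtx2 = (ringaddr & ~0xFUL) | (ccs & 0x1);
+		 *
+		 * The statements below do exactly this.
+		 */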
devep->ep_ringaddr = ringaddr & ~0xFUL; + devep->ep_ccs = ccs & 0x1; + devep->ep_tr = XHCI_GADDR(sc, ringaddr & ~0xFUL); + ep_ctx->qwEpCtx2 = (ringaddr & ~0xFUL) | (ccs & 0x1); + + DPRINTF(("xhci update ep-ring, addr %lx\r\n", + (devep->ep_ringaddr | devep->ep_ccs))); + } +} + +/* + * Outstanding transfer still in progress (device NAK'd earlier) so retry + * the transfer again to see if it succeeds. + */ +static int +pci_xhci_try_usb_xfer(struct pci_xhci_softc *sc, + struct pci_xhci_dev_emu *dev, struct pci_xhci_dev_ep *devep, + struct xhci_endp_ctx *ep_ctx, uint32_t slot, uint32_t epid) +{ + struct usb_data_xfer *xfer; + int err; + int do_intr; + + ep_ctx->dwEpCtx0 = FIELD_REPLACE( + ep_ctx->dwEpCtx0, XHCI_ST_EPCTX_RUNNING, 0x7, 0); + + err = 0; + do_intr = 0; + + xfer = devep->ep_xfer; +#ifdef __FreeBSD__ + USB_DATA_XFER_LOCK(xfer); +#else + /* + * At least one caller needs to hold this lock across the call to this + * function and other code. To avoid deadlock from a recursive mutex + * enter, we ensure that all callers hold this lock. + */ + assert(USB_DATA_XFER_LOCK_HELD(xfer)); +#endif + + /* outstanding requests queued up */ + if (dev->dev_ue->ue_data != NULL) { + err = dev->dev_ue->ue_data(dev->dev_sc, xfer, + epid & 0x1 ? USB_XFER_IN : USB_XFER_OUT, epid/2); + if (err == USB_ERR_CANCELLED) { + if (USB_DATA_GET_ERRCODE(&xfer->data[xfer->head]) == + USB_NAK) + err = XHCI_TRB_ERROR_SUCCESS; + } else { + err = pci_xhci_xfer_complete(sc, xfer, slot, epid, + &do_intr); + if (err == XHCI_TRB_ERROR_SUCCESS && do_intr) { + pci_xhci_assert_interrupt(sc); + } + + + /* XXX should not do it if error? */ + USB_DATA_XFER_RESET(xfer); + } + } + +#ifdef __FreeBSD__ + USB_DATA_XFER_UNLOCK(xfer); +#endif + + return (err); +} + + +static int +pci_xhci_handle_transfer(struct pci_xhci_softc *sc, + struct pci_xhci_dev_emu *dev, struct pci_xhci_dev_ep *devep, + struct xhci_endp_ctx *ep_ctx, struct xhci_trb *trb, uint32_t slot, + uint32_t epid, uint64_t addr, uint32_t ccs, uint32_t streamid) +{ + struct xhci_trb *setup_trb; + struct usb_data_xfer *xfer; + struct usb_data_xfer_block *xfer_block; + uint64_t val; + uint32_t trbflags; + int do_intr, err; + int do_retry; + + ep_ctx->dwEpCtx0 = FIELD_REPLACE(ep_ctx->dwEpCtx0, + XHCI_ST_EPCTX_RUNNING, 0x7, 0); + + xfer = devep->ep_xfer; + USB_DATA_XFER_LOCK(xfer); + + DPRINTF(("pci_xhci handle_transfer slot %u\r\n", slot)); + +retry: + err = 0; + do_retry = 0; + do_intr = 0; + setup_trb = NULL; + + while (1) { + pci_xhci_dump_trb(trb); + + trbflags = trb->dwTrb3; + + if (XHCI_TRB_3_TYPE_GET(trbflags) != XHCI_TRB_TYPE_LINK && + (trbflags & XHCI_TRB_3_CYCLE_BIT) != + (ccs & XHCI_TRB_3_CYCLE_BIT)) { + DPRINTF(("Cycle-bit changed trbflags %x, ccs %x\r\n", + trbflags & XHCI_TRB_3_CYCLE_BIT, ccs)); + break; + } + + xfer_block = NULL; + + switch (XHCI_TRB_3_TYPE_GET(trbflags)) { + case XHCI_TRB_TYPE_LINK: + if (trb->dwTrb3 & XHCI_TRB_3_TC_BIT) + ccs ^= 0x1; + + xfer_block = usb_data_xfer_append(xfer, NULL, 0, + (void *)addr, ccs); + xfer_block->processed = 1; + break; + + case XHCI_TRB_TYPE_SETUP_STAGE: + if ((trbflags & XHCI_TRB_3_IDT_BIT) == 0 || + XHCI_TRB_2_BYTES_GET(trb->dwTrb2) != 8) { + DPRINTF(("pci_xhci: invalid setup trb\r\n")); + err = XHCI_TRB_ERROR_TRB; + goto errout; + } + setup_trb = trb; + + val = trb->qwTrb0; + if (!xfer->ureq) + xfer->ureq = malloc( + sizeof(struct usb_device_request)); + memcpy(xfer->ureq, &val, + sizeof(struct usb_device_request)); + + xfer_block = usb_data_xfer_append(xfer, NULL, 0, + (void *)addr, ccs); + xfer_block->processed = 
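+			/*
+			 * Illustrative note, not from the original
+			 * commit: a Setup Stage TRB carries the 8-byte
+			 * SETUP packet inline (IDT set, length 8, as
+			 * validated above), so qwTrb0 is the
+			 * usb_device_request itself:
+			 *
+			 *	struct usb_device_request req;
+			 *	memcpy(&req, &trb->qwTrb0, sizeof (req));
+			 *	(req.bmRequestType, req.bRequest, ...)
+			 *
+			 * The block is marked processed immediately
+			 * because no bus data moves for the setup TRB.
+			 */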
1; + break; + + case XHCI_TRB_TYPE_NORMAL: + case XHCI_TRB_TYPE_ISOCH: + if (setup_trb != NULL) { + DPRINTF(("pci_xhci: trb not supposed to be in " + "ctl scope\r\n")); + err = XHCI_TRB_ERROR_TRB; + goto errout; + } + /* fall through */ + + case XHCI_TRB_TYPE_DATA_STAGE: + xfer_block = usb_data_xfer_append(xfer, + (void *)(trbflags & XHCI_TRB_3_IDT_BIT ? + &trb->qwTrb0 : XHCI_GADDR(sc, trb->qwTrb0)), + trb->dwTrb2 & 0x1FFFF, (void *)addr, ccs); + break; + + case XHCI_TRB_TYPE_STATUS_STAGE: + xfer_block = usb_data_xfer_append(xfer, NULL, 0, + (void *)addr, ccs); + break; + + case XHCI_TRB_TYPE_NOOP: + xfer_block = usb_data_xfer_append(xfer, NULL, 0, + (void *)addr, ccs); + xfer_block->processed = 1; + break; + + case XHCI_TRB_TYPE_EVENT_DATA: + xfer_block = usb_data_xfer_append(xfer, NULL, 0, + (void *)addr, ccs); + if ((epid > 1) && (trbflags & XHCI_TRB_3_IOC_BIT)) { + xfer_block->processed = 1; + } + break; + + default: + DPRINTF(("pci_xhci: handle xfer unexpected trb type " + "0x%x\r\n", + XHCI_TRB_3_TYPE_GET(trbflags))); + err = XHCI_TRB_ERROR_TRB; + goto errout; + } + + trb = pci_xhci_trb_next(sc, trb, &addr); + + DPRINTF(("pci_xhci: next trb: 0x%lx\r\n", (uint64_t)trb)); + + if (xfer_block) { + xfer_block->trbnext = addr; + xfer_block->streamid = streamid; + } + + if (!setup_trb && !(trbflags & XHCI_TRB_3_CHAIN_BIT) && + XHCI_TRB_3_TYPE_GET(trbflags) != XHCI_TRB_TYPE_LINK) { + break; + } + + /* handle current batch that requires interrupt on complete */ + if (trbflags & XHCI_TRB_3_IOC_BIT) { + DPRINTF(("pci_xhci: trb IOC bit set\r\n")); + if (epid == 1) + do_retry = 1; + break; + } + } + + DPRINTF(("pci_xhci[%d]: xfer->ndata %u\r\n", __LINE__, xfer->ndata)); + + if (epid == 1) { + err = USB_ERR_NOT_STARTED; + if (dev->dev_ue->ue_request != NULL) + err = dev->dev_ue->ue_request(dev->dev_sc, xfer); + setup_trb = NULL; + } else { + /* handle data transfer */ + pci_xhci_try_usb_xfer(sc, dev, devep, ep_ctx, slot, epid); + err = XHCI_TRB_ERROR_SUCCESS; + goto errout; + } + + err = USB_TO_XHCI_ERR(err); + if ((err == XHCI_TRB_ERROR_SUCCESS) || + (err == XHCI_TRB_ERROR_SHORT_PKT)) { + err = pci_xhci_xfer_complete(sc, xfer, slot, epid, &do_intr); + if (err != XHCI_TRB_ERROR_SUCCESS) + do_retry = 0; + } + +errout: + if (err == XHCI_TRB_ERROR_EV_RING_FULL) + DPRINTF(("pci_xhci[%d]: event ring full\r\n", __LINE__)); + + if (!do_retry) + USB_DATA_XFER_UNLOCK(xfer); + + if (do_intr) + pci_xhci_assert_interrupt(sc); + + if (do_retry) { + USB_DATA_XFER_RESET(xfer); + DPRINTF(("pci_xhci[%d]: retry:continuing with next TRBs\r\n", + __LINE__)); + goto retry; + } + + if (epid == 1) + USB_DATA_XFER_RESET(xfer); + + return (err); +} + +static void +pci_xhci_device_doorbell(struct pci_xhci_softc *sc, uint32_t slot, + uint32_t epid, uint32_t streamid) +{ + struct pci_xhci_dev_emu *dev; + struct pci_xhci_dev_ep *devep; + struct xhci_dev_ctx *dev_ctx; + struct xhci_endp_ctx *ep_ctx; + struct pci_xhci_trb_ring *sctx_tr; + struct xhci_trb *trb; + uint64_t ringaddr; + uint32_t ccs; + + DPRINTF(("pci_xhci doorbell slot %u epid %u stream %u\r\n", + slot, epid, streamid)); + + if (slot == 0 || slot > sc->ndevices) { + DPRINTF(("pci_xhci: invalid doorbell slot %u\r\n", slot)); + return; + } + + dev = XHCI_SLOTDEV_PTR(sc, slot); + devep = &dev->eps[epid]; + dev_ctx = pci_xhci_get_dev_ctx(sc, slot); + if (!dev_ctx) { + return; + } + ep_ctx = &dev_ctx->ctx_ep[epid]; + + sctx_tr = NULL; + + DPRINTF(("pci_xhci: device doorbell ep[%u] %08x %08x %016lx %08x\r\n", + epid, ep_ctx->dwEpCtx0, ep_ctx->dwEpCtx1, ep_ctx->qwEpCtx2, 
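+	    /*
+	     * Illustrative note, not from the original commit: the
+	     * doorbell write that got us here encodes its target in the
+	     * register value, per the standard xHCI layout:
+	     *
+	     *	epid = value & 0xFF;			(DB Target)
+	     *	streamid = (value >> 16) & 0xFFFF;	(DB Stream ID)
+	     *
+	     * epid 1 is the control endpoint; endpoint n maps to epid
+	     * 2n for OUT and 2n+1 for IN.
+	     */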
+ ep_ctx->dwEpCtx4)); + + if (ep_ctx->qwEpCtx2 == 0) + return; + + /* handle pending transfers */ + if (devep->ep_xfer->ndata > 0) { +#ifndef __FreeBSD__ + USB_DATA_XFER_LOCK(devep->ep_xfer); +#endif + pci_xhci_try_usb_xfer(sc, dev, devep, ep_ctx, slot, epid); +#ifndef __FreeBSD__ + USB_DATA_XFER_UNLOCK(devep->ep_xfer); +#endif + return; + } + + /* get next trb work item */ + if (XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0) != 0) { + sctx_tr = &devep->ep_sctx_trbs[streamid]; + ringaddr = sctx_tr->ringaddr; + ccs = sctx_tr->ccs; + trb = XHCI_GADDR(sc, sctx_tr->ringaddr & ~0xFUL); + DPRINTF(("doorbell, stream %u, ccs %lx, trb ccs %x\r\n", + streamid, ep_ctx->qwEpCtx2 & XHCI_TRB_3_CYCLE_BIT, + trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT)); + } else { + ringaddr = devep->ep_ringaddr; + ccs = devep->ep_ccs; + trb = devep->ep_tr; + DPRINTF(("doorbell, ccs %lx, trb ccs %x\r\n", + ep_ctx->qwEpCtx2 & XHCI_TRB_3_CYCLE_BIT, + trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT)); + } + + if (XHCI_TRB_3_TYPE_GET(trb->dwTrb3) == 0) { + DPRINTF(("pci_xhci: ring %lx trb[%lx] EP %u is RESERVED?\r\n", + ep_ctx->qwEpCtx2, devep->ep_ringaddr, epid)); + return; + } + + pci_xhci_handle_transfer(sc, dev, devep, ep_ctx, trb, slot, epid, + ringaddr, ccs, streamid); +} + +static void +pci_xhci_dbregs_write(struct pci_xhci_softc *sc, uint64_t offset, + uint64_t value) +{ + + offset = (offset - sc->dboff) / sizeof(uint32_t); + + DPRINTF(("pci_xhci: doorbell write offset 0x%lx: 0x%lx\r\n", + offset, value)); + + if (XHCI_HALTED(sc)) { + DPRINTF(("pci_xhci: controller halted\r\n")); + return; + } + + if (offset == 0) + pci_xhci_complete_commands(sc); + else if (sc->portregs != NULL) + pci_xhci_device_doorbell(sc, offset, + XHCI_DB_TARGET_GET(value), XHCI_DB_SID_GET(value)); +} + +static void +pci_xhci_rtsregs_write(struct pci_xhci_softc *sc, uint64_t offset, + uint64_t value) +{ + struct pci_xhci_rtsregs *rts; + + offset -= sc->rtsoff; + + if (offset == 0) { + DPRINTF(("pci_xhci attempted write to MFINDEX\r\n")); + return; + } + + DPRINTF(("pci_xhci: runtime regs write offset 0x%lx: 0x%lx\r\n", + offset, value)); + + offset -= 0x20; /* start of intrreg */ + + rts = &sc->rtsregs; + + switch (offset) { + case 0x00: + if (value & XHCI_IMAN_INTR_PEND) + rts->intrreg.iman &= ~XHCI_IMAN_INTR_PEND; + rts->intrreg.iman = (value & XHCI_IMAN_INTR_ENA) | + (rts->intrreg.iman & XHCI_IMAN_INTR_PEND); + + if (!(value & XHCI_IMAN_INTR_ENA)) + pci_xhci_deassert_interrupt(sc); + + break; + + case 0x04: + rts->intrreg.imod = value; + break; + + case 0x08: + rts->intrreg.erstsz = value & 0xFFFF; + break; + + case 0x10: + /* ERSTBA low bits */ + rts->intrreg.erstba = MASK_64_HI(sc->rtsregs.intrreg.erstba) | + (value & ~0x3F); + break; + + case 0x14: + /* ERSTBA high bits */ + rts->intrreg.erstba = (value << 32) | + MASK_64_LO(sc->rtsregs.intrreg.erstba); + + rts->erstba_p = XHCI_GADDR(sc, + sc->rtsregs.intrreg.erstba & ~0x3FUL); + + rts->erst_p = XHCI_GADDR(sc, + sc->rtsregs.erstba_p->qwEvrsTablePtr & ~0x3FUL); + + rts->er_enq_idx = 0; + rts->er_events_cnt = 0; + + DPRINTF(("pci_xhci: wr erstba erst (%p) ptr 0x%lx, sz %u\r\n", + rts->erstba_p, + rts->erstba_p->qwEvrsTablePtr, + rts->erstba_p->dwEvrsTableSize)); + break; + + case 0x18: + /* ERDP low bits */ + rts->intrreg.erdp = + MASK_64_HI(sc->rtsregs.intrreg.erdp) | + (rts->intrreg.erdp & XHCI_ERDP_LO_BUSY) | + (value & ~0xF); + if (value & XHCI_ERDP_LO_BUSY) { + rts->intrreg.erdp &= ~XHCI_ERDP_LO_BUSY; + rts->intrreg.iman &= ~XHCI_IMAN_INTR_PEND; + } + + rts->er_deq_seg = XHCI_ERDP_LO_SINDEX(value); + + 
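+	/*
+	 * Illustrative note, not part of the original commit: the guest
+	 * acknowledges consumed events by writing ERDP with the EHB
+	 * (busy) bit set, which the code above answers by clearing EHB
+	 * and the IMAN pending bit.  The outstanding-event count is
+	 * then recomputed on the high-half write (case 0x1C below) as
+	 * the ring distance from dequeue to enqueue:
+	 *
+	 *	cnt = (deq <= enq) ? enq - deq : tblsz - (deq - enq);
+	 */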
break; + + case 0x1C: + /* ERDP high bits */ + rts->intrreg.erdp = (value << 32) | + MASK_64_LO(sc->rtsregs.intrreg.erdp); + + if (rts->er_events_cnt > 0) { + uint64_t erdp; + uint32_t erdp_i; + + erdp = rts->intrreg.erdp & ~0xF; + erdp_i = (erdp - rts->erstba_p->qwEvrsTablePtr) / + sizeof(struct xhci_trb); + + if (erdp_i <= rts->er_enq_idx) + rts->er_events_cnt = rts->er_enq_idx - erdp_i; + else + rts->er_events_cnt = + rts->erstba_p->dwEvrsTableSize - + (erdp_i - rts->er_enq_idx); + + DPRINTF(("pci_xhci: erdp 0x%lx, events cnt %u\r\n", + erdp, rts->er_events_cnt)); + } + + break; + + default: + DPRINTF(("pci_xhci attempted write to RTS offset 0x%lx\r\n", + offset)); + break; + } +} + +static uint64_t +pci_xhci_portregs_read(struct pci_xhci_softc *sc, uint64_t offset) +{ + int port; + uint32_t *p; + + if (sc->portregs == NULL) + return (0); + + port = (offset - 0x3F0) / 0x10; + + if (port > XHCI_MAX_DEVS) { + DPRINTF(("pci_xhci: portregs_read port %d >= XHCI_MAX_DEVS\r\n", + port)); + + /* return default value for unused port */ + return (XHCI_PS_SPEED_SET(3)); + } + + offset = (offset - 0x3F0) % 0x10; + + p = &sc->portregs[port].portsc; + p += offset / sizeof(uint32_t); + + DPRINTF(("pci_xhci: portregs read offset 0x%lx port %u -> 0x%x\r\n", + offset, port, *p)); + + return (*p); +} + +static void +pci_xhci_hostop_write(struct pci_xhci_softc *sc, uint64_t offset, + uint64_t value) +{ + offset -= XHCI_CAPLEN; + + if (offset < 0x400) + DPRINTF(("pci_xhci: hostop write offset 0x%lx: 0x%lx\r\n", + offset, value)); + + switch (offset) { + case XHCI_USBCMD: + sc->opregs.usbcmd = pci_xhci_usbcmd_write(sc, value & 0x3F0F); + break; + + case XHCI_USBSTS: + /* clear bits on write */ + sc->opregs.usbsts &= ~(value & + (XHCI_STS_HSE|XHCI_STS_EINT|XHCI_STS_PCD|XHCI_STS_SSS| + XHCI_STS_RSS|XHCI_STS_SRE|XHCI_STS_CNR)); + break; + + case XHCI_PAGESIZE: + /* read only */ + break; + + case XHCI_DNCTRL: + sc->opregs.dnctrl = value & 0xFFFF; + break; + + case XHCI_CRCR_LO: + if (sc->opregs.crcr & XHCI_CRCR_LO_CRR) { + sc->opregs.crcr &= ~(XHCI_CRCR_LO_CS|XHCI_CRCR_LO_CA); + sc->opregs.crcr |= value & + (XHCI_CRCR_LO_CS|XHCI_CRCR_LO_CA); + } else { + sc->opregs.crcr = MASK_64_HI(sc->opregs.crcr) | + (value & (0xFFFFFFC0 | XHCI_CRCR_LO_RCS)); + } + break; + + case XHCI_CRCR_HI: + if (!(sc->opregs.crcr & XHCI_CRCR_LO_CRR)) { + sc->opregs.crcr = MASK_64_LO(sc->opregs.crcr) | + (value << 32); + + sc->opregs.cr_p = XHCI_GADDR(sc, + sc->opregs.crcr & ~0xF); + } + + if (sc->opregs.crcr & XHCI_CRCR_LO_CS) { + /* Stop operation of Command Ring */ + } + + if (sc->opregs.crcr & XHCI_CRCR_LO_CA) { + /* Abort command */ + } + + break; + + case XHCI_DCBAAP_LO: + sc->opregs.dcbaap = MASK_64_HI(sc->opregs.dcbaap) | + (value & 0xFFFFFFC0); + break; + + case XHCI_DCBAAP_HI: + sc->opregs.dcbaap = MASK_64_LO(sc->opregs.dcbaap) | + (value << 32); + sc->opregs.dcbaa_p = XHCI_GADDR(sc, sc->opregs.dcbaap & ~0x3FUL); + + DPRINTF(("pci_xhci: opregs dcbaap = 0x%lx (vaddr 0x%lx)\r\n", + sc->opregs.dcbaap, (uint64_t)sc->opregs.dcbaa_p)); + break; + + case XHCI_CONFIG: + sc->opregs.config = value & 0x03FF; + break; + + default: + if (offset >= 0x400) + pci_xhci_portregs_write(sc, offset, value); + + break; + } +} + + +static void +pci_xhci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value) +{ + struct pci_xhci_softc *sc; + + sc = pi->pi_arg; + + assert(baridx == 0); + + + pthread_mutex_lock(&sc->mtx); + if (offset < XHCI_CAPLEN) /* read only registers */ + 
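+	/*
+	 * Illustrative note, not from the original commit: BAR 0 is
+	 * carved into consecutive windows and dispatched purely by
+	 * offset, mirroring the read path later in this file:
+	 *
+	 *	[0, XHCI_CAPLEN)	capability regs, read-only
+	 *	[XHCI_CAPLEN, dboff)	operational and port regs
+	 *	[dboff, rtsoff)		doorbell array
+	 *	[rtsoff, regsend)	runtime regs (one interrupter)
+	 *	[regsend, regsend+128)	extended capabilities
+	 */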
WPRINTF(("pci_xhci: write RO-CAPs offset %ld\r\n", offset)); + else if (offset < sc->dboff) + pci_xhci_hostop_write(sc, offset, value); + else if (offset < sc->rtsoff) + pci_xhci_dbregs_write(sc, offset, value); + else if (offset < sc->regsend) + pci_xhci_rtsregs_write(sc, offset, value); + else + WPRINTF(("pci_xhci: write invalid offset %ld\r\n", offset)); + + pthread_mutex_unlock(&sc->mtx); +} + +static uint64_t +pci_xhci_hostcap_read(struct pci_xhci_softc *sc, uint64_t offset) +{ + uint64_t value; + + switch (offset) { + case XHCI_CAPLENGTH: /* 0x00 */ + value = sc->caplength; + break; + + case XHCI_HCSPARAMS1: /* 0x04 */ + value = sc->hcsparams1; + break; + + case XHCI_HCSPARAMS2: /* 0x08 */ + value = sc->hcsparams2; + break; + + case XHCI_HCSPARAMS3: /* 0x0C */ + value = sc->hcsparams3; + break; + + case XHCI_HCSPARAMS0: /* 0x10 */ + value = sc->hccparams1; + break; + + case XHCI_DBOFF: /* 0x14 */ + value = sc->dboff; + break; + + case XHCI_RTSOFF: /* 0x18 */ + value = sc->rtsoff; + break; + + case XHCI_HCCPRAMS2: /* 0x1C */ + value = sc->hccparams2; + break; + + default: + value = 0; + break; + } + + DPRINTF(("pci_xhci: hostcap read offset 0x%lx -> 0x%lx\r\n", + offset, value)); + + return (value); +} + +static uint64_t +pci_xhci_hostop_read(struct pci_xhci_softc *sc, uint64_t offset) +{ + uint64_t value; + + offset = (offset - XHCI_CAPLEN); + + switch (offset) { + case XHCI_USBCMD: /* 0x00 */ + value = sc->opregs.usbcmd; + break; + + case XHCI_USBSTS: /* 0x04 */ + value = sc->opregs.usbsts; + break; + + case XHCI_PAGESIZE: /* 0x08 */ + value = sc->opregs.pgsz; + break; + + case XHCI_DNCTRL: /* 0x14 */ + value = sc->opregs.dnctrl; + break; + + case XHCI_CRCR_LO: /* 0x18 */ + value = sc->opregs.crcr & XHCI_CRCR_LO_CRR; + break; + + case XHCI_CRCR_HI: /* 0x1C */ + value = 0; + break; + + case XHCI_DCBAAP_LO: /* 0x30 */ + value = sc->opregs.dcbaap & 0xFFFFFFFF; + break; + + case XHCI_DCBAAP_HI: /* 0x34 */ + value = (sc->opregs.dcbaap >> 32) & 0xFFFFFFFF; + break; + + case XHCI_CONFIG: /* 0x38 */ + value = sc->opregs.config; + break; + + default: + if (offset >= 0x400) + value = pci_xhci_portregs_read(sc, offset); + else + value = 0; + + break; + } + + if (offset < 0x400) + DPRINTF(("pci_xhci: hostop read offset 0x%lx -> 0x%lx\r\n", + offset, value)); + + return (value); +} + +static uint64_t +pci_xhci_dbregs_read(struct pci_xhci_softc *sc, uint64_t offset) +{ + + /* read doorbell always returns 0 */ + return (0); +} + +static uint64_t +pci_xhci_rtsregs_read(struct pci_xhci_softc *sc, uint64_t offset) +{ + uint32_t value; + + offset -= sc->rtsoff; + value = 0; + + if (offset == XHCI_MFINDEX) { + value = sc->rtsregs.mfindex; + } else if (offset >= 0x20) { + int item; + uint32_t *p; + + offset -= 0x20; + item = offset % 32; + + assert(offset < sizeof(sc->rtsregs.intrreg)); + + p = &sc->rtsregs.intrreg.iman; + p += item / sizeof(uint32_t); + value = *p; + } + + DPRINTF(("pci_xhci: rtsregs read offset 0x%lx -> 0x%x\r\n", + offset, value)); + + return (value); +} + +static uint64_t +pci_xhci_xecp_read(struct pci_xhci_softc *sc, uint64_t offset) +{ + uint32_t value; + + offset -= sc->regsend; + value = 0; + + switch (offset) { + case 0: + /* rev major | rev minor | next-cap | cap-id */ + value = (0x02 << 24) | (4 << 8) | XHCI_ID_PROTOCOLS; + break; + case 4: + /* name string = "USB" */ + value = 0x20425355; + break; + case 8: + /* psic | proto-defined | compat # | compat offset */ + value = ((XHCI_MAX_DEVS/2) << 8) | sc->usb2_port_start; + break; + case 12: + break; + case 16: + /* rev major | 
rev minor | next-cap | cap-id */ + value = (0x03 << 24) | XHCI_ID_PROTOCOLS; + break; + case 20: + /* name string = "USB" */ + value = 0x20425355; + break; + case 24: + /* psic | proto-defined | compat # | compat offset */ + value = ((XHCI_MAX_DEVS/2) << 8) | sc->usb3_port_start; + break; + case 28: + break; + default: + DPRINTF(("pci_xhci: xecp invalid offset 0x%lx\r\n", offset)); + break; + } + + DPRINTF(("pci_xhci: xecp read offset 0x%lx -> 0x%x\r\n", + offset, value)); + + return (value); +} + + +static uint64_t +pci_xhci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size) +{ + struct pci_xhci_softc *sc; + uint32_t value; + + sc = pi->pi_arg; + + assert(baridx == 0); + + pthread_mutex_lock(&sc->mtx); + if (offset < XHCI_CAPLEN) + value = pci_xhci_hostcap_read(sc, offset); + else if (offset < sc->dboff) + value = pci_xhci_hostop_read(sc, offset); + else if (offset < sc->rtsoff) + value = pci_xhci_dbregs_read(sc, offset); + else if (offset < sc->regsend) + value = pci_xhci_rtsregs_read(sc, offset); + else if (offset < (sc->regsend + 4*32)) + value = pci_xhci_xecp_read(sc, offset); + else { + value = 0; + WPRINTF(("pci_xhci: read invalid offset %ld\r\n", offset)); + } + + pthread_mutex_unlock(&sc->mtx); + + switch (size) { + case 1: + value &= 0xFF; + break; + case 2: + value &= 0xFFFF; + break; + case 4: + value &= 0xFFFFFFFF; + break; + } + + return (value); +} + +static void +pci_xhci_reset_port(struct pci_xhci_softc *sc, int portn, int warm) +{ + struct pci_xhci_portregs *port; + struct pci_xhci_dev_emu *dev; + struct xhci_trb evtrb; + int error; + + assert(portn <= XHCI_MAX_DEVS); + + DPRINTF(("xhci reset port %d\r\n", portn)); + + port = XHCI_PORTREG_PTR(sc, portn); + dev = XHCI_DEVINST_PTR(sc, portn); + if (dev) { + port->portsc &= ~(XHCI_PS_PLS_MASK | XHCI_PS_PR | XHCI_PS_PRC); + port->portsc |= XHCI_PS_PED | + XHCI_PS_SPEED_SET(dev->dev_ue->ue_usbspeed); + + if (warm && dev->dev_ue->ue_usbver == 3) { + port->portsc |= XHCI_PS_WRC; + } + + if ((port->portsc & XHCI_PS_PRC) == 0) { + port->portsc |= XHCI_PS_PRC; + + pci_xhci_set_evtrb(&evtrb, portn, + XHCI_TRB_ERROR_SUCCESS, + XHCI_TRB_EVENT_PORT_STS_CHANGE); + error = pci_xhci_insert_event(sc, &evtrb, 1); + if (error != XHCI_TRB_ERROR_SUCCESS) + DPRINTF(("xhci reset port insert event " + "failed\r\n")); + } + } +} + +static void +pci_xhci_init_port(struct pci_xhci_softc *sc, int portn) +{ + struct pci_xhci_portregs *port; + struct pci_xhci_dev_emu *dev; + + port = XHCI_PORTREG_PTR(sc, portn); + dev = XHCI_DEVINST_PTR(sc, portn); + if (dev) { + port->portsc = XHCI_PS_CCS | /* connected */ + XHCI_PS_PP; /* port power */ + + if (dev->dev_ue->ue_usbver == 2) { + port->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_POLL) | + XHCI_PS_SPEED_SET(dev->dev_ue->ue_usbspeed); + } else { + port->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_U0) | + XHCI_PS_PED | /* enabled */ + XHCI_PS_SPEED_SET(dev->dev_ue->ue_usbspeed); + } + + DPRINTF(("Init port %d 0x%x\n", portn, port->portsc)); + } else { + port->portsc = XHCI_PS_PLS_SET(UPS_PORT_LS_RX_DET) | XHCI_PS_PP; + DPRINTF(("Init empty port %d 0x%x\n", portn, port->portsc)); + } +} + +static int +pci_xhci_dev_intr(struct usb_hci *hci, int epctx) +{ + struct pci_xhci_dev_emu *dev; + struct xhci_dev_ctx *dev_ctx; + struct xhci_trb evtrb; + struct pci_xhci_softc *sc; + struct pci_xhci_portregs *p; + struct xhci_endp_ctx *ep_ctx; + int error; + int dir_in; + int epid; + + dir_in = epctx & 0x80; + epid = epctx & ~0x80; + + /* HW endpoint contexts are 0-15; convert to epid 
based on dir */ + epid = (epid * 2) + (dir_in ? 1 : 0); + + assert(epid >= 1 && epid <= 31); + + dev = hci->hci_sc; + sc = dev->xsc; + + /* check if device is ready; OS has to initialise it */ + if (sc->rtsregs.erstba_p == NULL || + (sc->opregs.usbcmd & XHCI_CMD_RS) == 0 || + dev->dev_ctx == NULL) + return (0); + + p = XHCI_PORTREG_PTR(sc, hci->hci_port); + + /* raise event if link U3 (suspended) state */ + if (XHCI_PS_PLS_GET(p->portsc) == 3) { + p->portsc &= ~XHCI_PS_PLS_MASK; + p->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_RESUME); + if ((p->portsc & XHCI_PS_PLC) != 0) + return (0); + + p->portsc |= XHCI_PS_PLC; + + pci_xhci_set_evtrb(&evtrb, hci->hci_port, + XHCI_TRB_ERROR_SUCCESS, XHCI_TRB_EVENT_PORT_STS_CHANGE); + error = pci_xhci_insert_event(sc, &evtrb, 0); + if (error != XHCI_TRB_ERROR_SUCCESS) + goto done; + } + + dev_ctx = dev->dev_ctx; + ep_ctx = &dev_ctx->ctx_ep[epid]; + if ((ep_ctx->dwEpCtx0 & 0x7) == XHCI_ST_EPCTX_DISABLED) { + DPRINTF(("xhci device interrupt on disabled endpoint %d\r\n", + epid)); + return (0); + } + + DPRINTF(("xhci device interrupt on endpoint %d\r\n", epid)); + + pci_xhci_device_doorbell(sc, hci->hci_port, epid, 0); + +done: + return (error); +} + +static int +pci_xhci_dev_event(struct usb_hci *hci, enum hci_usbev evid, void *param) +{ + + DPRINTF(("xhci device event port %d\r\n", hci->hci_port)); + return (0); +} + + + +static void +pci_xhci_device_usage(char *opt) +{ + + fprintf(stderr, "Invalid USB emulation \"%s\"\r\n", opt); +} + +static int +pci_xhci_parse_opts(struct pci_xhci_softc *sc, char *opts) +{ + struct pci_xhci_dev_emu **devices; + struct pci_xhci_dev_emu *dev; + struct usb_devemu *ue; + void *devsc; + char *uopt, *xopts, *config; + int usb3_port, usb2_port, i; + + uopt = NULL; + usb3_port = sc->usb3_port_start - 1; + usb2_port = sc->usb2_port_start - 1; + devices = NULL; + + if (opts == NULL) + goto portsfinal; + + devices = calloc(XHCI_MAX_DEVS, sizeof(struct pci_xhci_dev_emu *)); + + sc->slots = calloc(XHCI_MAX_SLOTS, sizeof(struct pci_xhci_dev_emu *)); + sc->devices = devices; + sc->ndevices = 0; + + uopt = strdup(opts); + for (xopts = strtok(uopt, ","); + xopts != NULL; + xopts = strtok(NULL, ",")) { + if (usb2_port == ((sc->usb2_port_start-1) + XHCI_MAX_DEVS/2) || + usb3_port == ((sc->usb3_port_start-1) + XHCI_MAX_DEVS/2)) { + WPRINTF(("pci_xhci max number of USB 2 or 3 " + "devices reached, max %d\r\n", XHCI_MAX_DEVS/2)); + usb2_port = usb3_port = -1; + goto done; + } + + /* device[=] */ + if ((config = strchr(xopts, '=')) == NULL) + config = ""; /* no config */ + else + *config++ = '\0'; + + ue = usb_emu_finddev(xopts); + if (ue == NULL) { + pci_xhci_device_usage(xopts); + DPRINTF(("pci_xhci device not found %s\r\n", xopts)); + usb2_port = usb3_port = -1; + goto done; + } + + DPRINTF(("pci_xhci adding device %s, opts \"%s\"\r\n", + xopts, config)); + + dev = calloc(1, sizeof(struct pci_xhci_dev_emu)); + dev->xsc = sc; + dev->hci.hci_sc = dev; + dev->hci.hci_intr = pci_xhci_dev_intr; + dev->hci.hci_event = pci_xhci_dev_event; + + if (ue->ue_usbver == 2) { + dev->hci.hci_port = usb2_port + 1; + devices[usb2_port] = dev; + usb2_port++; + } else { + dev->hci.hci_port = usb3_port + 1; + devices[usb3_port] = dev; + usb3_port++; + } + + dev->hci.hci_address = 0; + devsc = ue->ue_init(&dev->hci, config); + if (devsc == NULL) { + pci_xhci_device_usage(xopts); + usb2_port = usb3_port = -1; + goto done; + } + + dev->dev_ue = ue; + dev->dev_sc = devsc; + + /* assign slot number to device */ + sc->slots[sc->ndevices] = dev; + + sc->ndevices++; + } + 
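+	/*
+	 * Illustrative note, not part of the original commit: each
+	 * parsed "device[=config]" token lands on the next free port of
+	 * its speed class, USB3 devices on ports 1..XHCI_MAX_DEVS/2 and
+	 * USB2 devices on the upper half, while slot numbers are handed
+	 * out in plain parse order.  For example, a bhyve invocation
+	 * along the lines of:
+	 *
+	 *	-s 4,xhci,tablet
+	 *
+	 * would place the tablet emulation on USB3 port 1, slot 1.
+	 */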
+portsfinal: + sc->portregs = calloc(XHCI_MAX_DEVS, sizeof(struct pci_xhci_portregs)); + + if (sc->ndevices > 0) { + /* port and slot numbering start from 1 */ + sc->devices--; + sc->portregs--; + sc->slots--; + + for (i = 1; i <= XHCI_MAX_DEVS; i++) { + pci_xhci_init_port(sc, i); + } + } else { + WPRINTF(("pci_xhci no USB devices configured\r\n")); + sc->ndevices = 1; + } + +done: + if (devices != NULL) { + if (usb2_port <= 0 && usb3_port <= 0) { + sc->devices = NULL; + for (i = 0; devices[i] != NULL; i++) + free(devices[i]); + sc->ndevices = -1; + + free(devices); + } + } + free(uopt); + return (sc->ndevices); +} + +static int +pci_xhci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + struct pci_xhci_softc *sc; + int error; + + if (xhci_in_use) { + WPRINTF(("pci_xhci controller already defined\r\n")); + return (-1); + } + xhci_in_use = 1; + + sc = calloc(1, sizeof(struct pci_xhci_softc)); + pi->pi_arg = sc; + sc->xsc_pi = pi; + + sc->usb2_port_start = (XHCI_MAX_DEVS/2) + 1; + sc->usb3_port_start = 1; + + /* discover devices */ + error = pci_xhci_parse_opts(sc, opts); + if (error < 0) + goto done; + else + error = 0; + + sc->caplength = XHCI_SET_CAPLEN(XHCI_CAPLEN) | + XHCI_SET_HCIVERSION(0x0100); + sc->hcsparams1 = XHCI_SET_HCSP1_MAXPORTS(XHCI_MAX_DEVS) | + XHCI_SET_HCSP1_MAXINTR(1) | /* interrupters */ + XHCI_SET_HCSP1_MAXSLOTS(XHCI_MAX_SLOTS); + sc->hcsparams2 = XHCI_SET_HCSP2_ERSTMAX(XHCI_ERST_MAX) | + XHCI_SET_HCSP2_IST(0x04); + sc->hcsparams3 = 0; /* no latency */ + sc->hccparams1 = XHCI_SET_HCCP1_NSS(1) | /* no 2nd-streams */ + XHCI_SET_HCCP1_SPC(1) | /* short packet */ + XHCI_SET_HCCP1_MAXPSA(XHCI_STREAMS_MAX); + sc->hccparams2 = XHCI_SET_HCCP2_LEC(1) | + XHCI_SET_HCCP2_U3C(1); + sc->dboff = XHCI_SET_DOORBELL(XHCI_CAPLEN + XHCI_PORTREGS_START + + XHCI_MAX_DEVS * sizeof(struct pci_xhci_portregs)); + + /* dboff must be 32-bit aligned */ + if (sc->dboff & 0x3) + sc->dboff = (sc->dboff + 0x3) & ~0x3; + + /* rtsoff must be 32-bytes aligned */ + sc->rtsoff = XHCI_SET_RTSOFFSET(sc->dboff + (XHCI_MAX_SLOTS+1) * 32); + if (sc->rtsoff & 0x1F) + sc->rtsoff = (sc->rtsoff + 0x1F) & ~0x1F; + + DPRINTF(("pci_xhci dboff: 0x%x, rtsoff: 0x%x\r\n", sc->dboff, + sc->rtsoff)); + + sc->opregs.usbsts = XHCI_STS_HCH; + sc->opregs.pgsz = XHCI_PAGESIZE_4K; + + pci_xhci_reset(sc); + + sc->regsend = sc->rtsoff + 0x20 + 32; /* only 1 intrpter */ + + /* + * Set extended capabilities pointer to be after regsend; + * value of xecp field is 32-bit offset. 
+ */ + sc->hccparams1 |= XHCI_SET_HCCP1_XECP(sc->regsend/4); + + pci_set_cfgdata16(pi, PCIR_DEVICE, 0x1E31); + pci_set_cfgdata16(pi, PCIR_VENDOR, 0x8086); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_SERIALBUS); + pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_SERIALBUS_USB); + pci_set_cfgdata8(pi, PCIR_PROGIF,PCIP_SERIALBUS_USB_XHCI); + pci_set_cfgdata8(pi, PCI_USBREV, PCI_USB_REV_3_0); + + pci_emul_add_msicap(pi, 1); + + /* regsend + xecp registers */ + pci_emul_alloc_bar(pi, 0, PCIBAR_MEM32, sc->regsend + 4*32); + DPRINTF(("pci_xhci pci_emu_alloc: %d\r\n", sc->regsend + 4*32)); + + + pci_lintr_request(pi); + + pthread_mutex_init(&sc->mtx, NULL); + +done: + if (error) { + free(sc); + } + + return (error); +} + + + +struct pci_devemu pci_de_xhci = { + .pe_emu = "xhci", + .pe_init = pci_xhci_init, + .pe_barwrite = pci_xhci_write, + .pe_barread = pci_xhci_read +}; +PCI_EMUL_SET(pci_de_xhci); diff --git a/usr/src/cmd/bhyve/pci_xhci.h b/usr/src/cmd/bhyve/pci_xhci.h new file mode 100644 index 0000000000..7502f9396a --- /dev/null +++ b/usr/src/cmd/bhyve/pci_xhci.h @@ -0,0 +1,355 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Leon Dang + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _PCI_XHCI_H_ +#define _PCI_XHCI_H_ + +#define PCI_USBREV 0x60 /* USB protocol revision */ + + +enum { /* dsc_slotstate */ + XHCI_ST_DISABLED, + XHCI_ST_ENABLED, + XHCI_ST_DEFAULT, + XHCI_ST_ADDRESSED, + XHCI_ST_CONFIGURED, + XHCI_ST_MAX +}; + +enum { + XHCI_ST_SLCTX_DISABLED, + XHCI_ST_SLCTX_DEFAULT, + XHCI_ST_SLCTX_ADDRESSED, + XHCI_ST_SLCTX_CONFIGURED +}; + +enum { + XHCI_ST_EPCTX_DISABLED, + XHCI_ST_EPCTX_RUNNING, + XHCI_ST_EPCTX_HALTED, + XHCI_ST_EPCTX_STOPPED, + XHCI_ST_EPCTX_ERROR +}; + +#define XHCI_MAX_DEVICES MIN(USB_MAX_DEVICES, 128) +#define XHCI_MAX_ENDPOINTS 32 /* hardcoded - do not change */ +#define XHCI_MAX_SCRATCHPADS 32 +#define XHCI_MAX_EVENTS (16 * 13) +#define XHCI_MAX_COMMANDS (16 * 1) +#define XHCI_MAX_RSEG 1 +#define XHCI_MAX_TRANSFERS 4 +#if USB_MAX_EP_STREAMS == 8 +#define XHCI_MAX_STREAMS 8 +#define XHCI_MAX_STREAMS_LOG 3 +#elif USB_MAX_EP_STREAMS == 1 +#define XHCI_MAX_STREAMS 1 +#define XHCI_MAX_STREAMS_LOG 0 +#else +#error "The USB_MAX_EP_STREAMS value is not supported." 
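+/*
+ * Editorial note, not part of the original commit: the conditionals
+ * above pin the stream table size to what the host USB stack supports,
+ * so the accepted configurations reduce to:
+ *
+ *	USB_MAX_EP_STREAMS == 8  ->  XHCI_MAX_STREAMS 8, LOG 3
+ *	USB_MAX_EP_STREAMS == 1  ->  XHCI_MAX_STREAMS 1, LOG 0
+ *
+ * where LOG is log2(streams), the base-2 form in which xHCI context
+ * fields commonly express stream limits.
+ */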
+#endif +#define XHCI_DEV_CTX_ADDR_ALIGN 64 /* bytes */ +#define XHCI_DEV_CTX_ALIGN 64 /* bytes */ +#define XHCI_INPUT_CTX_ALIGN 64 /* bytes */ +#define XHCI_SLOT_CTX_ALIGN 32 /* bytes */ +#define XHCI_ENDP_CTX_ALIGN 32 /* bytes */ +#define XHCI_STREAM_CTX_ALIGN 16 /* bytes */ +#define XHCI_TRANS_RING_SEG_ALIGN 16 /* bytes */ +#define XHCI_CMD_RING_SEG_ALIGN 64 /* bytes */ +#define XHCI_EVENT_RING_SEG_ALIGN 64 /* bytes */ +#define XHCI_SCRATCH_BUF_ARRAY_ALIGN 64 /* bytes */ +#define XHCI_SCRATCH_BUFFER_ALIGN USB_PAGE_SIZE +#define XHCI_TRB_ALIGN 16 /* bytes */ +#define XHCI_TD_ALIGN 64 /* bytes */ +#define XHCI_PAGE_SIZE 4096 /* bytes */ + +struct xhci_slot_ctx { + volatile uint32_t dwSctx0; +#define XHCI_SCTX_0_ROUTE_SET(x) ((x) & 0xFFFFF) +#define XHCI_SCTX_0_ROUTE_GET(x) ((x) & 0xFFFFF) +#define XHCI_SCTX_0_SPEED_SET(x) (((x) & 0xF) << 20) +#define XHCI_SCTX_0_SPEED_GET(x) (((x) >> 20) & 0xF) +#define XHCI_SCTX_0_MTT_SET(x) (((x) & 0x1) << 25) +#define XHCI_SCTX_0_MTT_GET(x) (((x) >> 25) & 0x1) +#define XHCI_SCTX_0_HUB_SET(x) (((x) & 0x1) << 26) +#define XHCI_SCTX_0_HUB_GET(x) (((x) >> 26) & 0x1) +#define XHCI_SCTX_0_CTX_NUM_SET(x) (((x) & 0x1F) << 27) +#define XHCI_SCTX_0_CTX_NUM_GET(x) (((x) >> 27) & 0x1F) + volatile uint32_t dwSctx1; +#define XHCI_SCTX_1_MAX_EL_SET(x) ((x) & 0xFFFF) +#define XHCI_SCTX_1_MAX_EL_GET(x) ((x) & 0xFFFF) +#define XHCI_SCTX_1_RH_PORT_SET(x) (((x) & 0xFF) << 16) +#define XHCI_SCTX_1_RH_PORT_GET(x) (((x) >> 16) & 0xFF) +#define XHCI_SCTX_1_NUM_PORTS_SET(x) (((x) & 0xFF) << 24) +#define XHCI_SCTX_1_NUM_PORTS_GET(x) (((x) >> 24) & 0xFF) + volatile uint32_t dwSctx2; +#define XHCI_SCTX_2_TT_HUB_SID_SET(x) ((x) & 0xFF) +#define XHCI_SCTX_2_TT_HUB_SID_GET(x) ((x) & 0xFF) +#define XHCI_SCTX_2_TT_PORT_NUM_SET(x) (((x) & 0xFF) << 8) +#define XHCI_SCTX_2_TT_PORT_NUM_GET(x) (((x) >> 8) & 0xFF) +#define XHCI_SCTX_2_TT_THINK_TIME_SET(x) (((x) & 0x3) << 16) +#define XHCI_SCTX_2_TT_THINK_TIME_GET(x) (((x) >> 16) & 0x3) +#define XHCI_SCTX_2_IRQ_TARGET_SET(x) (((x) & 0x3FF) << 22) +#define XHCI_SCTX_2_IRQ_TARGET_GET(x) (((x) >> 22) & 0x3FF) + volatile uint32_t dwSctx3; +#define XHCI_SCTX_3_DEV_ADDR_SET(x) ((x) & 0xFF) +#define XHCI_SCTX_3_DEV_ADDR_GET(x) ((x) & 0xFF) +#define XHCI_SCTX_3_SLOT_STATE_SET(x) (((x) & 0x1F) << 27) +#define XHCI_SCTX_3_SLOT_STATE_GET(x) (((x) >> 27) & 0x1F) + volatile uint32_t dwSctx4; + volatile uint32_t dwSctx5; + volatile uint32_t dwSctx6; + volatile uint32_t dwSctx7; +}; + +struct xhci_endp_ctx { + volatile uint32_t dwEpCtx0; +#define XHCI_EPCTX_0_EPSTATE_SET(x) ((x) & 0x7) +#define XHCI_EPCTX_0_EPSTATE_GET(x) ((x) & 0x7) +#define XHCI_EPCTX_0_MULT_SET(x) (((x) & 0x3) << 8) +#define XHCI_EPCTX_0_MULT_GET(x) (((x) >> 8) & 0x3) +#define XHCI_EPCTX_0_MAXP_STREAMS_SET(x) (((x) & 0x1F) << 10) +#define XHCI_EPCTX_0_MAXP_STREAMS_GET(x) (((x) >> 10) & 0x1F) +#define XHCI_EPCTX_0_LSA_SET(x) (((x) & 0x1) << 15) +#define XHCI_EPCTX_0_LSA_GET(x) (((x) >> 15) & 0x1) +#define XHCI_EPCTX_0_IVAL_SET(x) (((x) & 0xFF) << 16) +#define XHCI_EPCTX_0_IVAL_GET(x) (((x) >> 16) & 0xFF) + volatile uint32_t dwEpCtx1; +#define XHCI_EPCTX_1_CERR_SET(x) (((x) & 0x3) << 1) +#define XHCI_EPCTX_1_CERR_GET(x) (((x) >> 1) & 0x3) +#define XHCI_EPCTX_1_EPTYPE_SET(x) (((x) & 0x7) << 3) +#define XHCI_EPCTX_1_EPTYPE_GET(x) (((x) >> 3) & 0x7) +#define XHCI_EPCTX_1_HID_SET(x) (((x) & 0x1) << 7) +#define XHCI_EPCTX_1_HID_GET(x) (((x) >> 7) & 0x1) +#define XHCI_EPCTX_1_MAXB_SET(x) (((x) & 0xFF) << 8) +#define XHCI_EPCTX_1_MAXB_GET(x) (((x) >> 8) & 0xFF) +#define 
XHCI_EPCTX_1_MAXP_SIZE_SET(x) (((x) & 0xFFFF) << 16) +#define XHCI_EPCTX_1_MAXP_SIZE_GET(x) (((x) >> 16) & 0xFFFF) + volatile uint64_t qwEpCtx2; +#define XHCI_EPCTX_2_DCS_SET(x) ((x) & 0x1) +#define XHCI_EPCTX_2_DCS_GET(x) ((x) & 0x1) +#define XHCI_EPCTX_2_TR_DQ_PTR_MASK 0xFFFFFFFFFFFFFFF0U + volatile uint32_t dwEpCtx4; +#define XHCI_EPCTX_4_AVG_TRB_LEN_SET(x) ((x) & 0xFFFF) +#define XHCI_EPCTX_4_AVG_TRB_LEN_GET(x) ((x) & 0xFFFF) +#define XHCI_EPCTX_4_MAX_ESIT_PAYLOAD_SET(x) (((x) & 0xFFFF) << 16) +#define XHCI_EPCTX_4_MAX_ESIT_PAYLOAD_GET(x) (((x) >> 16) & 0xFFFF) + volatile uint32_t dwEpCtx5; + volatile uint32_t dwEpCtx6; + volatile uint32_t dwEpCtx7; +}; + +struct xhci_input_ctx { +#define XHCI_INCTX_NON_CTRL_MASK 0xFFFFFFFCU + volatile uint32_t dwInCtx0; +#define XHCI_INCTX_0_DROP_MASK(n) (1U << (n)) + volatile uint32_t dwInCtx1; +#define XHCI_INCTX_1_ADD_MASK(n) (1U << (n)) + volatile uint32_t dwInCtx2; + volatile uint32_t dwInCtx3; + volatile uint32_t dwInCtx4; + volatile uint32_t dwInCtx5; + volatile uint32_t dwInCtx6; + volatile uint32_t dwInCtx7; +}; + +struct xhci_input_dev_ctx { + struct xhci_input_ctx ctx_input; + union { + struct xhci_slot_ctx u_slot; + struct xhci_endp_ctx u_ep[XHCI_MAX_ENDPOINTS]; + } ctx_dev_slep; +}; + +struct xhci_dev_ctx { + union { + struct xhci_slot_ctx u_slot; + struct xhci_endp_ctx u_ep[XHCI_MAX_ENDPOINTS]; + } ctx_dev_slep; +} __aligned(XHCI_DEV_CTX_ALIGN); +#define ctx_slot ctx_dev_slep.u_slot +#define ctx_ep ctx_dev_slep.u_ep + +struct xhci_stream_ctx { + volatile uint64_t qwSctx0; +#define XHCI_SCTX_0_DCS_GET(x) ((x) & 0x1) +#define XHCI_SCTX_0_DCS_SET(x) ((x) & 0x1) +#define XHCI_SCTX_0_SCT_SET(x) (((x) & 0x7) << 1) +#define XHCI_SCTX_0_SCT_GET(x) (((x) >> 1) & 0x7) +#define XHCI_SCTX_0_SCT_SEC_TR_RING 0x0 +#define XHCI_SCTX_0_SCT_PRIM_TR_RING 0x1 +#define XHCI_SCTX_0_SCT_PRIM_SSA_8 0x2 +#define XHCI_SCTX_0_SCT_PRIM_SSA_16 0x3 +#define XHCI_SCTX_0_SCT_PRIM_SSA_32 0x4 +#define XHCI_SCTX_0_SCT_PRIM_SSA_64 0x5 +#define XHCI_SCTX_0_SCT_PRIM_SSA_128 0x6 +#define XHCI_SCTX_0_SCT_PRIM_SSA_256 0x7 +#define XHCI_SCTX_0_TR_DQ_PTR_MASK 0xFFFFFFFFFFFFFFF0U + volatile uint32_t dwSctx2; + volatile uint32_t dwSctx3; +}; + +struct xhci_trb { + volatile uint64_t qwTrb0; +#define XHCI_TRB_0_DIR_IN_MASK (0x80ULL << 0) +#define XHCI_TRB_0_WLENGTH_MASK (0xFFFFULL << 48) + volatile uint32_t dwTrb2; +#define XHCI_TRB_2_ERROR_GET(x) (((x) >> 24) & 0xFF) +#define XHCI_TRB_2_ERROR_SET(x) (((x) & 0xFF) << 24) +#define XHCI_TRB_2_TDSZ_GET(x) (((x) >> 17) & 0x1F) +#define XHCI_TRB_2_TDSZ_SET(x) (((x) & 0x1F) << 17) +#define XHCI_TRB_2_REM_GET(x) ((x) & 0xFFFFFF) +#define XHCI_TRB_2_REM_SET(x) ((x) & 0xFFFFFF) +#define XHCI_TRB_2_BYTES_GET(x) ((x) & 0x1FFFF) +#define XHCI_TRB_2_BYTES_SET(x) ((x) & 0x1FFFF) +#define XHCI_TRB_2_IRQ_GET(x) (((x) >> 22) & 0x3FF) +#define XHCI_TRB_2_IRQ_SET(x) (((x) & 0x3FF) << 22) +#define XHCI_TRB_2_STREAM_GET(x) (((x) >> 16) & 0xFFFF) +#define XHCI_TRB_2_STREAM_SET(x) (((x) & 0xFFFF) << 16) + + volatile uint32_t dwTrb3; +#define XHCI_TRB_3_TYPE_GET(x) (((x) >> 10) & 0x3F) +#define XHCI_TRB_3_TYPE_SET(x) (((x) & 0x3F) << 10) +#define XHCI_TRB_3_CYCLE_BIT (1U << 0) +#define XHCI_TRB_3_TC_BIT (1U << 1) /* command ring only */ +#define XHCI_TRB_3_ENT_BIT (1U << 1) /* transfer ring only */ +#define XHCI_TRB_3_ISP_BIT (1U << 2) +#define XHCI_TRB_3_ED_BIT (1U << 2) +#define XHCI_TRB_3_NSNOOP_BIT (1U << 3) +#define XHCI_TRB_3_CHAIN_BIT (1U << 4) +#define XHCI_TRB_3_IOC_BIT (1U << 5) +#define XHCI_TRB_3_IDT_BIT (1U << 6) +#define XHCI_TRB_3_TBC_GET(x) 
(((x) >> 7) & 3) +#define XHCI_TRB_3_TBC_SET(x) (((x) & 3) << 7) +#define XHCI_TRB_3_BEI_BIT (1U << 9) +#define XHCI_TRB_3_DCEP_BIT (1U << 9) +#define XHCI_TRB_3_PRSV_BIT (1U << 9) +#define XHCI_TRB_3_BSR_BIT (1U << 9) +#define XHCI_TRB_3_TRT_MASK (3U << 16) +#define XHCI_TRB_3_TRT_NONE (0U << 16) +#define XHCI_TRB_3_TRT_OUT (2U << 16) +#define XHCI_TRB_3_TRT_IN (3U << 16) +#define XHCI_TRB_3_DIR_IN (1U << 16) +#define XHCI_TRB_3_TLBPC_GET(x) (((x) >> 16) & 0xF) +#define XHCI_TRB_3_TLBPC_SET(x) (((x) & 0xF) << 16) +#define XHCI_TRB_3_EP_GET(x) (((x) >> 16) & 0x1F) +#define XHCI_TRB_3_EP_SET(x) (((x) & 0x1F) << 16) +#define XHCI_TRB_3_FRID_GET(x) (((x) >> 20) & 0x7FF) +#define XHCI_TRB_3_FRID_SET(x) (((x) & 0x7FF) << 20) +#define XHCI_TRB_3_ISO_SIA_BIT (1U << 31) +#define XHCI_TRB_3_SUSP_EP_BIT (1U << 23) +#define XHCI_TRB_3_SLOT_GET(x) (((x) >> 24) & 0xFF) +#define XHCI_TRB_3_SLOT_SET(x) (((x) & 0xFF) << 24) + +/* Commands */ +#define XHCI_TRB_TYPE_RESERVED 0x00 +#define XHCI_TRB_TYPE_NORMAL 0x01 +#define XHCI_TRB_TYPE_SETUP_STAGE 0x02 +#define XHCI_TRB_TYPE_DATA_STAGE 0x03 +#define XHCI_TRB_TYPE_STATUS_STAGE 0x04 +#define XHCI_TRB_TYPE_ISOCH 0x05 +#define XHCI_TRB_TYPE_LINK 0x06 +#define XHCI_TRB_TYPE_EVENT_DATA 0x07 +#define XHCI_TRB_TYPE_NOOP 0x08 +#define XHCI_TRB_TYPE_ENABLE_SLOT 0x09 +#define XHCI_TRB_TYPE_DISABLE_SLOT 0x0A +#define XHCI_TRB_TYPE_ADDRESS_DEVICE 0x0B +#define XHCI_TRB_TYPE_CONFIGURE_EP 0x0C +#define XHCI_TRB_TYPE_EVALUATE_CTX 0x0D +#define XHCI_TRB_TYPE_RESET_EP 0x0E +#define XHCI_TRB_TYPE_STOP_EP 0x0F +#define XHCI_TRB_TYPE_SET_TR_DEQUEUE 0x10 +#define XHCI_TRB_TYPE_RESET_DEVICE 0x11 +#define XHCI_TRB_TYPE_FORCE_EVENT 0x12 +#define XHCI_TRB_TYPE_NEGOTIATE_BW 0x13 +#define XHCI_TRB_TYPE_SET_LATENCY_TOL 0x14 +#define XHCI_TRB_TYPE_GET_PORT_BW 0x15 +#define XHCI_TRB_TYPE_FORCE_HEADER 0x16 +#define XHCI_TRB_TYPE_NOOP_CMD 0x17 + +/* Events */ +#define XHCI_TRB_EVENT_TRANSFER 0x20 +#define XHCI_TRB_EVENT_CMD_COMPLETE 0x21 +#define XHCI_TRB_EVENT_PORT_STS_CHANGE 0x22 +#define XHCI_TRB_EVENT_BW_REQUEST 0x23 +#define XHCI_TRB_EVENT_DOORBELL 0x24 +#define XHCI_TRB_EVENT_HOST_CTRL 0x25 +#define XHCI_TRB_EVENT_DEVICE_NOTIFY 0x26 +#define XHCI_TRB_EVENT_MFINDEX_WRAP 0x27 + +/* Error codes */ +#define XHCI_TRB_ERROR_INVALID 0x00 +#define XHCI_TRB_ERROR_SUCCESS 0x01 +#define XHCI_TRB_ERROR_DATA_BUF 0x02 +#define XHCI_TRB_ERROR_BABBLE 0x03 +#define XHCI_TRB_ERROR_XACT 0x04 +#define XHCI_TRB_ERROR_TRB 0x05 +#define XHCI_TRB_ERROR_STALL 0x06 +#define XHCI_TRB_ERROR_RESOURCE 0x07 +#define XHCI_TRB_ERROR_BANDWIDTH 0x08 +#define XHCI_TRB_ERROR_NO_SLOTS 0x09 +#define XHCI_TRB_ERROR_STREAM_TYPE 0x0A +#define XHCI_TRB_ERROR_SLOT_NOT_ON 0x0B +#define XHCI_TRB_ERROR_ENDP_NOT_ON 0x0C +#define XHCI_TRB_ERROR_SHORT_PKT 0x0D +#define XHCI_TRB_ERROR_RING_UNDERRUN 0x0E +#define XHCI_TRB_ERROR_RING_OVERRUN 0x0F +#define XHCI_TRB_ERROR_VF_RING_FULL 0x10 +#define XHCI_TRB_ERROR_PARAMETER 0x11 +#define XHCI_TRB_ERROR_BW_OVERRUN 0x12 +#define XHCI_TRB_ERROR_CONTEXT_STATE 0x13 +#define XHCI_TRB_ERROR_NO_PING_RESP 0x14 +#define XHCI_TRB_ERROR_EV_RING_FULL 0x15 +#define XHCI_TRB_ERROR_INCOMPAT_DEV 0x16 +#define XHCI_TRB_ERROR_MISSED_SERVICE 0x17 +#define XHCI_TRB_ERROR_CMD_RING_STOP 0x18 +#define XHCI_TRB_ERROR_CMD_ABORTED 0x19 +#define XHCI_TRB_ERROR_STOPPED 0x1A +#define XHCI_TRB_ERROR_LENGTH 0x1B +#define XHCI_TRB_ERROR_BAD_MELAT 0x1D +#define XHCI_TRB_ERROR_ISOC_OVERRUN 0x1F +#define XHCI_TRB_ERROR_EVENT_LOST 0x20 +#define XHCI_TRB_ERROR_UNDEFINED 0x21 +#define XHCI_TRB_ERROR_INVALID_SID 0x22 +#define 
XHCI_TRB_ERROR_SEC_BW 0x23 +#define XHCI_TRB_ERROR_SPLIT_XACT 0x24 +} __aligned(4); + +struct xhci_dev_endpoint_trbs { + struct xhci_trb trb[(XHCI_MAX_STREAMS * + XHCI_MAX_TRANSFERS) + XHCI_MAX_STREAMS]; +}; + +struct xhci_event_ring_seg { + volatile uint64_t qwEvrsTablePtr; + volatile uint32_t dwEvrsTableSize; + volatile uint32_t dwEvrsReserved; +}; + +#endif /* _PCI_XHCI_H_ */ diff --git a/usr/src/cmd/bhyve/pm.c b/usr/src/cmd/bhyve/pm.c index 70c4f1fae8..be188b79f2 100644 --- a/usr/src/cmd/bhyve/pm.c +++ b/usr/src/cmd/bhyve/pm.c @@ -1,5 +1,7 @@ /*- - * Copyright (c) 2013 Advanced Computing Technologies LLC + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Hudson River Trading LLC * Written by: John H. Baldwin * All rights reserved. * @@ -24,14 +26,18 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ +/* + * Copyright 2018 Joyent, Inc. + */ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pm.c 266125 2014-05-15 14:16:55Z jhb $"); +__FBSDID("$FreeBSD$"); #include #include #include +#include #include #ifndef __FreeBSD__ #include @@ -51,6 +57,8 @@ static pthread_mutex_t pm_lock = PTHREAD_MUTEX_INITIALIZER; #ifdef __FreeBSD__ static struct mevent *power_button; static sig_t old_power_handler; +#else +struct vmctx *pwr_ctx; #endif /* @@ -63,6 +71,8 @@ static int reset_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { + int error; + static uint8_t reset_control; if (bytes != 1) @@ -74,12 +84,8 @@ reset_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, /* Treat hard and soft resets the same. */ if (reset_control & 0x4) { -#ifdef __FreeBSD__ error = vm_suspend(ctx, VM_SUSPEND_RESET); assert(error == 0 || errno == EALREADY); -#else - exit(0); -#endif } } return (0); @@ -220,6 +226,34 @@ power_button_handler(int signal, enum ev_type type, void *arg) } pthread_mutex_unlock(&pm_lock); } + +#else +/* + * Initiate graceful power off. + */ +/*ARGSUSED*/ +static void +power_button_handler(int signal, siginfo_t *type, void *cp) +{ + /* + * In theory, taking the 'pm_lock' mutex from within this signal + * handler could lead to deadlock if the main thread already held this + * mutex. In reality, this mutex is local to this file and all of the + * other usage in this file only occurs in functions which are FreeBSD + * specific (and thus currently not used). Thus, for consistency with + * the other code in this file, we take the mutex, but in the future, + * if these other functions are ever enabled for use on non-FreeBSD + * systems and these functions could be called directly by a thread + * (which would then hold the mutex), then we need to revisit the use + * of this mutex in this signal handler. 
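+ * (Note, too, that pthread_mutex_lock() is not on POSIX's list of
+ * async-signal-safe functions; a future rework would likely move this
+ * logic out of signal context entirely, e.g. by having the handler
+ * write to a self-pipe that the main event loop services.)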
+ */ + pthread_mutex_lock(&pm_lock); + if (!(pm1_status & PM1_PWRBTN_STS)) { + pm1_status |= PM1_PWRBTN_STS; + sci_update(pwr_ctx); + } + pthread_mutex_unlock(&pm_lock); +} #endif /* @@ -239,6 +273,7 @@ static int pm1_control_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { + int error; if (bytes != 2) return (-1); @@ -259,12 +294,8 @@ pm1_control_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, */ if (*eax & PM1_SLP_EN) { if ((pm1_control & PM1_SLP_TYP) >> 10 == 5) { -#ifdef __FreeBSD__ error = vm_suspend(ctx, VM_SUSPEND_POWEROFF); assert(error == 0 || errno == EALREADY); -#else - exit(0); -#endif } } } @@ -330,4 +361,18 @@ sci_init(struct vmctx *ctx) */ pci_irq_use(SCI_INT); vm_isa_set_irq_trigger(ctx, SCI_INT, LEVEL_TRIGGER); + +#ifndef __FreeBSD__ + { + /* + * Install SIGTERM signal handler for graceful power off. + */ + struct sigaction act; + + pwr_ctx = ctx; + act.sa_flags = 0; + act.sa_sigaction = power_button_handler; + (void) sigaction(SIGTERM, &act, NULL); + } +#endif } diff --git a/usr/src/cmd/bhyve/pmtmr.c b/usr/src/cmd/bhyve/pmtmr.c deleted file mode 100644 index 92ab24be57..0000000000 --- a/usr/src/cmd/bhyve/pmtmr.c +++ /dev/null @@ -1,212 +0,0 @@ -/*- - * Copyright (c) 2012 NetApp, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD: head/usr.sbin/bhyve/pmtmr.c 259998 2013-12-28 04:01:05Z jhb $ - */ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * Copyright 2014 Pluribus Networks Inc. 
- */ - -#include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pmtmr.c 259998 2013-12-28 04:01:05Z jhb $"); - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#ifndef __FreeBSD__ -#include -#endif - -#include "acpi.h" -#include "inout.h" - -/* - * The ACPI Power Management timer is a free-running 24- or 32-bit - * timer with a frequency of 3.579545MHz - * - * This implementation will be 32-bits - */ - -#define PMTMR_FREQ 3579545 /* 3.579545MHz */ - -static pthread_mutex_t pmtmr_mtx; -static pthread_once_t pmtmr_once = PTHREAD_ONCE_INIT; - -static uint64_t pmtmr_old; - -static uint64_t pmtmr_tscf; -static uint64_t pmtmr_tsc_old; - -#ifdef __FreeBSD__ -static clockid_t clockid = CLOCK_UPTIME_FAST; -static struct timespec pmtmr_uptime_old; - -#define timespecsub(vvp, uvp) \ - do { \ - (vvp)->tv_sec -= (uvp)->tv_sec; \ - (vvp)->tv_nsec -= (uvp)->tv_nsec; \ - if ((vvp)->tv_nsec < 0) { \ - (vvp)->tv_sec--; \ - (vvp)->tv_nsec += 1000000000; \ - } \ - } while (0) - -static uint64_t -timespec_to_pmtmr(const struct timespec *tsnew, const struct timespec *tsold) -{ - struct timespec tsdiff; - int64_t nsecs; - - tsdiff = *tsnew; - timespecsub(&tsdiff, tsold); - nsecs = tsdiff.tv_sec * 1000000000 + tsdiff.tv_nsec; - assert(nsecs >= 0); - - return (nsecs * PMTMR_FREQ / 1000000000 + pmtmr_old); -} -#endif - -static uint64_t -tsc_to_pmtmr(uint64_t tsc_new, uint64_t tsc_old) -{ - - return ((tsc_new - tsc_old) * PMTMR_FREQ / pmtmr_tscf + pmtmr_old); -} - -static void -pmtmr_init(void) -{ -#ifdef __FreeBSD__ - size_t len; - int smp_tsc, err; - struct timespec tsnew, tsold = { 0 }; - - len = sizeof(smp_tsc); - err = sysctlbyname("kern.timecounter.smp_tsc", &smp_tsc, &len, NULL, 0); - assert(err == 0); - - if (smp_tsc) { - len = sizeof(pmtmr_tscf); - err = sysctlbyname("machdep.tsc_freq", &pmtmr_tscf, &len, - NULL, 0); - assert(err == 0); - - pmtmr_tsc_old = rdtsc(); - pmtmr_old = tsc_to_pmtmr(pmtmr_tsc_old, 0); - } else { - if (getenv("BHYVE_PMTMR_PRECISE") != NULL) - clockid = CLOCK_UPTIME; - - err = clock_gettime(clockid, &tsnew); - assert(err == 0); - - pmtmr_uptime_old = tsnew; - pmtmr_old = timespec_to_pmtmr(&tsnew, &tsold); - } -#else - kstat_ctl_t *kstat_ctl; - kstat_t *kstat; - kstat_named_t *kstat_cpu_freq; - - kstat_ctl = kstat_open(); - kstat = kstat_lookup(kstat_ctl, "cpu_info", 0, NULL); - kstat_read(kstat_ctl, kstat, NULL); - kstat_cpu_freq = kstat_data_lookup(kstat, "current_clock_Hz"); - pmtmr_tscf = kstat_cpu_freq->value.ul; - kstat_close(kstat_ctl); - - pmtmr_tsc_old = rdtsc(); - pmtmr_old = tsc_to_pmtmr(pmtmr_tsc_old, 0); -#endif - pthread_mutex_init(&pmtmr_mtx, NULL); -} - -static uint32_t -pmtmr_val(void) -{ - struct timespec tsnew; - uint64_t pmtmr_tsc_new; - uint64_t pmtmr_new; - int error; - - pthread_once(&pmtmr_once, pmtmr_init); - - pthread_mutex_lock(&pmtmr_mtx); - -#ifdef __FreeBSD__ - if (pmtmr_tscf) { - pmtmr_tsc_new = rdtsc(); - pmtmr_new = tsc_to_pmtmr(pmtmr_tsc_new, pmtmr_tsc_old); - pmtmr_tsc_old = pmtmr_tsc_new; - } else { - error = clock_gettime(clockid, &tsnew); - assert(error == 0); - - pmtmr_new = timespec_to_pmtmr(&tsnew, &pmtmr_uptime_old); - pmtmr_uptime_old = tsnew; - } -#else - pmtmr_tsc_new = rdtsc(); - pmtmr_new = tsc_to_pmtmr(pmtmr_tsc_new, pmtmr_tsc_old); - pmtmr_tsc_old = pmtmr_tsc_new; -#endif - pmtmr_old = pmtmr_new; - - pthread_mutex_unlock(&pmtmr_mtx); - - return (pmtmr_new); -} - -static int -pmtmr_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, - uint32_t *eax, void *arg) -{ - assert(in == 1); - - 
if (bytes != 4) - return (-1); - - *eax = pmtmr_val(); - - return (0); -} - -INOUT_PORT(pmtmr, IO_PMTMR, IOPORT_F_IN, pmtmr_handler); diff --git a/usr/src/cmd/bhyve/post.c b/usr/src/cmd/bhyve/post.c index dcb481aac4..d3040a8df7 100644 --- a/usr/src/cmd/bhyve/post.c +++ b/usr/src/cmd/bhyve/post.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,11 +25,11 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/post.c 260206 2014-01-02 21:26:59Z jhb $ + * $FreeBSD$ */ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/post.c 260206 2014-01-02 21:26:59Z jhb $"); +__FBSDID("$FreeBSD$"); #include diff --git a/usr/src/cmd/bhyve/ps2kbd.c b/usr/src/cmd/bhyve/ps2kbd.c index 22e566ac21..5453a26949 100644 --- a/usr/src/cmd/bhyve/ps2kbd.c +++ b/usr/src/cmd/bhyve/ps2kbd.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale * Copyright (c) 2015 Nahanni Systems Inc. * All rights reserved. @@ -74,6 +76,107 @@ struct ps2kbd_softc { uint8_t curcmd; /* current command for next byte */ }; +#define SCANCODE_E0_PREFIX 1 +struct extended_translation { + uint32_t keysym; + uint8_t scancode; + int flags; +}; + +/* + * FIXME: Pause/break and Print Screen/SysRq require special handling. + */ +static const struct extended_translation extended_translations[] = { + {0xff08, 0x66}, /* Back space */ + {0xff09, 0x0d}, /* Tab */ + {0xff0d, 0x5a}, /* Return */ + {0xff1b, 0x76}, /* Escape */ + {0xff50, 0x6c, SCANCODE_E0_PREFIX}, /* Home */ + {0xff51, 0x6b, SCANCODE_E0_PREFIX}, /* Left arrow */ + {0xff52, 0x75, SCANCODE_E0_PREFIX}, /* Up arrow */ + {0xff53, 0x74, SCANCODE_E0_PREFIX}, /* Right arrow */ + {0xff54, 0x72, SCANCODE_E0_PREFIX}, /* Down arrow */ + {0xff55, 0x7d, SCANCODE_E0_PREFIX}, /* PgUp */ + {0xff56, 0x7a, SCANCODE_E0_PREFIX}, /* PgDown */ + {0xff57, 0x69, SCANCODE_E0_PREFIX}, /* End */ + {0xff63, 0x70, SCANCODE_E0_PREFIX}, /* Ins */ + {0xff8d, 0x5a, SCANCODE_E0_PREFIX}, /* Keypad Enter */ + {0xffe1, 0x12}, /* Left shift */ + {0xffe2, 0x59}, /* Right shift */ + {0xffe3, 0x14}, /* Left control */ + {0xffe4, 0x14, SCANCODE_E0_PREFIX}, /* Right control */ + /* {0xffe7, XXX}, Left meta */ + /* {0xffe8, XXX}, Right meta */ + {0xffe9, 0x11}, /* Left alt */ + {0xfe03, 0x11, SCANCODE_E0_PREFIX}, /* AltGr */ + {0xffea, 0x11, SCANCODE_E0_PREFIX}, /* Right alt */ + {0xffeb, 0x1f, SCANCODE_E0_PREFIX}, /* Left Windows */ + {0xffec, 0x27, SCANCODE_E0_PREFIX}, /* Right Windows */ + {0xffbe, 0x05}, /* F1 */ + {0xffbf, 0x06}, /* F2 */ + {0xffc0, 0x04}, /* F3 */ + {0xffc1, 0x0c}, /* F4 */ + {0xffc2, 0x03}, /* F5 */ + {0xffc3, 0x0b}, /* F6 */ + {0xffc4, 0x83}, /* F7 */ + {0xffc5, 0x0a}, /* F8 */ + {0xffc6, 0x01}, /* F9 */ + {0xffc7, 0x09}, /* F10 */ + {0xffc8, 0x78}, /* F11 */ + {0xffc9, 0x07}, /* F12 */ + {0xffff, 0x71, SCANCODE_E0_PREFIX}, /* Del */ + {0xff14, 0x7e}, /* ScrollLock */ + /* NumLock and Keypads*/ + {0xff7f, 0x77}, /* NumLock */ + {0xffaf, 0x4a, SCANCODE_E0_PREFIX}, /* Keypad slash */ + {0xffaa, 0x7c}, /* Keypad asterisk */ + {0xffad, 0x7b}, /* Keypad minus */ + {0xffab, 0x79}, /* Keypad plus */ + {0xffb7, 0x6c}, /* Keypad 7 */ + {0xff95, 0x6c}, /* Keypad home */ + {0xffb8, 0x75}, /* Keypad 8 */ + {0xff97, 0x75}, /* Keypad up arrow */ + {0xffb9, 0x7d}, /* Keypad 9 */ + {0xff9a, 0x7d}, /* Keypad PgUp */ + {0xffb4, 0x6b}, /* Keypad 4 */ + {0xff96, 0x6b}, /* Keypad left arrow */ + {0xffb5, 0x73}, /* 
Keypad 5 */ + {0xff9d, 0x73}, /* Keypad empty */ + {0xffb6, 0x74}, /* Keypad 6 */ + {0xff98, 0x74}, /* Keypad right arrow */ + {0xffb1, 0x69}, /* Keypad 1 */ + {0xff9c, 0x69}, /* Keypad end */ + {0xffb2, 0x72}, /* Keypad 2 */ + {0xff99, 0x72}, /* Keypad down arrow */ + {0xffb3, 0x7a}, /* Keypad 3 */ + {0xff9b, 0x7a}, /* Keypad PgDown */ + {0xffb0, 0x70}, /* Keypad 0 */ + {0xff9e, 0x70}, /* Keypad ins */ + {0xffae, 0x71}, /* Keypad . */ + {0xff9f, 0x71}, /* Keypad del */ + {0, 0, 0} /* Terminator */ +}; + +/* ASCII to type 2 scancode lookup table */ +static const uint8_t ascii_translations[128] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x29, 0x16, 0x52, 0x26, 0x25, 0x2e, 0x3d, 0x52, + 0x46, 0x45, 0x3e, 0x55, 0x41, 0x4e, 0x49, 0x4a, + 0x45, 0x16, 0x1e, 0x26, 0x25, 0x2e, 0x36, 0x3d, + 0x3e, 0x46, 0x4c, 0x4c, 0x41, 0x55, 0x49, 0x4a, + 0x1e, 0x1c, 0x32, 0x21, 0x23, 0x24, 0x2b, 0x34, + 0x33, 0x43, 0x3b, 0x42, 0x4b, 0x3a, 0x31, 0x44, + 0x4d, 0x15, 0x2d, 0x1b, 0x2c, 0x3c, 0x2a, 0x1d, + 0x22, 0x35, 0x1a, 0x54, 0x5d, 0x5b, 0x36, 0x4e, + 0x0e, 0x1c, 0x32, 0x21, 0x23, 0x24, 0x2b, 0x34, + 0x33, 0x43, 0x3b, 0x42, 0x4b, 0x3a, 0x31, 0x44, + 0x4d, 0x15, 0x2d, 0x1b, 0x2c, 0x3c, 0x2a, 0x1d, + 0x22, 0x35, 0x1a, 0x54, 0x5d, 0x5b, 0x0e, 0x00, +}; + static void fifo_init(struct ps2kbd_softc *sc) { @@ -93,15 +196,6 @@ fifo_reset(struct ps2kbd_softc *sc) fifo->size = sizeof(((struct fifo *)0)->buf); } -static int -fifo_available(struct ps2kbd_softc *sc) -{ - struct fifo *fifo; - - fifo = &sc->fifo; - return (fifo->num < fifo->size); -} - static void fifo_put(struct ps2kbd_softc *sc, uint8_t val) { @@ -166,6 +260,9 @@ ps2kbd_write(struct ps2kbd_softc *sc, uint8_t val) sc->curcmd = 0; } else { switch (val) { + case 0x00: + fifo_put(sc, PS2KC_ACK); + break; case PS2KC_RESET_DEV: fifo_reset(sc); fifo_put(sc, PS2KC_ACK); @@ -216,190 +313,57 @@ static void ps2kbd_keysym_queue(struct ps2kbd_softc *sc, int down, uint32_t keysym) { - /* ASCII to type 2 scancode lookup table */ - const uint8_t translation[128] = { - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x29, 0x16, 0x52, 0x26, 0x25, 0x2e, 0x3d, 0x52, - 0x46, 0x45, 0x3e, 0x55, 0x41, 0x4e, 0x49, 0x4a, - 0x45, 0x16, 0x1e, 0x26, 0x25, 0x2e, 0x36, 0x3d, - 0x3e, 0x46, 0x4c, 0x4c, 0x41, 0x55, 0x49, 0x4a, - 0x1e, 0x1c, 0x32, 0x21, 0x23, 0x24, 0x2b, 0x34, - 0x33, 0x43, 0x3b, 0x42, 0x4b, 0x3a, 0x31, 0x44, - 0x4d, 0x15, 0x2d, 0x1b, 0x2c, 0x3c, 0x2a, 0x1d, - 0x22, 0x35, 0x1a, 0x54, 0x5d, 0x5b, 0x36, 0x4e, - 0x0e, 0x1c, 0x32, 0x21, 0x23, 0x24, 0x2b, 0x34, - 0x33, 0x43, 0x3b, 0x42, 0x4b, 0x3a, 0x31, 0x44, - 0x4d, 0x15, 0x2d, 0x1b, 0x2c, 0x3c, 0x2a, 0x1d, - 0x22, 0x35, 0x1a, 0x54, 0x5d, 0x5b, 0x0e, 0x00, - }; - assert(pthread_mutex_isowned_np(&sc->mtx)); + int e0_prefix, found; + uint8_t code; + const struct extended_translation *trans; + + found = 0; + if (keysym < 0x80) { + code = ascii_translations[keysym]; + e0_prefix = 0; + found = 1; + } else { + for (trans = &(extended_translations[0]); trans->keysym != 0; + trans++) { + if (keysym == trans->keysym) { + code = trans->scancode; + e0_prefix = trans->flags & SCANCODE_E0_PREFIX; + found = 1; + break; + } + } + } - switch (keysym) { - case 0x0 ... 
0x7f: - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, translation[keysym]); - break; - case 0xff08: /* Back space */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x66); - break; - case 0xff09: /* Tab */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x0d); - break; - case 0xff0d: /* Return */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x5a); - break; - case 0xff1b: /* Escape */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x76); - break; - case 0xff51: /* Left arrow */ - fifo_put(sc, 0xe0); - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x6b); - break; - case 0xff52: /* Up arrow */ - fifo_put(sc, 0xe0); - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x75); - break; - case 0xff53: /* Right arrow */ - fifo_put(sc, 0xe0); - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x74); - break; - case 0xff54: /* Down arrow */ - fifo_put(sc, 0xe0); - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x72); - break; - case 0xffbe: /* F1 */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x05); - break; - case 0xffbf: /* F2 */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x06); - break; - case 0xffc0: /* F3 */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x04); - break; - case 0xffc1: /* F4 */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x0c); - break; - case 0xffc2: /* F5 */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x03); - break; - case 0xffc3: /* F6 */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x0b); - break; - case 0xffc4: /* F7 */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x83); - break; - case 0xffc5: /* F8 */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x0a); - break; - case 0xffc6: /* F9 */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x01); - break; - case 0xffc7: /* F10 */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x09); - break; - case 0xffc8: /* F11 */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x78); - break; - case 0xffc9: /* F12 */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x07); - break; - case 0xffe1: /* Left shift */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x12); - break; - case 0xffe2: /* Right shift */ - /* XXX */ - break; - case 0xffe3: /* Left control */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x14); - break; - case 0xffe4: /* Right control */ - /* XXX */ - break; - case 0xffe7: /* Left meta */ - /* XXX */ - break; - case 0xffe8: /* Right meta */ - /* XXX */ - break; - case 0xffe9: /* Left alt */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x11); - break; - case 0xffea: /* Right alt */ - /* XXX */ - break; - default: - fprintf(stderr, "Unhandled ps2 keyboard keysym 0x%x\n", - keysym); - break; + if (!found) { + fprintf(stderr, "Unhandled ps2 keyboard keysym 0x%x\n", keysym); + return; } + + if (e0_prefix) + fifo_put(sc, 0xe0); + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, code); } static void ps2kbd_event(int down, uint32_t keysym, void *arg) { struct ps2kbd_softc *sc = arg; + int fifo_full; pthread_mutex_lock(&sc->mtx); if (!sc->enabled) { pthread_mutex_unlock(&sc->mtx); return; } - + fifo_full = sc->fifo.num == PS2KBD_FIFOSZ; ps2kbd_keysym_queue(sc, down, keysym); pthread_mutex_unlock(&sc->mtx); - atkbdc_event(sc->atkbdc_sc); + if (!fifo_full) + atkbdc_event(sc->atkbdc_sc, 1); } struct ps2kbd_softc * @@ -412,7 +376,8 @@ ps2kbd_init(struct atkbdc_softc *atkbdc_sc) fifo_init(sc); sc->atkbdc_sc = atkbdc_sc; - console_kbd_register(ps2kbd_event, sc); + console_kbd_register(ps2kbd_event, sc, 1); return (sc); } + diff --git a/usr/src/cmd/bhyve/ps2kbd.h 
b/usr/src/cmd/bhyve/ps2kbd.h index 34c31b1ea8..17be6d0466 100644 --- a/usr/src/cmd/bhyve/ps2kbd.h +++ b/usr/src/cmd/bhyve/ps2kbd.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale * All rights reserved. * diff --git a/usr/src/cmd/bhyve/ps2mouse.c b/usr/src/cmd/bhyve/ps2mouse.c index e96fbbf411..b2e08262b1 100644 --- a/usr/src/cmd/bhyve/ps2mouse.c +++ b/usr/src/cmd/bhyve/ps2mouse.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale * Copyright (c) 2015 Nahanni Systems Inc. * All rights reserved. @@ -62,6 +64,16 @@ __FBSDID("$FreeBSD$"); /* mouse device id */ #define PS2MOUSE_DEV_ID 0x0 +/* mouse data bits */ +#define PS2M_DATA_Y_OFLOW 0x80 +#define PS2M_DATA_X_OFLOW 0x40 +#define PS2M_DATA_Y_SIGN 0x20 +#define PS2M_DATA_X_SIGN 0x10 +#define PS2M_DATA_AONE 0x08 +#define PS2M_DATA_MID_BUTTON 0x04 +#define PS2M_DATA_RIGHT_BUTTON 0x02 +#define PS2M_DATA_LEFT_BUTTON 0x01 + /* mouse status bits */ #define PS2M_STS_REMOTE_MODE 0x40 #define PS2M_STS_ENABLE_DEV 0x20 @@ -87,6 +99,7 @@ struct ps2mouse_softc { uint8_t status; uint8_t resolution; uint8_t sampling_rate; + int ctrlenable; struct fifo fifo; uint8_t curcmd; /* current command for next byte */ @@ -168,19 +181,20 @@ movement_get(struct ps2mouse_softc *sc) assert(pthread_mutex_isowned_np(&sc->mtx)); - val0 = sc->status & (PS2M_STS_LEFT_BUTTON | - PS2M_STS_RIGHT_BUTTON | PS2M_STS_MID_BUTTON); + val0 = PS2M_DATA_AONE; + val0 |= sc->status & (PS2M_DATA_LEFT_BUTTON | + PS2M_DATA_RIGHT_BUTTON | PS2M_DATA_MID_BUTTON); if (sc->delta_x >= 0) { if (sc->delta_x > 255) { - val0 |= (1 << 6); + val0 |= PS2M_DATA_X_OFLOW; val1 = 255; } else val1 = sc->delta_x; } else { - val0 |= (1 << 4); + val0 |= PS2M_DATA_X_SIGN; if (sc->delta_x < -255) { - val0 |= (1 << 6); + val0 |= PS2M_DATA_X_OFLOW; val1 = 255; } else val1 = sc->delta_x; @@ -189,23 +203,25 @@ movement_get(struct ps2mouse_softc *sc) if (sc->delta_y >= 0) { if (sc->delta_y > 255) { - val0 |= (1 << 7); + val0 |= PS2M_DATA_Y_OFLOW; val2 = 255; } else val2 = sc->delta_y; } else { - val0 |= (1 << 5); + val0 |= PS2M_DATA_Y_SIGN; if (sc->delta_y < -255) { - val0 |= (1 << 7); + val0 |= PS2M_DATA_Y_OFLOW; val2 = 255; } else val2 = sc->delta_y; } sc->delta_y = 0; - fifo_put(sc, val0); - fifo_put(sc, val1); - fifo_put(sc, val2); + if (sc->fifo.num < (sc->fifo.size - 3)) { + fifo_put(sc, val0); + fifo_put(sc, val1); + fifo_put(sc, val2); + } } static void @@ -214,7 +230,7 @@ ps2mouse_reset(struct ps2mouse_softc *sc) assert(pthread_mutex_isowned_np(&sc->mtx)); fifo_reset(sc); movement_reset(sc); - sc->status = 0x8; + sc->status = PS2M_STS_ENABLE_DEV; sc->resolution = 4; sc->sampling_rate = 100; @@ -236,10 +252,32 @@ ps2mouse_read(struct ps2mouse_softc *sc, uint8_t *val) return (retval); } +int +ps2mouse_fifocnt(struct ps2mouse_softc *sc) +{ + return (sc->fifo.num); +} + void -ps2mouse_write(struct ps2mouse_softc *sc, uint8_t val) +ps2mouse_toggle(struct ps2mouse_softc *sc, int enable) { pthread_mutex_lock(&sc->mtx); + if (enable) + sc->ctrlenable = 1; + else { + sc->ctrlenable = 0; + sc->fifo.rindex = 0; + sc->fifo.windex = 0; + sc->fifo.num = 0; + } + pthread_mutex_unlock(&sc->mtx); +} + +void +ps2mouse_write(struct ps2mouse_softc *sc, uint8_t val, int insert) +{ + pthread_mutex_lock(&sc->mtx); + fifo_reset(sc); if (sc->curcmd) { switch (sc->curcmd) { case PS2MC_SET_SAMPLING_RATE: @@ -256,8 +294,14 @@ ps2mouse_write(struct ps2mouse_softc *sc, uint8_t val) break; } sc->curcmd = 0; + + } 
else if (insert) { + fifo_put(sc, val); } else { switch (val) { + case 0x00: + fifo_put(sc, PS2MC_ACK); + break; case PS2MC_RESET_DEV: ps2mouse_reset(sc); fifo_put(sc, PS2MC_ACK); @@ -313,6 +357,7 @@ ps2mouse_write(struct ps2mouse_softc *sc, uint8_t val) fifo_put(sc, PS2MC_ACK); break; default: + fifo_put(sc, PS2MC_ACK); fprintf(stderr, "Unhandled ps2 mouse command " "0x%02x\n", val); break; @@ -338,7 +383,7 @@ ps2mouse_event(uint8_t button, int x, int y, void *arg) if (button & (1 << 2)) sc->status |= PS2M_STS_RIGHT_BUTTON; - if ((sc->status & PS2M_STS_ENABLE_DEV) == 0) { + if ((sc->status & PS2M_STS_ENABLE_DEV) == 0 || !sc->ctrlenable) { /* no data reporting */ pthread_mutex_unlock(&sc->mtx); return; @@ -347,7 +392,8 @@ ps2mouse_event(uint8_t button, int x, int y, void *arg) movement_get(sc); pthread_mutex_unlock(&sc->mtx); - atkbdc_event(sc->atkbdc_sc); + if (sc->fifo.num > 0) + atkbdc_event(sc->atkbdc_sc, 0); } struct ps2mouse_softc * @@ -364,8 +410,9 @@ ps2mouse_init(struct atkbdc_softc *atkbdc_sc) ps2mouse_reset(sc); pthread_mutex_unlock(&sc->mtx); - console_ptr_register(ps2mouse_event, sc); + console_ptr_register(ps2mouse_event, sc, 1); return (sc); } + diff --git a/usr/src/cmd/bhyve/ps2mouse.h b/usr/src/cmd/bhyve/ps2mouse.h index 1a78934b98..59430b01e2 100644 --- a/usr/src/cmd/bhyve/ps2mouse.h +++ b/usr/src/cmd/bhyve/ps2mouse.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale * All rights reserved. * @@ -34,6 +36,8 @@ struct atkbdc_softc; struct ps2mouse_softc *ps2mouse_init(struct atkbdc_softc *sc); int ps2mouse_read(struct ps2mouse_softc *sc, uint8_t *val); -void ps2mouse_write(struct ps2mouse_softc *sc, uint8_t val); +void ps2mouse_write(struct ps2mouse_softc *sc, uint8_t val, int insert); +void ps2mouse_toggle(struct ps2mouse_softc *sc, int enable); +int ps2mouse_fifocnt(struct ps2mouse_softc *sc); #endif /* _PS2MOUSE_H_ */ diff --git a/usr/src/cmd/bhyve/rfb.c b/usr/src/cmd/bhyve/rfb.c index 0846316378..39ea1611f9 100644 --- a/usr/src/cmd/bhyve/rfb.c +++ b/usr/src/cmd/bhyve/rfb.c @@ -1,6 +1,9 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale - * Copyright (c) 2015 Nahanni Systems Inc. + * Copyright (c) 2015 Leon Dang + * Copyright 2018 Joyent, Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -28,30 +31,91 @@ #include __FBSDID("$FreeBSD$"); +#include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include #include +#include +#include +#include +#include +#include #include +#include #include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include #include +#include #include #include #include #include #include +#include #include +#include + +#ifndef __FreeBSD__ +#include +#endif + #include "bhyvegc.h" #include "console.h" #include "rfb.h" +#include "sockstream.h" + +#ifndef NO_OPENSSL +#include +#endif + +static int rfb_debug = 0; +#define DPRINTF(params) if (rfb_debug) printf params +#define WPRINTF(params) printf params + +#define AUTH_LENGTH 16 +#define PASSWD_LENGTH 8 + +#define SECURITY_TYPE_NONE 1 +#define SECURITY_TYPE_VNC_AUTH 2 + +#define AUTH_FAILED_UNAUTH 1 +#define AUTH_FAILED_ERROR 2 struct rfb_softc { int sfd; pthread_t tid; + int cfd; + int width, height; - bool enc_raw_ok; - bool enc_resize_ok; + char *password; + + bool enc_raw_ok; + bool enc_zlib_ok; + bool enc_resize_ok; + + z_stream zstream; + uint8_t *zbuf; + int zbuflen; + + int conn_wait; + int sending; + pthread_mutex_t mtx; + pthread_cond_t cond; + + int hw_crc; + uint32_t *crc; /* WxH crc cells */ + uint32_t *crc_tmp; /* buffer to store single crc row */ + int crc_width, crc_height; }; struct rfb_pixfmt { @@ -82,8 +146,16 @@ struct rfb_pixfmt_msg { }; #define RFB_ENCODING_RAW 0 +#define RFB_ENCODING_ZLIB 6 #define RFB_ENCODING_RESIZE -223 +#define RFB_MAX_WIDTH 2000 +#define RFB_MAX_HEIGHT 1200 +#define RFB_ZLIB_BUFSZ RFB_MAX_WIDTH*RFB_MAX_HEIGHT*4 + +/* percentage changes to screen before sending the entire screen */ +#define RFB_SEND_ALL_THRESH 25 + struct rfb_enc_msg { uint8_t type; uint8_t pad; @@ -127,60 +199,65 @@ struct rfb_srvr_rect_hdr { uint32_t encoding; }; +struct rfb_cuttext_msg { + uint8_t type; + uint8_t padding[3]; + uint32_t length; +}; + + static void rfb_send_server_init_msg(int cfd) { struct bhyvegc_image *gc_image; struct rfb_srvr_info sinfo; - int len; gc_image = console_get_image(); - sinfo.width = ntohs(gc_image->width); - sinfo.height = ntohs(gc_image->height); + sinfo.width = htons(gc_image->width); + sinfo.height = htons(gc_image->height); sinfo.pixfmt.bpp = 32; sinfo.pixfmt.depth = 32; sinfo.pixfmt.bigendian = 0; sinfo.pixfmt.truecolor = 1; - sinfo.pixfmt.red_max = ntohs(255); - sinfo.pixfmt.green_max = ntohs(255); - sinfo.pixfmt.blue_max = ntohs(255); + sinfo.pixfmt.red_max = htons(255); + sinfo.pixfmt.green_max = htons(255); + sinfo.pixfmt.blue_max = htons(255); sinfo.pixfmt.red_shift = 16; sinfo.pixfmt.green_shift = 8; sinfo.pixfmt.blue_shift = 0; - sinfo.namelen = ntohl(strlen("bhyve")); - len = write(cfd, &sinfo, sizeof(sinfo)); - len = write(cfd, "bhyve", strlen("bhyve")); + sinfo.namelen = htonl(strlen("bhyve")); + (void)stream_write(cfd, &sinfo, sizeof(sinfo)); + (void)stream_write(cfd, "bhyve", strlen("bhyve")); } static void rfb_send_resize_update_msg(struct rfb_softc *rc, int cfd) { struct rfb_srvr_updt_msg supdt_msg; - struct rfb_srvr_rect_hdr srect_hdr; + struct rfb_srvr_rect_hdr srect_hdr; /* Number of rectangles: 1 */ supdt_msg.type = 0; supdt_msg.pad = 0; - supdt_msg.numrects = ntohs(1); - write(cfd, &supdt_msg, sizeof(struct rfb_srvr_updt_msg)); + supdt_msg.numrects = htons(1); + stream_write(cfd, &supdt_msg, sizeof(struct rfb_srvr_updt_msg)); /* Rectangle header */ - srect_hdr.x = ntohs(0); - srect_hdr.y = ntohs(0); - srect_hdr.width = ntohs(rc->width); - srect_hdr.height = 
ntohs(rc->height); - srect_hdr.encoding = ntohl(RFB_ENCODING_RESIZE); - write(cfd, &srect_hdr, sizeof(struct rfb_srvr_rect_hdr)); + srect_hdr.x = htons(0); + srect_hdr.y = htons(0); + srect_hdr.width = htons(rc->width); + srect_hdr.height = htons(rc->height); + srect_hdr.encoding = htonl(RFB_ENCODING_RESIZE); + stream_write(cfd, &srect_hdr, sizeof(struct rfb_srvr_rect_hdr)); } static void rfb_recv_set_pixfmt_msg(struct rfb_softc *rc, int cfd) { struct rfb_pixfmt_msg pixfmt_msg; - int len; - len = read(cfd, ((void *)&pixfmt_msg) + 1, sizeof(pixfmt_msg) - 1); + (void)stream_read(cfd, ((void *)&pixfmt_msg)+1, sizeof(pixfmt_msg)-1); } @@ -188,18 +265,22 @@ static void rfb_recv_set_encodings_msg(struct rfb_softc *rc, int cfd) { struct rfb_enc_msg enc_msg; - int len, i; + int i; uint32_t encoding; assert((sizeof(enc_msg) - 1) == 3); - len = read(cfd, ((void *)&enc_msg) + 1, sizeof(enc_msg) - 1); + (void)stream_read(cfd, ((void *)&enc_msg)+1, sizeof(enc_msg)-1); - for (i = 0; i < ntohs(enc_msg.numencs); i++) { - len = read(cfd, &encoding, sizeof(encoding)); - switch (ntohl(encoding)) { + for (i = 0; i < htons(enc_msg.numencs); i++) { + (void)stream_read(cfd, &encoding, sizeof(encoding)); + switch (htonl(encoding)) { case RFB_ENCODING_RAW: rc->enc_raw_ok = true; break; + case RFB_ENCODING_ZLIB: + rc->enc_zlib_ok = true; + deflateInit(&rc->zstream, Z_BEST_SPEED); + break; case RFB_ENCODING_RESIZE: rc->enc_resize_ok = true; break; @@ -207,88 +288,460 @@ rfb_recv_set_encodings_msg(struct rfb_softc *rc, int cfd) } } -static void -rfb_resize_update(struct rfb_softc *rc, int fd) +/* + * Calculate CRC32 using SSE4.2; Intel or AMD Bulldozer+ CPUs only + */ +static __inline uint32_t +fast_crc32(void *buf, int len, uint32_t crcval) +{ + uint32_t q = len / sizeof(uint32_t); + uint32_t *p = (uint32_t *)buf; + + while (q--) { + asm volatile ( + ".byte 0xf2, 0xf, 0x38, 0xf1, 0xf1;" + :"=S" (crcval) + :"0" (crcval), "c" (*p) + ); + p++; + } + + return (crcval); +} + + +static int +rfb_send_rect(struct rfb_softc *rc, int cfd, struct bhyvegc_image *gc, + int x, int y, int w, int h) +{ + struct rfb_srvr_updt_msg supdt_msg; + struct rfb_srvr_rect_hdr srect_hdr; + unsigned long zlen; + ssize_t nwrite, total; + int err; + uint32_t *p; + uint8_t *zbufp; + + /* + * Send a single rectangle of the given x, y, w h dimensions. 
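+ * On the wire this is a FramebufferUpdate message header announcing a
+ * single rectangle, then the rectangle header (position, size and
+ * encoding), then the pixel payload: zlib-compressed when the client
+ * negotiated that encoding, raw otherwise.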
+ */ + + /* Number of rectangles: 1 */ + supdt_msg.type = 0; + supdt_msg.pad = 0; + supdt_msg.numrects = htons(1); + nwrite = stream_write(cfd, &supdt_msg, + sizeof(struct rfb_srvr_updt_msg)); + if (nwrite <= 0) + return (nwrite); + + + /* Rectangle header */ + srect_hdr.x = htons(x); + srect_hdr.y = htons(y); + srect_hdr.width = htons(w); + srect_hdr.height = htons(h); + + h = y + h; + w *= sizeof(uint32_t); + if (rc->enc_zlib_ok) { + zbufp = rc->zbuf; + rc->zstream.total_in = 0; + rc->zstream.total_out = 0; + for (p = &gc->data[y * gc->width + x]; y < h; y++) { + rc->zstream.next_in = (Bytef *)p; + rc->zstream.avail_in = w; + rc->zstream.next_out = (Bytef *)zbufp; + rc->zstream.avail_out = RFB_ZLIB_BUFSZ + 16 - + rc->zstream.total_out; + rc->zstream.data_type = Z_BINARY; + + /* Compress with zlib */ + err = deflate(&rc->zstream, Z_SYNC_FLUSH); + if (err != Z_OK) { + WPRINTF(("zlib[rect] deflate err: %d\n", err)); + rc->enc_zlib_ok = false; + deflateEnd(&rc->zstream); + goto doraw; + } + zbufp = rc->zbuf + rc->zstream.total_out; + p += gc->width; + } + srect_hdr.encoding = htonl(RFB_ENCODING_ZLIB); + nwrite = stream_write(cfd, &srect_hdr, + sizeof(struct rfb_srvr_rect_hdr)); + if (nwrite <= 0) + return (nwrite); + + zlen = htonl(rc->zstream.total_out); + nwrite = stream_write(cfd, &zlen, sizeof(uint32_t)); + if (nwrite <= 0) + return (nwrite); + return (stream_write(cfd, rc->zbuf, rc->zstream.total_out)); + } + +doraw: + + total = 0; + zbufp = rc->zbuf; + for (p = &gc->data[y * gc->width + x]; y < h; y++) { + memcpy(zbufp, p, w); + zbufp += w; + total += w; + p += gc->width; + } + + srect_hdr.encoding = htonl(RFB_ENCODING_RAW); + nwrite = stream_write(cfd, &srect_hdr, + sizeof(struct rfb_srvr_rect_hdr)); + if (nwrite <= 0) + return (nwrite); + + total = stream_write(cfd, rc->zbuf, total); + + return (total); +} + +static int +rfb_send_all(struct rfb_softc *rc, int cfd, struct bhyvegc_image *gc) { struct rfb_srvr_updt_msg supdt_msg; struct rfb_srvr_rect_hdr srect_hdr; + ssize_t nwrite; + unsigned long zlen; + int err; + + /* + * Send the whole thing + */ /* Number of rectangles: 1 */ supdt_msg.type = 0; supdt_msg.pad = 0; - supdt_msg.numrects = ntohs(1); - write(fd, &supdt_msg, sizeof (struct rfb_srvr_updt_msg)); + supdt_msg.numrects = htons(1); + nwrite = stream_write(cfd, &supdt_msg, + sizeof(struct rfb_srvr_updt_msg)); + if (nwrite <= 0) + return (nwrite); /* Rectangle header */ - srect_hdr.x = ntohs(0); - srect_hdr.y = ntohs(0); - srect_hdr.width = ntohs(rc->width); - srect_hdr.height = ntohs(rc->height); - srect_hdr.encoding = ntohl(RFB_ENCODING_RESIZE); - write(fd, &srect_hdr, sizeof (struct rfb_srvr_rect_hdr)); + srect_hdr.x = 0; + srect_hdr.y = 0; + srect_hdr.width = htons(gc->width); + srect_hdr.height = htons(gc->height); + if (rc->enc_zlib_ok) { + rc->zstream.next_in = (Bytef *)gc->data; + rc->zstream.avail_in = gc->width * gc->height * + sizeof(uint32_t); + rc->zstream.next_out = (Bytef *)rc->zbuf; + rc->zstream.avail_out = RFB_ZLIB_BUFSZ + 16; + rc->zstream.data_type = Z_BINARY; + + rc->zstream.total_in = 0; + rc->zstream.total_out = 0; + + /* Compress with zlib */ + err = deflate(&rc->zstream, Z_SYNC_FLUSH); + if (err != Z_OK) { + WPRINTF(("zlib deflate err: %d\n", err)); + rc->enc_zlib_ok = false; + deflateEnd(&rc->zstream); + goto doraw; + } + + srect_hdr.encoding = htonl(RFB_ENCODING_ZLIB); + nwrite = stream_write(cfd, &srect_hdr, + sizeof(struct rfb_srvr_rect_hdr)); + if (nwrite <= 0) + return (nwrite); + + zlen = htonl(rc->zstream.total_out); + nwrite = stream_write(cfd, 
&zlen, sizeof(uint32_t)); + if (nwrite <= 0) + return (nwrite); + return (stream_write(cfd, rc->zbuf, rc->zstream.total_out)); + } + +doraw: + srect_hdr.encoding = htonl(RFB_ENCODING_RAW); + nwrite = stream_write(cfd, &srect_hdr, + sizeof(struct rfb_srvr_rect_hdr)); + if (nwrite <= 0) + return (nwrite); + + nwrite = stream_write(cfd, gc->data, + gc->width * gc->height * sizeof(uint32_t)); + + return (nwrite); } +#define PIX_PER_CELL 32 +#define PIXCELL_SHIFT 5 +#define PIXCELL_MASK 0x1F + +static int +rfb_send_screen(struct rfb_softc *rc, int cfd, int all) +{ + struct bhyvegc_image *gc_image; + ssize_t nwrite; + int x, y; + int celly, cellwidth; + int xcells, ycells; + int w, h; + uint32_t *p; + int rem_x, rem_y; /* remainder for resolutions not x32 pixels ratio */ + int retval; + uint32_t *crc_p, *orig_crc; + int changes; + + console_refresh(); + gc_image = console_get_image(); + + pthread_mutex_lock(&rc->mtx); + if (rc->sending) { + pthread_mutex_unlock(&rc->mtx); + return (1); + } + rc->sending = 1; + pthread_mutex_unlock(&rc->mtx); + + retval = 0; + + if (all) { + retval = rfb_send_all(rc, cfd, gc_image); + goto done; + } + + /* + * Calculate the checksum for each 32x32 cell. Send each that + * has changed since the last scan. + */ + + /* Resolution changed */ + + rc->crc_width = gc_image->width; + rc->crc_height = gc_image->height; + + w = rc->crc_width; + h = rc->crc_height; + xcells = howmany(rc->crc_width, PIX_PER_CELL); + ycells = howmany(rc->crc_height, PIX_PER_CELL); + + rem_x = w & PIXCELL_MASK; + + rem_y = h & PIXCELL_MASK; + if (!rem_y) + rem_y = PIX_PER_CELL; + + p = gc_image->data; + + /* + * Go through all cells and calculate crc. If significant number + * of changes, then send entire screen. + * crc_tmp is dual purpose: to store the new crc and to flag as + * a cell that has changed. + */ + crc_p = rc->crc_tmp - xcells; + orig_crc = rc->crc - xcells; + changes = 0; + memset(rc->crc_tmp, 0, sizeof(uint32_t) * xcells * ycells); + for (y = 0; y < h; y++) { + if ((y & PIXCELL_MASK) == 0) { + crc_p += xcells; + orig_crc += xcells; + } + + for (x = 0; x < xcells; x++) { + if (x == (xcells - 1) && rem_x > 0) + cellwidth = rem_x; + else + cellwidth = PIX_PER_CELL; + + if (rc->hw_crc) + crc_p[x] = fast_crc32(p, + cellwidth * sizeof(uint32_t), + crc_p[x]); + else + crc_p[x] = (uint32_t)crc32(crc_p[x], + (Bytef *)p, + cellwidth * sizeof(uint32_t)); + + p += cellwidth; + + /* check for crc delta if last row in cell */ + if ((y & PIXCELL_MASK) == PIXCELL_MASK || y == (h-1)) { + if (orig_crc[x] != crc_p[x]) { + orig_crc[x] = crc_p[x]; + crc_p[x] = 1; + changes++; + } else { + crc_p[x] = 0; + } + } + } + } + + /* If number of changes is > THRESH percent, send the whole screen */ + if (((changes * 100) / (xcells * ycells)) >= RFB_SEND_ALL_THRESH) { + retval = rfb_send_all(rc, cfd, gc_image); + goto done; + } + + /* Go through all cells, and send only changed ones */ + crc_p = rc->crc_tmp; + for (y = 0; y < h; y += PIX_PER_CELL) { + /* previous cell's row */ + celly = (y >> PIXCELL_SHIFT); + + /* Delta check crc to previous set */ + for (x = 0; x < xcells; x++) { + if (*crc_p++ == 0) + continue; + + if (x == (xcells - 1) && rem_x > 0) + cellwidth = rem_x; + else + cellwidth = PIX_PER_CELL; + nwrite = rfb_send_rect(rc, cfd, + gc_image, + x * PIX_PER_CELL, + celly * PIX_PER_CELL, + cellwidth, + y + PIX_PER_CELL >= h ? 
rem_y : PIX_PER_CELL); + if (nwrite <= 0) { + retval = nwrite; + goto done; + } + } + } + retval = 1; + +done: + pthread_mutex_lock(&rc->mtx); + rc->sending = 0; + pthread_mutex_unlock(&rc->mtx); + + return (retval); +} + + static void -rfb_recv_update_msg(struct rfb_softc *rc, int cfd) +rfb_recv_update_msg(struct rfb_softc *rc, int cfd, int discardonly) { struct rfb_updt_msg updt_msg; - struct rfb_srvr_updt_msg supdt_msg; - struct rfb_srvr_rect_hdr srect_hdr; struct bhyvegc_image *gc_image; - int len; - len = read(cfd, ((void *)&updt_msg) + 1 , sizeof(updt_msg) - 1); + (void)stream_read(cfd, ((void *)&updt_msg) + 1 , sizeof(updt_msg) - 1); console_refresh(); gc_image = console_get_image(); - if (rc->width != gc_image->width || rc->height != gc_image->height) { + updt_msg.x = htons(updt_msg.x); + updt_msg.y = htons(updt_msg.y); + updt_msg.width = htons(updt_msg.width); + updt_msg.height = htons(updt_msg.height); + + if (updt_msg.width != gc_image->width || + updt_msg.height != gc_image->height) { rc->width = gc_image->width; rc->height = gc_image->height; - rfb_send_resize_update_msg(rc, cfd); + if (rc->enc_resize_ok) + rfb_send_resize_update_msg(rc, cfd); } - /* - * Send the whole thing - */ - /* Number of rectangles: 1 */ - supdt_msg.type = 0; - supdt_msg.pad = 0; - supdt_msg.numrects = ntohs(1); - write(cfd, &supdt_msg, sizeof(struct rfb_srvr_updt_msg)); + if (discardonly) + return; - /* Rectangle header */ - srect_hdr.x = ntohs(0); - srect_hdr.y = ntohs(0); - srect_hdr.width = ntohs(gc_image->width); - srect_hdr.height = ntohs(gc_image->height); - srect_hdr.encoding = ntohl(0); /* raw */ - write(cfd, &srect_hdr, sizeof(struct rfb_srvr_rect_hdr)); - - write(cfd, gc_image->data, gc_image->width * gc_image->height * - sizeof(uint32_t)); + rfb_send_screen(rc, cfd, 1); } static void rfb_recv_key_msg(struct rfb_softc *rc, int cfd) { struct rfb_key_msg key_msg; - int len; - len = read(cfd, ((void *)&key_msg) + 1, sizeof(key_msg) - 1); + (void)stream_read(cfd, ((void *)&key_msg) + 1, sizeof(key_msg) - 1); - console_key_event(key_msg.down, ntohl(key_msg.code)); + console_key_event(key_msg.down, htonl(key_msg.code)); } static void rfb_recv_ptr_msg(struct rfb_softc *rc, int cfd) { struct rfb_ptr_msg ptr_msg; + + (void)stream_read(cfd, ((void *)&ptr_msg) + 1, sizeof(ptr_msg) - 1); + + console_ptr_event(ptr_msg.button, htons(ptr_msg.x), htons(ptr_msg.y)); +} + +static void +rfb_recv_cuttext_msg(struct rfb_softc *rc, int cfd) +{ + struct rfb_cuttext_msg ct_msg; + unsigned char buf[32]; int len; - len = read(cfd, ((void *)&ptr_msg) + 1, sizeof(ptr_msg) - 1); + len = stream_read(cfd, ((void *)&ct_msg) + 1, sizeof(ct_msg) - 1); + ct_msg.length = htonl(ct_msg.length); + while (ct_msg.length > 0) { + len = stream_read(cfd, buf, ct_msg.length > sizeof(buf) ? 
+ sizeof(buf) : ct_msg.length); + ct_msg.length -= len; + } +} - console_ptr_event(ptr_msg.button, ntohs(ptr_msg.x), ntohs(ptr_msg.y)); +static int64_t +timeval_delta(struct timeval *prev, struct timeval *now) +{ + int64_t n1, n2; + n1 = now->tv_sec * 1000000 + now->tv_usec; + n2 = prev->tv_sec * 1000000 + prev->tv_usec; + return (n1 - n2); +} + +static void * +rfb_wr_thr(void *arg) +{ + struct rfb_softc *rc; + fd_set rfds; + struct timeval tv; + struct timeval prev_tv; + int64_t tdiff; + int cfd; + int err; + + rc = arg; + cfd = rc->cfd; + + prev_tv.tv_sec = 0; + prev_tv.tv_usec = 0; + while (rc->cfd >= 0) { + FD_ZERO(&rfds); + FD_SET(cfd, &rfds); + tv.tv_sec = 0; + tv.tv_usec = 10000; + + err = select(cfd+1, &rfds, NULL, NULL, &tv); + if (err < 0) + return (NULL); + + /* Determine if it's time to push the screen; ~25Hz (40ms period) */ + gettimeofday(&tv, NULL); + tdiff = timeval_delta(&prev_tv, &tv); + if (tdiff > 40000) { + prev_tv.tv_sec = tv.tv_sec; + prev_tv.tv_usec = tv.tv_usec; + if (rfb_send_screen(rc, cfd, 0) <= 0) { + return (NULL); + } + } else { + /* sleep */ + usleep(40000 - tdiff); + } + } + + return (NULL); +} void @@ -296,39 +749,145 @@ rfb_handle(struct rfb_softc *rc, int cfd) { const char *vbuf = "RFB 003.008\n"; unsigned char buf[80]; + unsigned char *message = NULL; + +#ifndef NO_OPENSSL + unsigned char challenge[AUTH_LENGTH]; + unsigned char keystr[PASSWD_LENGTH]; + unsigned char crypt_expected[AUTH_LENGTH]; + + DES_key_schedule ks; + int i; +#endif + + pthread_t tid; + uint32_t sres = 0; int len; - uint32_t sres; + int perror = 1; + + rc->cfd = cfd; /* 1a. Send server version */ - printf("server vers write: (%s), %d bytes\n", vbuf, (int) strlen(vbuf)); - write(cfd, vbuf, strlen(vbuf)); + stream_write(cfd, vbuf, strlen(vbuf)); /* 1b. Read client version */ len = read(cfd, buf, sizeof(buf)); - /* 2a. Send security type 'none' */ + /* 2a. Send security type */ buf[0] = 1; - buf[1] = 1; /* none */ - write(cfd, buf, 2); +#ifndef NO_OPENSSL + if (rc->password) + buf[1] = SECURITY_TYPE_VNC_AUTH; + else + buf[1] = SECURITY_TYPE_NONE; +#else + buf[1] = SECURITY_TYPE_NONE; +#endif + + stream_write(cfd, buf, 2); /* 2b. Read agreed security type */ - len = read(cfd, buf, 1); + len = stream_read(cfd, buf, 1); + + /* 2c. Do VNC authentication */ + switch (buf[0]) { + case SECURITY_TYPE_NONE: + sres = 0; + break; + case SECURITY_TYPE_VNC_AUTH: + /* + * The client encrypts the challenge with DES, using a password + * supplied by the user as the key. + * To form the key, the password is truncated to + * eight characters, or padded with null bytes on the right. + * The client then sends the resulting 16-byte response. + */ +#ifndef NO_OPENSSL + strncpy(keystr, rc->password, PASSWD_LENGTH); + + /* VNC clients encrypt the challenge with all the bit fields + * in each byte of the password mirrored. + * Here we flip each byte of the keystr. 
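+ * For example, the key byte 0x12 (binary 00010010) becomes 0x48
+ * (binary 01001000). The swaps below reverse the bit order of each
+ * byte in three steps: nibbles, then bit pairs, then adjacent bits.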
+ */ + for (i = 0; i < PASSWD_LENGTH; i++) { + keystr[i] = (keystr[i] & 0xF0) >> 4 + | (keystr[i] & 0x0F) << 4; + keystr[i] = (keystr[i] & 0xCC) >> 2 + | (keystr[i] & 0x33) << 2; + keystr[i] = (keystr[i] & 0xAA) >> 1 + | (keystr[i] & 0x55) << 1; + } + + /* Initialize a 16-byte random challenge */ + arc4random_buf(challenge, sizeof(challenge)); + stream_write(cfd, challenge, AUTH_LENGTH); + + /* Receive the 16-byte challenge response */ + stream_read(cfd, buf, AUTH_LENGTH); + + memcpy(crypt_expected, challenge, AUTH_LENGTH); + + /* Encrypt the Challenge with DES */ + DES_set_key((const_DES_cblock *)keystr, &ks); + DES_ecb_encrypt((const_DES_cblock *)challenge, + (const_DES_cblock *)crypt_expected, + &ks, DES_ENCRYPT); + DES_ecb_encrypt((const_DES_cblock *)(challenge + PASSWD_LENGTH), + (const_DES_cblock *)(crypt_expected + + PASSWD_LENGTH), + &ks, DES_ENCRYPT); + + if (memcmp(crypt_expected, buf, AUTH_LENGTH) != 0) { + message = "Auth Failed: Invalid Password."; + sres = htonl(1); + } else + sres = 0; +#else + sres = 0; + WPRINTF(("Auth not supported, no OpenSSL in your system")); +#endif - /* 2c. Write back a status of 0 */ - sres = 0; - write(cfd, &sres, 4); + break; + } + + /* 2d. Write back a status */ + stream_write(cfd, &sres, 4); + + if (sres) { +#ifdef __FreeBSD__ + be32enc(buf, strlen(message)); + stream_write(cfd, buf, 4); + stream_write(cfd, message, strlen(message)); +#else + be32enc(buf, strlen((char *)message)); + stream_write(cfd, buf, 4); + stream_write(cfd, message, strlen((char *)message)); +#endif + goto done; + } /* 3a. Read client shared-flag byte */ - len = read(cfd, buf, 1); + len = stream_read(cfd, buf, 1); /* 4a. Write server-init info */ rfb_send_server_init_msg(cfd); + if (!rc->zbuf) { + rc->zbuf = malloc(RFB_ZLIB_BUFSZ + 16); + assert(rc->zbuf != NULL); + } + + rfb_send_screen(rc, cfd, 1); + + perror = pthread_create(&tid, NULL, rfb_wr_thr, rc); + if (perror == 0) + pthread_set_name_np(tid, "rfbout"); + /* Now read in client requests. 
1st byte identifies type */ for (;;) { len = read(cfd, buf, 1); if (len <= 0) { - printf("exiting\n"); + DPRINTF(("rfb client exiting\r\n")); break; } @@ -340,7 +899,7 @@ rfb_handle(struct rfb_softc *rc, int cfd) rfb_recv_set_encodings_msg(rc, cfd); break; case 3: - rfb_recv_update_msg(rc, cfd); + rfb_recv_update_msg(rc, cfd, 1); break; case 4: rfb_recv_key_msg(rc, cfd); @@ -348,11 +907,20 @@ rfb_handle(struct rfb_softc *rc, int cfd) case 5: rfb_recv_ptr_msg(rc, cfd); break; + case 6: + rfb_recv_cuttext_msg(rc, cfd); + break; default: - printf("unknown client code!\n"); - exit(1); + WPRINTF(("rfb unknown cli-code %d!\n", buf[0] & 0xff)); + goto done; } } +done: + rc->cfd = -1; + if (perror == 0) + pthread_join(tid, NULL); + if (rc->enc_zlib_ok) + deflateEnd(&rc->zstream); } static void * @@ -373,48 +941,208 @@ rfb_thr(void *arg) } for (;;) { + rc->enc_raw_ok = false; + rc->enc_zlib_ok = false; + rc->enc_resize_ok = false; + cfd = accept(rc->sfd, NULL, NULL); + if (rc->conn_wait) { + pthread_mutex_lock(&rc->mtx); + pthread_cond_signal(&rc->cond); + pthread_mutex_unlock(&rc->mtx); + rc->conn_wait = 0; + } rfb_handle(rc, cfd); + close(cfd); } /* NOTREACHED */ return (NULL); } +static int +sse42_supported(void) +{ + u_int cpu_registers[4], ecx; + + do_cpuid(1, cpu_registers); + + ecx = cpu_registers[2]; + + return ((ecx & CPUID2_SSE42) != 0); +} + int -rfb_init(int port) +rfb_init(char *hostname, int port, int wait, char *password) { + int e; + char servname[6]; struct rfb_softc *rc; - struct sockaddr_in sin; + struct addrinfo *ai; + struct addrinfo hints; int on = 1; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif rc = calloc(1, sizeof(struct rfb_softc)); - rc->sfd = socket(AF_INET, SOCK_STREAM, 0); + rc->crc = calloc(howmany(RFB_MAX_WIDTH * RFB_MAX_HEIGHT, 32), + sizeof(uint32_t)); + rc->crc_tmp = calloc(howmany(RFB_MAX_WIDTH * RFB_MAX_HEIGHT, 32), + sizeof(uint32_t)); + rc->crc_width = RFB_MAX_WIDTH; + rc->crc_height = RFB_MAX_HEIGHT; + + rc->password = password; + + snprintf(servname, sizeof(servname), "%d", port ? 
port : 5900); + + if (!hostname || strlen(hostname) == 0) +#if defined(INET) + hostname = "127.0.0.1"; +#elif defined(INET6) + hostname = "[::1]"; +#endif + + memset(&hints, 0, sizeof(hints)); + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + hints.ai_flags = AI_NUMERICHOST | AI_NUMERICSERV | AI_PASSIVE; + + if ((e = getaddrinfo(hostname, servname, &hints, &ai)) != 0) { + fprintf(stderr, "getaddrinfo: %s\n", gai_strerror(e)); + return(-1); + } + + rc->sfd = socket(ai->ai_family, ai->ai_socktype, 0); if (rc->sfd < 0) { perror("socket"); + freeaddrinfo(ai); return (-1); } setsockopt(rc->sfd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)); -#ifdef __FreeBSD__ - sin.sin_len = sizeof(sin); -#endif - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = htonl(INADDR_ANY); - sin.sin_port = htons(port); - if (bind(rc->sfd, (struct sockaddr *)&sin, sizeof(sin)) < 0) { + if (bind(rc->sfd, ai->ai_addr, ai->ai_addrlen) < 0) { perror("bind"); + freeaddrinfo(ai); return (-1); } if (listen(rc->sfd, 1) < 0) { perror("listen"); + freeaddrinfo(ai); return (-1); } +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_ACCEPT, CAP_EVENT, CAP_READ, CAP_WRITE); + if (caph_rights_limit(rc->sfd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + rc->hw_crc = sse42_supported(); + + rc->conn_wait = wait; + if (wait) { + pthread_mutex_init(&rc->mtx, NULL); + pthread_cond_init(&rc->cond, NULL); + } + pthread_create(&rc->tid, NULL, rfb_thr, rc); + pthread_set_name_np(rc->tid, "rfb"); + + if (wait) { + DPRINTF(("Waiting for rfb client...\n")); + pthread_mutex_lock(&rc->mtx); + pthread_cond_wait(&rc->cond, &rc->mtx); + pthread_mutex_unlock(&rc->mtx); + } + freeaddrinfo(ai); return (0); } + +#ifndef __FreeBSD__ +int +rfb_init_unix(char *path, int wait, char *password) +{ + struct rfb_softc *rc; + struct sockaddr_un sock; + + if ((rc = calloc(1, sizeof (struct rfb_softc))) == NULL) { + perror("calloc"); + return (-1); + } + rc->sfd = -1; + + if ((rc->crc = calloc(howmany(RFB_MAX_WIDTH * RFB_MAX_HEIGHT, 32), + sizeof (uint32_t))) == NULL) { + perror("calloc"); + goto fail; + } + if ((rc->crc_tmp = calloc(howmany(RFB_MAX_WIDTH * RFB_MAX_HEIGHT, 32), + sizeof (uint32_t))) == NULL) { + perror("calloc"); + goto fail; + } + rc->crc_width = RFB_MAX_WIDTH; + rc->crc_height = RFB_MAX_HEIGHT; + + rc->password = password; + + rc->sfd = socket(PF_UNIX, SOCK_STREAM, 0); + if (rc->sfd < 0) { + perror("socket"); + goto fail; + } + + sock.sun_family = AF_UNIX; + if (strlcpy(sock.sun_path, path, sizeof (sock.sun_path)) >= + sizeof (sock.sun_path)) { + (void) fprintf(stderr, "socket path '%s' too long\n", path); + goto fail; + } + + (void) unlink(path); + if (bind(rc->sfd, (struct sockaddr *)&sock, sizeof (sock)) < 0) { + perror("bind"); + goto fail; + } + + if (listen(rc->sfd, 1) < 0) { + perror("listen"); + goto fail; + } + + rc->hw_crc = sse42_supported(); + + rc->conn_wait = wait; + if (wait) { + VERIFY3S(pthread_mutex_init(&rc->mtx, NULL), ==, 0); + VERIFY3S(pthread_cond_init(&rc->cond, NULL), ==, 0); + } + + VERIFY3S(pthread_create(&rc->tid, NULL, rfb_thr, rc), ==, 0); + pthread_set_name_np(rc->tid, "rfb"); + + if (wait) { + DPRINTF(("Waiting for rfb client...\n")); + VERIFY3S(pthread_mutex_lock(&rc->mtx), ==, 0); + VERIFY3S(pthread_cond_wait(&rc->cond, &rc->mtx), ==, 0); + VERIFY3S(pthread_mutex_unlock(&rc->mtx), ==, 0); + } + + return (0); + +fail: + if (rc->sfd != -1) { + VERIFY3S(close(rc->sfd), ==, 0); + } + free(rc->crc); + free(rc->crc_tmp); + free(rc); + return (-1); +} +#endif 
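The incremental-update machinery added to rfb.c above hinges on the 32x32-pixel cell checksums computed in rfb_send_screen(): each pass CRCs every cell of the framebuffer, compares against the CRCs from the previous pass, and resends only the cells that changed, falling back to rfb_send_all() once more than RFB_SEND_ALL_THRESH percent of cells differ. A minimal sketch of that dirty-cell scan, assuming a plain uint32_t framebuffer and zlib's crc32() in place of bhyve's bhyvegc image and the SSE4.2 fast path (scan_cells, fb, width and height are illustrative names, not bhyve APIs):

/*
 * Illustrative sketch only: fb, width and height stand in for the
 * bhyvegc image; returns the number of 32x32 cells that changed.
 */
#include <stdint.h>
#include <string.h>
#include <zlib.h>

#define	PIX_PER_CELL	32
#define	PIXCELL_MASK	0x1F

static int
scan_cells(const uint32_t *fb, int width, int height,
    uint32_t *prev_crc, uint32_t *cur_crc)
{
	int xcells = (width + PIX_PER_CELL - 1) / PIX_PER_CELL;
	int ycells = (height + PIX_PER_CELL - 1) / PIX_PER_CELL;
	int changes = 0;
	int x, y, i;

	(void) memset(cur_crc, 0, sizeof (uint32_t) * xcells * ycells);

	/* Fold each row slice into the CRC of the cell that owns it. */
	for (y = 0; y < height; y++) {
		const uint32_t *row = fb + (size_t)y * width;
		uint32_t *cells = cur_crc + (y / PIX_PER_CELL) * xcells;

		for (x = 0; x < xcells; x++) {
			int w = PIX_PER_CELL;

			if (x == xcells - 1 && (width & PIXCELL_MASK) != 0)
				w = width & PIXCELL_MASK;
			cells[x] = crc32(cells[x],
			    (const Bytef *)(row + x * PIX_PER_CELL),
			    w * sizeof (uint32_t));
		}
	}

	/* Count (and remember) the cells that changed since the last pass. */
	for (i = 0; i < xcells * ycells; i++) {
		if (cur_crc[i] != prev_crc[i]) {
			prev_crc[i] = cur_crc[i];
			changes++;
		}
	}
	return (changes);
}

The production code fuses the delta check into the CRC pass itself, reusing crc_tmp both as scratch space and as the per-cell changed flag, but the cell partitioning and change counting are as sketched.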
diff --git a/usr/src/cmd/bhyve/rfb.h b/usr/src/cmd/bhyve/rfb.h index 5504c333ab..990e2075ac 100644 --- a/usr/src/cmd/bhyve/rfb.h +++ b/usr/src/cmd/bhyve/rfb.h @@ -1,5 +1,8 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale + * Copyright 2018 Joyent, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -31,6 +34,9 @@ #define RFB_PORT 5900 -int rfb_init(int port); +int rfb_init(char *hostname, int port, int wait, char *password); +#ifndef __FreeBSD__ +int rfb_init_unix(char *path, int wait, char *password); +#endif #endif /* _RFB_H_ */ diff --git a/usr/src/cmd/bhyve/rtc.c b/usr/src/cmd/bhyve/rtc.c index 5ab78e060f..09ca3f61ae 100644 --- a/usr/src/cmd/bhyve/rtc.c +++ b/usr/src/cmd/bhyve/rtc.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,17 +25,14 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/rtc.c 260206 2014-01-02 21:26:59Z jhb $ + * $FreeBSD$ */ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/rtc.c 260206 2014-01-02 21:26:59Z jhb $"); +__FBSDID("$FreeBSD$"); #include -#include -#include -#include #include #include @@ -41,300 +40,45 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/rtc.c 260206 2014-01-02 21:26:59Z jhb $" #include #include "acpi.h" -#include "inout.h" #include "pci_lpc.h" #include "rtc.h" -#define IO_RTC 0x70 - -#define RTC_SEC 0x00 /* seconds */ -#define RTC_SEC_ALARM 0x01 -#define RTC_MIN 0x02 -#define RTC_MIN_ALARM 0x03 -#define RTC_HRS 0x04 -#define RTC_HRS_ALARM 0x05 -#define RTC_WDAY 0x06 -#define RTC_DAY 0x07 -#define RTC_MONTH 0x08 -#define RTC_YEAR 0x09 -#define RTC_CENTURY 0x32 /* current century */ - -#define RTC_STATUSA 0xA -#define RTCSA_TUP 0x80 /* time update, don't look now */ - -#define RTC_STATUSB 0xB -#define RTCSB_DST 0x01 -#define RTCSB_24HR 0x02 -#define RTCSB_BIN 0x04 /* 0 = BCD, 1 = Binary */ -#define RTCSB_PINTR 0x40 /* 1 = enable periodic clock interrupt */ -#define RTCSB_HALT 0x80 /* stop clock updates */ +#define IO_RTC 0x70 -#define RTC_INTR 0x0c /* status register C (R) interrupt source */ - -#define RTC_STATUSD 0x0d /* status register D (R) Lost Power */ -#define RTCSD_PWR 0x80 /* clock power OK */ - -#define RTC_NVRAM_START 0x0e -#define RTC_NVRAM_END 0x7f -#define RTC_NVRAM_SZ (128 - RTC_NVRAM_START) -#define nvoff(x) ((x) - RTC_NVRAM_START) - -#define RTC_DIAG 0x0e -#define RTC_RSTCODE 0x0f -#define RTC_EQUIPMENT 0x14 #define RTC_LMEM_LSB 0x34 #define RTC_LMEM_MSB 0x35 #define RTC_HMEM_LSB 0x5b #define RTC_HMEM_SB 0x5c #define RTC_HMEM_MSB 0x5d -#define m_64KB (64*1024) +#define m_64KB (64*1024) #define m_16MB (16*1024*1024) #define m_4GB (4ULL*1024*1024*1024) -static int addr; - -static uint8_t rtc_nvram[RTC_NVRAM_SZ]; - -/* XXX initialize these to default values as they would be from BIOS */ -static uint8_t status_a, status_b; - -static struct { - uint8_t hours; - uint8_t mins; - uint8_t secs; -} rtc_alarm; - -static u_char const bin2bcd_data[] = { - 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, - 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, - 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, - 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, - 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, - 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, - 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, - 
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, - 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, - 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99 -}; -#define bin2bcd(bin) (bin2bcd_data[bin]) - -#define rtcout(val) ((status_b & RTCSB_BIN) ? (val) : bin2bcd((val))) - -static void -timevalfix(struct timeval *t1) -{ - - if (t1->tv_usec < 0) { - t1->tv_sec--; - t1->tv_usec += 1000000; - } - if (t1->tv_usec >= 1000000) { - t1->tv_sec++; - t1->tv_usec -= 1000000; - } -} - -static void -timevalsub(struct timeval *t1, const struct timeval *t2) -{ - - t1->tv_sec -= t2->tv_sec; - t1->tv_usec -= t2->tv_usec; - timevalfix(t1); -} - -static int -rtc_addr_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, - uint32_t *eax, void *arg) -{ - if (bytes != 1) - return (-1); - - if (in) { - /* straight read of this register will return 0xFF */ - *eax = 0xff; - return (0); - } - - switch (*eax & 0x7f) { - case RTC_SEC: - case RTC_SEC_ALARM: - case RTC_MIN: - case RTC_MIN_ALARM: - case RTC_HRS: - case RTC_HRS_ALARM: - case RTC_WDAY: - case RTC_DAY: - case RTC_MONTH: - case RTC_YEAR: - case RTC_STATUSA: - case RTC_STATUSB: - case RTC_INTR: - case RTC_STATUSD: - case RTC_NVRAM_START ... RTC_NVRAM_END: - break; - default: - return (-1); - } - - addr = *eax & 0x7f; - return (0); -} - -static int -rtc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, - uint32_t *eax, void *arg) +/* + * Returns the current RTC time as number of seconds since 00:00:00 Jan 1, 1970 + */ +static time_t +rtc_time(struct vmctx *ctx, int use_localtime) { - int hour; + struct tm tm; time_t t; - struct timeval cur, delta; - - static struct timeval last; - static struct tm tm; - - if (bytes != 1) - return (-1); - - gettimeofday(&cur, NULL); - /* - * Increment the cached time only once per second so we can guarantee - * that the guest has at least one second to read the hour:min:sec - * separately and still get a coherent view of the time. - */ - delta = cur; - timevalsub(&delta, &last); - if (delta.tv_sec >= 1 && (status_b & RTCSB_HALT) == 0) { - t = cur.tv_sec; + time(&t); + if (use_localtime) { localtime_r(&t, &tm); - last = cur; - } - - if (in) { - switch (addr) { - case RTC_SEC_ALARM: - *eax = rtc_alarm.secs; - break; - case RTC_MIN_ALARM: - *eax = rtc_alarm.mins; - break; - case RTC_HRS_ALARM: - *eax = rtc_alarm.hours; - break; - case RTC_SEC: - *eax = rtcout(tm.tm_sec); - return (0); - case RTC_MIN: - *eax = rtcout(tm.tm_min); - return (0); - case RTC_HRS: - if (status_b & RTCSB_24HR) - hour = tm.tm_hour; - else - hour = (tm.tm_hour % 12) + 1; - - *eax = rtcout(hour); - - /* - * If we are representing time in the 12-hour format - * then set the MSB to indicate PM. - */ - if ((status_b & RTCSB_24HR) == 0 && tm.tm_hour >= 12) - *eax |= 0x80; - - return (0); - case RTC_WDAY: - *eax = rtcout(tm.tm_wday + 1); - return (0); - case RTC_DAY: - *eax = rtcout(tm.tm_mday); - return (0); - case RTC_MONTH: - *eax = rtcout(tm.tm_mon + 1); - return (0); - case RTC_YEAR: - *eax = rtcout(tm.tm_year % 100); - return (0); - case RTC_STATUSA: - *eax = status_a; - return (0); - case RTC_STATUSB: - *eax = status_b; - return (0); - case RTC_INTR: - *eax = 0; - return (0); - case RTC_STATUSD: - *eax = RTCSD_PWR; - return (0); - case RTC_NVRAM_START ... 
RTC_NVRAM_END: - *eax = rtc_nvram[addr - RTC_NVRAM_START]; - return (0); - default: - return (-1); - } - } - - switch (addr) { - case RTC_STATUSA: - status_a = *eax & ~RTCSA_TUP; - break; - case RTC_STATUSB: - /* XXX not implemented yet XXX */ - if (*eax & RTCSB_PINTR) - return (-1); - status_b = *eax; - break; - case RTC_STATUSD: - /* ignore write */ - break; - case RTC_SEC_ALARM: - rtc_alarm.secs = *eax; - break; - case RTC_MIN_ALARM: - rtc_alarm.mins = *eax; - break; - case RTC_HRS_ALARM: - rtc_alarm.hours = *eax; - break; - case RTC_SEC: - case RTC_MIN: - case RTC_HRS: - case RTC_WDAY: - case RTC_DAY: - case RTC_MONTH: - case RTC_YEAR: - /* - * Ignore writes to the time of day registers - */ - break; - case RTC_NVRAM_START ... RTC_NVRAM_END: - rtc_nvram[addr - RTC_NVRAM_START] = *eax; - break; - default: - return (-1); + t = timegm(&tm); } - return (0); + return (t); } void -rtc_init(struct vmctx *ctx) +rtc_init(struct vmctx *ctx, int use_localtime) { - struct timeval cur; - struct tm tm; size_t himem; size_t lomem; int err; - err = gettimeofday(&cur, NULL); - assert(err == 0); - (void) localtime_r(&cur.tv_sec, &tm); - - memset(rtc_nvram, 0, sizeof(rtc_nvram)); - - rtc_nvram[nvoff(RTC_CENTURY)] = bin2bcd((tm.tm_year + 1900) / 100); - /* XXX init diag/reset code/equipment/checksum ? */ /* @@ -344,19 +88,23 @@ rtc_init(struct vmctx *ctx) * 0x5b/0x5c/0x5d - 64KB chunks above 4GB */ lomem = (vm_get_lowmem_size(ctx) - m_16MB) / m_64KB; - rtc_nvram[nvoff(RTC_LMEM_LSB)] = lomem; - rtc_nvram[nvoff(RTC_LMEM_MSB)] = lomem >> 8; + err = vm_rtc_write(ctx, RTC_LMEM_LSB, lomem); + assert(err == 0); + err = vm_rtc_write(ctx, RTC_LMEM_MSB, lomem >> 8); + assert(err == 0); himem = vm_get_highmem_size(ctx) / m_64KB; - rtc_nvram[nvoff(RTC_HMEM_LSB)] = himem; - rtc_nvram[nvoff(RTC_HMEM_SB)] = himem >> 8; - rtc_nvram[nvoff(RTC_HMEM_MSB)] = himem >> 16; -} + err = vm_rtc_write(ctx, RTC_HMEM_LSB, himem); + assert(err == 0); + err = vm_rtc_write(ctx, RTC_HMEM_SB, himem >> 8); + assert(err == 0); + err = vm_rtc_write(ctx, RTC_HMEM_MSB, himem >> 16); + assert(err == 0); -INOUT_PORT(rtc, IO_RTC, IOPORT_F_INOUT, rtc_addr_handler); -INOUT_PORT(rtc, IO_RTC + 1, IOPORT_F_INOUT, rtc_data_handler); + err = vm_rtc_settime(ctx, rtc_time(ctx, use_localtime)); + assert(err == 0); +} -#ifdef __FreeBSD__ static void rtc_dsdt(void) { @@ -375,6 +123,9 @@ rtc_dsdt(void) dsdt_line("}"); } LPC_DSDT(rtc_dsdt); -#endif +/* + * Reserve the extended RTC I/O ports although they are not emulated at this + * time. + */ SYSRES_IO(0x72, 6); diff --git a/usr/src/cmd/bhyve/rtc.h b/usr/src/cmd/bhyve/rtc.h index 6406d24c37..1c108eed99 100644 --- a/usr/src/cmd/bhyve/rtc.h +++ b/usr/src/cmd/bhyve/rtc.h @@ -1,4 +1,6 @@ -/* +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Peter Grehan * All rights reserved. * @@ -23,12 +25,12 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/rtc.h 253181 2013-07-11 03:54:35Z grehan $ + * $FreeBSD$ */ #ifndef _RTC_H_ #define _RTC_H_ -void rtc_init(struct vmctx *ctx); +void rtc_init(struct vmctx *ctx, int use_localtime); #endif /* _RTC_H_ */ diff --git a/usr/src/cmd/bhyve/smbiostbl.c b/usr/src/cmd/bhyve/smbiostbl.c index 7ba0f0dfa0..da227f813a 100644 --- a/usr/src/cmd/bhyve/smbiostbl.c +++ b/usr/src/cmd/bhyve/smbiostbl.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Tycho Nightingale * All rights reserved. 
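The use_localtime path in rtc_time() deserves a note: localtime_r() breaks the host clock down into local wall-clock fields, and timegm() then re-encodes those fields as if they were UTC, so the seconds value handed to vm_rtc_settime() makes the guest RTC display local time. A minimal sketch of that conversion (assuming timegm(), a common BSD/GNU libc extension, is available):

    #include <stdio.h>
    #include <time.h>

    /* Shift a UTC timestamp so a UTC-interpreting reader sees local time. */
    static time_t
    rtc_local_time(time_t t)
    {
    	struct tm tm;

    	(void) localtime_r(&t, &tm);	/* local broken-down fields */
    	return (timegm(&tm));		/* re-encode those fields as UTC */
    }

    int
    main(void)
    {
    	time_t now = time(NULL);

    	(void) printf("UTC secs: %ld\n", (long)now);
    	(void) printf("RTC secs: %ld\n", (long)rtc_local_time(now));
    	return (0);
    }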
* @@ -25,7 +27,7 @@ */ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/smbiostbl.c 272007 2014-09-23 01:17:22Z grehan $"); +__FBSDID("$FreeBSD$"); #include @@ -33,6 +35,7 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/smbiostbl.c 272007 2014-09-23 01:17:22Z #include #include #include +#include #include #include #include @@ -321,8 +324,8 @@ struct smbios_table_type0 smbios_type0_template = { const char *smbios_type0_strings[] = { "BHYVE", /* vendor string */ - __TIME__, /* bios version string */ - __DATE__, /* bios release date string */ + "1.00", /* bios version string */ + "03/14/2014", /* bios release date string */ NULL }; @@ -634,7 +637,7 @@ smbios_type4_initializer(struct smbios_structure *template_entry, { int i; - for (i = 0; i < guest_ncpus; i++) { + for (i = 0; i < sockets; i++) { struct smbios_table_type4 *type4; char *p; int nstrings, len; @@ -653,6 +656,16 @@ smbios_type4_initializer(struct smbios_structure *template_entry, *(*endaddr) = '\0'; (*endaddr)++; type4->socket = nstrings + 1; + /* Revise cores and threads after update to smbios 3.0 */ + if (cores > 254) + type4->cores = 0; + else + type4->cores = cores; + /* This threads is total threads in a socket */ + if ((cores * threads) > 254) + type4->threads = 0; + else + type4->threads = (cores * threads); curaddr = *endaddr; } @@ -825,3 +838,80 @@ smbios_build(struct vmctx *ctx) return (0); } + +int +smbios_parse(const char *opts) +{ + char *buf; + char *lasts; + char *token; + char *end; + long type; + struct { + const char *key; + const char **targetp; + } type1_map[] = { + { "manufacturer", &smbios_type1_strings[0] }, + { "product", &smbios_type1_strings[1] }, + { "version", &smbios_type1_strings[2] }, + { "serial", &smbios_type1_strings[3] }, + { "sku", &smbios_type1_strings[4] }, + { "family", &smbios_type1_strings[5] }, + { "uuid", (const char **)&guest_uuid_str }, + { 0 } + }; + + if ((buf = strdup(opts)) == NULL) { + (void) fprintf(stderr, "out of memory\n"); + return (-1); + } + + if ((token = strtok_r(buf, ",", &lasts)) == NULL) { + (void) fprintf(stderr, "too few fields\n"); + goto fail; + } + + errno = 0; + type = strtol(token, &end, 10); + if (errno != 0 || *end != '\0') { + (void) fprintf(stderr, "first token '%s' is not an integer\n", + token); + goto fail; + } + + /* For now, only type 1 is supported. */ + if (type != 1) { + (void) fprintf(stderr, "unsupported type %d\n", type); + goto fail; + } + + while ((token = strtok_r(NULL, ",", &lasts)) != NULL) { + char *val; + int i; + + if ((val = strchr(token, '=')) == NULL) { + (void) fprintf(stderr, "invalid key=value: '%s'\n", + token); + goto fail; + } + *val = '\0'; + val++; + + for (i = 0; type1_map[i].key != NULL; i++) { + if (strcmp(token, type1_map[i].key) == 0) { + break; + } + } + if (type1_map[i].key == NULL) { + (void) fprintf(stderr, "invalid key '%s'\n", token); + goto fail; + } + *type1_map[i].targetp = val; + } + + return (0); + +fail: + free(buf); + return (-1); +} diff --git a/usr/src/cmd/bhyve/smbiostbl.h b/usr/src/cmd/bhyve/smbiostbl.h index fd7f86be80..81e26309e5 100644 --- a/usr/src/cmd/bhyve/smbiostbl.h +++ b/usr/src/cmd/bhyve/smbiostbl.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Tycho Nightingale * All rights reserved. * @@ -23,7 +25,11 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/smbiostbl.h 262744 2014-03-04 17:12:06Z tychon $ + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. 
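smbios_parse() accepts a comma-separated option string: the first token is the numeric SMBIOS table type (only type 1 is accepted here), and each following token is a key=value pair looked up in type1_map. A hypothetical caller, with illustrative values only:

    /* Hypothetical option string; keys must match type1_map above. */
    static int
    set_smbios_opts(void)
    {
    	const char *opts = "1,manufacturer=Acme,product=bhyve,serial=42";

    	/* returns 0 on success, -1 on a parse error */
    	return (smbios_parse(opts));
    }

Note that on success the strdup()'d buffer is deliberately left allocated, since the type1_map targets point into it.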
*/ #ifndef _SMBIOSTBL_H_ @@ -32,5 +38,6 @@ struct vmctx; int smbios_build(struct vmctx *ctx); +int smbios_parse(const char *opts); #endif /* _SMBIOSTBL_H_ */ diff --git a/usr/src/cmd/bhyve/sockstream.c b/usr/src/cmd/bhyve/sockstream.c new file mode 100644 index 0000000000..b592bce9aa --- /dev/null +++ b/usr/src/cmd/bhyve/sockstream.c @@ -0,0 +1,86 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Nahanni Systems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include + +#include "sockstream.h" + +ssize_t +stream_read(int fd, void *buf, ssize_t nbytes) +{ + uint8_t *p; + ssize_t len = 0; + ssize_t n; + + p = buf; + + while (len < nbytes) { + n = read(fd, p + len, nbytes - len); + if (n == 0) + break; + + if (n < 0) { + if (errno == EINTR || errno == EAGAIN) + continue; + return (n); + } + len += n; + } + return (len); +} + +ssize_t +stream_write(int fd, const void *buf, ssize_t nbytes) +{ + const uint8_t *p; + ssize_t len = 0; + ssize_t n; + + p = buf; + + while (len < nbytes) { + n = write(fd, p + len, nbytes - len); + if (n == 0) + break; + if (n < 0) { + if (errno == EINTR || errno == EAGAIN) + continue; + return (n); + } + len += n; + } + return (len); +} diff --git a/usr/src/cmd/bhyve/sockstream.h b/usr/src/cmd/bhyve/sockstream.h new file mode 100644 index 0000000000..ecea849471 --- /dev/null +++ b/usr/src/cmd/bhyve/sockstream.h @@ -0,0 +1,35 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Nahanni Systems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
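The two sockstream helpers exist because read(2) and write(2) may transfer fewer bytes than requested; stream_read() and stream_write() loop until the full count has moved, retry on EINTR/EAGAIN, and stop early only at end-of-stream, so a short return can be treated as EOF. A caller-side sketch for fixed-size records:

    #include <unistd.h>
    #include "sockstream.h"

    struct rec {
    	int op;
    	int len;
    };

    /* Returns 0 on a full record, 1 on EOF mid-stream, -1 on error. */
    static int
    recv_rec(int fd, struct rec *r)
    {
    	ssize_t n = stream_read(fd, r, sizeof (*r));

    	if (n < 0)
    		return (-1);
    	return (n == (ssize_t)sizeof (*r) ? 0 : 1);
    }

One design consequence: because the loop continues on EAGAIN, a non-blocking descriptor will spin rather than return, so these helpers are intended for blocking sockets such as the RFB connection.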
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include + +ssize_t stream_read(int fd, void *buf, ssize_t nbytes); +ssize_t stream_write(int fd, const void *buf, ssize_t nbytes); diff --git a/usr/src/cmd/bhyve/spinup_ap.c b/usr/src/cmd/bhyve/spinup_ap.c index e1dd562d3f..7c4186f5ed 100644 --- a/usr/src/cmd/bhyve/spinup_ap.c +++ b/usr/src/cmd/bhyve/spinup_ap.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * @@ -23,11 +25,11 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/spinup_ap.c 263432 2014-03-20 18:15:37Z neel $ + * $FreeBSD$ */ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/spinup_ap.c 263432 2014-03-20 18:15:37Z neel $"); +__FBSDID("$FreeBSD$"); #include #include diff --git a/usr/src/cmd/bhyve/spinup_ap.h b/usr/src/cmd/bhyve/spinup_ap.h index 090de091ba..226542f6c3 100644 --- a/usr/src/cmd/bhyve/spinup_ap.h +++ b/usr/src/cmd/bhyve/spinup_ap.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/spinup_ap.h 240912 2012-09-25 02:33:25Z neel $ + * $FreeBSD$ */ #ifndef _SPINUP_AP_H_ diff --git a/usr/src/cmd/bhyve/task_switch.c b/usr/src/cmd/bhyve/task_switch.c new file mode 100644 index 0000000000..b5950a19d8 --- /dev/null +++ b/usr/src/cmd/bhyve/task_switch.c @@ -0,0 +1,941 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Neel Natu + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include "bhyverun.h" + +/* + * Using 'struct i386tss' is tempting but causes myriad sign extension + * issues because all of its fields are defined as signed integers. + */ +struct tss32 { + uint16_t tss_link; + uint16_t rsvd1; + uint32_t tss_esp0; + uint16_t tss_ss0; + uint16_t rsvd2; + uint32_t tss_esp1; + uint16_t tss_ss1; + uint16_t rsvd3; + uint32_t tss_esp2; + uint16_t tss_ss2; + uint16_t rsvd4; + uint32_t tss_cr3; + uint32_t tss_eip; + uint32_t tss_eflags; + uint32_t tss_eax; + uint32_t tss_ecx; + uint32_t tss_edx; + uint32_t tss_ebx; + uint32_t tss_esp; + uint32_t tss_ebp; + uint32_t tss_esi; + uint32_t tss_edi; + uint16_t tss_es; + uint16_t rsvd5; + uint16_t tss_cs; + uint16_t rsvd6; + uint16_t tss_ss; + uint16_t rsvd7; + uint16_t tss_ds; + uint16_t rsvd8; + uint16_t tss_fs; + uint16_t rsvd9; + uint16_t tss_gs; + uint16_t rsvd10; + uint16_t tss_ldt; + uint16_t rsvd11; + uint16_t tss_trap; + uint16_t tss_iomap; +}; +static_assert(sizeof(struct tss32) == 104, "compile-time assertion failed"); + +#define SEL_START(sel) (((sel) & ~0x7)) +#define SEL_LIMIT(sel) (((sel) | 0x7)) +#define TSS_BUSY(type) (((type) & 0x2) != 0) + +static uint64_t +GETREG(struct vmctx *ctx, int vcpu, int reg) +{ + uint64_t val; + int error; + + error = vm_get_register(ctx, vcpu, reg, &val); + assert(error == 0); + return (val); +} + +static void +SETREG(struct vmctx *ctx, int vcpu, int reg, uint64_t val) +{ + int error; + + error = vm_set_register(ctx, vcpu, reg, val); + assert(error == 0); +} + +static struct seg_desc +usd_to_seg_desc(struct user_segment_descriptor *usd) +{ + struct seg_desc seg_desc; + + seg_desc.base = (u_int)USD_GETBASE(usd); + if (usd->sd_gran) + seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff; + else + seg_desc.limit = (u_int)USD_GETLIMIT(usd); + seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7; + seg_desc.access |= usd->sd_xx << 12; + seg_desc.access |= usd->sd_def32 << 14; + seg_desc.access |= usd->sd_gran << 15; + + return (seg_desc); +} + +/* + * Inject an exception with an error code that is a segment selector. + * The format of the error code is described in section 6.13, "Error Code", + * Intel SDM volume 3. + * + * Bit 0 (EXT) denotes whether the exception occurred during delivery + * of an external event like an interrupt. + * + * Bit 1 (IDT) indicates whether the selector points to a gate descriptor + * in the IDT. + * + * Bit 2(GDT/LDT) has the usual interpretation of Table Indicator (TI). + */ +static void +sel_exception(struct vmctx *ctx, int vcpu, int vector, uint16_t sel, int ext) +{ + /* + * Bit 2 from the selector is retained as-is in the error code. + * + * Bit 1 can be safely cleared because none of the selectors + * encountered during task switch emulation refer to a task + * gate in the IDT. 
+ * + * Bit 0 is set depending on the value of 'ext'. + */ + sel &= ~0x3; + if (ext) + sel |= 0x1; + vm_inject_fault(ctx, vcpu, vector, 1, sel); +} + +/* + * Return 0 if the selector 'sel' in within the limits of the GDT/LDT + * and non-zero otherwise. + */ +static int +desc_table_limit_check(struct vmctx *ctx, int vcpu, uint16_t sel) +{ + uint64_t base; + uint32_t limit, access; + int error, reg; + + reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR; + error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access); + assert(error == 0); + + if (reg == VM_REG_GUEST_LDTR) { + if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access)) + return (-1); + } + + if (limit < SEL_LIMIT(sel)) + return (-1); + else + return (0); +} + +/* + * Read/write the segment descriptor 'desc' into the GDT/LDT slot referenced + * by the selector 'sel'. + * + * Returns 0 on success. + * Returns 1 if an exception was injected into the guest. + * Returns -1 otherwise. + */ +static int +desc_table_rw(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + uint16_t sel, struct user_segment_descriptor *desc, bool doread, + int *faultptr) +{ + struct iovec iov[2]; + uint64_t base; + uint32_t limit, access; + int error, reg; + + reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR; + error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access); + assert(error == 0); + assert(limit >= SEL_LIMIT(sel)); + + error = vm_copy_setup(ctx, vcpu, paging, base + SEL_START(sel), + sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov), + faultptr); + if (error || *faultptr) + return (error); + + if (doread) + vm_copyin(ctx, vcpu, iov, desc, sizeof(*desc)); + else + vm_copyout(ctx, vcpu, desc, iov, sizeof(*desc)); + return (0); +} + +static int +desc_table_read(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + uint16_t sel, struct user_segment_descriptor *desc, int *faultptr) +{ + return (desc_table_rw(ctx, vcpu, paging, sel, desc, true, faultptr)); +} + +static int +desc_table_write(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + uint16_t sel, struct user_segment_descriptor *desc, int *faultptr) +{ + return (desc_table_rw(ctx, vcpu, paging, sel, desc, false, faultptr)); +} + +/* + * Read the TSS descriptor referenced by 'sel' into 'desc'. + * + * Returns 0 on success. + * Returns 1 if an exception was injected into the guest. + * Returns -1 otherwise. 
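To make the selector arithmetic concrete: a selector carries RPL in bits 1:0, TI in bit 2 and the table index in bits 15:3, so a descriptor occupies the 8 bytes starting at sel & ~7 and a table must have limit >= (sel | 7) to contain it. The error-code format from SDM Vol 3 section 6.13 reuses the low bits for EXT and IDT, which is why sel_exception() masks them first. A sketch of both in plain bit math (the IDT bit is parameterized here for completeness; the emulation above always clears it):

    #include <stdint.h>
    #include <stdbool.h>

    /* Selector error code: bit 0 = EXT, bit 1 = IDT, bit 2 = TI. */
    static uint16_t
    sel_errcode(uint16_t sel, bool ext, bool idt)
    {
    	return ((sel & ~0x3) | (idt ? 0x2 : 0) | (ext ? 0x1 : 0));
    }

    /* Can a table with this limit hold the 8-byte descriptor for sel? */
    static bool
    sel_fits(uint16_t sel, uint32_t limit)
    {
    	return (limit >= (uint32_t)(sel | 0x7));
    }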
+ */ +static int +read_tss_descriptor(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts, + uint16_t sel, struct user_segment_descriptor *desc, int *faultptr) +{ + struct vm_guest_paging sup_paging; + int error; + + assert(!ISLDT(sel)); + assert(IDXSEL(sel) != 0); + + /* Fetch the new TSS descriptor */ + if (desc_table_limit_check(ctx, vcpu, sel)) { + if (ts->reason == TSR_IRET) + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + else + sel_exception(ctx, vcpu, IDT_GP, sel, ts->ext); + return (1); + } + + sup_paging = ts->paging; + sup_paging.cpl = 0; /* implicit supervisor mode */ + error = desc_table_read(ctx, vcpu, &sup_paging, sel, desc, faultptr); + return (error); +} + +static bool +code_desc(int sd_type) +{ + /* code descriptor */ + return ((sd_type & 0x18) == 0x18); +} + +static bool +stack_desc(int sd_type) +{ + /* writable data descriptor */ + return ((sd_type & 0x1A) == 0x12); +} + +static bool +data_desc(int sd_type) +{ + /* data descriptor or a readable code descriptor */ + return ((sd_type & 0x18) == 0x10 || (sd_type & 0x1A) == 0x1A); +} + +static bool +ldt_desc(int sd_type) +{ + + return (sd_type == SDT_SYSLDT); +} + +/* + * Validate the descriptor 'seg_desc' associated with 'segment'. + */ +static int +validate_seg_desc(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts, + int segment, struct seg_desc *seg_desc, int *faultptr) +{ + struct vm_guest_paging sup_paging; + struct user_segment_descriptor usd; + int error, idtvec; + int cpl, dpl, rpl; + uint16_t sel, cs; + bool ldtseg, codeseg, stackseg, dataseg, conforming; + + ldtseg = codeseg = stackseg = dataseg = false; + switch (segment) { + case VM_REG_GUEST_LDTR: + ldtseg = true; + break; + case VM_REG_GUEST_CS: + codeseg = true; + break; + case VM_REG_GUEST_SS: + stackseg = true; + break; + case VM_REG_GUEST_DS: + case VM_REG_GUEST_ES: + case VM_REG_GUEST_FS: + case VM_REG_GUEST_GS: + dataseg = true; + break; + default: + assert(0); + } + + /* Get the segment selector */ + sel = GETREG(ctx, vcpu, segment); + + /* LDT selector must point into the GDT */ + if (ldtseg && ISLDT(sel)) { + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + return (1); + } + + /* Descriptor table limit check */ + if (desc_table_limit_check(ctx, vcpu, sel)) { + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + return (1); + } + + /* NULL selector */ + if (IDXSEL(sel) == 0) { + /* Code and stack segment selectors cannot be NULL */ + if (codeseg || stackseg) { + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + return (1); + } + seg_desc->base = 0; + seg_desc->limit = 0; + seg_desc->access = 0x10000; /* unusable */ + return (0); + } + + /* Read the descriptor from the GDT/LDT */ + sup_paging = ts->paging; + sup_paging.cpl = 0; /* implicit supervisor mode */ + error = desc_table_read(ctx, vcpu, &sup_paging, sel, &usd, faultptr); + if (error || *faultptr) + return (error); + + /* Verify that the descriptor type is compatible with the segment */ + if ((ldtseg && !ldt_desc(usd.sd_type)) || + (codeseg && !code_desc(usd.sd_type)) || + (dataseg && !data_desc(usd.sd_type)) || + (stackseg && !stack_desc(usd.sd_type))) { + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + return (1); + } + + /* Segment must be marked present */ + if (!usd.sd_p) { + if (ldtseg) + idtvec = IDT_TS; + else if (stackseg) + idtvec = IDT_SS; + else + idtvec = IDT_NP; + sel_exception(ctx, vcpu, idtvec, sel, ts->ext); + return (1); + } + + cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS); + cpl = cs & SEL_RPL_MASK; + rpl = sel & SEL_RPL_MASK; + dpl = usd.sd_dpl; + + if (stackseg && 
(rpl != cpl || dpl != cpl)) { + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + return (1); + } + + if (codeseg) { + conforming = (usd.sd_type & 0x4) ? true : false; + if ((conforming && (cpl < dpl)) || + (!conforming && (cpl != dpl))) { + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + return (1); + } + } + + if (dataseg) { + /* + * A data segment is always non-conforming except when it's + * descriptor is a readable, conforming code segment. + */ + if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0) + conforming = true; + else + conforming = false; + + if (!conforming && (rpl > dpl || cpl > dpl)) { + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + return (1); + } + } + *seg_desc = usd_to_seg_desc(&usd); + return (0); +} + +static void +tss32_save(struct vmctx *ctx, int vcpu, struct vm_task_switch *task_switch, + uint32_t eip, struct tss32 *tss, struct iovec *iov) +{ + + /* General purpose registers */ + tss->tss_eax = GETREG(ctx, vcpu, VM_REG_GUEST_RAX); + tss->tss_ecx = GETREG(ctx, vcpu, VM_REG_GUEST_RCX); + tss->tss_edx = GETREG(ctx, vcpu, VM_REG_GUEST_RDX); + tss->tss_ebx = GETREG(ctx, vcpu, VM_REG_GUEST_RBX); + tss->tss_esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP); + tss->tss_ebp = GETREG(ctx, vcpu, VM_REG_GUEST_RBP); + tss->tss_esi = GETREG(ctx, vcpu, VM_REG_GUEST_RSI); + tss->tss_edi = GETREG(ctx, vcpu, VM_REG_GUEST_RDI); + + /* Segment selectors */ + tss->tss_es = GETREG(ctx, vcpu, VM_REG_GUEST_ES); + tss->tss_cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS); + tss->tss_ss = GETREG(ctx, vcpu, VM_REG_GUEST_SS); + tss->tss_ds = GETREG(ctx, vcpu, VM_REG_GUEST_DS); + tss->tss_fs = GETREG(ctx, vcpu, VM_REG_GUEST_FS); + tss->tss_gs = GETREG(ctx, vcpu, VM_REG_GUEST_GS); + + /* eflags and eip */ + tss->tss_eflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS); + if (task_switch->reason == TSR_IRET) + tss->tss_eflags &= ~PSL_NT; + tss->tss_eip = eip; + + /* Copy updated old TSS into guest memory */ + vm_copyout(ctx, vcpu, tss, iov, sizeof(struct tss32)); +} + +static void +update_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *sd) +{ + int error; + + error = vm_set_desc(ctx, vcpu, reg, sd->base, sd->limit, sd->access); + assert(error == 0); +} + +/* + * Update the vcpu registers to reflect the state of the new task. + */ +static int +tss32_restore(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts, + uint16_t ot_sel, struct tss32 *tss, struct iovec *iov, int *faultptr) +{ + struct seg_desc seg_desc, seg_desc2; + uint64_t *pdpte, maxphyaddr, reserved; + uint32_t eflags; + int error, i; + bool nested; + + nested = false; + if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) { + tss->tss_link = ot_sel; + nested = true; + } + + eflags = tss->tss_eflags; + if (nested) + eflags |= PSL_NT; + + /* LDTR */ + SETREG(ctx, vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt); + + /* PBDR */ + if (ts->paging.paging_mode != PAGING_MODE_FLAT) { + if (ts->paging.paging_mode == PAGING_MODE_PAE) { + /* + * XXX Assuming 36-bit MAXPHYADDR. + */ + maxphyaddr = (1UL << 36) - 1; + pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32); + for (i = 0; i < 4; i++) { + /* Check reserved bits if the PDPTE is valid */ + if (!(pdpte[i] & 0x1)) + continue; + /* + * Bits 2:1, 8:5 and bits above the processor's + * maximum physical address are reserved. 
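The privilege checks in validate_seg_desc() condense to three rules from the SDM's task-switch checks: the stack segment requires RPL == CPL == DPL; a code segment requires DPL == CPL when non-conforming and DPL <= CPL when conforming; a data segment requires both RPL and CPL to be numerically no greater than DPL unless the descriptor is conforming. Restated as predicates (a sketch; 'conforming' is bit 2 of sd_type, as tested above):

    #include <stdbool.h>

    static bool
    ss_priv_ok(int cpl, int rpl, int dpl)
    {
    	return (rpl == cpl && dpl == cpl);
    }

    static bool
    cs_priv_ok(int cpl, int dpl, bool conforming)
    {
    	return (conforming ? cpl >= dpl : cpl == dpl);
    }

    static bool
    data_priv_ok(int cpl, int rpl, int dpl, bool conforming)
    {
    	return (conforming || (rpl <= dpl && cpl <= dpl));
    }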
+ */ + reserved = ~maxphyaddr | 0x1E6; + if (pdpte[i] & reserved) { + vm_inject_gp(ctx, vcpu); + return (1); + } + } + SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]); + SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]); + SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]); + SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]); + } + SETREG(ctx, vcpu, VM_REG_GUEST_CR3, tss->tss_cr3); + ts->paging.cr3 = tss->tss_cr3; + } + + /* eflags and eip */ + SETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS, eflags); + SETREG(ctx, vcpu, VM_REG_GUEST_RIP, tss->tss_eip); + + /* General purpose registers */ + SETREG(ctx, vcpu, VM_REG_GUEST_RAX, tss->tss_eax); + SETREG(ctx, vcpu, VM_REG_GUEST_RCX, tss->tss_ecx); + SETREG(ctx, vcpu, VM_REG_GUEST_RDX, tss->tss_edx); + SETREG(ctx, vcpu, VM_REG_GUEST_RBX, tss->tss_ebx); + SETREG(ctx, vcpu, VM_REG_GUEST_RSP, tss->tss_esp); + SETREG(ctx, vcpu, VM_REG_GUEST_RBP, tss->tss_ebp); + SETREG(ctx, vcpu, VM_REG_GUEST_RSI, tss->tss_esi); + SETREG(ctx, vcpu, VM_REG_GUEST_RDI, tss->tss_edi); + + /* Segment selectors */ + SETREG(ctx, vcpu, VM_REG_GUEST_ES, tss->tss_es); + SETREG(ctx, vcpu, VM_REG_GUEST_CS, tss->tss_cs); + SETREG(ctx, vcpu, VM_REG_GUEST_SS, tss->tss_ss); + SETREG(ctx, vcpu, VM_REG_GUEST_DS, tss->tss_ds); + SETREG(ctx, vcpu, VM_REG_GUEST_FS, tss->tss_fs); + SETREG(ctx, vcpu, VM_REG_GUEST_GS, tss->tss_gs); + + /* + * If this is a nested task then write out the new TSS to update + * the previous link field. + */ + if (nested) + vm_copyout(ctx, vcpu, tss, iov, sizeof(*tss)); + + /* Validate segment descriptors */ + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc, + faultptr); + if (error || *faultptr) + return (error); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_LDTR, &seg_desc); + + /* + * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3. + * + * The SS and CS attribute checks on VM-entry are inter-dependent so + * we need to make sure that both segments are valid before updating + * either of them. This ensures that the VMCS state can pass the + * VM-entry checks so the guest can handle any exception injected + * during task switch emulation. + */ + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_CS, &seg_desc, + faultptr); + if (error || *faultptr) + return (error); + + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_SS, &seg_desc2, + faultptr); + if (error || *faultptr) + return (error); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_CS, &seg_desc); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc2); + ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK; + + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_DS, &seg_desc, + faultptr); + if (error || *faultptr) + return (error); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_DS, &seg_desc); + + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_ES, &seg_desc, + faultptr); + if (error || *faultptr) + return (error); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_ES, &seg_desc); + + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_FS, &seg_desc, + faultptr); + if (error || *faultptr) + return (error); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_FS, &seg_desc); + + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_GS, &seg_desc, + faultptr); + if (error || *faultptr) + return (error); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_GS, &seg_desc); + + return (0); +} + +/* + * Push an error code on the stack of the new task. This is needed if the + * task switch was triggered by a hardware exception that causes an error + * code to be saved (e.g. #PF). 
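The PDPTE mask used in tss32_restore() packs the PAE rules into one constant: 0x1E6 covers the low reserved bits 2:1 and 8:5, and ~maxphyaddr covers everything at or above the assumed 36-bit MAXPHYADDR (real CPUs report the actual width via CPUID, which the XXX comment acknowledges). A standalone restatement:

    #include <stdint.h>
    #include <stdbool.h>

    #define	PDPTE_P		0x1ULL		/* present bit */
    #define	PDPTE_RSVD_LOW	0x1E6ULL	/* bits 2:1 and 8:5 */

    static bool
    pdpte_ok(uint64_t pdpte, int maxphyaddr_bits)
    {
    	uint64_t maxphyaddr = (1ULL << maxphyaddr_bits) - 1;
    	uint64_t reserved = ~maxphyaddr | PDPTE_RSVD_LOW;

    	if ((pdpte & PDPTE_P) == 0)
    		return (true);		/* non-present entries are ignored */
    	return ((pdpte & reserved) == 0);
    }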
+ */ +static int +push_errcode(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + int task_type, uint32_t errcode, int *faultptr) +{ + struct iovec iov[2]; + struct seg_desc seg_desc; + int stacksize, bytes, error; + uint64_t gla, cr0, rflags; + uint32_t esp; + uint16_t stacksel; + + *faultptr = 0; + + cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0); + rflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS); + stacksel = GETREG(ctx, vcpu, VM_REG_GUEST_SS); + + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc.base, + &seg_desc.limit, &seg_desc.access); + assert(error == 0); + + /* + * Section "Error Code" in the Intel SDM vol 3: the error code is + * pushed on the stack as a doubleword or word (depending on the + * default interrupt, trap or task gate size). + */ + if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS) + bytes = 4; + else + bytes = 2; + + /* + * PUSH instruction from Intel SDM vol 2: the 'B' flag in the + * stack-segment descriptor determines the size of the stack + * pointer outside of 64-bit mode. + */ + if (SEG_DESC_DEF32(seg_desc.access)) + stacksize = 4; + else + stacksize = 2; + + esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP); + esp -= bytes; + + if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, + &seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) { + sel_exception(ctx, vcpu, IDT_SS, stacksel, 1); + *faultptr = 1; + return (0); + } + + if (vie_alignment_check(paging->cpl, bytes, cr0, rflags, gla)) { + vm_inject_ac(ctx, vcpu, 1); + *faultptr = 1; + return (0); + } + + error = vm_copy_setup(ctx, vcpu, paging, gla, bytes, PROT_WRITE, + iov, nitems(iov), faultptr); + if (error || *faultptr) + return (error); + + vm_copyout(ctx, vcpu, &errcode, iov, bytes); + SETREG(ctx, vcpu, VM_REG_GUEST_RSP, esp); + return (0); +} + +/* + * Evaluate return value from helper functions and potentially return to + * the VM run loop. + */ +#define CHKERR(error,fault) \ + do { \ + assert((error == 0) || (error == EFAULT)); \ + if (error) \ + return (VMEXIT_ABORT); \ + else if (fault) \ + return (VMEXIT_CONTINUE); \ + } while (0) + +int +vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + struct seg_desc nt; + struct tss32 oldtss, newtss; + struct vm_task_switch *task_switch; + struct vm_guest_paging *paging, sup_paging; + struct user_segment_descriptor nt_desc, ot_desc; + struct iovec nt_iov[2], ot_iov[2]; + uint64_t cr0, ot_base; + uint32_t eip, ot_lim, access; + int error, ext, fault, minlimit, nt_type, ot_type, vcpu; + enum task_switch_reason reason; + uint16_t nt_sel, ot_sel; + + task_switch = &vmexit->u.task_switch; + nt_sel = task_switch->tsssel; + ext = vmexit->u.task_switch.ext; + reason = vmexit->u.task_switch.reason; + paging = &vmexit->u.task_switch.paging; + vcpu = *pvcpu; + + assert(paging->cpu_mode == CPU_MODE_PROTECTED); + + /* + * Calculate the instruction pointer to store in the old TSS. + */ + eip = vmexit->rip + vmexit->inst_length; + + /* + * Section 4.6, "Access Rights" in Intel SDM Vol 3. 
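push_errcode() sizes two things independently: the error code itself is a doubleword for 32-bit TSS types and a word for 16-bit ones, while the width of the stack-pointer update comes from the B (default-32) flag of the SS descriptor. The two decisions, condensed (a sketch reusing the same SDT_* constants and SEG_DESC_DEF32() macro the surrounding file already relies on):

    /* Error code width follows the task gate size (SDM Vol 3, "Error Code"). */
    static int
    errcode_bytes(int task_type)
    {
    	return ((task_type == SDT_SYS386BSY ||
    	    task_type == SDT_SYS386TSS) ? 4 : 2);
    }

    /* Stack pointer width follows the B flag in the SS descriptor. */
    static int
    stackptr_bytes(uint32_t ss_access)
    {
    	return (SEG_DESC_DEF32(ss_access) ? 4 : 2);
    }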
+ * The following page table accesses are implicitly supervisor mode: + * - accesses to GDT or LDT to load segment descriptors + * - accesses to the task state segment during task switch + */ + sup_paging = *paging; + sup_paging.cpl = 0; /* implicit supervisor mode */ + + /* Fetch the new TSS descriptor */ + error = read_tss_descriptor(ctx, vcpu, task_switch, nt_sel, &nt_desc, + &fault); + CHKERR(error, fault); + + nt = usd_to_seg_desc(&nt_desc); + + /* Verify the type of the new TSS */ + nt_type = SEG_DESC_TYPE(nt.access); + if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS && + nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) { + sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext); + goto done; + } + + /* TSS descriptor must have present bit set */ + if (!SEG_DESC_PRESENT(nt.access)) { + sel_exception(ctx, vcpu, IDT_NP, nt_sel, ext); + goto done; + } + + /* + * TSS must have a minimum length of 104 bytes for a 32-bit TSS and + * 44 bytes for a 16-bit TSS. + */ + if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS) + minlimit = 104 - 1; + else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) + minlimit = 44 - 1; + else + minlimit = 0; + + assert(minlimit > 0); + if (nt.limit < minlimit) { + sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext); + goto done; + } + + /* TSS must be busy if task switch is due to IRET */ + if (reason == TSR_IRET && !TSS_BUSY(nt_type)) { + sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext); + goto done; + } + + /* + * TSS must be available (not busy) if task switch reason is + * CALL, JMP, exception or interrupt. + */ + if (reason != TSR_IRET && TSS_BUSY(nt_type)) { + sel_exception(ctx, vcpu, IDT_GP, nt_sel, ext); + goto done; + } + + /* Fetch the new TSS */ + error = vm_copy_setup(ctx, vcpu, &sup_paging, nt.base, minlimit + 1, + PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov), &fault); + CHKERR(error, fault); + vm_copyin(ctx, vcpu, nt_iov, &newtss, minlimit + 1); + + /* Get the old TSS selector from the guest's task register */ + ot_sel = GETREG(ctx, vcpu, VM_REG_GUEST_TR); + if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) { + /* + * This might happen if a task switch was attempted without + * ever loading the task register with LTR. In this case the + * TR would contain the values from power-on: + * (sel = 0, base = 0, limit = 0xffff). + */ + sel_exception(ctx, vcpu, IDT_TS, ot_sel, task_switch->ext); + goto done; + } + + /* Get the old TSS base and limit from the guest's task register */ + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim, + &access); + assert(error == 0); + assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access)); + ot_type = SEG_DESC_TYPE(access); + assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY); + + /* Fetch the old TSS descriptor */ + error = read_tss_descriptor(ctx, vcpu, task_switch, ot_sel, &ot_desc, + &fault); + CHKERR(error, fault); + + /* Get the old TSS */ + error = vm_copy_setup(ctx, vcpu, &sup_paging, ot_base, minlimit + 1, + PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov), &fault); + CHKERR(error, fault); + vm_copyin(ctx, vcpu, ot_iov, &oldtss, minlimit + 1); + + /* + * Clear the busy bit in the old TSS descriptor if the task switch + * due to an IRET or JMP instruction. 
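Before the switch may proceed, the new TSS descriptor must pass four gates: a TSS type, the present bit, a minimum limit (104 bytes for a 32-bit TSS, 44 for a 16-bit one), and a busy bit that agrees with the switch reason: IRET returns into a busy task, while CALL, JMP and gate-driven switches demand an available one. The limit and busy rules, condensed into helpers (a sketch using the TSS_BUSY() macro defined earlier in the file and the SDT_* constants it already includes):

    #include <stdbool.h>

    /* Minimum descriptor limit by TSS type. */
    static int
    tss_min_limit(int nt_type)
    {
    	return ((nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS) ?
    	    104 - 1 : 44 - 1);
    }

    /* IRET must target a busy TSS; every other reason needs an available one. */
    static bool
    tss_busy_ok(bool is_iret, int nt_type)
    {
    	return (is_iret ? TSS_BUSY(nt_type) : !TSS_BUSY(nt_type));
    }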
+ */ + if (reason == TSR_IRET || reason == TSR_JMP) { + ot_desc.sd_type &= ~0x2; + error = desc_table_write(ctx, vcpu, &sup_paging, ot_sel, + &ot_desc, &fault); + CHKERR(error, fault); + } + + if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) { + fprintf(stderr, "Task switch to 16-bit TSS not supported\n"); + return (VMEXIT_ABORT); + } + + /* Save processor state in old TSS */ + tss32_save(ctx, vcpu, task_switch, eip, &oldtss, ot_iov); + + /* + * If the task switch was triggered for any reason other than IRET + * then set the busy bit in the new TSS descriptor. + */ + if (reason != TSR_IRET) { + nt_desc.sd_type |= 0x2; + error = desc_table_write(ctx, vcpu, &sup_paging, nt_sel, + &nt_desc, &fault); + CHKERR(error, fault); + } + + /* Update task register to point at the new TSS */ + SETREG(ctx, vcpu, VM_REG_GUEST_TR, nt_sel); + + /* Update the hidden descriptor state of the task register */ + nt = usd_to_seg_desc(&nt_desc); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_TR, &nt); + + /* Set CR0.TS */ + cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0); + SETREG(ctx, vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS); + + /* + * We are now committed to the task switch. Any exceptions encountered + * after this point will be handled in the context of the new task and + * the saved instruction pointer will belong to the new task. + */ + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, newtss.tss_eip); + assert(error == 0); + + /* Load processor state from new TSS */ + error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov, + &fault); + CHKERR(error, fault); + + /* + * Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception + * caused an error code to be generated, this error code is copied + * to the stack of the new task. + */ + if (task_switch->errcode_valid) { + assert(task_switch->ext); + assert(task_switch->reason == TSR_IDT_GATE); + error = push_errcode(ctx, vcpu, &task_switch->paging, nt_type, + task_switch->errcode, &fault); + CHKERR(error, fault); + } + + /* + * Treatment of virtual-NMI blocking if NMI is delivered through + * a task gate. + * + * Section "Architectural State Before A VM Exit", Intel SDM, Vol3: + * If the virtual NMIs VM-execution control is 1, VM entry injects + * an NMI, and delivery of the NMI causes a task switch that causes + * a VM exit, virtual-NMI blocking is in effect before the VM exit + * commences. + * + * Thus, virtual-NMI blocking is in effect at the time of the task + * switch VM exit. + */ + + /* + * Treatment of virtual-NMI unblocking on IRET from NMI handler task. + * + * Section "Changes to Instruction Behavior in VMX Non-Root Operation" + * If "virtual NMIs" control is 1 IRET removes any virtual-NMI blocking. + * This unblocking of virtual-NMI occurs even if IRET causes a fault. + * + * Thus, virtual-NMI blocking is cleared at the time of the task switch + * VM exit. + */ + + /* + * If the task switch was triggered by an event delivered through + * the IDT then extinguish the pending event from the vcpu's + * exitintinfo. 
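The busy-bit updates here are the only state the emulation writes back into the guest's descriptor tables, and the transitions depend on the switch reason. As a quick reference (bit 1 of sd_type is the busy bit, so the available 386 TSS type 9 becomes busy type 11):

    #define	TSS_TYPE_SET_BUSY(t)	((t) | 0x2)
    #define	TSS_TYPE_CLR_BUSY(t)	((t) & ~0x2)

    /*
     * Transitions by reason (sketch):
     *   IRET:      clear old busy bit; new TSS was already busy, stays busy.
     *   JMP:       clear old busy bit; set new busy bit.
     *   CALL/gate: old stays busy (nested task); set new busy bit and
     *              record the old selector in the new TSS link field.
     */

The ordering mirrors hardware: the old descriptor is rewritten before the processor state is saved, and the new one is marked busy before TR is reloaded.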
+ */ + if (task_switch->reason == TSR_IDT_GATE) { + error = vm_set_intinfo(ctx, vcpu, 0); + assert(error == 0); + } + + /* + * XXX should inject debug exception if 'T' bit is 1 + */ +done: + return (VMEXIT_CONTINUE); +} diff --git a/usr/src/cmd/bhyve/test/Makefile b/usr/src/cmd/bhyve/test/Makefile new file mode 100644 index 0000000000..7dbee0c5f3 --- /dev/null +++ b/usr/src/cmd/bhyve/test/Makefile @@ -0,0 +1,18 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2018 Joyent, Inc. +# + +SUBDIRS = scripts tst + +include Makefile.subdirs diff --git a/usr/src/cmd/bhyve/test/Makefile.com b/usr/src/cmd/bhyve/test/Makefile.com new file mode 100644 index 0000000000..f5efacc510 --- /dev/null +++ b/usr/src/cmd/bhyve/test/Makefile.com @@ -0,0 +1,61 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. +# + +include $(SRC)/Makefile.master +include $(SRC)/cmd/Makefile.cmd +include $(SRC)/cmd/Makefile.cmd.64 + +# +# Force c99 for everything +# +CSTD= $(CSTD_GNU99) +C99MODE= -xc99=%all +C99LMODE= -Xc99=%all + +CFLAGS += $(CCVERBOSE) -_gcc=-Wimplicit-function-declaration \ + -_gcc=-Wno-parentheses +CFLAGS64 += $(CCVERBOSE) -_gcc=-Wimplicit-function-declaration \ + -_gcc=-Wno-parentheses +CPPFLAGS = -I$(SRC)/cmd/bhyve \ + -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd \ + -I$(CONTRIB)/freebsd/dev/usb/controller \ + -I$(CONTRIB)/freebsd/dev/mii \ + $(CPPFLAGS.master) \ + -I$(SRC)/uts/i86pc/io/vmm \ + -I$(SRC)/uts/common \ + -I$(SRC)/uts/i86pc \ + -I$(SRC)/lib/libdladm/common \ + -DWITHOUT_CAPSICUM +CPPFLAGS += -I$(COMPAT)/freebsd/amd64 -I$(CONTRIB)/freebsd/amd64 + +SMOFF += all_func_returns + +CLEANFILES += $(EXETESTS) +CLOBBERFILES += $(ROOTTESTS) + +# +# Install related definitions +# +ROOTOPTPKG = $(ROOT)/opt/bhyvetest +ROOTBIN = $(ROOTOPTPKG)/bin +ROOTTST = $(ROOTOPTPKG)/tst +ROOTTSTDIR = $(ROOTTST)/$(TSTDIR) +ROOTTSTEXES = $(EXETESTS:%=$(ROOTTSTDIR)/%) +ROOTTSTSH = $(SHTESTS:%=$(ROOTTSTDIR)/%) +ROOTOUT = $(OUTFILES:%=$(ROOTTSTDIR)/%) +ROOTTESTS = $(ROOTTSTEXES) $(ROOTTSTSH) $(ROOTOUT) +FILEMODE = 0555 +LDLIBS = $(LDLIBS.cmd) +LINTEXE = $(EXETESTS:%.exe=%.exe.ln) diff --git a/usr/src/cmd/bhyve/test/Makefile.subdirs b/usr/src/cmd/bhyve/test/Makefile.subdirs new file mode 100644 index 0000000000..45f0aa67fa --- /dev/null +++ b/usr/src/cmd/bhyve/test/Makefile.subdirs @@ -0,0 +1,29 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2018 Joyent, Inc. 
+# + +.KEEP_STATE: + +all := TARGET += all +clean := TARGET += clean +clobber := TARGET += clobber +install := TARGET += install +lint := TARGET += lint + +all clean clobber install lint: $(SUBDIRS) + +$(SUBDIRS): FRC + @cd $@; pwd; $(MAKE) $(TARGET) + +FRC: diff --git a/usr/src/cmd/bhyve/test/Makefile.targ b/usr/src/cmd/bhyve/test/Makefile.targ new file mode 100644 index 0000000000..e3ec55cfdb --- /dev/null +++ b/usr/src/cmd/bhyve/test/Makefile.targ @@ -0,0 +1,55 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2018 Joyent, Inc. +# + +$(ROOTOPTPKG): + $(INS.dir) + +$(ROOTBIN): $(ROOTOPTPKG) + $(INS.dir) + +$(ROOTBIN)/%: %.ksh $(ROOTBIN) + $(INS.rename) + +$(ROOTTST): $(ROOTOPTPKG) + $(INS.dir) + +$(ROOTTSTDIR): $(ROOTTST) + $(INS.dir) + +$(ROOTTSTDIR)/%.ksh: %.ksh $(ROOTTSTDIR) + $(INS.file) + +$(ROOTTSTDIR)/%.out: %.out $(ROOTTSTDIR) + $(INS.file) + +%.exe: %.o $(SUPOBJS) + $(LINK.c) -o $@ $< $(SUPOBJS) $(LDLIBS) + $(POST_PROCESS) + +$(ROOTTSTDIR)/%.exe: %.exe $(ROOTTSTDIR) + $(INS.file) + +all: install + +%.exe.ln: %.c $(SUPOBJS) + $(LINT.c) $< $(LDLIBS) + +lint: $(LINTEXE) + +clean: + -$(RM) *.o $(CLEANFILES) + +clobber: clean + -$(RM) $(CLOBBERFILES) diff --git a/usr/src/cmd/bhyve/test/scripts/Makefile b/usr/src/cmd/bhyve/test/scripts/Makefile new file mode 100644 index 0000000000..d28a5edb8f --- /dev/null +++ b/usr/src/cmd/bhyve/test/scripts/Makefile @@ -0,0 +1,28 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2018 Joyent, Inc. +# + +include ../Makefile.com + +SRCS = bhyvetest +SCRIPTS = $(SRCS:%=$(ROOTBIN)/%) + +SCRIPTS := FILEMODE = 0555 +CLOBBERFILES = $(SCRIPTS) + +install: $(SCRIPTS) + +lint: + +include ../Makefile.targ diff --git a/usr/src/cmd/bhyve/test/scripts/bhyvetest.ksh b/usr/src/cmd/bhyve/test/scripts/bhyvetest.ksh new file mode 100644 index 0000000000..95b7743417 --- /dev/null +++ b/usr/src/cmd/bhyve/test/scripts/bhyvetest.ksh @@ -0,0 +1,231 @@ +#!/bin/ksh +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2018 Joyent, Inc. +# + +# +# bhyve test suite driver +# +unalias -a + +bt_arg0=$(basename $0) +bt_root="$(cd $(dirname $0)/..; pwd -P)" +bt_ksh="/usr/bin/ksh" +bt_outdir= +bt_keep= +bt_all= +bt_tnum=0 +bt_tfail=0 +bt_tsuc=0 + +function usage +{ + typeset msg="$*" + [[ -z "$msg" ]] || echo "$msg" 2>&1 + cat <&2 +Usage: $bt_arg0 [ -o dir ] [ -k ] [ -a | test ... 
] + + -o dir Sets 'dir' as the output directory + -a Runs all tests, ignores tests passed in + -k Keep output from all tests, not just failures + -m mdb binary to test +USAGE + exit 2 +} + +function fatal +{ + typeset msg="$*" + [[ -z "$msg" ]] && msg="failed" + echo "$bt_arg0: $msg" >&2 + exit 1 +} + +function setup_outdir +{ + bt_outdir="$bt_outdir/$bt_arg0.$$" + mkdir -p $bt_outdir || fatal "failed to make output dir $bt_outdir" +} + +function run_single +{ + typeset name=$1 + typeset expect base ext exe command odir res reason + typeset iserr + + [[ -z "$name" ]] && fail "missing test to run" + base=${name##*/} + ext=${base##*.} + expect=${base%%.*} + odir="$bt_outdir/current" + [[ -z "$ext" ]] && fatal "found test without ext: $name" + [[ -z "$expect" ]] && fatal "found test without prefix: $name" + + if [[ "$expect" == "err" || "$expect" == "ecreate" ]]; then + iserr="yup" + else + iserr="" + fi + + case "$ext" in + "ksh") + command="$bt_ksh ./$base" + ;; + "exe") + command="./$base" + ;; + "out") + # + # This is the file format for checking output against. + # + return 0 + ;; + *) + echo "skipping test $name (unknown extensino)" + return 0 + ;; + esac + + echo "Executing test $name ... \c" + mkdir -p "$odir" >/dev/null || fatal "can't make output directory" + cd $(dirname $name) || fatal "failed to enter test directory" + $command > "$odir/stdout" 2>"$odir/stderr" + res=$? + cd - > /dev/null || fatal "failed to leave test directory" + + if [[ -f "$name.out" ]] && \ + ! diff "$name.out" "$odir/stdout" >/dev/null; then + cp $name.out $odir/$base.out + reason="stdout mismatch" + elif [[ -n "$iserr" && $res -eq 0 ]]; then + reason="test exited $res, not non-zero" + elif [[ -z "$iserr" && $res -ne 0 ]]; then + reason="test exited $res, not zero" + fi + + if [[ -n "$reason" ]]; then + echo "$reason" + ((bt_tfail++)) + mv "$odir" "$bt_outdir/failure.$bt_tfail" || fatal \ + "failed to move test output directory" + cp "$name" "$bt_outdir/failure.$bt_tfail/$(basename $name)" || \ + fatal "failed to copy test into output directory" + else + echo "passed" + ((bt_tsuc++)) + mv "$odir" "$bt_outdir/success.$bt_tsuc" || fatal \ + "failed to move test directory" + fi + + ((bt_tnum++)) +} + +function run_all +{ + typeset tests t dir + + tests=$(ls -1 $bt_root/tst/*/*.@(ksh|exe)) + for t in $tests; do + run_single $t + done +} + +function welcome +{ + cat < +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "testlib.h" +#include "mevent.h" + +static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t cv = PTHREAD_COND_INITIALIZER; + +static int +get_count(void) +{ + int global = -1, change = -1, del_pending = -1; + int total; + + test_mevent_count_lists(&global, &change, &del_pending); + ASSERT_INT_NEQ(("count not set"), global, -1); + ASSERT_INT_NEQ(("count not set"), change, -1); + ASSERT_INT_NEQ(("count not set"), change, -1); + ASSERT_INT_EQ(("pending delete not processed"), del_pending, 0); + + total = global + change + del_pending; + + VERBOSE(("count = %d (%d + %d + %d)", total, global, change, + del_pending)); + + return (total); +} + +static void +not_called_cb(int fd, enum ev_type ev, void *arg) +{ + FAIL(("this callback should never be called")); +} + +static void +flush_cb(int fd, enum ev_type ev, void *arg) +{ + char buf[32]; + + /* Drain the pipe */ + while (read(fd, buf, sizeof (buf)) > 0) + ; + + pthread_mutex_lock(&mtx); + pthread_cond_signal(&cv); + pthread_mutex_unlock(&mtx); +} + +void +flush_and_wait(int fd) 
+{ + uint8_t msg = 42; + + /* + * Lock taken ahead of waking flush_cb so this thread doesn't race + * with the event thread. + */ + pthread_mutex_lock(&mtx); + if (write(fd, &msg, sizeof (msg)) != sizeof (msg)) { + FAIL(("bad write")); + } + + /* Wait for it to be read */ + pthread_cond_wait(&cv, &mtx); + pthread_mutex_unlock(&mtx); +} + +int +main(int argc, const char *argv[]) +{ + int unused_pipe[2]; + int flush_pipe[2]; + struct mevent *unused_evp, *flush_evp; + int count1, count2; + + start_test(argv[0], 5); + start_event_thread(); + + /* + * Create first pipe and related event + */ + if (pipe(unused_pipe) != 0) { + FAIL_ERRNO("pipe"); + } + VERBOSE(("unused_pipe[] = { %d, %d }", unused_pipe[0], unused_pipe[1])); + if (fcntl(unused_pipe[0], F_SETFL, O_NONBLOCK) != 0) { + FAIL_ERRNO("set pipe nonblocking"); + } + unused_evp = mevent_add(unused_pipe[0], EVF_READ, not_called_cb, NULL); + ASSERT_PTR_NEQ(("mevent_add"), unused_evp, NULL); + + /* + * Create flush pipe and related event + */ + if (pipe(flush_pipe) != 0) { + FAIL_ERRNO("pipe"); + } + VERBOSE(("flush_pipe[] = { %d, %d }", flush_pipe[0], + flush_pipe[1])); + if (fcntl(flush_pipe[0], F_SETFL, O_NONBLOCK) != 0) { + FAIL_ERRNO("set pipe nonblocking"); + } + flush_evp = mevent_add(flush_pipe[0], EVF_READ, flush_cb, NULL); + ASSERT_PTR_NEQ(("mevent_add"), flush_evp, NULL); + + /* Get count before delete. */ + flush_and_wait(flush_pipe[1]); + count1 = get_count(); + + /* + * Delete the first event and flush a read after the delete is + * complete. + */ + if (mevent_delete(unused_evp) != 0) { + FAIL_ERRNO("mevent_delete"); + } + + /* + * Verify count decreased. + */ + flush_and_wait(flush_pipe[1]); + count2 = get_count(); + if (count1 - 1 != count2) { + FAIL(("mevent_delete() did not decrease count by 1: " + "was %d, now %d", count1, count2)); + } + + PASS(); +} diff --git a/usr/src/cmd/bhyve/test/tst/mevent/mevent.c b/usr/src/cmd/bhyve/test/tst/mevent/mevent.c new file mode 100644 index 0000000000..17b6546847 --- /dev/null +++ b/usr/src/cmd/bhyve/test/tst/mevent/mevent.c @@ -0,0 +1,57 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#include "../../../mevent.c" +#include "testlib.h" + +/* + * Returns by reference the number of events on the global and change lists. + * + * Used by tests that wish to ensure that the event count changes as suggested + * by mevent_add() and mevent_delete(). Note that a delete does not immediately + * delete an event. Events that are pending delete are included in the change + * list until the next pass through the change list to process pending changes. 
+ */ +void +test_mevent_count_lists(int *ret_global, int *ret_change, int *ret_del_pending) +{ + struct mevent *mevp; + int global = 0; + int change = 0; + int del_pending = 0; + + mevent_qlock(); + + LIST_FOREACH(mevp, &global_head, me_list) { + global++; + VERBOSE(("on global: type %d fd %d state %d", mevp->me_type, + mevp->me_fd, mevp->me_state)); + } + + LIST_FOREACH(mevp, &change_head, me_list) { + change++; + if (mevp->me_state == MEV_DEL_PENDING) { + del_pending++; + } + VERBOSE(("on change: type %d fd %d state %d", mevp->me_type, + mevp->me_fd, mevp->me_state)); + } + + mevent_qunlock(); + + *ret_global = global; + *ret_change = change; + *ret_del_pending = del_pending; +} diff --git a/usr/src/cmd/bhyve/test/tst/mevent/read.disable.c b/usr/src/cmd/bhyve/test/tst/mevent/read.disable.c new file mode 100644 index 0000000000..d23b1af96c --- /dev/null +++ b/usr/src/cmd/bhyve/test/tst/mevent/read.disable.c @@ -0,0 +1,163 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +/* + * Test: read.disable + * Assertion: A read is not requeued if mevent_disable() is called while it is + * being handled. + * + * Strategy: 1. Create a pipe + * 2. Call mevent_add() to be notified of writes to the pipe. The + * callback will signal a cv. + * 3. Write to the pipe then wait for a wakeup. + * 4. From the read event callback, disable the event then awaken + * the main thread. + * 5. In the main thread, add a timer event that will awaken the + * main thread after a short delay. + * 6. Write to the pipe and wait to be awoken. The wakeup should + * come from the timer event, not the read event.
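The disable path under test is the cheap way to pause a descriptor: mevent_disable() parks the event without tearing down its registration, and mevent_enable() re-arms it, as read.pause below also exercises. A hypothetical backlog-throttling sketch against the same API (callback signature as used throughout these tests):

    #include "mevent.h"

    static struct mevent *read_evp;	/* registered via mevent_add() */

    /* hypothetical: stop reading while a backlog drains */
    static void
    backlog_hit(void)
    {
    	(void) mevent_disable(read_evp);
    }

    /* hypothetical timer callback: backlog drained, resume reads */
    static void
    backlog_drained(int id, enum ev_type ev, void *arg)
    {
    	(void) mevent_enable(read_evp);
    }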
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "testlib.h" +#include "mevent.h" + +typedef enum { + CB_NONE, + CB_READ, + CB_TIMER, +} lastwake_t; + +static lastwake_t lastwake = CB_NONE; + +static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t cv = PTHREAD_COND_INITIALIZER; + +static struct mevent *read_event; + +static void +munch(int fd, enum ev_type ev, void *arg) +{ + ssize_t nbytes; + char buf[32] = { 0 }; + int err; + + if ((nbytes = read(fd, buf, sizeof (buf))) < 0) { + FAIL_ERRNO("bad read"); + } + VERBOSE(("read %ld bytes '%s'", nbytes, buf)); + + err = mevent_disable(read_event); + ASSERT_INT_EQ(("mevent_disable: ", strerror(err)), err, 0); + + pthread_mutex_lock(&mtx); + + ASSERT_INT_EQ(("wrong lastwake"), lastwake, CB_NONE); + lastwake = CB_READ; + + pthread_cond_signal(&cv); + VERBOSE(("wakeup")); + + pthread_mutex_unlock(&mtx); +} + +static void +tick(int ms, enum ev_type ev, void *arg) +{ + pthread_mutex_lock(&mtx); + + ASSERT_INT_EQ(("wrong lastwake"), lastwake, CB_READ); + lastwake = CB_TIMER; + + pthread_cond_signal(&cv); + VERBOSE(("wakeup")); + + pthread_mutex_unlock(&mtx); +} + +int +main(int argc, const char *argv[]) +{ + int pipefds[2]; + struct mevent *timer; + ssize_t written; + char *msgs[] = { "first", "second" }; + char *msg; + + start_test(argv[0], 5); + start_event_thread(); + + if (pipe(pipefds) != 0) { + FAIL_ERRNO("pipe"); + } + if (fcntl(pipefds[0], F_SETFL, O_NONBLOCK) != 0) { + FAIL_ERRNO("set pipe nonblocking"); + } + + /* + * First write + */ + msg = msgs[0]; + read_event = mevent_add(pipefds[0], EVF_READ, munch, msg); + ASSERT_PTR_NEQ(("mevent_add pipefd"), read_event, NULL); + + pthread_mutex_lock(&mtx); + written = write(pipefds[1], msg, strlen(msg)); + if (written < 0) { + FAIL_ERRNO("bad write"); + } + ASSERT_INT64_EQ(("write '%s' failed", msg), written, strlen(msg)); + + /* + * Wait for it to be read + */ + pthread_cond_wait(&cv, &mtx); + ASSERT_INT_EQ(("wrong lastwake"), lastwake, CB_READ); + pthread_mutex_unlock(&mtx); + + /* + * Add timer, second write. + */ + msg = msgs[1]; + timer = mevent_add(50, EVF_TIMER, tick, msg); + ASSERT_PTR_NEQ(("mevent_add timer"), timer, NULL); + + pthread_mutex_lock(&mtx); + written = write(pipefds[1], msg, strlen(msg)); + if (written < 0) { + FAIL_ERRNO("bad write"); + } + ASSERT_INT64_EQ(("write '%s' failed", msg), written, strlen(msg)); + + /* + * Wait for timer to expire + */ + pthread_cond_wait(&cv, &mtx); + ASSERT_INT_EQ(("wrong lastwake"), lastwake, CB_TIMER); + pthread_mutex_unlock(&mtx); + + PASS(); +} diff --git a/usr/src/cmd/bhyve/test/tst/mevent/read.pause.c b/usr/src/cmd/bhyve/test/tst/mevent/read.pause.c new file mode 100644 index 0000000000..c877f014f6 --- /dev/null +++ b/usr/src/cmd/bhyve/test/tst/mevent/read.pause.c @@ -0,0 +1,152 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +/* + * Test: read.pause + * Assertion: mevent_disable() can be used to pause reads. + * + * Strategy: 1. Create a pipe + * 2. Call mevent_add() to be notified of writes to the pipe. 
The + * callback will signal a cv. + * 3. In a loop, write to the pipe then wait on the cv. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "testlib.h" +#include "mevent.h" + +static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t cv = PTHREAD_COND_INITIALIZER; + +static char cookie[] = "Chocolate chip with fudge stripes"; + +/* + * After this many bytes are sent, writes will get batched up, progress will be + * made on the write side via an interval timer + */ +const int pauseat = 8; + +static void +munch(int fd, enum ev_type ev, void *arg) +{ + static int i = 0; + char buf[sizeof (cookie)] = { 0 }; + ssize_t nbytes; + ssize_t expected; + + ASSERT_INT_EQ(("bad event"), ev, EVF_READ); + ASSERT_PTR_EQ(("bad cookie"), arg, cookie); + + /* + * For the first while, expect data to come a byte at a time. After the + * pause, we should get a burst with the rest of the data. + */ + if (i > pauseat) { + expected = strlen(cookie) - pauseat - 1; + } else { + expected = 1; + } + + if ((nbytes = read(fd, buf, sizeof (buf))) < 0) { + FAIL_ERRNO("bad read"); + } + VERBOSE(("read %ld bytes '%s'", nbytes, buf)); + + ASSERT_INT64_EQ(("wanted a byte of cookie"), nbytes, expected); + + if (expected == 1) { + ASSERT_CHAR_EQ(("bad byte %d of cookie", i), buf[0], cookie[i]); + } else { + ASSERT_STR_EQ(("bad last half of cookie"), buf, &cookie[i]); + } + + pthread_mutex_lock(&mtx); + pthread_cond_signal(&cv); + VERBOSE(("wakeup")); + pthread_mutex_unlock(&mtx); + + i++; +} + +static void +tick(int ms, enum ev_type ev, void *arg) +{ + pthread_mutex_lock(&mtx); + pthread_cond_signal(&cv); + VERBOSE(("wakeup")); + pthread_mutex_unlock(&mtx); +} + +int +main(int argc, const char *argv[]) +{ + int pipefds[2]; + struct mevent *evp, *timer; + ssize_t written; + + start_test(argv[0], 5); + start_event_thread(); + + if (pipe(pipefds) != 0) { + FAIL_ERRNO("pipe"); + } + if (fcntl(pipefds[0], F_SETFL, O_NONBLOCK) != 0) { + FAIL_ERRNO("set pipe nonblocking"); + } + + evp = mevent_add(pipefds[0], EVF_READ, munch, cookie); + ASSERT_PTR_NEQ(("mevent_add pipefd"), evp, NULL); + + for (int i = 0; cookie[i] != 0; i++) { + pthread_mutex_lock(&mtx); + written = write(pipefds[1], cookie + i, 1); + if (written < 0) { + FAIL_ERRNO("bad write"); + } + ASSERT_INT64_EQ(("write byte %d of cookie", i), written, 1); + + /* Wait for it to be read */ + pthread_cond_wait(&cv, &mtx); + pthread_mutex_unlock(&mtx); + + if (i == pauseat) { + timer = mevent_add(10, EVF_TIMER, tick, + &cookie[pauseat]); + ASSERT_PTR_NEQ(("mevent_add timer"), timer, NULL); + VERBOSE(("disable munch")); + mevent_disable(evp); + } + } + + pthread_mutex_lock(&mtx); + + mevent_enable(evp); + + pthread_cond_wait(&cv, &mtx); + pthread_mutex_unlock(&mtx); + + PASS(); +} diff --git a/usr/src/cmd/bhyve/test/tst/mevent/read.requeue.c b/usr/src/cmd/bhyve/test/tst/mevent/read.requeue.c new file mode 100644 index 0000000000..ddc3e27235 --- /dev/null +++ b/usr/src/cmd/bhyve/test/tst/mevent/read.requeue.c @@ -0,0 +1,108 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. 
+ */ + +/* + * Test: read.requeue + * Assertion: A sequence of writes turns into a sequence of events. + * + * Strategy: 1. Create a pipe + * 2. Call mevent_add() to be notified of writes to the pipe. The + * callback will signal a cv. + * 3. In a loop, write to the pipe then wait on the cv. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "testlib.h" +#include "mevent.h" + +static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t cv = PTHREAD_COND_INITIALIZER; + +static char *cookie = "Chocolate chip with fudge stripes"; + +static void +munch(int fd, enum ev_type ev, void *arg) +{ + static int i = 0; + char buf[8] = { 0 }; + ssize_t nbytes; + + ASSERT_INT_EQ(("bad event"), ev, EVF_READ); + ASSERT_PTR_EQ(("bad cookie"), arg, cookie); + + if ((nbytes = read(fd, buf, sizeof (buf))) < 0) { + ASSERT_INT64_EQ(("bad read: %s", strerror(errno)), nbytes, 1); + } + VERBOSE(("read %ld bytes '%s'", nbytes, buf)); + + ASSERT_INT64_EQ(("wanted a byte of cookie"), nbytes, 1); + + ASSERT_CHAR_EQ(("bad byte %d of cookie", i), buf[0], cookie[i]); + + pthread_mutex_lock(&mtx); + pthread_cond_signal(&cv); + VERBOSE(("wakeup")); + pthread_mutex_unlock(&mtx); + + i++; +} + +int +main(int argc, const char *argv[]) +{ + int pipefds[2]; + struct mevent *evp; + + start_test(argv[0], 5); + start_event_thread(); + + if (pipe(pipefds) != 0) { + FAIL_ERRNO("pipe"); + } + if (fcntl(pipefds[0], F_SETFL, O_NONBLOCK) != 0) { + FAIL_ERRNO("set pipe nonblocking"); + } + + evp = mevent_add(pipefds[0], EVF_READ, munch, cookie); + ASSERT_PTR_NEQ(("mevent_add"), evp, NULL); + + for (int i = 0; cookie[i] != '\0'; i++) { + ssize_t written; + + pthread_mutex_lock(&mtx); + written = write(pipefds[1], cookie + i, 1); + if (written < 0) { + FAIL_ERRNO("bad write"); + } + ASSERT_INT64_EQ(("write byte %d of cookie", i), written, 1); + + /* Wait for it to be read */ + pthread_cond_wait(&cv, &mtx); + pthread_mutex_unlock(&mtx); + } + + PASS(); +} diff --git a/usr/src/cmd/bhyve/test/tst/mevent/testlib.c b/usr/src/cmd/bhyve/test/tst/mevent/testlib.c new file mode 100644 index 0000000000..67261b9a31 --- /dev/null +++ b/usr/src/cmd/bhyve/test/tst/mevent/testlib.c @@ -0,0 +1,70 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. 
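The handshake shared by all of these tests is worth calling out: the main thread holds the mutex across both the stimulus and the wait, so a wakeup signalled by the event thread cannot be lost between the write and pthread_cond_wait(). A condensed sketch, with hypothetical helper names, assuming only POSIX threads:

#include <pthread.h>
#include <unistd.h>

static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;

/* Event-thread side: signal under the mutex. */
static void
wake_main(void)
{
	pthread_mutex_lock(&mtx);
	pthread_cond_signal(&cv);
	pthread_mutex_unlock(&mtx);
}

/*
 * Main-thread side: take the mutex before the write so the callback's
 * signal cannot slip in between the stimulus and pthread_cond_wait().
 */
static void
stimulate_and_wait(int fd, const void *buf, size_t len)
{
	pthread_mutex_lock(&mtx);
	(void) write(fd, buf, len);
	pthread_cond_wait(&cv, &mtx);
	pthread_mutex_unlock(&mtx);
}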
+ */ + +#include +#include +#include +#include + +#include "testlib.h" +#include "mevent.h" + +const char *testlib_prog; +boolean_t testlib_verbose; + +static void +timed_out(int signo) +{ + ASSERT_INT_EQ(("timeout signal"), signo, SIGALRM); + + FAIL(("Timed out")); +} + +void +start_test(const char *argv0, uint32_t timeout) +{ + char *val; + + testlib_prog = strrchr(argv0, '/'); + if (testlib_prog == NULL) { + testlib_prog = argv0; + } else { + testlib_prog++; + } + + testlib_verbose = ((val = getenv("TEST_VERBOSE")) != NULL) && + val[0] != '\0'; + + signal(SIGALRM, timed_out); + alarm(timeout); +} + +/* ARGSUSED */ +static void * +event_thread(void *arg) +{ + mevent_dispatch(); + return (NULL); +} + +void +start_event_thread(void) +{ + pthread_t tid; + + if (pthread_create(&tid, NULL, event_thread, NULL) != 0) { + FAIL_ERRNO("pthread_create"); + } +} diff --git a/usr/src/cmd/bhyve/test/tst/mevent/testlib.h b/usr/src/cmd/bhyve/test/tst/mevent/testlib.h new file mode 100644 index 0000000000..7e5ca2e9c9 --- /dev/null +++ b/usr/src/cmd/bhyve/test/tst/mevent/testlib.h @@ -0,0 +1,93 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#ifndef _TESTLIB_H_ +#define _TESTLIB_H_ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "mevent.h" + +#define EXIT_PASS 0 +#define EXIT_FAIL 1 + +#define VERBOSE(msg) \ + if (testlib_verbose) { \ + (void) printf("VERBOSE %s: %s:%d %s: ", testlib_prog, \ + __FILE__, __LINE__, __func__); \ + (void) printf msg; \ + (void) printf("\n"); \ + } + +#define FAIL_PROLOGUE() \ + (void) printf("FAIL %s: %s:%d: ", testlib_prog, __FILE__, __LINE__) + +#define FAIL(msg) \ + { \ + FAIL_PROLOGUE(); \ + (void) printf msg; \ + (void) printf("\n"); \ + exit(EXIT_FAIL); \ + } + +#define FAIL_ERRNO(msg) FAIL((msg ": %s", strerror(errno))) + +#define PASS() \ + { \ + (void) printf("PASS %s\n", testlib_prog); \ + exit(EXIT_PASS); \ + } + +#define ASSERT_CMP(msg, got, cmp, exp, nfmt) \ + if (!(got cmp exp)) { \ + FAIL_PROLOGUE(); \ + (void) printf msg; \ + (void) printf(": %s=" nfmt " %s %s=" nfmt "\n", \ + #got, got, #cmp, #exp, exp); \ + exit(EXIT_FAIL); \ + } + +#define ASSERT_CHAR_EQ(msg, got, exp) ASSERT_CMP(msg, got, ==, exp, "%c") +#define ASSERT_INT_EQ(msg, got, exp) ASSERT_CMP(msg, got, ==, exp, "%d") +#define ASSERT_INT_NEQ(msg, got, exp) ASSERT_CMP(msg, got, !=, exp, "%d") +#define ASSERT_INT64_EQ(msg, got, exp) ASSERT_CMP(msg, got, ==, exp, "%ld") +#define ASSERT_PTR_EQ(msg, got, exp) ASSERT_CMP(msg, got, ==, exp, "%p") +#define ASSERT_PTR_NEQ(msg, got, exp) ASSERT_CMP(msg, got, !=, exp, "%p") + +#define ASSERT_STR_EQ(msg, got, exp) \ + if (strcmp(got, exp) != 0) { \ + FAIL_PROLOGUE(); \ + (void) printf msg; \ + (void) printf(": %s='%s' != %s='%s'\n", \ + #got, got, #exp, exp); \ + exit(EXIT_FAIL); \ + } + +extern const char *testlib_prog; +extern boolean_t testlib_verbose; + +extern void start_test(const char *, uint32_t); +extern void start_event_thread(void); +extern void test_mevent_count_lists(int *, int *, int *); + +#endif /* _TESTLIB_H_ */ diff --git a/usr/src/cmd/bhyve/uart_emul.c 
b/usr/src/cmd/bhyve/uart_emul.c index a8b5d40356..c0fff61d00 100644 --- a/usr/src/cmd/bhyve/uart_emul.c +++ b/usr/src/cmd/bhyve/uart_emul.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 NetApp, Inc. * Copyright (c) 2013 Neel Natu * All rights reserved. @@ -24,7 +26,8 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/uart_emul.c 257293 2013-10-29 00:18:11Z neel $ + * $FreeBSD$ + * */ /* * This file and its contents are supplied under the terms of the @@ -37,46 +40,42 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2015 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. */ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/uart_emul.c 257293 2013-10-29 00:18:11Z neel $"); +__FBSDID("$FreeBSD$"); #include #include - -#ifndef __FreeBSD__ -#include -#include +#ifndef WITHOUT_CAPSICUM +#include +#include #endif + #include #include #include +#include +#include +#include #include #include #include #include #include +#include #ifndef __FreeBSD__ -#include -#include -#include +#include #endif -#ifndef __FreeBSD__ -#include - -#include "bhyverun.h" -#endif -#ifdef __FreeBSD__ #include "mevent.h" -#endif #include "uart_emul.h" #define COM1_BASE 0x3F8 #define COM1_IRQ 4 -#define COM2_BASE 0x2F8 -#define COM2_IRQ 3 +#define COM2_BASE 0x2F8 +#define COM2_IRQ 3 #define DEFAULT_RCLK 1843200 #define DEFAULT_BAUD 9600 @@ -89,15 +88,13 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/uart_emul.c 257293 2013-10-29 00:18:11Z #define MSR_DELTA_MASK 0x0f #ifndef REG_SCR -#define REG_SCR com_scr +#define REG_SCR com_scr #endif #define FIFOSZ 16 static bool uart_stdio; /* stdio in use for i/o */ -#ifndef __FreeBSD__ -static bool uart_bcons; /* bhyveconsole in use for i/o */ -#endif +static struct termios tio_stdio_orig; static struct { int baseaddr; @@ -118,9 +115,15 @@ struct fifo { int size; /* size of the fifo */ }; +struct ttyfd { + bool opened; + int rfd; /* fd for reading */ + int wfd; /* fd for writing, may be == rfd */ +}; + struct uart_softc { pthread_mutex_t mtx; /* protects all softc elements */ - uint8_t data; /* Data register (R/W) */ + uint8_t data; /* Data register (R/W) */ uint8_t ier; /* Interrupt enable register (R/W) */ uint8_t lcr; /* Line control register (R/W) */ uint8_t mcr; /* Modem control register (R/W) */ @@ -133,16 +136,16 @@ struct uart_softc { uint8_t dlh; /* Baudrate divisor latch MSB */ struct fifo rxfifo; + struct mevent *mev; - bool opened; - bool stdio; + struct ttyfd tty; #ifndef __FreeBSD__ - bool bcons; + bool sock; struct { - pid_t clipid; int clifd; /* console client unix domain socket */ int servfd; /* console server unix domain socket */ - } usc_bcons; + struct mevent *servmev; /* mevent for server socket */ + } usc_sock; #endif bool thre_int_pending; /* THRE interrupt pending */ @@ -152,140 +155,222 @@ struct uart_softc { uart_intr_func_t intr_deassert; }; -#ifdef __FreeBSD__ static void uart_drain(int fd, enum ev_type ev, void *arg); -#else -static void uart_tty_drain(struct uart_softc *sc); -static int uart_bcons_drain(struct uart_softc *sc); -#endif - -static struct termios tio_orig, tio_new; /* I/O Terminals */ static void ttyclose(void) { - tcsetattr(STDIN_FILENO, TCSANOW, &tio_orig); + tcsetattr(STDIN_FILENO, TCSANOW, &tio_stdio_orig); } static void -ttyopen(void) +ttyopen(struct ttyfd *tf) { - - tcgetattr(STDIN_FILENO, &tio_orig); - - tio_new = tio_orig; - cfmakeraw(&tio_new); - tcsetattr(STDIN_FILENO, TCSANOW, &tio_new); - - 
atexit(ttyclose); -} - -static bool -tty_char_available(void) -{ - fd_set rfds; - struct timeval tv; - - FD_ZERO(&rfds); - FD_SET(STDIN_FILENO, &rfds); - tv.tv_sec = 0; - tv.tv_usec = 0; - if (select(STDIN_FILENO + 1, &rfds, NULL, NULL, &tv) > 0 ) { - return (true); - } else { - return (false); + struct termios orig, new; + + tcgetattr(tf->rfd, &orig); + new = orig; + cfmakeraw(&new); + new.c_cflag |= CLOCAL; + tcsetattr(tf->rfd, TCSANOW, &new); + if (uart_stdio) { + tio_stdio_orig = orig; + atexit(ttyclose); } } static int -ttyread(void) +ttyread(struct ttyfd *tf) { - char rb; + unsigned char rb; - if (tty_char_available()) { - read(STDIN_FILENO, &rb, 1); - return (rb & 0xff); - } else { + if (read(tf->rfd, &rb, 1) == 1) + return (rb); + else return (-1); - } } static void -ttywrite(unsigned char wb) +ttywrite(struct ttyfd *tf, unsigned char wb) { - (void)write(STDIN_FILENO, &wb, 1); + (void)write(tf->wfd, &wb, 1); } #ifndef __FreeBSD__ static void -bconswrite(struct uart_softc *sc, unsigned char wb) +sockwrite(struct uart_softc *sc, unsigned char wb) { - (void) write(sc->usc_bcons.clifd, &wb, 1); + (void) write(sc->usc_sock.clifd, &wb, 1); } #endif static void -fifo_reset(struct fifo *fifo, int size) +rxfifo_reset(struct uart_softc *sc, int size) { + char flushbuf[32]; + struct fifo *fifo; + ssize_t nread; + int error; + + fifo = &sc->rxfifo; bzero(fifo, sizeof(struct fifo)); fifo->size = size; + + if (sc->tty.opened) { + /* + * Flush any unread input from the tty buffer. + */ + while (1) { + nread = read(sc->tty.rfd, flushbuf, sizeof(flushbuf)); + if (nread != sizeof(flushbuf)) + break; + } + + /* + * Enable mevent to trigger when new characters are available + * on the tty fd. + */ + error = mevent_enable(sc->mev); + assert(error == 0); + } +#ifndef __FreeBSD__ + if (sc->sock && sc->usc_sock.clifd != -1) { + /* Flush any unread input from the socket buffer. */ + do { + nread = read(sc->usc_sock.clifd, flushbuf, + sizeof (flushbuf)); + } while (nread == sizeof (flushbuf)); + + /* Enable mevent to trigger when new data available on sock */ + error = mevent_enable(sc->mev); + assert(error == 0); + } +#endif /* __FreeBSD__ */ +} + +static int +rxfifo_available(struct uart_softc *sc) +{ + struct fifo *fifo; + + fifo = &sc->rxfifo; + return (fifo->num < fifo->size); } static int -fifo_putchar(struct fifo *fifo, uint8_t ch) +rxfifo_putchar(struct uart_softc *sc, uint8_t ch) { + struct fifo *fifo; + int error; + + fifo = &sc->rxfifo; if (fifo->num < fifo->size) { fifo->buf[fifo->windex] = ch; fifo->windex = (fifo->windex + 1) % fifo->size; fifo->num++; + if (!rxfifo_available(sc)) { + if (sc->tty.opened) { + /* + * Disable mevent callback if the FIFO is full. + */ + error = mevent_disable(sc->mev); + assert(error == 0); + } +#ifndef __FreeBSD__ + if (sc->sock && sc->usc_sock.clifd != -1) { + /* + * Disable mevent callback if the FIFO is full. 
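The backpressure scheme introduced here is symmetric: filling the receive FIFO disables the read event, and draining a full FIFO re-enables it, so the event loop never polls an fd whose data has nowhere to go. A condensed sketch of that pairing, with a hypothetical rxq type standing in for the softc fields used above:

#include "mevent.h"

struct rxq {
	struct mevent	*mev;	/* read event on the backing fd */
	int		num;	/* bytes currently queued */
	int		size;	/* queue capacity */
};

static void
rxq_put(struct rxq *q)
{
	if (++q->num == q->size) {
		/* Queue is now full: stop polling the fd until drained. */
		(void) mevent_disable(q->mev);
	}
}

static void
rxq_get(struct rxq *q)
{
	if (q->num-- == q->size) {
		/* Queue was full: resume polling for input. */
		(void) mevent_enable(q->mev);
	}
}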
+ */ + error = mevent_disable(sc->mev); + assert(error == 0); + } +#endif /* __FreeBSD__ */ + } return (0); } else return (-1); } static int -fifo_getchar(struct fifo *fifo) +rxfifo_getchar(struct uart_softc *sc) { - int c; + struct fifo *fifo; + int c, error, wasfull; + wasfull = 0; + fifo = &sc->rxfifo; if (fifo->num > 0) { + if (!rxfifo_available(sc)) + wasfull = 1; c = fifo->buf[fifo->rindex]; fifo->rindex = (fifo->rindex + 1) % fifo->size; fifo->num--; + if (wasfull) { + if (sc->tty.opened) { + error = mevent_enable(sc->mev); + assert(error == 0); + } +#ifndef __FreeBSD__ + if (sc->sock && sc->usc_sock.clifd != -1) { + error = mevent_enable(sc->mev); + assert(error == 0); + } +#endif /* __FreeBSD__ */ + } return (c); } else return (-1); } static int -fifo_numchars(struct fifo *fifo) +rxfifo_numchars(struct uart_softc *sc) { + struct fifo *fifo = &sc->rxfifo; return (fifo->num); } -static int -fifo_available(struct fifo *fifo) +static void +uart_opentty(struct uart_softc *sc) { - return (fifo->num < fifo->size); + ttyopen(&sc->tty); + sc->mev = mevent_add(sc->tty.rfd, EVF_READ, uart_drain, sc); + assert(sc->mev != NULL); } -static void -uart_opentty(struct uart_softc *sc) +static uint8_t +modem_status(uint8_t mcr) { - struct mevent *mev; + uint8_t msr; - assert(!sc->opened && sc->stdio); + if (mcr & MCR_LOOPBACK) { + /* + * In the loopback mode certain bits from the MCR are + * reflected back into MSR. + */ + msr = 0; + if (mcr & MCR_RTS) + msr |= MSR_CTS; + if (mcr & MCR_DTR) + msr |= MSR_DSR; + if (mcr & MCR_OUT1) + msr |= MSR_RI; + if (mcr & MCR_OUT2) + msr |= MSR_DCD; + } else { + /* + * Always assert DCD and DSR so tty open doesn't block + * even if CLOCAL is turned off. + */ + msr = MSR_DCD | MSR_DSR; + } + assert((msr & MSR_DELTA_MASK) == 0); - ttyopen(); -#ifdef __FreeBSD__ - mev = mevent_add(STDIN_FILENO, EVF_READ, uart_drain, sc); -#endif - assert(mev); + return (msr); } /* @@ -302,7 +387,7 @@ uart_intr_reason(struct uart_softc *sc) if ((sc->lsr & LSR_OE) != 0 && (sc->ier & IER_ERLS) != 0) return (IIR_RLS); - else if (fifo_numchars(&sc->rxfifo) > 0 && (sc->ier & IER_ERXRDY) != 0) + else if (rxfifo_numchars(sc) > 0 && (sc->ier & IER_ERXRDY) != 0) return (IIR_RXTOUT); else if (sc->thre_int_pending && (sc->ier & IER_ETXRDY) != 0) return (IIR_TXRDY); @@ -319,9 +404,14 @@ uart_reset(struct uart_softc *sc) divisor = DEFAULT_RCLK / DEFAULT_BAUD / 16; sc->dll = divisor; +#ifndef __FreeBSD__ + sc->dlh = 0; +#else sc->dlh = divisor >> 16; +#endif + sc->msr = modem_status(sc->mcr); - fifo_reset(&sc->rxfifo, 1); /* no fifo until enabled by software */ + rxfifo_reset(sc, 1); /* no fifo until enabled by software */ } /* @@ -341,7 +431,6 @@ uart_toggle_intr(struct uart_softc *sc) (*sc->intr_assert)(sc->arg); } -#ifdef __FreeBSD__ static void uart_drain(int fd, enum ev_type ev, void *arg) { @@ -350,7 +439,7 @@ uart_drain(int fd, enum ev_type ev, void *arg) sc = arg; - assert(fd == STDIN_FILENO); + assert(fd == sc->tty.rfd); assert(ev == EVF_READ); /* @@ -361,35 +450,11 @@ uart_drain(int fd, enum ev_type ev, void *arg) pthread_mutex_lock(&sc->mtx); if ((sc->mcr & MCR_LOOPBACK) != 0) { - (void) ttyread(); - } else { - while (fifo_available(&sc->rxfifo) && - ((ch = ttyread()) != -1)) { - fifo_putchar(&sc->rxfifo, ch); - } - uart_toggle_intr(sc); - } - - pthread_mutex_unlock(&sc->mtx); -} -#else -static void -uart_tty_drain(struct uart_softc *sc) -{ - int ch; - - /* - * Take the softc lock to protect against concurrent - * access from a vCPU i/o exit - */ - pthread_mutex_lock(&sc->mtx); - - if 
((sc->mcr & MCR_LOOPBACK) != 0) { - (void) ttyread(); + (void) ttyread(&sc->tty); } else { - while (fifo_available(&sc->rxfifo) && - ((ch = ttyread()) != -1)) { - fifo_putchar(&sc->rxfifo, ch); + while (rxfifo_available(sc) && + ((ch = ttyread(&sc->tty)) != -1)) { + rxfifo_putchar(sc, ch); } uart_toggle_intr(sc); } @@ -397,50 +462,6 @@ uart_tty_drain(struct uart_softc *sc) pthread_mutex_unlock(&sc->mtx); } -static int -uart_bcons_drain(struct uart_softc *sc) -{ - char ch; - int nbytes; - int ret = 0; - - /* - * Take the softc lock to protect against concurrent - * access from a vCPU i/o exit - */ - pthread_mutex_lock(&sc->mtx); - - if ((sc->mcr & MCR_LOOPBACK) != 0) { - (void) read(sc->usc_bcons.clifd, &ch, 1); - } else { - for (;;) { - nbytes = read(sc->usc_bcons.clifd, &ch, 1); - if (nbytes == 0) { - ret = 1; - break; - } - if (nbytes == -1 && - errno != EINTR && errno != EAGAIN) { - ret = -1; - break; - } - if (nbytes == -1) { - break; - } - - if (fifo_available(&sc->rxfifo)) { - fifo_putchar(&sc->rxfifo, ch); - } - } - uart_toggle_intr(sc); - } - - pthread_mutex_unlock(&sc->mtx); - - return (ret); -} -#endif - void uart_write(struct uart_softc *sc, int offset, uint8_t value) { @@ -449,12 +470,6 @@ uart_write(struct uart_softc *sc, int offset, uint8_t value) pthread_mutex_lock(&sc->mtx); - /* Open terminal */ - if (!sc->opened && sc->stdio) { - uart_opentty(sc); - sc->opened = true; - } - /* * Take care of the special case DLAB accesses first */ @@ -473,108 +488,96 @@ uart_write(struct uart_softc *sc, int offset, uint8_t value) switch (offset) { case REG_DATA: if (sc->mcr & MCR_LOOPBACK) { - if (fifo_putchar(&sc->rxfifo, value) != 0) + if (rxfifo_putchar(sc, value) != 0) sc->lsr |= LSR_OE; - } else if (sc->stdio) { - ttywrite(value); + } else if (sc->tty.opened) { + ttywrite(&sc->tty, value); #ifndef __FreeBSD__ - } else if (sc->bcons) { - bconswrite(sc, value); + } else if (sc->sock) { + sockwrite(sc, value); #endif } /* else drop on floor */ sc->thre_int_pending = true; break; case REG_IER: + /* Set pending when IER_ETXRDY is raised (edge-triggered). */ + if ((sc->ier & IER_ETXRDY) == 0 && (value & IER_ETXRDY) != 0) + sc->thre_int_pending = true; /* * Apply mask so that bits 4-7 are 0 * Also enables bits 0-3 only if they're 1 */ sc->ier = value & 0x0F; break; - case REG_FCR: - /* - * When moving from FIFO and 16450 mode and vice versa, - * the FIFO contents are reset. - */ - if ((sc->fcr & FCR_ENABLE) ^ (value & FCR_ENABLE)) { - fifosz = (value & FCR_ENABLE) ? FIFOSZ : 1; - fifo_reset(&sc->rxfifo, fifosz); - } + case REG_FCR: + /* + * When moving from FIFO and 16450 mode and vice versa, + * the FIFO contents are reset. + */ + if ((sc->fcr & FCR_ENABLE) ^ (value & FCR_ENABLE)) { + fifosz = (value & FCR_ENABLE) ? FIFOSZ : 1; + rxfifo_reset(sc, fifosz); + } - /* - * The FCR_ENABLE bit must be '1' for the programming - * of other FCR bits to be effective. 
- */ - if ((value & FCR_ENABLE) == 0) { - sc->fcr = 0; - } else { - if ((value & FCR_RCV_RST) != 0) - fifo_reset(&sc->rxfifo, FIFOSZ); - - sc->fcr = value & - (FCR_ENABLE | FCR_DMA | FCR_RX_MASK); - } - break; - case REG_LCR: - sc->lcr = value; - break; - case REG_MCR: - /* Apply mask so that bits 5-7 are 0 */ - sc->mcr = value & 0x1F; - - msr = 0; - if (sc->mcr & MCR_LOOPBACK) { - /* - * In the loopback mode certain bits from the - * MCR are reflected back into MSR - */ - if (sc->mcr & MCR_RTS) - msr |= MSR_CTS; - if (sc->mcr & MCR_DTR) - msr |= MSR_DSR; - if (sc->mcr & MCR_OUT1) - msr |= MSR_RI; - if (sc->mcr & MCR_OUT2) - msr |= MSR_DCD; - } + /* + * The FCR_ENABLE bit must be '1' for the programming + * of other FCR bits to be effective. + */ + if ((value & FCR_ENABLE) == 0) { + sc->fcr = 0; + } else { + if ((value & FCR_RCV_RST) != 0) + rxfifo_reset(sc, FIFOSZ); + + sc->fcr = value & + (FCR_ENABLE | FCR_DMA | FCR_RX_MASK); + } + break; + case REG_LCR: + sc->lcr = value; + break; + case REG_MCR: + /* Apply mask so that bits 5-7 are 0 */ + sc->mcr = value & 0x1F; + msr = modem_status(sc->mcr); - /* - * Detect if there has been any change between the - * previous and the new value of MSR. If there is - * then assert the appropriate MSR delta bit. - */ - if ((msr & MSR_CTS) ^ (sc->msr & MSR_CTS)) - sc->msr |= MSR_DCTS; - if ((msr & MSR_DSR) ^ (sc->msr & MSR_DSR)) - sc->msr |= MSR_DDSR; - if ((msr & MSR_DCD) ^ (sc->msr & MSR_DCD)) - sc->msr |= MSR_DDCD; - if ((sc->msr & MSR_RI) != 0 && (msr & MSR_RI) == 0) - sc->msr |= MSR_TERI; - - /* - * Update the value of MSR while retaining the delta - * bits. - */ - sc->msr &= MSR_DELTA_MASK; - sc->msr |= msr; - break; - case REG_LSR: - /* - * Line status register is not meant to be written to - * during normal operation. - */ - break; - case REG_MSR: - /* - * As far as I can tell MSR is a read-only register. - */ - break; - case REG_SCR: - sc->scr = value; - break; - default: - break; + /* + * Detect if there has been any change between the + * previous and the new value of MSR. If there is + * then assert the appropriate MSR delta bit. + */ + if ((msr & MSR_CTS) ^ (sc->msr & MSR_CTS)) + sc->msr |= MSR_DCTS; + if ((msr & MSR_DSR) ^ (sc->msr & MSR_DSR)) + sc->msr |= MSR_DDSR; + if ((msr & MSR_DCD) ^ (sc->msr & MSR_DCD)) + sc->msr |= MSR_DDCD; + if ((sc->msr & MSR_RI) != 0 && (msr & MSR_RI) == 0) + sc->msr |= MSR_TERI; + + /* + * Update the value of MSR while retaining the delta + * bits. + */ + sc->msr &= MSR_DELTA_MASK; + sc->msr |= msr; + break; + case REG_LSR: + /* + * Line status register is not meant to be written to + * during normal operation. + */ + break; + case REG_MSR: + /* + * As far as I can tell MSR is a read-only register. 
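The MSR delta handling above follows the conventional 16550 layout, in which bits 0-3 latch changes to the status bits 4-7, and TERI fires only on a trailing edge of RI. A self-contained sketch of the same computation; msr_delta is a hypothetical helper, and the bit values are the standard 16550 ones:

#include <stdint.h>

/* Conventional 16550 MSR bit layout. */
#define	MSR_DCTS	0x01	/* delta CTS */
#define	MSR_DDSR	0x02	/* delta DSR */
#define	MSR_TERI	0x04	/* trailing edge of RI */
#define	MSR_DDCD	0x08	/* delta DCD */
#define	MSR_CTS		0x10
#define	MSR_DSR		0x20
#define	MSR_RI		0x40
#define	MSR_DCD		0x80

static uint8_t
msr_delta(uint8_t oldmsr, uint8_t newmsr)
{
	uint8_t delta = 0;

	if ((oldmsr ^ newmsr) & MSR_CTS)
		delta |= MSR_DCTS;
	if ((oldmsr ^ newmsr) & MSR_DSR)
		delta |= MSR_DDSR;
	if ((oldmsr ^ newmsr) & MSR_DCD)
		delta |= MSR_DDCD;
	/* TERI latches only a 1 -> 0 transition of RI. */
	if ((oldmsr & MSR_RI) != 0 && (newmsr & MSR_RI) == 0)
		delta |= MSR_TERI;

	return (delta);
}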
+ */ + break; + case REG_SCR: + sc->scr = value; + break; + default: + break; } done: @@ -589,12 +592,6 @@ uart_read(struct uart_softc *sc, int offset) pthread_mutex_lock(&sc->mtx); - /* Open terminal */ - if (!sc->opened && sc->stdio) { - uart_opentty(sc); - sc->opened = true; - } - /* * Take care of the special case DLAB accesses first */ @@ -612,7 +609,7 @@ uart_read(struct uart_softc *sc, int offset) switch (offset) { case REG_DATA: - reg = fifo_getchar(&sc->rxfifo); + reg = rxfifo_getchar(sc); break; case REG_IER: reg = sc->ier; @@ -643,7 +640,7 @@ uart_read(struct uart_softc *sc, int offset) sc->lsr |= LSR_TEMT | LSR_THRE; /* Check for new receive data */ - if (fifo_numchars(&sc->rxfifo) > 0) + if (rxfifo_numchars(sc) > 0) sc->lsr |= LSR_RXRDY; else sc->lsr &= ~LSR_RXRDY; @@ -676,277 +673,123 @@ done: } #ifndef __FreeBSD__ -static void * -uart_tty_thread(void *param) -{ - struct uart_softc *sc = param; - pollfd_t pollset; - - pollset.fd = STDIN_FILENO; - pollset.events = POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND; - - for (;;) { - if (poll(&pollset, 1, -1) < 0) { - if (errno != EINTR) { - perror("poll failed"); - break; - } - continue; - } - uart_tty_drain(sc); - } - - return (NULL); -} - -/* - * Read the "ident" string from the client's descriptor; this routine also - * tolerates being called with pid=NULL, for times when you want to "eat" - * the ident string from a client without saving it. - */ -static int -get_client_ident(int clifd, pid_t *pid) +static void +uart_sock_drain(int fd, enum ev_type ev, void *arg) { - char buf[BUFSIZ], *bufp; - size_t buflen = sizeof (buf); - char c = '\0'; - int i = 0, r; - - /* "eat up the ident string" case, for simplicity */ - if (pid == NULL) { - while (read(clifd, &c, 1) == 1) { - if (c == '\n') - return (0); - } - } - - bzero(buf, sizeof (buf)); - while ((buflen > 1) && (r = read(clifd, &c, 1)) == 1) { - buflen--; - if (c == '\n') - break; - - buf[i] = c; - i++; - } - if (r == -1) - return (-1); - - /* - * We've filled the buffer, but still haven't seen \n. Keep eating - * until we find it; we don't expect this to happen, but this is - * defensive. 
- */ - if (c != '\n') { - while ((r = read(clifd, &c, sizeof (c))) > 0) - if (c == '\n') - break; - } + struct uart_softc *sc = arg; + char ch; /* - * Parse buffer for message of the form: IDENT + * Take the softc lock to protect against concurrent + * access from a vCPU i/o exit */ - bufp = buf; - if (strncmp(bufp, "IDENT ", 6) != 0) - return (-1); - bufp += 6; - errno = 0; - *pid = strtoll(bufp, &bufp, 10); - if (errno != 0) - return (-1); + pthread_mutex_lock(&sc->mtx); - return (0); -} + if ((sc->mcr & MCR_LOOPBACK) != 0) { + (void) read(sc->usc_sock.clifd, &ch, 1); + } else { + bool err_close = false; -static int -uart_bcons_accept_client(struct uart_softc *sc) -{ - int connfd; - struct sockaddr_un cliaddr; - socklen_t clilen; - pid_t pid; - - clilen = sizeof (cliaddr); - connfd = accept(sc->usc_bcons.servfd, - (struct sockaddr *)&cliaddr, &clilen); - if (connfd == -1) - return (-1); - if (get_client_ident(connfd, &pid) == -1) { - (void) shutdown(connfd, SHUT_RDWR); - (void) close(connfd); - return (-1); - } + while (rxfifo_available(sc)) { + int res; - if (fcntl(connfd, F_SETFL, O_NONBLOCK) < 0) { - (void) shutdown(connfd, SHUT_RDWR); - (void) close(connfd); - return (-1); - } - (void) write(connfd, "OK\n", 3); + res = read(sc->usc_sock.clifd, &ch, 1); + if (res == 0) { + err_close = true; + break; + } else if (res == -1) { + if (errno != EAGAIN && errno != EINTR) { + err_close = true; + } + break; + } - sc->usc_bcons.clipid = pid; - sc->usc_bcons.clifd = connfd; + rxfifo_putchar(sc, ch); + } + uart_toggle_intr(sc); - printf("Connection from process ID %lu.\n", pid); + if (err_close) { + (void) fprintf(stderr, "uart: closing client conn\n"); + (void) shutdown(sc->usc_sock.clifd, SHUT_RDWR); + mevent_delete_close(sc->mev); + sc->mev = NULL; + sc->usc_sock.clifd = -1; + } + } - return (0); + pthread_mutex_unlock(&sc->mtx); } static void -uart_bcons_reject_client(struct uart_softc *sc) +uart_sock_accept(int fd, enum ev_type ev, void *arg) { + struct uart_softc *sc = arg; int connfd; - struct sockaddr_un cliaddr; - socklen_t clilen; - char nak[MAXPATHLEN]; - clilen = sizeof (cliaddr); - connfd = accept(sc->usc_bcons.servfd, - (struct sockaddr *)&cliaddr, &clilen); + connfd = accept(sc->usc_sock.servfd, NULL, NULL); + if (connfd == -1) { + return; + } /* - * After hear its ident string, tell client to get lost. + * Do client connection management under protection of the softc lock + * to avoid racing with concurrent UART events. 
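uart_sock_drain() above distinguishes three read outcomes on the non-blocking client fd: a byte was returned, the socket is merely empty for now (EAGAIN or EINTR), or the peer went away. A condensed sketch of that classification, with hypothetical names:

#include <sys/types.h>
#include <errno.h>
#include <unistd.h>

typedef enum {
	DRAIN_BYTE,	/* got a byte; keep draining */
	DRAIN_EMPTY,	/* nothing buffered right now */
	DRAIN_CLOSE	/* EOF or hard error; drop the connection */
} drain_res_t;

static drain_res_t
drain_one(int fd, char *chp)
{
	ssize_t res = read(fd, chp, 1);

	if (res == 1)
		return (DRAIN_BYTE);
	if (res == -1 && (errno == EAGAIN || errno == EINTR))
		return (DRAIN_EMPTY);
	return (DRAIN_CLOSE);
}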
*/ - if (get_client_ident(connfd, NULL) == 0) { - (void) snprintf(nak, sizeof (nak), "%lu\n", - sc->usc_bcons.clipid); - (void) write(connfd, nak, strlen(nak)); - } - (void) shutdown(connfd, SHUT_RDWR); - (void) close(connfd); -} - -static int -uart_bcons_client_event(struct uart_softc *sc) -{ - int res; - - res = uart_bcons_drain(sc); - if (res < 0) - return (-1); - - if (res > 0) { - fprintf(stderr, "Closing connection with bhyve console\n"); - (void) shutdown(sc->usc_bcons.clifd, SHUT_RDWR); - (void) close(sc->usc_bcons.clifd); - sc->usc_bcons.clifd = -1; - } - - return (0); -} - -static void -uart_bcons_server_event(struct uart_softc *sc) -{ - int clifd; + pthread_mutex_lock(&sc->mtx); - if (sc->usc_bcons.clifd != -1) { + if (sc->usc_sock.clifd != -1) { /* we're already handling a client */ - uart_bcons_reject_client(sc); - return; - } - - if (uart_bcons_accept_client(sc) == 0) { - pthread_mutex_lock(&bcons_wait_lock); - bcons_connected = B_TRUE; - pthread_cond_signal(&bcons_wait_done); - pthread_mutex_unlock(&bcons_wait_lock); - } -} - -static void * -uart_bcons_thread(void *param) -{ - struct uart_softc *sc = param; - struct pollfd pollfds[2]; - int res; - - /* read from client and write to vm */ - pollfds[0].events = POLLIN | POLLRDNORM | POLLRDBAND | - POLLPRI | POLLERR | POLLHUP; - - /* the server socket; watch for events (new connections) */ - pollfds[1].events = pollfds[0].events; - - for (;;) { - pollfds[0].fd = sc->usc_bcons.clifd; - pollfds[1].fd = sc->usc_bcons.servfd; - pollfds[0].revents = pollfds[1].revents = 0; - - res = poll(pollfds, - sizeof (pollfds) / sizeof (struct pollfd), -1); - - if (res == -1 && errno != EINTR) { - perror("poll failed"); - /* we are hosed, close connection */ - break; - } - - /* event from client side */ - if (pollfds[0].revents) { - if (pollfds[0].revents & - (POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI)) { - if (uart_bcons_client_event(sc) < 0) - break; - } else { - break; - } - } - - /* event from server socket */ - if (pollfds[1].revents) { - if (pollfds[1].revents & (POLLIN | POLLRDNORM)) { - uart_bcons_server_event(sc); - } else { - break; - } + (void) fprintf(stderr, "uart: unexpected client conn\n"); + (void) shutdown(connfd, SHUT_RDWR); + (void) close(connfd); + } else { + if (fcntl(connfd, F_SETFL, O_NONBLOCK) < 0) { + perror("uart: fcntl(O_NONBLOCK)"); + (void) shutdown(connfd, SHUT_RDWR); + (void) close(connfd); + } else { + sc->usc_sock.clifd = connfd; + sc->mev = mevent_add(sc->usc_sock.clifd, EVF_READ, + uart_sock_drain, sc); } } - if (sc->usc_bcons.clifd != -1) { - fprintf(stderr, "Closing connection with bhyve console\n"); - (void) shutdown(sc->usc_bcons.clifd, SHUT_RDWR); - (void) close(sc->usc_bcons.clifd); - sc->usc_bcons.clifd = -1; - } - - return (NULL); + pthread_mutex_unlock(&sc->mtx); } static int -init_bcons_sock(void) +init_sock(const char *path) { int servfd; struct sockaddr_un servaddr; - if (mkdir(BHYVE_TMPDIR, S_IRWXU) < 0 && errno != EEXIST) { - fprintf(stderr, "bhyve console setup: " - "could not mkdir %s", BHYVE_TMPDIR, strerror(errno)); - return (-1); - } - bzero(&servaddr, sizeof (servaddr)); servaddr.sun_family = AF_UNIX; - (void) snprintf(servaddr.sun_path, sizeof (servaddr.sun_path), - BHYVE_CONS_SOCKPATH, vmname); + + if (strlcpy(servaddr.sun_path, path, sizeof (servaddr.sun_path)) >= + sizeof (servaddr.sun_path)) { + (void) fprintf(stderr, "uart: path '%s' too long\n", + path); + return (-1); + } if ((servfd = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) { - fprintf(stderr, "bhyve console setup: " - "could not 
create socket\n"); + (void) fprintf(stderr, "uart: socket() error - %s\n", + strerror(errno)); return (-1); } (void) unlink(servaddr.sun_path); if (bind(servfd, (struct sockaddr *)&servaddr, sizeof (servaddr)) == -1) { - fprintf(stderr, "bhyve console setup: " - "could not bind to socket\n"); + (void) fprintf(stderr, "uart: bind() error - %s\n", + strerror(errno)); goto out; } - if (listen(servfd, 4) == -1) { - fprintf(stderr, "bhyve console setup: " - "could not listen on socket"); + if (listen(servfd, 1) == -1) { + (void) fprintf(stderr, "uart: listen() error - %s\n", + strerror(errno)); goto out; } return (servfd); @@ -956,7 +799,7 @@ out: (void) close(servfd); return (-1); } -#endif +#endif /* not __FreeBSD__ */ int uart_legacy_alloc(int which, int *baseaddr, int *irq) @@ -978,8 +821,7 @@ uart_init(uart_intr_func_t intr_assert, uart_intr_func_t intr_deassert, { struct uart_softc *sc; - sc = malloc(sizeof(struct uart_softc)); - bzero(sc, sizeof(struct uart_softc)); + sc = calloc(1, sizeof(struct uart_softc)); sc->arg = arg; sc->intr_assert = intr_assert; @@ -992,51 +834,130 @@ uart_init(uart_intr_func_t intr_assert, uart_intr_func_t intr_deassert, return (sc); } -int -uart_set_backend(struct uart_softc *sc, const char *opts) +#ifndef __FreeBSD__ +static int +uart_sock_backend(struct uart_softc *sc, const char *inopts) { -#ifndef __FreeBSD__ - int error; + char *opts; + char *opt; + char *nextopt; + char *path = NULL; + + if (strncmp(inopts, "socket,", 7) != 0) { + return (-1); + } + if ((opts = strdup(inopts + 7)) == NULL) { + return (-1); + } + + nextopt = opts; + for (opt = strsep(&nextopt, ","); opt != NULL; + opt = strsep(&nextopt, ",")) { + if (path == NULL && *opt == '/') { + path = opt; + continue; + } + /* + * XXX check for server and client options here. For now, + * everything is a server + */ + free(opts); + return (-1); + } + + sc->usc_sock.clifd = -1; + if ((sc->usc_sock.servfd = init_sock(path)) == -1) { + free(opts); + return (-1); + } + sc->sock = true; + sc->tty.rfd = sc->tty.wfd = -1; + sc->usc_sock.servmev = mevent_add(sc->usc_sock.servfd, EVF_READ, + uart_sock_accept, sc); + assert(sc->usc_sock.servmev != NULL); + + return (0); +} +#endif /* not __FreeBSD__ */ + +static int +uart_stdio_backend(struct uart_softc *sc) +{ +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; + cap_ioctl_t cmds[] = { TIOCGETA, TIOCSETA, TIOCGWINSZ }; #endif - /* - * XXX one stdio backend supported at this time. 
- */ - if (opts == NULL) - return (0); -#ifdef __FreeBSD__ - if (strcmp("stdio", opts) == 0 && !uart_stdio) { - sc->stdio = true; - uart_stdio = true; - return (0); -#else - if (strcmp("stdio", opts) == 0 && !uart_stdio && !uart_bcons) { - sc->stdio = true; - uart_stdio = true; + if (uart_stdio) + return (-1); - error = pthread_create(NULL, NULL, uart_tty_thread, sc); - assert(error == 0); + sc->tty.rfd = STDIN_FILENO; + sc->tty.wfd = STDOUT_FILENO; + sc->tty.opened = true; - return (0); - } else if (strstr(opts, "bcons") != 0 && !uart_stdio && !uart_bcons) { - sc->bcons = true; - uart_bcons= true; + if (fcntl(sc->tty.rfd, F_SETFL, O_NONBLOCK) != 0) + return (-1); + if (fcntl(sc->tty.wfd, F_SETFL, O_NONBLOCK) != 0) + return (-1); - if (strstr(opts, "bcons,wait") != 0) { - bcons_wait = true; - } +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_EVENT, CAP_IOCTL, CAP_READ); + if (caph_rights_limit(sc->tty.rfd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + if (caph_ioctls_limit(sc->tty.rfd, cmds, nitems(cmds)) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif - sc->usc_bcons.clifd = -1; - if ((sc->usc_bcons.servfd = init_bcons_sock()) == -1) { - fprintf(stderr, "bhyve console setup: " - "socket initialization failed\n"); - return (-1); - } - error = pthread_create(NULL, NULL, uart_bcons_thread, sc); - assert(error == 0); + uart_stdio = true; - return (0); + return (0); +} + +static int +uart_tty_backend(struct uart_softc *sc, const char *opts) +{ +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; + cap_ioctl_t cmds[] = { TIOCGETA, TIOCSETA, TIOCGWINSZ }; #endif - } else + int fd; + + fd = open(opts, O_RDWR | O_NONBLOCK); + if (fd < 0 || !isatty(fd)) return (-1); + + sc->tty.rfd = sc->tty.wfd = fd; + sc->tty.opened = true; + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_EVENT, CAP_IOCTL, CAP_READ, CAP_WRITE); + if (caph_rights_limit(fd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + return (0); +} + +int +uart_set_backend(struct uart_softc *sc, const char *opts) +{ + int retval; + + if (opts == NULL) + return (0); + +#ifndef __FreeBSD__ + if (strncmp("socket,", opts, 7) == 0) + return (uart_sock_backend(sc, opts)); +#endif + if (strcmp("stdio", opts) == 0) + retval = uart_stdio_backend(sc); + else + retval = uart_tty_backend(sc, opts); + if (retval == 0) + uart_opentty(sc); + + return (retval); } diff --git a/usr/src/cmd/bhyve/uart_emul.h b/usr/src/cmd/bhyve/uart_emul.h index ecff957991..a87202df1f 100644 --- a/usr/src/cmd/bhyve/uart_emul.h +++ b/usr/src/cmd/bhyve/uart_emul.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Neel Natu * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/uart_emul.h 257293 2013-10-29 00:18:11Z neel $ + * $FreeBSD$ */ #ifndef _UART_EMUL_H_ diff --git a/usr/src/cmd/bhyve/usb_emul.c b/usr/src/cmd/bhyve/usb_emul.c new file mode 100644 index 0000000000..6ecdd9530e --- /dev/null +++ b/usr/src/cmd/bhyve/usb_emul.c @@ -0,0 +1,78 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Nahanni Systems Inc. + * All rights reserved. 
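With this change, uart_set_backend() accepts three spellings on illumos: "stdio", the path of an existing tty, or "socket,<path>" for a UNIX-domain server socket. A hypothetical usage sketch; the device and socket paths are made up for illustration:

#include <stdio.h>

#include "uart_emul.h"

static void
backend_example(uart_intr_func_t assert_fn, uart_intr_func_t deassert_fn,
    void *arg)
{
	struct uart_softc *sc = uart_init(assert_fn, deassert_fn, arg);

	/* Prefer a socket backend; fall back to stdio if setup fails. */
	if (uart_set_backend(sc, "socket,/tmp/vm.com1") != 0 &&
	    uart_set_backend(sc, "stdio") != 0) {
		(void) fprintf(stderr, "uart: no backend available\n");
	}
}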
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include +#include +#include +#include + +#include "usb_emul.h" + +SET_DECLARE(usb_emu_set, struct usb_devemu); + +struct usb_devemu * +usb_emu_finddev(char *name) +{ + struct usb_devemu **udpp, *udp; + + SET_FOREACH(udpp, usb_emu_set) { + udp = *udpp; + if (!strcmp(udp->ue_emu, name)) + return (udp); + } + + return (NULL); +} + +struct usb_data_xfer_block * +usb_data_xfer_append(struct usb_data_xfer *xfer, void *buf, int blen, + void *hci_data, int ccs) +{ + struct usb_data_xfer_block *xb; + + if (xfer->ndata >= USB_MAX_XFER_BLOCKS) + return (NULL); + + xb = &xfer->data[xfer->tail]; + xb->buf = buf; + xb->blen = blen; + xb->hci_data = hci_data; + xb->ccs = ccs; + xb->processed = 0; + xb->bdone = 0; + xfer->ndata++; + xfer->tail = (xfer->tail + 1) % USB_MAX_XFER_BLOCKS; + return (xb); +} diff --git a/usr/src/cmd/bhyve/usb_emul.h b/usr/src/cmd/bhyve/usb_emul.h new file mode 100644 index 0000000000..e55a421b6f --- /dev/null +++ b/usr/src/cmd/bhyve/usb_emul.h @@ -0,0 +1,164 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Leon Dang + * Copyright 2018 Joyent, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _USB_EMUL_H_
+#define _USB_EMUL_H_
+
+#include
+#include
+#include
+#ifndef __FreeBSD__
+#include
+#endif
+
+#define	USB_MAX_XFER_BLOCKS	8
+
+#define	USB_XFER_OUT	0
+#define	USB_XFER_IN	1
+
+
+
+struct usb_hci;
+struct usb_device_request;
+struct usb_data_xfer;
+
+/* Device emulation handlers */
+struct usb_devemu {
+	char	*ue_emu;	/* name of device emulation */
+	int	ue_usbver;	/* usb version: 2 or 3 */
+	int	ue_usbspeed;	/* usb device speed */
+
+	/* instance creation */
+	void	*(*ue_init)(struct usb_hci *hci, char *opt);
+
+	/* handlers */
+	int	(*ue_request)(void *sc, struct usb_data_xfer *xfer);
+	int	(*ue_data)(void *sc, struct usb_data_xfer *xfer, int dir,
+	    int epctx);
+	int	(*ue_reset)(void *sc);
+	int	(*ue_remove)(void *sc);
+	int	(*ue_stop)(void *sc);
+};
+#define	USB_EMUL_SET(x)	DATA_SET(usb_emu_set, x);
+
+/*
+ * USB device events to notify HCI when state changes
+ */
+enum hci_usbev {
+	USBDEV_ATTACH,
+	USBDEV_RESET,
+	USBDEV_STOP,
+	USBDEV_REMOVE,
+};
+
+/* usb controller, ie xhci, ehci */
+struct usb_hci {
+	int	(*hci_intr)(struct usb_hci *hci, int epctx);
+	int	(*hci_event)(struct usb_hci *hci, enum hci_usbev evid,
+	    void *param);
+	void	*hci_sc;	/* private softc for hci */
+
+	/* controller managed fields */
+	int	hci_address;
+	int	hci_port;
+};
+
+/*
+ * Each xfer block is mapped to the hci transfer block.
+ * On input into the device handler, blen is set to the length of buf.
+ * The device handler is to update blen to reflect the residual size
+ * of the buffer, i.e. len(buf) - len(consumed).
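To make that blen contract concrete: a device handler that produces n bytes into a block should clamp to the space offered, advance bdone, and leave blen as the residual. A sketch under those assumptions; fill_block is a hypothetical helper, not part of this change:

#include <string.h>

#include "usb_emul.h"

static void
fill_block(struct usb_data_xfer_block *xb, const void *src, int n)
{
	/* Never write past the space the HCI offered (blen on input). */
	if (n > xb->blen)
		n = xb->blen;

	memcpy(xb->buf, src, n);
	xb->bdone += n;		/* bytes actually transferred */
	xb->blen -= n;		/* residual: len(buf) - len(consumed) */
}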
+ */ +struct usb_data_xfer_block { + void *buf; /* IN or OUT pointer */ + int blen; /* in:len(buf), out:len(remaining) */ + int bdone; /* bytes transferred */ + uint32_t processed; /* device processed this + errcode */ + void *hci_data; /* HCI private reference */ + int ccs; + uint32_t streamid; + uint64_t trbnext; /* next TRB guest address */ +}; + +struct usb_data_xfer { + struct usb_data_xfer_block data[USB_MAX_XFER_BLOCKS]; + struct usb_device_request *ureq; /* setup ctl request */ + int ndata; /* # of data items */ + int head; + int tail; + pthread_mutex_t mtx; +}; + +enum USB_ERRCODE { + USB_ACK, + USB_NAK, + USB_STALL, + USB_NYET, + USB_ERR, + USB_SHORT +}; + +#define USB_DATA_GET_ERRCODE(x) (x)->processed >> 8 +#define USB_DATA_SET_ERRCODE(x,e) do { \ + (x)->processed = ((x)->processed & 0xFF) | (e << 8); \ + } while (0) + +#define USB_DATA_OK(x,i) ((x)->data[(i)].buf != NULL) + +#define USB_DATA_XFER_INIT(x) do { \ + memset((x), 0, sizeof(*(x))); \ + pthread_mutex_init(&((x)->mtx), NULL); \ + } while (0) + +#define USB_DATA_XFER_RESET(x) do { \ + memset((x)->data, 0, sizeof((x)->data)); \ + (x)->ndata = 0; \ + (x)->head = (x)->tail = 0; \ + } while (0) + +#define USB_DATA_XFER_LOCK(x) do { \ + pthread_mutex_lock(&((x)->mtx)); \ + } while (0) + +#define USB_DATA_XFER_UNLOCK(x) do { \ + pthread_mutex_unlock(&((x)->mtx)); \ + } while (0) +#ifndef __FreeBSD__ +#define USB_DATA_XFER_LOCK_HELD(x) MUTEX_HELD(&((x)->mtx)) +#endif + +struct usb_devemu *usb_emu_finddev(char *name); + +struct usb_data_xfer_block *usb_data_xfer_append(struct usb_data_xfer *xfer, + void *buf, int blen, void *hci_data, int ccs); + + +#endif /* _USB_EMUL_H_ */ diff --git a/usr/src/cmd/bhyve/usb_mouse.c b/usr/src/cmd/bhyve/usb_mouse.c new file mode 100644 index 0000000000..921fce5db9 --- /dev/null +++ b/usr/src/cmd/bhyve/usb_mouse.c @@ -0,0 +1,809 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Leon Dang + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
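Putting the pieces above together: blocks are appended to an xfer under the xfer lock, and the append fails once the ring holds USB_MAX_XFER_BLOCKS entries. A hypothetical usage sketch of the macros and helper defined above:

#include <stddef.h>

#include "usb_emul.h"

static int
queue_block(struct usb_data_xfer *xfer, void *buf, int blen)
{
	struct usb_data_xfer_block *xb;

	USB_DATA_XFER_LOCK(xfer);
	/* Returns NULL once USB_MAX_XFER_BLOCKS entries are queued. */
	xb = usb_data_xfer_append(xfer, buf, blen, NULL, 0);
	USB_DATA_XFER_UNLOCK(xfer);

	return (xb == NULL ? -1 : 0);
}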
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#include +#include +#include +#include + +#include +#include + +#include "usb_emul.h" +#include "console.h" +#include "bhyvegc.h" + +static int umouse_debug = 0; +#define DPRINTF(params) if (umouse_debug) printf params +#define WPRINTF(params) printf params + +/* USB endpoint context (1-15) for reporting mouse data events*/ +#define UMOUSE_INTR_ENDPT 1 + +#define UMOUSE_REPORT_DESC_TYPE 0x22 + +#define UMOUSE_GET_REPORT 0x01 +#define UMOUSE_GET_IDLE 0x02 +#define UMOUSE_GET_PROTOCOL 0x03 +#define UMOUSE_SET_REPORT 0x09 +#define UMOUSE_SET_IDLE 0x0A +#define UMOUSE_SET_PROTOCOL 0x0B + +#define HSETW(ptr, val) ptr = { (uint8_t)(val), (uint8_t)((val) >> 8) } + +enum { + UMSTR_LANG, + UMSTR_MANUFACTURER, + UMSTR_PRODUCT, + UMSTR_SERIAL, + UMSTR_CONFIG, + UMSTR_MAX +}; + +static const char *umouse_desc_strings[] = { + "\x04\x09", + "BHYVE", + "HID Tablet", + "01", + "HID Tablet Device", +}; + +struct umouse_hid_descriptor { + uint8_t bLength; + uint8_t bDescriptorType; + uint8_t bcdHID[2]; + uint8_t bCountryCode; + uint8_t bNumDescriptors; + uint8_t bReportDescriptorType; + uint8_t wItemLength[2]; +} __packed; + +struct umouse_config_desc { + struct usb_config_descriptor confd; + struct usb_interface_descriptor ifcd; + struct umouse_hid_descriptor hidd; + struct usb_endpoint_descriptor endpd; + struct usb_endpoint_ss_comp_descriptor sscompd; +} __packed; + +#define MOUSE_MAX_X 0x8000 +#define MOUSE_MAX_Y 0x8000 + +static const uint8_t umouse_report_desc[] = { + 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */ + 0x09, 0x02, /* USAGE (Mouse) */ + 0xa1, 0x01, /* COLLECTION (Application) */ + 0x09, 0x01, /* USAGE (Pointer) */ + 0xa1, 0x00, /* COLLECTION (Physical) */ + 0x05, 0x09, /* USAGE_PAGE (Button) */ + 0x19, 0x01, /* USAGE_MINIMUM (Button 1) */ + 0x29, 0x03, /* USAGE_MAXIMUM (Button 3) */ + 0x15, 0x00, /* LOGICAL_MINIMUM (0) */ + 0x25, 0x01, /* LOGICAL_MAXIMUM (1) */ + 0x75, 0x01, /* REPORT_SIZE (1) */ + 0x95, 0x03, /* REPORT_COUNT (3) */ + 0x81, 0x02, /* INPUT (Data,Var,Abs); 3 buttons */ + 0x75, 0x05, /* REPORT_SIZE (5) */ + 0x95, 0x01, /* REPORT_COUNT (1) */ + 0x81, 0x03, /* INPUT (Cnst,Var,Abs); padding */ + 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */ + 0x09, 0x30, /* USAGE (X) */ + 0x09, 0x31, /* USAGE (Y) */ + 0x35, 0x00, /* PHYSICAL_MINIMUM (0) */ + 0x46, 0xff, 0x7f, /* PHYSICAL_MAXIMUM (0x7fff) */ + 0x15, 0x00, /* LOGICAL_MINIMUM (0) */ + 0x26, 0xff, 0x7f, /* LOGICAL_MAXIMUM (0x7fff) */ + 0x75, 0x10, /* REPORT_SIZE (16) */ + 0x95, 0x02, /* REPORT_COUNT (2) */ + 0x81, 0x02, /* INPUT (Data,Var,Abs) */ + 0x05, 0x01, /* USAGE Page (Generic Desktop) */ + 0x09, 0x38, /* USAGE (Wheel) */ + 0x35, 0x00, /* PHYSICAL_MINIMUM (0) */ + 0x45, 0x00, /* PHYSICAL_MAXIMUM (0) */ + 0x15, 0x81, /* LOGICAL_MINIMUM (-127) */ + 0x25, 0x7f, /* LOGICAL_MAXIMUM (127) */ + 0x75, 0x08, /* REPORT_SIZE (8) */ + 0x95, 0x01, /* REPORT_COUNT (1) */ + 0x81, 0x06, /* INPUT (Data,Var,Rel) */ + 0xc0, /* END_COLLECTION */ + 0xc0 /* END_COLLECTION */ +}; + +struct umouse_report { + uint8_t buttons; /* bits: 0 left, 1 right, 2 middle */ + int16_t x; /* x position */ + int16_t y; /* y position */ + int8_t z; /* z wheel position */ +} __packed; + + +#define MSETW(ptr, val) ptr = { (uint8_t)(val), (uint8_t)((val) >> 8) } + +static struct usb_device_descriptor umouse_dev_desc = { + .bLength = sizeof(umouse_dev_desc), + .bDescriptorType = UDESC_DEVICE, + MSETW(.bcdUSB, UD_USB_3_0), + .bMaxPacketSize = 8, /* max packet size */ + MSETW(.idVendor, 0xFB5D), /* vendor */ + 
MSETW(.idProduct, 0x0001), /* product */ + MSETW(.bcdDevice, 0), /* device version */ + .iManufacturer = UMSTR_MANUFACTURER, + .iProduct = UMSTR_PRODUCT, + .iSerialNumber = UMSTR_SERIAL, + .bNumConfigurations = 1, +}; + +static struct umouse_config_desc umouse_confd = { + .confd = { + .bLength = sizeof(umouse_confd.confd), + .bDescriptorType = UDESC_CONFIG, + .wTotalLength[0] = sizeof(umouse_confd), + .bNumInterface = 1, + .bConfigurationValue = 1, + .iConfiguration = UMSTR_CONFIG, + .bmAttributes = UC_BUS_POWERED | UC_REMOTE_WAKEUP, + .bMaxPower = 0, + }, + .ifcd = { + .bLength = sizeof(umouse_confd.ifcd), + .bDescriptorType = UDESC_INTERFACE, + .bNumEndpoints = 1, + .bInterfaceClass = UICLASS_HID, + .bInterfaceSubClass = UISUBCLASS_BOOT, + .bInterfaceProtocol = UIPROTO_MOUSE, + }, + .hidd = { + .bLength = sizeof(umouse_confd.hidd), + .bDescriptorType = 0x21, + .bcdHID = { 0x01, 0x10 }, + .bCountryCode = 0, + .bNumDescriptors = 1, + .bReportDescriptorType = UMOUSE_REPORT_DESC_TYPE, + .wItemLength = { sizeof(umouse_report_desc), 0 }, + }, + .endpd = { + .bLength = sizeof(umouse_confd.endpd), + .bDescriptorType = UDESC_ENDPOINT, + .bEndpointAddress = UE_DIR_IN | UMOUSE_INTR_ENDPT, + .bmAttributes = UE_INTERRUPT, + .wMaxPacketSize[0] = 8, + .bInterval = 0xA, + }, + .sscompd = { + .bLength = sizeof(umouse_confd.sscompd), + .bDescriptorType = UDESC_ENDPOINT_SS_COMP, + .bMaxBurst = 0, + .bmAttributes = 0, + MSETW(.wBytesPerInterval, 0), + }, +}; + + +struct umouse_bos_desc { + struct usb_bos_descriptor bosd; + struct usb_devcap_ss_descriptor usbssd; +} __packed; + + +struct umouse_bos_desc umouse_bosd = { + .bosd = { + .bLength = sizeof(umouse_bosd.bosd), + .bDescriptorType = UDESC_BOS, + HSETW(.wTotalLength, sizeof(umouse_bosd)), + .bNumDeviceCaps = 1, + }, + .usbssd = { + .bLength = sizeof(umouse_bosd.usbssd), + .bDescriptorType = UDESC_DEVICE_CAPABILITY, + .bDevCapabilityType = 3, + .bmAttributes = 0, + HSETW(.wSpeedsSupported, 0x08), + .bFunctionalitySupport = 3, + .bU1DevExitLat = 0xa, /* dummy - not used */ + .wU2DevExitLat = { 0x20, 0x00 }, + } +}; + + +struct umouse_softc { + struct usb_hci *hci; + + char *opt; + + struct umouse_report um_report; + int newdata; + struct { + uint8_t idle; + uint8_t protocol; + uint8_t feature; + } hid; + + pthread_mutex_t mtx; + pthread_mutex_t ev_mtx; + int polling; + struct timeval prev_evt; +}; + +static void +umouse_event(uint8_t button, int x, int y, void *arg) +{ + struct umouse_softc *sc; + struct bhyvegc_image *gc; + + gc = console_get_image(); + if (gc == NULL) { + /* not ready */ + return; + } + + sc = arg; + + pthread_mutex_lock(&sc->mtx); + + sc->um_report.buttons = 0; + sc->um_report.z = 0; + + if (button & 0x01) + sc->um_report.buttons |= 0x01; /* left */ + if (button & 0x02) + sc->um_report.buttons |= 0x04; /* middle */ + if (button & 0x04) + sc->um_report.buttons |= 0x02; /* right */ + if (button & 0x8) + sc->um_report.z = 1; + if (button & 0x10) + sc->um_report.z = -1; + + /* scale coords to mouse resolution */ + sc->um_report.x = MOUSE_MAX_X * x / gc->width; + sc->um_report.y = MOUSE_MAX_Y * y / gc->height; + sc->newdata = 1; + pthread_mutex_unlock(&sc->mtx); + + pthread_mutex_lock(&sc->ev_mtx); + sc->hci->hci_intr(sc->hci, UE_DIR_IN | UMOUSE_INTR_ENDPT); + pthread_mutex_unlock(&sc->ev_mtx); +} + +static void * +umouse_init(struct usb_hci *hci, char *opt) +{ + struct umouse_softc *sc; + + sc = calloc(1, sizeof(struct umouse_softc)); + sc->hci = hci; + + sc->hid.protocol = 1; /* REPORT protocol */ + sc->opt = strdup(opt); + 
pthread_mutex_init(&sc->mtx, NULL); + pthread_mutex_init(&sc->ev_mtx, NULL); + + console_ptr_register(umouse_event, sc, 10); + + return (sc); +} + +#define UREQ(x,y) ((x) | ((y) << 8)) + +static int +umouse_request(void *scarg, struct usb_data_xfer *xfer) +{ + struct umouse_softc *sc; + struct usb_data_xfer_block *data; + const char *str; + uint16_t value; + uint16_t index; + uint16_t len; + uint16_t slen; + uint8_t *udata; + int err; + int i, idx; + int eshort; + + sc = scarg; + + data = NULL; + udata = NULL; + idx = xfer->head; + for (i = 0; i < xfer->ndata; i++) { + xfer->data[idx].bdone = 0; + if (data == NULL && USB_DATA_OK(xfer,i)) { + data = &xfer->data[idx]; + udata = data->buf; + } + + xfer->data[idx].processed = 1; + idx = (idx + 1) % USB_MAX_XFER_BLOCKS; + } + + err = USB_ERR_NORMAL_COMPLETION; + eshort = 0; + + if (!xfer->ureq) { + DPRINTF(("umouse_request: port %d\r\n", sc->hci->hci_port)); + goto done; + } + + value = UGETW(xfer->ureq->wValue); + index = UGETW(xfer->ureq->wIndex); + len = UGETW(xfer->ureq->wLength); + + DPRINTF(("umouse_request: port %d, type 0x%x, req 0x%x, val 0x%x, " + "idx 0x%x, len %u\r\n", + sc->hci->hci_port, xfer->ureq->bmRequestType, + xfer->ureq->bRequest, value, index, len)); + + switch (UREQ(xfer->ureq->bRequest, xfer->ureq->bmRequestType)) { + case UREQ(UR_GET_CONFIG, UT_READ_DEVICE): + DPRINTF(("umouse: (UR_GET_CONFIG, UT_READ_DEVICE)\r\n")); + if (!data) + break; + + *udata = umouse_confd.confd.bConfigurationValue; + data->blen = len > 0 ? len - 1 : 0; + eshort = data->blen > 0; + data->bdone += 1; + break; + + case UREQ(UR_GET_DESCRIPTOR, UT_READ_DEVICE): + DPRINTF(("umouse: (UR_GET_DESCRIPTOR, UT_READ_DEVICE) val %x\r\n", + value >> 8)); + if (!data) + break; + + switch (value >> 8) { + case UDESC_DEVICE: + DPRINTF(("umouse: (->UDESC_DEVICE) len %u ?= " + "sizeof(umouse_dev_desc) %lu\r\n", + len, sizeof(umouse_dev_desc))); + if ((value & 0xFF) != 0) { + err = USB_ERR_IOERROR; + goto done; + } + if (len > sizeof(umouse_dev_desc)) { + data->blen = len - sizeof(umouse_dev_desc); + len = sizeof(umouse_dev_desc); + } else + data->blen = 0; + memcpy(data->buf, &umouse_dev_desc, len); + data->bdone += len; + break; + + case UDESC_CONFIG: + DPRINTF(("umouse: (->UDESC_CONFIG)\r\n")); + if ((value & 0xFF) != 0) { + err = USB_ERR_IOERROR; + goto done; + } + if (len > sizeof(umouse_confd)) { + data->blen = len - sizeof(umouse_confd); + len = sizeof(umouse_confd); + } else + data->blen = 0; + + memcpy(data->buf, &umouse_confd, len); + data->bdone += len; + break; + + case UDESC_STRING: + DPRINTF(("umouse: (->UDESC_STRING)\r\n")); + str = NULL; + if ((value & 0xFF) < UMSTR_MAX) + str = umouse_desc_strings[value & 0xFF]; + else + goto done; + + if ((value & 0xFF) == UMSTR_LANG) { + udata[0] = 4; + udata[1] = UDESC_STRING; + data->blen = len - 2; + len -= 2; + data->bdone += 2; + + if (len >= 2) { + udata[2] = str[0]; + udata[3] = str[1]; + data->blen -= 2; + data->bdone += 2; + } else + data->blen = 0; + + goto done; + } + + slen = 2 + strlen(str) * 2; + udata[0] = slen; + udata[1] = UDESC_STRING; + + if (len > slen) { + data->blen = len - slen; + len = slen; + } else + data->blen = 0; + for (i = 2; i < len; i += 2) { + udata[i] = *str++; + udata[i+1] = '\0'; + } + data->bdone += slen; + + break; + + case UDESC_BOS: + DPRINTF(("umouse: USB3 BOS\r\n")); + if (len > sizeof(umouse_bosd)) { + data->blen = len - sizeof(umouse_bosd); + len = sizeof(umouse_bosd); + } else + data->blen = 0; + memcpy(udata, &umouse_bosd, len); + data->bdone += len; + break; + + 
+		default:
+			DPRINTF(("umouse: unknown(%d)->ERROR\r\n", value >> 8));
+			err = USB_ERR_IOERROR;
+			goto done;
+		}
+		eshort = data->blen > 0;
+		break;
+
+	case UREQ(UR_GET_DESCRIPTOR, UT_READ_INTERFACE):
+		DPRINTF(("umouse: (UR_GET_DESCRIPTOR, UT_READ_INTERFACE) "
+		    "0x%x\r\n", (value >> 8)));
+		if (!data)
+			break;
+
+		switch (value >> 8) {
+		case UMOUSE_REPORT_DESC_TYPE:
+			if (len > sizeof(umouse_report_desc)) {
+				data->blen = len - sizeof(umouse_report_desc);
+				len = sizeof(umouse_report_desc);
+			} else
+				data->blen = 0;
+			memcpy(data->buf, umouse_report_desc, len);
+			data->bdone += len;
+			break;
+		default:
+			DPRINTF(("umouse: IO ERROR\r\n"));
+			err = USB_ERR_IOERROR;
+			goto done;
+		}
+		eshort = data->blen > 0;
+		break;
+
+	case UREQ(UR_GET_INTERFACE, UT_READ_INTERFACE):
+		DPRINTF(("umouse: (UR_GET_INTERFACE, UT_READ_INTERFACE)\r\n"));
+		if (index != 0) {
+			DPRINTF(("umouse get_interface, invalid index %d\r\n",
+			    index));
+			err = USB_ERR_IOERROR;
+			goto done;
+		}
+
+		if (!data)
+			break;
+
+		if (len > 0) {
+			*udata = 0;
+			data->blen = len - 1;
+		}
+		eshort = data->blen > 0;
+		data->bdone += 1;
+		break;
+
+	case UREQ(UR_GET_STATUS, UT_READ_DEVICE):
+		DPRINTF(("umouse: (UR_GET_STATUS, UT_READ_DEVICE)\r\n"));
+		if (data != NULL && len > 1) {
+			if (sc->hid.feature == UF_DEVICE_REMOTE_WAKEUP)
+				USETW(udata, UDS_REMOTE_WAKEUP);
+			else
+				USETW(udata, 0);
+			data->blen = len - 2;
+			data->bdone += 2;
+		}
+
+		eshort = data->blen > 0;
+		break;
+
+	case UREQ(UR_GET_STATUS, UT_READ_INTERFACE):
+	case UREQ(UR_GET_STATUS, UT_READ_ENDPOINT):
+		DPRINTF(("umouse: (UR_GET_STATUS, UT_READ_INTERFACE)\r\n"));
+		if (data != NULL && len > 1) {
+			USETW(udata, 0);
+			data->blen = len - 2;
+			data->bdone += 2;
+		}
+		eshort = data->blen > 0;
+		break;
+
+	case UREQ(UR_SET_ADDRESS, UT_WRITE_DEVICE):
+		/* XXX Controller should've handled this */
+		DPRINTF(("umouse set address %u\r\n", value));
+		break;
+
+	case UREQ(UR_SET_CONFIG, UT_WRITE_DEVICE):
+		DPRINTF(("umouse set config %u\r\n", value));
+		break;
+
+	case UREQ(UR_SET_DESCRIPTOR, UT_WRITE_DEVICE):
+		DPRINTF(("umouse set descriptor %u\r\n", value));
+		break;
+
+
+	case UREQ(UR_CLEAR_FEATURE, UT_WRITE_DEVICE):
+		DPRINTF(("umouse: (UR_CLEAR_FEATURE, UT_WRITE_DEVICE) %x\r\n", value));
+		if (value == UF_DEVICE_REMOTE_WAKEUP)
+			sc->hid.feature = 0;
+		break;
+
+	case UREQ(UR_SET_FEATURE, UT_WRITE_DEVICE):
+		DPRINTF(("umouse: (UR_SET_FEATURE, UT_WRITE_DEVICE) %x\r\n", value));
+		if (value == UF_DEVICE_REMOTE_WAKEUP)
+			sc->hid.feature = UF_DEVICE_REMOTE_WAKEUP;
+		break;
+
+	case UREQ(UR_CLEAR_FEATURE, UT_WRITE_INTERFACE):
+	case UREQ(UR_CLEAR_FEATURE, UT_WRITE_ENDPOINT):
+	case UREQ(UR_SET_FEATURE, UT_WRITE_INTERFACE):
+	case UREQ(UR_SET_FEATURE, UT_WRITE_ENDPOINT):
+		DPRINTF(("umouse: (UR_CLEAR_FEATURE, UT_WRITE_INTERFACE)\r\n"));
+		err = USB_ERR_IOERROR;
+		goto done;
+
+	case UREQ(UR_SET_INTERFACE, UT_WRITE_INTERFACE):
+		DPRINTF(("umouse set interface %u\r\n", value));
+		break;
+
+	case UREQ(UR_ISOCH_DELAY, UT_WRITE_DEVICE):
+		DPRINTF(("umouse set isoch delay %u\r\n", value));
+		break;
+
+	case UREQ(UR_SET_SEL, 0):
+		DPRINTF(("umouse set sel\r\n"));
+		break;
+
+	case UREQ(UR_SYNCH_FRAME, UT_WRITE_ENDPOINT):
+		DPRINTF(("umouse synch frame\r\n"));
+		break;
+
+	/* HID device requests */
+
+	case UREQ(UMOUSE_GET_REPORT, UT_READ_CLASS_INTERFACE):
+		DPRINTF(("umouse: (UMOUSE_GET_REPORT, UT_READ_CLASS_INTERFACE) "
+		    "0x%x\r\n", (value >> 8)));
+		if (!data)
+			break;
+
+		if ((value >> 8) == 0x01 && len >= sizeof(sc->um_report)) {
+			/* TODO read from backend */
+
+			if (len > sizeof(sc->um_report)) {
+				data->blen = len - sizeof(sc->um_report);
+				len = sizeof(sc->um_report);
+			} else
+				data->blen = 0;
+
+			memcpy(data->buf, &sc->um_report, len);
+			data->bdone += len;
+		} else {
+			err = USB_ERR_IOERROR;
+			goto done;
+		}
+		eshort = data->blen > 0;
+		break;
+
+	case UREQ(UMOUSE_GET_IDLE, UT_READ_CLASS_INTERFACE):
+		if (data != NULL && len > 0) {
+			*udata = sc->hid.idle;
+			data->blen = len - 1;
+			data->bdone += 1;
+		}
+		eshort = data->blen > 0;
+		break;
+
+	case UREQ(UMOUSE_GET_PROTOCOL, UT_READ_CLASS_INTERFACE):
+		if (data != NULL && len > 0) {
+			*udata = sc->hid.protocol;
+			data->blen = len - 1;
+			data->bdone += 1;
+		}
+		eshort = data->blen > 0;
+		break;
+
+	case UREQ(UMOUSE_SET_REPORT, UT_WRITE_CLASS_INTERFACE):
+		DPRINTF(("umouse: (UMOUSE_SET_REPORT, UT_WRITE_CLASS_INTERFACE) ignored\r\n"));
+		break;
+
+	case UREQ(UMOUSE_SET_IDLE, UT_WRITE_CLASS_INTERFACE):
+		sc->hid.idle = UGETW(xfer->ureq->wValue) >> 8;
+		DPRINTF(("umouse: (UMOUSE_SET_IDLE, UT_WRITE_CLASS_INTERFACE) %x\r\n",
+		    sc->hid.idle));
+		break;
+
+	case UREQ(UMOUSE_SET_PROTOCOL, UT_WRITE_CLASS_INTERFACE):
+		sc->hid.protocol = UGETW(xfer->ureq->wValue) >> 8;
+		DPRINTF(("umouse: (UMOUSE_SET_PROTOCOL, UT_WRITE_CLASS_INTERFACE) %x\r\n",
+		    sc->hid.protocol));
+		break;
+
+	default:
+		DPRINTF(("**** umouse request unhandled\r\n"));
+		err = USB_ERR_IOERROR;
+		break;
+	}
+
+done:
+/* UT_WRITE is 0, so this condition is never true. */
+#ifdef __FreeBSD__
+	if (xfer->ureq && (xfer->ureq->bmRequestType & UT_WRITE) &&
+	    (err == USB_ERR_NORMAL_COMPLETION) && (data != NULL))
+		data->blen = 0;
+	else if (eshort)
+		err = USB_ERR_SHORT_XFER;
+#else
+	if (eshort)
+		err = USB_ERR_SHORT_XFER;
+#endif
+
+
+	DPRINTF(("umouse request error code %d (0=ok), blen %u txlen %u\r\n",
+	    err, (data ? data->blen : 0), (data ? data->bdone : 0)));
+
+	return (err);
+}
+
+static int
+umouse_data_handler(void *scarg, struct usb_data_xfer *xfer, int dir,
+    int epctx)
+{
+	struct umouse_softc *sc;
+	struct usb_data_xfer_block *data;
+	uint8_t *udata;
+	int len, i, idx;
+	int err;
+
+	DPRINTF(("umouse handle data - DIR=%s|EP=%d, blen %d\r\n",
+	    dir ?
"IN" : "OUT", epctx, xfer->data[0].blen)); + + + /* find buffer to add data */ + udata = NULL; + err = USB_ERR_NORMAL_COMPLETION; + + /* handle xfer at first unprocessed item with buffer */ + data = NULL; + idx = xfer->head; + for (i = 0; i < xfer->ndata; i++) { + data = &xfer->data[idx]; + if (data->buf != NULL && data->blen != 0) { + break; + } else { + data->processed = 1; + data = NULL; + } + idx = (idx + 1) % USB_MAX_XFER_BLOCKS; + } + if (!data) + goto done; + + udata = data->buf; + len = data->blen; + + if (udata == NULL) { + DPRINTF(("umouse no buffer provided for input\r\n")); + err = USB_ERR_NOMEM; + goto done; + } + + sc = scarg; + + if (dir) { + + pthread_mutex_lock(&sc->mtx); + + if (!sc->newdata) { + err = USB_ERR_CANCELLED; + USB_DATA_SET_ERRCODE(&xfer->data[xfer->head], USB_NAK); + pthread_mutex_unlock(&sc->mtx); + goto done; + } + + if (sc->polling) { + err = USB_ERR_STALLED; + USB_DATA_SET_ERRCODE(data, USB_STALL); + pthread_mutex_unlock(&sc->mtx); + goto done; + } + sc->polling = 1; + + if (len > 0) { + sc->newdata = 0; + + data->processed = 1; + data->bdone += 6; + memcpy(udata, &sc->um_report, 6); + data->blen = len - 6; + if (data->blen > 0) + err = USB_ERR_SHORT_XFER; + } + + sc->polling = 0; + pthread_mutex_unlock(&sc->mtx); + } else { + USB_DATA_SET_ERRCODE(data, USB_STALL); + err = USB_ERR_STALLED; + } + +done: + return (err); +} + +static int +umouse_reset(void *scarg) +{ + struct umouse_softc *sc; + + sc = scarg; + + sc->newdata = 0; + + return (0); +} + +static int +umouse_remove(void *scarg) +{ + + return (0); +} + +static int +umouse_stop(void *scarg) +{ + + return (0); +} + + +struct usb_devemu ue_mouse = { + .ue_emu = "tablet", + .ue_usbver = 3, + .ue_usbspeed = USB_SPEED_HIGH, + .ue_init = umouse_init, + .ue_request = umouse_request, + .ue_data = umouse_data_handler, + .ue_reset = umouse_reset, + .ue_remove = umouse_remove, + .ue_stop = umouse_stop +}; +USB_EMUL_SET(ue_mouse); diff --git a/usr/src/cmd/bhyve/vga.c b/usr/src/cmd/bhyve/vga.c index 4330741042..314ddeb1e8 100644 --- a/usr/src/cmd/bhyve/vga.c +++ b/usr/src/cmd/bhyve/vga.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale * All rights reserved. * @@ -24,6 +26,10 @@ * SUCH DAMAGE. */ +/* + * Copyright 2018 Joyent, Inc. + */ + #include __FBSDID("$FreeBSD$"); @@ -161,10 +167,10 @@ struct vga_softc { */ struct { uint8_t dac_state; - int dac_rd_index; - int dac_rd_subindex; - int dac_wr_index; - int dac_wr_subindex; + uint8_t dac_rd_index; + uint8_t dac_rd_subindex; + uint8_t dac_wr_index; + uint8_t dac_wr_subindex; uint8_t dac_palette[3 * 256]; uint32_t dac_palette_rgb[256]; } vga_dac; @@ -187,8 +193,10 @@ vga_check_size(struct bhyvegc *gc, struct vga_softc *sc) if (vga_in_reset(sc)) return; - old_width = sc->gc_width; - old_height = sc->gc_height; + //old_width = sc->gc_width; + //old_height = sc->gc_height; + old_width = sc->gc_image->width; + old_height = sc->gc_image->height; /* * Horizontal Display End: For text modes this is the number @@ -263,7 +271,7 @@ vga_get_text_pixel(struct vga_softc *sc, int x, int y) offset = 2 * sc->vga_crtc.crtc_start_addr; offset += (y / 16 * sc->gc_width / dots) * 2 + (x / dots) * 2; - bit = 7 - (x % dots); + bit = 7 - (x % dots > 7 ? 
7 : x % dots); ch = sc->vga_ram[offset + 0 * 64*KB]; attr = sc->vga_ram[offset + 1 * 64*KB]; @@ -291,7 +299,7 @@ vga_get_text_pixel(struct vga_softc *sc, int x, int y) font = sc->vga_ram[font_offset + 2 * 64*KB]; - if ((bit > 0) && (font & (1 << bit))) + if (font & (1 << bit)) idx = sc->vga_atc.atc_palette[attr & 0xf]; else idx = sc->vga_atc.atc_palette[attr >> 4]; @@ -314,7 +322,7 @@ vga_render_text(struct vga_softc *sc) } } -static void +void vga_render(struct bhyvegc *gc, void *arg) { struct vga_softc *sc = arg; @@ -361,7 +369,11 @@ vga_mem_rd_handler(struct vmctx *ctx, uint64_t addr, void *arg1) /* * monochrome text mode: base 0xb0000 size 32kb */ +#ifdef __FreeBSD__ assert(0); +#else + abort(); +#endif case 0x3: /* * color text mode and CGA: base 0xb8000 size 32kb @@ -425,7 +437,11 @@ vga_mem_wr_handler(struct vmctx *ctx, uint64_t addr, uint8_t val, void *arg1) /* * monochrome text mode: base 0xb0000 size 32kb */ +#ifdef __FreeBSD__ assert(0); +#else + abort(); +#endif case 0x3: /* * color text mode and CGA: base 0xb8000 size 32kb @@ -858,6 +874,7 @@ vga_port_in_handler(struct vmctx *ctx, int in, int port, int bytes, assert(0); break; } + break; case DAC_DATA_PORT: *val = sc->vga_dac.dac_palette[3 * sc->vga_dac.dac_rd_index + sc->vga_dac.dac_rd_subindex]; @@ -914,15 +931,33 @@ vga_port_in_handler(struct vmctx *ctx, int in, int port, int bytes, case GEN_INPUT_STS1_MONO_PORT: case GEN_INPUT_STS1_COLOR_PORT: sc->vga_atc.atc_flipflop = 0; +#ifdef __FreeBSD__ + sc->vga_sts1 = GEN_IS1_VR | GEN_IS1_DE; + //sc->vga_sts1 ^= (GEN_IS1_VR | GEN_IS1_DE); +#else + /* + * During the bhyve bring-up process, a guest image was failing + * to successfully boot. It appeared to be spinning, waiting + * for this value to be toggled. Until it can be ruled out + * that this is unnecessary (and documentation seems to + * indicate that it should be present), the toggle should + * remain. 
+ */ sc->vga_sts1 ^= (GEN_IS1_VR | GEN_IS1_DE); +#endif *val = sc->vga_sts1; break; case GEN_FEATURE_CTRL_PORT: - assert(0); + // OpenBSD calls this with bytes = 1 + //assert(0); + *val = 0; + break; + case 0x3c3: + *val = 0; break; default: printf("XXX vga_port_in_handler() unhandled port 0x%x\n", port); - assert(0); + //assert(0); return (-1); } @@ -1060,7 +1095,7 @@ vga_port_out_handler(struct vmctx *ctx, int in, int port, int bytes, sc->vga_atc.atc_color_select_45 = (val & ATC_CS_C45) << 4; sc->vga_atc.atc_color_select_67 = - (val & ATC_CS_C67) << 6; + ((val & ATC_CS_C67) >> 2) << 6; break; default: //printf("XXX VGA ATC: outb 0x%04x, 0x%02x at index %d\n", port, val, sc->vga_atc.atc_index); @@ -1095,7 +1130,8 @@ vga_port_out_handler(struct vmctx *ctx, int in, int port, int bytes, break; case SEQ_MEMORY_MODE: sc->vga_seq.seq_mm = val; - assert((sc->vga_seq.seq_mm & SEQ_MM_C4) == 0); + /* Windows queries Chain4 */ + //assert((sc->vga_seq.seq_mm & SEQ_MM_C4) == 0); break; default: //printf("XXX VGA SEQ: outb 0x%04x, 0x%02x at index %d\n", port, val, sc->vga_seq.seq_index); @@ -1161,6 +1197,9 @@ vga_port_out_handler(struct vmctx *ctx, int in, int port, int bytes, sc->vga_gc.gc_mode_oe = (val & GC_MODE_OE) != 0; sc->vga_gc.gc_mode_rm = (val >> 3) & 0x1; sc->vga_gc.gc_mode_wm = val & 0x3; + + if (sc->gc_image) + sc->gc_image->vgamode = 1; break; case GC_MISCELLANEOUS: sc->vga_gc.gc_misc = val; @@ -1188,8 +1227,10 @@ vga_port_out_handler(struct vmctx *ctx, int in, int port, int bytes, case GEN_INPUT_STS1_COLOR_PORT: /* write to Feature Control Register */ break; +// case 0x3c3: +// break; default: - printf("XXX vga_port_out_handler() unhandled port 0x%x\n", port); + printf("XXX vga_port_out_handler() unhandled port 0x%x, val 0x%x\n", port, val); //assert(0); return (-1); } @@ -1248,8 +1289,8 @@ vga_port_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, return (error); } -int -vga_init(void) +void * +vga_init(int io_only) { struct inout_port iop; struct vga_softc *sc; @@ -1270,6 +1311,12 @@ vga_init(void) assert(error == 0); } + sc->gc_image = console_get_image(); + + /* only handle io ports; vga graphics is disabled */ + if (io_only) + return(sc); + sc->mr.name = "VGA memory"; sc->mr.flags = MEM_F_RW; sc->mr.base = 640 * KB; @@ -1282,8 +1329,29 @@ vga_init(void) sc->vga_ram = malloc(256 * KB); memset(sc->vga_ram, 0, 256 * KB); - sc->gc_image = console_get_image(); - console_fb_register(vga_render, sc); + { + static uint8_t palette[] = { + 0x00,0x00,0x00, 0x00,0x00,0x2a, 0x00,0x2a,0x00, 0x00,0x2a,0x2a, + 0x2a,0x00,0x00, 0x2a,0x00,0x2a, 0x2a,0x2a,0x00, 0x2a,0x2a,0x2a, + 0x00,0x00,0x15, 0x00,0x00,0x3f, 0x00,0x2a,0x15, 0x00,0x2a,0x3f, + 0x2a,0x00,0x15, 0x2a,0x00,0x3f, 0x2a,0x2a,0x15, 0x2a,0x2a,0x3f, + }; + int i; + + memcpy(sc->vga_dac.dac_palette, palette, 16 * 3 * sizeof (uint8_t)); + for (i = 0; i < 16; i++) { + sc->vga_dac.dac_palette_rgb[i] = + ((((sc->vga_dac.dac_palette[3*i + 0] << 2) | + ((sc->vga_dac.dac_palette[3*i + 0] & 0x1) << 1) | + (sc->vga_dac.dac_palette[3*i + 0] & 0x1)) << 16) | + (((sc->vga_dac.dac_palette[3*i + 1] << 2) | + ((sc->vga_dac.dac_palette[3*i + 1] & 0x1) << 1) | + (sc->vga_dac.dac_palette[3*i + 1] & 0x1)) << 8) | + (((sc->vga_dac.dac_palette[3*i + 2] << 2) | + ((sc->vga_dac.dac_palette[3*i + 2] & 0x1) << 1) | + (sc->vga_dac.dac_palette[3*i + 2] & 0x1)) << 0)); + } + } - return (0); + return (sc); } diff --git a/usr/src/cmd/bhyve/vga.h b/usr/src/cmd/bhyve/vga.h index 14637b12b3..36c6dc15fa 100644 --- a/usr/src/cmd/bhyve/vga.h +++ 
b/usr/src/cmd/bhyve/vga.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale * All rights reserved. * @@ -38,8 +40,8 @@ #define GEN_MISC_OUTPUT_PORT 0x3cc #define GEN_INPUT_STS1_MONO_PORT 0x3ba #define GEN_INPUT_STS1_COLOR_PORT 0x3da -#define GEN_IS1_VR 0x08 /* Vertical retrace */ -#define GEN_IS1_DE 0x01 /* Display enable not */ +#define GEN_IS1_VR 0x08 /* Vertical retrace */ +#define GEN_IS1_DE 0x01 /* Display enable not */ /* Attribute controller registers. */ #define ATC_IDX_PORT 0x3c0 @@ -49,14 +51,14 @@ #define ATC_PALETTE0 0 #define ATC_PALETTE15 15 #define ATC_MODE_CONTROL 16 -#define ATC_MC_IPS 0x80 /* Internal palette size */ -#define ATC_MC_GA 0x01 /* Graphics/alphanumeric */ +#define ATC_MC_IPS 0x80 /* Internal palette size */ +#define ATC_MC_GA 0x01 /* Graphics/alphanumeric */ #define ATC_OVERSCAN_COLOR 17 #define ATC_COLOR_PLANE_ENABLE 18 #define ATC_HORIZ_PIXEL_PANNING 19 #define ATC_COLOR_SELECT 20 -#define ATC_CS_C67 0x0c /* Color select bits 6+7 */ -#define ATC_CS_C45 0x03 /* Color select bits 4+5 */ +#define ATC_CS_C67 0x0c /* Color select bits 6+7 */ +#define ATC_CS_C45 0x03 /* Color select bits 4+5 */ /* Sequencer registers. */ #define SEQ_IDX_PORT 0x3c4 @@ -66,22 +68,22 @@ #define SEQ_RESET_ASYNC 0x1 #define SEQ_RESET_SYNC 0x2 #define SEQ_CLOCKING_MODE 1 -#define SEQ_CM_SO 0x20 /* Screen off */ -#define SEQ_CM_89 0x01 /* 8/9 dot clock */ +#define SEQ_CM_SO 0x20 /* Screen off */ +#define SEQ_CM_89 0x01 /* 8/9 dot clock */ #define SEQ_MAP_MASK 2 #define SEQ_CHAR_MAP_SELECT 3 -#define SEQ_CMS_SAH 0x20 /* Char map A bit 2 */ -#define SEQ_CMS_SAH_SHIFT 5 -#define SEQ_CMS_SA 0x0c /* Char map A bits 0+1 */ -#define SEQ_CMS_SA_SHIFT 2 -#define SEQ_CMS_SBH 0x10 /* Char map B bit 2 */ -#define SEQ_CMS_SBH_SHIFT 4 -#define SEQ_CMS_SB 0x03 /* Char map B bits 0+1 */ -#define SEQ_CMS_SB_SHIFT 0 +#define SEQ_CMS_SAH 0x20 /* Char map A bit 2 */ +#define SEQ_CMS_SAH_SHIFT 5 +#define SEQ_CMS_SA 0x0c /* Char map A bits 0+1 */ +#define SEQ_CMS_SA_SHIFT 2 +#define SEQ_CMS_SBH 0x10 /* Char map B bit 2 */ +#define SEQ_CMS_SBH_SHIFT 4 +#define SEQ_CMS_SB 0x03 /* Char map B bits 0+1 */ +#define SEQ_CMS_SB_SHIFT 0 #define SEQ_MEMORY_MODE 4 -#define SEQ_MM_C4 0x08 /* Chain 4 */ -#define SEQ_MM_OE 0x04 /* Odd/even */ -#define SEQ_MM_EM 0x02 /* Extended memory */ +#define SEQ_MM_C4 0x08 /* Chain 4 */ +#define SEQ_MM_OE 0x04 /* Odd/even */ +#define SEQ_MM_EM 0x02 /* Extended memory */ /* Graphics controller registers. 
*/ #define GC_IDX_PORT 0x3ce @@ -93,13 +95,13 @@ #define GC_DATA_ROTATE 3 #define GC_READ_MAP_SELECT 4 #define GC_MODE 5 -#define GC_MODE_OE 0x10 /* Odd/even */ -#define GC_MODE_C4 0x04 /* Chain 4 */ +#define GC_MODE_OE 0x10 /* Odd/even */ +#define GC_MODE_C4 0x04 /* Chain 4 */ #define GC_MISCELLANEOUS 6 -#define GC_MISC_GM 0x01 /* Graphics/alphanumeric */ -#define GC_MISC_MM 0x0c /* memory map */ -#define GC_MISC_MM_SHIFT 2 +#define GC_MISC_GM 0x01 /* Graphics/alphanumeric */ +#define GC_MISC_MM 0x0c /* memory map */ +#define GC_MISC_MM_SHIFT 2 #define GC_COLOR_DONT_CARE 7 #define GC_BIT_MASK 8 @@ -117,36 +119,36 @@ #define CRTC_END_HORIZ_RETRACE 5 #define CRTC_VERT_TOTAL 6 #define CRTC_OVERFLOW 7 -#define CRTC_OF_VRS9 0x80 /* VRS bit 9 */ -#define CRTC_OF_VRS9_SHIFT 7 -#define CRTC_OF_VDE9 0x40 /* VDE bit 9 */ -#define CRTC_OF_VDE9_SHIFT 6 -#define CRTC_OF_VRS8 0x04 /* VRS bit 8 */ -#define CRTC_OF_VRS8_SHIFT 2 -#define CRTC_OF_VDE8 0x02 /* VDE bit 8 */ -#define CRTC_OF_VDE8_SHIFT 1 +#define CRTC_OF_VRS9 0x80 /* VRS bit 9 */ +#define CRTC_OF_VRS9_SHIFT 7 +#define CRTC_OF_VDE9 0x40 /* VDE bit 9 */ +#define CRTC_OF_VDE9_SHIFT 6 +#define CRTC_OF_VRS8 0x04 /* VRS bit 8 */ +#define CRTC_OF_VRS8_SHIFT 2 +#define CRTC_OF_VDE8 0x02 /* VDE bit 8 */ +#define CRTC_OF_VDE8_SHIFT 1 #define CRTC_PRESET_ROW_SCAN 8 #define CRTC_MAX_SCAN_LINE 9 -#define CRTC_MSL_MSL 0x1f +#define CRTC_MSL_MSL 0x1f #define CRTC_CURSOR_START 10 -#define CRTC_CS_CO 0x20 /* Cursor off */ -#define CRTC_CS_CS 0x1f /* Cursor start */ +#define CRTC_CS_CO 0x20 /* Cursor off */ +#define CRTC_CS_CS 0x1f /* Cursor start */ #define CRTC_CURSOR_END 11 -#define CRTC_CE_CE 0x1f /* Cursor end */ +#define CRTC_CE_CE 0x1f /* Cursor end */ #define CRTC_START_ADDR_HIGH 12 #define CRTC_START_ADDR_LOW 13 #define CRTC_CURSOR_LOC_HIGH 14 #define CRTC_CURSOR_LOC_LOW 15 #define CRTC_VERT_RETRACE_START 16 #define CRTC_VERT_RETRACE_END 17 -#define CRTC_VRE_MASK 0xf +#define CRTC_VRE_MASK 0xf #define CRTC_VERT_DISP_END 18 #define CRTC_OFFSET 19 #define CRTC_UNDERLINE_LOC 20 #define CRTC_START_VERT_BLANK 21 #define CRTC_END_VERT_BLANK 22 #define CRTC_MODE_CONTROL 23 -#define CRTC_MC_TE 0x80 /* Timing enable */ +#define CRTC_MC_TE 0x80 /* Timing enable */ #define CRTC_LINE_COMPARE 24 /* DAC registers */ @@ -155,6 +157,6 @@ #define DAC_IDX_WR_PORT 0x3c8 #define DAC_DATA_PORT 0x3c9 -int vga_init(void); +void *vga_init(int io_only); #endif /* _VGA_H_ */ diff --git a/usr/src/cmd/bhyve/virtio.c b/usr/src/cmd/bhyve/virtio.c index c3b11dc439..47a3ed29ba 100644 --- a/usr/src/cmd/bhyve/virtio.c +++ b/usr/src/cmd/bhyve/virtio.c @@ -1,6 +1,9 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Chris Torek * All rights reserved. + * Copyright (c) 2019 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -25,11 +28,13 @@ */ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/virtio.c 270326 2014-08-22 13:01:22Z tychon $"); +__FBSDID("$FreeBSD$"); #include #include +#include + #include #include #include @@ -49,7 +54,7 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/virtio.c 270326 2014-08-22 13:01:22Z tyc * front of virtio-based device softc" constraint, let's use * this to convert. 
*/ -#define DEV_SOFTC(vs) ((void *)(vs)) +#define DEV_SOFTC(vs) ((void *)(vs)) /* * Link a virtio_softc to its constants, the device softc, and @@ -97,6 +102,7 @@ vi_reset_dev(struct virtio_softc *vs) for (vq = vs->vs_queues, i = 0; i < nvq; vq++, i++) { vq->vq_flags = 0; vq->vq_last_avail = 0; + vq->vq_save_used = 0; vq->vq_pfn = 0; vq->vq_msix_idx = VIRTIO_MSI_NO_VECTOR; } @@ -147,8 +153,13 @@ vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix) return (1); } else vs->vs_flags &= ~VIRTIO_USE_MSIX; + /* Only 1 MSI vector for bhyve */ pci_emul_add_msicap(vs->vs_pi, 1); + + /* Legacy interrupts are mandatory for virtio devices */ + pci_lintr_request(vs->vs_pi); + return (0); } @@ -188,6 +199,7 @@ vi_vq_init(struct virtio_softc *vs, uint32_t pfn) /* Mark queue as allocated, and start at 0 when we use it. */ vq->vq_flags = VQ_ALLOC; vq->vq_last_avail = 0; + vq->vq_save_used = 0; } /* @@ -247,12 +259,12 @@ _vq_record(int i, volatile struct virtio_desc *vd, struct vmctx *ctx, * that vq_has_descs() does one). */ int -vq_getchain(struct vqueue_info *vq, +vq_getchain(struct vqueue_info *vq, uint16_t *pidx, struct iovec *iov, int n_iov, uint16_t *flags) { int i; u_int ndesc, n_indir; - u_int idx, head, next; + u_int idx, next; volatile struct virtio_desc *vdir, *vindir, *vp; struct vmctx *ctx; struct virtio_softc *vs; @@ -295,8 +307,8 @@ vq_getchain(struct vqueue_info *vq, * index, but we just abort if the count gets excessive. */ ctx = vs->vs_pi->pi_vmctx; - head = vq->vq_avail->va_ring[idx & (vq->vq_qsize - 1)]; - next = head; + *pidx = next = vq->vq_avail->va_ring[idx & (vq->vq_qsize - 1)]; + vq->vq_last_avail++; for (i = 0; i < VQ_MAX_DESCRIPTORS; next = vdir->vd_next) { if (next >= vq->vq_qsize) { fprintf(stderr, @@ -309,7 +321,7 @@ vq_getchain(struct vqueue_info *vq, if ((vdir->vd_flags & VRING_DESC_F_INDIRECT) == 0) { _vq_record(i, vdir, ctx, iov, n_iov, flags); i++; - } else if ((vs->vs_negotiated_caps & + } else if ((vs->vs_vc->vc_hv_caps & VIRTIO_RING_F_INDIRECT_DESC) == 0) { fprintf(stderr, "%s: descriptor has forbidden INDIRECT flag, " @@ -370,16 +382,29 @@ loopy: } /* - * Return the currently-first request chain to the guest, setting - * its I/O length to the provided value. + * Return the currently-first request chain back to the available queue. * * (This chain is the one you handled when you called vq_getchain() * and used its positive return value.) */ void -vq_relchain(struct vqueue_info *vq, uint32_t iolen) +vq_retchain(struct vqueue_info *vq) { - uint16_t head, uidx, mask; + + vq->vq_last_avail--; +} + +/* + * Return specified request chain to the guest, setting its I/O length + * to the provided value. + * + * (This chain is the one you handled when you called vq_getchain() + * and used its positive return value.) + */ +void +vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen) +{ + uint16_t uidx, mask; volatile struct vring_used *vuh; volatile struct virtio_used *vue; @@ -395,12 +420,17 @@ vq_relchain(struct vqueue_info *vq, uint32_t iolen) */ mask = vq->vq_qsize - 1; vuh = vq->vq_used; - head = vq->vq_avail->va_ring[vq->vq_last_avail++ & mask]; uidx = vuh->vu_idx; vue = &vuh->vu_ring[uidx++ & mask]; - vue->vu_idx = head; /* ie, vue->id = head */ + vue->vu_idx = idx; vue->vu_tlen = iolen; + + /* + * Ensure the used descriptor is visible before updating the index. + * This is necessary on ISAs with memory ordering less strict than x86. 
+ */ + atomic_thread_fence_rel(); vuh->vu_idx = uidx; } @@ -436,8 +466,15 @@ vq_endchains(struct vqueue_info *vq, int used_all_avail) * entire avail was processed, we need to interrupt always. */ vs = vq->vq_vs; - new_idx = vq->vq_used->vu_idx; old_idx = vq->vq_save_used; + vq->vq_save_used = new_idx = vq->vq_used->vu_idx; + + /* + * Use full memory barrier between vu_idx store from preceding + * vq_relchain() call and the loads from VQ_USED_EVENT_IDX() or + * va_flags below. + */ + atomic_thread_fence_seq_cst(); if (used_all_avail && (vs->vs_negotiated_caps & VIRTIO_F_NOTIFY_ON_EMPTY)) intr = 1; @@ -698,6 +735,9 @@ bad: switch (offset) { case VTCFG_R_GUESTCAP: vs->vs_negotiated_caps = value & vc->vc_hv_caps; + if (vc->vc_apply_features) + (*vc->vc_apply_features)(DEV_SOFTC(vs), + vs->vs_negotiated_caps); break; case VTCFG_R_PFN: if (vs->vs_curq >= vc->vc_nvq) diff --git a/usr/src/cmd/bhyve/virtio.h b/usr/src/cmd/bhyve/virtio.h index 1a2ebe8118..a2c3362ec2 100644 --- a/usr/src/cmd/bhyve/virtio.h +++ b/usr/src/cmd/bhyve/virtio.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Chris Torek * All rights reserved. * @@ -23,12 +25,14 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/virtio.h 268276 2014-07-05 02:38:53Z grehan $ + * $FreeBSD$ */ #ifndef _VIRTIO_H_ #define _VIRTIO_H_ +#include + /* * These are derived from several virtio specifications. * @@ -184,7 +188,7 @@ struct vring_used { /* * PFN register shift amount */ -#define VRING_PFN 12 +#define VRING_PFN 12 /* * Virtio device types @@ -209,7 +213,9 @@ struct vring_used { #define VIRTIO_VENDOR 0x1AF4 #define VIRTIO_DEV_NET 0x1000 #define VIRTIO_DEV_BLOCK 0x1001 -#define VIRTIO_DEV_RANDOM 0x1002 +#define VIRTIO_DEV_CONSOLE 0x1003 +#define VIRTIO_DEV_RANDOM 0x1005 +#define VIRTIO_DEV_SCSI 0x1008 /* * PCI config space constants. @@ -220,19 +226,19 @@ struct vring_used { * If MSI-X is not enabled, those two registers disappear and * the remaining configuration registers start at offset 20. */ -#define VTCFG_R_HOSTCAP 0 -#define VTCFG_R_GUESTCAP 4 -#define VTCFG_R_PFN 8 -#define VTCFG_R_QNUM 12 -#define VTCFG_R_QSEL 14 -#define VTCFG_R_QNOTIFY 16 -#define VTCFG_R_STATUS 18 -#define VTCFG_R_ISR 19 -#define VTCFG_R_CFGVEC 20 -#define VTCFG_R_QVEC 22 -#define VTCFG_R_CFG0 20 /* No MSI-X */ -#define VTCFG_R_CFG1 24 /* With MSI-X */ -#define VTCFG_R_MSIX 20 +#define VTCFG_R_HOSTCAP 0 +#define VTCFG_R_GUESTCAP 4 +#define VTCFG_R_PFN 8 +#define VTCFG_R_QNUM 12 +#define VTCFG_R_QSEL 14 +#define VTCFG_R_QNOTIFY 16 +#define VTCFG_R_STATUS 18 +#define VTCFG_R_ISR 19 +#define VTCFG_R_CFGVEC 20 +#define VTCFG_R_QVEC 22 +#define VTCFG_R_CFG0 20 /* No MSI-X */ +#define VTCFG_R_CFG1 24 /* With MSI-X */ +#define VTCFG_R_MSIX 20 /* * Bits in VTCFG_R_STATUS. Guests need not actually set any of these, @@ -251,7 +257,7 @@ struct vring_used { #define VTCFG_ISR_QUEUES 0x01 /* re-scan queues */ #define VTCFG_ISR_CONF_CHANGED 0x80 /* configuration changed */ -#define VIRTIO_MSI_NO_VECTOR 0xFFFF +#define VIRTIO_MSI_NO_VECTOR 0xFFFF /* * Feature flags. 
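vq_relchain() above writes the used-ring element first and only then stores the new vu_idx, with atomic_thread_fence_rel() between the two so that a guest on a weakly ordered CPU can never observe the index advance before the element it covers. A reduced model of that publish step, using C11 atomics in place of the machine/atomic.h wrappers; the ring layout is a simplified stand-in for illustration, not the vring_used ABI:

#include <stdatomic.h>
#include <stdint.h>

struct used_elem {
	uint32_t id;
	uint32_t len;
};

struct used_ring {
	struct used_elem ring[64];
	_Atomic uint16_t idx;	/* the guest polls this for new entries */
};

static void
publish_used(struct used_ring *u, uint16_t head, uint32_t iolen)
{
	uint16_t uidx = atomic_load_explicit(&u->idx, memory_order_relaxed);

	u->ring[uidx % 64].id = head;
	u->ring[uidx % 64].len = iolen;

	/*
	 * The release store orders the element writes before the index
	 * update; it plays the same role as the fence-then-plain-store
	 * pairing used in the patch.
	 */
	atomic_store_explicit(&u->idx, uidx + 1, memory_order_release);
}
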
@@ -352,6 +358,8 @@ struct virtio_consts { /* called to read config regs */ int (*vc_cfgwrite)(void *, int, int, uint32_t); /* called to write config regs */ + void (*vc_apply_features)(void *, uint64_t); + /* called to apply negotiated features */ uint64_t vc_hv_caps; /* hypervisor-provided capabilities */ }; @@ -422,20 +430,6 @@ vq_has_descs(struct vqueue_info *vq) vq->vq_avail->va_idx); } -/* - * Called by virtio driver as it starts processing chains. Each - * completed chain (obtained from vq_getchain()) is released by - * calling vq_relchain(), then when all are done, vq_endchains() - * can tell if / how-many chains were processed and know whether - * and how to generate an interrupt. - */ -static inline void -vq_startchains(struct vqueue_info *vq) -{ - - vq->vq_save_used = vq->vq_used->vu_idx; -} - /* * Deliver an interrupt to guest on the given virtual queue * (if possible, or a generic MSI interrupt if not using MSI-X). @@ -447,11 +441,25 @@ vq_interrupt(struct virtio_softc *vs, struct vqueue_info *vq) if (pci_msix_enabled(vs->vs_pi)) pci_generate_msix(vs->vs_pi, vq->vq_msix_idx); else { +#ifndef __FreeBSD__ + boolean_t unlock = B_FALSE; + + if (vs->vs_mtx && !pthread_mutex_isowned_np(vs->vs_mtx)) { + unlock = B_TRUE; + pthread_mutex_lock(vs->vs_mtx); + } +#else VS_LOCK(vs); +#endif vs->vs_isr |= VTCFG_ISR_QUEUES; pci_generate_msi(vs->vs_pi, 0); pci_lintr_assert(vs->vs_pi); +#ifndef __FreeBSD__ + if (unlock) + pthread_mutex_unlock(vs->vs_mtx); +#else VS_UNLOCK(vs); +#endif } } @@ -463,9 +471,10 @@ int vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix); void vi_reset_dev(struct virtio_softc *); void vi_set_io_bar(struct virtio_softc *, int); -int vq_getchain(struct vqueue_info *vq, +int vq_getchain(struct vqueue_info *vq, uint16_t *pidx, struct iovec *iov, int n_iov, uint16_t *flags); -void vq_relchain(struct vqueue_info *vq, uint32_t iolen); +void vq_retchain(struct vqueue_info *vq); +void vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen); void vq_endchains(struct vqueue_info *vq, int used_all_avail); uint64_t vi_pci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, diff --git a/usr/src/cmd/bhyve/xmsr.c b/usr/src/cmd/bhyve/xmsr.c index 0c097251e0..994445b3e3 100644 --- a/usr/src/cmd/bhyve/xmsr.c +++ b/usr/src/cmd/bhyve/xmsr.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,11 +25,11 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/xmsr.c 279227 2015-02-24 05:15:40Z neel $ + * $FreeBSD$ */ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/xmsr.c 279227 2015-02-24 05:15:40Z neel $"); +__FBSDID("$FreeBSD$"); #include @@ -77,6 +79,7 @@ emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t num, uint64_t val) return (0); case MSR_NB_CFG1: + case MSR_LS_CFG: case MSR_IC_CFG: return (0); /* Ignore writes */ @@ -146,6 +149,7 @@ emulate_rdmsr(struct vmctx *ctx, int vcpu, uint32_t num, uint64_t *val) break; case MSR_NB_CFG1: + case MSR_LS_CFG: case MSR_IC_CFG: /* * The reset value is processor family dependent so @@ -195,12 +199,23 @@ emulate_rdmsr(struct vmctx *ctx, int vcpu, uint32_t num, uint64_t *val) /* * OpenBSD guests test bit 0 of this MSR to detect if the * workaround for erratum 721 is already applied. 
- * http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf + * https://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf */ case 0xC0011029: *val = 1; break; +#ifndef __FreeBSD__ + case MSR_VM_CR: + /* + * We currently don't support nested virt. + * Windows seems to ignore the cpuid bits and reads this + * MSR anyways. + */ + *val = VM_CR_SVMDIS; + break; +#endif + default: error = -1; break; diff --git a/usr/src/cmd/bhyve/xmsr.h b/usr/src/cmd/bhyve/xmsr.h index ac3c147442..1fb47c3ae2 100644 --- a/usr/src/cmd/bhyve/xmsr.h +++ b/usr/src/cmd/bhyve/xmsr.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/xmsr.h 271888 2014-09-20 02:35:21Z neel $ + * $FreeBSD$ */ #ifndef _XMSR_H_ diff --git a/usr/src/cmd/bhyveconsole/Makefile b/usr/src/cmd/bhyveconsole/Makefile deleted file mode 100644 index 11d34e6599..0000000000 --- a/usr/src/cmd/bhyveconsole/Makefile +++ /dev/null @@ -1,41 +0,0 @@ -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -# -# Copyright 2013 Pluribus Networks Inc. -# - -include ../Makefile.cmd - -SUBDIRS= $(MACH) - -all := TARGET = all -install := TARGET = install -clean := TARGET = clean -clobber := TARGET = clobber -lint := TARGET = lint - -.KEEP_STATE: - -all: $(SUBDIRS) - -clean clobber lint: $(SUBDIRS) - -install: $(SUBDIRS) - -$(RM) $(ROOTUSRSBINPROG) - -$(LN) $(ISAEXEC) $(ROOTUSRSBINPROG) - -$(SUBDIRS): FRC - @cd $@; pwd; $(MAKE) CW_NO_SHADOW=true __GNUC= $(TARGET) - -FRC: - -include ../Makefile.targ diff --git a/usr/src/cmd/bhyveconsole/bhyveconsole.c b/usr/src/cmd/bhyveconsole/bhyveconsole.c deleted file mode 100644 index 7f237a72f6..0000000000 --- a/usr/src/cmd/bhyveconsole/bhyveconsole.c +++ /dev/null @@ -1,360 +0,0 @@ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - */ - -/* - * Copyright 2013 Pluribus Networks Inc. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -static int masterfd; -static struct termios save_termios; -static int save_fd; - -static int nocmdchar = 0; -static char cmdchar = '~'; - -static const char *pname; - -#define BCONS_BUFSIZ 8192 - -static void -usage(void) -{ - (void) fprintf(stderr, "usage: %s vmname\n", pname); - exit(2); -} - -static void -bcons_error(const char *fmt, ...) 
-{ - va_list alist; - - (void) fprintf(stderr, "%s: ", pname); - va_start(alist, fmt); - (void) vfprintf(stderr, fmt, alist); - va_end(alist); - (void) fprintf(stderr, "\n"); -} - -static void -bcons_perror(const char *str) -{ - const char *estr; - - if ((estr = strerror(errno)) != NULL) - (void) fprintf(stderr, "%s: %s: %s\n", pname, str, estr); - else - (void) fprintf(stderr, "%s: %s: errno %d\n", pname, str, errno); -} - -/* - * Create the unix domain socket and call bhyve; handshake - * with it to determine whether it will allow us to connect. - */ -static int -get_console(const char *vmname) -{ - int sockfd = -1; - struct sockaddr_un servaddr; - char clientid[MAXPATHLEN]; - char handshake[MAXPATHLEN], c; - int msglen; - int i = 0, err = 0; - - if ((sockfd = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) { - bcons_perror("could not create socket"); - return (-1); - } - - bzero(&servaddr, sizeof (servaddr)); - servaddr.sun_family = AF_UNIX; - (void) snprintf(servaddr.sun_path, sizeof (servaddr.sun_path), - BHYVE_CONS_SOCKPATH, vmname); - - if (connect(sockfd, (struct sockaddr *)&servaddr, - sizeof (servaddr)) == -1) { - bcons_perror("Could not connect to console server"); - goto bad; - } - masterfd = sockfd; - - msglen = snprintf(clientid, sizeof (clientid), "IDENT %lu\n", - getpid()); - assert(msglen > 0 && msglen < sizeof (clientid)); - - if (write(masterfd, clientid, msglen) != msglen) { - bcons_error("protocol error"); - goto bad; - } - - /* - * Take care not to accumulate more than our fill, and leave room for - * the NUL at the end. - */ - while ((err = read(masterfd, &c, 1)) == 1) { - if (i >= (sizeof (handshake) - 1)) - break; - if (c == '\n') - break; - handshake[i] = c; - i++; - } - handshake[i] = '\0'; - - /* - * If something went wrong during the handshake we bail; perhaps - * the server died off. - */ - if (err == -1) { - bcons_perror("Could not connect to console server"); - goto bad; - } - - if (strncmp(handshake, "OK", sizeof (handshake)) == 0) - return (0); - - bcons_error("Console is already in use by process ID %s.", - handshake); -bad: - (void) close(sockfd); - masterfd = -1; - return (-1); -} - -/* - * Place terminal into raw mode. - */ -static int -set_tty_rawmode(int fd) -{ - struct termios term; - if (tcgetattr(fd, &term) < 0) { - bcons_perror("failed to get user terminal settings"); - return (-1); - } - - /* Stash for later, so we can revert back to previous mode */ - save_termios = term; - save_fd = fd; - - /* disable 8->7 bit strip, start/stop, enable any char to restart */ - term.c_iflag &= ~(ISTRIP|IXON|IXANY); - /* disable NL->CR, CR->NL, ignore CR, UPPER->lower */ - term.c_iflag &= ~(INLCR|ICRNL|IGNCR|IUCLC); - /* disable output post-processing */ - term.c_oflag &= ~OPOST; - /* disable canonical mode, signal chars, echo & extended functions */ - term.c_lflag &= ~(ICANON|ISIG|ECHO|IEXTEN); - - term.c_cc[VMIN] = 1; /* byte-at-a-time */ - term.c_cc[VTIME] = 0; - - if (tcsetattr(STDIN_FILENO, TCSAFLUSH, &term)) { - bcons_perror("failed to set user terminal to raw mode"); - return (-1); - } - - return (0); -} - -/* - * reset terminal settings for global environment - */ -static void -reset_tty(void) -{ - (void) tcsetattr(save_fd, TCSADRAIN, &save_termios); -} - -/* - * process_user_input watches the input stream for the escape sequence for - * 'quit' (by default, tilde-period). Because we might be fed just one - * keystroke at a time, state associated with the user input (are we at the - * beginning of the line? are we locally echoing the next character?) 
is - * maintained by beginning_of_line and local_echo across calls to the routine. - * - * This routine returns -1 when the 'quit' escape sequence has been issued, - * or an error is encountered and 0 otherwise. - */ -static int -process_user_input(int out_fd, int in_fd) -{ - static boolean_t beginning_of_line = B_TRUE; - static boolean_t local_echo = B_FALSE; - char ibuf[BCONS_BUFSIZ]; - int nbytes; - char *buf = ibuf; - char c; - - nbytes = read(in_fd, ibuf, sizeof (ibuf)); - if (nbytes == -1 && errno != EINTR) - return (-1); - - if (nbytes == -1) /* The read was interrupted. */ - return (0); - - for (c = *buf; nbytes > 0; c = *buf, --nbytes) { - buf++; - if (beginning_of_line && !nocmdchar) { - beginning_of_line = B_FALSE; - if (c == cmdchar) { - local_echo = B_TRUE; - continue; - } - } else if (local_echo) { - local_echo = B_FALSE; - if (c == '.') { - (void) write(STDOUT_FILENO, &cmdchar, 1); - (void) write(STDOUT_FILENO, &c, 1); - return (-1); - } - } - - (void) write(out_fd, &c, 1); - - beginning_of_line = (c == '\r' || c == '\n'); - } - - return (0); -} - -static int -process_output(int in_fd, int out_fd) -{ - int wrote = 0; - int cc; - char ibuf[BCONS_BUFSIZ]; - - cc = read(in_fd, ibuf, sizeof (ibuf)); - if (cc == -1 && errno != EINTR) - return (-1); - if (cc == 0) /* EOF */ - return (-1); - if (cc == -1) /* The read was interrupted. */ - return (0); - - do { - int len; - - len = write(out_fd, ibuf + wrote, cc - wrote); - if (len == -1 && errno != EINTR) - return (-1); - if (len != -1) - wrote += len; - } while (wrote < cc); - - return (0); -} - -/* - * This is the main I/O loop. - */ -static void -doio(void) -{ - struct pollfd pollfds[2]; - int res; - - /* read from vm and write to stdout */ - pollfds[0].fd = masterfd; - pollfds[0].events = POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI; - - /* read from stdin and write to vm */ - pollfds[1].fd = STDIN_FILENO; - pollfds[1].events = pollfds[0].events; - - for (;;) { - pollfds[0].revents = pollfds[1].revents = 0; - - res = poll(pollfds, - sizeof (pollfds) / sizeof (struct pollfd), -1); - - if (res == -1 && errno != EINTR) { - bcons_perror("poll failed"); - /* we are hosed, close connection */ - break; - } - - /* event from master side stdout */ - if (pollfds[0].revents) { - if (pollfds[0].revents & - (POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI)) { - if (process_output(masterfd, STDOUT_FILENO) - != 0) - break; - } else { - break; - } - } - - /* event from user stdin side */ - if (pollfds[1].revents) { - if (pollfds[1].revents & - (POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI)) { - if (process_user_input(masterfd, STDIN_FILENO) - != 0) - break; - } else { - break; - } - } - } -} - -int -main(int argc, char **argv) -{ - char *vmname; - - pname = basename(argv[0]); - - if (argc == 2) { - vmname = argv[1]; - } else { - usage(); - } - - /* - * Make contact with bhyve - */ - if (get_console(vmname) == -1) - return (1); - - (void) printf("[Connected to vm '%s' console]\n", vmname); - - if (set_tty_rawmode(STDIN_FILENO) == -1) { - reset_tty(); - bcons_perror("failed to set stdin pty to raw mode"); - return (1); - } - - /* - * Run the I/O loop until we get disconnected. 
- */ - doio(); - reset_tty(); - (void) printf("\n[Connection to vm '%s' console closed]\n", vmname); - - return (0); -} diff --git a/usr/src/cmd/bhyveconsole/i386/Makefile b/usr/src/cmd/bhyveconsole/i386/Makefile deleted file mode 100644 index c4f317a9fa..0000000000 --- a/usr/src/cmd/bhyveconsole/i386/Makefile +++ /dev/null @@ -1,43 +0,0 @@ -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -# -# Copyright 2013 Pluribus Networks Inc. -# - -PROG= bhyveconsole - -OBJS= bhyveconsole.o - -SRCS= $(OBJS:%.o=../%.c) - -include ../../Makefile.cmd - -CFLAGS += $(CCVERBOSE) -LDLIBS += -lsocket - -.KEEP_STATE: - -%.o: ../%.c - $(COMPILE.c) $< - -all: $(PROG) - -$(PROG): $(OBJS) - $(LINK.c) $(OBJS) -o $@ $(LDLIBS) - $(POST_PROCESS) - -install: all $(ROOTUSRSBINPROG32) - -clean: - $(RM) $(OBJS) - -include ../../Makefile.targ diff --git a/usr/src/cmd/bhyvectl/Makefile b/usr/src/cmd/bhyvectl/Makefile index fe98204056..0a8a96cfc9 100644 --- a/usr/src/cmd/bhyvectl/Makefile +++ b/usr/src/cmd/bhyvectl/Makefile @@ -11,31 +11,50 @@ # # Copyright 2013 Pluribus Networks Inc. +# Copyright 2019 Joyent, Inc. # PROG = bhyvectl include ../Makefile.cmd +include ../Makefile.cmd.64 -$(BUILD64)SUBDIRS += $(MACH64) +SRCS = bhyvectl.c +OBJS = $(SRCS:.c=.o) humanize_number.o -all := TARGET = all -install := TARGET = install -clean := TARGET = clean -clobber := TARGET = clobber -lint := TARGET = lint +CLEANFILES = $(PROG) +CLOBBERFILES += $(ROOTUSRSBINPROG) .KEEP_STATE: -all clean clobber lint: $(SUBDIRS) +CFLAGS += $(CCVERBOSE) +CPPFLAGS = -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd \ + -I$(COMPAT)/freebsd/amd64 -I$(CONTRIB)/freebsd/amd64 \ + $(CPPFLAGS.master) \ + -I$(SRC)/uts/i86pc/io/vmm \ + -I$(SRC)/uts/i86pc +LDLIBS += -lvmmapi -install: $(SUBDIRS) - -$(RM) $(ROOTUSRSBINPROG) - -$(LN) $(ISAEXEC) $(ROOTUSRSBINPROG) +CERRWARN += -_gcc=-Wno-uninitialized -$(SUBDIRS): FRC - @cd $@; pwd; $(MAKE) CW_NO_SHADOW=true __GNUC= $(TARGET) +# main() is too hairy for smatch +bhyvectl.o := SMATCH=off -FRC: +all: $(PROG) + +$(PROG): $(OBJS) + $(LINK.c) -o $@ $(OBJS) $(LDFLAGS) $(LDLIBS) + $(POST_PROCESS) + +install: all $(ROOTUSRSBINPROG) + +clean: + $(RM) $(OBJS) $(CLEANFILES) + +lint: lint_SRCS include ../Makefile.targ + +%.o: $(CONTRIB)/freebsd/lib/libutil/%.c + $(COMPILE.c) -o $@ $< + $(POST_PROCESS_O) diff --git a/usr/src/cmd/bhyvectl/Makefile.com b/usr/src/cmd/bhyvectl/Makefile.com deleted file mode 100644 index 03ca34792c..0000000000 --- a/usr/src/cmd/bhyvectl/Makefile.com +++ /dev/null @@ -1,48 +0,0 @@ -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -# -# Copyright 2013 Pluribus Networks Inc. 
-# - -PROG= bhyvectl - -SRCS = bhyvectl.c -OBJS = $(SRCS:.c=.o) - -include ../../Makefile.cmd - -.KEEP_STATE: - -CFLAGS += $(CCVERBOSE) -CPPFLAGS = -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd $(CPPFLAGS.master) \ - -I$(ROOT)/usr/platform/i86pc/include \ - -I$(SRC)/uts/i86pc/io/vmm -LDLIBS += -lvmmapi - -all: $(PROG) - -$(PROG): $(OBJS) - $(LINK.c) -o $@ $(OBJS) $(LDFLAGS) $(LDLIBS) - $(POST_PROCESS) - -install: all $(ROOTUSRSBINPROG) - -clean: - $(RM) $(OBJS) - -lint: lint_SRCS - -include ../../Makefile.targ - -%.o: ../%.c - $(COMPILE.c) -I$(SRC)/common $< - $(POST_PROCESS_O) diff --git a/usr/src/cmd/bhyvectl/amd64/Makefile b/usr/src/cmd/bhyvectl/amd64/Makefile deleted file mode 100644 index b602c50d05..0000000000 --- a/usr/src/cmd/bhyvectl/amd64/Makefile +++ /dev/null @@ -1,21 +0,0 @@ -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -# -# Copyright 2013 Pluribus Networks Inc. -# - -include ../Makefile.com -include ../../Makefile.cmd.64 - -CPPFLAGS += -I$(COMPAT)/freebsd/amd64 -I$(CONTRIB)/freebsd/amd64 - -install: all $(ROOTUSRSBINPROG64) diff --git a/usr/src/cmd/bhyvectl/bhyvectl.c b/usr/src/cmd/bhyvectl/bhyvectl.c index 07d0a83df5..b8bdf524a9 100644 --- a/usr/src/cmd/bhyvectl/bhyvectl.c +++ b/usr/src/cmd/bhyvectl/bhyvectl.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyvectl/bhyvectl.c 273375 2014-10-21 07:10:43Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -38,30 +40,39 @@ /* * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. 
*/ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyvectl/bhyvectl.c 273375 2014-10-21 07:10:43Z neel $"); +__FBSDID("$FreeBSD$"); #include #include #include #include #include +#include #include #include +#include +#include #include #include #include #include -#include #include +#include #include +#include +#include +#include #include +#include #include +#include "amd/vmcb.h" #include "intel/vmcs.h" #define MB (1UL << 20) @@ -74,7 +85,7 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyvectl/bhyvectl.c 273375 2014-10-21 07:10:43 static const char *progname; static void -usage(void) +usage(bool cpu_intel) { (void)fprintf(stderr, @@ -82,6 +93,9 @@ usage(void) " [--cpu=]\n" " [--create]\n" " [--destroy]\n" +#ifndef __FreeBSD__ + " [--wrlock-cycle]\n" +#endif " [--get-all]\n" " [--get-stats]\n" " [--set-desc-ds]\n" @@ -113,10 +127,22 @@ usage(void) " [--desc-access=]\n" " [--set-cr0=]\n" " [--get-cr0]\n" + " [--set-cr2=]\n" + " [--get-cr2]\n" " [--set-cr3=]\n" " [--get-cr3]\n" " [--set-cr4=]\n" " [--get-cr4]\n" + " [--set-dr0=]\n" + " [--get-dr0]\n" + " [--set-dr1=]\n" + " [--get-dr1]\n" + " [--set-dr2=]\n" + " [--get-dr2]\n" + " [--set-dr3=]\n" + " [--get-dr3]\n" + " [--set-dr6=]\n" + " [--get-dr6]\n" " [--set-dr7=]\n" " [--get-dr7]\n" " [--set-rsp=]\n" @@ -155,64 +181,108 @@ usage(void) " [--get-ss]\n" " [--get-tr]\n" " [--get-ldtr]\n" - " [--get-vmcs-pinbased-ctls]\n" - " [--get-vmcs-procbased-ctls]\n" - " [--get-vmcs-procbased-ctls2]\n" - " [--get-vmcs-entry-interruption-info]\n" - " [--set-vmcs-entry-interruption-info=]\n" - " [--get-vmcs-eptp]\n" - " [--get-vmcs-guest-physical-address\n" - " [--get-vmcs-guest-linear-address\n" - " [--set-vmcs-exception-bitmap]\n" - " [--get-vmcs-exception-bitmap]\n" - " [--get-vmcs-io-bitmap-address]\n" - " [--get-vmcs-tsc-offset]\n" - " [--get-vmcs-guest-pat]\n" - " [--get-vmcs-host-pat]\n" - " [--get-vmcs-host-cr0]\n" - " [--get-vmcs-host-cr3]\n" - " [--get-vmcs-host-cr4]\n" - " [--get-vmcs-host-rip]\n" - " [--get-vmcs-host-rsp]\n" - " [--get-vmcs-cr0-mask]\n" - " [--get-vmcs-cr0-shadow]\n" - " [--get-vmcs-cr4-mask]\n" - " [--get-vmcs-cr4-shadow]\n" - " [--get-vmcs-cr3-targets]\n" - " [--get-vmcs-apic-access-address]\n" - " [--get-vmcs-virtual-apic-address]\n" - " [--get-vmcs-tpr-threshold]\n" - " [--get-vmcs-msr-bitmap]\n" - " [--get-vmcs-msr-bitmap-address]\n" - " [--get-vmcs-vpid]\n" - " [--get-vmcs-ple-gap]\n" - " [--get-vmcs-ple-window]\n" - " [--get-vmcs-instruction-error]\n" - " [--get-vmcs-exit-ctls]\n" - " [--get-vmcs-entry-ctls]\n" - " [--get-vmcs-guest-sysenter]\n" - " [--get-vmcs-link]\n" - " [--get-vmcs-exit-reason]\n" - " [--get-vmcs-exit-qualification]\n" - " [--get-vmcs-exit-interruption-info]\n" - " [--get-vmcs-exit-interruption-error]\n" - " [--get-vmcs-interruptibility]\n" " [--set-x2apic-state=]\n" " [--get-x2apic-state]\n" " [--unassign-pptdev=]\n" " [--set-mem=]\n" " [--get-lowmem]\n" - " [--get-highmem]\n", + " [--get-highmem]\n" + " [--get-gpa-pmap]\n" + " [--assert-lapic-lvt=]\n" + " [--inject-nmi]\n" + " [--force-reset]\n" + " [--force-poweroff]\n" + " [--get-rtc-time]\n" + " [--set-rtc-time=]\n" + " [--get-rtc-nvram]\n" + " [--set-rtc-nvram=]\n" + " [--rtc-nvram-offset=]\n" + " [--get-active-cpus]\n" + " [--get-suspended-cpus]\n" + " [--get-intinfo]\n" + " [--get-eptp]\n" + " [--set-exception-bitmap]\n" + " [--get-exception-bitmap]\n" + " [--get-tsc-offset]\n" + " [--get-guest-pat]\n" + " [--get-io-bitmap-address]\n" + " [--get-msr-bitmap]\n" + " [--get-msr-bitmap-address]\n" + " [--get-guest-sysenter]\n" + " [--get-exit-reason]\n" + " 
[--get-cpu-topology]\n", progname); + + if (cpu_intel) { + (void)fprintf(stderr, + " [--get-vmcs-pinbased-ctls]\n" + " [--get-vmcs-procbased-ctls]\n" + " [--get-vmcs-procbased-ctls2]\n" + " [--get-vmcs-entry-interruption-info]\n" + " [--set-vmcs-entry-interruption-info=]\n" + " [--get-vmcs-guest-physical-address\n" + " [--get-vmcs-guest-linear-address\n" + " [--get-vmcs-host-pat]\n" + " [--get-vmcs-host-cr0]\n" + " [--get-vmcs-host-cr3]\n" + " [--get-vmcs-host-cr4]\n" + " [--get-vmcs-host-rip]\n" + " [--get-vmcs-host-rsp]\n" + " [--get-vmcs-cr0-mask]\n" + " [--get-vmcs-cr0-shadow]\n" + " [--get-vmcs-cr4-mask]\n" + " [--get-vmcs-cr4-shadow]\n" + " [--get-vmcs-cr3-targets]\n" + " [--get-vmcs-apic-access-address]\n" + " [--get-vmcs-virtual-apic-address]\n" + " [--get-vmcs-tpr-threshold]\n" + " [--get-vmcs-vpid]\n" + " [--get-vmcs-instruction-error]\n" + " [--get-vmcs-exit-ctls]\n" + " [--get-vmcs-entry-ctls]\n" + " [--get-vmcs-link]\n" + " [--get-vmcs-exit-qualification]\n" + " [--get-vmcs-exit-interruption-info]\n" + " [--get-vmcs-exit-interruption-error]\n" + " [--get-vmcs-interruptibility]\n" + ); + } else { + (void)fprintf(stderr, + " [--get-vmcb-intercepts]\n" + " [--get-vmcb-asid]\n" + " [--get-vmcb-exit-details]\n" + " [--get-vmcb-tlb-ctrl]\n" + " [--get-vmcb-virq]\n" + " [--get-avic-apic-bar]\n" + " [--get-avic-backing-page]\n" + " [--get-avic-table]\n" + ); + } exit(1); } -static int get_stats, getcap, setcap, capval; +static int get_rtc_time, set_rtc_time; +static int get_rtc_nvram, set_rtc_nvram; +static int rtc_nvram_offset; +static uint8_t rtc_nvram_value; +static time_t rtc_secs; + +static int get_stats, getcap, setcap, capval, get_gpa_pmap; +static int inject_nmi, assert_lapic_lvt; +static int force_reset, force_poweroff; static const char *capname; -static int create, destroy, get_lowmem, get_highmem; +static int create, destroy, get_memmap, get_memseg; +static int get_intinfo; +static int get_active_cpus, get_suspended_cpus; static uint64_t memsize; -static int set_cr0, get_cr0, set_cr3, get_cr3, set_cr4, get_cr4; +static int set_cr0, get_cr0, set_cr2, get_cr2, set_cr3, get_cr3; +static int set_cr4, get_cr4; static int set_efer, get_efer; +static int set_dr0, get_dr0; +static int set_dr1, get_dr1; +static int set_dr2, get_dr2; +static int set_dr3, get_dr3; +static int set_dr6, get_dr6; static int set_dr7, get_dr7; static int set_rsp, get_rsp, set_rip, get_rip, set_rflags, get_rflags; static int set_rax, get_rax; @@ -234,6 +304,16 @@ static int set_x2apic_state, get_x2apic_state; enum x2apic_state x2apic_state; static int unassign_pptdev, bus, slot, func; static int run; +static int get_cpu_topology; +#ifndef __FreeBSD__ +static int wrlock_cycle; +#endif + +/* + * VMCB specific. 
+ */ +static int get_vmcb_intercept, get_vmcb_exit_details, get_vmcb_tlb_ctrl; +static int get_vmcb_virq, get_avic_table; /* * VMCS-specific fields @@ -250,14 +330,15 @@ static int get_cr4_mask, get_cr4_shadow; static int get_cr3_targets; static int get_apic_access_addr, get_virtual_apic_addr, get_tpr_threshold; static int get_msr_bitmap, get_msr_bitmap_address; -static int get_vpid, get_ple_gap, get_ple_window; +static int get_vpid_asid; static int get_inst_err, get_exit_ctls, get_entry_ctls; static int get_host_cr0, get_host_cr3, get_host_cr4; static int get_host_rip, get_host_rsp; static int get_guest_pat, get_host_pat; static int get_guest_sysenter, get_vmcs_link; -static int get_vmcs_exit_reason, get_vmcs_exit_qualification; +static int get_exit_reason, get_vmcs_exit_qualification; static int get_vmcs_exit_interruption_info, get_vmcs_exit_interruption_error; +static int get_vmcs_exit_inst_length; static uint64_t desc_base; static uint32_t desc_limit, desc_access; @@ -291,29 +372,115 @@ dump_vm_run_exitcode(struct vm_exit *vmexit, int vcpu) printf("\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type); printf("\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error); break; + case VM_EXITCODE_SVM: + printf("\treason\t\tSVM\n"); + printf("\texit_reason\t\t%#lx\n", vmexit->u.svm.exitcode); + printf("\texitinfo1\t\t%#lx\n", vmexit->u.svm.exitinfo1); + printf("\texitinfo2\t\t%#lx\n", vmexit->u.svm.exitinfo2); + break; default: printf("*** unknown vm run exitcode %d\n", vmexit->exitcode); break; } } -static int -dump_vmcs_msr_bitmap(int vcpu, u_long addr) +/* AMD 6th generation and Intel compatible MSRs */ +#define MSR_AMD6TH_START 0xC0000000 +#define MSR_AMD6TH_END 0xC0001FFF +/* AMD 7th and 8th generation compatible MSRs */ +#define MSR_AMD7TH_START 0xC0010000 +#define MSR_AMD7TH_END 0xC0011FFF + +static const char * +msr_name(uint32_t msr) { - int error, fd, byte, bit, readable, writeable; - u_int msr; - const char *bitmap; + static char buf[32]; + + switch(msr) { + case MSR_TSC: + return ("MSR_TSC"); + case MSR_EFER: + return ("MSR_EFER"); + case MSR_STAR: + return ("MSR_STAR"); + case MSR_LSTAR: + return ("MSR_LSTAR"); + case MSR_CSTAR: + return ("MSR_CSTAR"); + case MSR_SF_MASK: + return ("MSR_SF_MASK"); + case MSR_FSBASE: + return ("MSR_FSBASE"); + case MSR_GSBASE: + return ("MSR_GSBASE"); + case MSR_KGSBASE: + return ("MSR_KGSBASE"); + case MSR_SYSENTER_CS_MSR: + return ("MSR_SYSENTER_CS_MSR"); + case MSR_SYSENTER_ESP_MSR: + return ("MSR_SYSENTER_ESP_MSR"); + case MSR_SYSENTER_EIP_MSR: + return ("MSR_SYSENTER_EIP_MSR"); + case MSR_PAT: + return ("MSR_PAT"); + } + snprintf(buf, sizeof(buf), "MSR %#08x", msr); + + return (buf); +} - error = -1; - bitmap = MAP_FAILED; +static inline void +print_msr_pm(uint64_t msr, int vcpu, int readable, int writeable) +{ - fd = open("/dev/mem", O_RDONLY, 0); - if (fd < 0) - goto done; + if (readable || writeable) { + printf("%-20s[%d]\t\t%c%c\n", msr_name(msr), vcpu, + readable ? 'R' : '-', writeable ? 'W' : '-'); + } +} - bitmap = mmap(NULL, PAGE_SIZE, PROT_READ, 0, fd, addr); - if (bitmap == MAP_FAILED) - goto done; +/* + * Reference APM vol2, section 15.11 MSR Intercepts. + */ +static void +dump_amd_msr_pm(const char *bitmap, int vcpu) +{ + int byte, bit, readable, writeable; + uint32_t msr; + + for (msr = 0; msr < 0x2000; msr++) { + byte = msr / 4; + bit = (msr % 4) * 2; + + /* Look at MSRs in the range 0x00000000 to 0x00001FFF */ + readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; + writeable = (bitmap[byte] & (2 << bit)) ? 
0 : 1; + print_msr_pm(msr, vcpu, readable, writeable); + + /* Look at MSRs in the range 0xC0000000 to 0xC0001FFF */ + byte += 2048; + readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; + writeable = (bitmap[byte] & (2 << bit)) ? 0 : 1; + print_msr_pm(msr + MSR_AMD6TH_START, vcpu, readable, + writeable); + + /* MSR 0xC0010000 to 0xC0011FF is only for AMD */ + byte += 4096; + readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; + writeable = (bitmap[byte] & (2 << bit)) ? 0 : 1; + print_msr_pm(msr + MSR_AMD7TH_START, vcpu, readable, + writeable); + } +} + +/* + * Reference Intel SDM Vol3 Section 24.6.9 MSR-Bitmap Address + */ +static void +dump_intel_msr_pm(const char *bitmap, int vcpu) +{ + int byte, bit, readable, writeable; + uint32_t msr; for (msr = 0; msr < 0x2000; msr++) { byte = msr / 8; @@ -321,31 +488,56 @@ dump_vmcs_msr_bitmap(int vcpu, u_long addr) /* Look at MSRs in the range 0x00000000 to 0x00001FFF */ readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; - writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1; - if (readable || writeable) { - printf("msr 0x%08x[%d]\t\t%c%c\n", msr, vcpu, - readable ? 'R' : '-', - writeable ? 'W' : '-'); - } + writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1; + print_msr_pm(msr, vcpu, readable, writeable); /* Look at MSRs in the range 0xC0000000 to 0xC0001FFF */ byte += 1024; readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; - writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1; - if (readable || writeable) { - printf("msr 0x%08x[%d]\t\t%c%c\n", - 0xc0000000 + msr, vcpu, - readable ? 'R' : '-', - writeable ? 'W' : '-'); - } + writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1; + print_msr_pm(msr + MSR_AMD6TH_START, vcpu, readable, + writeable); + } +} + +static int +dump_msr_bitmap(int vcpu, uint64_t addr, bool cpu_intel) +{ + int error, fd, map_size; + const char *bitmap; + + error = -1; + bitmap = MAP_FAILED; + + fd = open("/dev/mem", O_RDONLY, 0); + if (fd < 0) { + perror("Couldn't open /dev/mem"); + goto done; + } + + if (cpu_intel) + map_size = PAGE_SIZE; + else + map_size = 2 * PAGE_SIZE; + + bitmap = mmap(NULL, map_size, PROT_READ, MAP_SHARED, fd, addr); + if (bitmap == MAP_FAILED) { + perror("mmap failed"); + goto done; } + + if (cpu_intel) + dump_intel_msr_pm(bitmap, vcpu); + else + dump_amd_msr_pm(bitmap, vcpu); error = 0; done: if (bitmap != MAP_FAILED) - munmap((void *)bitmap, PAGE_SIZE); + munmap((void *)bitmap, map_size); if (fd >= 0) close(fd); + return (error); } @@ -363,14 +555,36 @@ vm_set_vmcs_field(struct vmctx *ctx, int vcpu, int field, uint64_t val) return (vm_set_register(ctx, vcpu, VMCS_IDENT(field), val)); } +static int +vm_get_vmcb_field(struct vmctx *ctx, int vcpu, int off, int bytes, + uint64_t *ret_val) +{ + + return (vm_get_register(ctx, vcpu, VMCB_ACCESS(off, bytes), ret_val)); +} + +static int +vm_set_vmcb_field(struct vmctx *ctx, int vcpu, int off, int bytes, + uint64_t val) +{ + + return (vm_set_register(ctx, vcpu, VMCB_ACCESS(off, bytes), val)); +} + enum { VMNAME = 1000, /* avoid collision with return values from getopt */ VCPU, SET_MEM, SET_EFER, SET_CR0, + SET_CR2, SET_CR3, SET_CR4, + SET_DR0, + SET_DR1, + SET_DR2, + SET_DR3, + SET_DR6, SET_DR7, SET_RSP, SET_RIP, @@ -388,492 +602,158 @@ enum { SET_TR, SET_LDTR, SET_X2APIC_STATE, - SET_VMCS_EXCEPTION_BITMAP, + SET_EXCEPTION_BITMAP, SET_VMCS_ENTRY_INTERRUPTION_INFO, SET_CAP, CAPNAME, UNASSIGN_PPTDEV, + GET_GPA_PMAP, + ASSERT_LAPIC_LVT, + SET_RTC_TIME, + SET_RTC_NVRAM, + RTC_NVRAM_OFFSET, }; -int -main(int argc, char *argv[]) +static void +print_cpus(const char 
*banner, const cpuset_t *cpus) { - char *vmname; - int error, ch, vcpu; - vm_paddr_t gpa; - size_t len; - struct vm_exit vmexit; - uint64_t ctl, eptp, bm, addr, u64; - struct vmctx *ctx; - int wired; + int i; + int first; + + first = 1; + printf("%s:\t", banner); + if (!CPU_EMPTY(cpus)) { + for (i = 0; i < CPU_SETSIZE; i++) { + if (CPU_ISSET(i, cpus)) { + printf("%s%d", first ? " " : ", ", i); + first = 0; + } + } + } else + printf(" (none)"); + printf("\n"); +} + +static void +print_intinfo(const char *banner, uint64_t info) +{ + int type; + + printf("%s:\t", banner); + if (info & VM_INTINFO_VALID) { + type = info & VM_INTINFO_TYPE; + switch (type) { + case VM_INTINFO_HWINTR: + printf("extint"); + break; + case VM_INTINFO_NMI: + printf("nmi"); + break; + case VM_INTINFO_SWINTR: + printf("swint"); + break; + default: + printf("exception"); + break; + } + printf(" vector %d", (int)VM_INTINFO_VECTOR(info)); + if (info & VM_INTINFO_DEL_ERRCODE) + printf(" errcode %#x", (u_int)(info >> 32)); + } else { + printf("n/a"); + } + printf("\n"); +} - uint64_t cr0, cr3, cr4, dr7, rsp, rip, rflags, efer, pat; +static bool +cpu_vendor_intel(void) +{ + u_int regs[4]; + char cpu_vendor[13]; + + do_cpuid(0, regs); + ((u_int *)&cpu_vendor)[0] = regs[1]; + ((u_int *)&cpu_vendor)[1] = regs[3]; + ((u_int *)&cpu_vendor)[2] = regs[2]; + cpu_vendor[12] = '\0'; + + if (strcmp(cpu_vendor, "AuthenticAMD") == 0) { + return (false); + } else if (strcmp(cpu_vendor, "GenuineIntel") == 0) { + return (true); + } else { + fprintf(stderr, "Unknown cpu vendor \"%s\"\n", cpu_vendor); + exit(1); + } +} + +static int +get_all_registers(struct vmctx *ctx, int vcpu) +{ + uint64_t cr0, cr2, cr3, cr4, dr0, dr1, dr2, dr3, dr6, dr7; + uint64_t rsp, rip, rflags, efer; uint64_t rax, rbx, rcx, rdx, rsi, rdi, rbp; uint64_t r8, r9, r10, r11, r12, r13, r14, r15; - uint64_t cs, ds, es, fs, gs, ss, tr, ldtr; + int error = 0; - struct option opts[] = { - { "vm", REQ_ARG, 0, VMNAME }, - { "cpu", REQ_ARG, 0, VCPU }, - { "set-mem", REQ_ARG, 0, SET_MEM }, - { "set-efer", REQ_ARG, 0, SET_EFER }, - { "set-cr0", REQ_ARG, 0, SET_CR0 }, - { "set-cr3", REQ_ARG, 0, SET_CR3 }, - { "set-cr4", REQ_ARG, 0, SET_CR4 }, - { "set-dr7", REQ_ARG, 0, SET_DR7 }, - { "set-rsp", REQ_ARG, 0, SET_RSP }, - { "set-rip", REQ_ARG, 0, SET_RIP }, - { "set-rax", REQ_ARG, 0, SET_RAX }, - { "set-rflags", REQ_ARG, 0, SET_RFLAGS }, - { "desc-base", REQ_ARG, 0, DESC_BASE }, - { "desc-limit", REQ_ARG, 0, DESC_LIMIT }, - { "desc-access",REQ_ARG, 0, DESC_ACCESS }, - { "set-cs", REQ_ARG, 0, SET_CS }, - { "set-ds", REQ_ARG, 0, SET_DS }, - { "set-es", REQ_ARG, 0, SET_ES }, - { "set-fs", REQ_ARG, 0, SET_FS }, - { "set-gs", REQ_ARG, 0, SET_GS }, - { "set-ss", REQ_ARG, 0, SET_SS }, - { "set-tr", REQ_ARG, 0, SET_TR }, - { "set-ldtr", REQ_ARG, 0, SET_LDTR }, - { "set-x2apic-state",REQ_ARG, 0, SET_X2APIC_STATE }, - { "set-vmcs-exception-bitmap", - REQ_ARG, 0, SET_VMCS_EXCEPTION_BITMAP }, - { "set-vmcs-entry-interruption-info", - REQ_ARG, 0, SET_VMCS_ENTRY_INTERRUPTION_INFO }, - { "capname", REQ_ARG, 0, CAPNAME }, - { "unassign-pptdev", REQ_ARG, 0, UNASSIGN_PPTDEV }, - { "setcap", REQ_ARG, 0, SET_CAP }, - { "getcap", NO_ARG, &getcap, 1 }, - { "get-stats", NO_ARG, &get_stats, 1 }, - { "get-desc-ds",NO_ARG, &get_desc_ds, 1 }, - { "set-desc-ds",NO_ARG, &set_desc_ds, 1 }, - { "get-desc-es",NO_ARG, &get_desc_es, 1 }, - { "set-desc-es",NO_ARG, &set_desc_es, 1 }, - { "get-desc-ss",NO_ARG, &get_desc_ss, 1 }, - { "set-desc-ss",NO_ARG, &set_desc_ss, 1 }, - { "get-desc-cs",NO_ARG, &get_desc_cs, 1 }, - { 
"set-desc-cs",NO_ARG, &set_desc_cs, 1 }, - { "get-desc-fs",NO_ARG, &get_desc_fs, 1 }, - { "set-desc-fs",NO_ARG, &set_desc_fs, 1 }, - { "get-desc-gs",NO_ARG, &get_desc_gs, 1 }, - { "set-desc-gs",NO_ARG, &set_desc_gs, 1 }, - { "get-desc-tr",NO_ARG, &get_desc_tr, 1 }, - { "set-desc-tr",NO_ARG, &set_desc_tr, 1 }, - { "set-desc-ldtr", NO_ARG, &set_desc_ldtr, 1 }, - { "get-desc-ldtr", NO_ARG, &get_desc_ldtr, 1 }, - { "set-desc-gdtr", NO_ARG, &set_desc_gdtr, 1 }, - { "get-desc-gdtr", NO_ARG, &get_desc_gdtr, 1 }, - { "set-desc-idtr", NO_ARG, &set_desc_idtr, 1 }, - { "get-desc-idtr", NO_ARG, &get_desc_idtr, 1 }, - { "get-lowmem", NO_ARG, &get_lowmem, 1 }, - { "get-highmem",NO_ARG, &get_highmem, 1 }, - { "get-efer", NO_ARG, &get_efer, 1 }, - { "get-cr0", NO_ARG, &get_cr0, 1 }, - { "get-cr3", NO_ARG, &get_cr3, 1 }, - { "get-cr4", NO_ARG, &get_cr4, 1 }, - { "get-dr7", NO_ARG, &get_dr7, 1 }, - { "get-rsp", NO_ARG, &get_rsp, 1 }, - { "get-rip", NO_ARG, &get_rip, 1 }, - { "get-rax", NO_ARG, &get_rax, 1 }, - { "get-rbx", NO_ARG, &get_rbx, 1 }, - { "get-rcx", NO_ARG, &get_rcx, 1 }, - { "get-rdx", NO_ARG, &get_rdx, 1 }, - { "get-rsi", NO_ARG, &get_rsi, 1 }, - { "get-rdi", NO_ARG, &get_rdi, 1 }, - { "get-rbp", NO_ARG, &get_rbp, 1 }, - { "get-r8", NO_ARG, &get_r8, 1 }, - { "get-r9", NO_ARG, &get_r9, 1 }, - { "get-r10", NO_ARG, &get_r10, 1 }, - { "get-r11", NO_ARG, &get_r11, 1 }, - { "get-r12", NO_ARG, &get_r12, 1 }, - { "get-r13", NO_ARG, &get_r13, 1 }, - { "get-r14", NO_ARG, &get_r14, 1 }, - { "get-r15", NO_ARG, &get_r15, 1 }, - { "get-rflags", NO_ARG, &get_rflags, 1 }, - { "get-cs", NO_ARG, &get_cs, 1 }, - { "get-ds", NO_ARG, &get_ds, 1 }, - { "get-es", NO_ARG, &get_es, 1 }, - { "get-fs", NO_ARG, &get_fs, 1 }, - { "get-gs", NO_ARG, &get_gs, 1 }, - { "get-ss", NO_ARG, &get_ss, 1 }, - { "get-tr", NO_ARG, &get_tr, 1 }, - { "get-ldtr", NO_ARG, &get_ldtr, 1 }, - { "get-vmcs-pinbased-ctls", - NO_ARG, &get_pinbased_ctls, 1 }, - { "get-vmcs-procbased-ctls", - NO_ARG, &get_procbased_ctls, 1 }, - { "get-vmcs-procbased-ctls2", - NO_ARG, &get_procbased_ctls2, 1 }, - { "get-vmcs-guest-linear-address", - NO_ARG, &get_vmcs_gla, 1 }, - { "get-vmcs-guest-physical-address", - NO_ARG, &get_vmcs_gpa, 1 }, - { "get-vmcs-entry-interruption-info", - NO_ARG, &get_vmcs_entry_interruption_info, 1}, - { "get-vmcs-eptp", NO_ARG, &get_eptp, 1 }, - { "get-vmcs-exception-bitmap", - NO_ARG, &get_exception_bitmap, 1 }, - { "get-vmcs-io-bitmap-address", - NO_ARG, &get_io_bitmap, 1 }, - { "get-vmcs-tsc-offset", NO_ARG,&get_tsc_offset, 1 }, - { "get-vmcs-cr0-mask", NO_ARG, &get_cr0_mask, 1 }, - { "get-vmcs-cr0-shadow", NO_ARG,&get_cr0_shadow, 1 }, - { "get-vmcs-cr4-mask", NO_ARG, &get_cr4_mask, 1 }, - { "get-vmcs-cr4-shadow", NO_ARG,&get_cr4_shadow, 1 }, - { "get-vmcs-cr3-targets", NO_ARG, &get_cr3_targets, 1}, - { "get-vmcs-apic-access-address", - NO_ARG, &get_apic_access_addr, 1}, - { "get-vmcs-virtual-apic-address", - NO_ARG, &get_virtual_apic_addr, 1}, - { "get-vmcs-tpr-threshold", - NO_ARG, &get_tpr_threshold, 1 }, - { "get-vmcs-msr-bitmap", - NO_ARG, &get_msr_bitmap, 1 }, - { "get-vmcs-msr-bitmap-address", - NO_ARG, &get_msr_bitmap_address, 1 }, - { "get-vmcs-vpid", NO_ARG, &get_vpid, 1 }, - { "get-vmcs-ple-gap", NO_ARG, &get_ple_gap, 1 }, - { "get-vmcs-ple-window", NO_ARG,&get_ple_window,1 }, - { "get-vmcs-instruction-error", - NO_ARG, &get_inst_err, 1 }, - { "get-vmcs-exit-ctls", NO_ARG, &get_exit_ctls, 1 }, - { "get-vmcs-entry-ctls", - NO_ARG, &get_entry_ctls, 1 }, - { "get-vmcs-guest-pat", NO_ARG, &get_guest_pat, 1 }, - { 
"get-vmcs-host-pat", NO_ARG, &get_host_pat, 1 }, - { "get-vmcs-host-cr0", - NO_ARG, &get_host_cr0, 1 }, - { "get-vmcs-host-cr3", - NO_ARG, &get_host_cr3, 1 }, - { "get-vmcs-host-cr4", - NO_ARG, &get_host_cr4, 1 }, - { "get-vmcs-host-rip", - NO_ARG, &get_host_rip, 1 }, - { "get-vmcs-host-rsp", - NO_ARG, &get_host_rsp, 1 }, - { "get-vmcs-guest-sysenter", - NO_ARG, &get_guest_sysenter, 1 }, - { "get-vmcs-link", NO_ARG, &get_vmcs_link, 1 }, - { "get-vmcs-exit-reason", - NO_ARG, &get_vmcs_exit_reason, 1 }, - { "get-vmcs-exit-qualification", - NO_ARG, &get_vmcs_exit_qualification, 1 }, - { "get-vmcs-exit-interruption-info", - NO_ARG, &get_vmcs_exit_interruption_info, 1}, - { "get-vmcs-exit-interruption-error", - NO_ARG, &get_vmcs_exit_interruption_error, 1}, - { "get-vmcs-interruptibility", - NO_ARG, &get_vmcs_interruptibility, 1 }, - { "get-x2apic-state",NO_ARG, &get_x2apic_state, 1 }, - { "get-all", NO_ARG, &get_all, 1 }, - { "run", NO_ARG, &run, 1 }, - { "create", NO_ARG, &create, 1 }, - { "destroy", NO_ARG, &destroy, 1 }, - { NULL, 0, NULL, 0 } - }; + if (!error && (get_efer || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_EFER, &efer); + if (error == 0) + printf("efer[%d]\t\t0x%016lx\n", vcpu, efer); + } - vcpu = 0; - progname = basename(argv[0]); + if (!error && (get_cr0 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR0, &cr0); + if (error == 0) + printf("cr0[%d]\t\t0x%016lx\n", vcpu, cr0); + } - while ((ch = getopt_long(argc, argv, "", opts, NULL)) != -1) { - switch (ch) { - case 0: - break; - case VMNAME: - vmname = optarg; - break; - case VCPU: - vcpu = atoi(optarg); - break; - case SET_MEM: - memsize = atoi(optarg) * MB; - memsize = roundup(memsize, 2 * MB); - break; - case SET_EFER: - efer = strtoul(optarg, NULL, 0); - set_efer = 1; - break; - case SET_CR0: - cr0 = strtoul(optarg, NULL, 0); - set_cr0 = 1; - break; - case SET_CR3: - cr3 = strtoul(optarg, NULL, 0); - set_cr3 = 1; - break; - case SET_CR4: - cr4 = strtoul(optarg, NULL, 0); - set_cr4 = 1; - break; - case SET_DR7: - dr7 = strtoul(optarg, NULL, 0); - set_dr7 = 1; - break; - case SET_RSP: - rsp = strtoul(optarg, NULL, 0); - set_rsp = 1; - break; - case SET_RIP: - rip = strtoul(optarg, NULL, 0); - set_rip = 1; - break; - case SET_RAX: - rax = strtoul(optarg, NULL, 0); - set_rax = 1; - break; - case SET_RFLAGS: - rflags = strtoul(optarg, NULL, 0); - set_rflags = 1; - break; - case DESC_BASE: - desc_base = strtoul(optarg, NULL, 0); - break; - case DESC_LIMIT: - desc_limit = strtoul(optarg, NULL, 0); - break; - case DESC_ACCESS: - desc_access = strtoul(optarg, NULL, 0); - break; - case SET_CS: - cs = strtoul(optarg, NULL, 0); - set_cs = 1; - break; - case SET_DS: - ds = strtoul(optarg, NULL, 0); - set_ds = 1; - break; - case SET_ES: - es = strtoul(optarg, NULL, 0); - set_es = 1; - break; - case SET_FS: - fs = strtoul(optarg, NULL, 0); - set_fs = 1; - break; - case SET_GS: - gs = strtoul(optarg, NULL, 0); - set_gs = 1; - break; - case SET_SS: - ss = strtoul(optarg, NULL, 0); - set_ss = 1; - break; - case SET_TR: - tr = strtoul(optarg, NULL, 0); - set_tr = 1; - break; - case SET_LDTR: - ldtr = strtoul(optarg, NULL, 0); - set_ldtr = 1; - break; - case SET_X2APIC_STATE: - x2apic_state = strtol(optarg, NULL, 0); - set_x2apic_state = 1; - break; - case SET_VMCS_EXCEPTION_BITMAP: - exception_bitmap = strtoul(optarg, NULL, 0); - set_exception_bitmap = 1; - break; - case SET_VMCS_ENTRY_INTERRUPTION_INFO: - vmcs_entry_interruption_info = strtoul(optarg, NULL, 0); - set_vmcs_entry_interruption_info = 1; - 
break; - case SET_CAP: - capval = strtoul(optarg, NULL, 0); - setcap = 1; - break; - case CAPNAME: - capname = optarg; - break; - case UNASSIGN_PPTDEV: - unassign_pptdev = 1; - if (sscanf(optarg, "%d/%d/%d", &bus, &slot, &func) != 3) - usage(); - break; - default: - usage(); - } - } - argc -= optind; - argv += optind; - - if (vmname == NULL) - usage(); - - error = 0; - - if (!error && create) - error = vm_create(vmname); - - if (!error) { - ctx = vm_open(vmname); - if (ctx == NULL) - error = -1; - } - - if (!error && memsize) - error = vm_setup_memory(ctx, memsize, VM_MMAP_NONE); - - if (!error && set_efer) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_EFER, efer); - - if (!error && set_cr0) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR0, cr0); - - if (!error && set_cr3) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR3, cr3); - - if (!error && set_cr4) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR4, cr4); - - if (!error && set_dr7) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR7, dr7); - - if (!error && set_rsp) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RSP, rsp); - - if (!error && set_rip) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, rip); - - if (!error && set_rax) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, rax); - - if (!error && set_rflags) { - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RFLAGS, - rflags); - } - - if (!error && set_desc_ds) { - error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_DS, - desc_base, desc_limit, desc_access); - } - - if (!error && set_desc_es) { - error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_ES, - desc_base, desc_limit, desc_access); - } - - if (!error && set_desc_ss) { - error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_SS, - desc_base, desc_limit, desc_access); - } - - if (!error && set_desc_cs) { - error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_CS, - desc_base, desc_limit, desc_access); - } - - if (!error && set_desc_fs) { - error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_FS, - desc_base, desc_limit, desc_access); - } - - if (!error && set_desc_gs) { - error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GS, - desc_base, desc_limit, desc_access); - } - - if (!error && set_desc_tr) { - error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_TR, - desc_base, desc_limit, desc_access); - } - - if (!error && set_desc_ldtr) { - error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_LDTR, - desc_base, desc_limit, desc_access); - } - - if (!error && set_desc_gdtr) { - error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GDTR, - desc_base, desc_limit, 0); - } - - if (!error && set_desc_idtr) { - error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_IDTR, - desc_base, desc_limit, 0); - } - - if (!error && set_cs) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CS, cs); - - if (!error && set_ds) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DS, ds); - - if (!error && set_es) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_ES, es); - - if (!error && set_fs) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_FS, fs); - - if (!error && set_gs) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_GS, gs); - - if (!error && set_ss) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_SS, ss); - - if (!error && set_tr) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_TR, tr); - - if (!error && set_ldtr) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_LDTR, ldtr); - - if (!error && set_x2apic_state) - error = vm_set_x2apic_state(ctx, vcpu, x2apic_state); - -#ifdef __FreeBSD__ - if (!error && unassign_pptdev) - error = vm_unassign_pptdev(ctx, bus, slot, func); -#endif - - if 
(!error && set_exception_bitmap) { - error = vm_set_vmcs_field(ctx, vcpu, VMCS_EXCEPTION_BITMAP, - exception_bitmap); + if (!error && (get_cr2 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR2, &cr2); + if (error == 0) + printf("cr2[%d]\t\t0x%016lx\n", vcpu, cr2); } - if (!error && set_vmcs_entry_interruption_info) { - error = vm_set_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO, - vmcs_entry_interruption_info); + if (!error && (get_cr3 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR3, &cr3); + if (error == 0) + printf("cr3[%d]\t\t0x%016lx\n", vcpu, cr3); } - if (!error && (get_lowmem || get_all)) { - gpa = 0; - error = vm_get_memory_seg(ctx, gpa, &len, &wired); + if (!error && (get_cr4 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR4, &cr4); if (error == 0) - printf("lowmem\t\t0x%016lx/%ld%s\n", gpa, len, - wired ? " wired" : ""); + printf("cr4[%d]\t\t0x%016lx\n", vcpu, cr4); } - if (!error && (get_highmem || get_all)) { - gpa = 4 * GB; - error = vm_get_memory_seg(ctx, gpa, &len, &wired); + if (!error && (get_dr0 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR0, &dr0); if (error == 0) - printf("highmem\t\t0x%016lx/%ld%s\n", gpa, len, - wired ? " wired" : ""); + printf("dr0[%d]\t\t0x%016lx\n", vcpu, dr0); } - if (!error && (get_efer || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_EFER, &efer); + if (!error && (get_dr1 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR1, &dr1); if (error == 0) - printf("efer[%d]\t\t0x%016lx\n", vcpu, efer); + printf("dr1[%d]\t\t0x%016lx\n", vcpu, dr1); } - if (!error && (get_cr0 || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR0, &cr0); + if (!error && (get_dr2 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR2, &dr2); if (error == 0) - printf("cr0[%d]\t\t0x%016lx\n", vcpu, cr0); + printf("dr2[%d]\t\t0x%016lx\n", vcpu, dr2); } - if (!error && (get_cr3 || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR3, &cr3); + if (!error && (get_dr3 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR3, &dr3); if (error == 0) - printf("cr3[%d]\t\t0x%016lx\n", vcpu, cr3); + printf("dr3[%d]\t\t0x%016lx\n", vcpu, dr3); } - if (!error && (get_cr4 || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR4, &cr4); + if (!error && (get_dr6 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR6, &dr6); if (error == 0) - printf("cr4[%d]\t\t0x%016lx\n", vcpu, cr4); + printf("dr6[%d]\t\t0x%016lx\n", vcpu, dr6); } if (!error && (get_dr7 || get_all)) { @@ -991,30 +871,21 @@ main(int argc, char *argv[]) printf("rflags[%d]\t0x%016lx\n", vcpu, rflags); } -#ifdef __FreeBSD__ - if (!error && (get_stats || get_all)) { - int i, num_stats; - uint64_t *stats; - struct timeval tv; - const char *desc; + return (error); +} - stats = vm_get_stats(ctx, vcpu, &tv, &num_stats); - if (stats != NULL) { - printf("vcpu%d\n", vcpu); - for (i = 0; i < num_stats; i++) { - desc = vm_get_stat_desc(ctx, i); - printf("%-40s\t%ld\n", desc, stats[i]); - } - } - } -#endif +static int +get_all_segments(struct vmctx *ctx, int vcpu) +{ + uint64_t cs, ds, es, fs, gs, ss, tr, ldtr; + int error = 0; if (!error && (get_desc_ds || get_all)) { error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_DS, - &desc_base, &desc_limit, &desc_access); + &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("ds desc[%d]\t0x%016lx/0x%08x/0x%08x\n", - vcpu, desc_base, desc_limit, desc_access); + vcpu, desc_base, desc_limit, 
desc_access); } } @@ -1023,7 +894,7 @@ main(int argc, char *argv[]) &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("es desc[%d]\t0x%016lx/0x%08x/0x%08x\n", - vcpu, desc_base, desc_limit, desc_access); + vcpu, desc_base, desc_limit, desc_access); } } @@ -1032,7 +903,7 @@ main(int argc, char *argv[]) &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("fs desc[%d]\t0x%016lx/0x%08x/0x%08x\n", - vcpu, desc_base, desc_limit, desc_access); + vcpu, desc_base, desc_limit, desc_access); } } @@ -1041,7 +912,7 @@ main(int argc, char *argv[]) &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("gs desc[%d]\t0x%016lx/0x%08x/0x%08x\n", - vcpu, desc_base, desc_limit, desc_access); + vcpu, desc_base, desc_limit, desc_access); } } @@ -1050,7 +921,7 @@ main(int argc, char *argv[]) &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("ss desc[%d]\t0x%016lx/0x%08x/0x%08x\n", - vcpu, desc_base, desc_limit, desc_access); + vcpu, desc_base, desc_limit, desc_access); } } @@ -1059,7 +930,7 @@ main(int argc, char *argv[]) &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("cs desc[%d]\t0x%016lx/0x%08x/0x%08x\n", - vcpu, desc_base, desc_limit, desc_access); + vcpu, desc_base, desc_limit, desc_access); } } @@ -1068,7 +939,7 @@ main(int argc, char *argv[]) &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("tr desc[%d]\t0x%016lx/0x%08x/0x%08x\n", - vcpu, desc_base, desc_limit, desc_access); + vcpu, desc_base, desc_limit, desc_access); } } @@ -1077,7 +948,7 @@ main(int argc, char *argv[]) &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("ldtr desc[%d]\t0x%016lx/0x%08x/0x%08x\n", - vcpu, desc_base, desc_limit, desc_access); + vcpu, desc_base, desc_limit, desc_access); } } @@ -1086,7 +957,7 @@ main(int argc, char *argv[]) &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("gdtr[%d]\t\t0x%016lx/0x%08x\n", - vcpu, desc_base, desc_limit); + vcpu, desc_base, desc_limit); } } @@ -1095,7 +966,7 @@ main(int argc, char *argv[]) &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("idtr[%d]\t\t0x%016lx/0x%08x\n", - vcpu, desc_base, desc_limit); + vcpu, desc_base, desc_limit); } } @@ -1147,82 +1018,14 @@ main(int argc, char *argv[]) printf("ldtr[%d]\t\t0x%04lx\n", vcpu, ldtr); } - if (!error && (get_x2apic_state || get_all)) { - error = vm_get_x2apic_state(ctx, vcpu, &x2apic_state); - if (error == 0) - printf("x2apic_state[%d]\t%d\n", vcpu, x2apic_state); - } - - if (!error && (get_pinbased_ctls || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_PIN_BASED_CTLS, &ctl); - if (error == 0) - printf("pinbased_ctls[%d]\t0x%08lx\n", vcpu, ctl); - } - - if (!error && (get_procbased_ctls || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, - VMCS_PRI_PROC_BASED_CTLS, &ctl); - if (error == 0) - printf("procbased_ctls[%d]\t0x%08lx\n", vcpu, ctl); - } - - if (!error && (get_procbased_ctls2 || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, - VMCS_SEC_PROC_BASED_CTLS, &ctl); - if (error == 0) - printf("procbased_ctls2[%d]\t0x%08lx\n", vcpu, ctl); - } - - if (!error && (get_vmcs_gla || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, - VMCS_GUEST_LINEAR_ADDRESS, &u64); - if (error == 0) - printf("gla[%d]\t\t0x%016lx\n", vcpu, u64); - } - - if (!error && (get_vmcs_gpa || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, - VMCS_GUEST_PHYSICAL_ADDRESS, &u64); - if (error == 0) - printf("gpa[%d]\t\t0x%016lx\n", vcpu, u64); - } - - if (!error && (get_vmcs_entry_interruption_info || get_all)) { - error = 
vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO,&u64); - if (error == 0) { - printf("entry_interruption_info[%d]\t0x%08lx\n", - vcpu, u64); - } - } - - if (!error && (get_eptp || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_EPTP, &eptp); - if (error == 0) - printf("eptp[%d]\t\t0x%016lx\n", vcpu, eptp); - } - - if (!error && (get_exception_bitmap || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXCEPTION_BITMAP, - &bm); - if (error == 0) - printf("exception_bitmap[%d]\t0x%08lx\n", vcpu, bm); - } - - if (!error && (get_io_bitmap || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_A, &bm); - if (error == 0) - printf("io_bitmap_a[%d]\t0x%08lx\n", vcpu, bm); - error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_B, &bm); - if (error == 0) - printf("io_bitmap_b[%d]\t0x%08lx\n", vcpu, bm); - } + return (error); +} - if (!error && (get_tsc_offset || get_all)) { - uint64_t tscoff; - error = vm_get_vmcs_field(ctx, vcpu, VMCS_TSC_OFFSET, &tscoff); - if (error == 0) - printf("tsc_offset[%d]\t0x%016lx\n", vcpu, tscoff); - } +static int +get_misc_vmcs(struct vmctx *ctx, int vcpu) +{ + uint64_t ctl, cr0, cr3, cr4, rsp, rip, pat, addr, u64; + int error = 0; if (!error && (get_cr0_mask || get_all)) { uint64_t cr0mask; @@ -1259,7 +1062,7 @@ main(int argc, char *argv[]) error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET_COUNT, &target_count); if (error == 0) { - printf("cr3_target_count[%d]\t0x%08lx\n", + printf("cr3_target_count[%d]\t0x%016lx\n", vcpu, target_count); } @@ -1292,57 +1095,55 @@ main(int argc, char *argv[]) } } - if (!error && (get_apic_access_addr || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_APIC_ACCESS, &addr); + if (!error && (get_pinbased_ctls || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_PIN_BASED_CTLS, &ctl); if (error == 0) - printf("apic_access_addr[%d]\t0x%016lx\n", vcpu, addr); + printf("pinbased_ctls[%d]\t0x%016lx\n", vcpu, ctl); } - if (!error && (get_virtual_apic_addr || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_VIRTUAL_APIC, &addr); + if (!error && (get_procbased_ctls || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_PRI_PROC_BASED_CTLS, &ctl); if (error == 0) - printf("virtual_apic_addr[%d]\t0x%016lx\n", vcpu, addr); + printf("procbased_ctls[%d]\t0x%016lx\n", vcpu, ctl); } - if (!error && (get_tpr_threshold || get_all)) { - uint64_t threshold; - error = vm_get_vmcs_field(ctx, vcpu, VMCS_TPR_THRESHOLD, - &threshold); + if (!error && (get_procbased_ctls2 || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_SEC_PROC_BASED_CTLS, &ctl); if (error == 0) - printf("tpr_threshold[%d]\t0x%08lx\n", vcpu, threshold); + printf("procbased_ctls2[%d]\t0x%016lx\n", vcpu, ctl); } - if (!error && (get_msr_bitmap_address || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, &addr); + if (!error && (get_vmcs_gla || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_LINEAR_ADDRESS, &u64); if (error == 0) - printf("msr_bitmap[%d]\t\t0x%016lx\n", vcpu, addr); + printf("gla[%d]\t\t0x%016lx\n", vcpu, u64); } - if (!error && (get_msr_bitmap || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, &addr); + if (!error && (get_vmcs_gpa || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_PHYSICAL_ADDRESS, &u64); if (error == 0) - error = dump_vmcs_msr_bitmap(vcpu, addr); + printf("gpa[%d]\t\t0x%016lx\n", vcpu, u64); } - if (!error && (get_vpid || get_all)) { - uint64_t vpid; - error = vm_get_vmcs_field(ctx, vcpu, VMCS_VPID, &vpid); - if (error == 0) - 
printf("vpid[%d]\t\t0x%04lx\n", vcpu, vpid); - } - - if (!error && (get_ple_window || get_all)) { - uint64_t window; - error = vm_get_vmcs_field(ctx, vcpu, VMCS_PLE_WINDOW, &window); - if (error == 0) - printf("ple_window[%d]\t\t0x%08lx\n", vcpu, window); + if (!error && (get_vmcs_entry_interruption_info || + get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO,&u64); + if (error == 0) { + printf("entry_interruption_info[%d]\t0x%016lx\n", + vcpu, u64); + } } - if (!error && (get_ple_gap || get_all)) { - uint64_t gap; - error = vm_get_vmcs_field(ctx, vcpu, VMCS_PLE_GAP, &gap); + if (!error && (get_tpr_threshold || get_all)) { + uint64_t threshold; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_TPR_THRESHOLD, + &threshold); if (error == 0) - printf("ple_gap[%d]\t\t0x%08lx\n", vcpu, gap); + printf("tpr_threshold[%d]\t0x%016lx\n", vcpu, threshold); } if (!error && (get_inst_err || get_all)) { @@ -1350,7 +1151,7 @@ main(int argc, char *argv[]) error = vm_get_vmcs_field(ctx, vcpu, VMCS_INSTRUCTION_ERROR, &insterr); if (error == 0) { - printf("instruction_error[%d]\t0x%08lx\n", + printf("instruction_error[%d]\t0x%016lx\n", vcpu, insterr); } } @@ -1358,13 +1159,13 @@ main(int argc, char *argv[]) if (!error && (get_exit_ctls || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_CTLS, &ctl); if (error == 0) - printf("exit_ctls[%d]\t\t0x%08lx\n", vcpu, ctl); + printf("exit_ctls[%d]\t\t0x%016lx\n", vcpu, ctl); } if (!error && (get_entry_ctls || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_CTLS, &ctl); if (error == 0) - printf("entry_ctls[%d]\t\t0x%08lx\n", vcpu, ctl); + printf("entry_ctls[%d]\t\t0x%016lx\n", vcpu, ctl); } if (!error && (get_host_pat || get_all)) { @@ -1373,12 +1174,6 @@ main(int argc, char *argv[]) printf("host_pat[%d]\t\t0x%016lx\n", vcpu, pat); } - if (!error && (get_guest_pat || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_GUEST_IA32_PAT, &pat); - if (error == 0) - printf("guest_pat[%d]\t\t0x%016lx\n", vcpu, pat); - } - if (!error && (get_host_cr0 || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR0, &cr0); if (error == 0) @@ -1409,55 +1204,25 @@ main(int argc, char *argv[]) printf("host_rsp[%d]\t\t0x%016lx\n", vcpu, rsp); } - if (!error && (get_guest_sysenter || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, - VMCS_GUEST_IA32_SYSENTER_CS, &cs); - if (error == 0) - printf("guest_sysenter_cs[%d]\t0x%08lx\n", vcpu, cs); - - error = vm_get_vmcs_field(ctx, vcpu, - VMCS_GUEST_IA32_SYSENTER_ESP, &rsp); - if (error == 0) - printf("guest_sysenter_sp[%d]\t0x%016lx\n", vcpu, rsp); - error = vm_get_vmcs_field(ctx, vcpu, - VMCS_GUEST_IA32_SYSENTER_EIP, &rip); - if (error == 0) - printf("guest_sysenter_ip[%d]\t0x%016lx\n", vcpu, rip); - } - if (!error && (get_vmcs_link || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_LINK_POINTER, &addr); if (error == 0) printf("vmcs_pointer[%d]\t0x%016lx\n", vcpu, addr); } - if (!error && (get_vmcs_exit_reason || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_REASON, &u64); - if (error == 0) - printf("vmcs_exit_reason[%d]\t0x%016lx\n", vcpu, u64); - } - - if (!error && (get_vmcs_exit_qualification || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_QUALIFICATION, - &u64); - if (error == 0) - printf("vmcs_exit_qualification[%d]\t0x%016lx\n", - vcpu, u64); - } - if (!error && (get_vmcs_exit_interruption_info || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_INTR_INFO, &u64); if (error == 0) { - printf("vmcs_exit_interruption_info[%d]\t0x%08lx\n", + 
printf("vmcs_exit_interruption_info[%d]\t0x%016lx\n", vcpu, u64); } } if (!error && (get_vmcs_exit_interruption_error || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_INTR_ERRCODE, - &u64); + &u64); if (error == 0) { - printf("vmcs_exit_interruption_error[%d]\t0x%08lx\n", + printf("vmcs_exit_interruption_error[%d]\t0x%016lx\n", vcpu, u64); } } @@ -1466,58 +1231,1150 @@ main(int argc, char *argv[]) error = vm_get_vmcs_field(ctx, vcpu, VMCS_GUEST_INTERRUPTIBILITY, &u64); if (error == 0) { - printf("vmcs_guest_interruptibility[%d]\t0x%08lx\n", + printf("vmcs_guest_interruptibility[%d]\t0x%016lx\n", vcpu, u64); } } - if (!error && setcap) { - int captype; - captype = vm_capability_name2type(capname); - error = vm_set_capability(ctx, vcpu, captype, capval); - if (error != 0 && errno == ENOENT) - printf("Capability \"%s\" is not available\n", capname); + if (!error && (get_vmcs_exit_inst_length || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_EXIT_INSTRUCTION_LENGTH, &u64); + if (error == 0) + printf("vmcs_exit_inst_length[%d]\t0x%08x\n", vcpu, + (uint32_t)u64); } - if (!error && (getcap || get_all)) { - int captype, val, getcaptype; + if (!error && (get_vmcs_exit_qualification || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_QUALIFICATION, + &u64); + if (error == 0) + printf("vmcs_exit_qualification[%d]\t0x%016lx\n", + vcpu, u64); + } + + return (error); +} - if (getcap && capname) - getcaptype = vm_capability_name2type(capname); - else - getcaptype = -1; +static int +get_misc_vmcb(struct vmctx *ctx, int vcpu) +{ + uint64_t ctl, addr; + int error = 0; - for (captype = 0; captype < VM_CAP_MAX; captype++) { - if (getcaptype >= 0 && captype != getcaptype) - continue; - error = vm_get_capability(ctx, vcpu, captype, &val); - if (error == 0) { - printf("Capability \"%s\" is %s on vcpu %d\n", - vm_capability_type2name(captype), - val ? 
"set" : "not set", vcpu); - } else if (errno == ENOENT) { - error = 0; - printf("Capability \"%s\" is not available\n", - vm_capability_type2name(captype)); - } else { - break; - } - } + if (!error && (get_vmcb_intercept || get_all)) { + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_CR_INTERCEPT, 4, + &ctl); + if (error == 0) + printf("cr_intercept[%d]\t0x%08x\n", vcpu, (int)ctl); + + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_DR_INTERCEPT, 4, + &ctl); + if (error == 0) + printf("dr_intercept[%d]\t0x%08x\n", vcpu, (int)ctl); + + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_EXC_INTERCEPT, 4, + &ctl); + if (error == 0) + printf("exc_intercept[%d]\t0x%08x\n", vcpu, (int)ctl); + + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_INST1_INTERCEPT, + 4, &ctl); + if (error == 0) + printf("inst1_intercept[%d]\t0x%08x\n", vcpu, (int)ctl); + + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_INST2_INTERCEPT, + 4, &ctl); + if (error == 0) + printf("inst2_intercept[%d]\t0x%08x\n", vcpu, (int)ctl); } - if (!error && run) { - error = vm_run(ctx, vcpu, &vmexit); + if (!error && (get_vmcb_tlb_ctrl || get_all)) { + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_TLB_CTRL, + 4, &ctl); if (error == 0) - dump_vm_run_exitcode(&vmexit, vcpu); - else - printf("vm_run error %d\n", error); + printf("TLB ctrl[%d]\t0x%016lx\n", vcpu, ctl); + } + + if (!error && (get_vmcb_exit_details || get_all)) { + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_EXITINFO1, + 8, &ctl); + if (error == 0) + printf("exitinfo1[%d]\t0x%016lx\n", vcpu, ctl); + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_EXITINFO2, + 8, &ctl); + if (error == 0) + printf("exitinfo2[%d]\t0x%016lx\n", vcpu, ctl); + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_EXITINTINFO, + 8, &ctl); + if (error == 0) + printf("exitintinfo[%d]\t0x%016lx\n", vcpu, ctl); } + if (!error && (get_vmcb_virq || get_all)) { + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_VIRQ, + 8, &ctl); + if (error == 0) + printf("v_irq/tpr[%d]\t0x%016lx\n", vcpu, ctl); + } + + if (!error && (get_apic_access_addr || get_all)) { + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_AVIC_BAR, 8, + &addr); + if (error == 0) + printf("AVIC apic_bar[%d]\t0x%016lx\n", vcpu, addr); + } + + if (!error && (get_virtual_apic_addr || get_all)) { + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_AVIC_PAGE, 8, + &addr); + if (error == 0) + printf("AVIC backing page[%d]\t0x%016lx\n", vcpu, addr); + } + + if (!error && (get_avic_table || get_all)) { + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_AVIC_LT, 8, + &addr); + if (error == 0) + printf("AVIC logical table[%d]\t0x%016lx\n", + vcpu, addr); + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_AVIC_PT, 8, + &addr); + if (error == 0) + printf("AVIC physical table[%d]\t0x%016lx\n", + vcpu, addr); + } + + return (error); +} + +static struct option * +setup_options(bool cpu_intel) +{ + const struct option common_opts[] = { + { "vm", REQ_ARG, 0, VMNAME }, + { "cpu", REQ_ARG, 0, VCPU }, + { "set-mem", REQ_ARG, 0, SET_MEM }, + { "set-efer", REQ_ARG, 0, SET_EFER }, + { "set-cr0", REQ_ARG, 0, SET_CR0 }, + { "set-cr2", REQ_ARG, 0, SET_CR2 }, + { "set-cr3", REQ_ARG, 0, SET_CR3 }, + { "set-cr4", REQ_ARG, 0, SET_CR4 }, + { "set-dr0", REQ_ARG, 0, SET_DR0 }, + { "set-dr1", REQ_ARG, 0, SET_DR1 }, + { "set-dr2", REQ_ARG, 0, SET_DR2 }, + { "set-dr3", REQ_ARG, 0, SET_DR3 }, + { "set-dr6", REQ_ARG, 0, SET_DR6 }, + { "set-dr7", REQ_ARG, 0, SET_DR7 }, + { "set-rsp", REQ_ARG, 0, SET_RSP }, + { "set-rip", REQ_ARG, 0, SET_RIP }, + { "set-rax", REQ_ARG, 0, SET_RAX }, + { "set-rflags", REQ_ARG, 0, 
SET_RFLAGS }, + { "desc-base", REQ_ARG, 0, DESC_BASE }, + { "desc-limit", REQ_ARG, 0, DESC_LIMIT }, + { "desc-access",REQ_ARG, 0, DESC_ACCESS }, + { "set-cs", REQ_ARG, 0, SET_CS }, + { "set-ds", REQ_ARG, 0, SET_DS }, + { "set-es", REQ_ARG, 0, SET_ES }, + { "set-fs", REQ_ARG, 0, SET_FS }, + { "set-gs", REQ_ARG, 0, SET_GS }, + { "set-ss", REQ_ARG, 0, SET_SS }, + { "set-tr", REQ_ARG, 0, SET_TR }, + { "set-ldtr", REQ_ARG, 0, SET_LDTR }, + { "set-x2apic-state",REQ_ARG, 0, SET_X2APIC_STATE }, + { "set-exception-bitmap", + REQ_ARG, 0, SET_EXCEPTION_BITMAP }, + { "capname", REQ_ARG, 0, CAPNAME }, + { "unassign-pptdev", REQ_ARG, 0, UNASSIGN_PPTDEV }, + { "setcap", REQ_ARG, 0, SET_CAP }, + { "get-gpa-pmap", REQ_ARG, 0, GET_GPA_PMAP }, + { "assert-lapic-lvt", REQ_ARG, 0, ASSERT_LAPIC_LVT }, + { "get-rtc-time", NO_ARG, &get_rtc_time, 1 }, + { "set-rtc-time", REQ_ARG, 0, SET_RTC_TIME }, + { "rtc-nvram-offset", REQ_ARG, 0, RTC_NVRAM_OFFSET }, + { "get-rtc-nvram", NO_ARG, &get_rtc_nvram, 1 }, + { "set-rtc-nvram", REQ_ARG, 0, SET_RTC_NVRAM }, + { "getcap", NO_ARG, &getcap, 1 }, + { "get-stats", NO_ARG, &get_stats, 1 }, + { "get-desc-ds",NO_ARG, &get_desc_ds, 1 }, + { "set-desc-ds",NO_ARG, &set_desc_ds, 1 }, + { "get-desc-es",NO_ARG, &get_desc_es, 1 }, + { "set-desc-es",NO_ARG, &set_desc_es, 1 }, + { "get-desc-ss",NO_ARG, &get_desc_ss, 1 }, + { "set-desc-ss",NO_ARG, &set_desc_ss, 1 }, + { "get-desc-cs",NO_ARG, &get_desc_cs, 1 }, + { "set-desc-cs",NO_ARG, &set_desc_cs, 1 }, + { "get-desc-fs",NO_ARG, &get_desc_fs, 1 }, + { "set-desc-fs",NO_ARG, &set_desc_fs, 1 }, + { "get-desc-gs",NO_ARG, &get_desc_gs, 1 }, + { "set-desc-gs",NO_ARG, &set_desc_gs, 1 }, + { "get-desc-tr",NO_ARG, &get_desc_tr, 1 }, + { "set-desc-tr",NO_ARG, &set_desc_tr, 1 }, + { "set-desc-ldtr", NO_ARG, &set_desc_ldtr, 1 }, + { "get-desc-ldtr", NO_ARG, &get_desc_ldtr, 1 }, + { "set-desc-gdtr", NO_ARG, &set_desc_gdtr, 1 }, + { "get-desc-gdtr", NO_ARG, &get_desc_gdtr, 1 }, + { "set-desc-idtr", NO_ARG, &set_desc_idtr, 1 }, + { "get-desc-idtr", NO_ARG, &get_desc_idtr, 1 }, + { "get-memmap", NO_ARG, &get_memmap, 1 }, + { "get-memseg", NO_ARG, &get_memseg, 1 }, + { "get-efer", NO_ARG, &get_efer, 1 }, + { "get-cr0", NO_ARG, &get_cr0, 1 }, + { "get-cr2", NO_ARG, &get_cr2, 1 }, + { "get-cr3", NO_ARG, &get_cr3, 1 }, + { "get-cr4", NO_ARG, &get_cr4, 1 }, + { "get-dr0", NO_ARG, &get_dr0, 1 }, + { "get-dr1", NO_ARG, &get_dr1, 1 }, + { "get-dr2", NO_ARG, &get_dr2, 1 }, + { "get-dr3", NO_ARG, &get_dr3, 1 }, + { "get-dr6", NO_ARG, &get_dr6, 1 }, + { "get-dr7", NO_ARG, &get_dr7, 1 }, + { "get-rsp", NO_ARG, &get_rsp, 1 }, + { "get-rip", NO_ARG, &get_rip, 1 }, + { "get-rax", NO_ARG, &get_rax, 1 }, + { "get-rbx", NO_ARG, &get_rbx, 1 }, + { "get-rcx", NO_ARG, &get_rcx, 1 }, + { "get-rdx", NO_ARG, &get_rdx, 1 }, + { "get-rsi", NO_ARG, &get_rsi, 1 }, + { "get-rdi", NO_ARG, &get_rdi, 1 }, + { "get-rbp", NO_ARG, &get_rbp, 1 }, + { "get-r8", NO_ARG, &get_r8, 1 }, + { "get-r9", NO_ARG, &get_r9, 1 }, + { "get-r10", NO_ARG, &get_r10, 1 }, + { "get-r11", NO_ARG, &get_r11, 1 }, + { "get-r12", NO_ARG, &get_r12, 1 }, + { "get-r13", NO_ARG, &get_r13, 1 }, + { "get-r14", NO_ARG, &get_r14, 1 }, + { "get-r15", NO_ARG, &get_r15, 1 }, + { "get-rflags", NO_ARG, &get_rflags, 1 }, + { "get-cs", NO_ARG, &get_cs, 1 }, + { "get-ds", NO_ARG, &get_ds, 1 }, + { "get-es", NO_ARG, &get_es, 1 }, + { "get-fs", NO_ARG, &get_fs, 1 }, + { "get-gs", NO_ARG, &get_gs, 1 }, + { "get-ss", NO_ARG, &get_ss, 1 }, + { "get-tr", NO_ARG, &get_tr, 1 }, + { "get-ldtr", NO_ARG, &get_ldtr, 1 }, + { "get-eptp", 
NO_ARG, &get_eptp, 1 }, + { "get-exception-bitmap", + NO_ARG, &get_exception_bitmap, 1 }, + { "get-io-bitmap-address", + NO_ARG, &get_io_bitmap, 1 }, + { "get-tsc-offset", NO_ARG, &get_tsc_offset, 1 }, + { "get-msr-bitmap", + NO_ARG, &get_msr_bitmap, 1 }, + { "get-msr-bitmap-address", + NO_ARG, &get_msr_bitmap_address, 1 }, + { "get-guest-pat", NO_ARG, &get_guest_pat, 1 }, + { "get-guest-sysenter", + NO_ARG, &get_guest_sysenter, 1 }, + { "get-exit-reason", + NO_ARG, &get_exit_reason, 1 }, + { "get-x2apic-state", NO_ARG, &get_x2apic_state, 1 }, + { "get-all", NO_ARG, &get_all, 1 }, + { "run", NO_ARG, &run, 1 }, + { "create", NO_ARG, &create, 1 }, + { "destroy", NO_ARG, &destroy, 1 }, + { "inject-nmi", NO_ARG, &inject_nmi, 1 }, + { "force-reset", NO_ARG, &force_reset, 1 }, + { "force-poweroff", NO_ARG, &force_poweroff, 1 }, + { "get-active-cpus", NO_ARG, &get_active_cpus, 1 }, + { "get-suspended-cpus", NO_ARG, &get_suspended_cpus, 1 }, + { "get-intinfo", NO_ARG, &get_intinfo, 1 }, + { "get-cpu-topology", NO_ARG, &get_cpu_topology, 1 }, +#ifndef __FreeBSD__ + { "wrlock-cycle", NO_ARG, &wrlock_cycle, 1 }, +#endif + }; + + const struct option intel_opts[] = { + { "get-vmcs-pinbased-ctls", + NO_ARG, &get_pinbased_ctls, 1 }, + { "get-vmcs-procbased-ctls", + NO_ARG, &get_procbased_ctls, 1 }, + { "get-vmcs-procbased-ctls2", + NO_ARG, &get_procbased_ctls2, 1 }, + { "get-vmcs-guest-linear-address", + NO_ARG, &get_vmcs_gla, 1 }, + { "get-vmcs-guest-physical-address", + NO_ARG, &get_vmcs_gpa, 1 }, + { "get-vmcs-entry-interruption-info", + NO_ARG, &get_vmcs_entry_interruption_info, 1}, + { "get-vmcs-cr0-mask", NO_ARG, &get_cr0_mask, 1 }, + { "get-vmcs-cr0-shadow", NO_ARG,&get_cr0_shadow, 1 }, + { "get-vmcs-cr4-mask", NO_ARG, &get_cr4_mask, 1 }, + { "get-vmcs-cr4-shadow", NO_ARG, &get_cr4_shadow, 1 }, + { "get-vmcs-cr3-targets", NO_ARG, &get_cr3_targets, 1 }, + { "get-vmcs-tpr-threshold", + NO_ARG, &get_tpr_threshold, 1 }, + { "get-vmcs-vpid", NO_ARG, &get_vpid_asid, 1 }, + { "get-vmcs-exit-ctls", NO_ARG, &get_exit_ctls, 1 }, + { "get-vmcs-entry-ctls", + NO_ARG, &get_entry_ctls, 1 }, + { "get-vmcs-instruction-error", + NO_ARG, &get_inst_err, 1 }, + { "get-vmcs-host-pat", NO_ARG, &get_host_pat, 1 }, + { "get-vmcs-host-cr0", + NO_ARG, &get_host_cr0, 1 }, + { "set-vmcs-entry-interruption-info", + REQ_ARG, 0, SET_VMCS_ENTRY_INTERRUPTION_INFO }, + { "get-vmcs-exit-qualification", + NO_ARG, &get_vmcs_exit_qualification, 1 }, + { "get-vmcs-exit-inst-length", + NO_ARG, &get_vmcs_exit_inst_length, 1 }, + { "get-vmcs-interruptibility", + NO_ARG, &get_vmcs_interruptibility, 1 }, + { "get-vmcs-exit-interruption-error", + NO_ARG, &get_vmcs_exit_interruption_error, 1 }, + { "get-vmcs-exit-interruption-info", + NO_ARG, &get_vmcs_exit_interruption_info, 1 }, + { "get-vmcs-link", NO_ARG, &get_vmcs_link, 1 }, + { "get-vmcs-host-cr3", + NO_ARG, &get_host_cr3, 1 }, + { "get-vmcs-host-cr4", + NO_ARG, &get_host_cr4, 1 }, + { "get-vmcs-host-rip", + NO_ARG, &get_host_rip, 1 }, + { "get-vmcs-host-rsp", + NO_ARG, &get_host_rsp, 1 }, + { "get-apic-access-address", + NO_ARG, &get_apic_access_addr, 1}, + { "get-virtual-apic-address", + NO_ARG, &get_virtual_apic_addr, 1} + }; + + const struct option amd_opts[] = { + { "get-vmcb-intercepts", + NO_ARG, &get_vmcb_intercept, 1 }, + { "get-vmcb-asid", + NO_ARG, &get_vpid_asid, 1 }, + { "get-vmcb-exit-details", + NO_ARG, &get_vmcb_exit_details, 1 }, + { "get-vmcb-tlb-ctrl", + NO_ARG, &get_vmcb_tlb_ctrl, 1 }, + { "get-vmcb-virq", + NO_ARG, &get_vmcb_virq, 1 }, + { "get-avic-apic-bar", + 
NO_ARG, &get_apic_access_addr, 1 }, + { "get-avic-backing-page", + NO_ARG, &get_virtual_apic_addr, 1 }, + { "get-avic-table", + NO_ARG, &get_avic_table, 1 } + }; + + const struct option null_opt = { + NULL, 0, NULL, 0 + }; + + struct option *all_opts; + char *cp; + int optlen; + + optlen = sizeof(common_opts); + + if (cpu_intel) + optlen += sizeof(intel_opts); + else + optlen += sizeof(amd_opts); + + optlen += sizeof(null_opt); + + all_opts = malloc(optlen); + + cp = (char *)all_opts; + memcpy(cp, common_opts, sizeof(common_opts)); + cp += sizeof(common_opts); + + if (cpu_intel) { + memcpy(cp, intel_opts, sizeof(intel_opts)); + cp += sizeof(intel_opts); + } else { + memcpy(cp, amd_opts, sizeof(amd_opts)); + cp += sizeof(amd_opts); + } + + memcpy(cp, &null_opt, sizeof(null_opt)); + cp += sizeof(null_opt); + + return (all_opts); +} + +static const char * +wday_str(int idx) +{ + static const char *weekdays[] = { + "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat" + }; + + if (idx >= 0 && idx < 7) + return (weekdays[idx]); + else + return ("UNK"); +} + +static const char * +mon_str(int idx) +{ + static const char *months[] = { + "Jan", "Feb", "Mar", "Apr", "May", "Jun", + "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" + }; + + if (idx >= 0 && idx < 12) + return (months[idx]); + else + return ("UNK"); +} + +static int +show_memmap(struct vmctx *ctx) +{ + char name[SPECNAMELEN + 1], numbuf[8]; + vm_ooffset_t segoff; + vm_paddr_t gpa; + size_t maplen, seglen; + int error, flags, prot, segid, delim; + + printf("Address Length Segment Offset "); + printf("Prot Flags\n"); + + gpa = 0; + while (1) { + error = vm_mmap_getnext(ctx, &gpa, &segid, &segoff, &maplen, + &prot, &flags); + if (error) + return (errno == ENOENT ? 0 : error); + + error = vm_get_memseg(ctx, segid, &seglen, name, sizeof(name)); + if (error) + return (error); + + printf("%-12lX", gpa); + humanize_number(numbuf, sizeof(numbuf), maplen, "B", + HN_AUTOSCALE, HN_NOSPACE); + printf("%-12s", numbuf); + + printf("%-12s", name[0] ? name : "sysmem"); + printf("%-12lX", segoff); + printf("%c%c%c ", prot & PROT_READ ? 'R' : '-', + prot & PROT_WRITE ? 'W' : '-', + prot & PROT_EXEC ? 'X' : '-'); + + delim = '\0'; + if (flags & VM_MEMMAP_F_WIRED) { + printf("%cwired", delim); + delim = '/'; + } + if (flags & VM_MEMMAP_F_IOMMU) { + printf("%ciommu", delim); + delim = '/'; + } + printf("\n"); + + gpa += maplen; + } +} + +static int +show_memseg(struct vmctx *ctx) +{ + char name[SPECNAMELEN + 1], numbuf[8]; + size_t seglen; + int error, segid; + + printf("ID Length Name\n"); + + segid = 0; + while (1) { + error = vm_get_memseg(ctx, segid, &seglen, name, sizeof(name)); + if (error) + return (errno == EINVAL ? 0 : error); + + if (seglen) { + printf("%-4d", segid); + humanize_number(numbuf, sizeof(numbuf), seglen, "B", + HN_AUTOSCALE, HN_NOSPACE); + printf("%-12s", numbuf); + printf("%s", name[0] ? 
name : "sysmem"); + printf("\n"); + } + segid++; + } +} + +int +main(int argc, char *argv[]) +{ + char *vmname; + int error, ch, vcpu, ptenum; + vm_paddr_t gpa_pmap; + struct vm_exit vmexit; + uint64_t rax, cr0, cr2, cr3, cr4, dr0, dr1, dr2, dr3, dr6, dr7; + uint64_t rsp, rip, rflags, efer, pat; + uint64_t eptp, bm, addr, u64, pteval[4], *pte, info[2]; + struct vmctx *ctx; + cpuset_t cpus; + bool cpu_intel; + uint64_t cs, ds, es, fs, gs, ss, tr, ldtr; + struct tm tm; + struct option *opts; + + cpu_intel = cpu_vendor_intel(); + opts = setup_options(cpu_intel); + + vcpu = 0; + vmname = NULL; + assert_lapic_lvt = -1; + progname = basename(argv[0]); + + while ((ch = getopt_long(argc, argv, "", opts, NULL)) != -1) { + switch (ch) { + case 0: + break; + case VMNAME: + vmname = optarg; + break; + case VCPU: + vcpu = atoi(optarg); + break; + case SET_MEM: + memsize = atoi(optarg) * MB; + memsize = roundup(memsize, 2 * MB); + break; + case SET_EFER: + efer = strtoul(optarg, NULL, 0); + set_efer = 1; + break; + case SET_CR0: + cr0 = strtoul(optarg, NULL, 0); + set_cr0 = 1; + break; + case SET_CR2: + cr2 = strtoul(optarg, NULL, 0); + set_cr2 = 1; + break; + case SET_CR3: + cr3 = strtoul(optarg, NULL, 0); + set_cr3 = 1; + break; + case SET_CR4: + cr4 = strtoul(optarg, NULL, 0); + set_cr4 = 1; + break; + case SET_DR0: + dr0 = strtoul(optarg, NULL, 0); + set_dr0 = 1; + break; + case SET_DR1: + dr1 = strtoul(optarg, NULL, 0); + set_dr1 = 1; + break; + case SET_DR2: + dr2 = strtoul(optarg, NULL, 0); + set_dr2 = 1; + break; + case SET_DR3: + dr3 = strtoul(optarg, NULL, 0); + set_dr3 = 1; + break; + case SET_DR6: + dr6 = strtoul(optarg, NULL, 0); + set_dr6 = 1; + break; + case SET_DR7: + dr7 = strtoul(optarg, NULL, 0); + set_dr7 = 1; + break; + case SET_RSP: + rsp = strtoul(optarg, NULL, 0); + set_rsp = 1; + break; + case SET_RIP: + rip = strtoul(optarg, NULL, 0); + set_rip = 1; + break; + case SET_RAX: + rax = strtoul(optarg, NULL, 0); + set_rax = 1; + break; + case SET_RFLAGS: + rflags = strtoul(optarg, NULL, 0); + set_rflags = 1; + break; + case DESC_BASE: + desc_base = strtoul(optarg, NULL, 0); + break; + case DESC_LIMIT: + desc_limit = strtoul(optarg, NULL, 0); + break; + case DESC_ACCESS: + desc_access = strtoul(optarg, NULL, 0); + break; + case SET_CS: + cs = strtoul(optarg, NULL, 0); + set_cs = 1; + break; + case SET_DS: + ds = strtoul(optarg, NULL, 0); + set_ds = 1; + break; + case SET_ES: + es = strtoul(optarg, NULL, 0); + set_es = 1; + break; + case SET_FS: + fs = strtoul(optarg, NULL, 0); + set_fs = 1; + break; + case SET_GS: + gs = strtoul(optarg, NULL, 0); + set_gs = 1; + break; + case SET_SS: + ss = strtoul(optarg, NULL, 0); + set_ss = 1; + break; + case SET_TR: + tr = strtoul(optarg, NULL, 0); + set_tr = 1; + break; + case SET_LDTR: + ldtr = strtoul(optarg, NULL, 0); + set_ldtr = 1; + break; + case SET_X2APIC_STATE: + x2apic_state = strtol(optarg, NULL, 0); + set_x2apic_state = 1; + break; + case SET_EXCEPTION_BITMAP: + exception_bitmap = strtoul(optarg, NULL, 0); + set_exception_bitmap = 1; + break; + case SET_VMCS_ENTRY_INTERRUPTION_INFO: + vmcs_entry_interruption_info = strtoul(optarg, NULL, 0); + set_vmcs_entry_interruption_info = 1; + break; + case SET_CAP: + capval = strtoul(optarg, NULL, 0); + setcap = 1; + break; + case SET_RTC_TIME: + rtc_secs = strtoul(optarg, NULL, 0); + set_rtc_time = 1; + break; + case SET_RTC_NVRAM: + rtc_nvram_value = (uint8_t)strtoul(optarg, NULL, 0); + set_rtc_nvram = 1; + break; + case RTC_NVRAM_OFFSET: + rtc_nvram_offset = strtoul(optarg, NULL, 0); + 
break; + case GET_GPA_PMAP: + gpa_pmap = strtoul(optarg, NULL, 0); + get_gpa_pmap = 1; + break; + case CAPNAME: + capname = optarg; + break; + case UNASSIGN_PPTDEV: + unassign_pptdev = 1; + if (sscanf(optarg, "%d/%d/%d", &bus, &slot, &func) != 3) + usage(cpu_intel); + break; + case ASSERT_LAPIC_LVT: + assert_lapic_lvt = atoi(optarg); + break; + default: + usage(cpu_intel); + } + } + argc -= optind; + argv += optind; + + if (vmname == NULL) + usage(cpu_intel); + + error = 0; + + if (!error && create) + error = vm_create(vmname); + + if (!error) { + ctx = vm_open(vmname); + if (ctx == NULL) { + printf("VM:%s is not created.\n", vmname); + exit (1); + } + } + +#ifndef __FreeBSD__ + if (!error && wrlock_cycle) { + error = vm_wrlock_cycle(ctx); + exit(error); + } +#endif /* __FreeBSD__ */ + + if (!error && memsize) + error = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); + + if (!error && set_efer) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_EFER, efer); + + if (!error && set_cr0) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR0, cr0); + + if (!error && set_cr2) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR2, cr2); + + if (!error && set_cr3) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR3, cr3); + + if (!error && set_cr4) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR4, cr4); + + if (!error && set_dr0) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR0, dr0); + + if (!error && set_dr1) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR1, dr1); + + if (!error && set_dr2) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR2, dr2); + + if (!error && set_dr3) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR3, dr3); + + if (!error && set_dr6) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR6, dr6); + + if (!error && set_dr7) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR7, dr7); + + if (!error && set_rsp) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RSP, rsp); + + if (!error && set_rip) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, rip); + + if (!error && set_rax) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, rax); + + if (!error && set_rflags) { + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RFLAGS, + rflags); + } + + if (!error && set_desc_ds) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_DS, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_es) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_ES, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_ss) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_SS, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_cs) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_CS, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_fs) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_FS, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_gs) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GS, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_tr) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_TR, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_ldtr) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_LDTR, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_gdtr) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GDTR, + desc_base, desc_limit, 0); + } + + if (!error && set_desc_idtr) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_IDTR, + desc_base, desc_limit, 0); + } + + if (!error && set_cs) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CS, cs); + 
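The long option-handling ladder above leans on one idiom: every operation is guarded by `!error`, so the first failing vmmapi call latches a nonzero `error` and every later step is skipped, leaving the single `errno` report at the bottom of main() to cover the whole chain. A minimal standalone sketch of the same pattern, using an invented `fake_set_register()` stub in place of the real vm_set_register() (illustration only, not part of this patch):

    #include <stdio.h>
    #include <stdint.h>

    /*
     * Hypothetical stand-in for vm_set_register(); the "fail" argument
     * injects an error so the short-circuit behavior is visible.
     */
    static int
    fake_set_register(int reg, uint64_t val, int fail)
    {
            (void) reg;
            (void) val;
            return (fail ? -1 : 0);
    }

    int
    main(void)
    {
            int error = 0;

            /* The second call fails, so the third is never attempted. */
            if (!error)
                    error = fake_set_register(0, 0x1000, 0);
            if (!error)
                    error = fake_set_register(1, 0x2000, 1);
            if (!error)
                    error = fake_set_register(2, 0x3000, 0);

            if (error)
                    printf("errno = %d\n", error);
            return (error ? 1 : 0);
    }

The payoff is that one bhyvectl invocation can stack many --set-*/--get-* requests and stop cleanly at the first failure, with no per-call error plumbing.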
+ if (!error && set_ds) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DS, ds); + + if (!error && set_es) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_ES, es); + + if (!error && set_fs) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_FS, fs); + + if (!error && set_gs) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_GS, gs); + + if (!error && set_ss) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_SS, ss); + + if (!error && set_tr) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_TR, tr); + + if (!error && set_ldtr) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_LDTR, ldtr); + + if (!error && set_x2apic_state) + error = vm_set_x2apic_state(ctx, vcpu, x2apic_state); + + if (!error && unassign_pptdev) + error = vm_unassign_pptdev(ctx, bus, slot, func); + + if (!error && set_exception_bitmap) { + if (cpu_intel) + error = vm_set_vmcs_field(ctx, vcpu, + VMCS_EXCEPTION_BITMAP, + exception_bitmap); + else + error = vm_set_vmcb_field(ctx, vcpu, + VMCB_OFF_EXC_INTERCEPT, + 4, exception_bitmap); + } + + if (!error && cpu_intel && set_vmcs_entry_interruption_info) { + error = vm_set_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO, + vmcs_entry_interruption_info); + } + + if (!error && inject_nmi) { + error = vm_inject_nmi(ctx, vcpu); + } + + if (!error && assert_lapic_lvt != -1) { + error = vm_lapic_local_irq(ctx, vcpu, assert_lapic_lvt); + } + + if (!error && (get_memseg || get_all)) + error = show_memseg(ctx); + + if (!error && (get_memmap || get_all)) + error = show_memmap(ctx); + + if (!error) + error = get_all_registers(ctx, vcpu); + + if (!error) + error = get_all_segments(ctx, vcpu); + + if (!error) { + if (cpu_intel) + error = get_misc_vmcs(ctx, vcpu); + else + error = get_misc_vmcb(ctx, vcpu); + } + + if (!error && (get_x2apic_state || get_all)) { + error = vm_get_x2apic_state(ctx, vcpu, &x2apic_state); + if (error == 0) + printf("x2apic_state[%d]\t%d\n", vcpu, x2apic_state); + } + + if (!error && (get_eptp || get_all)) { + if (cpu_intel) + error = vm_get_vmcs_field(ctx, vcpu, VMCS_EPTP, &eptp); + else + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_NPT_BASE, + 8, &eptp); + if (error == 0) + printf("%s[%d]\t\t0x%016lx\n", + cpu_intel ? 
"eptp" : "rvi/npt", vcpu, eptp); + } + + if (!error && (get_exception_bitmap || get_all)) { + if(cpu_intel) + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_EXCEPTION_BITMAP, &bm); + else + error = vm_get_vmcb_field(ctx, vcpu, + VMCB_OFF_EXC_INTERCEPT, + 4, &bm); + if (error == 0) + printf("exception_bitmap[%d]\t%#lx\n", vcpu, bm); + } + + if (!error && (get_io_bitmap || get_all)) { + if (cpu_intel) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_A, + &bm); + if (error == 0) + printf("io_bitmap_a[%d]\t%#lx\n", vcpu, bm); + error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_B, + &bm); + if (error == 0) + printf("io_bitmap_b[%d]\t%#lx\n", vcpu, bm); + } else { + error = vm_get_vmcb_field(ctx, vcpu, + VMCB_OFF_IO_PERM, 8, &bm); + if (error == 0) + printf("io_bitmap[%d]\t%#lx\n", vcpu, bm); + } + } + + if (!error && (get_tsc_offset || get_all)) { + uint64_t tscoff; + if (cpu_intel) + error = vm_get_vmcs_field(ctx, vcpu, VMCS_TSC_OFFSET, + &tscoff); + else + error = vm_get_vmcb_field(ctx, vcpu, + VMCB_OFF_TSC_OFFSET, + 8, &tscoff); + if (error == 0) + printf("tsc_offset[%d]\t0x%016lx\n", vcpu, tscoff); + } + + if (!error && (get_msr_bitmap_address || get_all)) { + if (cpu_intel) + error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, + &addr); + else + error = vm_get_vmcb_field(ctx, vcpu, + VMCB_OFF_MSR_PERM, 8, &addr); + if (error == 0) + printf("msr_bitmap[%d]\t\t%#lx\n", vcpu, addr); + } + + if (!error && (get_msr_bitmap || get_all)) { + if (cpu_intel) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_MSR_BITMAP, &addr); + } else { + error = vm_get_vmcb_field(ctx, vcpu, + VMCB_OFF_MSR_PERM, 8, + &addr); + } + + if (error == 0) + error = dump_msr_bitmap(vcpu, addr, cpu_intel); + } + + if (!error && (get_vpid_asid || get_all)) { + uint64_t vpid; + if (cpu_intel) + error = vm_get_vmcs_field(ctx, vcpu, VMCS_VPID, &vpid); + else + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_ASID, + 4, &vpid); + if (error == 0) + printf("%s[%d]\t\t0x%04lx\n", + cpu_intel ? 
"vpid" : "asid", vcpu, vpid); + } + + if (!error && (get_guest_pat || get_all)) { + if (cpu_intel) + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_IA32_PAT, &pat); + else + error = vm_get_vmcb_field(ctx, vcpu, + VMCB_OFF_GUEST_PAT, 8, &pat); + if (error == 0) + printf("guest_pat[%d]\t\t0x%016lx\n", vcpu, pat); + } + + if (!error && (get_guest_sysenter || get_all)) { + if (cpu_intel) + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_IA32_SYSENTER_CS, + &cs); + else + error = vm_get_vmcb_field(ctx, vcpu, + VMCB_OFF_SYSENTER_CS, 8, + &cs); + + if (error == 0) + printf("guest_sysenter_cs[%d]\t%#lx\n", vcpu, cs); + if (cpu_intel) + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_IA32_SYSENTER_ESP, + &rsp); + else + error = vm_get_vmcb_field(ctx, vcpu, + VMCB_OFF_SYSENTER_ESP, 8, + &rsp); + + if (error == 0) + printf("guest_sysenter_sp[%d]\t%#lx\n", vcpu, rsp); + if (cpu_intel) + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_IA32_SYSENTER_EIP, + &rip); + else + error = vm_get_vmcb_field(ctx, vcpu, + VMCB_OFF_SYSENTER_EIP, 8, + &rip); + if (error == 0) + printf("guest_sysenter_ip[%d]\t%#lx\n", vcpu, rip); + } + + if (!error && (get_exit_reason || get_all)) { + if (cpu_intel) + error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_REASON, + &u64); + else + error = vm_get_vmcb_field(ctx, vcpu, + VMCB_OFF_EXIT_REASON, 8, + &u64); + if (error == 0) + printf("exit_reason[%d]\t%#lx\n", vcpu, u64); + } + + if (!error && setcap) { + int captype; + captype = vm_capability_name2type(capname); + error = vm_set_capability(ctx, vcpu, captype, capval); + if (error != 0 && errno == ENOENT) + printf("Capability \"%s\" is not available\n", capname); + } + + if (!error && get_gpa_pmap) { + error = vm_get_gpa_pmap(ctx, gpa_pmap, pteval, &ptenum); + if (error == 0) { + printf("gpa %#lx:", gpa_pmap); + pte = &pteval[0]; + while (ptenum-- > 0) + printf(" %#lx", *pte++); + printf("\n"); + } + } + + if (!error && set_rtc_nvram) + error = vm_rtc_write(ctx, rtc_nvram_offset, rtc_nvram_value); + + if (!error && (get_rtc_nvram || get_all)) { + error = vm_rtc_read(ctx, rtc_nvram_offset, &rtc_nvram_value); + if (error == 0) { + printf("rtc nvram[%03d]: 0x%02x\n", rtc_nvram_offset, + rtc_nvram_value); + } + } + + if (!error && set_rtc_time) + error = vm_rtc_settime(ctx, rtc_secs); + + if (!error && (get_rtc_time || get_all)) { + error = vm_rtc_gettime(ctx, &rtc_secs); + if (error == 0) { + gmtime_r(&rtc_secs, &tm); + printf("rtc time %#lx: %s %s %02d %02d:%02d:%02d %d\n", + rtc_secs, wday_str(tm.tm_wday), mon_str(tm.tm_mon), + tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec, + 1900 + tm.tm_year); + } + } + + if (!error && (getcap || get_all)) { + int captype, val, getcaptype; + + if (getcap && capname) + getcaptype = vm_capability_name2type(capname); + else + getcaptype = -1; + + for (captype = 0; captype < VM_CAP_MAX; captype++) { + if (getcaptype >= 0 && captype != getcaptype) + continue; + error = vm_get_capability(ctx, vcpu, captype, &val); + if (error == 0) { + printf("Capability \"%s\" is %s on vcpu %d\n", + vm_capability_type2name(captype), + val ? 
"set" : "not set", vcpu); + } else if (errno == ENOENT) { + error = 0; + printf("Capability \"%s\" is not available\n", + vm_capability_type2name(captype)); + } else { + break; + } + } + } + + if (!error && (get_active_cpus || get_all)) { + error = vm_active_cpus(ctx, &cpus); + if (!error) + print_cpus("active cpus", &cpus); + } + + if (!error && (get_suspended_cpus || get_all)) { + error = vm_suspended_cpus(ctx, &cpus); + if (!error) + print_cpus("suspended cpus", &cpus); + } + + if (!error && (get_intinfo || get_all)) { + error = vm_get_intinfo(ctx, vcpu, &info[0], &info[1]); + if (!error) { + print_intinfo("pending", info[0]); + print_intinfo("current", info[1]); + } + } + + if (!error && (get_stats || get_all)) { + int i, num_stats; + uint64_t *stats; + struct timeval tv; + const char *desc; + + stats = vm_get_stats(ctx, vcpu, &tv, &num_stats); + if (stats != NULL) { + printf("vcpu%d stats:\n", vcpu); + for (i = 0; i < num_stats; i++) { + desc = vm_get_stat_desc(ctx, i); + printf("%-40s\t%ld\n", desc, stats[i]); + } + } + } + + if (!error && (get_cpu_topology || get_all)) { + uint16_t sockets, cores, threads, maxcpus; + + vm_get_topology(ctx, &sockets, &cores, &threads, &maxcpus); + printf("cpu_topology:\tsockets=%hu, cores=%hu, threads=%hu, " + "maxcpus=%hu\n", sockets, cores, threads, maxcpus); + } + + if (!error && run) { + error = vm_run(ctx, vcpu, &vmexit); + if (error == 0) + dump_vm_run_exitcode(&vmexit, vcpu); + else + printf("vm_run error %d\n", error); + } + + if (!error && force_reset) + error = vm_suspend(ctx, VM_SUSPEND_RESET); + + if (!error && force_poweroff) + error = vm_suspend(ctx, VM_SUSPEND_POWEROFF); + if (error) printf("errno = %d\n", errno); if (!error && destroy) - error = vm_destroy(ctx); + vm_destroy(ctx); + free (opts); exit(error); } diff --git a/usr/src/cmd/bhyveload-uefi/Makefile b/usr/src/cmd/bhyveload-uefi/Makefile deleted file mode 100644 index bbcbacf32f..0000000000 --- a/usr/src/cmd/bhyveload-uefi/Makefile +++ /dev/null @@ -1,41 +0,0 @@ -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -# -# Copyright 2013 Pluribus Networks Inc. -# - -PROG = bhyveload-uefi - -include ../Makefile.cmd - -$(BUILD64)SUBDIRS += $(MACH64) - -all := TARGET = all -install := TARGET = install -clean := TARGET = clean -clobber := TARGET = clobber -lint := TARGET = lint - -.KEEP_STATE: - -all clean clobber lint: $(SUBDIRS) - -install: $(SUBDIRS) - -$(RM) $(ROOTUSRSBINPROG) - -$(LN) $(ISAEXEC) $(ROOTUSRSBINPROG) - -$(SUBDIRS): FRC - @cd $@; pwd; $(MAKE) CW_NO_SHADOW=true __GNUC= $(TARGET) - -FRC: - -include ../Makefile.targ diff --git a/usr/src/cmd/bhyveload-uefi/Makefile.com b/usr/src/cmd/bhyveload-uefi/Makefile.com deleted file mode 100644 index 7865cca8d8..0000000000 --- a/usr/src/cmd/bhyveload-uefi/Makefile.com +++ /dev/null @@ -1,52 +0,0 @@ -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. 
A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -# -# Copyright 2013 Pluribus Networks Inc. -# - -PROG= bhyveload-uefi - -SRCS = ../bhyveload-uefi.c expand_number.c -OBJS = bhyveload-uefi.o expand_number.o - -include ../../Makefile.cmd - -.KEEP_STATE: - -CFLAGS += $(CCVERBOSE) -CPPFLAGS = -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd $(CPPFLAGS.master) \ - -I$(ROOT)/usr/platform/i86pc/include -LDLIBS += -lvmmapi - -all: $(PROG) - -$(PROG): $(OBJS) - $(LINK.c) -o $@ $(OBJS) $(LDFLAGS) $(LDLIBS) - $(POST_PROCESS) - -install: all $(ROOTUSRSBINPROG) - -clean: - $(RM) $(OBJS) - -lint: lint_SRCS - -include ../../Makefile.targ - -%.o: ../%.c - $(COMPILE.c) $< - $(POST_PROCESS_O) - -%.o: $(CONTRIB)/freebsd/lib/libutil/%.c - $(COMPILE.c) $< - $(POST_PROCESS_O) - diff --git a/usr/src/cmd/bhyveload-uefi/amd64/Makefile b/usr/src/cmd/bhyveload-uefi/amd64/Makefile deleted file mode 100644 index b602c50d05..0000000000 --- a/usr/src/cmd/bhyveload-uefi/amd64/Makefile +++ /dev/null @@ -1,21 +0,0 @@ -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -# -# Copyright 2013 Pluribus Networks Inc. -# - -include ../Makefile.com -include ../../Makefile.cmd.64 - -CPPFLAGS += -I$(COMPAT)/freebsd/amd64 -I$(CONTRIB)/freebsd/amd64 - -install: all $(ROOTUSRSBINPROG64) diff --git a/usr/src/cmd/bhyveload-uefi/bhyveload-uefi.c b/usr/src/cmd/bhyveload-uefi/bhyveload-uefi.c deleted file mode 100644 index 62a7ca5d0f..0000000000 --- a/usr/src/cmd/bhyveload-uefi/bhyveload-uefi.c +++ /dev/null @@ -1,190 +0,0 @@ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - */ - -/* - * Copyright 2013 Pluribus Networks Inc. - */ - -#include - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include - -#define KB (1024UL) -#define MB (1024 * 1024UL) -#define GB (1024 * 1024 * 1024UL) - -#define UEFI_ROM_ADDR 0xFFE00000 -#define UEFI_ROM_SIZE (2 * MB) -/* - * N.B. the UEFI code zeros the first page in memory so use the second. 
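The deleted loader handed its configuration to the UEFI ROM through fixed guest-physical addresses, as the platform_info structure and the BHYVE_HOB_ADDR layout shown just below illustrate. A minimal sketch of that hand-off pattern, assuming an already-created VM and using the same libvmmapi calls the deleted code uses (the helper name here is invented for illustration):

	#include <vmmapi.h>

	static int
	write_ncpus_hob(struct vmctx *ctx, uint32_t ncpus)
	{
		/* Map the low 8 KB of guest memory; the ROM scans page two. */
		char *membase = vm_map_gpa(ctx, 0, 8 * 1024);

		if (membase == NULL)
			return (-1);
		/* 0x2000 mirrors the BHYVE_HOB_ADDR define below. */
		((struct platform_info *)(membase + 0x2000))->ncpus = ncpus;
		return (0);
	}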
- */ -#define BHYVE_HOB_ADDR 0x00002000 -#define BHYVE_BO_HOB_ADDR 0x00002080 - -#define UEFI_ROM_PATH "/usr/share/bhyve/uefi-rom.bin" - -struct platform_info { - uint32_t ncpus; -}; - -/* - * Boot order code: - * 0 - EFI_CD_HD - * 1 - EFI_CD - * 2 - EFI_HD_CD - * 3 - EFI_HD - * 4 - EFI_NET - * 5 - EFI_NET_CD_HD - * 6 - EFI_HD_HD_CD - * 7 - LEGACY_CD_HD - * 8 - LEGACY_CD - * 9 - LEGACY_HD_CD - * 10 - LEGACY_HD - * 11 - EFI_SHELL - */ - -struct bootorder_info { - uint32_t guestbootorder; -}; - -static char *vmname, *progname; -static struct vmctx *ctx; - -static void -usage(void) -{ - printf("usage: %s " - "[-c vcpus] [-m mem-size] [-b bootorder]" - "\n", progname); - exit(1); -} - -int -main(int argc, char** argv) -{ - int opt, error, fd; - int guest_ncpus; - int guest_bootorder = 0; - uint64_t mem_size; - char *membase, *rombase; - struct platform_info *pi; - struct bootorder_info *bi; - - progname = argv[0]; - - guest_ncpus = 1; - mem_size = 256 * MB; - - while ((opt = getopt(argc, argv, "c:m:b:")) != -1) { - switch (opt) { - case 'c': - guest_ncpus = atoi(optarg); - break; - case 'm': - error = vm_parse_memsize(optarg, &mem_size); - if (error != 0 || mem_size == 0) - errx(EX_USAGE, "Invalid memsize '%s'", optarg); - break; - case 'b': - guest_bootorder = atoi(optarg); - if (guest_bootorder < 0 || guest_bootorder > 11) { - errx(EX_USAGE, "Invalid bootoption: %d\n" - "\tBoot order code:\n" - "\t0 - EFI_CD_HD\n" - "\t1 - EFI_CD\n" - "\t2 - EFI_HD_CD\n" - "\t3 - EFI_HD\n" - "\t4 - EFI_NET\n" - "\t5 - EFI_NET_CD_HD\n" - "\t6 - EFI_HD_HD_CD\n" - "\t7 - LEGACY_CD_HD\n" - "\t8 - LEGACY_CD\n" - "\t9 - LEGACY_HD_CD\n" - "\t10 - LEGACY_HD\n" - "\t11 - EFI_SHELL\n", guest_bootorder); - exit(1); - } - break; - case '?': - usage(); - } - } - - argc -= optind; - argv += optind; - - if (argc != 1) - usage(); - - vmname = argv[0]; - error = vm_create(vmname); - if (error != 0 && errno != EEXIST) { - perror("vm_create"); - exit(1); - - } - - ctx = vm_open(vmname); - if (ctx == NULL) { - perror("vm_open"); - exit(1); - } - - error = vm_set_capability(ctx, 0, VM_CAP_UNRESTRICTED_GUEST, 1); - if (error) { - perror("vm_set_capability(VM_CAP_UNRESTRICTED_GUEST)"); - } - - error = vm_setup_memory(ctx, mem_size, VM_MMAP_ALL); - if (error) { - perror("vm_setup_memory"); - exit(1); - } - membase = vm_map_gpa(ctx, 0, 8 * KB); - - error = vm_setup_rom(ctx, UEFI_ROM_ADDR, UEFI_ROM_SIZE); - if (error) { - perror("vm_setup_rom"); - exit(1); - } - rombase = vm_map_gpa(ctx, UEFI_ROM_ADDR, UEFI_ROM_SIZE); - - fd = open(UEFI_ROM_PATH, O_RDONLY); - if (fd == -1) { - perror("open"); - exit(1); - } - read(fd, rombase, UEFI_ROM_SIZE); - close(fd); - - pi = (struct platform_info *)(membase + BHYVE_HOB_ADDR); - pi->ncpus = guest_ncpus; - bi = (struct bootorder_info *)(membase + BHYVE_BO_HOB_ADDR); - bi->guestbootorder = guest_bootorder; - - error = vcpu_reset(ctx, 0); - if (error) { - perror("vcpu_reset"); - exit(1); - } - - return (0); -} diff --git a/usr/src/cmd/bhyveload-uefi/i386/Makefile b/usr/src/cmd/bhyveload-uefi/i386/Makefile deleted file mode 100644 index f5b7bb6915..0000000000 --- a/usr/src/cmd/bhyveload-uefi/i386/Makefile +++ /dev/null @@ -1,18 +0,0 @@ -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. 
A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -# -# Copyright 2013 Pluribus Networks Inc. -# - -include ../Makefile.com - -install: all $(ROOTUSRSBINPROG32) diff --git a/usr/src/cmd/devfsadm/i386/misc_link_i386.c b/usr/src/cmd/devfsadm/i386/misc_link_i386.c index 2b678df527..4aeea7d294 100644 --- a/usr/src/cmd/devfsadm/i386/misc_link_i386.c +++ b/usr/src/cmd/devfsadm/i386/misc_link_i386.c @@ -45,6 +45,7 @@ static int vt00(di_minor_t minor, di_node_t node); static int kdmouse(di_minor_t minor, di_node_t node); static int ipmi(di_minor_t minor, di_node_t node); static int mc_node(di_minor_t minor, di_node_t node); +static int vmmctl(di_minor_t minor, di_node_t node); static devfsadm_create_t misc_cbt[] = { { "vt00", "ddi_display", NULL, @@ -84,6 +85,9 @@ static devfsadm_create_t misc_cbt[] = { { "pseudo", "ddi_pseudo", "ucode", TYPE_EXACT | DRV_EXACT, ILEVEL_0, ln_minor_name, }, + { "pseudo", "ddi_pseudo", "vmm", + TYPE_EXACT | DRV_EXACT, ILEVEL_0, vmmctl, + } }; DEVFSADM_CREATE_INIT_V0(misc_cbt); @@ -109,6 +113,9 @@ static devfsadm_remove_t misc_remove_cbt[] = { }, { "serial", "^tty[a-z]$", RM_ALWAYS | RM_PRE, ILEVEL_1, devfsadm_rm_all + }, + { "pseudo", "^vmmctl$", RM_ALWAYS | RM_PRE | RM_HOT, + ILEVEL_0, devfsadm_rm_all } }; @@ -345,3 +352,14 @@ mc_node(di_minor_t minor, di_node_t node) (void) devfsadm_mklink(linkpath, node, minor, 0); return (DEVFSADM_CONTINUE); } + +/* + * /dev/vmmctl -> /devices/pseudo/vmm@0:ctl + */ +static int +vmmctl(di_minor_t minor, di_node_t node) +{ + if (strcmp(di_minor_name(minor), "ctl") == 0) + (void) devfsadm_mklink("vmmctl", node, minor, 0); + return (DEVFSADM_CONTINUE); +} diff --git a/usr/src/cmd/mdb/intel/amd64/vmm/Makefile b/usr/src/cmd/mdb/intel/amd64/vmm/Makefile deleted file mode 100644 index bf9219b435..0000000000 --- a/usr/src/cmd/mdb/intel/amd64/vmm/Makefile +++ /dev/null @@ -1,20 +0,0 @@ -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -# -# Copyright 2014 Pluribus Networks Inc. -# - -MAKEVARS = CW_NO_SHADOW=true __GNUC= - -include $(SRC)/Makefile.master -$(BUILD64)SUBDIRS += $(MACH64) -include ../../../Makefile.subdirs diff --git a/usr/src/cmd/mdb/intel/amd64/vmm/amd64/Makefile b/usr/src/cmd/mdb/intel/amd64/vmm/amd64/Makefile deleted file mode 100644 index 49ca0c5eb3..0000000000 --- a/usr/src/cmd/mdb/intel/amd64/vmm/amd64/Makefile +++ /dev/null @@ -1,32 +0,0 @@ -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -# -# Copyright 2013 Pluribus Networks Inc. 
-# - -MODULE = vmm.so -MDBTGT = kvm - -MODSRCS = vmm.c - -include ../../../../../Makefile.cmd -include ../../../../../Makefile.cmd.64 -include ../../../Makefile.amd64 -include ../../../../Makefile.module - -CPPFLAGS = -D_KERNEL -D_MACHDEP -CPPFLAGS += -I$(COMPAT)/freebsd -I$(COMPAT)/freebsd/amd64 -CPPFLAGS += -I$(CONTRIB)/freebsd -I$(CONTRIB)/freebsd/amd64 -CPPFLAGS += -I$(SRC)/uts/common -I$(SRC)/uts/i86pc -CPPFLAGS += -I$(SRC)/cmd/mdb/common - -CPPFLAGS += -_cc=-xdryrun diff --git a/usr/src/cmd/mdb/intel/amd64/vmm/vmm.c b/usr/src/cmd/mdb/intel/amd64/vmm/vmm.c deleted file mode 100644 index 9e29d8662a..0000000000 --- a/usr/src/cmd/mdb/intel/amd64/vmm/vmm.c +++ /dev/null @@ -1,238 +0,0 @@ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - */ - -/* - * Copyright 2014 Pluribus Networks Inc. - */ - -#include - -#include -#include -#include -#include -#include - -/* - * VMM trace debug walker/dcmd code - */ - -/* - * Initialize the vmm_trace_dmsg_t walker by either using the given starting - * address, or reading the value of the kernel's vmm_debug_rbuf pointer. - * We also allocate a vmm_trace_dmsg_t for storage, and save this using the - * walk_data pointer. - */ -static int -vmm_dmsg_walk_i(mdb_walk_state_t *wsp) -{ - uintptr_t rbuf_addr; - vmm_trace_rbuf_t rbuf; - - if (wsp->walk_addr == NULL) { - if (mdb_readvar(&rbuf_addr, "vmm_debug_rbuf") == -1) { - mdb_warn("failed to read 'vmm_debug_rbuf'"); - return (WALK_ERR); - } - - if (mdb_vread(&rbuf, sizeof (vmm_trace_rbuf_t), rbuf_addr) - == -1) { - mdb_warn("failed to read vmm_trace_rbuf_t at %p", - rbuf_addr); - return (WALK_ERR); - } - - wsp->walk_addr = (uintptr_t)(vmm_trace_dmsg_t *)rbuf.dmsgh; - } - - /* - * Save ptr to head of ring buffer to prevent looping. - */ - wsp->walk_arg = (void *)wsp->walk_addr; - wsp->walk_data = mdb_alloc(sizeof (vmm_trace_dmsg_t), UM_SLEEP); - return (WALK_NEXT); -} - -/* - * At each step, read a vmm_trace_dmsg_t into our private storage, and then - * invoke the callback function. We terminate when we reach a NULL next - * pointer. - */ -static int -vmm_dmsg_walk_s(mdb_walk_state_t *wsp) -{ - int status; - - if (wsp->walk_addr == NULL) - return (WALK_DONE); - - if (mdb_vread(wsp->walk_data, sizeof (vmm_trace_dmsg_t), - wsp->walk_addr) == -1) { - mdb_warn("failed to read vmm_trace_dmsg_t at %p", - wsp->walk_addr); - return (WALK_ERR); - } - - status = wsp->walk_callback(wsp->walk_addr, wsp->walk_data, - wsp->walk_cbdata); - - wsp->walk_addr = - (uintptr_t)(((vmm_trace_dmsg_t *)wsp->walk_data)->next); - - /* - * If we've looped then we're done. - */ - if (wsp->walk_addr == (uintptr_t)wsp->walk_arg) - wsp->walk_addr = NULL; - - return (status); -} - -/* - * The walker's fini function is invoked at the end of each walk. Since we - * dynamically allocated a vmm_trace_dmsg_t in vmm_dmsg_walk_i, we must - * free it now. - */ -static void -vmm_dmsg_walk_f(mdb_walk_state_t *wsp) -{ - mdb_free(wsp->walk_data, sizeof (vmm_trace_dmsg_t)); -} - -/* - * This routine is used by the vmm_dmsg_dump dcmd to dump content of - * VMM trace ring buffer. 
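The init/step/fini triple being removed above follows MDB's standard walker contract: the init routine seeds walk_addr, the step routine reads one element and fires the callback, and fini releases the scratch buffer. As a usage sketch, a consumer drives such a walker with mdb_pwalk(); the dcmd below is hypothetical, but the MDB calls are the standard module API:

	/* ARGSUSED */
	static int
	count_cb(uintptr_t addr, const void *data, void *arg)
	{
		(*(uint_t *)arg)++;
		return (WALK_NEXT);
	}

	/* Hypothetical dcmd: count buffered trace messages via the walker. */
	/* ARGSUSED */
	static int
	vmm_dmsg_count(uintptr_t addr, uint_t flags, int argc,
	    const mdb_arg_t *argv)
	{
		uint_t n = 0;

		if (mdb_pwalk("vmm_dmsg", count_cb, &n, addr) == -1) {
			mdb_warn("failed to walk 'vmm_dmsg'");
			return (DCMD_ERR);
		}
		mdb_printf("%u message(s)\n", n);
		return (DCMD_OK);
	}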
- */ -int -vmm_dmsg_dump(vmm_trace_dmsg_t *addr, int print_pathname, uint_t *printed) -{ - vmm_trace_dmsg_t dmsg, *dmsgh = addr; - char pathname[MAXPATHLEN]; - char merge[1024]; - - while (addr != NULL) { - if (mdb_vread(&dmsg, sizeof (dmsg), (uintptr_t)addr) != - sizeof (dmsg)) { - mdb_warn("failed to read message pointer in kernel"); - return (DCMD_ERR); - } - - (void) mdb_snprintf(merge, sizeof (merge), - "[%Y:%03d:%03d:%03d] : %s", - dmsg.timestamp.tv_sec, - (int)dmsg.timestamp.tv_nsec/1000000, - (int)(dmsg.timestamp.tv_nsec/1000)%1000, - (int)dmsg.timestamp.tv_nsec%1000, - dmsg.buf); - - mdb_printf("%s", merge); - - if (printed != NULL) { - (*printed)++; - } - - if (((addr = dmsg.next) == NULL) || (dmsg.next == dmsgh)) { - break; - } - } - - return (DCMD_OK); -} - -/* - * 1. Process flag passed to vmm_dmsg_dump dcmd. - * 2. Obtain VMM trace ring buffer pointer. - * 3. Pass VMM trace ring buffer pointer to vmm_dmsg_dump() - * to dump content of VMM trace ring buffer. - */ -int -vmm_rbuf_dump(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) -{ - vmm_trace_rbuf_t rbuf; - uint_t printed = 0; /* have we printed anything? */ - int print_pathname = FALSE; - int rval = DCMD_OK; - - if (argc > 1) { - return (DCMD_USAGE); - } - - if (mdb_getopts(argc, argv, - 'a', MDB_OPT_SETBITS, TRUE, &print_pathname) != argc) { - return (DCMD_USAGE); - } - - /* - * If ring buffer address not provided try to obtain - * it using vmm_debug_rbuf global. - */ - if ((addr == NULL) || !(flags & DCMD_ADDRSPEC)) { - if (mdb_readvar(&addr, "vmm_debug_rbuf") == -1) { - mdb_warn("Failed to read 'vmm_debug_rbuf'."); - return (DCMD_ERR); - } - } - - if (mdb_vread(&rbuf, sizeof (rbuf), addr) != sizeof (rbuf)) { - mdb_warn("Failed to read ring buffer in kernel."); - return (DCMD_ERR); - } - - if (rbuf.dmsgh == NULL) { - mdb_printf("The vmm trace ring buffer is empty.\n"); - return (DCMD_OK); - } - - rval = vmm_dmsg_dump((vmm_trace_dmsg_t *)rbuf.dmsgh, - print_pathname, &printed); - - if (rval != DCMD_OK) { - return (rval); - } - - if (printed == 0) { - mdb_warn("Failed to read vmm trace ring buffer."); - return (DCMD_ERR); - } - - return (rval); -} - -/* - * MDB module linkage information: - * - * We declare a list of structures describing our dcmds, a list of structures - * describing our walkers, and a function named _mdb_init to return a pointer - * to our module information. 
- */ - -static const mdb_dcmd_t dcmds[] = { - { "vmm_dmsg_dump", "[-a]", "Dump vmm trace debug messages", - vmm_rbuf_dump }, - { NULL } -}; - -static const mdb_walker_t walkers[] = { - { "vmm_dmsg", - "walk ring buffer containing vmm trace debug messages", - vmm_dmsg_walk_i, vmm_dmsg_walk_s, vmm_dmsg_walk_f }, - { NULL } -}; - -static const mdb_modinfo_t modinfo = { - MDB_API_VERSION, dcmds, walkers -}; - -const mdb_modinfo_t * -_mdb_init(void) -{ - return (&modinfo); -} diff --git a/usr/src/compat/freebsd/amd64/machine/asmacros.h b/usr/src/compat/freebsd/amd64/machine/asmacros.h index fcf35a7b78..1f6955130b 100644 --- a/usr/src/compat/freebsd/amd64/machine/asmacros.h +++ b/usr/src/compat/freebsd/amd64/machine/asmacros.h @@ -25,4 +25,7 @@ x: #define END(x) \ .size x, [.-x] +#define ALIGN_TEXT \ + .p2align 4,0x90; /* 16-byte alignment, nop filled */ + #endif /* _COMPAT_FREEBSD_AMD64_MACHINE_ASMACROS_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/atomic.h b/usr/src/compat/freebsd/amd64/machine/atomic.h index 5b78143d21..1da9724b7d 100644 --- a/usr/src/compat/freebsd/amd64/machine/atomic.h +++ b/usr/src/compat/freebsd/amd64/machine/atomic.h @@ -11,31 +11,20 @@ /* * Copyright 2014 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_AMD64_MACHINE_ATOMIC_H_ #define _COMPAT_FREEBSD_AMD64_MACHINE_ATOMIC_H_ -static __inline u_char -atomic_load_acq_char(volatile u_char *p) -{ - u_char res; - - __asm volatile("lock ; " "cmpxchgb %b0,%1" - : "=a" (res), "=m" (*p) - : "m" (*p) : "memory", "cc"); - return (res); -} - -static __inline u_short +static __inline u_int atomic_load_acq_short(volatile u_short *p) { u_short res; - __asm volatile("lock ; " "cmpxchgw %w0,%1" - : "=a" (res), "=m" (*p) - : "m" (*p) - : "memory", "cc"); + res = *p; + __asm volatile("" : : : "memory"); + return (res); } @@ -44,10 +33,9 @@ atomic_load_acq_int(volatile u_int *p) { u_int res; - __asm volatile("lock ; " "cmpxchgl %0,%1" - : "=a" (res), "=m" (*p) - : "m" (*p) - : "memory", "cc"); + res = *p; + __asm volatile("" : : : "memory"); + return (res); } @@ -56,25 +44,10 @@ atomic_load_acq_long(volatile u_long *p) { u_long res; - __asm volatile("lock ; " "cmpxchgq %0,%1" - : "=a" (res), "=m" (*p) - : "m" (*p) - : "memory", "cc"); - return (res); -} - -static __inline void -atomic_store_rel_char(volatile u_char *p, u_char v) -{ + res = *p; __asm volatile("" : : : "memory"); - *p = v; -} -static __inline void -atomic_store_rel_short(volatile u_short *p, u_short v) -{ - __asm volatile("" : : : "memory"); - *p = v; + return (res); } static __inline void @@ -134,6 +107,23 @@ atomic_cmpset_long(volatile u_long *dst, u_long expect, u_long src) return (res); } +static __inline int +atomic_testandset_int(volatile u_int *p, u_int v) +{ + u_char res; + + __asm __volatile( + " lock ; " + " btsl %2,%1 ; " + " setc %0 ; " + "# atomic_testandset_int" + : "=q" (res), /* 0 */ + "+m" (*p) /* 1 */ + : "Ir" (v & 0x1f) /* 2 */ + : "cc"); + return (res); +} + /* * Atomically add the value of v to the integer pointed to by p and return * the previous value of *p. @@ -226,6 +216,13 @@ atomic_swap_long(volatile u_long *p, u_long v) return (v); } + +#define atomic_store_short(p, v) \ + (*(volatile u_short *)(p) = (u_short)(v)) +#define atomic_store_int(p, v) \ + (*(volatile u_int *)(p) = (u_int)(v)) + + #define atomic_readandclear_int(p) atomic_swap_int(p, 0) #define atomic_readandclear_long(p) atomic_swap_long(p, 0) @@ -241,4 +238,25 @@ atomic_swap_long(volatile u_long *p, u_long v) /* Operations on pointers. 
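The atomic_testandset_int() shim added above returns the bit's previous value (captured by setc after the lock btsl), which makes it a natural one-line "claim" primitive. Note that the shim masks the bit index with 0x1f, so only bits 0 through 31 of the addressed word are reachable. A short usage sketch (the function name is invented):

	/* Returns 1 if this caller won slot `bit`, 0 if it was already taken. */
	static int
	claim_slot(volatile u_int *bitmap, u_int bit)
	{
		return (atomic_testandset_int(bitmap, bit) == 0);
	}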
*/ #define atomic_cmpset_ptr atomic_cmpset_long +/* Needed for the membar functions */ +#include_next + +static __inline void +atomic_thread_fence_rel(void) +{ + /* Equivalent to their __compiler_membar() */ + __asm __volatile(" " : : : "memory"); +} + +static __inline void +atomic_thread_fence_seq_cst(void) +{ + /* Equivalent to their !KERNEL storeload_barrer() */ + __asm __volatile("lock; addl $0,-8(%%rsp)" : : : "memory", "cc"); +} + +#define mb() membar_enter() +#define rmb() membar_consumer() +#define wmb() membar_producer() + #endif /* _COMPAT_FREEBSD_AMD64_MACHINE_ATOMIC_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/cpufunc.h b/usr/src/compat/freebsd/amd64/machine/cpufunc.h index cf485e947c..0b7bcdaa59 100644 --- a/usr/src/compat/freebsd/amd64/machine/cpufunc.h +++ b/usr/src/compat/freebsd/amd64/machine/cpufunc.h @@ -16,6 +16,8 @@ #ifndef _COMPAT_FREEBSD_AMD64_MACHINE_CPUFUNC_H_ #define _COMPAT_FREEBSD_AMD64_MACHINE_CPUFUNC_H_ +#include + static __inline u_long bsfq(u_long mask) { @@ -65,6 +67,12 @@ cpuid_count(u_int ax, u_int cx, u_int *p) : "0" (ax), "c" (cx)); } +static __inline void +disable_intr(void) +{ + __asm __volatile("cli"); +} + static __inline void enable_intr(void) { @@ -95,6 +103,15 @@ flsll(long long mask) return (flsl((long)mask)); } +static __inline u_long +read_rflags(void) +{ + u_long rf; + + __asm __volatile("pushfq; popq %0" : "=r" (rf)); + return (rf); +} + static __inline uint64_t rdmsr(u_int msr) { @@ -107,10 +124,10 @@ rdmsr(u_int msr) static __inline uint64_t rdtsc(void) { - uint32_t low, high; - - __asm __volatile("rdtsc" : "=a" (low), "=d" (high)); - return (low | ((uint64_t)high << 32)); + extern hrtime_t tsc_gethrtimeunscaled_delta(void); + + /* Get the TSC reading with any needed synch offset applied */ + return ((uint64_t)tsc_gethrtimeunscaled_delta()); } static __inline void @@ -162,4 +179,133 @@ rcr4(void) return (data); } +static __inline u_long +rxcr(u_int reg) +{ + u_int low, high; + + __asm __volatile("xgetbv" : "=a" (low), "=d" (high) : "c" (reg)); + return (low | ((uint64_t)high << 32)); +} + +static __inline void +load_xcr(u_int reg, u_long val) +{ + u_int low, high; + + low = val; + high = val >> 32; + __asm __volatile("xsetbv" : : "c" (reg), "a" (low), "d" (high)); +} + +static __inline void +write_rflags(u_long rf) +{ + __asm __volatile("pushq %0; popfq" : : "r" (rf)); +} + +static __inline uint64_t +rdr0(void) +{ + uint64_t data; + __asm __volatile("movq %%dr0,%0" : "=r" (data)); + return (data); +} + +static __inline void +load_dr0(uint64_t dr0) +{ + __asm __volatile("movq %0,%%dr0" : : "r" (dr0)); +} + +static __inline uint64_t +rdr1(void) +{ + uint64_t data; + __asm __volatile("movq %%dr1,%0" : "=r" (data)); + return (data); +} + +static __inline void +load_dr1(uint64_t dr1) +{ + __asm __volatile("movq %0,%%dr1" : : "r" (dr1)); +} + +static __inline uint64_t +rdr2(void) +{ + uint64_t data; + __asm __volatile("movq %%dr2,%0" : "=r" (data)); + return (data); +} + +static __inline void +load_dr2(uint64_t dr2) +{ + __asm __volatile("movq %0,%%dr2" : : "r" (dr2)); +} + +static __inline uint64_t +rdr3(void) +{ + uint64_t data; + __asm __volatile("movq %%dr3,%0" : "=r" (data)); + return (data); +} + +static __inline void +load_dr3(uint64_t dr3) +{ + __asm __volatile("movq %0,%%dr3" : : "r" (dr3)); +} + +static __inline uint64_t +rdr6(void) +{ + uint64_t data; + __asm __volatile("movq %%dr6,%0" : "=r" (data)); + return (data); +} + +static __inline void +load_dr6(uint64_t dr6) +{ + __asm __volatile("movq %0,%%dr6" : : "r" (dr6)); +} 
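The rdrN()/load_drN() pairs defined above give the VMM raw access to the hardware debug registers. As a usage sketch only (the struct and function names are invented, not part of the patch), a host could snapshot its breakpoint state before entering a guest and restore it afterwards:

	struct host_dbg {
		uint64_t dr0, dr1, dr2, dr3, dr6;
	};

	static void
	host_dbg_save(struct host_dbg *s)
	{
		s->dr0 = rdr0();
		s->dr1 = rdr1();
		s->dr2 = rdr2();
		s->dr3 = rdr3();
		s->dr6 = rdr6();
	}

	static void
	host_dbg_restore(const struct host_dbg *s)
	{
		load_dr0(s->dr0);
		load_dr1(s->dr1);
		load_dr2(s->dr2);
		load_dr3(s->dr3);
		load_dr6(s->dr6);
	}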
+ +static __inline uint64_t +rdr7(void) +{ + uint64_t data; + __asm __volatile("movq %%dr7,%0" : "=r" (data)); + return (data); +} + +static __inline void +load_dr7(uint64_t dr7) +{ + __asm __volatile("movq %0,%%dr7" : : "r" (dr7)); +} + +#ifdef _KERNEL +/* + * Including the native sys/segments.h in userspace seriously conflicts with + * the FreeBSD compat/contrib headers. + */ +#include + +static __inline void +lldt(u_short sel) +{ + wr_ldtr(sel); +} + +static __inline u_short +sldt() +{ + return (rd_ldtr()); +} +#endif /* _KERNEL */ + #endif /* _COMPAT_FREEBSD_AMD64_MACHINE_CPUFUNC_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/fpu.h b/usr/src/compat/freebsd/amd64/machine/fpu.h index 48e686780c..6bc651d996 100644 --- a/usr/src/compat/freebsd/amd64/machine/fpu.h +++ b/usr/src/compat/freebsd/amd64/machine/fpu.h @@ -11,13 +11,12 @@ /* * Copyright 2014 Pluribus Networks Inc. + * Copyright (c) 2018, Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_AMD64_MACHINE_FPU_H_ #define _COMPAT_FREEBSD_AMD64_MACHINE_FPU_H_ -#define XSAVE_AREA_ALIGN 64 - void fpuexit(kthread_t *td); void fpurestore(void *); void fpusave(void *); diff --git a/usr/src/compat/freebsd/amd64/machine/iodev.h b/usr/src/compat/freebsd/amd64/machine/iodev.h new file mode 100644 index 0000000000..c7cdddc817 --- /dev/null +++ b/usr/src/compat/freebsd/amd64/machine/iodev.h @@ -0,0 +1,19 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_IODEV_H +#define _COMPAT_FREEBSD_AMD64_MACHINE_IODEV_H + +#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_IODEV_H */ diff --git a/usr/src/compat/freebsd/amd64/machine/md_var.h b/usr/src/compat/freebsd/amd64/machine/md_var.h index 60fdd566e5..ed57a8bebc 100644 --- a/usr/src/compat/freebsd/amd64/machine/md_var.h +++ b/usr/src/compat/freebsd/amd64/machine/md_var.h @@ -21,4 +21,8 @@ extern u_int cpu_exthigh; /* Highest arg to extended CPUID */ extern u_int cpu_id; /* Stepping ID */ extern char cpu_vendor[]; /* CPU Origin code */ +#include + +#define Maxmem (physmax + 1) + #endif /* _COMPAT_FREEBSD_AMD64_MACHINE_MD_VAR_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/param.h b/usr/src/compat/freebsd/amd64/machine/param.h index eaca5ab8d7..b152f4d526 100644 --- a/usr/src/compat/freebsd/amd64/machine/param.h +++ b/usr/src/compat/freebsd/amd64/machine/param.h @@ -36,4 +36,6 @@ /* Size of the level 4 page-map level-4 table units */ #define NPML4EPG (PAGE_SIZE/(sizeof (pml4_entry_t))) +#define CACHE_LINE_SIZE 64 + #endif /* _COMPAT_FREEBSD_AMD64_MACHINE_PARAM_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/pmap.h b/usr/src/compat/freebsd/amd64/machine/pmap.h index d0303bdd56..ce3185629b 100644 --- a/usr/src/compat/freebsd/amd64/machine/pmap.h +++ b/usr/src/compat/freebsd/amd64/machine/pmap.h @@ -1,3 +1,54 @@ +/* + * All rights reserved. This copyright notice is Copyright Management + * Information under 17 USC 1202 and is included to protect this work and + * deter copyright infringement. 
Removal or alteration of this Copyright + * Management Information without the express written permission from + * Pluribus Networks Inc is prohibited, and any such unauthorized removal + * or alteration will be a violation of federal law. + * + * Copyright (c) 2003 Peter Wemm. + * Copyright (c) 1991 Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department and William Jolitz of UUNET Technologies Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Derived from hp300 version by Mike Hibler, this version by William + * Jolitz uses a recursive map [a pde points to the page directory] to + * map the page tables using the pagetables themselves. This is done to + * reduce the impact on kernel virtual memory for lots of sparse address + * space, and to reduce the cost of memory to each process. + * + * from: hp300: @(#)pmap.h 7.2 (Berkeley) 12/16/90 + * from: @(#)pmap.h 7.4 (Berkeley) 5/12/91 + * $FreeBSD$ + */ + /* * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. @@ -13,32 +64,426 @@ * Copyright 2014 Pluribus Networks Inc. */ + #ifndef _COMPAT_FREEBSD_AMD64_MACHINE_PMAP_H_ #define _COMPAT_FREEBSD_AMD64_MACHINE_PMAP_H_ +/* + * Page-directory and page-table entries follow this format, with a few + * of the fields not present here and there, depending on a lot of things. 
+ */ /* ---- Intel Nomenclature ---- */ -#define PG_V 0x001 /* P Valid */ -#define PG_RW 0x002 /* R/W Read/Write */ -#define PG_U 0x004 /* U/S User/Supervisor */ -#define PG_A 0x020 /* A Accessed */ -#define PG_M 0x040 /* D Dirty */ -#define PG_PS 0x080 /* PS Page size (0=4k,1=2M) */ +#define X86_PG_V 0x001 /* P Valid */ +#define X86_PG_RW 0x002 /* R/W Read/Write */ +#define X86_PG_U 0x004 /* U/S User/Supervisor */ +#define X86_PG_NC_PWT 0x008 /* PWT Write through */ +#define X86_PG_NC_PCD 0x010 /* PCD Cache disable */ +#define X86_PG_A 0x020 /* A Accessed */ +#define X86_PG_M 0x040 /* D Dirty */ +#define X86_PG_PS 0x080 /* PS Page size (0=4k,1=2M) */ +#define X86_PG_PTE_PAT 0x080 /* PAT PAT index */ +#define X86_PG_G 0x100 /* G Global */ +#define X86_PG_AVAIL1 0x200 /* / Available for system */ +#define X86_PG_AVAIL2 0x400 /* < programmers use */ +#define X86_PG_AVAIL3 0x800 /* \ */ +#define X86_PG_PDE_PAT 0x1000 /* PAT PAT index */ +#define X86_PG_NX (1ul<<63) /* No-execute */ +#define X86_PG_AVAIL(x) (1ul << (x)) + +/* Page level cache control fields used to determine the PAT type */ +#define X86_PG_PDE_CACHE (X86_PG_PDE_PAT | X86_PG_NC_PWT | X86_PG_NC_PCD) +#define X86_PG_PTE_CACHE (X86_PG_PTE_PAT | X86_PG_NC_PWT | X86_PG_NC_PCD) + +/* + * Intel extended page table (EPT) bit definitions. + */ +#define EPT_PG_READ 0x001 /* R Read */ +#define EPT_PG_WRITE 0x002 /* W Write */ +#define EPT_PG_EXECUTE 0x004 /* X Execute */ +#define EPT_PG_IGNORE_PAT 0x040 /* IPAT Ignore PAT */ +#define EPT_PG_PS 0x080 /* PS Page size */ +#define EPT_PG_A 0x100 /* A Accessed */ +#define EPT_PG_M 0x200 /* D Dirty */ +#define EPT_PG_MEMORY_TYPE(x) ((x) << 3) /* MT Memory Type */ + +/* + * Define the PG_xx macros in terms of the bits on x86 PTEs. + */ +#define PG_V X86_PG_V +#define PG_RW X86_PG_RW +#define PG_U X86_PG_U +#define PG_NC_PWT X86_PG_NC_PWT +#define PG_NC_PCD X86_PG_NC_PCD +#define PG_A X86_PG_A +#define PG_M X86_PG_M +#define PG_PS X86_PG_PS +#define PG_PTE_PAT X86_PG_PTE_PAT +#define PG_G X86_PG_G +#define PG_AVAIL1 X86_PG_AVAIL1 +#define PG_AVAIL2 X86_PG_AVAIL2 +#define PG_AVAIL3 X86_PG_AVAIL3 +#define PG_PDE_PAT X86_PG_PDE_PAT +#define PG_NX X86_PG_NX +#define PG_PDE_CACHE X86_PG_PDE_CACHE +#define PG_PTE_CACHE X86_PG_PTE_CACHE + +/* Our various interpretations of the above */ +#define PG_W X86_PG_AVAIL3 /* "Wired" pseudoflag */ +#define PG_MANAGED X86_PG_AVAIL2 +#define EPT_PG_EMUL_V X86_PG_AVAIL(52) +#define EPT_PG_EMUL_RW X86_PG_AVAIL(53) +#define PG_PROMOTED X86_PG_AVAIL(54) /* PDE only */ +#define PG_FRAME (0x000ffffffffff000ul) +#define PG_PS_FRAME (0x000fffffffe00000ul) + +/* + * Promotion to a 2MB (PDE) page mapping requires that the corresponding 4KB + * (PTE) page mappings have identical settings for the following fields: + */ +#define PG_PTE_PROMOTE (PG_NX | PG_MANAGED | PG_W | PG_G | PG_PTE_CACHE | \ + PG_M | PG_A | PG_U | PG_RW | PG_V) /* * Page Protection Exception bits */ + #define PGEX_P 0x01 /* Protection violation vs. not present */ #define PGEX_W 0x02 /* during a Write cycle */ #define PGEX_U 0x04 /* access from User mode (UPL) */ #define PGEX_RSV 0x08 /* reserved PTE field is non-zero */ #define PGEX_I 0x10 /* during an instruction fetch */ +/* + * undef the PG_xx macros that define bits in the regular x86 PTEs that + * have a different position in nested PTEs. This is done when compiling + * code that needs to be aware of the differences between regular x86 and + * nested PTEs. + * + * The appropriate bitmask will be calculated at runtime based on the pmap + * type. 
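Because the EPT bit layout above shares positions with several regular x86 PTE bits, NPT-aware code cannot use a fixed PG_V. A simplified sketch of the runtime selection this comment describes, modeled on FreeBSD's pmap_valid_bit() (the PT_* constants come from the enum pmap_type declared later in this header; details such as A/D-bit emulation are omitted):

	static __inline pt_entry_t
	pmap_valid_bit_sketch(enum pmap_type type)
	{
		switch (type) {
		case PT_X86:
		case PT_RVI:		/* AMD NPT reuses the x86 layout */
			return (X86_PG_V);
		case PT_EPT:
			/* EPT has no single valid bit; readable == present */
			return (EPT_PG_READ);
		default:
			return (0);
		}
	}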
+ */ +#ifdef AMD64_NPT_AWARE +#undef PG_AVAIL1 /* X86_PG_AVAIL1 aliases with EPT_PG_M */ +#undef PG_G +#undef PG_A +#undef PG_M +#undef PG_PDE_PAT +#undef PG_PDE_CACHE +#undef PG_PTE_PAT +#undef PG_PTE_CACHE +#undef PG_RW +#undef PG_V +#endif + +/* + * Pte related macros. This is complicated by having to deal with + * the sign extension of the 48th bit. + */ +#define KVADDR(l4, l3, l2, l1) ( \ + ((unsigned long)-1 << 47) | \ + ((unsigned long)(l4) << PML4SHIFT) | \ + ((unsigned long)(l3) << PDPSHIFT) | \ + ((unsigned long)(l2) << PDRSHIFT) | \ + ((unsigned long)(l1) << PAGE_SHIFT)) + +#define UVADDR(l4, l3, l2, l1) ( \ + ((unsigned long)(l4) << PML4SHIFT) | \ + ((unsigned long)(l3) << PDPSHIFT) | \ + ((unsigned long)(l2) << PDRSHIFT) | \ + ((unsigned long)(l1) << PAGE_SHIFT)) + +/* + * Number of kernel PML4 slots. Can be anywhere from 1 to 64 or so, + * but setting it larger than NDMPML4E makes no sense. + * + * Each slot provides .5 TB of kernel virtual space. + */ +#define NKPML4E 4 + +#define NUPML4E (NPML4EPG/2) /* number of userland PML4 pages */ +#define NUPDPE (NUPML4E*NPDPEPG)/* number of userland PDP pages */ +#define NUPDE (NUPDPE*NPDEPG) /* number of userland PD entries */ + +/* + * NDMPML4E is the maximum number of PML4 entries that will be + * used to implement the direct map. It must be a power of two, + * and should generally exceed NKPML4E. The maximum possible + * value is 64; using 128 will make the direct map intrude into + * the recursive page table map. + */ +#define NDMPML4E 8 + +/* + * These values control the layout of virtual memory. The starting address + * of the direct map, which is controlled by DMPML4I, must be a multiple of + * its size. (See the PHYS_TO_DMAP() and DMAP_TO_PHYS() macros.) + * + * Note: KPML4I is the index of the (single) level 4 page that maps + * the KVA that holds KERNBASE, while KPML4BASE is the index of the + * first level 4 page that maps VM_MIN_KERNEL_ADDRESS. If NKPML4E + * is 1, these are the same, otherwise KPML4BASE < KPML4I and extra + * level 4 PDEs are needed to map from VM_MIN_KERNEL_ADDRESS up to + * KERNBASE. + * + * (KPML4I combines with KPDPI to choose where KERNBASE starts. + * Or, in other words, KPML4I provides bits 39..47 of KERNBASE, + * and KPDPI provides bits 30..38.) + */ +#define PML4PML4I (NPML4EPG/2) /* Index of recursive pml4 mapping */ + +#define KPML4BASE (NPML4EPG-NKPML4E) /* KVM at highest addresses */ +#define DMPML4I rounddown(KPML4BASE-NDMPML4E, NDMPML4E) /* Below KVM */ + +#define KPML4I (NPML4EPG-1) +#define KPDPI (NPDPEPG-2) /* kernbase at -2GB */ + +/* + * XXX doesn't really belong here I guess... + */ +#define ISA_HOLE_START 0xa0000 +#define ISA_HOLE_LENGTH (0x100000-ISA_HOLE_START) + +#define PMAP_PCID_NONE 0xffffffff +#define PMAP_PCID_KERN 0 +#define PMAP_PCID_OVERMAX 0x1000 + +#ifndef LOCORE + +#ifdef __FreeBSD__ +#include +#include +#include +#include + +#include +#endif /* __FreeBSD__ */ + typedef u_int64_t pd_entry_t; typedef u_int64_t pt_entry_t; typedef u_int64_t pdp_entry_t; typedef u_int64_t pml4_entry_t; +/* + * Address of current address space page table maps and directories. 
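A quick worked example of the sign extension KVADDR() performs: with NPML4EPG == PAGE_SIZE / sizeof (pml4_entry_t) == 512, the recursive slot PML4PML4I is 256, and KVADDR(256, 0, 0, 0) ORs bit 47 (256UL << 39) into the already sign-extended top bits, yielding the canonical address 0xffff800000000000. A standalone check:

	#include <stdio.h>

	int
	main(void)
	{
		unsigned long va = ((unsigned long)-1 << 47) | (256UL << 39);

		(void) printf("%#lx\n", va);	/* 0xffff800000000000 */
		return (0);
	}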
+ */ +#ifdef _KERNEL +#define addr_PTmap (KVADDR(PML4PML4I, 0, 0, 0)) +#define addr_PDmap (KVADDR(PML4PML4I, PML4PML4I, 0, 0)) +#define addr_PDPmap (KVADDR(PML4PML4I, PML4PML4I, PML4PML4I, 0)) +#define addr_PML4map (KVADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I)) +#define addr_PML4pml4e (addr_PML4map + (PML4PML4I * sizeof(pml4_entry_t))) +#define PTmap ((pt_entry_t *)(addr_PTmap)) +#define PDmap ((pd_entry_t *)(addr_PDmap)) +#define PDPmap ((pd_entry_t *)(addr_PDPmap)) +#define PML4map ((pd_entry_t *)(addr_PML4map)) +#define PML4pml4e ((pd_entry_t *)(addr_PML4pml4e)) + +extern int nkpt; /* Initial number of kernel page tables */ +extern u_int64_t KPDPphys; /* physical address of kernel level 3 */ +extern u_int64_t KPML4phys; /* physical address of kernel level 4 */ + +/* + * virtual address to page table entry and + * to physical address. + * Note: these work recursively, thus vtopte of a pte will give + * the corresponding pde that in turn maps it. + */ +pt_entry_t *vtopte(vm_offset_t); #define vtophys(va) pmap_kextract(((vm_offset_t) (va))) -vm_paddr_t pmap_kextract(vm_offset_t va); +#ifndef __FreeBSD__ +extern vm_paddr_t pmap_kextract(vm_offset_t); +#endif + +#define pte_load_store(ptep, pte) atomic_swap_long(ptep, pte) +#define pte_load_clear(ptep) atomic_swap_long(ptep, 0) +#define pte_store(ptep, pte) do { \ + *(u_long *)(ptep) = (u_long)(pte); \ +} while (0) +#define pte_clear(ptep) pte_store(ptep, 0) + +#define pde_store(pdep, pde) pte_store(pdep, pde) + +extern pt_entry_t pg_nx; + +#endif /* _KERNEL */ + +#ifdef __FreeBSD__ +/* + * Pmap stuff + */ +struct pv_entry; +struct pv_chunk; + +/* + * Locks + * (p) PV list lock + */ +struct md_page { + TAILQ_HEAD(, pv_entry) pv_list; /* (p) */ + int pv_gen; /* (p) */ + int pat_mode; +}; +#endif /* __FreeBSD__ */ + +enum pmap_type { + PT_X86, /* regular x86 page tables */ + PT_EPT, /* Intel's nested page tables */ + PT_RVI, /* AMD's nested page tables */ +}; + +#ifdef __FreeBSD__ +struct pmap_pcids { + uint32_t pm_pcid; + uint32_t pm_gen; +}; + +/* + * The kernel virtual address (KVA) of the level 4 page table page is always + * within the direct map (DMAP) region. 
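The recursion the comment above describes is what makes vtopte() a pure shift-and-mask. A sketch of how FreeBSD implements the prototype declared above (the 36-bit mask is the sum of the four 9-bit index widths, NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT; this is illustrative, not the patch's code):

	static __inline pt_entry_t *
	vtopte_sketch(vm_offset_t va)
	{
		uint64_t mask = (1ul << 36) - 1;

		/* Index into the recursive PTmap window defined above. */
		return (PTmap + ((va >> PAGE_SHIFT) & mask));
	}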
+ */ +struct pmap { + struct mtx pm_mtx; + pml4_entry_t *pm_pml4; /* KVA of level 4 page table */ + uint64_t pm_cr3; + TAILQ_HEAD(,pv_chunk) pm_pvchunk; /* list of mappings in pmap */ + cpuset_t pm_active; /* active on cpus */ + enum pmap_type pm_type; /* regular or nested tables */ + struct pmap_statistics pm_stats; /* pmap statistics */ + struct vm_radix pm_root; /* spare page table pages */ + long pm_eptgen; /* EPT pmap generation id */ + int pm_flags; + struct pmap_pcids pm_pcids[MAXCPU]; +}; +#endif /* __FreeBSD__ */ + +/* flags */ +#define PMAP_NESTED_IPIMASK 0xff +#define PMAP_PDE_SUPERPAGE (1 << 8) /* supports 2MB superpages */ +#define PMAP_EMULATE_AD_BITS (1 << 9) /* needs A/D bits emulation */ +#define PMAP_SUPPORTS_EXEC_ONLY (1 << 10) /* execute only mappings ok */ + +typedef struct pmap *pmap_t; + +#ifdef _KERNEL +extern struct pmap kernel_pmap_store; +#define kernel_pmap (&kernel_pmap_store) + +#define PMAP_LOCK(pmap) mtx_lock(&(pmap)->pm_mtx) +#define PMAP_LOCK_ASSERT(pmap, type) \ + mtx_assert(&(pmap)->pm_mtx, (type)) +#define PMAP_LOCK_DESTROY(pmap) mtx_destroy(&(pmap)->pm_mtx) +#define PMAP_LOCK_INIT(pmap) mtx_init(&(pmap)->pm_mtx, "pmap", \ + NULL, MTX_DEF | MTX_DUPOK) +#define PMAP_LOCKED(pmap) mtx_owned(&(pmap)->pm_mtx) +#define PMAP_MTX(pmap) (&(pmap)->pm_mtx) +#define PMAP_TRYLOCK(pmap) mtx_trylock(&(pmap)->pm_mtx) +#define PMAP_UNLOCK(pmap) mtx_unlock(&(pmap)->pm_mtx) + +int pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags); +int pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype); +#endif + +#ifdef __FreeBSD__ +/* + * For each vm_page_t, there is a list of all currently valid virtual + * mappings of that page. An entry is a pv_entry_t, the list is pv_list. + */ +typedef struct pv_entry { + vm_offset_t pv_va; /* virtual address for mapping */ + TAILQ_ENTRY(pv_entry) pv_next; +} *pv_entry_t; + +/* + * pv_entries are allocated in chunks per-process. This avoids the + * need to track per-pmap assignments. 
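Allocation from a chunk is a bitmap scan over the pc_map words of the pv_chunk structure defined just below, where a set bit means the slot is free. A simplified sketch modeled on FreeBSD's get_pv_entry(), using the bsfq() helper from the compat cpufunc.h (locking and the per-pmap chunk lists are omitted):

	static struct pv_entry *
	pv_alloc_from_chunk(struct pv_chunk *pc)
	{
		int field, bit;

		for (field = 0; field < _NPCM; field++) {
			if (pc->pc_map[field] != 0) {
				/* Lowest set bit is the first free slot. */
				bit = bsfq(pc->pc_map[field]);
				pc->pc_map[field] &= ~(1ul << bit);
				/* 64 slots per pc_map word on LP64. */
				return (&pc->pc_pventry[field * 64 + bit]);
			}
		}
		return (NULL);	/* chunk exhausted */
	}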
+ */ +#define _NPCM 3 +#define _NPCPV 168 +struct pv_chunk { + pmap_t pc_pmap; + TAILQ_ENTRY(pv_chunk) pc_list; + uint64_t pc_map[_NPCM]; /* bitmap; 1 = free */ + TAILQ_ENTRY(pv_chunk) pc_lru; + struct pv_entry pc_pventry[_NPCPV]; +}; + +#ifdef _KERNEL + +extern caddr_t CADDR1; +extern pt_entry_t *CMAP1; +extern vm_paddr_t phys_avail[]; +extern vm_paddr_t dump_avail[]; +extern vm_offset_t virtual_avail; +extern vm_offset_t virtual_end; +extern vm_paddr_t dmaplimit; +extern int pmap_pcid_enabled; +extern int invpcid_works; + +#define pmap_page_get_memattr(m) ((vm_memattr_t)(m)->md.pat_mode) +#define pmap_page_is_write_mapped(m) (((m)->aflags & PGA_WRITEABLE) != 0) +#define pmap_unmapbios(va, sz) pmap_unmapdev((va), (sz)) + +struct thread; + +void pmap_activate_sw(struct thread *); +void pmap_bootstrap(vm_paddr_t *); +int pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde); +int pmap_change_attr(vm_offset_t, vm_size_t, int); +void pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate); +void pmap_init_pat(void); +void pmap_kenter(vm_offset_t va, vm_paddr_t pa); +void *pmap_kenter_temporary(vm_paddr_t pa, int i); +vm_paddr_t pmap_kextract(vm_offset_t); +void pmap_kremove(vm_offset_t); +void *pmap_mapbios(vm_paddr_t, vm_size_t); +void *pmap_mapdev(vm_paddr_t, vm_size_t); +void *pmap_mapdev_attr(vm_paddr_t, vm_size_t, int); +boolean_t pmap_page_is_mapped(vm_page_t m); +void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma); +void pmap_pinit_pml4(vm_page_t); +void pmap_unmapdev(vm_offset_t, vm_size_t); +void pmap_invalidate_page(pmap_t, vm_offset_t); +void pmap_invalidate_range(pmap_t, vm_offset_t, vm_offset_t); +void pmap_invalidate_all(pmap_t); +void pmap_invalidate_cache(void); +void pmap_invalidate_cache_pages(vm_page_t *pages, int count); +void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, + boolean_t force); +void pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num); +boolean_t pmap_map_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t); +void pmap_unmap_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t); +#endif /* _KERNEL */ + +/* Return various clipped indexes for a given VA */ +static __inline vm_pindex_t +pmap_pte_index(vm_offset_t va) +{ + + return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1)); +} + +static __inline vm_pindex_t +pmap_pde_index(vm_offset_t va) +{ + + return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1)); +} + +static __inline vm_pindex_t +pmap_pdpe_index(vm_offset_t va) +{ + + return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1)); +} + +static __inline vm_pindex_t +pmap_pml4e_index(vm_offset_t va) +{ + + return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1)); +} + +#endif /* __FreeBSD__ */ +#endif /* !LOCORE */ -#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_PMAP_H_ */ +#endif /* !_COMPAT_FREEBSD_AMD64_MACHINE_PMAP_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/reg.h b/usr/src/compat/freebsd/amd64/machine/reg.h new file mode 100644 index 0000000000..4a73463603 --- /dev/null +++ b/usr/src/compat/freebsd/amd64/machine/reg.h @@ -0,0 +1,23 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. 
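The clipped-index helpers above (pmap_pte_index() through pmap_pml4e_index()) split a virtual address into its four page-table indexes, one 9-bit field per paging level. A usage sketch (the wrapper is invented; the helpers themselves are as defined above):

	static void
	va_indexes(vm_offset_t va, vm_pindex_t idx[4])
	{
		idx[0] = pmap_pml4e_index(va);	/* level 4 */
		idx[1] = pmap_pdpe_index(va);	/* level 3 */
		idx[2] = pmap_pde_index(va);	/* level 2 */
		idx[3] = pmap_pte_index(va);	/* level 1 */
	}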
+ */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_REG_H_ +#define _COMPAT_FREEBSD_AMD64_MACHINE_REG_H_ + +#define DBREG_DR6_RESERVED1 0xffff0ff0 +#define DBREG_DR7_RESERVED1 0x0400 + + +#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_REG_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/smp.h b/usr/src/compat/freebsd/amd64/machine/smp.h index ef719b9684..9c4f2d111b 100644 --- a/usr/src/compat/freebsd/amd64/machine/smp.h +++ b/usr/src/compat/freebsd/amd64/machine/smp.h @@ -11,9 +11,20 @@ /* * Copyright 2013 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_AMD64_MACHINE_SMP_H_ #define _COMPAT_FREEBSD_AMD64_MACHINE_SMP_H_ +#ifdef _KERNEL + +/* + * APIC-related functions are replaced with native calls rather than shims + * which attempt to replicate the FreeBSD interfaces. This is empty, but will + * remain present to appease sources which wish to include the path. + */ + +#endif /* _KERNEL */ + #endif /* _COMPAT_FREEBSD_AMD64_MACHINE_SMP_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/specialreg.h b/usr/src/compat/freebsd/amd64/machine/specialreg.h new file mode 100644 index 0000000000..871573ea6b --- /dev/null +++ b/usr/src/compat/freebsd/amd64/machine/specialreg.h @@ -0,0 +1,61 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_SPECIALREG_H_ +#define _COMPAT_FREEBSD_AMD64_MACHINE_SPECIALREG_H_ + +#ifdef _SYS_X86_ARCHEXT_H +/* Our x86_archext conflicts with BSD header for the XFEATURE_ defines */ +#undef XFEATURE_AVX +#undef XFEATURE_MPX +#undef XFEATURE_AVX512 +#endif + +#ifdef _SYS_CONTROLREGS_H +/* Our CR4 defines conflict with BSD header */ +#undef CR4_VME +#undef CR4_PVI +#undef CR4_TSD +#undef CR4_DE +#undef CR4_PSE +#undef CR4_PAE +#undef CR4_MCE +#undef CR4_PGE +#undef CR4_PCE +#undef CR4_VMXE +#undef CR4_SMEP +#undef CR4_SMAP +#undef CR4_PKE +#undef CR4_PCIDE +#endif /* _SYS_CONTROLREGS_H */ + +#ifdef _SYS_X86_ARCHEXT_H +/* Our IA32 speculation-related defines conflict with BSD header */ +#undef IA32_ARCH_CAP_RDCL_NO +#undef IA32_ARCH_CAP_IBRS_ALL +#undef IA32_ARCH_CAP_RSBA +#undef IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY +#undef IA32_ARCH_CAP_SSB_NO +#undef IA32_ARCH_CAP_MDS_NO +#undef IA32_SPEC_CTRL_IBRS +#undef IA32_SPEC_CTRL_STIBP +#undef IA32_SPEC_CTRL_SSBD +#undef IA32_FLUSH_CMD_L1D +#undef MSR_IA32_SPEC_CTRL +#undef MSR_IA32_PRED_CMD +#endif /* _SYS_X86_ARCHEXT_H */ + +#include +#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_SPECIALREG_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/vmm.h b/usr/src/compat/freebsd/amd64/machine/vmm.h index 79c3ec959e..1c54c0830d 100644 --- a/usr/src/compat/freebsd/amd64/machine/vmm.h +++ b/usr/src/compat/freebsd/amd64/machine/vmm.h @@ -11,11 +11,14 @@ /* * Copyright 2013 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. 
*/ #ifndef _COMPAT_FREEBSD_AMD64_MACHINE_VMM_H_ #define _COMPAT_FREEBSD_AMD64_MACHINE_VMM_H_ +#include + #include #endif /* _COMPAT_FREEBSD_AMD64_MACHINE_VMM_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/vmparam.h b/usr/src/compat/freebsd/amd64/machine/vmparam.h index c80c2af545..c76a3259f3 100644 --- a/usr/src/compat/freebsd/amd64/machine/vmparam.h +++ b/usr/src/compat/freebsd/amd64/machine/vmparam.h @@ -11,9 +11,35 @@ /* * Copyright 2013 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_AMD64_MACHINE_VMPARAM_H_ #define _COMPAT_FREEBSD_AMD64_MACHINE_VMPARAM_H_ +extern caddr_t kpm_vbase; +extern size_t kpm_size; + +static inline uintptr_t +phys_to_dmap(uintptr_t pa) +{ + ASSERT3U(pa, <, kpm_size); + return ((uintptr_t)kpm_vbase + pa); +} + +static inline uintptr_t +dmap_to_phys(uintptr_t kva) +{ + const uintptr_t base = (uintptr_t)kpm_vbase; + + ASSERT3U(kva, >=, base); + ASSERT3U(kva, <, base + kpm_size); + + return (kva - base); +} + +#define PHYS_TO_DMAP(x) phys_to_dmap(x) +#define DMAP_TO_PHYS(x) dmap_to_phys(x) + + #endif /* _COMPAT_FREEBSD_AMD64_MACHINE_VMPARAM_H_ */ diff --git a/usr/src/compat/freebsd/err.h b/usr/src/compat/freebsd/err.h new file mode 100644 index 0000000000..40d144e025 --- /dev/null +++ b/usr/src/compat/freebsd/err.h @@ -0,0 +1,23 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#ifndef _COMPAT_FREEBSD_ERR_H_ +#define _COMPAT_FREEBSD_ERR_H_ + +#define errc(code, num, ...) err(code, __VA_ARGS__) + +#include_next + +#endif /* _COMPAT_FREEBSD_ERR_H_ */ diff --git a/usr/src/compat/freebsd/libutil.h b/usr/src/compat/freebsd/libutil.h index e22ffc0551..f899d4425e 100644 --- a/usr/src/compat/freebsd/libutil.h +++ b/usr/src/compat/freebsd/libutil.h @@ -17,5 +17,19 @@ #define _COMPAT_FREEBSD_LIBUTIL_H_ int expand_number(const char *_buf, uint64_t *_num); +int humanize_number(char *_buf, size_t _len, int64_t _number, + const char *_suffix, int _scale, int _flags); + +/* Values for humanize_number(3)'s flags parameter. */ +#define HN_DECIMAL 0x01 +#define HN_NOSPACE 0x02 +#define HN_B 0x04 +#define HN_DIVISOR_1000 0x08 +#define HN_IEC_PREFIXES 0x10 + +/* Values for humanize_number(3)'s scale parameter. */ +#define HN_GETSCALE 0x10 +#define HN_AUTOSCALE 0x20 + #endif /* _COMPAT_FREEBSD_LIBUTIL_H_ */ diff --git a/usr/src/compat/freebsd/net/ethernet.h b/usr/src/compat/freebsd/net/ethernet.h index a0d5a828c6..dcd3a58925 100644 --- a/usr/src/compat/freebsd/net/ethernet.h +++ b/usr/src/compat/freebsd/net/ethernet.h @@ -11,11 +11,25 @@ /* * Copyright 2013 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_SYS_NET_ETHERNET_H_ #define _COMPAT_FREEBSD_SYS_NET_ETHERNET_H_ +#define ether_addr_octet octet + #include +/* + * Some basic Ethernet constants. + */ +#define ETHER_ADDR_LEN 6 /* length of an Ethernet address */ +#define ETHER_CRC_LEN 4 /* length of the Ethernet CRC */ +#define ETHER_MIN_LEN 64 /* minimum frame len, including CRC */ + +#define ETHER_VLAN_ENCAP_LEN 4 /* len of 802.1Q VLAN encapsulation */ + +#define ETHER_IS_MULTICAST(addr) (*(addr) & 0x01) /* is address mcast/bcast? 
*/ + #endif /* _COMPAT_FREEBSD_SYS_NET_ETHERNET_H_ */ diff --git a/usr/src/compat/freebsd/pthread_np.h b/usr/src/compat/freebsd/pthread_np.h index 641c58f406..c4f76b259c 100644 --- a/usr/src/compat/freebsd/pthread_np.h +++ b/usr/src/compat/freebsd/pthread_np.h @@ -11,6 +11,7 @@ /* * Copyright 2014 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_PTHREAD_NP_H_ @@ -20,8 +21,9 @@ #include #include +#include -#define pthread_set_name_np(thread, name) +#define pthread_set_name_np pthread_setname_np #define pthread_mutex_isowned_np(x) _mutex_held(x) diff --git a/usr/src/compat/freebsd/sys/_cpuset.h b/usr/src/compat/freebsd/sys/_cpuset.h new file mode 100644 index 0000000000..286d26fc00 --- /dev/null +++ b/usr/src/compat/freebsd/sys/_cpuset.h @@ -0,0 +1,33 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS__CPUSET_H_ +#define _COMPAT_FREEBSD_SYS__CPUSET_H_ + +#ifdef _KERNEL +/* + * The sys/_cpuset.h header is used to communicate the layout of cpuset_t while + * sys/cpuset.h contains the manipulation routines. + * + * The explicit guard definition below is necessary as other contrib headers + * change their behavior based on its presence. + */ +#define _SYS__CPUSET_H_ + +#include + +#endif /* _KERNEL */ + +#endif /* _COMPAT_FREEBSD_SYS__CPUSET_H_ */ diff --git a/usr/src/compat/freebsd/sys/callout.h b/usr/src/compat/freebsd/sys/callout.h index 17b6e31507..6087a09f54 100644 --- a/usr/src/compat/freebsd/sys/callout.h +++ b/usr/src/compat/freebsd/sys/callout.h @@ -11,6 +11,7 @@ /* * Copyright 2014 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_SYS_CALLOUT_H_ @@ -41,6 +42,9 @@ int vmm_glue_callout_reset_sbt(struct callout *c, sbintime_t sbt, int vmm_glue_callout_stop(struct callout *c); int vmm_glue_callout_drain(struct callout *c); +/* illumos-custom function for resource locality optimization */ +void vmm_glue_callout_localize(struct callout *c); + static __inline void callout_init(struct callout *c, int mpsafe) { diff --git a/usr/src/compat/freebsd/sys/cdefs.h b/usr/src/compat/freebsd/sys/cdefs.h index 974e323dbe..0b857437e3 100644 --- a/usr/src/compat/freebsd/sys/cdefs.h +++ b/usr/src/compat/freebsd/sys/cdefs.h @@ -11,48 +11,67 @@ /* * Copyright 2013 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_SYS_CDEFS_H_ #define _COMPAT_FREEBSD_SYS_CDEFS_H_ +/* + * Testing against Clang-specific extensions. + */ +#ifndef __has_extension +#define __has_extension __has_feature +#endif +#ifndef __has_feature +#define __has_feature(x) 0 +#endif + +/* + * Macro to test if we're using a specific version of gcc or later. 
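The __has_extension/__has_feature probes above, together with the version test that follows, let the header pick a working _Static_assert for each compiler. With the __COUNTER__ fallback in the ladder below, the assertion expands to a typedef of a char array whose size becomes -1 when the condition fails, turning the check into a compile error. A usage sketch against this compat header (the message text is illustrative):

	#include <sys/cdefs.h>

	/* Breaks the build, rather than the guest, if the assumption slips. */
	static_assert(sizeof (void *) == 8, "amd64 (LP64) build expected");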
+ */ +#if defined(__GNUC__) && !defined(__INTEL_COMPILER) +#define __GNUC_PREREQ__(ma, mi) \ + (__GNUC__ > (ma) || __GNUC__ == (ma) && __GNUC_MINOR__ >= (mi)) +#else +#define __GNUC_PREREQ__(ma, mi) 0 +#endif + #define __FBSDID(s) #ifdef __GNUC__ +#define asm __asm #define inline __inline #define __GNUCLIKE___SECTION 1 #define __dead2 __attribute__((__noreturn__)) -#define __unused __attribute__((__unused__)) #define __used __attribute__((__used__)) #define __packed __attribute__((__packed__)) #define __aligned(x) __attribute__((__aligned__(x))) #define __section(x) __attribute__((__section__(x))) +#define __weak_symbol __attribute__((__weak__)) #endif -/* - * The __CONCAT macro is used to concatenate parts of symbol names, e.g. - * with "#define OLD(foo) __CONCAT(old,foo)", OLD(foo) produces oldfoo. - * The __CONCAT macro is a bit tricky to use if it must work in non-ANSI - * mode -- there must be no spaces between its arguments, and for nested - * __CONCAT's, all the __CONCAT's must be at the left. __CONCAT can also - * concatenate double-quoted strings produced by the __STRING macro, but - * this only works with ANSI C. - * - * __XSTRING is like __STRING, but it expands any macros in its argument - * first. It is only available with ANSI C. - */ -#if defined(__STDC__) || defined(__cplusplus) -#define __P(protos) protos /* full-blown ANSI C */ -#define __CONCAT1(x,y) x ## y -#define __CONCAT(x,y) __CONCAT1(x,y) -#define __STRING(x) #x /* stringify without expanding x */ -#define __XSTRING(x) __STRING(x) /* expand x, then stringify */ -#else /* !(__STDC__ || __cplusplus) */ -#define __P(protos) () /* traditional C preprocessor */ -#define __CONCAT(x,y) x/**/y -#define __STRING(x) "x" -#endif /* !(__STDC__ || __cplusplus) */ +#if !defined(__STDC_VERSION__) || __STDC_VERSION__ < 201112L || defined(lint) + +#if !__has_extension(c_static_assert) +#if (defined(__cplusplus) && __cplusplus >= 201103L) || \ + __has_extension(cxx_static_assert) +#define _Static_assert(x, y) static_assert(x, y) +#elif __GNUC_PREREQ__(4,6) +/* Nothing, gcc 4.6 and higher has _Static_assert built-in */ +#elif defined(__COUNTER__) +#define _Static_assert(x, y) __Static_assert(x, __COUNTER__) +#define __Static_assert(x, y) ___Static_assert(x, y) +#define ___Static_assert(x, y) typedef char __assert_ ## y[(x) ? 1 : -1] \ + __unused +#else +#define _Static_assert(x, y) struct __hack +#endif +#endif +#define static_assert(x, y) _Static_assert(x, y) + +#endif /* __STDC_VERSION__ || __STDC_VERSION__ < 201112L */ #endif /* _COMPAT_FREEBSD_SYS_CDEFS_H_ */ diff --git a/usr/src/compat/freebsd/sys/clock.h b/usr/src/compat/freebsd/sys/clock.h new file mode 100644 index 0000000000..ebf7f171a3 --- /dev/null +++ b/usr/src/compat/freebsd/sys/clock.h @@ -0,0 +1,110 @@ +/*- + * Copyright (c) 1996 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Gordon W. Ross + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. 
AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * $NetBSD: clock_subr.h,v 1.7 2000/10/03 13:41:07 tsutsui Exp $ + * + * + * This file is the central clearing-house for calendrical issues. + * + * In general the kernel does not know about minutes, hours, days, timezones, + * daylight saving time, leap-years and such. All that is theoretically a + * matter for userland only. + * + * Parts of the kernel code do, however, care: badly designed filesystems + * store timestamps in local time and RTC chips sometimes track time in a + * local timezone instead of UTC and so on. + * + * All that code should go here for service. + * + * $FreeBSD$ + */ + +#ifndef _COMPAT_FREEBSD_SYS_CLOCK_H_ +#define _COMPAT_FREEBSD_SYS_CLOCK_H_ + +#include_next + +#ifdef _KERNEL /* No user serviceable parts */ + +#ifdef __FreeBSD__ +/* + * Timezone info from settimeofday(2), usually not used + */ +extern int tz_minuteswest; +extern int tz_dsttime; +extern struct mtx resettodr_lock; + +int utc_offset(void); +#endif /* __FreeBSD__ */ + +/* + * Structure to hold the values typically reported by time-of-day clocks. + * This can be passed to the generic conversion functions to be converted + * to a struct timespec. + */ +struct clocktime { + int year; /* year (4 digit year) */ + int mon; /* month (1 - 12) */ + int day; /* day (1 - 31) */ + int hour; /* hour (0 - 23) */ + int min; /* minute (0 - 59) */ + int sec; /* second (0 - 59) */ + int dow; /* day of week (0 - 6; 0 = Sunday) */ + long nsec; /* nano seconds */ +}; + +int clock_ct_to_ts(struct clocktime *, struct timespec *); +void clock_ts_to_ct(struct timespec *, struct clocktime *); +#ifdef __FreeBSD__ +void clock_register(device_t, long); +#endif + +#ifndef __FreeBSD__ +extern u_char const bin2bcd_data[]; +#define bin2bcd(x) (bin2bcd_data[x]) +#endif + +/* + * BCD to decimal and decimal to BCD. + */ +#define FROMBCD(x) bcd2bin(x) +#define TOBCD(x) bin2bcd(x) + +/* Some handy constants. */ +#define SECDAY (24 * 60 * 60) +#define SECYR (SECDAY * 365) + +/* Traditional POSIX base year */ +#define POSIX_BASE_YEAR 1970 + +void timespec2fattime(struct timespec *tsp, int utc, u_int16_t *ddp, u_int16_t *dtp, u_int8_t *dhp); +void fattime2timespec(unsigned dd, unsigned dt, unsigned dh, int utc, struct timespec *tsp); + +#endif /* _KERNEL */ + +#endif /* _COMPAT_FREEBSD_SYS_CLOCK_H_ */ diff --git a/usr/src/compat/freebsd/sys/cpuset.h b/usr/src/compat/freebsd/sys/cpuset.h index 8527624b5e..626b323d7d 100644 --- a/usr/src/compat/freebsd/sys/cpuset.h +++ b/usr/src/compat/freebsd/sys/cpuset.h @@ -11,6 +11,7 @@ /* * Copyright 2014 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc.
*/ #ifndef _COMPAT_FREEBSD_SYS_CPUSET_H_ @@ -19,26 +20,115 @@ #define NOCPU -1 #ifdef _KERNEL -#define CPU_SET(cpu, set) CPUSET_ADD(*(set), cpu) -#define CPU_SETOF(cpu, set) CPUSET_ONLY(*(set), cpu) -#define CPU_ZERO(set) CPUSET_ZERO(*(set)) -#define CPU_CLR(cpu, set) CPUSET_DEL(*(set), cpu) + +#include + +#define CPU_SET(cpu, set) cpuset_add((set), (cpu)) +#define CPU_SETOF(cpu, set) cpuset_only((set), (cpu)) +#define CPU_ZERO(set) cpuset_zero((cpuset_t *)(set)) +#define CPU_CLR(cpu, set) cpuset_del((set), (cpu)) +#define CPU_EMPTY(set) cpuset_isnull((set)) #define CPU_FFS(set) cpusetobj_ffs(set) -#define CPU_ISSET(cpu, set) CPU_IN_SET(*(set), cpu) -#define CPU_CMP(set1, set2) CPUSET_ISEQUAL(*(set1), *(set2)) -#define CPU_SET_ATOMIC(cpu, set) CPUSET_ATOMIC_ADD(*(set), cpu) +#define CPU_ISSET(cpu, set) cpu_in_set((cpuset_t *)(set), (cpu)) +#define CPU_AND(dst, src) cpuset_and( \ + (cpuset_t *)(dst), \ + (cpuset_t *)(src)) +#define CPU_OR(dst, src) cpuset_or( \ + (cpuset_t *)(dst), \ + (cpuset_t *)(src)) +#define CPU_CMP(set1, set2) (cpuset_isequal( \ + (cpuset_t *)(set1), \ + (cpuset_t *)(set2)) == 0) +#define CPU_SET_ATOMIC(cpu, set) cpuset_atomic_add( \ + (cpuset_t *)(set), \ + (cpu)) +#define CPU_CLR_ATOMIC(cpu, set) cpuset_atomic_del( \ + (cpuset_t *)(set), \ + (cpu)) + +#define CPU_SET_ATOMIC_ACQ(cpu, set) cpuset_atomic_add((set), (cpu)) -#include int cpusetobj_ffs(const cpuset_t *set); + #else + +#include #include +#include + +/* For now, assume NCPU of 256 */ +#define CPU_SETSIZE (256) + +typedef struct { + ulong_t _bits[BT_BITOUL(CPU_SETSIZE)]; +} cpuset_t; + +static __inline int +cpuset_isempty(const cpuset_t *set) +{ + uint_t i; -typedef int cpuset_t; + for (i = 0; i < BT_BITOUL(CPU_SETSIZE); i++) { + if (set->_bits[i] != 0) + return (0); + } + return (1); } -#define CPUSET(cpu) (1UL << (cpu)) +static __inline void +cpuset_zero(cpuset_t *dst) +{ + uint_t i; + + for (i = 0; i < BT_BITOUL(CPU_SETSIZE); i++) { + dst->_bits[i] = 0; + } +} + +static __inline int +cpuset_isequal(cpuset_t *s1, cpuset_t *s2) +{ + uint_t i; + + for (i = 0; i < BT_BITOUL(CPU_SETSIZE); i++) { + if (s1->_bits[i] != s2->_bits[i]) + return (0); + } + return (1); +} + +static __inline uint_t +cpusetobj_ffs(const cpuset_t *set) +{ + uint_t i, cbit; + + cbit = 0; + for (i = 0; i < BT_BITOUL(CPU_SETSIZE); i++) { + if (set->_bits[i] != 0) { + cbit = ffsl(set->_bits[i]); + cbit += i * sizeof (set->_bits[0]) * NBBY; + break; + } + } + return (cbit); +} + + +#define CPU_SET(cpu, setp) BT_SET((setp)->_bits, cpu) +#define CPU_CLR(cpu, setp) BT_CLEAR((setp)->_bits, cpu) +#define CPU_ZERO(setp) cpuset_zero((setp)) +#define CPU_CMP(set1, set2) (cpuset_isequal( \ + (cpuset_t *)(set1), \ + (cpuset_t *)(set2)) == 0) +#define CPU_FFS(set) cpusetobj_ffs(set) +#define CPU_ISSET(cpu, setp) BT_TEST((setp)->_bits, cpu) +#define CPU_EMPTY(setp) cpuset_isempty((setp)) +#define CPU_SET_ATOMIC(cpu, setp) \ + atomic_set_long(&(BT_WIM((setp)->_bits, cpu)), BT_BIW(cpu)) +#define CPU_CLR_ATOMIC(cpu, setp) \ + atomic_clear_long(&(BT_WIM((setp)->_bits, cpu)), BT_BIW(cpu)) -#define CPU_SET_ATOMIC(cpu, set) atomic_set_int((set), CPUSET(cpu)) #endif #endif /* _COMPAT_FREEBSD_SYS_CPUSET_H_ */ diff --git a/usr/src/compat/freebsd/sys/endian.h b/usr/src/compat/freebsd/sys/endian.h index a31bff55d6..24ea02d251 100644 --- a/usr/src/compat/freebsd/sys/endian.h +++ b/usr/src/compat/freebsd/sys/endian.h @@ -11,6 +11,7 @@ /* * Copyright 2014 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc.
*/ #ifndef _COMPAT_FREEBSD_SYS_ENDIAN_H_ @@ -122,4 +123,14 @@ le64enc(void *pp, uint64_t u) le32enc(p + 4, (uint32_t)(u >> 32)); } +#ifdef _LITTLE_ENDIAN +#define htole16(x) ((uint16_t)(x)) +#define htole32(x) ((uint32_t)(x)) +#define htole64(x) ((uint64_t)(x)) + +#define le16toh(x) ((uint16_t)(x)) +#define le32toh(x) ((uint32_t)(x)) +#define le64toh(x) ((uint64_t)(x)) +#endif + #endif /* _COMPAT_FREEBSD_SYS_ENDIAN_H_ */ diff --git a/usr/src/compat/freebsd/sys/eventhandler.h b/usr/src/compat/freebsd/sys/eventhandler.h new file mode 100644 index 0000000000..133aa664f0 --- /dev/null +++ b/usr/src/compat/freebsd/sys/eventhandler.h @@ -0,0 +1,19 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_EVENTHANDLER_H_ +#define _COMPAT_FREEBSD_SYS_EVENTHANDLER_H_ + +#endif /* _COMPAT_FREEBSD_SYS_EVENTHANDLER_H_ */ diff --git a/usr/src/compat/freebsd/sys/ioctl.h b/usr/src/compat/freebsd/sys/ioctl.h index e223e1e4c7..72a46b8085 100644 --- a/usr/src/compat/freebsd/sys/ioctl.h +++ b/usr/src/compat/freebsd/sys/ioctl.h @@ -17,6 +17,8 @@ #define _COMPAT_FREEBSD_SYS_IOCTL_H_ #include +/* Get BSD compatibility from the ioctl header */ +#define BSD_COMP #include_next #endif /* _COMPAT_FREEBSD_SYS_IOCTL_H_ */ diff --git a/usr/src/compat/freebsd/sys/kernel.h b/usr/src/compat/freebsd/sys/kernel.h index b1c07674e4..adf96f40fc 100644 --- a/usr/src/compat/freebsd/sys/kernel.h +++ b/usr/src/compat/freebsd/sys/kernel.h @@ -11,15 +11,32 @@ /* * Copyright 2013 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_SYS_KERNEL_H_ #define _COMPAT_FREEBSD_SYS_KERNEL_H_ -#define SYSINIT(uniquifier, subsystem, order, func, ident) +#define TUNABLE_INT_FETCH(path, var) #include +typedef void (*sysinit_func_t)(const void *); + +struct sysinit { + const sysinit_func_t func; + const void *data; +}; + +#define SYSINIT(uniquifier, subsystem, order, func, ident) \ + static struct sysinit uniquifier ## _sys_init = { \ + (const sysinit_func_t)func, \ + (const void *)&(ident) \ + }; \ + DATA_SET(sysinit_set, uniquifier ## _sys_init); + +extern void sysinit(void); + #define ticks ddi_get_lbolt() #endif /* _COMPAT_FREEBSD_SYS_KERNEL_H_ */ diff --git a/usr/src/compat/freebsd/sys/limits.h b/usr/src/compat/freebsd/sys/limits.h index 99ae0f4d64..0e66319791 100644 --- a/usr/src/compat/freebsd/sys/limits.h +++ b/usr/src/compat/freebsd/sys/limits.h @@ -11,9 +11,14 @@ /* * Copyright 2013 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_SYS_LIMITS_H_ #define _COMPAT_FREEBSD_SYS_LIMITS_H_ +#include_next + +#define OFF_MAX ((off_t)-1) + #endif /* _COMPAT_FREEBSD_SYS_LIMITS_H_ */ diff --git a/usr/src/compat/freebsd/sys/lock.h b/usr/src/compat/freebsd/sys/lock.h new file mode 100644 index 0000000000..fd6021a87e --- /dev/null +++ b/usr/src/compat/freebsd/sys/lock.h @@ -0,0 +1,23 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. 
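
The reworked SYSINIT() above turns each registration into a struct sysinit deposited in the 'sysinit_set' linker set, and the declared sysinit() hook is then expected to run that set once at module attach. A plausible walker, sketched under the assumption that the compat sys/linker_set.h provides the usual SET_DECLARE/SET_FOREACH macros:

    #include <sys/linker_set.h>

    SET_DECLARE(sysinit_set, struct sysinit);

    void
    sysinit(void)
    {
        struct sysinit **si;

        /* invoke every SYSINIT-registered function with its ident data */
        SET_FOREACH(si, sysinit_set)
            (*si)->func((*si)->data);
    }
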
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_LOCK_H_ +#define _COMPAT_FREEBSD_SYS_LOCK_H_ + +#include_next + +#define WITNESS_WARN(...) + +#endif /* _COMPAT_FREEBSD_SYS_LOCK_H_ */ diff --git a/usr/src/compat/freebsd/sys/malloc.h b/usr/src/compat/freebsd/sys/malloc.h index 579df44533..341d57b807 100644 --- a/usr/src/compat/freebsd/sys/malloc.h +++ b/usr/src/compat/freebsd/sys/malloc.h @@ -39,6 +39,11 @@ struct malloc_type { void free(void *addr, struct malloc_type *type); void *malloc(unsigned long size, struct malloc_type *type, int flags); void *old_malloc(unsigned long size, struct malloc_type *type , int flags); +void *contigmalloc(unsigned long, struct malloc_type *, int, vm_paddr_t, + vm_paddr_t, unsigned long, vm_paddr_t); +void contigfree(void *, unsigned long, struct malloc_type *); + + #endif /* _KERNEL */ #endif /* _COMPAT_FREEBSD_SYS_MALLOC_H_ */ diff --git a/usr/src/compat/freebsd/sys/mutex.h b/usr/src/compat/freebsd/sys/mutex.h index b99884b652..9e588cb98a 100644 --- a/usr/src/compat/freebsd/sys/mutex.h +++ b/usr/src/compat/freebsd/sys/mutex.h @@ -11,6 +11,7 @@ /* * Copyright 2014 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_SYS_MUTEX_H_ @@ -28,15 +29,11 @@ struct mtx; void mtx_init(struct mtx *, char *name, const char *type_name, int opts); void mtx_destroy(struct mtx *); -int mtx_sleep(void *chan, struct mtx *mtx, int priority, const char *wmesg, - int timo); - #endif /* KERNEL */ #include_next #ifdef _KERNEL struct mtx { - kmutex_type_t t; kmutex_t m; }; diff --git a/usr/src/compat/freebsd/sys/param.h b/usr/src/compat/freebsd/sys/param.h index f09e9183f6..b125f9014f 100644 --- a/usr/src/compat/freebsd/sys/param.h +++ b/usr/src/compat/freebsd/sys/param.h @@ -11,6 +11,7 @@ /* * Copyright 2014 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_SYS_PARAM_H_ @@ -18,8 +19,11 @@ #ifndef _KERNEL #define MAXCOMLEN 16 +/* default value of the kernel tunable 'maxphys' in i86pc */ +#define MAXPHYS (56 * 1024) #endif #define MAXHOSTNAMELEN 256 +#define SPECNAMELEN 63 #ifdef _KERNEL #include @@ -36,13 +40,18 @@ #define nitems(x) (sizeof((x)) / sizeof((x)[0])) #define rounddown(x,y) (((x)/(y))*(y)) +#define rounddown2(x, y) ((x)&(~((y)-1))) /* if y is power of two */ #define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) /* to any y */ #define roundup2(x,y) (((x)+((y)-1))&(~((y)-1))) /* if y is powers of two */ +#define powerof2(x) ((((x)-1)&(x))==0) /* Macros for min/max. */ #define MIN(a,b) (((a)<(b))?(a):(b)) #define MAX(a,b) (((a)>(b))?(a):(b)) +#define trunc_page(x) ((unsigned long)(x) & ~(PAGE_MASK)) +#define ptoa(x) ((unsigned long)(x) << PAGE_SHIFT) + #include_next #endif /* _COMPAT_FREEBSD_SYS_PARAM_H_ */ diff --git a/usr/src/compat/freebsd/sys/sdt.h b/usr/src/compat/freebsd/sys/sdt.h new file mode 100644 index 0000000000..32d887c0d8 --- /dev/null +++ b/usr/src/compat/freebsd/sys/sdt.h @@ -0,0 +1,37 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_SDT_H_ +#define _COMPAT_FREEBSD_SYS_SDT_H_ + +/* Empty macros to cover FreeBSD's SDT linker tricks */ + +#define SDT_PROVIDER_DECLARE(mod) +#define SDT_PROVIDER_DEFINE(mod) + +#define SDT_PROBE_DEFINE1(...) +#define SDT_PROBE_DEFINE2(...) +#define SDT_PROBE_DEFINE3(...) +#define SDT_PROBE_DEFINE4(...) +#define SDT_PROBE_DEFINE5(...) +#define SDT_PROBE1(...) +#define SDT_PROBE2(...) +#define SDT_PROBE3(...) +#define SDT_PROBE4(...) +#define SDT_PROBE5(...) + +#include_next + +#endif /* _COMPAT_FREEBSD_SYS_SDT_H_ */ diff --git a/usr/src/compat/freebsd/sys/sglist.h b/usr/src/compat/freebsd/sys/sglist.h new file mode 100644 index 0000000000..519c67915f --- /dev/null +++ b/usr/src/compat/freebsd/sys/sglist.h @@ -0,0 +1,29 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_SGLIST_H_ +#define _COMPAT_FREEBSD_SYS_SGLIST_H_ + +#ifdef _KERNEL + +struct sglist; + +struct sglist *sglist_alloc(int, int); +void sglist_free(struct sglist *); +int sglist_append_phys(struct sglist *, vm_paddr_t, size_t); + +#endif /* _KERNEL */ + +#endif /* _COMPAT_FREEBSD_SYS_SGLIST_H_ */ diff --git a/usr/src/compat/freebsd/sys/smp.h b/usr/src/compat/freebsd/sys/smp.h index 46183e8677..3d6413ce16 100644 --- a/usr/src/compat/freebsd/sys/smp.h +++ b/usr/src/compat/freebsd/sys/smp.h @@ -11,6 +11,7 @@ /* * Copyright 2014 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_SYS_SMP_H_ @@ -18,10 +19,7 @@ #include -void smp_rendezvous(void (*)(void *), - void (*)(void *), - void (*)(void *), - void *arg); +#define IPI_AST 0 void ipi_cpu(int cpu, u_int ipi); diff --git a/usr/src/compat/freebsd/sys/socket.h b/usr/src/compat/freebsd/sys/socket.h new file mode 100644 index 0000000000..3bf7a8f440 --- /dev/null +++ b/usr/src/compat/freebsd/sys/socket.h @@ -0,0 +1,23 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_SOCKET_H +#define _COMPAT_FREEBSD_SYS_SOCKET_H + +#include_next + +#define SO_NOSIGPIPE 0 + +#endif /* _COMPAT_FREEBSD_SYS_SOCKET_H */ diff --git a/usr/src/compat/freebsd/sys/systm.h b/usr/src/compat/freebsd/sys/systm.h index e25acc0e4a..43fa16d450 100644 --- a/usr/src/compat/freebsd/sys/systm.h +++ b/usr/src/compat/freebsd/sys/systm.h @@ -28,18 +28,9 @@ struct mtx; panic msg; \ } while (0) -#define CTASSERT(x) _CTASSERT(x, __LINE__) -#define _CTASSERT(x,y) __CTASSERT(x,y) -#define __CTASSERT(x,y) typedef char __assert ## y[(x) ? 
1 : -1] - void critical_enter(void); void critical_exit(void); -int msleep_spin(void *chan, struct mtx *mutex, const char *wmesg, - int ticks); -void wakeup(void *chan); -void wakeup_one(void *chan); - struct unrhdr *new_unrhdr(int low, int high, struct mtx *mutex); void delete_unrhdr(struct unrhdr *uh); int alloc_unr(struct unrhdr *uh); diff --git a/usr/src/compat/freebsd/sys/time.h b/usr/src/compat/freebsd/sys/time.h index f8f9da5cdf..4e0fbfc02c 100644 --- a/usr/src/compat/freebsd/sys/time.h +++ b/usr/src/compat/freebsd/sys/time.h @@ -50,7 +50,13 @@ binuptime(struct bintime *bt) ((a)->frac cmp (b)->frac) : \ ((a)->sec cmp (b)->sec)) -#define SBT_1US (1000) +#define SBT_1S ((sbintime_t)1 << 32) +#define SBT_1M (SBT_1S * 60) +#define SBT_1MS (SBT_1S / 1000) +#define SBT_1US (SBT_1S / 1000000) +#define SBT_1NS (SBT_1S / 1000000000) +#define SBT_MAX 0x7fffffffffffffffLL + static __inline void bintime_add(struct bintime *bt, const struct bintime *bt2) @@ -91,14 +97,28 @@ bintime_mul(struct bintime *bt, u_int x) static __inline sbintime_t bttosbt(const struct bintime bt) { - return ((bt.sec * 1000000000) + - (((uint64_t)1000000000 * (uint32_t)(bt.frac >> 32)) >> 32)); + return (((sbintime_t)bt.sec << 32) + (bt.frac >> 32)); +} + +static __inline struct bintime +sbttobt(sbintime_t _sbt) +{ + struct bintime _bt; + + _bt.sec = _sbt >> 32; + _bt.frac = _sbt << 32; + return (_bt); } static __inline sbintime_t sbinuptime(void) { - return (gethrtime()); + hrtime_t hrt = gethrtime(); + uint64_t sec = hrt / NANOSEC; + uint64_t nsec = hrt % NANOSEC; + + return (((sbintime_t)sec << 32) + + (nsec * (((uint64_t)1 << 63) / 500000000) >> 32)); } #endif /* _COMPAT_FREEBSD_SYS_TIME_H_ */ diff --git a/usr/src/compat/freebsd/sys/types.h b/usr/src/compat/freebsd/sys/types.h index 6fc8179f2e..63731da42e 100644 --- a/usr/src/compat/freebsd/sys/types.h +++ b/usr/src/compat/freebsd/sys/types.h @@ -11,6 +11,7 @@ /* * Copyright 2014 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_SYS_TYPES_H_ @@ -53,6 +54,16 @@ typedef __vm_ooffset_t vm_ooffset_t; typedef __vm_paddr_t vm_paddr_t; #endif +#ifndef __VM_PINDEX_T_DEFINED +#define __VM_PINDEX_T_DEFINED +typedef __uint64_t vm_pindex_t; +#endif + +#ifndef __VM_SIZE_T_DEFINED +#define __VM_SIZE_T_DEFINED +typedef __vm_size_t vm_size_t; +#endif + #ifndef __VM_MEMATTR_T_DEFINED #define __VM_MEMATTR_T_DEFINED typedef char vm_memattr_t; @@ -65,8 +76,8 @@ typedef char vm_memattr_t; typedef _Bool bool; #endif -#if defined(_KERNEL) && !defined(offsetof) -#define offsetof(s, m) ((size_t)(&(((s *)0)->m))) +#if defined(_KERNEL) +typedef struct __dev_info **device_t; #endif #include_next diff --git a/usr/src/compat/freebsd/unistd.h b/usr/src/compat/freebsd/unistd.h new file mode 100644 index 0000000000..b4357e1da5 --- /dev/null +++ b/usr/src/compat/freebsd/unistd.h @@ -0,0 +1,23 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _COMPAT_FREEBSD_UNISTD_H +#define _COMPAT_FREEBSD_UNISTD_H + +#define setproctitle(fmt, ...) 
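
Stepping back to the sys/time.h changes above: sbintime_t is a 32.32 fixed-point count of seconds, so the fractional part must become nsec * 2^32 / 10^9. Multiplying by 2^32 first would overflow 64 bits, hence the pre-scaled constant 2^63 / (5 * 10^8), which equals 2^64 / 10^9; the product nsec * (2^64 / 10^9) just barely fits in 64 bits for nsec < 10^9, and the trailing >> 32 yields the fraction. A standalone restatement of that conversion (illustrative only, not part of the patch):

    #include <stdio.h>
    #include <stdint.h>

    typedef int64_t sbintime_t;

    static sbintime_t
    nsec_to_sbt(uint64_t hrt)
    {
        uint64_t sec = hrt / 1000000000;
        uint64_t nsec = hrt % 1000000000;

        /* frac = nsec * 2^32 / 10^9, via the overflow-safe constant */
        return (((sbintime_t)sec << 32) +
            (nsec * (((uint64_t)1 << 63) / 500000000) >> 32));
    }

    int
    main(void)
    {
        /* 1.5 s prints 17fffffff: 0x1.80000000 in 32.32, one ulp low */
        (void) printf("%llx\n",
            (unsigned long long)nsec_to_sbt(1500000000ULL));
        return (0);
    }
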
+ +#include_next + +#endif /* _COMPAT_FREEBSD_UNISTD_H */ diff --git a/usr/src/compat/freebsd/vm/pmap.h b/usr/src/compat/freebsd/vm/pmap.h deleted file mode 100644 index 5958c4b101..0000000000 --- a/usr/src/compat/freebsd/vm/pmap.h +++ /dev/null @@ -1,21 +0,0 @@ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - */ - -/* - * Copyright 2014 Pluribus Networks Inc. - */ - -#ifndef _COMPAT_FREEBSD_VM_PMAP_H_ -#define _COMPAT_FREEBSD_VM_PMAP_H_ - -#include - -#endif /* _COMPAT_FREEBSD_VM_PMAP_H_ */ diff --git a/usr/src/compat/freebsd/vm/vm.h b/usr/src/compat/freebsd/vm/vm.h index 7da22099b6..f5bb7b6eb8 100644 --- a/usr/src/compat/freebsd/vm/vm.h +++ b/usr/src/compat/freebsd/vm/vm.h @@ -11,23 +11,48 @@ /* * Copyright 2014 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #ifndef _FREEBSD_VM_VM_H_ #define _FREEBSD_VM_VM_H_ #include +#include typedef u_char vm_prot_t; +/* + * Even though the FreeBSD VM_PROT defines happen to match illumos, this + * references the native values directly so there's no risk of breakage. + */ #define VM_PROT_NONE ((vm_prot_t) 0x00) -#define VM_PROT_READ ((vm_prot_t) 0x01) -#define VM_PROT_WRITE ((vm_prot_t) 0x02) -#define VM_PROT_EXECUTE ((vm_prot_t) 0x04) +#define VM_PROT_READ ((vm_prot_t) PROT_READ) +#define VM_PROT_WRITE ((vm_prot_t) PROT_WRITE) +#define VM_PROT_EXECUTE ((vm_prot_t) PROT_EXEC) #define VM_PROT_ALL (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE) #define VM_PROT_RW (VM_PROT_READ|VM_PROT_WRITE) +struct vm_page; +typedef struct vm_page *vm_page_t; + +enum obj_type { OBJT_DEFAULT, OBJT_SWAP, OBJT_VNODE, OBJT_DEVICE, OBJT_PHYS, + OBJT_DEAD, OBJT_SG, OBJT_MGTDEVICE }; +typedef u_char objtype_t; + +union vm_map_object; +typedef union vm_map_object vm_map_object_t; + +struct vm_map_entry; +typedef struct vm_map_entry *vm_map_entry_t; + +struct vm_map; +typedef struct vm_map *vm_map_t; + +struct vm_object; +typedef struct vm_object *vm_object_t; + /* * contains a troublesome preprocessor define for BYTE. * Do this ugly workaround to avoid it. diff --git a/usr/src/compat/freebsd/vm/vm_param.h b/usr/src/compat/freebsd/vm/vm_param.h new file mode 100644 index 0000000000..fd76b62a37 --- /dev/null +++ b/usr/src/compat/freebsd/vm/vm_param.h @@ -0,0 +1,21 @@ +#ifndef _COMPAT_FREEBSD_VM_VM_PARAM_H_ +#define _COMPAT_FREEBSD_VM_VM_PARAM_H_ + +#include + +#define KERN_SUCCESS 0 + +/* Not a direct correlation, but the primary necessity is being non-zero */ +#define KERN_RESOURCE_SHORTAGE ENOMEM + +/* + * The VM_MAXUSER_ADDRESS is used to determine the upper size limit of a + * vmspace, the equivalent of our 'struct as'. The compat value is sized well + * below our native userlimit, even halving the available space below the VA + * hole. This is to avoid Intel EPT limits and leave room available in the + * usable VA range for other mmap tricks.
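
For concreteness, the value defined just below is 2^46 - 1: it caps a guest vmspace at 64 TiB, half of the 128 TiB normally usable below the x86-64 VA hole. A throwaway compile-time check of that arithmetic (hypothetical, requires C11, not patch content):

    #include <assert.h>

    static_assert(0x00003ffffffffffful == (1ull << 46) - 1,
        "compat VM_MAXUSER_ADDRESS spans exactly 64 TiB");
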
+ */ +#define VM_MAXUSER_ADDRESS 0x00003ffffffffffful + + +#endif /* _COMPAT_FREEBSD_VM_VM_PARAM_H_ */ diff --git a/usr/src/compat/freebsd/x86/_types.h b/usr/src/compat/freebsd/x86/_types.h index a07fc017ad..8bbae549d8 100644 --- a/usr/src/compat/freebsd/x86/_types.h +++ b/usr/src/compat/freebsd/x86/_types.h @@ -41,9 +41,11 @@ typedef __int64_t __register_t; typedef __uint64_t __vm_offset_t; typedef __uint64_t __vm_paddr_t; typedef __int64_t __vm_ooffset_t; +typedef __uint64_t __vm_size_t; #else typedef __int32_t __register_t; typedef __uint32_t __vm_paddr_t; +typedef __uint32_t __vm_size_t; #endif #endif /* _FREEBSD_X86__TYPES_H_ */ diff --git a/usr/src/compat/freebsd/x86/segments.h b/usr/src/compat/freebsd/x86/segments.h index bc6ba976b8..11edc582b5 100644 --- a/usr/src/compat/freebsd/x86/segments.h +++ b/usr/src/compat/freebsd/x86/segments.h @@ -11,18 +11,19 @@ /* * Copyright 2015 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ -#ifndef _COMPAT_FREEBSD_X86_SEGMENTS_H_ -#define _COMPAT_FREEBSD_X86_SEGMENTS_H_ +#ifndef _COMPAT_FREEBSD_X86_SEGMENTS_H +#define _COMPAT_FREEBSD_X86_SEGMENTS_H -/* - * Entries in the Interrupt Descriptor Table (IDT) - */ -#define IDT_BP 3 /* #BP: Breakpoint */ +#if defined(_COMPAT_FREEBSD_AMD64_MACHINE_VMM_H_) || defined(_KERNEL) #define IDT_UD 6 /* #UD: Undefined/Invalid Opcode */ #define IDT_SS 12 /* #SS: Stack Segment Fault */ #define IDT_GP 13 /* #GP: General Protection Fault */ #define IDT_AC 17 /* #AC: Alignment Check */ +#else +#include_next +#endif -#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_SEGMENTS_H_ */ +#endif /* _COMPAT_FREEBSD_X86_SEGMENTS_H */ diff --git a/usr/src/head/bhyve.h b/usr/src/head/bhyve.h deleted file mode 100644 index 8c79ca1ccc..0000000000 --- a/usr/src/head/bhyve.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * COPYRIGHT 2013 Pluribus Networks Inc. - * - * All rights reserved. This copyright notice is Copyright Management - * Information under 17 USC 1202 and is included to protect this work and - * deter copyright infringement. Removal or alteration of this Copyright - * Management Information without the express written permission from - * Pluribus Networks Inc is prohibited, and any such unauthorized removal - * or alteration will be a violation of federal law. 
- */ -#ifndef _BHYVE_H -#define _BHYVE_H - -#ifdef __cplusplus -extern "C" { -#endif - -#define BHYVE_TMPDIR "/var/run/bhyve" -#define BHYVE_CONS_SOCKPATH BHYVE_TMPDIR "/%s.console_sock" - -#ifdef __cplusplus -} -#endif - -#endif /* _BHYVE_H */ diff --git a/usr/src/lib/Makefile b/usr/src/lib/Makefile index e2bbd9a8c0..b64d4c2bc1 100644 --- a/usr/src/lib/Makefile +++ b/usr/src/lib/Makefile @@ -280,7 +280,8 @@ SUBDIRS += \ i386_SUBDIRS= \ libfdisk \ - libsaveargs + libsaveargs \ + libvmmapi sparc_SUBDIRS= \ efcode \ @@ -504,7 +505,8 @@ HDRSUBDIRS= \ i386_HDRSUBDIRS= \ libfdisk \ - libsaveargs + libsaveargs \ + libvmmapi sparc_HDRSUBDIRS= \ libds \ diff --git a/usr/src/lib/libvmmapi/Makefile b/usr/src/lib/libvmmapi/Makefile index 60621fcb75..233fcd5edb 100644 --- a/usr/src/lib/libvmmapi/Makefile +++ b/usr/src/lib/libvmmapi/Makefile @@ -19,11 +19,13 @@ HDRS = vmmapi.h HDRDIR = common +CHECKHDRS = + $(BUILD64)SUBDIRS += $(MACH64) all:= TARGET= all install:= TARGET= install -clean:= TARGET= clean +clean:= TARGET= clean clobber:= TARGET= clobber lint:= TARGET= lint _msg:= TARGET= _msg diff --git a/usr/src/lib/libvmmapi/Makefile.com b/usr/src/lib/libvmmapi/Makefile.com index e41a82f9a2..34240f4331 100644 --- a/usr/src/lib/libvmmapi/Makefile.com +++ b/usr/src/lib/libvmmapi/Makefile.com @@ -12,11 +12,12 @@ # # Copyright 2013 Pluribus Networks Inc. # +# Copyright 2019 Joyent, Inc. -LIBRARY = libvmmapi.a +LIBRARY = libvmmapi.a VERS = .1 -OBJECTS = vmmapi.o expand_number.o +OBJECTS = vmmapi.o expand_number.o # include library definitions include ../../Makefile.lib @@ -24,16 +25,19 @@ include ../../Makefile.lib # install this library in the root filesystem include ../../Makefile.rootfs -SRCDIR = ../common +SRCDIR = ../common -LIBS = $(DYNLIB) $(LINTLIB) +LIBS = $(DYNLIB) $(LINTLIB) -CPPFLAGS = -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd \ +CPPFLAGS = -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd \ $(CPPFLAGS.master) -I$(SRC)/uts/i86pc +# not linted +SMATCH=off + $(LINTLIB) := SRCS = $(SRCDIR)/$(LINTSRC) -LDLIBS += -lc +LDLIBS += -lc .KEEP_STATE: diff --git a/usr/src/lib/libvmmapi/common/mapfile-vers b/usr/src/lib/libvmmapi/common/mapfile-vers index 7a8443a2b8..a64231ad1c 100644 --- a/usr/src/lib/libvmmapi/common/mapfile-vers +++ b/usr/src/lib/libvmmapi/common/mapfile-vers @@ -11,6 +11,7 @@ # # Copyright 2013 Pluribus Networks Inc. +# Copyright 2019 Joyent, Inc. 
# # @@ -27,51 +28,89 @@ # MAPFILE HEADER END # -SUNWprivate_1.0 { - global: - vcpu_reset; - vm_activate_cpu; - vm_apicid2vcpu; - vm_capability_name2type; - vm_capability_type2name; - vm_copy_setup; - vm_copy_teardown; - vm_copyin; - vm_copyout; - vm_create; - vm_destroy; - vm_get_capability; - vm_get_desc; - vm_get_highmem_size; - vm_get_lowmem_limit; - vm_get_lowmem_size; - vm_get_memory_seg; - vm_get_register; - vm_get_seg_desc; - vm_get_x2apic_state; - vm_gla2gpa; - vm_inject_exception; - vm_isa_assert_irq; - vm_isa_deassert_irq; - vm_isa_pulse_irq; - vm_isa_set_irq_trigger; - vm_ioapic_assert_irq; - vm_ioapic_deassert_irq; - vm_ioapic_pincount; - vm_ioapic_pulse_irq; - vm_lapic_irq; - vm_lapic_msi; - vm_map_gpa; - vm_open; - vm_parse_memsize; - vm_restart_instruction; - vm_run; - vm_set_capability; - vm_set_desc; - vm_set_register; - vm_set_x2apic_state; - vm_setup_memory; - vm_setup_rom; - local: - *; +$mapfile_version 2 + +SYMBOL_VERSION ILLUMOSprivate { + global: + vcpu_reset; + vm_activate_cpu; + vm_active_cpus; + vm_apicid2vcpu; + vm_assign_pptdev; + vm_capability_name2type; + vm_capability_type2name; + vm_copy_setup; + vm_copy_teardown; + vm_copyin; + vm_copyout; + vm_create; + vm_create_devmem; + vm_debug_cpus; + vm_destroy; + vm_get_capability; + vm_get_desc; + vm_get_device_fd; + vm_get_gpa_pmap; + vm_get_hpet_capabilities; + vm_get_highmem_size; + vm_get_intinfo; + vm_get_lowmem_limit; + vm_get_lowmem_size; + vm_get_memflags; + vm_get_memseg; + vm_get_register; + vm_get_register_set; + vm_get_seg_desc; + vm_get_stat_desc; + vm_get_stats; + vm_get_topology; + vm_get_x2apic_state; + vm_gla2gpa; + vm_gla2gpa_nofault; + vm_inject_exception; + vm_inject_nmi; + vm_isa_assert_irq; + vm_isa_deassert_irq; + vm_isa_pulse_irq; + vm_isa_set_irq_trigger; + vm_ioapic_assert_irq; + vm_ioapic_deassert_irq; + vm_ioapic_pincount; + vm_ioapic_pulse_irq; + vm_lapic_irq; + vm_lapic_local_irq; + vm_lapic_msi; + vm_map_gpa; + vm_map_pptdev_mmio; + vm_mmap_getnext; + vm_mmap_memseg; + vm_open; + vm_parse_memsize; + vm_reinit; + vm_restart_instruction; + vm_rtc_gettime; + vm_rtc_read; + vm_rtc_settime; + vm_rtc_write; + vm_run; + vm_set_capability; + vm_set_desc; + vm_set_intinfo; + vm_set_memflags; + vm_set_register; + vm_set_register_set; + vm_set_topology; + vm_set_x2apic_state; + vm_setup_memory; + vm_setup_pptdev_msi; + vm_setup_pptdev_msix; + vm_suspend; + vm_suspend_cpu; + vm_suspended_cpus; + vm_resume_cpu; + vm_unassign_pptdev; + vm_wrlock_cycle; + + local: + *; }; diff --git a/usr/src/lib/libvmmapi/common/vmmapi.c b/usr/src/lib/libvmmapi/common/vmmapi.c index bbab3961a9..0b9b871081 100644 --- a/usr/src/lib/libvmmapi/common/vmmapi.c +++ b/usr/src/lib/libvmmapi/common/vmmapi.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/lib/libvmmapi/vmmapi.c 280929 2015-04-01 00:15:31Z tychon $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,10 +38,11 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc.
*/ #include -__FBSDID("$FreeBSD: head/lib/libvmmapi/vmmapi.c 280929 2015-04-01 00:15:31Z tychon $"); +__FBSDID("$FreeBSD$"); #include #include @@ -48,11 +51,10 @@ __FBSDID("$FreeBSD: head/lib/libvmmapi/vmmapi.c 280929 2015-04-01 00:15:31Z tych #include #include +#include #include -#ifndef __FreeBSD__ #include -#endif #include #include #include @@ -70,23 +72,35 @@ __FBSDID("$FreeBSD: head/lib/libvmmapi/vmmapi.c 280929 2015-04-01 00:15:31Z tych #include "vmmapi.h" -#define KB (1024UL) #define MB (1024 * 1024UL) #define GB (1024 * 1024 * 1024UL) +#ifndef __FreeBSD__ +/* shim to no-op for now */ +#define MAP_NOCORE 0 +#define MAP_ALIGNED_SUPER 0 + +/* Rely on PROT_NONE for guard purposes */ +#define MAP_GUARD (MAP_PRIVATE | MAP_ANON | MAP_NORESERVE) +#endif + +/* + * Size of the guard region before and after the virtual address space + * mapping the guest physical memory. This must be a multiple of the + * superpage size for performance reasons. + */ +#define VM_MMAP_GUARD_SIZE (4 * MB) + +#define PROT_RW (PROT_READ | PROT_WRITE) +#define PROT_ALL (PROT_READ | PROT_WRITE | PROT_EXEC) + struct vmctx { int fd; uint32_t lowmem_limit; - enum vm_mmap_style vms; - char *lowermem_addr; - char *biosmem_addr; + int memflags; size_t lowmem; - char *lowmem_addr; size_t highmem; - char *highmem_addr; - uint64_t rombase; - uint64_t romlimit; - char *rom_addr; + char *baseaddr; char *name; }; @@ -94,68 +108,50 @@ struct vmctx { #define CREATE(x) sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x))) #define DESTROY(x) sysctlbyname("hw.vmm.destroy", NULL, NULL, (x), strlen((x))) #else -#define CREATE(x) vmm_vm_create(x) -#define DESTROY(x) vmm_vm_destroy(x) -#endif +#define CREATE(x) vm_do_ctl(VMM_CREATE_VM, (x)) +#define DESTROY(x) vm_do_ctl(VMM_DESTROY_VM, (x)) static int -vm_device_open(const char *name) +vm_do_ctl(int cmd, const char *name) { - int fd, len; - char *vmfile; + int ctl_fd; -#ifdef __FreeBSD__ - len = strlen("/dev/vmm/") + strlen(name) + 1; -#else - len = strlen("/devices/pseudo/vmm@0:") + strlen(name) + 1; -#endif - vmfile = malloc(len); - assert(vmfile != NULL); -#ifdef __FreeBSD__ - snprintf(vmfile, len, "/dev/vmm/%s", name); -#else - snprintf(vmfile, len, "/devices/pseudo/vmm@0:%s", name); -#endif + ctl_fd = open(VMM_CTL_DEV, O_EXCL | O_RDWR); + if (ctl_fd < 0) { + return (-1); + } - /* Open the device file */ - fd = open(vmfile, O_RDWR, 0); + if (ioctl(ctl_fd, cmd, name) == -1) { + int err = errno; - free(vmfile); - return (fd); + /* Do not lose ioctl errno through the close(2) */ + (void) close(ctl_fd); + errno = err; + return (-1); + } + (void) close(ctl_fd); + + return (0); } +#endif -#ifndef __FreeBSD__ static int -vmm_vm_create(const char *name) +vm_device_open(const char *name) { - const char vmm_ctl[] = "/devices/pseudo/vmm@0:ctl"; - struct vmm_ioctl vi; - int err = 0; - int ctl_fd; + int fd, len; + char *vmfile; - (void) strlcpy(vi.vmm_name, name, sizeof (vi.vmm_name) - 1); + len = strlen("/dev/vmm/") + strlen(name) + 1; + vmfile = malloc(len); + assert(vmfile != NULL); + snprintf(vmfile, len, "/dev/vmm/%s", name); - ctl_fd = open(vmm_ctl, O_EXCL | O_RDWR); - if (ctl_fd == -1) { - err = errno; - if ((errno == EPERM) || (errno == EACCES)) { - fprintf(stderr, "you do not have permission to " - "perform that operation.\n"); - } else { - fprintf(stderr, "open: %s: %s\n", vmm_ctl, - strerror(errno)); - } - return (err); - } - if (ioctl(ctl_fd, VMM_CREATE_VM, &vi) == -1) { - err = errno; - fprintf(stderr, "couldn't create vm \"%s\"", name); - } - close (ctl_fd); + /* Open the 
device file */ + fd = open(vmfile, O_RDWR, 0); - return (err); + free(vmfile); + return (fd); } -#endif int vm_create(const char *name) @@ -173,6 +169,7 @@ vm_open(const char *name) assert(vm != NULL); vm->fd = -1; + vm->memflags = 0; vm->lowmem_limit = 3 * GB; vm->name = (char *)(vm + 1); strcpy(vm->name, name); @@ -182,54 +179,20 @@ vm_open(const char *name) return (vm); err: - (void) vm_destroy(vm); + vm_destroy(vm); return (NULL); } -#ifndef __FreeBSD__ -static int -vmm_vm_destroy(const char *name) -{ - const char vmm_ctl[] = "/devices/pseudo/vmm@0:ctl"; - struct vmm_ioctl vi; - int ctl_fd; - int err = 0; - - (void) strlcpy(vi.vmm_name, name, sizeof (vi.vmm_name) - 1); - - ctl_fd = open(vmm_ctl, O_EXCL | O_RDWR); - if (ctl_fd == -1) { - err = errno; - if ((errno == EPERM) || (errno == EACCES)) { - fprintf(stderr, "you do not have permission to " - "perform that operation.\n"); - } else { - fprintf(stderr, "open: %s: %s\n", vmm_ctl, - strerror(errno)); - } - return (err); - } - if (ioctl(ctl_fd, VMM_DESTROY_VM, &vi) == -1) { - err = errno; - fprintf(stderr, "couldn't destroy vm \"%s\"", name); - } - close (ctl_fd); - return (err); -} -#endif - -int +void vm_destroy(struct vmctx *vm) { - int err; assert(vm != NULL); if (vm->fd >= 0) close(vm->fd); - err = DESTROY(vm->name); + DESTROY(vm->name); free(vm); - return (err); } int @@ -256,92 +219,218 @@ vm_parse_memsize(const char *optarg, size_t *ret_memsize) return (error); } -#ifdef __FreeBSD__ -size_t -vmm_get_mem_total(void) +uint32_t +vm_get_lowmem_limit(struct vmctx *ctx) { - size_t mem_total = 0; - size_t oldlen = sizeof(mem_total); - int error; - error = sysctlbyname("hw.vmm.mem_total", &mem_total, &oldlen, NULL, 0); - if (error) - return -1; - return mem_total; + + return (ctx->lowmem_limit); } -size_t -vmm_get_mem_free(void) +void +vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit) { - size_t mem_free = 0; - size_t oldlen = sizeof(mem_free); - int error; - error = sysctlbyname("hw.vmm.mem_free", &mem_free, &oldlen, NULL, 0); - if (error) - return -1; - return mem_free; + + ctx->lowmem_limit = limit; +} + +void +vm_set_memflags(struct vmctx *ctx, int flags) +{ + + ctx->memflags = flags; } -#endif int -vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len, - int *wired) +vm_get_memflags(struct vmctx *ctx) { - int error; - struct vm_memory_segment seg; - - bzero(&seg, sizeof(seg)); - seg.gpa = gpa; - error = ioctl(ctx->fd, VM_GET_MEMORY_SEG, &seg); - *ret_len = seg.len; - if (wired != NULL) - *wired = seg.wired; + + return (ctx->memflags); +} + +/* + * Map segment 'segid' starting at 'off' into guest address range [gpa,gpa+len). + */ +int +vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t off, + size_t len, int prot) +{ + struct vm_memmap memmap; + int error, flags; + + memmap.gpa = gpa; + memmap.segid = segid; + memmap.segoff = off; + memmap.len = len; + memmap.prot = prot; + memmap.flags = 0; + + if (ctx->memflags & VM_MEM_F_WIRED) + memmap.flags |= VM_MEMMAP_F_WIRED; + + /* + * If this mapping already exists then don't create it again. This + * is the common case for SYSMEM mappings created by bhyveload(8). 
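
Taken together, vm_do_ctl() and vm_device_open() give a consumer the full lifecycle: create through the control node, then open the per-VM device. A hypothetical caller (the VM name is illustrative and error handling is abbreviated):

    #include <stdio.h>
    #include <vmmapi.h>

    int
    main(void)
    {
        struct vmctx *ctx;

        if (vm_create("demo") != 0) {
            perror("vm_create");
            return (1);
        }
        if ((ctx = vm_open("demo")) == NULL) {
            perror("vm_open");
            return (1);
        }

        /* ... set up memory and vCPUs, run the guest ... */

        /* vm_destroy() is now void: it closes the fd and frees ctx */
        vm_destroy(ctx);
        return (0);
    }
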
+ */ + error = vm_mmap_getnext(ctx, &gpa, &segid, &off, &len, &prot, &flags); + if (error == 0 && gpa == memmap.gpa) { + if (segid != memmap.segid || off != memmap.segoff || + prot != memmap.prot || flags != memmap.flags) { + errno = EEXIST; + return (-1); + } else { + return (0); + } + } + + error = ioctl(ctx->fd, VM_MMAP_MEMSEG, &memmap); return (error); } -uint32_t -vm_get_lowmem_limit(struct vmctx *ctx) +int +vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid, + vm_ooffset_t *segoff, size_t *len, int *prot, int *flags) { + struct vm_memmap memmap; + int error; - return (ctx->lowmem_limit); + bzero(&memmap, sizeof(struct vm_memmap)); + memmap.gpa = *gpa; + error = ioctl(ctx->fd, VM_MMAP_GETNEXT, &memmap); + if (error == 0) { + *gpa = memmap.gpa; + *segid = memmap.segid; + *segoff = memmap.segoff; + *len = memmap.len; + *prot = memmap.prot; + *flags = memmap.flags; + } + return (error); } -void -vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit) +/* + * Return 0 if the segments are identical and non-zero otherwise. + * + * This is slightly complicated by the fact that only device memory segments + * are named. + */ +static int +cmpseg(size_t len, const char *str, size_t len2, const char *str2) { - ctx->lowmem_limit = limit; + if (len == len2) { + if ((!str && !str2) || (str && str2 && !strcmp(str, str2))) + return (0); + } + return (-1); } static int -setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char **addr) +vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name) { + struct vm_memseg memseg; + size_t n; int error; - struct vm_memory_segment seg; /* - * Create and optionally map 'len' bytes of memory at guest - * physical address 'gpa' + * If the memory segment has already been created then just return. + * This is the usual case for the SYSMEM segment created by userspace + * loaders like bhyveload(8). 
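
Because vm_mmap_getnext() reports the first mapping at or above the passed-in *gpa, enumerating the whole guest physical address space is a simple loop. A sketch (hypothetical helper, assuming an already-open context):

    #include <stdio.h>
    #include <vmmapi.h>

    static void
    dump_memmaps(struct vmctx *ctx)
    {
        vm_paddr_t gpa = 0;
        vm_ooffset_t segoff;
        size_t len;
        int segid, prot, flags;

        while (vm_mmap_getnext(ctx, &gpa, &segid, &segoff, &len,
            &prot, &flags) == 0) {
            (void) printf("gpa=0x%lx len=0x%lx segid=%d prot=%x\n",
                (unsigned long)gpa, (unsigned long)len, segid, prot);
            gpa += len; /* resume the search past this range */
        }
    }
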
*/ - bzero(&seg, sizeof(seg)); - seg.gpa = gpa; - seg.len = len; - error = ioctl(ctx->fd, VM_MAP_MEMORY, &seg); - if (error == 0 && addr != NULL) { - *addr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, - ctx->fd, gpa); + error = vm_get_memseg(ctx, segid, &memseg.len, memseg.name, + sizeof(memseg.name)); + if (error) + return (error); + + if (memseg.len != 0) { + if (cmpseg(len, name, memseg.len, VM_MEMSEG_NAME(&memseg))) { + errno = EINVAL; + return (-1); + } else { + return (0); + } + } + + bzero(&memseg, sizeof(struct vm_memseg)); + memseg.segid = segid; + memseg.len = len; + if (name != NULL) { + n = strlcpy(memseg.name, name, sizeof(memseg.name)); + if (n >= sizeof(memseg.name)) { + errno = ENAMETOOLONG; + return (-1); + } + } + + error = ioctl(ctx->fd, VM_ALLOC_MEMSEG, &memseg); + return (error); +} + +int +vm_get_memseg(struct vmctx *ctx, int segid, size_t *lenp, char *namebuf, + size_t bufsize) +{ + struct vm_memseg memseg; + size_t n; + int error; + + memseg.segid = segid; + error = ioctl(ctx->fd, VM_GET_MEMSEG, &memseg); + if (error == 0) { + *lenp = memseg.len; + n = strlcpy(namebuf, memseg.name, bufsize); + if (n >= bufsize) { + errno = ENAMETOOLONG; + error = -1; + } } return (error); } +static int +#ifdef __FreeBSD__ +setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char *base) +#else +setup_memory_segment(struct vmctx *ctx, int segid, vm_paddr_t gpa, size_t len, + char *base) +#endif +{ + char *ptr; + int error, flags; + + /* Map 'len' bytes starting at 'gpa' in the guest address space */ +#ifdef __FreeBSD__ + error = vm_mmap_memseg(ctx, gpa, VM_SYSMEM, gpa, len, PROT_ALL); +#else + /* + * As we use two segments for lowmem/highmem the offset within the + * segment is 0 on illumos. + */ + error = vm_mmap_memseg(ctx, gpa, segid, 0, len, PROT_ALL); +#endif + if (error) + return (error); + + flags = MAP_SHARED | MAP_FIXED; + if ((ctx->memflags & VM_MEM_F_INCORE) == 0) + flags |= MAP_NOCORE; + + /* mmap into the process address space on the host */ + ptr = mmap(base + gpa, len, PROT_RW, flags, ctx->fd, gpa); + if (ptr == MAP_FAILED) + return (-1); + + return (0); +} + int vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms) { - char **addr; + size_t objsize, len; + vm_paddr_t gpa; + char *baseaddr, *ptr; int error; - /* XXX VM_MMAP_SPARSE not implemented yet */ - assert(vms == VM_MMAP_NONE || vms == VM_MMAP_ALL); - ctx->vms = vms; + assert(vms == VM_MMAP_ALL); /* * If 'memsize' cannot fit entirely in the 'lowmem' segment then @@ -349,81 +438,100 @@ vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms) */ if (memsize > ctx->lowmem_limit) { ctx->lowmem = ctx->lowmem_limit; - ctx->highmem = memsize - ctx->lowmem; + ctx->highmem = memsize - ctx->lowmem_limit; + objsize = 4*GB + ctx->highmem; } else { ctx->lowmem = memsize; ctx->highmem = 0; + objsize = ctx->lowmem; } - if (ctx->lowmem > 0) { - addr = (vms == VM_MMAP_ALL) ? &ctx->lowermem_addr : NULL; - error = setup_memory_segment(ctx, 0, 640*KB, addr); - if (error) - return (error); +#ifdef __FreeBSD__ + error = vm_alloc_memseg(ctx, VM_SYSMEM, objsize, NULL); + if (error) + return (error); +#endif + + /* + * Stake out a contiguous region covering the guest physical memory + * and the adjoining guard regions. + */ + len = VM_MMAP_GUARD_SIZE + objsize + VM_MMAP_GUARD_SIZE; + ptr = mmap(NULL, len, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0); + if (ptr == MAP_FAILED) + return (-1); + + baseaddr = ptr + VM_MMAP_GUARD_SIZE; - addr = (vms == VM_MMAP_ALL) ? 
&ctx->biosmem_addr : NULL; - error = setup_memory_segment(ctx, 768*KB, 256*KB, addr); +#ifdef __FreeBSD__ + if (ctx->highmem > 0) { + gpa = 4*GB; + len = ctx->highmem; + error = setup_memory_segment(ctx, gpa, len, baseaddr); if (error) return (error); + } - addr = (vms == VM_MMAP_ALL) ? &ctx->lowmem_addr : NULL; - error = setup_memory_segment(ctx, 1*MB, ctx->lowmem - 1*MB, addr); + if (ctx->lowmem > 0) { + gpa = 0; + len = ctx->lowmem; + error = setup_memory_segment(ctx, gpa, len, baseaddr); if (error) return (error); } - +#else if (ctx->highmem > 0) { - addr = (vms == VM_MMAP_ALL) ? &ctx->highmem_addr : NULL; - error = setup_memory_segment(ctx, 4*GB, ctx->highmem, addr); + error = vm_alloc_memseg(ctx, VM_HIGHMEM, ctx->highmem, NULL); + if (error) + return (error); + gpa = 4*GB; + len = ctx->highmem; + error = setup_memory_segment(ctx, VM_HIGHMEM, gpa, len, baseaddr); if (error) return (error); } - return (0); -} + if (ctx->lowmem > 0) { + error = vm_alloc_memseg(ctx, VM_LOWMEM, ctx->lowmem, NULL); + if (error) + return (error); + gpa = 0; + len = ctx->lowmem; + error = setup_memory_segment(ctx, VM_LOWMEM, gpa, len, baseaddr); + if (error) + return (error); + } +#endif -int -vm_setup_rom(struct vmctx *ctx, vm_paddr_t gpa, size_t len) -{ - ctx->rombase = gpa; - ctx->romlimit = gpa + len; + ctx->baseaddr = baseaddr; - return (setup_memory_segment(ctx, gpa, len, &ctx->rom_addr)); + return (0); } +/* + * Returns a non-NULL pointer if [gaddr, gaddr+len) is entirely contained in + * the lowmem or highmem regions. + * + * In particular return NULL if [gaddr, gaddr+len) falls in guest MMIO region. + * The instruction emulation code depends on this behavior. + */ void * vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len) { - /* XXX VM_MMAP_SPARSE not implemented yet */ - assert(ctx->vms == VM_MMAP_ALL); - - if (gaddr + len <= 1*MB) { - if (gaddr + len <= 640*KB) - return ((void *)(ctx->lowermem_addr + gaddr)); - - if (768*KB <= gaddr && gaddr + len <= 1*MB) { - gaddr -= 768*KB; - return ((void *)(ctx->biosmem_addr + gaddr)); - } - - return (NULL); - } - - if (gaddr < ctx->lowmem && gaddr + len <= ctx->lowmem) { - gaddr -= 1*MB; - return ((void *)(ctx->lowmem_addr + gaddr)); - } - - if (ctx->rombase <= gaddr && gaddr + len <= ctx->romlimit) { - gaddr -= ctx->rombase; - return ((void *)(ctx->rom_addr + gaddr)); + if (ctx->lowmem > 0) { + if (gaddr < ctx->lowmem && len <= ctx->lowmem && + gaddr + len <= ctx->lowmem) + return (ctx->baseaddr + gaddr); } - if (gaddr >= 4*GB) { - gaddr -= 4*GB; - if (gaddr < ctx->highmem && gaddr + len <= ctx->highmem) - return ((void *)(ctx->highmem_addr + gaddr)); + if (ctx->highmem > 0) { + if (gaddr >= 4*GB) { + if (gaddr < 4*GB + ctx->highmem && + len <= ctx->highmem && + gaddr + len <= 4*GB + ctx->highmem) + return (ctx->baseaddr + gaddr); + } } return (NULL); @@ -443,6 +551,79 @@ vm_get_highmem_size(struct vmctx *ctx) return (ctx->highmem); } +void * +vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len) +{ +#ifdef __FreeBSD__ + char pathname[MAXPATHLEN]; +#endif + size_t len2; + char *base, *ptr; + int fd, error, flags; + off_t mapoff; + + fd = -1; + ptr = MAP_FAILED; + if (name == NULL || strlen(name) == 0) { + errno = EINVAL; + goto done; + } + + error = vm_alloc_memseg(ctx, segid, len, name); + if (error) + goto done; + +#ifdef __FreeBSD__ + strlcpy(pathname, "/dev/vmm.io/", sizeof(pathname)); + strlcat(pathname, ctx->name, sizeof(pathname)); + strlcat(pathname, ".", sizeof(pathname)); + strlcat(pathname, name, sizeof(pathname)); 
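
The NULL-on-miss contract of vm_map_gpa() above, where any range not fully inside lowmem or highmem (MMIO, for instance) maps to NULL, is what the instruction emulation depends on, and host-side device models use it the same way. A hypothetical helper:

    #include <stdint.h>
    #include <vmmapi.h>

    /* hypothetical: write a 32-bit value directly into guest RAM */
    static int
    poke_guest_u32(struct vmctx *ctx, vm_paddr_t gpa, uint32_t val)
    {
        uint32_t *host;

        host = vm_map_gpa(ctx, gpa, sizeof (val));
        if (host == NULL)
            return (-1); /* unbacked or MMIO range */
        *host = val;
        return (0);
    }
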
+ + fd = open(pathname, O_RDWR); + if (fd < 0) + goto done; +#else + { + struct vm_devmem_offset vdo; + + vdo.segid = segid; + error = ioctl(ctx->fd, VM_DEVMEM_GETOFFSET, &vdo); + if (error == 0) { + mapoff = vdo.offset; + } else { + goto done; + } + } +#endif + + /* + * Stake out a contiguous region covering the device memory and the + * adjoining guard regions. + */ + len2 = VM_MMAP_GUARD_SIZE + len + VM_MMAP_GUARD_SIZE; + base = mmap(NULL, len2, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, + 0); + if (base == MAP_FAILED) + goto done; + + flags = MAP_SHARED | MAP_FIXED; + if ((ctx->memflags & VM_MEM_F_INCORE) == 0) + flags |= MAP_NOCORE; + +#ifdef __FreeBSD__ + /* mmap the devmem region in the host address space */ + ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, fd, 0); +#else + /* mmap the devmem region in the host address space */ + ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, ctx->fd, + mapoff); +#endif +done: + if (fd >= 0) + close(fd); + return (ptr); +} + int vm_set_desc(struct vmctx *ctx, int vcpu, int reg, uint64_t base, uint32_t limit, uint32_t access) @@ -521,6 +702,40 @@ vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *ret_val) return (error); } +int +vm_set_register_set(struct vmctx *ctx, int vcpu, unsigned int count, + const int *regnums, uint64_t *regvals) +{ + int error; + struct vm_register_set vmregset; + + bzero(&vmregset, sizeof(vmregset)); + vmregset.cpuid = vcpu; + vmregset.count = count; + vmregset.regnums = regnums; + vmregset.regvals = regvals; + + error = ioctl(ctx->fd, VM_SET_REGISTER_SET, &vmregset); + return (error); +} + +int +vm_get_register_set(struct vmctx *ctx, int vcpu, unsigned int count, + const int *regnums, uint64_t *regvals) +{ + int error; + struct vm_register_set vmregset; + + bzero(&vmregset, sizeof(vmregset)); + vmregset.cpuid = vcpu; + vmregset.count = count; + vmregset.regnums = regnums; + vmregset.regvals = regvals; + + error = ioctl(ctx->fd, VM_GET_REGISTER_SET, &vmregset); + return (error); +} + int vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit) { @@ -535,19 +750,21 @@ vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit) return (error); } -static int -vm_inject_exception_real(struct vmctx *ctx, int vcpu, int vector, - int error_code, int error_code_valid) +int +vm_suspend(struct vmctx *ctx, enum vm_suspend_how how) { - struct vm_exception exc; + struct vm_suspend vmsuspend; - bzero(&exc, sizeof(exc)); - exc.cpuid = vcpu; - exc.vector = vector; - exc.error_code = error_code; - exc.error_code_valid = error_code_valid; + bzero(&vmsuspend, sizeof(vmsuspend)); + vmsuspend.how = how; + return (ioctl(ctx->fd, VM_SUSPEND, &vmsuspend)); +} - return (ioctl(ctx->fd, VM_INJECT_EXCEPTION, &exc)); +int +vm_reinit(struct vmctx *ctx) +{ + + return (ioctl(ctx->fd, VM_REINIT, 0)); } int @@ -774,7 +991,7 @@ vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int val) vmcap.cpuid = vcpu; vmcap.captype = cap; vmcap.capval = val; - + return (ioctl(ctx->fd, VM_SET_CAPABILITY, &vmcap)); } @@ -858,7 +1075,6 @@ vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func, return ioctl(ctx->fd, VM_PPTDEV_MSIX, &pptmsix); } -#ifdef __FreeBSD__ uint64_t * vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv, int *ret_entries) @@ -869,7 +1085,7 @@ vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv, vmstats.cpuid = vcpu; - error = ioctl(ctx->fd, VM_STATS, &vmstats); + error = ioctl(ctx->fd, VM_STATS_IOC, &vmstats); if (error == 0) { if (ret_entries) 
*ret_entries = vmstats.num_entries; @@ -891,7 +1107,6 @@ vm_get_stat_desc(struct vmctx *ctx, int index) else return (NULL); } -#endif int vm_get_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state *state) @@ -1112,9 +1327,9 @@ vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities) return (error); } -static int -gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, - uint64_t gla, int prot, int *fault, uint64_t *gpa) +int +vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *fault) { struct vm_gla2gpa gg; int error; @@ -1134,14 +1349,23 @@ gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, } int -vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, - uint64_t gla, int prot, uint64_t *gpa) +vm_gla2gpa_nofault(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *fault) { - int error, fault; + struct vm_gla2gpa gg; + int error; + + bzero(&gg, sizeof(struct vm_gla2gpa)); + gg.vcpuid = vcpu; + gg.prot = prot; + gg.gla = gla; + gg.paging = *paging; - error = gla2gpa(ctx, vcpu, paging, gla, prot, &fault, gpa); - if (fault) - error = fault; + error = ioctl(ctx->fd, VM_GLA2GPA_NOFAULT, &gg); + if (error == 0) { + *fault = gg.fault; + *gpa = gg.gpa; + } return (error); } @@ -1151,11 +1375,12 @@ vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, int vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, - uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt) + uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt, + int *fault) { void *va; uint64_t gpa; - int error, fault, i, n, off; + int error, i, n, off; for (i = 0; i < iovcnt; i++) { iov[i].iov_base = 0; @@ -1164,18 +1389,16 @@ vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, while (len) { assert(iovcnt > 0); - error = gla2gpa(ctx, vcpu, paging, gla, prot, &fault, &gpa); - if (error) - return (-1); - if (fault) - return (1); + error = vm_gla2gpa(ctx, vcpu, paging, gla, prot, &gpa, fault); + if (error || *fault) + return (error); off = gpa & PAGE_MASK; n = min(len, PAGE_SIZE - off); va = vm_map_gpa(ctx, gpa, n); if (va == NULL) - return (-1); + return (EFAULT); iov->iov_base = va; iov->iov_len = n; @@ -1236,6 +1459,42 @@ vm_copyout(struct vmctx *ctx, int vcpu, const void *vp, struct iovec *iov, } } +static int +vm_get_cpus(struct vmctx *ctx, int which, cpuset_t *cpus) +{ + struct vm_cpuset vm_cpuset; + int error; + + bzero(&vm_cpuset, sizeof(struct vm_cpuset)); + vm_cpuset.which = which; + vm_cpuset.cpusetsize = sizeof(cpuset_t); + vm_cpuset.cpus = cpus; + + error = ioctl(ctx->fd, VM_GET_CPUS, &vm_cpuset); + return (error); +} + +int +vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus) +{ + + return (vm_get_cpus(ctx, VM_ACTIVE_CPUS, cpus)); +} + +int +vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus) +{ + + return (vm_get_cpus(ctx, VM_SUSPENDED_CPUS, cpus)); +} + +int +vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus) +{ + + return (vm_get_cpus(ctx, VM_DEBUG_CPUS, cpus)); +} + int vm_activate_cpu(struct vmctx *ctx, int vcpu) { @@ -1248,6 +1507,111 @@ vm_activate_cpu(struct vmctx *ctx, int vcpu) return (error); } +int +vm_suspend_cpu(struct vmctx *ctx, int vcpu) +{ + struct vm_activate_cpu ac; + int error; + + bzero(&ac, sizeof(struct vm_activate_cpu)); + ac.vcpuid = vcpu; + error = ioctl(ctx->fd, VM_SUSPEND_CPU, &ac); + return (error); +} + +int +vm_resume_cpu(struct 
vmctx *ctx, int vcpu) +{ + struct vm_activate_cpu ac; + int error; + + bzero(&ac, sizeof(struct vm_activate_cpu)); + ac.vcpuid = vcpu; + error = ioctl(ctx->fd, VM_RESUME_CPU, &ac); + return (error); +} + +int +vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *info1, uint64_t *info2) +{ + struct vm_intinfo vmii; + int error; + + bzero(&vmii, sizeof(struct vm_intinfo)); + vmii.vcpuid = vcpu; + error = ioctl(ctx->fd, VM_GET_INTINFO, &vmii); + if (error == 0) { + *info1 = vmii.info1; + *info2 = vmii.info2; + } + return (error); +} + +int +vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t info1) +{ + struct vm_intinfo vmii; + int error; + + bzero(&vmii, sizeof(struct vm_intinfo)); + vmii.vcpuid = vcpu; + vmii.info1 = info1; + error = ioctl(ctx->fd, VM_SET_INTINFO, &vmii); + return (error); +} + +int +vm_rtc_write(struct vmctx *ctx, int offset, uint8_t value) +{ + struct vm_rtc_data rtcdata; + int error; + + bzero(&rtcdata, sizeof(struct vm_rtc_data)); + rtcdata.offset = offset; + rtcdata.value = value; + error = ioctl(ctx->fd, VM_RTC_WRITE, &rtcdata); + return (error); +} + +int +vm_rtc_read(struct vmctx *ctx, int offset, uint8_t *retval) +{ + struct vm_rtc_data rtcdata; + int error; + + bzero(&rtcdata, sizeof(struct vm_rtc_data)); + rtcdata.offset = offset; + error = ioctl(ctx->fd, VM_RTC_READ, &rtcdata); + if (error == 0) + *retval = rtcdata.value; + return (error); +} + +int +vm_rtc_settime(struct vmctx *ctx, time_t secs) +{ + struct vm_rtc_time rtctime; + int error; + + bzero(&rtctime, sizeof(struct vm_rtc_time)); + rtctime.secs = secs; + error = ioctl(ctx->fd, VM_RTC_SETTIME, &rtctime); + return (error); +} + +int +vm_rtc_gettime(struct vmctx *ctx, time_t *secs) +{ + struct vm_rtc_time rtctime; + int error; + + bzero(&rtctime, sizeof(struct vm_rtc_time)); + error = ioctl(ctx->fd, VM_RTC_GETTIME, &rtctime); + if (error == 0) + *secs = rtctime.secs; + return (error); +} + int vm_restart_instruction(void *arg, int vcpu) { @@ -1255,3 +1619,92 @@ vm_restart_instruction(void *arg, int vcpu) return (ioctl(ctx->fd, VM_RESTART_INSTRUCTION, &vcpu)); } + +int +vm_set_topology(struct vmctx *ctx, + uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus) +{ + struct vm_cpu_topology topology; + + bzero(&topology, sizeof (struct vm_cpu_topology)); + topology.sockets = sockets; + topology.cores = cores; + topology.threads = threads; + topology.maxcpus = maxcpus; + return (ioctl(ctx->fd, VM_SET_TOPOLOGY, &topology)); +} + +int +vm_get_topology(struct vmctx *ctx, + uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus) +{ + struct vm_cpu_topology topology; + int error; + + bzero(&topology, sizeof (struct vm_cpu_topology)); + error = ioctl(ctx->fd, VM_GET_TOPOLOGY, &topology); + if (error == 0) { + *sockets = topology.sockets; + *cores = topology.cores; + *threads = topology.threads; + *maxcpus = topology.maxcpus; + } + return (error); +} + +int +vm_get_device_fd(struct vmctx *ctx) +{ + + return (ctx->fd); +} + +#ifndef __FreeBSD__ +int +vm_wrlock_cycle(struct vmctx *ctx) +{ + if (ioctl(ctx->fd, VM_WRLOCK_CYCLE, 0) != 0) { + return (errno); + } + return (0); +} +#endif /* __FreeBSD__ */ + +#ifdef __FreeBSD__ +const cap_ioctl_t * +vm_get_ioctls(size_t *len) +{ + cap_ioctl_t *cmds; + /* keep in sync with machine/vmm_dev.h */ + static const cap_ioctl_t vm_ioctl_cmds[] = { VM_RUN, VM_SUSPEND, VM_REINIT, + VM_ALLOC_MEMSEG, VM_GET_MEMSEG, VM_MMAP_MEMSEG, VM_MMAP_MEMSEG, + VM_MMAP_GETNEXT, VM_SET_REGISTER, VM_GET_REGISTER, + VM_SET_SEGMENT_DESCRIPTOR, 
VM_GET_SEGMENT_DESCRIPTOR, + VM_SET_REGISTER_SET, VM_GET_REGISTER_SET, + VM_INJECT_EXCEPTION, VM_LAPIC_IRQ, VM_LAPIC_LOCAL_IRQ, + VM_LAPIC_MSI, VM_IOAPIC_ASSERT_IRQ, VM_IOAPIC_DEASSERT_IRQ, + VM_IOAPIC_PULSE_IRQ, VM_IOAPIC_PINCOUNT, VM_ISA_ASSERT_IRQ, + VM_ISA_DEASSERT_IRQ, VM_ISA_PULSE_IRQ, VM_ISA_SET_IRQ_TRIGGER, + VM_SET_CAPABILITY, VM_GET_CAPABILITY, VM_BIND_PPTDEV, + VM_UNBIND_PPTDEV, VM_MAP_PPTDEV_MMIO, VM_PPTDEV_MSI, + VM_PPTDEV_MSIX, VM_INJECT_NMI, VM_STATS, VM_STAT_DESC, + VM_SET_X2APIC_STATE, VM_GET_X2APIC_STATE, + VM_GET_HPET_CAPABILITIES, VM_GET_GPA_PMAP, VM_GLA2GPA, + VM_GLA2GPA_NOFAULT, + VM_ACTIVATE_CPU, VM_GET_CPUS, VM_SUSPEND_CPU, VM_RESUME_CPU, + VM_SET_INTINFO, VM_GET_INTINFO, + VM_RTC_WRITE, VM_RTC_READ, VM_RTC_SETTIME, VM_RTC_GETTIME, + VM_RESTART_INSTRUCTION, VM_SET_TOPOLOGY, VM_GET_TOPOLOGY }; + + if (len == NULL) { + cmds = malloc(sizeof(vm_ioctl_cmds)); + if (cmds == NULL) + return (NULL); + bcopy(vm_ioctl_cmds, cmds, sizeof(vm_ioctl_cmds)); + return (cmds); + } + + *len = nitems(vm_ioctl_cmds); + return (NULL); +} +#endif /* __FreeBSD__ */ diff --git a/usr/src/lib/libvmmapi/common/vmmapi.h b/usr/src/lib/libvmmapi/common/vmmapi.h index d7eb67aa58..a1507255cb 100644 --- a/usr/src/lib/libvmmapi/common/vmmapi.h +++ b/usr/src/lib/libvmmapi/common/vmmapi.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/lib/libvmmapi/vmmapi.h 280929 2015-04-01 00:15:31Z tychon $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,12 +38,20 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. */ #ifndef _VMMAPI_H_ #define _VMMAPI_H_ #include +#include + +/* + * API version for out-of-tree consumers like grub-bhyve for making compile + * time decisions. + */ +#define VMMAPI_VERSION 0103 /* 2 digit major followed by 2 digit minor */ struct iovec; struct vmctx; @@ -57,19 +67,77 @@ enum vm_mmap_style { VM_MMAP_SPARSE, /* mappings created on-demand */ }; +/* + * 'flags' value passed to 'vm_set_memflags()'. + */ +#define VM_MEM_F_INCORE 0x01 /* include guest memory in core file */ +#define VM_MEM_F_WIRED 0x02 /* guest memory is wired */ + +/* + * Identifiers for memory segments: + * - vm_setup_memory() uses VM_SYSMEM for the system memory segment. + * - the remaining identifiers can be used to create devmem segments. + */ +enum { +#ifdef __FreeBSD__ + VM_SYSMEM, +#else + VM_LOWMEM, + VM_HIGHMEM, +#endif + VM_BOOTROM, + VM_FRAMEBUFFER, +}; + +/* + * Get the length and name of the memory segment identified by 'segid'. + * Note that system memory segments are identified with a nul name. + * + * Returns 0 on success and non-zero otherwise. + */ +int vm_get_memseg(struct vmctx *ctx, int ident, size_t *lenp, char *name, + size_t namesiz); + +/* + * Iterate over the guest address space. This function finds an address range + * that starts at an address >= *gpa. + * + * Returns 0 if the next address range was found and non-zero otherwise. + */ +int vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid, + vm_ooffset_t *segoff, size_t *len, int *prot, int *flags); +/* + * Create a device memory segment identified by 'segid'. + * + * Returns a pointer to the memory segment on success and MAP_FAILED otherwise. 
+ */ +void *vm_create_devmem(struct vmctx *ctx, int segid, const char *name, + size_t len); + +/* + * Map the memory segment identified by 'segid' into the guest address space + * at [gpa,gpa+len) with protection 'prot'. + */ +int vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, + vm_ooffset_t segoff, size_t len, int prot); + int vm_create(const char *name); +int vm_get_device_fd(struct vmctx *ctx); struct vmctx *vm_open(const char *name); -int vm_destroy(struct vmctx *ctx); +void vm_destroy(struct vmctx *ctx); int vm_parse_memsize(const char *optarg, size_t *memsize); -int vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len, - int *wired); int vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s); -int vm_setup_rom(struct vmctx *ctx, vm_paddr_t gpa, size_t len); void *vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len); +int vm_get_gpa_pmap(struct vmctx *, uint64_t gpa, uint64_t *pte, int *num); int vm_gla2gpa(struct vmctx *, int vcpuid, struct vm_guest_paging *paging, - uint64_t gla, int prot, uint64_t *gpa); + uint64_t gla, int prot, uint64_t *gpa, int *fault); +int vm_gla2gpa_nofault(struct vmctx *, int vcpuid, + struct vm_guest_paging *paging, uint64_t gla, int prot, + uint64_t *gpa, int *fault); uint32_t vm_get_lowmem_limit(struct vmctx *ctx); void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit); +void vm_set_memflags(struct vmctx *ctx, int flags); +int vm_get_memflags(struct vmctx *ctx); size_t vm_get_lowmem_size(struct vmctx *ctx); size_t vm_get_highmem_size(struct vmctx *ctx); int vm_set_desc(struct vmctx *ctx, int vcpu, int reg, @@ -80,7 +148,13 @@ int vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *seg_desc); int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val); int vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval); +int vm_set_register_set(struct vmctx *ctx, int vcpu, unsigned int count, + const int *regnums, uint64_t *regvals); +int vm_get_register_set(struct vmctx *ctx, int vcpu, unsigned int count, + const int *regnums, uint64_t *regvals); int vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *ret_vmexit); +int vm_suspend(struct vmctx *ctx, enum vm_suspend_how how); +int vm_reinit(struct vmctx *ctx); int vm_apicid2vcpu(struct vmctx *ctx, int apicid); int vm_inject_exception(struct vmctx *ctx, int vcpu, int vector, int errcode_valid, uint32_t errcode, int restart_instruction); @@ -113,6 +187,13 @@ int vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func, int idx, uint64_t addr, uint64_t msg, uint32_t vector_control); +int vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *i1, uint64_t *i2); +int vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t exit_intinfo); + +#ifdef __FreeBSD__ +const cap_ioctl_t *vm_get_ioctls(size_t *len); +#endif + /* * Return a pointer to the statistics buffer. Note that this is not MT-safe. */ @@ -127,11 +208,16 @@ int vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities); /* * Translate the GLA range [gla,gla+len) into GPA segments in 'iov'. - * The 'iovcnt' should be big enough to accomodate all GPA segments. - * Returns 0 on success, 1 on a guest fault condition and -1 otherwise. + * The 'iovcnt' should be big enough to accommodate all GPA segments. 
+ * + * retval fault Interpretation + * 0 0 Success + * 0 1 An exception was injected into the guest + * EFAULT N/A Error */ int vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *pg, - uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt); + uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt, + int *fault); void vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *guest_iov, void *host_dst, size_t len); void vm_copyout(struct vmctx *ctx, int vcpu, const void *host_src, @@ -139,10 +225,32 @@ void vm_copyout(struct vmctx *ctx, int vcpu, const void *host_src, void vm_copy_teardown(struct vmctx *ctx, int vcpu, struct iovec *iov, int iovcnt); +/* RTC */ +int vm_rtc_write(struct vmctx *ctx, int offset, uint8_t value); +int vm_rtc_read(struct vmctx *ctx, int offset, uint8_t *retval); +int vm_rtc_settime(struct vmctx *ctx, time_t secs); +int vm_rtc_gettime(struct vmctx *ctx, time_t *secs); + /* Reset vcpu register state */ int vcpu_reset(struct vmctx *ctx, int vcpu); +int vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus); +int vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus); +int vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus); int vm_activate_cpu(struct vmctx *ctx, int vcpu); +int vm_suspend_cpu(struct vmctx *ctx, int vcpu); +int vm_resume_cpu(struct vmctx *ctx, int vcpu); + +/* CPU topology */ +int vm_set_topology(struct vmctx *ctx, uint16_t sockets, uint16_t cores, + uint16_t threads, uint16_t maxcpus); +int vm_get_topology(struct vmctx *ctx, uint16_t *sockets, uint16_t *cores, + uint16_t *threads, uint16_t *maxcpus); + +#ifndef __FreeBSD__ +/* illumos-specific APIs */ +int vm_wrlock_cycle(struct vmctx *ctx); +#endif /* __FreeBSD__ */ #ifdef __FreeBSD__ /* diff --git a/usr/src/pkg/manifests/system-bhyve-tests.mf b/usr/src/pkg/manifests/system-bhyve-tests.mf new file mode 100644 index 0000000000..14586b5177 --- /dev/null +++ b/usr/src/pkg/manifests/system-bhyve-tests.mf @@ -0,0 +1,35 @@ +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright 2018 OmniOS Community Edition (OmniOSce) Association. 
+# + +set name=pkg.fmri value=pkg:/system/bhyve/tests@$(PKGVERS) +set name=pkg.description value="BSD hypervisor tests" +set name=pkg.summary value="BSD hypervisor tests" +set name=info.classification \ + value=org.opensolaris.category.2008:System/Virtualization +set name=variant.arch value=i386 +dir path=opt/bhyvetest +dir path=opt/bhyvetest/bin +dir path=opt/bhyvetest/tst +dir path=opt/bhyvetest/tst/mevent +file path=opt/bhyvetest/bin/bhyvetest mode=0555 +file path=opt/bhyvetest/tst/mevent/lists.delete.exe mode=0555 +file path=opt/bhyvetest/tst/mevent/read.disable.exe mode=0555 +file path=opt/bhyvetest/tst/mevent/read.pause.exe mode=0555 +file path=opt/bhyvetest/tst/mevent/read.requeue.exe mode=0555 +license lic_CDDL license=lic_CDDL diff --git a/usr/src/pkg/manifests/system-bhyve.mf b/usr/src/pkg/manifests/system-bhyve.mf new file mode 100644 index 0000000000..2a51d4fc22 --- /dev/null +++ b/usr/src/pkg/manifests/system-bhyve.mf @@ -0,0 +1,46 @@ +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright 2018 Joyent, Inc. +# Copyright 2019 OmniOS Community Edition (OmniOSce) Association. +# + +# +# The default for payload-bearing actions in this package is to appear in the +# global zone only. See the include file for greater detail, as well as +# information about overriding the defaults. +# + +set name=pkg.fmri value=pkg:/system/bhyve@$(PKGVERS) +set name=pkg.description value="BSD hypervisor" +set name=pkg.summary value="BSD hypervisor" +set name=info.classification \ + value=org.opensolaris.category.2008:System/Virtualization +set name=variant.arch value=i386 +dir path=kernel group=sys +dir path=usr group=sys +dir path=usr/kernel/drv group=sys +dir path=usr/kernel/drv/$(ARCH64) group=sys +dir path=usr/sbin +driver name=vmm +file path=usr/kernel/drv/$(ARCH64)/vmm +file path=usr/kernel/drv/vmm.conf +file path=usr/sbin/bhyve mode=0555 +file path=usr/sbin/bhyvectl mode=0555 +license lic_CDDL license=lic_CDDL +depend fmri=developer/acpi type=require +depend fmri=system/bhyve/firmware type=require +depend fmri=system/library/bhyve type=require diff --git a/usr/src/pkg/manifests/system-library-bhyve.mf b/usr/src/pkg/manifests/system-library-bhyve.mf new file mode 100644 index 0000000000..d9a15e1b37 --- /dev/null +++ b/usr/src/pkg/manifests/system-library-bhyve.mf @@ -0,0 +1,31 @@ +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright 2019 OmniOS Community Edition (OmniOSce) Association. 
+# + +set name=pkg.fmri value=pkg:/system/library/bhyve@$(PKGVERS) +set name=pkg.description value="BSD hypervisor (libraries)" +set name=pkg.summary value="BSD hypervisor (libraries)" +set name=info.classification \ + value=org.opensolaris.category.2008:System/Virtualization +set name=variant.arch value=i386 +dir path=lib group=bin +dir path=lib/$(ARCH64) group=bin +dir path=usr group=sys +dir path=usr/lib group=bin +file path=lib/$(ARCH64)/libvmmapi.so.1 +license lic_CDDL license=lic_CDDL diff --git a/usr/src/req.flg b/usr/src/req.flg index 9c992b1120..26415fa51f 100644 --- a/usr/src/req.flg +++ b/usr/src/req.flg @@ -33,3 +33,5 @@ echo_file usr/src/Makefile.master.64 echo_file usr/src/Makefile.msg.targ echo_file usr/src/Makefile.psm echo_file usr/src/Makefile.psm.targ + +find_files "s.*" usr/contrib/freebsd diff --git a/usr/src/tools/scripts/build_cscope.conf b/usr/src/tools/scripts/build_cscope.conf index 859b5137d6..298db1281b 100644 --- a/usr/src/tools/scripts/build_cscope.conf +++ b/usr/src/tools/scripts/build_cscope.conf @@ -22,8 +22,8 @@ # # Copyright 2005 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. +# Copyright 2018 Joyent, Inc. # -# ident "%Z%%M% %I% %E% SMI" # # This file configures the set of cross-references built by build_cscope. # The format is: @@ -35,6 +35,6 @@ # directories. # -complete -f . +complete "" . uts "" uts uts/sun4u uts/sun4v uts/i86pc psm "" psm/stand psm/stand/boot psm/stand/boot/sparcv9/sun4u psm/stand/boot/sparcv9/sun4v diff --git a/usr/src/tools/scripts/gensetdefs.pl b/usr/src/tools/scripts/gensetdefs.pl deleted file mode 100644 index 8ca5782feb..0000000000 --- a/usr/src/tools/scripts/gensetdefs.pl +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/perl -w -# -# COPYRIGHT 2013 Pluribus Networks Inc. -# -# All rights reserved. This copyright notice is Copyright Management -# Information under 17 USC 1202 and is included to protect this work and -# deter copyright infringement. Removal or alteration of this Copyright -# Management Information without the express written permission from -# Pluribus Networks Inc is prohibited, and any such unauthorized removal -# or alteration will be a violation of federal law. - -use strict; - -my @Sections = split(/\n/, `elfedit -r -e \'shdr:sh_name -osimple\' $ARGV[0] 2>&1`); - -foreach my $Section (@Sections) { - if ($Section =~ "^set_") { - print "\tfixing $Section\n"; - - chomp(my $SectionAddr = `elfedit -r -e \'shdr:sh_addr -onum $Section\' $ARGV[0] 2>&1`); - chomp(my $SectionSize = `elfedit -r -e \'shdr:sh_size -onum $Section\' $ARGV[0] 2>&1`); - my $SectionEnd = hex($SectionAddr) + hex($SectionSize); - - `elfedit -e \'sym:st_bind __start_$Section global\' $ARGV[0] 2>&1`; - `elfedit -e \'sym:st_value __start_$Section $SectionAddr\' $ARGV[0] 2>&1`; - `elfedit -e \'sym:st_shndx __start_$Section $Section\' $ARGV[0] 2>&1`; - `elfedit -e \'sym:st_bind __stop_$Section global\' $ARGV[0] 2>&1`; - `elfedit -e \'sym:st_value __stop_$Section $SectionEnd\' $ARGV[0] 2>&1`; - `elfedit -e \'sym:st_shndx __stop_$Section $Section\' $ARGV[0] 2>&1`; - } -} diff --git a/usr/src/uts/Makefile.targ b/usr/src/uts/Makefile.targ index c5c32caa19..c42f458948 100644 --- a/usr/src/uts/Makefile.targ +++ b/usr/src/uts/Makefile.targ @@ -23,6 +23,7 @@ # Copyright 2014 Garrett D'Amore # Copyright 2016 Hans Rosenfeld # Copyright (c) 2017 by Delphix. All rights reserved. +# Copyright 2019 Joyent, Inc. # # This Makefiles contains the common targets and definitions for # all kernels. 
It is to be included in the Makefiles for specific @@ -51,7 +52,7 @@ $(OBJECTS): $(INLINES) # Partially link .o files to generate the kmod. The fake dependency # on modstubs simplifies things... # -$(BINARY): $(OBJECTS) $(DTRACE_MAPFILE) +$(BINARY): $(OBJECTS) $(DTRACE_MAPFILE) $(MAPFILE) $(LD) -r $(LDFLAGS) -o $@ $(OBJECTS) $(CTFMERGE_UNIQUIFY_AGAINST_GENUNIX) $(POST_PROCESS) diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index dbed5ea9cc..63f314ca93 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -1924,9 +1924,9 @@ LINT_DEFS += -Dunix # It is a bug in the current compilation system that the assember # can't process the -Y I, flag. # -NATIVE_INC_PATH += $(INC_PATH) $(CCYFLAG)$(UTSBASE)/common -AS_INC_PATH += $(INC_PATH) -I$(UTSBASE)/common -INCLUDE_PATH += $(INC_PATH) $(CCYFLAG)$(UTSBASE)/common +NATIVE_INC_PATH += $(PRE_INC_PATH) $(INC_PATH) $(CCYFLAG)$(UTSBASE)/common +AS_INC_PATH += $(PRE_INC_PATH) $(INC_PATH) -I$(UTSBASE)/common +INCLUDE_PATH += $(PRE_INC_PATH) $(INC_PATH) $(CCYFLAG)$(UTSBASE)/common PCIEB_OBJS += pcieb.o diff --git a/usr/src/uts/i86pc/Makefile.files b/usr/src/uts/i86pc/Makefile.files index 879b8d86cb..ca4ae0cd65 100644 --- a/usr/src/uts/i86pc/Makefile.files +++ b/usr/src/uts/i86pc/Makefile.files @@ -23,8 +23,8 @@ # Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. # # Copyright (c) 2010, Intel Corporation. -# Copyright 2018 Joyent, Inc. # Copyright 2019 OmniOS Community Edition (OmniOSce) Association. +# Copyright 2019 Joyent, Inc. # # This Makefile defines file modules in the directory uts/i86pc # and its children. These are the source files which are i86pc @@ -237,6 +237,46 @@ UPPC_OBJS += uppc.o psm_common.o XSVC_OBJS += xsvc.o AMD_IOMMU_OBJS += amd_iommu.o amd_iommu_impl.o amd_iommu_acpi.o \ amd_iommu_cmd.o amd_iommu_log.o amd_iommu_page_tables.o +VMM_OBJS += vmm.o \ + vmm_sol_dev.o \ + vmm_host.o \ + vmm_instruction_emul.o \ + vmm_ioport.o \ + vmm_lapic.o \ + vmm_mem.o \ + vmm_stat.o \ + vmm_util.o \ + x86.o \ + vdev.o \ + vatpic.o \ + vatpit.o \ + vhpet.o \ + vioapic.o \ + vlapic.o \ + vrtc.o \ + vpmtmr.o \ + ept.o \ + vmcs.o \ + vmx_msr.o \ + vmx.o \ + vmx_support.o \ + svm.o \ + svm_msr.o \ + npt.o \ + vmcb.o \ + svm_support.o \ + amdv.o \ + sol_iommu.o \ + sol_ppt.o \ + gipt.o \ + vmm_sol_vm.o \ + vmm_sol_glue.o \ + vmm_sol_ept.o \ + vmm_sol_rvi.o \ + vmm_support.o \ + vmm_zsd.o + +VIONA_OBJS += viona.o # # Build up defines and paths. 
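[Editor's illustration] The retval/fault contract documented in the vmmapi.h hunk above is easiest to read from the consumer side. Below is a minimal sketch, assuming only the declarations added by this patch plus <sys/param.h> for nitems(); the helper name read_guest(), the header set, and the 8-entry iovec are illustrative, not part of the change:

    #include <sys/param.h>
    #include <sys/mman.h>
    #include <sys/uio.h>
    #include <machine/vmm.h>
    #include <vmmapi.h>

    /*
     * Copy 'len' bytes from guest linear address 'gla' into 'buf'.
     * Per the table in vmmapi.h: returns 0 with *fault == 0 on success,
     * 0 with *fault != 0 when an exception was injected into the guest
     * (the caller simply resumes the vCPU), or EFAULT when the
     * translation itself failed.
     */
    static int
    read_guest(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
        uint64_t gla, void *buf, size_t len, int *fault)
    {
            struct iovec iov[8];
            int error;

            error = vm_copy_setup(ctx, vcpu, paging, gla, len, PROT_READ,
                iov, nitems(iov), fault);
            if (error != 0 || *fault != 0)
                    return (error);

            vm_copyin(ctx, vcpu, iov, buf, len);
            return (0);
    }

Note how the two failure modes stay distinct: a fault is a normal guest-visible event and is not an error to the caller, which is why vm_copy_setup() now reports it through the out parameter instead of overloading the return value as the old gla2gpa() wrapper did.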
diff --git a/usr/src/uts/i86pc/Makefile.i86pc b/usr/src/uts/i86pc/Makefile.i86pc index f5021ec738..b66b0ca2da 100644 --- a/usr/src/uts/i86pc/Makefile.i86pc +++ b/usr/src/uts/i86pc/Makefile.i86pc @@ -246,6 +246,7 @@ DRV_KMODS += dr DRV_KMODS += ioat DRV_KMODS += fipe DRV_KMODS += imc imcstub +DRV_KMODS += vmm DRV_KMODS += cpudrv diff --git a/usr/src/uts/i86pc/Makefile.rules b/usr/src/uts/i86pc/Makefile.rules index e4f2fee0a0..3d3c8131c1 100644 --- a/usr/src/uts/i86pc/Makefile.rules +++ b/usr/src/uts/i86pc/Makefile.rules @@ -225,6 +225,35 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/dboot/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/vmm/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/vmm/amd/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/vmm/intel/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/vmm/io/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/vmm/%.s + $(COMPILE.s) -o $@ $< + +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/vmm/intel/%.s + $(COMPILE.s) -o $@ $< + +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/vmm/amd/%.s + $(COMPILE.s) -o $@ $< + +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/viona/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + # # dboot stuff is always 32 bit, linked to run with phys_addr == virt_addr # diff --git a/usr/src/uts/i86pc/io/viona/viona.c b/usr/src/uts/i86pc/io/viona/viona.c index 40bdd80a6e..2371a2f3ae 100644 --- a/usr/src/uts/i86pc/io/viona/viona.c +++ b/usr/src/uts/i86pc/io/viona/viona.c @@ -34,6 +34,7 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2015 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #include @@ -194,8 +195,8 @@ static void *viona_state; static dev_info_t *viona_dip; static id_space_t *viona_minor_ids; /* - * copy tx mbufs from virtio ring to avoid necessitating a wait - * for packet transmission to free resources. + * copy tx mbufs from virtio ring to avoid necessitating a wait for packet + * transmission to free resources. */ static boolean_t copy_tx_mblks = B_TRUE; @@ -914,7 +915,7 @@ viona_ioc_tx_intr_clear(viona_link_t *link) static int vq_popchain(viona_link_t *link, viona_vring_hqueue_t *hq, struct iovec *iov, -int n_iov, uint16_t *cookie) + int n_iov, uint16_t *cookie) { int i; int ndesc, nindir; @@ -1139,10 +1140,12 @@ viona_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp, size_t mblklen; int n, i = 0; uint16_t cookie; - struct virtio_net_hdr *vrx; - struct virtio_net_mrgrxhdr *vmrgrx; + struct virtio_net_hdr *vrx = NULL; + struct virtio_net_mrgrxhdr *vmrgrx = NULL; +#if notyet mblk_t *ml; - caddr_t buf; +#endif + caddr_t buf = NULL; int total_len = 0; int copied_buf = 0; int num_bufs = 0; @@ -1312,8 +1315,10 @@ viona_desb_free(viona_desb_t *dp) { viona_link_t *link; viona_vring_hqueue_t *hq; +#if notyet struct virtio_used *vu; int uidx; +#endif uint_t ref; ref = atomic_dec_uint_nv(&dp->d_ref); diff --git a/usr/src/uts/i86pc/io/vmm/README.sync b/usr/src/uts/i86pc/io/vmm/README.sync new file mode 100644 index 0000000000..1cddfd829e --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/README.sync @@ -0,0 +1,18 @@ +The bhyve kernel module and its associated userland consumers have been updated +to the latest upstream FreeBSD sources as of: + + +commit 3b9cb80b242682690203709aaff4eafae41c138f +Author: jhb +Date: Mon Jun 3 23:17:35 2019 +0000 + + Emulate the AMD MSR_LS_CFG MSR used for various Ryzen errata. + + Writes are ignored and reads always return zero. 
+ + Submitted by: José Albornoz (write-only version) + Reviewed by: Patrick Mooney, cem + MFC after: 2 weeks + Differential Revision: https://reviews.freebsd.org/D19506 + +Which corresponds to SVN revision: 348592 diff --git a/usr/src/uts/i86pc/io/vmm/amd/amdv.c b/usr/src/uts/i86pc/io/vmm/amd/amdv.c index 6b62daae6c..c34a1e897b 100644 --- a/usr/src/uts/i86pc/io/vmm/amd/amdv.c +++ b/usr/src/uts/i86pc/io/vmm/amd/amdv.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/amd/amdv.c 245678 2013-01-20 03:42:49Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,141 +38,18 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2014 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #include -__FBSDID("$FreeBSD: head/sys/amd64/vmm/amd/amdv.c 245678 2013-01-20 03:42:49Z neel $"); +__FBSDID("$FreeBSD$"); #include #include #include -#include #include -#ifdef __FreeBSD__ #include "io/iommu.h" -#endif - -static int -amdv_init(void) -{ - - printf("amdv_init: not implemented\n"); - return (ENXIO); -} - -static int -amdv_cleanup(void) -{ - - printf("amdv_cleanup: not implemented\n"); - return (ENXIO); -} - -static void * -amdv_vminit(struct vm *vm) -{ - - printf("amdv_vminit: not implemented\n"); - return (NULL); -} - -static int -amdv_vmrun(void *arg, int vcpu, register_t rip) -{ - - printf("amdv_vmrun: not implemented\n"); - return (ENXIO); -} - -static void -amdv_vmcleanup(void *arg) -{ - - printf("amdv_vmcleanup: not implemented\n"); - return; -} - -static int -amdv_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, - vm_memattr_t attr, int prot, boolean_t spok) -{ - - printf("amdv_vmmmap_set: not implemented\n"); - return (EINVAL); -} - -static vm_paddr_t -amdv_vmmmap_get(void *arg, vm_paddr_t gpa) -{ - - printf("amdv_vmmmap_get: not implemented\n"); - return (EINVAL); -} - -static int -amdv_getreg(void *arg, int vcpu, int regnum, uint64_t *retval) -{ - - printf("amdv_getreg: not implemented\n"); - return (EINVAL); -} - -static int -amdv_setreg(void *arg, int vcpu, int regnum, uint64_t val) -{ - - printf("amdv_setreg: not implemented\n"); - return (EINVAL); -} - -static int -amdv_getdesc(void *vmi, int vcpu, int num, struct seg_desc *desc) -{ - - printf("amdv_get_desc: not implemented\n"); - return (EINVAL); -} - -static int -amdv_setdesc(void *vmi, int vcpu, int num, struct seg_desc *desc) -{ - - printf("amdv_get_desc: not implemented\n"); - return (EINVAL); -} - -static int -amdv_getcap(void *arg, int vcpu, int type, int *retval) -{ - - printf("amdv_getcap: not implemented\n"); - return (EINVAL); -} - -static int -amdv_setcap(void *arg, int vcpu, int type, int val) -{ - - printf("amdv_setcap: not implemented\n"); - return (EINVAL); -} - -struct vmm_ops vmm_ops_amd = { - amdv_init, - amdv_cleanup, - amdv_vminit, - amdv_vmrun, - amdv_vmcleanup, - amdv_vmmmap_set, - amdv_vmmmap_get, - amdv_getreg, - amdv_setreg, - amdv_getdesc, - amdv_setdesc, - amdv_getcap, - amdv_setcap -}; static int amd_iommu_init(void) @@ -234,14 +113,14 @@ amd_iommu_remove_mapping(void *domain, vm_paddr_t gpa, uint64_t len) } static void -amd_iommu_add_device(void *domain, int bus, int slot, int func) +amd_iommu_add_device(void *domain, uint16_t rid) { printf("amd_iommu_add_device: not implemented\n"); } static void 
-amd_iommu_remove_device(void *domain, int bus, int slot, int func) +amd_iommu_remove_device(void *domain, uint16_t rid) { printf("amd_iommu_remove_device: not implemented\n"); @@ -254,7 +133,6 @@ amd_iommu_invalidate_tlb(void *domain) printf("amd_iommu_invalidate_tlb: not implemented\n"); } -#ifdef __FreeBSD__ struct iommu_ops iommu_ops_amd = { amd_iommu_init, amd_iommu_cleanup, @@ -268,4 +146,3 @@ struct iommu_ops iommu_ops_amd = { amd_iommu_remove_device, amd_iommu_invalidate_tlb, }; -#endif diff --git a/usr/src/uts/i86pc/io/vmm/amd/amdvi_hw.c b/usr/src/uts/i86pc/io/vmm/amd/amdvi_hw.c new file mode 100644 index 0000000000..f6b6e60363 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/amdvi_hw.c @@ -0,0 +1,1461 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016, Anish Gupta (anish@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include "pcib_if.h" + +#include "io/iommu.h" +#include "amdvi_priv.h" + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, amdvi, CTLFLAG_RW, NULL, NULL); + +#define MOD_INC(a, s, m) (((a) + (s)) % ((m) * (s))) +#define MOD_DEC(a, s, m) (((a) - (s)) % ((m) * (s))) + +/* Print RID or device ID in PCI string format. */ +#define RID2PCI_STR(d) PCI_RID2BUS(d), PCI_RID2SLOT(d), PCI_RID2FUNC(d) + +static void amdvi_dump_cmds(struct amdvi_softc *softc); +static void amdvi_print_dev_cap(struct amdvi_softc *softc); + +MALLOC_DEFINE(M_AMDVI, "amdvi", "amdvi"); + +extern device_t *ivhd_devs; + +extern int ivhd_count; +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, count, CTLFLAG_RDTUN, &ivhd_count, + 0, NULL); + +static int amdvi_enable_user = 0; +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, enable, CTLFLAG_RDTUN, + &amdvi_enable_user, 0, NULL); +TUNABLE_INT("hw.vmm.amdvi_enable", &amdvi_enable_user); + +#ifdef AMDVI_ATS_ENABLE +/* XXX: ATS is not tested. 
*/ +static int amdvi_enable_iotlb = 1; +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, iotlb_enabled, CTLFLAG_RDTUN, + &amdvi_enable_iotlb, 0, NULL); +TUNABLE_INT("hw.vmm.enable_iotlb", &amdvi_enable_iotlb); +#endif + +static int amdvi_host_ptp = 1; /* Use page tables for host. */ +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, host_ptp, CTLFLAG_RDTUN, + &amdvi_host_ptp, 0, NULL); +TUNABLE_INT("hw.vmm.amdvi.host_ptp", &amdvi_host_ptp); + +/* Page table level used <= supported by h/w[v1=7]. */ +static int amdvi_ptp_level = 4; +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, ptp_level, CTLFLAG_RDTUN, + &amdvi_ptp_level, 0, NULL); +TUNABLE_INT("hw.vmm.amdvi.ptp_level", &amdvi_ptp_level); + +/* Disable fault event reporting. */ +static int amdvi_disable_io_fault = 0; +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, disable_io_fault, CTLFLAG_RDTUN, + &amdvi_disable_io_fault, 0, NULL); +TUNABLE_INT("hw.vmm.amdvi.disable_io_fault", &amdvi_disable_io_fault); + +static uint32_t amdvi_dom_id = 0; /* 0 is reserved for host. */ +SYSCTL_UINT(_hw_vmm_amdvi, OID_AUTO, domain_id, CTLFLAG_RD, + &amdvi_dom_id, 0, NULL); +/* + * Device table entry. + * Bus(256) x Dev(32) x Fun(8) x DTE(256 bits or 32 bytes). + * = 256 * 2 * PAGE_SIZE. + */ +static struct amdvi_dte amdvi_dte[PCI_NUM_DEV_MAX] __aligned(PAGE_SIZE); +CTASSERT(PCI_NUM_DEV_MAX == 0x10000); +CTASSERT(sizeof(amdvi_dte) == 0x200000); + +static SLIST_HEAD (, amdvi_domain) dom_head; + +static inline uint32_t +amdvi_pci_read(struct amdvi_softc *softc, int off) +{ + + return (pci_cfgregread(PCI_RID2BUS(softc->pci_rid), + PCI_RID2SLOT(softc->pci_rid), PCI_RID2FUNC(softc->pci_rid), + off, 4)); +} + +#ifdef AMDVI_ATS_ENABLE +/* XXX: Should be in pci.c */ +/* + * Check if device has ATS capability and its enabled. + * If ATS is absent or disabled, return (-1), otherwise ATS + * queue length. + */ +static int +amdvi_find_ats_qlen(uint16_t devid) +{ + device_t dev; + uint32_t off, cap; + int qlen = -1; + + dev = pci_find_bsf(PCI_RID2BUS(devid), PCI_RID2SLOT(devid), + PCI_RID2FUNC(devid)); + + if (!dev) { + return (-1); + } +#define PCIM_ATS_EN BIT(31) + + if (pci_find_extcap(dev, PCIZ_ATS, &off) == 0) { + cap = pci_read_config(dev, off + 4, 4); + qlen = (cap & 0x1F); + qlen = qlen ? qlen : 32; + printf("AMD-Vi: PCI device %d.%d.%d ATS %s qlen=%d\n", + RID2PCI_STR(devid), + (cap & PCIM_ATS_EN) ? "enabled" : "Disabled", + qlen); + qlen = (cap & PCIM_ATS_EN) ? qlen : -1; + } + + return (qlen); +} + +/* + * Check if an endpoint device support device IOTLB or ATS. + */ +static inline bool +amdvi_dev_support_iotlb(struct amdvi_softc *softc, uint16_t devid) +{ + struct ivhd_dev_cfg *cfg; + int qlen, i; + bool pci_ats, ivhd_ats; + + qlen = amdvi_find_ats_qlen(devid); + if (qlen < 0) + return (false); + + KASSERT(softc, ("softc is NULL")); + cfg = softc->dev_cfg; + + ivhd_ats = false; + for (i = 0; i < softc->dev_cfg_cnt; i++) { + if ((cfg->start_id <= devid) && (cfg->end_id >= devid)) { + ivhd_ats = cfg->enable_ats; + break; + } + cfg++; + } + + pci_ats = (qlen < 0) ? false : true; + if (pci_ats != ivhd_ats) + device_printf(softc->dev, + "BIOS bug: mismatch in ATS setting for %d.%d.%d," + "ATS inv qlen = %d\n", RID2PCI_STR(devid), qlen); + + /* Ignore IVRS setting and respect PCI setting. */ + return (pci_ats); +} +#endif + +/* Enable IOTLB support for IOMMU if its supported. */ +static inline void +amdvi_hw_enable_iotlb(struct amdvi_softc *softc) +{ +#ifndef AMDVI_ATS_ENABLE + softc->iotlb = false; +#else + bool supported; + + supported = (softc->ivhd_flag & IVHD_FLAG_IOTLB) ? 
true : false; + + if (softc->pci_cap & AMDVI_PCI_CAP_IOTLB) { + if (!supported) + device_printf(softc->dev, "IOTLB disabled by BIOS.\n"); + + if (supported && !amdvi_enable_iotlb) { + device_printf(softc->dev, "IOTLB disabled by user.\n"); + supported = false; + } + } else + supported = false; + + softc->iotlb = supported; + +#endif +} + +static int +amdvi_init_cmd(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl = softc->ctrl; + + ctrl->cmd.len = 8; /* Use 256 command buffer entries. */ + softc->cmd_max = 1 << ctrl->cmd.len; + + softc->cmd = malloc(sizeof(struct amdvi_cmd) * + softc->cmd_max, M_AMDVI, M_WAITOK | M_ZERO); + + if ((uintptr_t)softc->cmd & PAGE_MASK) + panic("AMDVi: Command buffer not aligned on page boundary."); + + ctrl->cmd.base = vtophys(softc->cmd) / PAGE_SIZE; + /* + * XXX: Reset the h/w pointers in case IOMMU is restarting, + * h/w doesn't clear these pointers based on empirical data. + */ + ctrl->cmd_tail = 0; + ctrl->cmd_head = 0; + + return (0); +} + +/* + * Note: Update tail pointer after we have written the command since tail + * pointer update cause h/w to execute new commands, see section 3.3 + * of AMD IOMMU spec ver 2.0. + */ +/* Get the command tail pointer w/o updating it. */ +static struct amdvi_cmd * +amdvi_get_cmd_tail(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + struct amdvi_cmd *tail; + + KASSERT(softc, ("softc is NULL")); + KASSERT(softc->cmd != NULL, ("cmd is NULL")); + + ctrl = softc->ctrl; + KASSERT(ctrl != NULL, ("ctrl is NULL")); + + tail = (struct amdvi_cmd *)((uint8_t *)softc->cmd + + ctrl->cmd_tail); + + return (tail); +} + +/* + * Update the command tail pointer which will start command execution. + */ +static void +amdvi_update_cmd_tail(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + int size; + + size = sizeof(struct amdvi_cmd); + KASSERT(softc->cmd != NULL, ("cmd is NULL")); + + ctrl = softc->ctrl; + KASSERT(ctrl != NULL, ("ctrl is NULL")); + + ctrl->cmd_tail = MOD_INC(ctrl->cmd_tail, size, softc->cmd_max); + softc->total_cmd++; + +#ifdef AMDVI_DEBUG_CMD + device_printf(softc->dev, "cmd_tail: %s Tail:0x%x, Head:0x%x.\n", + ctrl->cmd_tail, + ctrl->cmd_head); +#endif + +} + +/* + * Various commands supported by IOMMU. + */ + +/* Completion wait command. */ +static void +amdvi_cmd_cmp(struct amdvi_softc *softc, const uint64_t data) +{ + struct amdvi_cmd *cmd; + uint64_t pa; + + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + + pa = vtophys(&softc->cmp_data); + cmd->opcode = AMDVI_CMP_WAIT_OPCODE; + cmd->word0 = (pa & 0xFFFFFFF8) | + (AMDVI_CMP_WAIT_STORE); + //(AMDVI_CMP_WAIT_FLUSH | AMDVI_CMP_WAIT_STORE); + cmd->word1 = (pa >> 32) & 0xFFFFF; + cmd->addr = data; + + amdvi_update_cmd_tail(softc); +} + +/* Invalidate device table entry. */ +static void +amdvi_cmd_inv_dte(struct amdvi_softc *softc, uint16_t devid) +{ + struct amdvi_cmd *cmd; + + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + cmd->opcode = AMDVI_INVD_DTE_OPCODE; + cmd->word0 = devid; + amdvi_update_cmd_tail(softc); +#ifdef AMDVI_DEBUG_CMD + device_printf(softc->dev, "Invalidated DTE:0x%x\n", devid); +#endif +} + +/* Invalidate IOMMU page, use for invalidation of domain. 
*/ +static void +amdvi_cmd_inv_iommu_pages(struct amdvi_softc *softc, uint16_t domain_id, + uint64_t addr, bool guest_nested, + bool pde, bool page) +{ + struct amdvi_cmd *cmd; + + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + + + cmd->opcode = AMDVI_INVD_PAGE_OPCODE; + cmd->word1 = domain_id; + /* + * Invalidate all addresses for this domain. + */ + cmd->addr = addr; + cmd->addr |= pde ? AMDVI_INVD_PAGE_PDE : 0; + cmd->addr |= page ? AMDVI_INVD_PAGE_S : 0; + + amdvi_update_cmd_tail(softc); +} + +#ifdef AMDVI_ATS_ENABLE +/* Invalidate device IOTLB. */ +static void +amdvi_cmd_inv_iotlb(struct amdvi_softc *softc, uint16_t devid) +{ + struct amdvi_cmd *cmd; + int qlen; + + if (!softc->iotlb) + return; + + qlen = amdvi_find_ats_qlen(devid); + if (qlen < 0) { + panic("AMDVI: Invalid ATS qlen(%d) for device %d.%d.%d\n", + qlen, RID2PCI_STR(devid)); + } + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + +#ifdef AMDVI_DEBUG_CMD + device_printf(softc->dev, "Invalidate IOTLB devID 0x%x" + " Qlen:%d\n", devid, qlen); +#endif + cmd->opcode = AMDVI_INVD_IOTLB_OPCODE; + cmd->word0 = devid; + cmd->word1 = qlen; + cmd->addr = AMDVI_INVD_IOTLB_ALL_ADDR | + AMDVI_INVD_IOTLB_S; + amdvi_update_cmd_tail(softc); +} +#endif + +#ifdef notyet /* For Interrupt Remap. */ +static void +amdvi_cmd_inv_intr_map(struct amdvi_softc *softc, + uint16_t devid) +{ + struct amdvi_cmd *cmd; + + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + cmd->opcode = AMDVI_INVD_INTR_OPCODE; + cmd->word0 = devid; + amdvi_update_cmd_tail(softc); +#ifdef AMDVI_DEBUG_CMD + device_printf(softc->dev, "Invalidate INTR map of devID 0x%x\n", devid); +#endif +} +#endif + +/* Invalidate domain using INVALIDATE_IOMMU_PAGES command. */ +static void +amdvi_inv_domain(struct amdvi_softc *softc, uint16_t domain_id) +{ + struct amdvi_cmd *cmd; + + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + + /* + * See section 3.3.3 of IOMMU spec rev 2.0, software note + * for invalidating domain. + */ + amdvi_cmd_inv_iommu_pages(softc, domain_id, AMDVI_INVD_PAGE_ALL_ADDR, + false, true, true); + +#ifdef AMDVI_DEBUG_CMD + device_printf(softc->dev, "Invalidate domain:0x%x\n", domain_id); + +#endif +} + +static bool +amdvi_cmp_wait(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + const uint64_t VERIFY = 0xA5A5; + volatile uint64_t *read; + int i; + bool status; + + ctrl = softc->ctrl; + read = &softc->cmp_data; + *read = 0; + amdvi_cmd_cmp(softc, VERIFY); + /* Wait for h/w to update completion data. */ + for (i = 0; i < 100 && (*read != VERIFY); i++) { + DELAY(1000); /* 1 ms */ + } + status = (VERIFY == softc->cmp_data) ? true : false; + +#ifdef AMDVI_DEBUG_CMD + if (status) + device_printf(softc->dev, "CMD completion DONE Tail:0x%x, " + "Head:0x%x, loop:%d.\n", ctrl->cmd_tail, + ctrl->cmd_head, loop); +#endif + return (status); +} + +static void +amdvi_wait(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + int i; + + KASSERT(softc, ("softc is NULL")); + + ctrl = softc->ctrl; + KASSERT(ctrl != NULL, ("ctrl is NULL")); + /* Don't wait if h/w is not enabled. 
*/ + if ((ctrl->control & AMDVI_CTRL_EN) == 0) + return; + + for (i = 0; i < 10; i++) { + if (amdvi_cmp_wait(softc)) + return; + } + + device_printf(softc->dev, "Error: completion failed" + " tail:0x%x, head:0x%x.\n", + ctrl->cmd_tail, ctrl->cmd_head); + amdvi_dump_cmds(softc); +} + +static void +amdvi_dump_cmds(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + struct amdvi_cmd *cmd; + int off, i; + + ctrl = softc->ctrl; + device_printf(softc->dev, "Dump all the commands:\n"); + /* + * If h/w is stuck in completion, it is the previous command, + * start dumping from previous command onward. + */ + off = MOD_DEC(ctrl->cmd_head, sizeof(struct amdvi_cmd), + softc->cmd_max); + for (i = 0; off != ctrl->cmd_tail && + i < softc->cmd_max; i++) { + cmd = (struct amdvi_cmd *)((uint8_t *)softc->cmd + off); + printf(" [CMD%d, off:0x%x] opcode= 0x%x 0x%x" + " 0x%x 0x%lx\n", i, off, cmd->opcode, + cmd->word0, cmd->word1, cmd->addr); + off = (off + sizeof(struct amdvi_cmd)) % + (softc->cmd_max * sizeof(struct amdvi_cmd)); + } +} + +static int +amdvi_init_event(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + + ctrl = softc->ctrl; + ctrl->event.len = 8; + softc->event_max = 1 << ctrl->event.len; + softc->event = malloc(sizeof(struct amdvi_event) * + softc->event_max, M_AMDVI, M_WAITOK | M_ZERO); + if ((uintptr_t)softc->event & PAGE_MASK) { + device_printf(softc->dev, "Event buffer not aligned on page."); + return (false); + } + ctrl->event.base = vtophys(softc->event) / PAGE_SIZE; + + /* Reset the pointers. */ + ctrl->evt_head = 0; + ctrl->evt_tail = 0; + + return (0); +} + +static inline void +amdvi_decode_evt_flag(uint16_t flag) +{ + + flag &= AMDVI_EVENT_FLAG_MASK; + printf(" 0x%b]\n", flag, + "\020" + "\001GN" + "\002NX" + "\003US" + "\004I" + "\005PR" + "\006RW" + "\007PE" + "\010RZ" + "\011TR" + ); +} + +/* See section 2.5.4 of AMD IOMMU spec ver 2.62.*/ +static inline void +amdvi_decode_evt_flag_type(uint8_t type) +{ + + switch (AMDVI_EVENT_FLAG_TYPE(type)) { + case 0: + printf("RSVD\n"); + break; + case 1: + printf("Master Abort\n"); + break; + case 2: + printf("Target Abort\n"); + break; + case 3: + printf("Data Err\n"); + break; + default: + break; + } +} + +static void +amdvi_decode_inv_dte_evt(uint16_t devid, uint16_t domid, uint64_t addr, + uint16_t flag) +{ + + printf("\t[IO_PAGE_FAULT EVT: devId:0x%x DomId:0x%x" + " Addr:0x%lx", + devid, domid, addr); + amdvi_decode_evt_flag(flag); +} + +static void +amdvi_decode_pf_evt(uint16_t devid, uint16_t domid, uint64_t addr, + uint16_t flag) +{ + + printf("\t[IO_PAGE_FAULT EVT: devId:0x%x DomId:0x%x" + " Addr:0x%lx", + devid, domid, addr); + amdvi_decode_evt_flag(flag); +} + +static void +amdvi_decode_dte_hwerr_evt(uint16_t devid, uint16_t domid, + uint64_t addr, uint16_t flag) +{ + + printf("\t[DEV_TAB_HW_ERR EVT: devId:0x%x DomId:0x%x" + " Addr:0x%lx", devid, domid, addr); + amdvi_decode_evt_flag(flag); + amdvi_decode_evt_flag_type(flag); +} + +static void +amdvi_decode_page_hwerr_evt(uint16_t devid, uint16_t domid, uint64_t addr, + uint16_t flag) +{ + + printf("\t[PAGE_TAB_HW_ERR EVT: devId:0x%x DomId:0x%x" + " Addr:0x%lx", devid, domid, addr); + amdvi_decode_evt_flag(flag); + amdvi_decode_evt_flag_type(AMDVI_EVENT_FLAG_TYPE(flag)); +} + +static void +amdvi_decode_evt(struct amdvi_event *evt) +{ + struct amdvi_cmd *cmd; + + switch (evt->opcode) { + case AMDVI_EVENT_INVALID_DTE: + amdvi_decode_inv_dte_evt(evt->devid, evt->pasid_domid, + evt->addr, evt->flag); + break; + + case AMDVI_EVENT_PFAULT: + 
amdvi_decode_pf_evt(evt->devid, evt->pasid_domid, + evt->addr, evt->flag); + break; + + case AMDVI_EVENT_DTE_HW_ERROR: + amdvi_decode_dte_hwerr_evt(evt->devid, evt->pasid_domid, + evt->addr, evt->flag); + break; + + case AMDVI_EVENT_PAGE_HW_ERROR: + amdvi_decode_page_hwerr_evt(evt->devid, evt->pasid_domid, + evt->addr, evt->flag); + break; + + case AMDVI_EVENT_ILLEGAL_CMD: + /* FALL THROUGH */ + case AMDVI_EVENT_CMD_HW_ERROR: + printf("\t[%s EVT]\n", (evt->opcode == AMDVI_EVENT_ILLEGAL_CMD) ? + "ILLEGAL CMD" : "CMD HW ERR"); + cmd = (struct amdvi_cmd *)PHYS_TO_DMAP(evt->addr); + printf("\tCMD opcode= 0x%x 0x%x 0x%x 0x%lx\n", + cmd->opcode, cmd->word0, cmd->word1, cmd->addr); + break; + + case AMDVI_EVENT_IOTLB_TIMEOUT: + printf("\t[IOTLB_INV_TIMEOUT devid:0x%x addr:0x%lx]\n", + evt->devid, evt->addr); + break; + + case AMDVI_EVENT_INVALID_DTE_REQ: + printf("\t[INV_DTE devid:0x%x addr:0x%lx type:0x%x tr:%d]\n", + evt->devid, evt->addr, evt->flag >> 9, + (evt->flag >> 8) & 1); + break; + + case AMDVI_EVENT_INVALID_PPR_REQ: + case AMDVI_EVENT_COUNTER_ZERO: + printf("AMD-Vi: v2 events.\n"); + break; + + default: + printf("Unsupported AMD-Vi event:%d\n", evt->opcode); + } +} + +static void +amdvi_print_events(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + struct amdvi_event *event; + int i, size; + + ctrl = softc->ctrl; + size = sizeof(struct amdvi_event); + for (i = 0; i < softc->event_max; i++) { + event = &softc->event[ctrl->evt_head / size]; + if (!event->opcode) + break; + device_printf(softc->dev, "\t[Event%d: Head:0x%x Tail:0x%x]\n", + i, ctrl->evt_head, ctrl->evt_tail); + amdvi_decode_evt(event); + ctrl->evt_head = MOD_INC(ctrl->evt_head, size, + softc->event_max); + } +} + +static int +amdvi_init_dte(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + + ctrl = softc->ctrl; + ctrl->dte.base = vtophys(amdvi_dte) / PAGE_SIZE; + ctrl->dte.size = 0x1FF; /* 2MB device table. */ + + return (0); +} + +/* + * Not all capabilities of IOMMU are available in ACPI IVHD flag + * or EFR entry, read directly from device. + */ +static int +amdvi_print_pci_cap(device_t dev) +{ + struct amdvi_softc *softc; + uint32_t off, cap; + + + softc = device_get_softc(dev); + off = softc->cap_off; + + /* + * Section 3.7.1 of IOMMU sepc rev 2.0. + * Read capability from device. + */ + cap = amdvi_pci_read(softc, off); + + /* Make sure capability type[18:16] is 3. 
*/ + KASSERT((((cap >> 16) & 0x7) == 0x3), + ("Not a IOMMU capability 0x%x@0x%x", cap, off)); + + softc->pci_cap = cap >> 24; + device_printf(softc->dev, "PCI cap 0x%x@0x%x feature:%b\n", + cap, off, softc->pci_cap, + "\20\1IOTLB\2HT\3NPCache\4EFR\5CapExt"); + + return (0); +} + +static void +amdvi_event_intr(void *arg) +{ + struct amdvi_softc *softc; + struct amdvi_ctrl *ctrl; + + softc = (struct amdvi_softc *)arg; + ctrl = softc->ctrl; + device_printf(softc->dev, "EVT INTR %ld Status:0x%x" + " EVT Head:0x%x Tail:0x%x]\n", softc->event_intr_cnt++, + ctrl->status, ctrl->evt_head, ctrl->evt_tail); + printf(" [CMD Total 0x%lx] Tail:0x%x, Head:0x%x.\n", + softc->total_cmd, ctrl->cmd_tail, ctrl->cmd_head); + + amdvi_print_events(softc); + ctrl->status &= AMDVI_STATUS_EV_OF | AMDVI_STATUS_EV_INTR; +} + +static void +amdvi_free_evt_intr_res(device_t dev) +{ + + struct amdvi_softc *softc; + + softc = device_get_softc(dev); + if (softc->event_tag != NULL) { + bus_teardown_intr(dev, softc->event_res, softc->event_tag); + } + if (softc->event_res != NULL) { + bus_release_resource(dev, SYS_RES_IRQ, softc->event_rid, + softc->event_res); + } + bus_delete_resource(dev, SYS_RES_IRQ, softc->event_rid); + PCIB_RELEASE_MSI(device_get_parent(device_get_parent(dev)), + dev, 1, &softc->event_irq); +} + +static bool +amdvi_alloc_intr_resources(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + device_t dev, pcib; + device_t mmio_dev; + uint64_t msi_addr; + uint32_t msi_data; + int err; + + dev = softc->dev; + pcib = device_get_parent(device_get_parent(dev)); + mmio_dev = pci_find_bsf(PCI_RID2BUS(softc->pci_rid), + PCI_RID2SLOT(softc->pci_rid), PCI_RID2FUNC(softc->pci_rid)); + if (device_is_attached(mmio_dev)) { + device_printf(dev, + "warning: IOMMU device is claimed by another driver %s\n", + device_get_driver(mmio_dev)->name); + } + + softc->event_irq = -1; + softc->event_rid = 0; + + /* + * Section 3.7.1 of IOMMU rev 2.0. With MSI, there is only one + * interrupt. XXX: Enable MSI/X support. + */ + err = PCIB_ALLOC_MSI(pcib, dev, 1, 1, &softc->event_irq); + if (err) { + device_printf(dev, + "Couldn't find event MSI IRQ resource.\n"); + return (ENOENT); + } + + err = bus_set_resource(dev, SYS_RES_IRQ, softc->event_rid, + softc->event_irq, 1); + if (err) { + device_printf(dev, "Couldn't set event MSI resource.\n"); + return (ENXIO); + } + + softc->event_res = bus_alloc_resource_any(dev, SYS_RES_IRQ, + &softc->event_rid, RF_ACTIVE); + if (!softc->event_res) { + device_printf(dev, + "Unable to allocate event INTR resource.\n"); + return (ENOMEM); + } + + if (bus_setup_intr(dev, softc->event_res, + INTR_TYPE_MISC | INTR_MPSAFE, NULL, amdvi_event_intr, + softc, &softc->event_tag)) { + device_printf(dev, "Fail to setup event intr\n"); + bus_release_resource(softc->dev, SYS_RES_IRQ, + softc->event_rid, softc->event_res); + softc->event_res = NULL; + return (ENXIO); + } + + bus_describe_intr(dev, softc->event_res, softc->event_tag, + "fault"); + + err = PCIB_MAP_MSI(pcib, dev, softc->event_irq, &msi_addr, + &msi_data); + if (err) { + device_printf(dev, + "Event interrupt config failed, err=%d.\n", + err); + amdvi_free_evt_intr_res(softc->dev); + return (err); + } + + /* Clear interrupt status bits. */ + ctrl = softc->ctrl; + ctrl->status &= AMDVI_STATUS_EV_OF | AMDVI_STATUS_EV_INTR; + + /* Now enable MSI interrupt. 
*/ + pci_enable_msi(mmio_dev, msi_addr, msi_data); + return (0); +} + + +static void +amdvi_print_dev_cap(struct amdvi_softc *softc) +{ + struct ivhd_dev_cfg *cfg; + int i; + + cfg = softc->dev_cfg; + for (i = 0; i < softc->dev_cfg_cnt; i++) { + device_printf(softc->dev, "device [0x%x - 0x%x]" + "config:%b%s\n", cfg->start_id, cfg->end_id, + cfg->data, + "\020\001INIT\002ExtInt\003NMI" + "\007LINT0\008LINT1", + cfg->enable_ats ? "ATS enabled" : ""); + cfg++; + } +} + +static int +amdvi_handle_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct amdvi_softc *softc; + int result, type, error = 0; + + softc = (struct amdvi_softc *)arg1; + type = arg2; + + switch (type) { + case 0: + result = softc->ctrl->cmd_head; + error = sysctl_handle_int(oidp, &result, 0, + req); + break; + case 1: + result = softc->ctrl->cmd_tail; + error = sysctl_handle_int(oidp, &result, 0, + req); + break; + case 2: + result = softc->ctrl->evt_head; + error = sysctl_handle_int(oidp, &result, 0, + req); + break; + case 3: + result = softc->ctrl->evt_tail; + error = sysctl_handle_int(oidp, &result, 0, + req); + break; + + default: + device_printf(softc->dev, "Unknown sysctl:%d\n", type); + } + + return (error); +} + +static void +amdvi_add_sysctl(struct amdvi_softc *softc) +{ + struct sysctl_oid_list *child; + struct sysctl_ctx_list *ctx; + device_t dev; + + dev = softc->dev; + ctx = device_get_sysctl_ctx(dev); + child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); + + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "event_intr_count", CTLFLAG_RD, + &softc->event_intr_cnt, "Event interrupt count"); + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "command_count", CTLFLAG_RD, + &softc->total_cmd, "Command submitted count"); + SYSCTL_ADD_U16(ctx, child, OID_AUTO, "pci_rid", CTLFLAG_RD, + &softc->pci_rid, 0, "IOMMU RID"); + SYSCTL_ADD_U16(ctx, child, OID_AUTO, "start_dev_rid", CTLFLAG_RD, + &softc->start_dev_rid, 0, "Start of device under this IOMMU"); + SYSCTL_ADD_U16(ctx, child, OID_AUTO, "end_dev_rid", CTLFLAG_RD, + &softc->end_dev_rid, 0, "End of device under this IOMMU"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "command_head", + CTLTYPE_UINT | CTLFLAG_RD, softc, 0, + amdvi_handle_sysctl, "IU", "Command head"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "command_tail", + CTLTYPE_UINT | CTLFLAG_RD, softc, 1, + amdvi_handle_sysctl, "IU", "Command tail"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "event_head", + CTLTYPE_UINT | CTLFLAG_RD, softc, 2, + amdvi_handle_sysctl, "IU", "Command head"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "event_tail", + CTLTYPE_UINT | CTLFLAG_RD, softc, 3, + amdvi_handle_sysctl, "IU", "Command tail"); +} + +int +amdvi_setup_hw(struct amdvi_softc *softc) +{ + device_t dev; + int status; + + dev = softc->dev; + + amdvi_hw_enable_iotlb(softc); + + amdvi_print_dev_cap(softc); + + if ((status = amdvi_print_pci_cap(dev)) != 0) { + device_printf(dev, "PCI capability.\n"); + return (status); + } + if ((status = amdvi_init_cmd(softc)) != 0) { + device_printf(dev, "Couldn't configure command buffer.\n"); + return (status); + } + if ((status = amdvi_init_event(softc)) != 0) { + device_printf(dev, "Couldn't configure event buffer.\n"); + return (status); + } + if ((status = amdvi_init_dte(softc)) != 0) { + device_printf(dev, "Couldn't configure device table.\n"); + return (status); + } + if ((status = amdvi_alloc_intr_resources(softc)) != 0) { + return (status); + } + amdvi_add_sysctl(softc); + return (0); +} + +int +amdvi_teardown_hw(struct amdvi_softc *softc) +{ + device_t dev; + + dev = softc->dev; + + /* + * Called after disable, h/w 
is stopped by now, free all the resources. + */ + amdvi_free_evt_intr_res(dev); + + if (softc->cmd) + free(softc->cmd, M_AMDVI); + + if (softc->event) + free(softc->event, M_AMDVI); + + return (0); +} + +/*********** bhyve interfaces *********************/ +static int +amdvi_init(void) +{ + if (!ivhd_count) { + return (EIO); + } + if (!amdvi_enable_user && ivhd_count) { + printf("bhyve: Found %d AMD-Vi/IOMMU device(s), " + "use hw.vmm.amdvi.enable=1 to enable pass-through.\n", + ivhd_count); + return (EINVAL); + } + return (0); +} + +static void +amdvi_cleanup(void) +{ + /* Nothing. */ +} + +static uint16_t +amdvi_domainId(void) +{ + + /* + * If we hit maximum domain limit, rollover leaving host + * domain(0). + * XXX: make sure that this domain is not used. + */ + if (amdvi_dom_id == AMDVI_MAX_DOMAIN) + amdvi_dom_id = 1; + + return ((uint16_t)amdvi_dom_id++); +} + +static void +amdvi_do_inv_domain(uint16_t domain_id, bool create) +{ + struct amdvi_softc *softc; + int i; + + for (i = 0; i < ivhd_count; i++) { + softc = device_get_softc(ivhd_devs[i]); + KASSERT(softc, ("softc is NULL")); + /* + * If not present pages are cached, invalidate page after + * creating domain. + */ +#if 0 + if (create && ((softc->pci_cap & AMDVI_PCI_CAP_NPCACHE) == 0)) + continue; +#endif + amdvi_inv_domain(softc, domain_id); + amdvi_wait(softc); + } +} + +static void * +amdvi_create_domain(vm_paddr_t maxaddr) +{ + struct amdvi_domain *dom; + + dom = malloc(sizeof(struct amdvi_domain), M_AMDVI, M_ZERO | M_WAITOK); + dom->id = amdvi_domainId(); + //dom->maxaddr = maxaddr; +#ifdef AMDVI_DEBUG_CMD + printf("Created domain #%d\n", dom->id); +#endif + /* + * Host domain(#0) don't create translation table. + */ + if (dom->id || amdvi_host_ptp) + dom->ptp = malloc(PAGE_SIZE, M_AMDVI, M_WAITOK | M_ZERO); + + dom->ptp_level = amdvi_ptp_level; + + amdvi_do_inv_domain(dom->id, true); + SLIST_INSERT_HEAD(&dom_head, dom, next); + + return (dom); +} + +static void +amdvi_free_ptp(uint64_t *ptp, int level) +{ + int i; + + if (level < 1) + return; + + for (i = 0; i < NPTEPG ; i++) { + if ((ptp[i] & AMDVI_PT_PRESENT) == 0) + continue; + /* XXX: Add super-page or PTE mapping > 4KB. */ +#ifdef notyet + /* Super-page mapping. 
*/ + if (AMDVI_PD_SUPER(ptp[i])) + continue; +#endif + + amdvi_free_ptp((uint64_t *)PHYS_TO_DMAP(ptp[i] + & AMDVI_PT_MASK), level - 1); + + } + + free(ptp, M_AMDVI); +} + +static void +amdvi_destroy_domain(void *arg) +{ + struct amdvi_domain *domain; + + domain = (struct amdvi_domain *)arg; + KASSERT(domain, ("domain is NULL")); +#ifdef AMDVI_DEBUG_CMD + printf("Destroying domain %d\n", domain->id); +#endif + if (domain->ptp) + amdvi_free_ptp(domain->ptp, domain->ptp_level); + + amdvi_do_inv_domain(domain->id, false); + SLIST_REMOVE(&dom_head, domain, amdvi_domain, next); + free(domain, M_AMDVI); +} + +static uint64_t +amdvi_set_pt(uint64_t *pt, int level, vm_paddr_t gpa, + vm_paddr_t hpa, uint64_t pg_size, bool create) +{ + uint64_t *page, pa; + int shift, index; + const int PT_SHIFT = 9; + const int PT_INDEX_MASK = (1 << PT_SHIFT) - 1; /* Based on PT_SHIFT */ + + if (!pg_size) + return (0); + + if (hpa & (pg_size - 1)) { + printf("HPA is not size aligned.\n"); + return (0); + } + if (gpa & (pg_size - 1)) { + printf("HPA is not size aligned.\n"); + return (0); + } + shift = PML4SHIFT; + while ((shift > PAGE_SHIFT) && (pg_size < (1UL << shift))) { + index = (gpa >> shift) & PT_INDEX_MASK; + + if ((pt[index] == 0) && create) { + page = malloc(PAGE_SIZE, M_AMDVI, M_WAITOK | M_ZERO); + pa = vtophys(page); + pt[index] = pa | AMDVI_PT_PRESENT | AMDVI_PT_RW | + ((level - 1) << AMDVI_PD_LEVEL_SHIFT); + } +#ifdef AMDVI_DEBUG_PTE + if ((gpa % 0x1000000) == 0) + printf("[level%d, shift = %d]PTE:0x%lx\n", + level, shift, pt[index]); +#endif +#define PTE2PA(x) ((uint64_t)(x) & AMDVI_PT_MASK) + pa = PTE2PA(pt[index]); + pt = (uint64_t *)PHYS_TO_DMAP(pa); + shift -= PT_SHIFT; + level--; + } + + /* Leaf entry. */ + index = (gpa >> shift) & PT_INDEX_MASK; + + if (create) { + pt[index] = hpa | AMDVI_PT_RW | AMDVI_PT_PRESENT; + } else + pt[index] = 0; + +#ifdef AMDVI_DEBUG_PTE + if ((gpa % 0x1000000) == 0) + printf("[Last level%d, shift = %d]PTE:0x%lx\n", + level, shift, pt[index]); +#endif + return (1ULL << shift); +} + +static uint64_t +amdvi_update_mapping(struct amdvi_domain *domain, vm_paddr_t gpa, + vm_paddr_t hpa, uint64_t size, bool create) +{ + uint64_t mapped, *ptp, len; + int level; + + KASSERT(domain, ("domain is NULL")); + level = domain->ptp_level; + KASSERT(level, ("Page table level is 0")); + + ptp = domain->ptp; + KASSERT(ptp, ("PTP is NULL")); + mapped = 0; + while (mapped < size) { + len = amdvi_set_pt(ptp, level, gpa + mapped, hpa + mapped, + PAGE_SIZE, create); + if (!len) { + printf("Error: Couldn't map HPA:0x%lx GPA:0x%lx\n", + hpa, gpa); + return (0); + } + mapped += len; + } + + return (mapped); +} + +static uint64_t +amdvi_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, + uint64_t len) +{ + struct amdvi_domain *domain; + + domain = (struct amdvi_domain *)arg; + + if (domain->id && !domain->ptp) { + printf("ptp is NULL"); + return (-1); + } + + /* + * If host domain is created w/o page table, skip IOMMU page + * table set-up. + */ + if (domain->ptp) + return (amdvi_update_mapping(domain, gpa, hpa, len, true)); + else + return (len); +} + +static uint64_t +amdvi_destroy_mapping(void *arg, vm_paddr_t gpa, uint64_t len) +{ + struct amdvi_domain *domain; + + domain = (struct amdvi_domain *)arg; + /* + * If host domain is created w/o page table, skip IOMMU page + * table set-up. 
+	 */
+	if (domain->ptp)
+		return (amdvi_update_mapping(domain, gpa, 0, len, false));
+	return (len);
+}
+
+static struct amdvi_softc *
+amdvi_find_iommu(uint16_t devid)
+{
+	struct amdvi_softc *softc;
+	int i;
+
+	for (i = 0; i < ivhd_count; i++) {
+		softc = device_get_softc(ivhd_devs[i]);
+		if ((devid >= softc->start_dev_rid) &&
+		    (devid <= softc->end_dev_rid))
+			return (softc);
+	}
+
+	/*
+	 * XXX: BIOS bug: the device is not in the IVRS table; assume it
+	 * is behind the first IOMMU.
+	 */
+	printf("BIOS bug: device(%d.%d.%d) doesn't have an IVHD entry.\n",
+	    RID2PCI_STR(devid));
+
+	return (device_get_softc(ivhd_devs[0]));
+}
+
+/*
+ * Set up a device table entry.
+ * Per IOMMU spec Rev 2.0, section 3.2.2.2, some of the fields must
+ * be set concurrently, e.g. the read and write allow bits.
+ */
+static void
+amdvi_set_dte(struct amdvi_domain *domain, uint16_t devid, bool enable)
+{
+	struct amdvi_softc *softc;
+	struct amdvi_dte *temp;
+
+	KASSERT(domain, ("domain is NULL for pci_rid:0x%x\n", devid));
+
+	softc = amdvi_find_iommu(devid);
+	KASSERT(softc, ("softc is NULL for pci_rid:0x%x\n", devid));
+
+	temp = &amdvi_dte[devid];
+
+#ifdef AMDVI_ATS_ENABLE
+	/* If the IOMMU and the device both support IOTLB, enable it. */
+	if (amdvi_dev_support_iotlb(softc, devid) && softc->iotlb)
+		temp->iotlb_enable = 1;
+#endif
+
+	/* Avoid duplicate I/O faults. */
+	temp->sup_second_io_fault = 1;
+	temp->sup_all_io_fault = amdvi_disable_io_fault;
+
+	temp->dt_valid = 1;
+	temp->domain_id = domain->id;
+
+	if (enable) {
+		if (domain->ptp) {
+			temp->pt_base = vtophys(domain->ptp) >> 12;
+			temp->pt_level = amdvi_ptp_level;
+		}
+		/*
+		 * XXX: The page table valid [TV] bit must be set even if
+		 * host domain page tables are not enabled.
+		 */
+		temp->pt_valid = 1;
+		temp->read_allow = 1;
+		temp->write_allow = 1;
+	}
+}
+
+static void
+amdvi_inv_device(uint16_t devid)
+{
+	struct amdvi_softc *softc;
+
+	softc = amdvi_find_iommu(devid);
+	KASSERT(softc, ("softc is NULL"));
+
+	amdvi_cmd_inv_dte(softc, devid);
+#ifdef AMDVI_ATS_ENABLE
+	if (amdvi_dev_support_iotlb(softc, devid))
+		amdvi_cmd_inv_iotlb(softc, devid);
+#endif
+	amdvi_wait(softc);
+}
+
+static void
+amdvi_add_device(void *arg, uint16_t devid)
+{
+	struct amdvi_domain *domain;
+
+	domain = (struct amdvi_domain *)arg;
+	KASSERT(domain != NULL, ("domain is NULL"));
+#ifdef AMDVI_DEBUG_CMD
+	printf("Assigning device(%d.%d.%d) to domain:%d\n",
+	    RID2PCI_STR(devid), domain->id);
+#endif
+	amdvi_set_dte(domain, devid, true);
+	amdvi_inv_device(devid);
+}
+
+static void
+amdvi_remove_device(void *arg, uint16_t devid)
+{
+	struct amdvi_domain *domain;
+
+	domain = (struct amdvi_domain *)arg;
+#ifdef AMDVI_DEBUG_CMD
+	printf("Remove device(0x%x) from domain:%d\n",
+	    devid, domain->id);
+#endif
+	amdvi_set_dte(domain, devid, false);
+	amdvi_inv_device(devid);
+}
+
+static void
+amdvi_enable(void)
+{
+	struct amdvi_ctrl *ctrl;
+	struct amdvi_softc *softc;
+	uint64_t val;
+	int i;
+
+	for (i = 0; i < ivhd_count; i++) {
+		softc = device_get_softc(ivhd_devs[i]);
+		KASSERT(softc, ("softc is NULL\n"));
+		ctrl = softc->ctrl;
+		KASSERT(ctrl, ("ctrl is NULL\n"));
+
+		val = (AMDVI_CTRL_EN |
+		    AMDVI_CTRL_CMD |
+		    AMDVI_CTRL_ELOG |
+		    AMDVI_CTRL_ELOGINT |
+		    AMDVI_CTRL_INV_TO_1S);
+
+		if (softc->ivhd_flag & IVHD_FLAG_COH)
+			val |= AMDVI_CTRL_COH;
+		if (softc->ivhd_flag & IVHD_FLAG_HTT)
+			val |= AMDVI_CTRL_HTT;
+		if (softc->ivhd_flag & IVHD_FLAG_RPPW)
+			val |= AMDVI_CTRL_RPPW;
+		if (softc->ivhd_flag & IVHD_FLAG_PPW)
+			val |= AMDVI_CTRL_PPW;
+		if (softc->ivhd_flag & IVHD_FLAG_ISOC)
+			val |= AMDVI_CTRL_ISOC;
+
+		ctrl->control =
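+		/*
+		 * Committing this write flips the IOMMU on: once
+		 * AMDVI_CTRL_EN is set, the device table, command buffer
+		 * and event log programmed during attach take effect for
+		 * every device behind this IVHD.
+		 */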
val; + } +} + +static void +amdvi_disable(void) +{ + struct amdvi_ctrl *ctrl; + struct amdvi_softc *softc; + int i; + + for (i = 0; i < ivhd_count; i++) { + softc = device_get_softc(ivhd_devs[i]); + KASSERT(softc, ("softc is NULL\n")); + ctrl = softc->ctrl; + KASSERT(ctrl, ("ctrl is NULL\n")); + + ctrl->control = 0; + } +} + +static void +amdvi_inv_tlb(void *arg) +{ + struct amdvi_domain *domain; + + domain = (struct amdvi_domain *)arg; + KASSERT(domain, ("domain is NULL")); + amdvi_do_inv_domain(domain->id, false); +} + +struct iommu_ops iommu_ops_amd = { + amdvi_init, + amdvi_cleanup, + amdvi_enable, + amdvi_disable, + amdvi_create_domain, + amdvi_destroy_domain, + amdvi_create_mapping, + amdvi_destroy_mapping, + amdvi_add_device, + amdvi_remove_device, + amdvi_inv_tlb +}; diff --git a/usr/src/uts/i86pc/io/vmm/amd/amdvi_priv.h b/usr/src/uts/i86pc/io/vmm/amd/amdvi_priv.h new file mode 100644 index 0000000000..6ee6c36632 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/amdvi_priv.h @@ -0,0 +1,431 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 Anish Gupta (anish@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _AMDVI_PRIV_H_ +#define _AMDVI_PRIV_H_ + +#include + +#define BIT(n) (1ULL << (n)) +/* Return value of bits[n:m] where n and (n >= ) m are bit positions. */ +#define REG_BITS(x, n, m) (((x) >> (m)) & \ + ((1 << (((n) - (m)) + 1)) - 1)) + +/* + * IOMMU PCI capability. + */ +#define AMDVI_PCI_CAP_IOTLB BIT(0) /* IOTLB is supported. */ +#define AMDVI_PCI_CAP_HT BIT(1) /* HyperTransport tunnel support. */ +#define AMDVI_PCI_CAP_NPCACHE BIT(2) /* Not present page cached. */ +#define AMDVI_PCI_CAP_EFR BIT(3) /* Extended features. */ +#define AMDVI_PCI_CAP_EXT BIT(4) /* Miscellaneous information reg. */ + +/* + * IOMMU extended features. + */ +#define AMDVI_EX_FEA_PREFSUP BIT(0) /* Prefetch command support. */ +#define AMDVI_EX_FEA_PPRSUP BIT(1) /* PPR support */ +#define AMDVI_EX_FEA_XTSUP BIT(2) /* Reserved */ +#define AMDVI_EX_FEA_NXSUP BIT(3) /* No-execute. */ +#define AMDVI_EX_FEA_GTSUP BIT(4) /* Guest translation support. */ +#define AMDVI_EX_FEA_EFRW BIT(5) /* Reserved */ +#define AMDVI_EX_FEA_IASUP BIT(6) /* Invalidate all command supp. 
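+ * (These bits are tested against the 64-bit EFR value cached in the
+ * softc; e.g. a hypothetical fast path for the INVALIDATE_ALL command
+ * could key off IASUP:
+ *
+ *	if (softc->ext_feature & AMDVI_EX_FEA_IASUP)
+ *		issue_inv_all(softc);
+ *
+ * issue_inv_all() is illustrative only, not a function in this change.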
*/ +#define AMDVI_EX_FEA_GASUP BIT(7) /* Guest APIC or AVIC support. */ +#define AMDVI_EX_FEA_HESUP BIT(8) /* Hardware Error. */ +#define AMDVI_EX_FEA_PCSUP BIT(9) /* Performance counters support. */ +/* XXX: add more EFER bits. */ + +/* + * Device table entry or DTE + * NOTE: Must be 256-bits/32 bytes aligned. + */ +struct amdvi_dte { + uint32_t dt_valid:1; /* Device Table valid. */ + uint32_t pt_valid:1; /* Page translation valid. */ + uint16_t :7; /* Reserved[8:2] */ + uint8_t pt_level:3; /* Paging level, 0 to disable. */ + uint64_t pt_base:40; /* Page table root pointer. */ + uint8_t :3; /* Reserved[54:52] */ + uint8_t gv_valid:1; /* Revision 2, GVA to SPA. */ + uint8_t gv_level:2; /* Revision 2, GLX level. */ + uint8_t gv_cr3_lsb:3; /* Revision 2, GCR3[14:12] */ + uint8_t read_allow:1; /* I/O read enabled. */ + uint8_t write_allow:1; /* I/O write enabled. */ + uint8_t :1; /* Reserved[63] */ + uint16_t domain_id:16; /* Domain ID */ + uint16_t gv_cr3_lsb2:16; /* Revision 2, GCR3[30:15] */ + uint8_t iotlb_enable:1; /* Device support IOTLB */ + uint8_t sup_second_io_fault:1; /* Suppress subsequent I/O faults. */ + uint8_t sup_all_io_fault:1; /* Suppress all I/O page faults. */ + uint8_t IOctl:2; /* Port I/O control. */ + uint8_t iotlb_cache_disable:1; /* IOTLB cache hints. */ + uint8_t snoop_disable:1; /* Snoop disable. */ + uint8_t allow_ex:1; /* Allow exclusion. */ + uint8_t sysmgmt:2; /* System management message.*/ + uint8_t :1; /* Reserved[106] */ + uint32_t gv_cr3_msb:21; /* Revision 2, GCR3[51:31] */ + uint8_t intmap_valid:1; /* Interrupt map valid. */ + uint8_t intmap_len:4; /* Interrupt map table length. */ + uint8_t intmap_ign:1; /* Ignore unmapped interrupts. */ + uint64_t intmap_base:46; /* IntMap base. */ + uint8_t :4; /* Reserved[183:180] */ + uint8_t init_pass:1; /* INIT pass through or PT */ + uint8_t extintr_pass:1; /* External Interrupt PT */ + uint8_t nmi_pass:1; /* NMI PT */ + uint8_t :1; /* Reserved[187] */ + uint8_t intr_ctrl:2; /* Interrupt control */ + uint8_t lint0_pass:1; /* LINT0 PT */ + uint8_t lint1_pass:1; /* LINT1 PT */ + uint64_t :64; /* Reserved[255:192] */ +} __attribute__((__packed__)); +CTASSERT(sizeof(struct amdvi_dte) == 32); + +/* + * IOMMU command entry. + */ +struct amdvi_cmd { + uint32_t word0; + uint32_t word1:28; + uint8_t opcode:4; + uint64_t addr; +} __attribute__((__packed__)); + +/* Command opcodes. */ +#define AMDVI_CMP_WAIT_OPCODE 0x1 /* Completion wait. */ +#define AMDVI_INVD_DTE_OPCODE 0x2 /* Invalidate device table entry. */ +#define AMDVI_INVD_PAGE_OPCODE 0x3 /* Invalidate pages. */ +#define AMDVI_INVD_IOTLB_OPCODE 0x4 /* Invalidate IOTLB pages. */ +#define AMDVI_INVD_INTR_OPCODE 0x5 /* Invalidate Interrupt table. */ +#define AMDVI_PREFETCH_PAGES_OPCODE 0x6 /* Prefetch IOMMU pages. */ +#define AMDVI_COMP_PPR_OPCODE 0x7 /* Complete PPR request. */ +#define AMDVI_INV_ALL_OPCODE 0x8 /* Invalidate all. */ + +/* Completion wait attributes. */ +#define AMDVI_CMP_WAIT_STORE BIT(0) /* Write back data. */ +#define AMDVI_CMP_WAIT_INTR BIT(1) /* Completion wait interrupt. */ +#define AMDVI_CMP_WAIT_FLUSH BIT(2) /* Flush queue. */ + +/* Invalidate page. */ +#define AMDVI_INVD_PAGE_S BIT(0) /* Invalidation size. */ +#define AMDVI_INVD_PAGE_PDE BIT(1) /* Invalidate PDE. */ +#define AMDVI_INVD_PAGE_GN_GVA BIT(2) /* GPA or GVA. */ + +#define AMDVI_INVD_PAGE_ALL_ADDR (0x7FFFFFFFFFFFFULL << 12) + +/* Invalidate IOTLB. */ +#define AMDVI_INVD_IOTLB_S BIT(0) /* Invalidation size 4k or addr */ +#define AMDVI_INVD_IOTLB_GN_GVA BIT(2) /* GPA or GVA. 
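+ * For reference, a sketch of how struct amdvi_cmd (above) would be
+ * filled for an INVALIDATE_IOMMU_PAGES request covering an entire
+ * domain; illustrative only, with field use as described in the AMD
+ * IOMMU spec and domain_id a stand-in variable:
+ *
+ *	struct amdvi_cmd cmd = { 0 };
+ *
+ *	cmd.opcode = AMDVI_INVD_PAGE_OPCODE;
+ *	cmd.word1 = domain_id;
+ *	cmd.addr = AMDVI_INVD_PAGE_ALL_ADDR | AMDVI_INVD_PAGE_S |
+ *	    AMDVI_INVD_PAGE_PDE;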
*/ + +#define AMDVI_INVD_IOTLB_ALL_ADDR (0x7FFFFFFFFFFFFULL << 12) +/* XXX: add more command entries. */ + +/* + * IOMMU event entry. + */ +struct amdvi_event { + uint16_t devid; + uint16_t pasid_hi; + uint16_t pasid_domid; /* PASID low or DomainID */ + uint16_t flag:12; + uint8_t opcode:4; + uint64_t addr; +} __attribute__((__packed__)); +CTASSERT(sizeof(struct amdvi_event) == 16); + +/* Various event types. */ +#define AMDVI_EVENT_INVALID_DTE 0x1 +#define AMDVI_EVENT_PFAULT 0x2 +#define AMDVI_EVENT_DTE_HW_ERROR 0x3 +#define AMDVI_EVENT_PAGE_HW_ERROR 0x4 +#define AMDVI_EVENT_ILLEGAL_CMD 0x5 +#define AMDVI_EVENT_CMD_HW_ERROR 0x6 +#define AMDVI_EVENT_IOTLB_TIMEOUT 0x7 +#define AMDVI_EVENT_INVALID_DTE_REQ 0x8 +#define AMDVI_EVENT_INVALID_PPR_REQ 0x9 +#define AMDVI_EVENT_COUNTER_ZERO 0xA + +#define AMDVI_EVENT_FLAG_MASK 0x1FF /* Mask for event flags. */ +#define AMDVI_EVENT_FLAG_TYPE(x) (((x) >> 9) & 0x3) + +/* + * IOMMU control block. + */ +struct amdvi_ctrl { + struct { + uint16_t size:9; + uint16_t :3; + uint64_t base:40; /* Devtable register base. */ + uint16_t :12; + } dte; + struct { + uint16_t :12; + uint64_t base:40; + uint8_t :4; + uint8_t len:4; + uint8_t :4; + } cmd; + struct { + uint16_t :12; + uint64_t base:40; + uint8_t :4; + uint8_t len:4; + uint8_t :4; + } event; + uint16_t control :13; + uint64_t :51; + struct { + uint8_t enable:1; + uint8_t allow:1; + uint16_t :10; + uint64_t base:40; + uint16_t :12; + uint16_t :12; + uint64_t limit:40; + uint16_t :12; + } excl; + /* + * Revision 2 only. + */ + uint64_t ex_feature; + struct { + uint16_t :12; + uint64_t base:40; + uint8_t :4; + uint8_t len:4; + uint8_t :4; + } ppr; + uint64_t first_event; + uint64_t second_event; + uint64_t event_status; + /* Revision 2 only, end. */ + uint8_t pad1[0x1FA8]; /* Padding. */ + uint32_t cmd_head:19; + uint64_t :45; + uint32_t cmd_tail:19; + uint64_t :45; + uint32_t evt_head:19; + uint64_t :45; + uint32_t evt_tail:19; + uint64_t :45; + uint32_t status:19; + uint64_t :45; + uint64_t pad2; + uint8_t :4; + uint16_t ppr_head:15; + uint64_t :45; + uint8_t :4; + uint16_t ppr_tail:15; + uint64_t :45; + uint8_t pad3[0x1FC0]; /* Padding. */ + + /* XXX: More for rev2. */ +} __attribute__((__packed__)); +CTASSERT(offsetof(struct amdvi_ctrl, pad1)== 0x58); +CTASSERT(offsetof(struct amdvi_ctrl, pad2)== 0x2028); +CTASSERT(offsetof(struct amdvi_ctrl, pad3)== 0x2040); + +#define AMDVI_MMIO_V1_SIZE (4 * PAGE_SIZE) /* v1 size */ +/* + * AMF IOMMU v2 size including event counters + */ +#define AMDVI_MMIO_V2_SIZE (8 * PAGE_SIZE) + +CTASSERT(sizeof(struct amdvi_ctrl) == 0x4000); +CTASSERT(sizeof(struct amdvi_ctrl) == AMDVI_MMIO_V1_SIZE); + +/* IVHD flag */ +#define IVHD_FLAG_HTT BIT(0) /* Hypertransport Tunnel. */ +#define IVHD_FLAG_PPW BIT(1) /* Pass posted write. */ +#define IVHD_FLAG_RPPW BIT(2) /* Response pass posted write. */ +#define IVHD_FLAG_ISOC BIT(3) /* Isoc support. */ +#define IVHD_FLAG_IOTLB BIT(4) /* IOTLB support. */ +#define IVHD_FLAG_COH BIT(5) /* Coherent control, default 1 */ +#define IVHD_FLAG_PFS BIT(6) /* Prefetch IOMMU pages. */ +#define IVHD_FLAG_PPRS BIT(7) /* Peripheral page support. */ + +/* IVHD device entry data setting. */ +#define IVHD_DEV_LINT0_PASS BIT(6) /* LINT0 interrupts. */ +#define IVHD_DEV_LINT1_PASS BIT(7) /* LINT1 interrupts. */ + +/* Bit[5:4] for System Mgmt. Bit3 is reserved. */ +#define IVHD_DEV_INIT_PASS BIT(0) /* INIT */ +#define IVHD_DEV_EXTINTR_PASS BIT(1) /* ExtInt */ +#define IVHD_DEV_NMI_PASS BIT(2) /* NMI */ + +/* IVHD 8-byte extended data settings. 
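+ * (Consumed by ivhd_dev_parse(): for the 8-byte extended select/start
+ * entries it reads the trailing 32-bit word and computes
+ *
+ *	enable_ats = (*extended & IVHD_DEV_EXT_ATS_DISABLE) == 0;
+ *
+ * i.e. ATS stays enabled unless the BIOS set the disable bit.)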
*/ +#define IVHD_DEV_EXT_ATS_DISABLE BIT(31) /* Disable ATS */ + +/* IOMMU control register. */ +#define AMDVI_CTRL_EN BIT(0) /* IOMMU enable. */ +#define AMDVI_CTRL_HTT BIT(1) /* Hypertransport tunnel enable. */ +#define AMDVI_CTRL_ELOG BIT(2) /* Event log enable. */ +#define AMDVI_CTRL_ELOGINT BIT(3) /* Event log interrupt. */ +#define AMDVI_CTRL_COMINT BIT(4) /* Completion wait interrupt. */ +#define AMDVI_CTRL_PPW BIT(8) +#define AMDVI_CTRL_RPPW BIT(9) +#define AMDVI_CTRL_COH BIT(10) +#define AMDVI_CTRL_ISOC BIT(11) +#define AMDVI_CTRL_CMD BIT(12) /* Command buffer enable. */ +#define AMDVI_CTRL_PPRLOG BIT(13) +#define AMDVI_CTRL_PPRINT BIT(14) +#define AMDVI_CTRL_PPREN BIT(15) +#define AMDVI_CTRL_GTE BIT(16) /* Guest translation enable. */ +#define AMDVI_CTRL_GAE BIT(17) /* Guest APIC enable. */ + +/* Invalidation timeout. */ +#define AMDVI_CTRL_INV_NO_TO 0 /* No timeout. */ +#define AMDVI_CTRL_INV_TO_1ms 1 /* 1 ms */ +#define AMDVI_CTRL_INV_TO_10ms 2 /* 10 ms */ +#define AMDVI_CTRL_INV_TO_100ms 3 /* 100 ms */ +#define AMDVI_CTRL_INV_TO_1S 4 /* 1 second */ +#define AMDVI_CTRL_INV_TO_10S 5 /* 10 second */ +#define AMDVI_CTRL_INV_TO_100S 6 /* 100 second */ + +/* + * Max number of PCI devices. + * 256 bus x 32 slot/devices x 8 functions. + */ +#define PCI_NUM_DEV_MAX 0x10000 + +/* Maximum number of domains supported by IOMMU. */ +#define AMDVI_MAX_DOMAIN (BIT(16) - 1) + +/* + * IOMMU Page Table attributes. + */ +#define AMDVI_PT_PRESENT BIT(0) +#define AMDVI_PT_COHERENT BIT(60) +#define AMDVI_PT_READ BIT(61) +#define AMDVI_PT_WRITE BIT(62) + +#define AMDVI_PT_RW (AMDVI_PT_READ | AMDVI_PT_WRITE) +#define AMDVI_PT_MASK 0xFFFFFFFFFF000UL /* Only [51:12] for PA */ + +#define AMDVI_PD_LEVEL_SHIFT 9 +#define AMDVI_PD_SUPER(x) (((x) >> AMDVI_PD_LEVEL_SHIFT) == 7) +/* + * IOMMU Status, offset 0x2020 + */ +#define AMDVI_STATUS_EV_OF BIT(0) /* Event overflow. */ +#define AMDVI_STATUS_EV_INTR BIT(1) /* Event interrupt. */ +/* Completion wait command completed. */ +#define AMDVI_STATUS_CMP BIT(2) + +#define IVRS_CTRL_RID 1 /* MMIO RID */ + +/* ACPI IVHD */ +struct ivhd_dev_cfg { + uint32_t start_id; + uint32_t end_id; + uint8_t data; /* Device configuration. */ + bool enable_ats; /* ATS enabled for the device. */ + int ats_qlen; /* ATS invalidation queue depth. */ +}; + +struct amdvi_domain { + uint64_t *ptp; /* Highest level page table */ + int ptp_level; /* Level of page tables */ + u_int id; /* Domain id */ + SLIST_ENTRY (amdvi_domain) next; +}; + +/* + * I/O Virtualization Hardware Definition Block (IVHD) type 0x10 (legacy) + * uses ACPI_IVRS_HARDWARE define in contrib/dev/acpica/include/actbl2.h + * New IVHD types 0x11 and 0x40 as defined in AMD IOMMU spec[48882] are missing in + * ACPI code. These new types add extra field EFR(Extended Feature Register). + * XXX : Use definition from ACPI when it is available. + */ +typedef struct acpi_ivrs_hardware_efr_sup +{ + ACPI_IVRS_HEADER Header; + UINT16 CapabilityOffset; /* Offset for IOMMU control fields */ + UINT64 BaseAddress; /* IOMMU control registers */ + UINT16 PciSegmentGroup; + UINT16 Info; /* MSI number and unit ID */ + UINT32 Attr; /* IOMMU Feature */ + UINT64 ExtFR; /* IOMMU Extended Feature */ + UINT64 Reserved; /* v1 feature or v2 attribute */ +} __attribute__ ((__packed__)) ACPI_IVRS_HARDWARE_EFRSUP; +CTASSERT(sizeof(ACPI_IVRS_HARDWARE_EFRSUP) == 40); + +/* + * Different type of IVHD. + * XXX: Use AcpiIvrsType once new IVHD types are available. +*/ +enum IvrsType +{ + IVRS_TYPE_HARDWARE_LEGACY = 0x10, /* Legacy without EFRi support. 
*/ + IVRS_TYPE_HARDWARE_EFR = 0x11, /* With EFR support. */ + IVRS_TYPE_HARDWARE_MIXED = 0x40, /* Mixed with EFR support. */ +}; + +/* + * AMD IOMMU softc. + */ +struct amdvi_softc { + struct amdvi_ctrl *ctrl; /* Control area. */ + device_t dev; /* IOMMU device. */ + enum IvrsType ivhd_type; /* IOMMU IVHD type. */ + bool iotlb; /* IOTLB supported by IOMMU */ + struct amdvi_cmd *cmd; /* Command descriptor area. */ + int cmd_max; /* Max number of commands. */ + uint64_t cmp_data; /* Command completion write back. */ + struct amdvi_event *event; /* Event descriptor area. */ + struct resource *event_res; /* Event interrupt resource. */ + void *event_tag; /* Event interrupt tag. */ + int event_max; /* Max number of events. */ + int event_irq; + int event_rid; + /* ACPI various flags. */ + uint32_t ivhd_flag; /* ACPI IVHD flag. */ + uint32_t ivhd_feature; /* ACPI v1 Reserved or v2 attribute. */ + uint64_t ext_feature; /* IVHD EFR */ + /* PCI related. */ + uint16_t cap_off; /* PCI Capability offset. */ + uint8_t pci_cap; /* PCI capability. */ + uint16_t pci_seg; /* IOMMU PCI domain/segment. */ + uint16_t pci_rid; /* PCI BDF of IOMMU */ + /* Device range under this IOMMU. */ + uint16_t start_dev_rid; /* First device under this IOMMU. */ + uint16_t end_dev_rid; /* Last device under this IOMMU. */ + + /* BIOS provided device configuration for end points. */ + struct ivhd_dev_cfg dev_cfg[10]; + int dev_cfg_cnt; + + /* Software statistics. */ + uint64_t event_intr_cnt; /* Total event INTR count. */ + uint64_t total_cmd; /* Total number of commands. */ +}; + +int amdvi_setup_hw(struct amdvi_softc *softc); +int amdvi_teardown_hw(struct amdvi_softc *softc); +#endif /* _AMDVI_PRIV_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/amd/ivrs_drv.c b/usr/src/uts/i86pc/io/vmm/amd/ivrs_drv.c new file mode 100644 index 0000000000..370c20fb01 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/ivrs_drv.c @@ -0,0 +1,735 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016, Anish Gupta (anish@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_acpi.h" +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include +#include + +#include "io/iommu.h" +#include "amdvi_priv.h" + +device_t *ivhd_devs; /* IVHD or AMD-Vi device list. */ +int ivhd_count; /* Number of IVHD header. */ +/* + * Cached IVHD header list. + * Single entry for each IVHD, filtered the legacy one. + */ +ACPI_IVRS_HARDWARE *ivhd_hdrs[10]; + +extern int amdvi_ptp_level; /* Page table levels. */ + +typedef int (*ivhd_iter_t)(ACPI_IVRS_HEADER *ptr, void *arg); +/* + * Iterate IVRS table for IVHD and IVMD device type. + */ +static void +ivrs_hdr_iterate_tbl(ivhd_iter_t iter, void *arg) +{ + ACPI_TABLE_IVRS *ivrs; + ACPI_IVRS_HEADER *ivrs_hdr, *end; + ACPI_STATUS status; + + status = AcpiGetTable(ACPI_SIG_IVRS, 1, (ACPI_TABLE_HEADER **)&ivrs); + if (ACPI_FAILURE(status)) + return; + + if (ivrs->Header.Length == 0) { + return; + } + + ivrs_hdr = (ACPI_IVRS_HEADER *)(ivrs + 1); + end = (ACPI_IVRS_HEADER *)((char *)ivrs + ivrs->Header.Length); + + while (ivrs_hdr < end) { + if ((uint8_t *)ivrs_hdr + ivrs_hdr->Length > (uint8_t *)end) { + printf("AMD-Vi:IVHD/IVMD is corrupted, length : %d\n", + ivrs_hdr->Length); + break; + } + + switch (ivrs_hdr->Type) { + case IVRS_TYPE_HARDWARE_LEGACY: /* Legacy */ + case IVRS_TYPE_HARDWARE_EFR: + case IVRS_TYPE_HARDWARE_MIXED: + if (!iter(ivrs_hdr, arg)) + return; + break; + + case ACPI_IVRS_TYPE_MEMORY1: + case ACPI_IVRS_TYPE_MEMORY2: + case ACPI_IVRS_TYPE_MEMORY3: + if (!iter(ivrs_hdr, arg)) + return; + + break; + + default: + printf("AMD-Vi:Not IVHD/IVMD type(%d)", ivrs_hdr->Type); + + } + + ivrs_hdr = (ACPI_IVRS_HEADER *)((uint8_t *)ivrs_hdr + + ivrs_hdr->Length); + } +} + +static bool +ivrs_is_ivhd(UINT8 type) +{ + + switch(type) { + case IVRS_TYPE_HARDWARE_LEGACY: + case IVRS_TYPE_HARDWARE_EFR: + case IVRS_TYPE_HARDWARE_MIXED: + return (true); + + default: + return (false); + } +} + +/* Count the number of AMD-Vi devices in the system. */ +static int +ivhd_count_iter(ACPI_IVRS_HEADER * ivrs_he, void *arg) +{ + + if (ivrs_is_ivhd(ivrs_he->Type)) + ivhd_count++; + + return (1); +} + +struct find_ivrs_hdr_args { + int i; + ACPI_IVRS_HEADER *ptr; +}; + +static int +ivrs_hdr_find_iter(ACPI_IVRS_HEADER * ivrs_hdr, void *args) +{ + struct find_ivrs_hdr_args *fi; + + fi = (struct find_ivrs_hdr_args *)args; + if (ivrs_is_ivhd(ivrs_hdr->Type)) { + if (fi->i == 0) { + fi->ptr = ivrs_hdr; + return (0); + } + fi->i--; + } + + return (1); +} + +static ACPI_IVRS_HARDWARE * +ivhd_find_by_index(int idx) +{ + struct find_ivrs_hdr_args fi; + + fi.i = idx; + fi.ptr = NULL; + + ivrs_hdr_iterate_tbl(ivrs_hdr_find_iter, &fi); + + return ((ACPI_IVRS_HARDWARE *)fi.ptr); +} + +static void +ivhd_dev_add_entry(struct amdvi_softc *softc, uint32_t start_id, + uint32_t end_id, uint8_t cfg, bool ats) +{ + struct ivhd_dev_cfg *dev_cfg; + + /* If device doesn't have special data, don't add it. */ + if (!cfg) + return; + + dev_cfg = &softc->dev_cfg[softc->dev_cfg_cnt++]; + dev_cfg->start_id = start_id; + dev_cfg->end_id = end_id; + dev_cfg->data = cfg; + dev_cfg->enable_ats = ats; +} + +/* + * Record device attributes as suggested by BIOS. 
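+ * The ivrs_hdr_iterate_tbl() callback pattern above drives every IVRS
+ * walk in this file; a hypothetical callback counting IVMD (memory)
+ * entries would look like this sketch:
+ *
+ *	static int
+ *	ivmd_count_iter(ACPI_IVRS_HEADER *hdr, void *arg)
+ *	{
+ *		if (!ivrs_is_ivhd(hdr->Type))
+ *			(*(int *)arg)++;
+ *		return (1);
+ *	}
+ *
+ * (Returning 0 stops the walk early, as ivrs_hdr_find_iter() does.)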
+ */ +static int +ivhd_dev_parse(ACPI_IVRS_HARDWARE* ivhd, struct amdvi_softc *softc) +{ + ACPI_IVRS_DE_HEADER *de; + uint8_t *p, *end; + int range_start_id = 0, range_end_id = 0; + uint32_t *extended; + uint8_t all_data = 0, range_data = 0; + bool range_enable_ats = false, enable_ats; + + softc->start_dev_rid = ~0; + softc->end_dev_rid = 0; + + switch (ivhd->Header.Type) { + case IVRS_TYPE_HARDWARE_LEGACY: + p = (uint8_t *)ivhd + sizeof(ACPI_IVRS_HARDWARE); + break; + + case IVRS_TYPE_HARDWARE_EFR: + case IVRS_TYPE_HARDWARE_MIXED: + p = (uint8_t *)ivhd + sizeof(ACPI_IVRS_HARDWARE_EFRSUP); + break; + + default: + device_printf(softc->dev, + "unknown type: 0x%x\n", ivhd->Header.Type); + return (-1); + } + + end = (uint8_t *)ivhd + ivhd->Header.Length; + + while (p < end) { + de = (ACPI_IVRS_DE_HEADER *)p; + softc->start_dev_rid = MIN(softc->start_dev_rid, de->Id); + softc->end_dev_rid = MAX(softc->end_dev_rid, de->Id); + switch (de->Type) { + case ACPI_IVRS_TYPE_ALL: + all_data = de->DataSetting; + break; + + case ACPI_IVRS_TYPE_SELECT: + case ACPI_IVRS_TYPE_ALIAS_SELECT: + case ACPI_IVRS_TYPE_EXT_SELECT: + enable_ats = false; + if (de->Type == ACPI_IVRS_TYPE_EXT_SELECT) { + extended = (uint32_t *)(de + 1); + enable_ats = + (*extended & IVHD_DEV_EXT_ATS_DISABLE) ? + false : true; + } + ivhd_dev_add_entry(softc, de->Id, de->Id, + de->DataSetting | all_data, enable_ats); + break; + + case ACPI_IVRS_TYPE_START: + case ACPI_IVRS_TYPE_ALIAS_START: + case ACPI_IVRS_TYPE_EXT_START: + range_start_id = de->Id; + range_data = de->DataSetting; + if (de->Type == ACPI_IVRS_TYPE_EXT_START) { + extended = (uint32_t *)(de + 1); + range_enable_ats = + (*extended & IVHD_DEV_EXT_ATS_DISABLE) ? + false : true; + } + break; + + case ACPI_IVRS_TYPE_END: + range_end_id = de->Id; + ivhd_dev_add_entry(softc, range_start_id, range_end_id, + range_data | all_data, range_enable_ats); + range_start_id = range_end_id = 0; + range_data = 0; + all_data = 0; + break; + + case ACPI_IVRS_TYPE_PAD4: + break; + + case ACPI_IVRS_TYPE_SPECIAL: + /* HPET or IOAPIC */ + break; + default: + if ((de->Type < 5) || + (de->Type >= ACPI_IVRS_TYPE_PAD8)) + device_printf(softc->dev, + "Unknown dev entry:0x%x\n", de->Type); + } + + if (softc->dev_cfg_cnt > + (sizeof(softc->dev_cfg) / sizeof(softc->dev_cfg[0]))) { + device_printf(softc->dev, + "WARN Too many device entries.\n"); + return (EINVAL); + } + if (de->Type < 0x40) + p += sizeof(ACPI_IVRS_DEVICE4); + else if (de->Type < 0x80) + p += sizeof(ACPI_IVRS_DEVICE8A); + else { + printf("Variable size IVHD type 0x%x not supported\n", + de->Type); + break; + } + } + + KASSERT((softc->end_dev_rid >= softc->start_dev_rid), + ("Device end[0x%x] < start[0x%x.\n", + softc->end_dev_rid, softc->start_dev_rid)); + + return (0); +} + +static bool +ivhd_is_newer(ACPI_IVRS_HEADER *old, ACPI_IVRS_HEADER *new) +{ + /* + * Newer IVRS header type take precedence. 
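+	 * E.g. if the BIOS publishes both a legacy type 0x10 IVHD and a
+	 * type 0x11 IVHD for the same DeviceId, ivhd_identify() drops
+	 * the 0x10 header and keeps the EFR-capable one.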
+ */ + if ((old->DeviceId == new->DeviceId) && + (old->Type == IVRS_TYPE_HARDWARE_LEGACY) && + ((new->Type == IVRS_TYPE_HARDWARE_EFR) || + (new->Type == IVRS_TYPE_HARDWARE_MIXED))) { + return (true); + } + + return (false); +} + +static void +ivhd_identify(driver_t *driver, device_t parent) +{ + ACPI_TABLE_IVRS *ivrs; + ACPI_IVRS_HARDWARE *ivhd; + ACPI_STATUS status; + int i, count = 0; + uint32_t ivrs_ivinfo; + + if (acpi_disabled("ivhd")) + return; + + status = AcpiGetTable(ACPI_SIG_IVRS, 1, (ACPI_TABLE_HEADER **)&ivrs); + if (ACPI_FAILURE(status)) + return; + + if (ivrs->Header.Length == 0) { + return; + } + + ivrs_ivinfo = ivrs->Info; + printf("AMD-Vi: IVRS Info VAsize = %d PAsize = %d GVAsize = %d" + " flags:%b\n", + REG_BITS(ivrs_ivinfo, 21, 15), REG_BITS(ivrs_ivinfo, 14, 8), + REG_BITS(ivrs_ivinfo, 7, 5), REG_BITS(ivrs_ivinfo, 22, 22), + "\020\001EFRSup"); + + ivrs_hdr_iterate_tbl(ivhd_count_iter, NULL); + if (!ivhd_count) + return; + + for (i = 0; i < ivhd_count; i++) { + ivhd = ivhd_find_by_index(i); + KASSERT(ivhd, ("ivhd%d is NULL\n", i)); + ivhd_hdrs[i] = ivhd; + } + + /* + * Scan for presence of legacy and non-legacy device type + * for same AMD-Vi device and override the old one. + */ + for (i = ivhd_count - 1 ; i > 0 ; i--){ + if (ivhd_is_newer(&ivhd_hdrs[i-1]->Header, + &ivhd_hdrs[i]->Header)) { + ivhd_hdrs[i-1] = ivhd_hdrs[i]; + ivhd_count--; + } + } + + ivhd_devs = malloc(sizeof(device_t) * ivhd_count, M_DEVBUF, + M_WAITOK | M_ZERO); + for (i = 0; i < ivhd_count; i++) { + ivhd = ivhd_hdrs[i]; + KASSERT(ivhd, ("ivhd%d is NULL\n", i)); + + /* + * Use a high order to ensure that this driver is probed after + * the Host-PCI bridge and the root PCI bus. + */ + ivhd_devs[i] = BUS_ADD_CHILD(parent, + ACPI_DEV_BASE_ORDER + 10 * 10, "ivhd", i); + + /* + * XXX: In case device was not destroyed before, add will fail. + * locate the old device instance. + */ + if (ivhd_devs[i] == NULL) { + ivhd_devs[i] = device_find_child(parent, "ivhd", i); + if (ivhd_devs[i] == NULL) { + printf("AMD-Vi: cant find ivhd%d\n", i); + break; + } + } + count++; + } + + /* + * Update device count in case failed to attach. + */ + ivhd_count = count; +} + +static int +ivhd_probe(device_t dev) +{ + ACPI_IVRS_HARDWARE *ivhd; + int unit; + + if (acpi_get_handle(dev) != NULL) + return (ENXIO); + + unit = device_get_unit(dev); + KASSERT((unit < ivhd_count), + ("ivhd unit %d > count %d", unit, ivhd_count)); + ivhd = ivhd_hdrs[unit]; + KASSERT(ivhd, ("ivhd is NULL")); + + switch (ivhd->Header.Type) { + case IVRS_TYPE_HARDWARE_EFR: + device_set_desc(dev, "AMD-Vi/IOMMU ivhd with EFR"); + break; + + case IVRS_TYPE_HARDWARE_MIXED: + device_set_desc(dev, "AMD-Vi/IOMMU ivhd in mixed format"); + break; + + case IVRS_TYPE_HARDWARE_LEGACY: + default: + device_set_desc(dev, "AMD-Vi/IOMMU ivhd"); + break; + } + + return (BUS_PROBE_NOWILDCARD); +} + +static void +ivhd_print_flag(device_t dev, enum IvrsType ivhd_type, uint8_t flag) +{ + /* + * IVHD lgeacy type has two extra high bits in flag which has + * been moved to EFR for non-legacy device. 
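+	 * For reference, the "%b" kernel format used below decodes bits
+	 * by name: the leading "\020" is the output base (020 octal =
+	 * 16), and each "\NNN<name>" pairs a 1-based octal bit number
+	 * with its label. A flag value of 0x3 would thus print as
+	 *
+	 *	3<HtTunEn,PassPW>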
+ */ + switch (ivhd_type) { + case IVRS_TYPE_HARDWARE_LEGACY: + device_printf(dev, "Flag:%b\n", flag, + "\020" + "\001HtTunEn" + "\002PassPW" + "\003ResPassPW" + "\004Isoc" + "\005IotlbSup" + "\006Coherent" + "\007PreFSup" + "\008PPRSup"); + break; + + case IVRS_TYPE_HARDWARE_EFR: + case IVRS_TYPE_HARDWARE_MIXED: + device_printf(dev, "Flag:%b\n", flag, + "\020" + "\001HtTunEn" + "\002PassPW" + "\003ResPassPW" + "\004Isoc" + "\005IotlbSup" + "\006Coherent"); + break; + + default: + device_printf(dev, "Can't decode flag of ivhd type :0x%x\n", + ivhd_type); + break; + } +} + +/* + * Feature in legacy IVHD type(0x10) and attribute in newer type(0x11 and 0x40). + */ +static void +ivhd_print_feature(device_t dev, enum IvrsType ivhd_type, uint32_t feature) +{ + switch (ivhd_type) { + case IVRS_TYPE_HARDWARE_LEGACY: + device_printf(dev, "Features(type:0x%x) HATS = %d GATS = %d" + " MsiNumPPR = %d PNBanks= %d PNCounters= %d\n", + ivhd_type, + REG_BITS(feature, 31, 30), + REG_BITS(feature, 29, 28), + REG_BITS(feature, 27, 23), + REG_BITS(feature, 22, 17), + REG_BITS(feature, 16, 13)); + device_printf(dev, "max PASID = %d GLXSup = %d Feature:%b\n", + REG_BITS(feature, 12, 8), + REG_BITS(feature, 4, 3), + feature, + "\020" + "\002NXSup" + "\003GTSup" + "\004" + "\005IASup" + "\006GASup" + "\007HESup"); + break; + + /* Fewer features or attributes are reported in non-legacy type. */ + case IVRS_TYPE_HARDWARE_EFR: + case IVRS_TYPE_HARDWARE_MIXED: + device_printf(dev, "Features(type:0x%x) MsiNumPPR = %d" + " PNBanks= %d PNCounters= %d\n", + ivhd_type, + REG_BITS(feature, 27, 23), + REG_BITS(feature, 22, 17), + REG_BITS(feature, 16, 13)); + break; + + default: /* Other ivhd type features are not decoded. */ + device_printf(dev, "Can't decode ivhd type :0x%x\n", ivhd_type); + } +} + +/* Print extended features of IOMMU. */ +static void +ivhd_print_ext_feature(device_t dev, uint64_t ext_feature) +{ + uint32_t ext_low, ext_high; + + if (!ext_feature) + return; + + ext_low = ext_feature; + device_printf(dev, "Extended features[31:0]:%b " + "HATS = 0x%x GATS = 0x%x " + "GLXSup = 0x%x SmiFSup = 0x%x SmiFRC = 0x%x " + "GAMSup = 0x%x DualPortLogSup = 0x%x DualEventLogSup = 0x%x\n", + (int)ext_low, + "\020" + "\001PreFSup" + "\002PPRSup" + "\003" + "\004NXSup" + "\005GTSup" + "\006" + "\007IASup" + "\008GASup" + "\009HESup" + "\010PCSup", + REG_BITS(ext_low, 11, 10), + REG_BITS(ext_low, 13, 12), + REG_BITS(ext_low, 15, 14), + REG_BITS(ext_low, 17, 16), + REG_BITS(ext_low, 20, 18), + REG_BITS(ext_low, 23, 21), + REG_BITS(ext_low, 25, 24), + REG_BITS(ext_low, 29, 28)); + + ext_high = ext_feature >> 32; + device_printf(dev, "Extended features[62:32]:%b " + "Max PASID: 0x%x DevTblSegSup = 0x%x " + "MarcSup = 0x%x\n", + (int)(ext_high), + "\020" + "\006USSup" + "\009PprOvrflwEarlySup" + "\010PPRAutoRspSup" + "\013BlKStopMrkSup" + "\014PerfOptSup" + "\015MsiCapMmioSup" + "\017GIOSup" + "\018HASup" + "\019EPHSup" + "\020AttrFWSup" + "\021HDSup" + "\023InvIotlbSup", + REG_BITS(ext_high, 5, 0), + REG_BITS(ext_high, 8, 7), + REG_BITS(ext_high, 11, 10)); +} + +static int +ivhd_print_cap(struct amdvi_softc *softc, ACPI_IVRS_HARDWARE * ivhd) +{ + device_t dev; + int max_ptp_level; + + dev = softc->dev; + + ivhd_print_flag(dev, softc->ivhd_type, softc->ivhd_flag); + ivhd_print_feature(dev, softc->ivhd_type, softc->ivhd_feature); + ivhd_print_ext_feature(dev, softc->ext_feature); + max_ptp_level = 7; + /* Make sure device support minimum page level as requested by user. 
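+	 * max_ptp_level is pinned to 7 (the architectural maximum)
+	 * above; a stricter check could instead derive it from the HATS
+	 * field decoded by ivhd_print_feature(), e.g. this sketch, which
+	 * assumes HATS encodes the supported levels minus four:
+	 *
+	 *	max_ptp_level = REG_BITS(softc->ivhd_feature, 31, 30) + 4;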
*/ + if (max_ptp_level < amdvi_ptp_level) { + device_printf(dev, "insufficient PTP level:%d\n", + max_ptp_level); + return (EINVAL); + } else { + device_printf(softc->dev, "supported paging level:%d, will use only: %d\n", + max_ptp_level, amdvi_ptp_level); + } + + device_printf(softc->dev, "device range: 0x%x - 0x%x\n", + softc->start_dev_rid, softc->end_dev_rid); + + return (0); +} + +static int +ivhd_attach(device_t dev) +{ + ACPI_IVRS_HARDWARE *ivhd; + ACPI_IVRS_HARDWARE_EFRSUP *ivhd_efr; + struct amdvi_softc *softc; + int status, unit; + + unit = device_get_unit(dev); + KASSERT((unit < ivhd_count), + ("ivhd unit %d > count %d", unit, ivhd_count)); + /* Make sure its same device for which attach is called. */ + KASSERT((ivhd_devs[unit] == dev), + ("Not same device old %p new %p", ivhd_devs[unit], dev)); + + softc = device_get_softc(dev); + softc->dev = dev; + ivhd = ivhd_hdrs[unit]; + KASSERT(ivhd, ("ivhd is NULL")); + + softc->ivhd_type = ivhd->Header.Type; + softc->pci_seg = ivhd->PciSegmentGroup; + softc->pci_rid = ivhd->Header.DeviceId; + softc->ivhd_flag = ivhd->Header.Flags; + /* + * On lgeacy IVHD type(0x10), it is documented as feature + * but in newer type it is attribute. + */ + softc->ivhd_feature = ivhd->Reserved; + /* + * PCI capability has more capabilities that are not part of IVRS. + */ + softc->cap_off = ivhd->CapabilityOffset; + +#ifdef notyet + /* IVHD Info bit[4:0] is event MSI/X number. */ + softc->event_msix = ivhd->Info & 0x1F; +#endif + switch (ivhd->Header.Type) { + case IVRS_TYPE_HARDWARE_EFR: + case IVRS_TYPE_HARDWARE_MIXED: + ivhd_efr = (ACPI_IVRS_HARDWARE_EFRSUP *)ivhd; + softc->ext_feature = ivhd_efr->ExtFR; + break; + + } + + softc->ctrl = (struct amdvi_ctrl *) PHYS_TO_DMAP(ivhd->BaseAddress); + status = ivhd_dev_parse(ivhd, softc); + if (status != 0) { + device_printf(dev, + "endpoint device parsing error=%d\n", status); + } + + status = ivhd_print_cap(softc, ivhd); + if (status != 0) { + return (status); + } + + status = amdvi_setup_hw(softc); + if (status != 0) { + device_printf(dev, "couldn't be initialised, error=%d\n", + status); + return (status); + } + + return (0); +} + +static int +ivhd_detach(device_t dev) +{ + struct amdvi_softc *softc; + + softc = device_get_softc(dev); + + amdvi_teardown_hw(softc); + + /* + * XXX: delete the device. + * don't allow detach, return EBUSY. + */ + return (0); +} + +static int +ivhd_suspend(device_t dev) +{ + + return (0); +} + +static int +ivhd_resume(device_t dev) +{ + + return (0); +} + +static device_method_t ivhd_methods[] = { + DEVMETHOD(device_identify, ivhd_identify), + DEVMETHOD(device_probe, ivhd_probe), + DEVMETHOD(device_attach, ivhd_attach), + DEVMETHOD(device_detach, ivhd_detach), + DEVMETHOD(device_suspend, ivhd_suspend), + DEVMETHOD(device_resume, ivhd_resume), + DEVMETHOD_END +}; + +static driver_t ivhd_driver = { + "ivhd", + ivhd_methods, + sizeof(struct amdvi_softc), +}; + +static devclass_t ivhd_devclass; + +/* + * Load this module at the end after PCI re-probing to configure interrupt. + */ +DRIVER_MODULE_ORDERED(ivhd, acpi, ivhd_driver, ivhd_devclass, 0, 0, + SI_ORDER_ANY); +MODULE_DEPEND(ivhd, acpi, 1, 1, 1); +MODULE_DEPEND(ivhd, pci, 1, 1, 1); diff --git a/usr/src/uts/i86pc/io/vmm/amd/npt.c b/usr/src/uts/i86pc/io/vmm/amd/npt.c new file mode 100644 index 0000000000..e61464a964 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/npt.c @@ -0,0 +1,87 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include +#include + +#include "npt.h" + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, npt, CTLFLAG_RW, NULL, NULL); + +static int npt_flags; +SYSCTL_INT(_hw_vmm_npt, OID_AUTO, pmap_flags, CTLFLAG_RD, + &npt_flags, 0, NULL); + +#define NPT_IPIMASK 0xFF + +/* + * AMD nested page table init. + */ +int +svm_npt_init(int ipinum) +{ + int enable_superpage = 1; + + npt_flags = ipinum & NPT_IPIMASK; + TUNABLE_INT_FETCH("hw.vmm.npt.enable_superpage", &enable_superpage); + if (enable_superpage) + npt_flags |= PMAP_PDE_SUPERPAGE; + + return (0); +} + +static int +npt_pinit(pmap_t pmap) +{ + + return (pmap_pinit_type(pmap, PT_RVI, npt_flags)); +} + +struct vmspace * +svm_npt_alloc(vm_offset_t min, vm_offset_t max) +{ + + return (vmspace_alloc(min, max, npt_pinit)); +} + +void +svm_npt_free(struct vmspace *vmspace) +{ + + vmspace_free(vmspace); +} diff --git a/usr/src/uts/i86pc/io/vmm/amd/npt.h b/usr/src/uts/i86pc/io/vmm/amd/npt.h new file mode 100644 index 0000000000..35530d7833 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/npt.h @@ -0,0 +1,38 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SVM_NPT_H_ +#define _SVM_NPT_H_ + +int svm_npt_init(int ipinum); +struct vmspace *svm_npt_alloc(vm_offset_t min, vm_offset_t max); +void svm_npt_free(struct vmspace *vmspace); + +#endif /* _SVM_NPT_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/amd/offsets.in b/usr/src/uts/i86pc/io/vmm/amd/offsets.in new file mode 100644 index 0000000000..f8d2a716d7 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/offsets.in @@ -0,0 +1,36 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ +#include + +#include "amd/svm.h" + +svm_regctx + sctx_rbx SCTX_RBX + sctx_rcx SCTX_RCX + sctx_rbp SCTX_RBP + sctx_rdx SCTX_RDX + sctx_rdi SCTX_RDI + sctx_rsi SCTX_RSI + sctx_r8 SCTX_R8 + sctx_r9 SCTX_R9 + sctx_r10 SCTX_R10 + sctx_r11 SCTX_R11 + sctx_r12 SCTX_R12 + sctx_r13 SCTX_R13 + sctx_r14 SCTX_R14 + sctx_r15 SCTX_R15 + +/* Pull in definition for MSR_GSBASE */ +\#include diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm.c b/usr/src/uts/i86pc/io/vmm/amd/svm.c new file mode 100644 index 0000000000..25dc3a63fa --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/svm.c @@ -0,0 +1,2446 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright 2018 Joyent, Inc. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef __FreeBSD__ +#include +#include +#endif + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vmm_lapic.h" +#include "vmm_stat.h" +#include "vmm_ktr.h" +#include "vmm_ioport.h" +#include "vatpic.h" +#include "vlapic.h" +#include "vlapic_priv.h" + +#include "x86.h" +#include "vmcb.h" +#include "svm.h" +#include "svm_softc.h" +#include "svm_msr.h" +#include "npt.h" + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW, NULL, NULL); + +/* + * SVM CPUID function 0x8000_000A, edx bit decoding. + */ +#define AMD_CPUID_SVM_NP BIT(0) /* Nested paging or RVI */ +#define AMD_CPUID_SVM_LBR BIT(1) /* Last branch virtualization */ +#define AMD_CPUID_SVM_SVML BIT(2) /* SVM lock */ +#define AMD_CPUID_SVM_NRIP_SAVE BIT(3) /* Next RIP is saved */ +#define AMD_CPUID_SVM_TSC_RATE BIT(4) /* TSC rate control. */ +#define AMD_CPUID_SVM_VMCB_CLEAN BIT(5) /* VMCB state caching */ +#define AMD_CPUID_SVM_FLUSH_BY_ASID BIT(6) /* Flush by ASID */ +#define AMD_CPUID_SVM_DECODE_ASSIST BIT(7) /* Decode assist */ +#define AMD_CPUID_SVM_PAUSE_INC BIT(10) /* Pause intercept filter. */ +#define AMD_CPUID_SVM_PAUSE_FTH BIT(12) /* Pause filter threshold */ +#define AMD_CPUID_SVM_AVIC BIT(13) /* AVIC present */ + +#define VMCB_CACHE_DEFAULT (VMCB_CACHE_ASID | \ + VMCB_CACHE_IOPM | \ + VMCB_CACHE_I | \ + VMCB_CACHE_TPR | \ + VMCB_CACHE_CR2 | \ + VMCB_CACHE_CR | \ + VMCB_CACHE_DR | \ + VMCB_CACHE_DT | \ + VMCB_CACHE_SEG | \ + VMCB_CACHE_NP) + +static uint32_t vmcb_clean = VMCB_CACHE_DEFAULT; +SYSCTL_INT(_hw_vmm_svm, OID_AUTO, vmcb_clean, CTLFLAG_RDTUN, &vmcb_clean, + 0, NULL); + +static MALLOC_DEFINE(M_SVM, "svm", "svm"); +static MALLOC_DEFINE(M_SVM_VLAPIC, "svm-vlapic", "svm-vlapic"); + +#ifdef __FreeBSD__ +/* Per-CPU context area. */ +extern struct pcpu __pcpu[]; +#endif + +static uint32_t svm_feature = ~0U; /* AMD SVM features. */ +SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, features, CTLFLAG_RDTUN, &svm_feature, 0, + "SVM features advertised by CPUID.8000000AH:EDX"); + +static int disable_npf_assist; +SYSCTL_INT(_hw_vmm_svm, OID_AUTO, disable_npf_assist, CTLFLAG_RWTUN, + &disable_npf_assist, 0, NULL); + +#ifdef __FreeBSD__ +/* Maximum ASIDs supported by the processor */ +static uint32_t nasid; +SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, num_asids, CTLFLAG_RDTUN, &nasid, 0, + "Number of ASIDs supported by this processor"); + +/* Current ASID generation for each host cpu */ +static struct asid asid[MAXCPU]; + +/* + * SVM host state saved area of size 4KB for each core. + */ +static uint8_t hsave[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); +#endif /* __FreeBSD__ */ + +static VMM_STAT_AMD(VCPU_EXITINTINFO, "VM exits during event delivery"); +static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "Events pending at VM entry"); +static VMM_STAT_AMD(VMEXIT_VINTR, "VM exits due to interrupt window"); + +static int svm_setreg(void *arg, int vcpu, int ident, uint64_t val); + +static __inline int +flush_by_asid(void) +{ + + return (svm_feature & AMD_CPUID_SVM_FLUSH_BY_ASID); +} + +static __inline int +decode_assist(void) +{ + + return (svm_feature & AMD_CPUID_SVM_DECODE_ASSIST); +} + +#ifdef __FreeBSD__ +static void +svm_disable(void *arg __unused) +{ + uint64_t efer; + + efer = rdmsr(MSR_EFER); + efer &= ~EFER_SVM; + wrmsr(MSR_EFER, efer); +} + +/* + * Disable SVM on all CPUs. 
+ */ +static int +svm_cleanup(void) +{ + + smp_rendezvous(NULL, svm_disable, NULL, NULL); + return (0); +} + +/* + * Verify that all the features required by bhyve are available. + */ +static int +check_svm_features(void) +{ + u_int regs[4]; + + /* CPUID Fn8000_000A is for SVM */ + do_cpuid(0x8000000A, regs); + svm_feature &= regs[3]; + + /* + * The number of ASIDs can be configured to be less than what is + * supported by the hardware but not more. + */ + if (nasid == 0 || nasid > regs[1]) + nasid = regs[1]; + KASSERT(nasid > 1, ("Insufficient ASIDs for guests: %#x", nasid)); + + /* bhyve requires the Nested Paging feature */ + if (!(svm_feature & AMD_CPUID_SVM_NP)) { + printf("SVM: Nested Paging feature not available.\n"); + return (ENXIO); + } + + /* bhyve requires the NRIP Save feature */ + if (!(svm_feature & AMD_CPUID_SVM_NRIP_SAVE)) { + printf("SVM: NRIP Save feature not available.\n"); + return (ENXIO); + } + + return (0); +} + +static void +svm_enable(void *arg __unused) +{ + uint64_t efer; + + efer = rdmsr(MSR_EFER); + efer |= EFER_SVM; + wrmsr(MSR_EFER, efer); + + wrmsr(MSR_VM_HSAVE_PA, vtophys(hsave[curcpu])); +} + +/* + * Return 1 if SVM is enabled on this processor and 0 otherwise. + */ +static int +svm_available(void) +{ + uint64_t msr; + +#ifdef __FreeBSD__ + /* Section 15.4 Enabling SVM from APM2. */ + if ((amd_feature2 & AMDID2_SVM) == 0) { + printf("SVM: not available.\n"); + return (0); + } +#else + if (!is_x86_feature(x86_featureset, X86FSET_SVM)) { + cmn_err(CE_WARN, "processor does not support SVM operation\n"); + return (0); + } +#endif + + msr = rdmsr(MSR_VM_CR); + if ((msr & VM_CR_SVMDIS) != 0) { +#ifdef __FreeBSD__ + printf("SVM: disabled by BIOS.\n"); +#else + cmn_err(CE_WARN, "SVM disabled by BIOS.\n"); +#endif + return (0); + } + + return (1); +} + +static int +svm_init(int ipinum) +{ + int error, cpu; + + if (!svm_available()) + return (ENXIO); + + error = check_svm_features(); + if (error) + return (error); + + vmcb_clean &= VMCB_CACHE_DEFAULT; + + for (cpu = 0; cpu < MAXCPU; cpu++) { + /* + * Initialize the host ASIDs to their "highest" valid values. + * + * The next ASID allocation will rollover both 'gen' and 'num' + * and start off the sequence at {1,1}. + */ + asid[cpu].gen = ~0UL; + asid[cpu].num = nasid - 1; + } + + svm_msr_init(); + svm_npt_init(ipinum); + + /* Enable SVM on all CPUs */ + smp_rendezvous(NULL, svm_enable, NULL, NULL); + + return (0); +} + +static void +svm_restore(void) +{ + + svm_enable(NULL); +} +#else /* __FreeBSD__ */ +static int +svm_cleanup(void) +{ + /* This is taken care of by the hma registration */ + return (0); +} + +static int +svm_init(int ipinum) +{ + vmcb_clean &= VMCB_CACHE_DEFAULT; + + svm_msr_init(); + svm_npt_init(ipinum); + + return (0); +} + +static void +svm_restore(void) +{ + /* No-op on illumos */ +} +#endif /* __FreeBSD__ */ + +/* Pentium compatible MSRs */ +#define MSR_PENTIUM_START 0 +#define MSR_PENTIUM_END 0x1FFF +/* AMD 6th generation and Intel compatible MSRs */ +#define MSR_AMD6TH_START 0xC0000000UL +#define MSR_AMD6TH_END 0xC0001FFFUL +/* AMD 7th and 8th generation compatible MSRs */ +#define MSR_AMD7TH_START 0xC0010000UL +#define MSR_AMD7TH_END 0xC0011FFFUL + +/* + * Get the index and bit position for a MSR in permission bitmap. + * Two bits are used for each MSR: lower bit for read and higher bit for write. 
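+ * Worked example: MSR_EFER (0xC0000080) falls in the 6th-generation
+ * range, so off = 0x80 and base = 0x2000 (the size of the Pentium
+ * range), giving
+ *
+ *	*index = (0x80 + 0x2000) / 4 = 0x820;
+ *	*bit = (0xC0000080 % 4) * 2 = 0;
+ *
+ * i.e. read permission is controlled by bit 0 and write permission by
+ * bit 1 of perm_bitmap[0x820].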
+ */ +static int +svm_msr_index(uint64_t msr, int *index, int *bit) +{ + uint32_t base, off; + + *index = -1; + *bit = (msr % 4) * 2; + base = 0; + + if (msr <= MSR_PENTIUM_END) { + *index = msr / 4; + return (0); + } + + base += (MSR_PENTIUM_END - MSR_PENTIUM_START + 1); + if (msr >= MSR_AMD6TH_START && msr <= MSR_AMD6TH_END) { + off = (msr - MSR_AMD6TH_START); + *index = (off + base) / 4; + return (0); + } + + base += (MSR_AMD6TH_END - MSR_AMD6TH_START + 1); + if (msr >= MSR_AMD7TH_START && msr <= MSR_AMD7TH_END) { + off = (msr - MSR_AMD7TH_START); + *index = (off + base) / 4; + return (0); + } + + return (EINVAL); +} + +/* + * Allow vcpu to read or write the 'msr' without trapping into the hypervisor. + */ +static void +svm_msr_perm(uint8_t *perm_bitmap, uint64_t msr, bool read, bool write) +{ + int index, bit, error; + + error = svm_msr_index(msr, &index, &bit); + KASSERT(error == 0, ("%s: invalid msr %#lx", __func__, msr)); + KASSERT(index >= 0 && index < SVM_MSR_BITMAP_SIZE, + ("%s: invalid index %d for msr %#lx", __func__, index, msr)); + KASSERT(bit >= 0 && bit <= 6, ("%s: invalid bit position %d " + "msr %#lx", __func__, bit, msr)); + + if (read) + perm_bitmap[index] &= ~(1UL << bit); + + if (write) + perm_bitmap[index] &= ~(2UL << bit); +} + +static void +svm_msr_rw_ok(uint8_t *perm_bitmap, uint64_t msr) +{ + + svm_msr_perm(perm_bitmap, msr, true, true); +} + +static void +svm_msr_rd_ok(uint8_t *perm_bitmap, uint64_t msr) +{ + + svm_msr_perm(perm_bitmap, msr, true, false); +} + +static __inline int +svm_get_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask) +{ + struct vmcb_ctrl *ctrl; + + KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx)); + + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + return (ctrl->intercept[idx] & bitmask ? 1 : 0); +} + +static __inline void +svm_set_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask, + int enabled) +{ + struct vmcb_ctrl *ctrl; + uint32_t oldval; + + KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx)); + + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + oldval = ctrl->intercept[idx]; + + if (enabled) + ctrl->intercept[idx] |= bitmask; + else + ctrl->intercept[idx] &= ~bitmask; + + if (ctrl->intercept[idx] != oldval) { + svm_set_dirty(sc, vcpu, VMCB_CACHE_I); + VCPU_CTR3(sc->vm, vcpu, "intercept[%d] modified " + "from %#x to %#x", idx, oldval, ctrl->intercept[idx]); + } +} + +static __inline void +svm_disable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask) +{ + + svm_set_intercept(sc, vcpu, off, bitmask, 0); +} + +static __inline void +svm_enable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask) +{ + + svm_set_intercept(sc, vcpu, off, bitmask, 1); +} + +static void +vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa, + uint64_t msrpm_base_pa, uint64_t np_pml4) +{ + struct vmcb_ctrl *ctrl; + struct vmcb_state *state; + uint32_t mask; + int n; + + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + state = svm_get_vmcb_state(sc, vcpu); + + ctrl->iopm_base_pa = iopm_base_pa; + ctrl->msrpm_base_pa = msrpm_base_pa; + + /* Enable nested paging */ + ctrl->np_enable = 1; + ctrl->n_cr3 = np_pml4; + + /* + * Intercept accesses to the control registers that are not shadowed + * in the VMCB - i.e. all except cr0, cr2, cr3, cr4 and cr8. 
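+	 * Layout note: in intercept[VMCB_CR_INTCPT], bit n is the CRn
+	 * read intercept and bit (n + 16) the CRn write intercept,
+	 * hence the combined mask built below:
+	 *
+	 *	mask = (BIT(n) << 16) | BIT(n);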
+ */ + for (n = 0; n < 16; n++) { + mask = (BIT(n) << 16) | BIT(n); + if (n == 0 || n == 2 || n == 3 || n == 4 || n == 8) + svm_disable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask); + else + svm_enable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask); + } + + + /* + * Intercept everything when tracing guest exceptions otherwise + * just intercept machine check exception. + */ + if (vcpu_trace_exceptions(sc->vm, vcpu)) { + for (n = 0; n < 32; n++) { + /* + * Skip unimplemented vectors in the exception bitmap. + */ + if (n == 2 || n == 9) { + continue; + } + svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(n)); + } + } else { + svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(IDT_MC)); + } + + /* Intercept various events (for e.g. I/O, MSR and CPUID accesses) */ + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IO); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_MSR); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_CPUID); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INTR); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INIT); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_NMI); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SMI); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SHUTDOWN); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, + VMCB_INTCPT_FERR_FREEZE); + + svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MONITOR); + svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MWAIT); + + /* + * From section "Canonicalization and Consistency Checks" in APMv2 + * the VMRUN intercept bit must be set to pass the consistency check. + */ + svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMRUN); + + /* + * The ASID will be set to a non-zero value just before VMRUN. + */ + ctrl->asid = 0; + + /* + * Section 15.21.1, Interrupt Masking in EFLAGS + * Section 15.21.2, Virtualizing APIC.TPR + * + * This must be set for %rflag and %cr8 isolation of guest and host. + */ + ctrl->v_intr_masking = 1; + + /* Enable Last Branch Record aka LBR for debugging */ + ctrl->lbr_virt_en = 1; + state->dbgctl = BIT(0); + + /* EFER_SVM must always be set when the guest is executing */ + state->efer = EFER_SVM; + + /* Set up the PAT to power-on state */ + state->g_pat = PAT_VALUE(0, PAT_WRITE_BACK) | + PAT_VALUE(1, PAT_WRITE_THROUGH) | + PAT_VALUE(2, PAT_UNCACHED) | + PAT_VALUE(3, PAT_UNCACHEABLE) | + PAT_VALUE(4, PAT_WRITE_BACK) | + PAT_VALUE(5, PAT_WRITE_THROUGH) | + PAT_VALUE(6, PAT_UNCACHED) | + PAT_VALUE(7, PAT_UNCACHEABLE); + + /* Set up DR6/7 to power-on state */ + state->dr6 = DBREG_DR6_RESERVED1; + state->dr7 = DBREG_DR7_RESERVED1; +} + +/* + * Initialize a virtual machine. 
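+ * Per-VM state is one page-aligned svm_softc plus the two permission
+ * bitmaps; their physical addresses are handed straight to hardware,
+ * which is why svm_vminit() panics on a misaligned allocation. The
+ * per-vCPU wiring below reduces to:
+ *
+ *	vcpu->vmcb_pa = vtophys(&vcpu->vmcb);
+ *	vmcb_init(svm_sc, i, iopm_pa, msrpm_pa, pml4_pa);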
+ */ +static void * +svm_vminit(struct vm *vm, pmap_t pmap) +{ + struct svm_softc *svm_sc; + struct svm_vcpu *vcpu; + vm_paddr_t msrpm_pa, iopm_pa, pml4_pa; + int i; + uint16_t maxcpus; + + svm_sc = malloc(sizeof (*svm_sc), M_SVM, M_WAITOK | M_ZERO); + if (((uintptr_t)svm_sc & PAGE_MASK) != 0) + panic("malloc of svm_softc not aligned on page boundary"); + + svm_sc->msr_bitmap = contigmalloc(SVM_MSR_BITMAP_SIZE, M_SVM, + M_WAITOK, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0); + if (svm_sc->msr_bitmap == NULL) + panic("contigmalloc of SVM MSR bitmap failed"); + svm_sc->iopm_bitmap = contigmalloc(SVM_IO_BITMAP_SIZE, M_SVM, + M_WAITOK, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0); + if (svm_sc->iopm_bitmap == NULL) + panic("contigmalloc of SVM IO bitmap failed"); + + svm_sc->vm = vm; + svm_sc->nptp = (vm_offset_t)vtophys(pmap->pm_pml4); + + /* + * Intercept read and write accesses to all MSRs. + */ + memset(svm_sc->msr_bitmap, 0xFF, SVM_MSR_BITMAP_SIZE); + + /* + * Access to the following MSRs is redirected to the VMCB when the + * guest is executing. Therefore it is safe to allow the guest to + * read/write these MSRs directly without hypervisor involvement. + */ + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_GSBASE); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_FSBASE); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_KGSBASE); + + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_STAR); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_LSTAR); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_CSTAR); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SF_MASK); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_CS_MSR); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_ESP_MSR); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_EIP_MSR); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_PAT); + + svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_TSC); + + /* + * Intercept writes to make sure that the EFER_SVM bit is not cleared. + */ + svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_EFER); + + /* Intercept access to all I/O ports. */ + memset(svm_sc->iopm_bitmap, 0xFF, SVM_IO_BITMAP_SIZE); + + iopm_pa = vtophys(svm_sc->iopm_bitmap); + msrpm_pa = vtophys(svm_sc->msr_bitmap); + pml4_pa = svm_sc->nptp; + maxcpus = vm_get_maxcpus(svm_sc->vm); + for (i = 0; i < maxcpus; i++) { + vcpu = svm_get_vcpu(svm_sc, i); + vcpu->nextrip = ~0; + vcpu->lastcpu = NOCPU; + vcpu->vmcb_pa = vtophys(&vcpu->vmcb); + vmcb_init(svm_sc, i, iopm_pa, msrpm_pa, pml4_pa); + svm_msr_guest_init(svm_sc, i); + } + return (svm_sc); +} + +/* + * Collateral for a generic SVM VM-exit. + */ +static void +vm_exit_svm(struct vm_exit *vme, uint64_t code, uint64_t info1, uint64_t info2) +{ + + vme->exitcode = VM_EXITCODE_SVM; + vme->u.svm.exitcode = code; + vme->u.svm.exitinfo1 = info1; + vme->u.svm.exitinfo2 = info2; +} + +static int +svm_cpl(struct vmcb_state *state) +{ + + /* + * From APMv2: + * "Retrieve the CPL from the CPL field in the VMCB, not + * from any segment DPL" + */ + return (state->cpl); +} + +static enum vm_cpu_mode +svm_vcpu_mode(struct vmcb *vmcb) +{ + struct vmcb_segment seg; + struct vmcb_state *state; + int error; + + state = &vmcb->state; + + if (state->efer & EFER_LMA) { + error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg); + KASSERT(error == 0, ("%s: vmcb_seg(cs) error %d", __func__, + error)); + + /* + * Section 4.8.1 for APM2, check if Code Segment has + * Long attribute set in descriptor. 
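+ * (With EFER.LMA set, CS.L = 1 selects 64-bit mode and CS.L = 0 selects
+ * compatibility mode.)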
+ */ + if (seg.attrib & VMCB_CS_ATTRIB_L) + return (CPU_MODE_64BIT); + else + return (CPU_MODE_COMPATIBILITY); + } else if (state->cr0 & CR0_PE) { + return (CPU_MODE_PROTECTED); + } else { + return (CPU_MODE_REAL); + } +} + +static enum vm_paging_mode +svm_paging_mode(uint64_t cr0, uint64_t cr4, uint64_t efer) +{ + + if ((cr0 & CR0_PG) == 0) + return (PAGING_MODE_FLAT); + if ((cr4 & CR4_PAE) == 0) + return (PAGING_MODE_32); + if (efer & EFER_LME) + return (PAGING_MODE_64); + else + return (PAGING_MODE_PAE); +} + +/* + * ins/outs utility routines + */ +static uint64_t +svm_inout_str_index(struct svm_regctx *regs, int in) +{ + uint64_t val; + + val = in ? regs->sctx_rdi : regs->sctx_rsi; + + return (val); +} + +static uint64_t +svm_inout_str_count(struct svm_regctx *regs, int rep) +{ + uint64_t val; + + val = rep ? regs->sctx_rcx : 1; + + return (val); +} + +static void +svm_inout_str_seginfo(struct svm_softc *svm_sc, int vcpu, int64_t info1, + int in, struct vm_inout_str *vis) +{ + int error, s; + + if (in) { + vis->seg_name = VM_REG_GUEST_ES; + } else { + /* The segment field has standard encoding */ + s = (info1 >> 10) & 0x7; + vis->seg_name = vm_segment_name(s); + } + + error = vmcb_getdesc(svm_sc, vcpu, vis->seg_name, &vis->seg_desc); + KASSERT(error == 0, ("%s: svm_getdesc error %d", __func__, error)); +} + +static int +svm_inout_str_addrsize(uint64_t info1) +{ + uint32_t size; + + size = (info1 >> 7) & 0x7; + switch (size) { + case 1: + return (2); /* 16 bit */ + case 2: + return (4); /* 32 bit */ + case 4: + return (8); /* 64 bit */ + default: + panic("%s: invalid size encoding %d", __func__, size); + } +} + +static void +svm_paging_info(struct vmcb *vmcb, struct vm_guest_paging *paging) +{ + struct vmcb_state *state; + + state = &vmcb->state; + paging->cr3 = state->cr3; + paging->cpl = svm_cpl(state); + paging->cpu_mode = svm_vcpu_mode(vmcb); + paging->paging_mode = svm_paging_mode(state->cr0, state->cr4, + state->efer); +} + +#define UNHANDLED 0 + +/* + * Handle guest I/O intercept. + */ +static int +svm_handle_io(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) +{ + struct vmcb_ctrl *ctrl; + struct vmcb_state *state; + struct svm_regctx *regs; + struct vm_inout_str *vis; + uint64_t info1; + int inout_string; + + state = svm_get_vmcb_state(svm_sc, vcpu); + ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); + regs = svm_get_guest_regctx(svm_sc, vcpu); + + info1 = ctrl->exitinfo1; + inout_string = info1 & BIT(2) ? 1 : 0; + + /* + * The effective segment number in EXITINFO1[12:10] is populated + * only if the processor has the DecodeAssist capability. + * + * XXX this is not specified explicitly in APMv2 but can be verified + * empirically. + */ + if (inout_string && !decode_assist()) + return (UNHANDLED); + + vmexit->exitcode = VM_EXITCODE_INOUT; + vmexit->u.inout.in = (info1 & BIT(0)) ? 1 : 0; + vmexit->u.inout.string = inout_string; + vmexit->u.inout.rep = (info1 & BIT(3)) ? 
1 : 0; + vmexit->u.inout.bytes = (info1 >> 4) & 0x7; + vmexit->u.inout.port = (uint16_t)(info1 >> 16); + vmexit->u.inout.eax = (uint32_t)(state->rax); + + if (inout_string) { + vmexit->exitcode = VM_EXITCODE_INOUT_STR; + vis = &vmexit->u.inout_str; + svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &vis->paging); + vis->rflags = state->rflags; + vis->cr0 = state->cr0; + vis->index = svm_inout_str_index(regs, vmexit->u.inout.in); + vis->count = svm_inout_str_count(regs, vmexit->u.inout.rep); + vis->addrsize = svm_inout_str_addrsize(info1); + svm_inout_str_seginfo(svm_sc, vcpu, info1, + vmexit->u.inout.in, vis); + } + + return (UNHANDLED); +} + +static int +npf_fault_type(uint64_t exitinfo1) +{ + + if (exitinfo1 & VMCB_NPF_INFO1_W) + return (VM_PROT_WRITE); + else if (exitinfo1 & VMCB_NPF_INFO1_ID) + return (VM_PROT_EXECUTE); + else + return (VM_PROT_READ); +} + +static bool +svm_npf_emul_fault(uint64_t exitinfo1) +{ + + if (exitinfo1 & VMCB_NPF_INFO1_ID) { + return (false); + } + + if (exitinfo1 & VMCB_NPF_INFO1_GPT) { + return (false); + } + + if ((exitinfo1 & VMCB_NPF_INFO1_GPA) == 0) { + return (false); + } + + return (true); +} + +static void +svm_handle_inst_emul(struct vmcb *vmcb, uint64_t gpa, struct vm_exit *vmexit) +{ + struct vm_guest_paging *paging; + struct vmcb_segment seg; + struct vmcb_ctrl *ctrl; + char *inst_bytes; + int error, inst_len; + + ctrl = &vmcb->ctrl; + paging = &vmexit->u.inst_emul.paging; + + vmexit->exitcode = VM_EXITCODE_INST_EMUL; + vmexit->u.inst_emul.gpa = gpa; + vmexit->u.inst_emul.gla = VIE_INVALID_GLA; + svm_paging_info(vmcb, paging); + + error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg); + KASSERT(error == 0, ("%s: vmcb_seg(CS) error %d", __func__, error)); + + switch(paging->cpu_mode) { + case CPU_MODE_REAL: + vmexit->u.inst_emul.cs_base = seg.base; + vmexit->u.inst_emul.cs_d = 0; + break; + case CPU_MODE_PROTECTED: + case CPU_MODE_COMPATIBILITY: + vmexit->u.inst_emul.cs_base = seg.base; + + /* + * Section 4.8.1 of APM2, Default Operand Size or D bit. + */ + vmexit->u.inst_emul.cs_d = (seg.attrib & VMCB_CS_ATTRIB_D) ? + 1 : 0; + break; + default: + vmexit->u.inst_emul.cs_base = 0; + vmexit->u.inst_emul.cs_d = 0; + break; + } + + /* + * Copy the instruction bytes into 'vie' if available. + */ + if (decode_assist() && !disable_npf_assist) { + inst_len = ctrl->inst_len; + inst_bytes = (char *)ctrl->inst_bytes; + } else { + inst_len = 0; + inst_bytes = NULL; + } + vie_init(&vmexit->u.inst_emul.vie, inst_bytes, inst_len); +} + +#ifdef KTR +static const char * +intrtype_to_str(int intr_type) +{ + switch (intr_type) { + case VMCB_EVENTINJ_TYPE_INTR: + return ("hwintr"); + case VMCB_EVENTINJ_TYPE_NMI: + return ("nmi"); + case VMCB_EVENTINJ_TYPE_INTn: + return ("swintr"); + case VMCB_EVENTINJ_TYPE_EXCEPTION: + return ("exception"); + default: + panic("%s: unknown intr_type %d", __func__, intr_type); + } +} +#endif + +/* + * Inject an event to vcpu as described in section 15.20, "Event injection". 
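+ *
+ * For reference, the EVENTINJ encoding assembled below (APMv2,
+ * section 15.20):
+ *
+ *	bits 7:0	vector
+ *	bits 10:8	event type (INTR, NMI, exception, INTn)
+ *	bit 11		error code valid
+ *	bit 31		valid
+ *	bits 63:32	error code, when bit 11 is set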
+ */ +static void +svm_eventinject(struct svm_softc *sc, int vcpu, int intr_type, int vector, + uint32_t error, bool ec_valid) +{ + struct vmcb_ctrl *ctrl; + + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + + KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, + ("%s: event already pending %#lx", __func__, ctrl->eventinj)); + + KASSERT(vector >=0 && vector <= 255, ("%s: invalid vector %d", + __func__, vector)); + + switch (intr_type) { + case VMCB_EVENTINJ_TYPE_INTR: + case VMCB_EVENTINJ_TYPE_NMI: + case VMCB_EVENTINJ_TYPE_INTn: + break; + case VMCB_EVENTINJ_TYPE_EXCEPTION: + if (vector >= 0 && vector <= 31 && vector != 2) + break; + /* FALLTHROUGH */ + default: + panic("%s: invalid intr_type/vector: %d/%d", __func__, + intr_type, vector); + } + ctrl->eventinj = vector | (intr_type << 8) | VMCB_EVENTINJ_VALID; + if (ec_valid) { + ctrl->eventinj |= VMCB_EVENTINJ_EC_VALID; + ctrl->eventinj |= (uint64_t)error << 32; + VCPU_CTR3(sc->vm, vcpu, "Injecting %s at vector %d errcode %#x", + intrtype_to_str(intr_type), vector, error); + } else { + VCPU_CTR2(sc->vm, vcpu, "Injecting %s at vector %d", + intrtype_to_str(intr_type), vector); + } +} + +static void +svm_update_virqinfo(struct svm_softc *sc, int vcpu) +{ + struct vm *vm; + struct vlapic *vlapic; + struct vmcb_ctrl *ctrl; + + vm = sc->vm; + vlapic = vm_lapic(vm, vcpu); + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + + /* Update %cr8 in the emulated vlapic */ + vlapic_set_cr8(vlapic, ctrl->v_tpr); + + /* Virtual interrupt injection is not used. */ + KASSERT(ctrl->v_intr_vector == 0, ("%s: invalid " + "v_intr_vector %d", __func__, ctrl->v_intr_vector)); +} + +static void +svm_save_intinfo(struct svm_softc *svm_sc, int vcpu) +{ + struct vmcb_ctrl *ctrl; + uint64_t intinfo; + + ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); + intinfo = ctrl->exitintinfo; + if (!VMCB_EXITINTINFO_VALID(intinfo)) + return; + + /* + * From APMv2, Section "Intercepts during IDT interrupt delivery" + * + * If a #VMEXIT happened during event delivery then record the event + * that was being delivered. 
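+ *
+ * The saved intinfo is re-injected on a later VM entry by
+ * svm_inj_intinfo() below.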
+ */
+	VCPU_CTR2(svm_sc->vm, vcpu, "SVM:Pending INTINFO(0x%lx), vector=%d.\n",
+	    intinfo, VMCB_EXITINTINFO_VECTOR(intinfo));
+	vmm_stat_incr(svm_sc->vm, vcpu, VCPU_EXITINTINFO, 1);
+	vm_exit_intinfo(svm_sc->vm, vcpu, intinfo);
+}
+
+static __inline int
+vintr_intercept_enabled(struct svm_softc *sc, int vcpu)
+{
+
+	return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
+	    VMCB_INTCPT_VINTR));
+}
+
+static __inline void
+enable_intr_window_exiting(struct svm_softc *sc, int vcpu)
+{
+	struct vmcb_ctrl *ctrl;
+
+	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
+
+	if (ctrl->v_irq && ctrl->v_intr_vector == 0) {
+		KASSERT(ctrl->v_ign_tpr, ("%s: invalid v_ign_tpr", __func__));
+		KASSERT(vintr_intercept_enabled(sc, vcpu),
+		    ("%s: vintr intercept should be enabled", __func__));
+		return;
+	}
+
+	VCPU_CTR0(sc->vm, vcpu, "Enable intr window exiting");
+	ctrl->v_irq = 1;
+	ctrl->v_ign_tpr = 1;
+	ctrl->v_intr_vector = 0;
+	svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
+	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR);
+}
+
+static __inline void
+disable_intr_window_exiting(struct svm_softc *sc, int vcpu)
+{
+	struct vmcb_ctrl *ctrl;
+
+	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
+
+	if (!ctrl->v_irq && ctrl->v_intr_vector == 0) {
+		KASSERT(!vintr_intercept_enabled(sc, vcpu),
+		    ("%s: vintr intercept should be disabled", __func__));
+		return;
+	}
+
+	VCPU_CTR0(sc->vm, vcpu, "Disable intr window exiting");
+	ctrl->v_irq = 0;
+	ctrl->v_intr_vector = 0;
+	svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
+	svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR);
+}
+
+static int
+svm_modify_intr_shadow(struct svm_softc *sc, int vcpu, uint64_t val)
+{
+	struct vmcb_ctrl *ctrl;
+	int oldval, newval;
+
+	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
+	oldval = ctrl->intr_shadow;
+	newval = val ? 1 : 0;
+	if (newval != oldval) {
+		ctrl->intr_shadow = newval;
+		VCPU_CTR1(sc->vm, vcpu, "Setting intr_shadow to %d", newval);
+	}
+	return (0);
+}
+
+static int
+svm_get_intr_shadow(struct svm_softc *sc, int vcpu, uint64_t *val)
+{
+	struct vmcb_ctrl *ctrl;
+
+	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
+	*val = ctrl->intr_shadow;
+	return (0);
+}
+
+/*
+ * Once an NMI is injected, it blocks delivery of further NMIs until the
+ * handler executes an IRET. The IRET intercept is enabled when an NMI is
+ * injected to track when the vcpu is done handling the NMI.
+ */
+static int
+nmi_blocked(struct svm_softc *sc, int vcpu)
+{
+	int blocked;
+
+	blocked = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
+	    VMCB_INTCPT_IRET);
+	return (blocked);
+}
+
+static void
+enable_nmi_blocking(struct svm_softc *sc, int vcpu)
+{
+
+	KASSERT(!nmi_blocked(sc, vcpu), ("vNMI already blocked"));
+	VCPU_CTR0(sc->vm, vcpu, "vNMI blocking enabled");
+	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
+}
+
+static void
+clear_nmi_blocking(struct svm_softc *sc, int vcpu)
+{
+	int error;
+
+	KASSERT(nmi_blocked(sc, vcpu), ("vNMI already unblocked"));
+	VCPU_CTR0(sc->vm, vcpu, "vNMI blocking cleared");
+	/*
+	 * When the IRET intercept is cleared, the vcpu will attempt to execute
+	 * the "iret" when it runs next. However, it is possible to inject
+	 * another NMI into the vcpu before the "iret" has actually executed.
+	 *
+	 * For example, if the "iret" encounters a #NPF when accessing the
+	 * stack it will trap back into the hypervisor. If an NMI is pending
+	 * for the vcpu it will be injected into the guest.
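+ *
+ * In that case the second NMI is delivered while the first handler has
+ * yet to finish its "iret", which real hardware would never allow.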
+ * + * XXX this needs to be fixed + */ + svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET); + + /* + * Set 'intr_shadow' to prevent an NMI from being injected on the + * immediate VMRUN. + */ + error = svm_modify_intr_shadow(sc, vcpu, 1); + KASSERT(!error, ("%s: error %d setting intr_shadow", __func__, error)); +} + +#define EFER_MBZ_BITS 0xFFFFFFFFFFFF0200UL + +static int +svm_write_efer(struct svm_softc *sc, int vcpu, uint64_t newval, bool *retu) +{ + struct vm_exit *vme; + struct vmcb_state *state; + uint64_t changed, lma, oldval; + int error; + + state = svm_get_vmcb_state(sc, vcpu); + + oldval = state->efer; + VCPU_CTR2(sc->vm, vcpu, "wrmsr(efer) %#lx/%#lx", oldval, newval); + + newval &= ~0xFE; /* clear the Read-As-Zero (RAZ) bits */ + changed = oldval ^ newval; + + if (newval & EFER_MBZ_BITS) + goto gpf; + + /* APMv2 Table 14-5 "Long-Mode Consistency Checks" */ + if (changed & EFER_LME) { + if (state->cr0 & CR0_PG) + goto gpf; + } + + /* EFER.LMA = EFER.LME & CR0.PG */ + if ((newval & EFER_LME) != 0 && (state->cr0 & CR0_PG) != 0) + lma = EFER_LMA; + else + lma = 0; + + if ((newval & EFER_LMA) != lma) + goto gpf; + + if (newval & EFER_NXE) { + if (!vm_cpuid_capability(sc->vm, vcpu, VCC_NO_EXECUTE)) + goto gpf; + } + + /* + * XXX bhyve does not enforce segment limits in 64-bit mode. Until + * this is fixed flag guest attempt to set EFER_LMSLE as an error. + */ + if (newval & EFER_LMSLE) { + vme = vm_exitinfo(sc->vm, vcpu); + vm_exit_svm(vme, VMCB_EXIT_MSR, 1, 0); + *retu = true; + return (0); + } + + if (newval & EFER_FFXSR) { + if (!vm_cpuid_capability(sc->vm, vcpu, VCC_FFXSR)) + goto gpf; + } + + if (newval & EFER_TCE) { + if (!vm_cpuid_capability(sc->vm, vcpu, VCC_TCE)) + goto gpf; + } + + error = svm_setreg(sc, vcpu, VM_REG_GUEST_EFER, newval); + KASSERT(error == 0, ("%s: error %d updating efer", __func__, error)); + return (0); +gpf: + vm_inject_gp(sc->vm, vcpu); + return (0); +} + +static int +emulate_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val, + bool *retu) +{ + int error; + + if (lapic_msr(num)) + error = lapic_wrmsr(sc->vm, vcpu, num, val, retu); + else if (num == MSR_EFER) + error = svm_write_efer(sc, vcpu, val, retu); + else + error = svm_wrmsr(sc, vcpu, num, val, retu); + + return (error); +} + +static int +emulate_rdmsr(struct svm_softc *sc, int vcpu, u_int num, bool *retu) +{ + struct vmcb_state *state; + struct svm_regctx *ctx; + uint64_t result; + int error; + + if (lapic_msr(num)) + error = lapic_rdmsr(sc->vm, vcpu, num, &result, retu); + else + error = svm_rdmsr(sc, vcpu, num, &result, retu); + + if (error == 0) { + state = svm_get_vmcb_state(sc, vcpu); + ctx = svm_get_guest_regctx(sc, vcpu); + state->rax = result & 0xffffffff; + ctx->sctx_rdx = result >> 32; + } + + return (error); +} + +#ifdef KTR +static const char * +exit_reason_to_str(uint64_t reason) +{ + static char reasonbuf[32]; + + switch (reason) { + case VMCB_EXIT_INVALID: + return ("invalvmcb"); + case VMCB_EXIT_SHUTDOWN: + return ("shutdown"); + case VMCB_EXIT_NPF: + return ("nptfault"); + case VMCB_EXIT_PAUSE: + return ("pause"); + case VMCB_EXIT_HLT: + return ("hlt"); + case VMCB_EXIT_CPUID: + return ("cpuid"); + case VMCB_EXIT_IO: + return ("inout"); + case VMCB_EXIT_MC: + return ("mchk"); + case VMCB_EXIT_INTR: + return ("extintr"); + case VMCB_EXIT_NMI: + return ("nmi"); + case VMCB_EXIT_VINTR: + return ("vintr"); + case VMCB_EXIT_MSR: + return ("msr"); + case VMCB_EXIT_IRET: + return ("iret"); + case VMCB_EXIT_MONITOR: + return ("monitor"); + case 
VMCB_EXIT_MWAIT: + return ("mwait"); + default: + snprintf(reasonbuf, sizeof(reasonbuf), "%#lx", reason); + return (reasonbuf); + } +} +#endif /* KTR */ + +/* + * From section "State Saved on Exit" in APMv2: nRIP is saved for all #VMEXITs + * that are due to instruction intercepts as well as MSR and IOIO intercepts + * and exceptions caused by INT3, INTO and BOUND instructions. + * + * Return 1 if the nRIP is valid and 0 otherwise. + */ +static int +nrip_valid(uint64_t exitcode) +{ + switch (exitcode) { + case 0x00 ... 0x0F: /* read of CR0 through CR15 */ + case 0x10 ... 0x1F: /* write of CR0 through CR15 */ + case 0x20 ... 0x2F: /* read of DR0 through DR15 */ + case 0x30 ... 0x3F: /* write of DR0 through DR15 */ + case 0x43: /* INT3 */ + case 0x44: /* INTO */ + case 0x45: /* BOUND */ + case 0x65 ... 0x7C: /* VMEXIT_CR0_SEL_WRITE ... VMEXIT_MSR */ + case 0x80 ... 0x8D: /* VMEXIT_VMRUN ... VMEXIT_XSETBV */ + return (1); + default: + return (0); + } +} + +static int +svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) +{ + struct vmcb *vmcb; + struct vmcb_state *state; + struct vmcb_ctrl *ctrl; + struct svm_regctx *ctx; + uint64_t code, info1, info2, val; + uint32_t eax, ecx, edx; +#ifdef __FreeBSD__ + int error, errcode_valid, handled, idtvec, reflect; +#else + int error, errcode_valid = 0, handled, idtvec, reflect; +#endif + bool retu; + + ctx = svm_get_guest_regctx(svm_sc, vcpu); + vmcb = svm_get_vmcb(svm_sc, vcpu); + state = &vmcb->state; + ctrl = &vmcb->ctrl; + + handled = 0; + code = ctrl->exitcode; + info1 = ctrl->exitinfo1; + info2 = ctrl->exitinfo2; + + vmexit->exitcode = VM_EXITCODE_BOGUS; + vmexit->rip = state->rip; + vmexit->inst_length = nrip_valid(code) ? ctrl->nrip - state->rip : 0; + + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_COUNT, 1); + + /* + * #VMEXIT(INVALID) needs to be handled early because the VMCB is + * in an inconsistent state and can trigger assertions that would + * never happen otherwise. + */ + if (code == VMCB_EXIT_INVALID) { + vm_exit_svm(vmexit, code, info1, info2); + return (0); + } + + KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event " + "injection valid bit is set %#lx", __func__, ctrl->eventinj)); + + KASSERT(vmexit->inst_length >= 0 && vmexit->inst_length <= 15, + ("invalid inst_length %d: code (%#lx), info1 (%#lx), info2 (%#lx)", + vmexit->inst_length, code, info1, info2)); + + svm_update_virqinfo(svm_sc, vcpu); + svm_save_intinfo(svm_sc, vcpu); + + switch (code) { + case VMCB_EXIT_IRET: + /* + * Restart execution at "iret" but with the intercept cleared. + */ + vmexit->inst_length = 0; + clear_nmi_blocking(svm_sc, vcpu); + handled = 1; + break; + case VMCB_EXIT_VINTR: /* interrupt window exiting */ + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_VINTR, 1); + handled = 1; + break; + case VMCB_EXIT_INTR: /* external interrupt */ + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXTINT, 1); + handled = 1; + break; + case VMCB_EXIT_NMI: /* external NMI */ + handled = 1; + break; + case 0x40 ... 0x5F: + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXCEPTION, 1); + reflect = 1; + idtvec = code - 0x40; + switch (idtvec) { + case IDT_MC: + /* + * Call the machine check handler by hand. Also don't + * reflect the machine check back into the guest. 
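+			 *
+			 * (The #MC was intercepted before the host's handler
+			 * could run, so invoke it manually here to let the
+			 * host record the event.)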
+ */ + reflect = 0; + VCPU_CTR0(svm_sc->vm, vcpu, "Vectoring to MCE handler"); +#ifdef __FreeBSD__ + __asm __volatile("int $18"); +#else + vmm_call_trap(T_MCE); +#endif + break; + case IDT_PF: + error = svm_setreg(svm_sc, vcpu, VM_REG_GUEST_CR2, + info2); + KASSERT(error == 0, ("%s: error %d updating cr2", + __func__, error)); + /* fallthru */ + case IDT_NP: + case IDT_SS: + case IDT_GP: + case IDT_AC: + case IDT_TS: + errcode_valid = 1; + break; + + case IDT_DF: + errcode_valid = 1; + info1 = 0; + break; + + case IDT_BP: + case IDT_OF: + case IDT_BR: + /* + * The 'nrip' field is populated for INT3, INTO and + * BOUND exceptions and this also implies that + * 'inst_length' is non-zero. + * + * Reset 'inst_length' to zero so the guest %rip at + * event injection is identical to what it was when + * the exception originally happened. + */ + VCPU_CTR2(svm_sc->vm, vcpu, "Reset inst_length from %d " + "to zero before injecting exception %d", + vmexit->inst_length, idtvec); + vmexit->inst_length = 0; + /* fallthru */ + default: + errcode_valid = 0; + info1 = 0; + break; + } + KASSERT(vmexit->inst_length == 0, ("invalid inst_length (%d) " + "when reflecting exception %d into guest", + vmexit->inst_length, idtvec)); + + if (reflect) { + /* Reflect the exception back into the guest */ + VCPU_CTR2(svm_sc->vm, vcpu, "Reflecting exception " + "%d/%#x into the guest", idtvec, (int)info1); + error = vm_inject_exception(svm_sc->vm, vcpu, idtvec, + errcode_valid, info1, 0); + KASSERT(error == 0, ("%s: vm_inject_exception error %d", + __func__, error)); + } + handled = 1; + break; + case VMCB_EXIT_MSR: /* MSR access. */ + eax = state->rax; + ecx = ctx->sctx_rcx; + edx = ctx->sctx_rdx; + retu = false; + + if (info1) { + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_WRMSR, 1); + val = (uint64_t)edx << 32 | eax; + VCPU_CTR2(svm_sc->vm, vcpu, "wrmsr %#x val %#lx", + ecx, val); + if (emulate_wrmsr(svm_sc, vcpu, ecx, val, &retu)) { + vmexit->exitcode = VM_EXITCODE_WRMSR; + vmexit->u.msr.code = ecx; + vmexit->u.msr.wval = val; + } else if (!retu) { + handled = 1; + } else { + KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, + ("emulate_wrmsr retu with bogus exitcode")); + } + } else { + VCPU_CTR1(svm_sc->vm, vcpu, "rdmsr %#x", ecx); + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_RDMSR, 1); + if (emulate_rdmsr(svm_sc, vcpu, ecx, &retu)) { + vmexit->exitcode = VM_EXITCODE_RDMSR; + vmexit->u.msr.code = ecx; + } else if (!retu) { + handled = 1; + } else { + KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, + ("emulate_rdmsr retu with bogus exitcode")); + } + } + break; + case VMCB_EXIT_IO: + handled = svm_handle_io(svm_sc, vcpu, vmexit); + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INOUT, 1); + break; + case VMCB_EXIT_CPUID: + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_CPUID, 1); + handled = x86_emulate_cpuid(svm_sc->vm, vcpu, + (uint32_t *)&state->rax, + (uint32_t *)&ctx->sctx_rbx, + (uint32_t *)&ctx->sctx_rcx, + (uint32_t *)&ctx->sctx_rdx); + break; + case VMCB_EXIT_HLT: + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_HLT, 1); + vmexit->exitcode = VM_EXITCODE_HLT; + vmexit->u.hlt.rflags = state->rflags; + break; + case VMCB_EXIT_PAUSE: + vmexit->exitcode = VM_EXITCODE_PAUSE; + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_PAUSE, 1); + break; + case VMCB_EXIT_NPF: + /* EXITINFO2 contains the faulting guest physical address */ + if (info1 & VMCB_NPF_INFO1_RSV) { + VCPU_CTR2(svm_sc->vm, vcpu, "nested page fault with " + "reserved bits set: info1(%#lx) info2(%#lx)", + info1, info2); + } else if (vm_mem_allocated(svm_sc->vm, vcpu, info2)) { + vmexit->exitcode 
= VM_EXITCODE_PAGING; + vmexit->u.paging.gpa = info2; + vmexit->u.paging.fault_type = npf_fault_type(info1); + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_NESTED_FAULT, 1); + VCPU_CTR3(svm_sc->vm, vcpu, "nested page fault " + "on gpa %#lx/%#lx at rip %#lx", + info2, info1, state->rip); + } else if (svm_npf_emul_fault(info1)) { + svm_handle_inst_emul(vmcb, info2, vmexit); + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INST_EMUL, 1); + VCPU_CTR3(svm_sc->vm, vcpu, "inst_emul fault " + "for gpa %#lx/%#lx at rip %#lx", + info2, info1, state->rip); + } + break; + case VMCB_EXIT_MONITOR: + vmexit->exitcode = VM_EXITCODE_MONITOR; + break; + case VMCB_EXIT_MWAIT: + vmexit->exitcode = VM_EXITCODE_MWAIT; + break; + default: + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_UNKNOWN, 1); + break; + } + + VCPU_CTR4(svm_sc->vm, vcpu, "%s %s vmexit at %#lx/%d", + handled ? "handled" : "unhandled", exit_reason_to_str(code), + vmexit->rip, vmexit->inst_length); + + if (handled) { + vmexit->rip += vmexit->inst_length; + vmexit->inst_length = 0; + state->rip = vmexit->rip; + } else { + if (vmexit->exitcode == VM_EXITCODE_BOGUS) { + /* + * If this VM exit was not claimed by anybody then + * treat it as a generic SVM exit. + */ + vm_exit_svm(vmexit, code, info1, info2); + } else { + /* + * The exitcode and collateral have been populated. + * The VM exit will be processed further in userland. + */ + } + } + return (handled); +} + +static void +svm_inj_intinfo(struct svm_softc *svm_sc, int vcpu) +{ + uint64_t intinfo; + + if (!vm_entry_intinfo(svm_sc->vm, vcpu, &intinfo)) + return; + + KASSERT(VMCB_EXITINTINFO_VALID(intinfo), ("%s: entry intinfo is not " + "valid: %#lx", __func__, intinfo)); + + svm_eventinject(svm_sc, vcpu, VMCB_EXITINTINFO_TYPE(intinfo), + VMCB_EXITINTINFO_VECTOR(intinfo), + VMCB_EXITINTINFO_EC(intinfo), + VMCB_EXITINTINFO_EC_VALID(intinfo)); + vmm_stat_incr(svm_sc->vm, vcpu, VCPU_INTINFO_INJECTED, 1); + VCPU_CTR1(svm_sc->vm, vcpu, "Injected entry intinfo: %#lx", intinfo); +} + +/* + * Inject event to virtual cpu. + */ +static void +svm_inj_interrupts(struct svm_softc *sc, int vcpu, struct vlapic *vlapic) +{ + struct vmcb_ctrl *ctrl; + struct vmcb_state *state; + struct svm_vcpu *vcpustate; + uint8_t v_tpr; + int vector, need_intr_window; + int extint_pending; + + state = svm_get_vmcb_state(sc, vcpu); + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + vcpustate = svm_get_vcpu(sc, vcpu); + + need_intr_window = 0; + + vlapic_tmr_update(vlapic); + + if (vcpustate->nextrip != state->rip) { + ctrl->intr_shadow = 0; + VCPU_CTR2(sc->vm, vcpu, "Guest interrupt blocking " + "cleared due to rip change: %#lx/%#lx", + vcpustate->nextrip, state->rip); + } + + /* + * Inject pending events or exceptions for this vcpu. + * + * An event might be pending because the previous #VMEXIT happened + * during event delivery (i.e. ctrl->exitintinfo). + * + * An event might also be pending because an exception was injected + * by the hypervisor (e.g. #PF during instruction emulation). + */ + svm_inj_intinfo(sc, vcpu); + + /* NMI event has priority over interrupts. */ + if (vm_nmi_pending(sc->vm, vcpu)) { + if (nmi_blocked(sc, vcpu)) { + /* + * Can't inject another NMI if the guest has not + * yet executed an "iret" after the last NMI. + */ + VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due " + "to NMI-blocking"); + } else if (ctrl->intr_shadow) { + /* + * Can't inject an NMI if the vcpu is in an intr_shadow. 
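+			 * The shadow lasts for at most one guest instruction,
+			 * so request an interrupt window exit and retry the
+			 * injection afterwards.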
+ */ + VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due to " + "interrupt shadow"); + need_intr_window = 1; + goto done; + } else if (ctrl->eventinj & VMCB_EVENTINJ_VALID) { + /* + * If there is already an exception/interrupt pending + * then defer the NMI until after that. + */ + VCPU_CTR1(sc->vm, vcpu, "Cannot inject NMI due to " + "eventinj %#lx", ctrl->eventinj); + + /* + * Use self-IPI to trigger a VM-exit as soon as + * possible after the event injection is completed. + * + * This works only if the external interrupt exiting + * is at a lower priority than the event injection. + * + * Although not explicitly specified in APMv2 the + * relative priorities were verified empirically. + */ + ipi_cpu(curcpu, IPI_AST); /* XXX vmm_ipinum? */ + } else { + vm_nmi_clear(sc->vm, vcpu); + + /* Inject NMI, vector number is not used */ + svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_NMI, + IDT_NMI, 0, false); + + /* virtual NMI blocking is now in effect */ + enable_nmi_blocking(sc, vcpu); + + VCPU_CTR0(sc->vm, vcpu, "Injecting vNMI"); + } + } + + extint_pending = vm_extint_pending(sc->vm, vcpu); + if (!extint_pending) { + if (!vlapic_pending_intr(vlapic, &vector)) + goto done; + KASSERT(vector >= 16 && vector <= 255, + ("invalid vector %d from local APIC", vector)); + } else { + /* Ask the legacy pic for a vector to inject */ + vatpic_pending_intr(sc->vm, &vector); + KASSERT(vector >= 0 && vector <= 255, + ("invalid vector %d from INTR", vector)); + } + + /* + * If the guest has disabled interrupts or is in an interrupt shadow + * then we cannot inject the pending interrupt. + */ + if ((state->rflags & PSL_I) == 0) { + VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to " + "rflags %#lx", vector, state->rflags); + need_intr_window = 1; + goto done; + } + + if (ctrl->intr_shadow) { + VCPU_CTR1(sc->vm, vcpu, "Cannot inject vector %d due to " + "interrupt shadow", vector); + need_intr_window = 1; + goto done; + } + + if (ctrl->eventinj & VMCB_EVENTINJ_VALID) { + VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to " + "eventinj %#lx", vector, ctrl->eventinj); + need_intr_window = 1; + goto done; + } + + svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_INTR, vector, 0, false); + + if (!extint_pending) { + vlapic_intr_accepted(vlapic, vector); + } else { + vm_extint_clear(sc->vm, vcpu); + vatpic_intr_accepted(sc->vm, vector); + } + + /* + * Force a VM-exit as soon as the vcpu is ready to accept another + * interrupt. This is done because the PIC might have another vector + * that it wants to inject. Also, if the APIC has a pending interrupt + * that was preempted by the ExtInt then it allows us to inject the + * APIC vector as soon as possible. + */ + need_intr_window = 1; +done: + /* + * The guest can modify the TPR by writing to %CR8. In guest mode + * the processor reflects this write to V_TPR without hypervisor + * intervention. + * + * The guest can also modify the TPR by writing to it via the memory + * mapped APIC page. In this case, the write will be emulated by the + * hypervisor. For this reason V_TPR must be updated before every + * VMRUN. + */ + v_tpr = vlapic_get_cr8(vlapic); + KASSERT(v_tpr <= 15, ("invalid v_tpr %#x", v_tpr)); + if (ctrl->v_tpr != v_tpr) { + VCPU_CTR2(sc->vm, vcpu, "VMCB V_TPR changed from %#x to %#x", + ctrl->v_tpr, v_tpr); + ctrl->v_tpr = v_tpr; + svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR); + } + + if (need_intr_window) { + /* + * We use V_IRQ in conjunction with the VINTR intercept to + * trap into the hypervisor as soon as a virtual interrupt + * can be delivered. 
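+		 *
+		 * The V_IRQ programmed by enable_intr_window_exiting() uses
+		 * vector 0 with V_IGN_TPR set; it is never meant to be taken,
+		 * only to force a #VMEXIT(VINTR) once the guest can accept an
+		 * interrupt.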
+ * + * Since injected events are not subject to intercept checks + * we need to ensure that the V_IRQ is not actually going to + * be delivered on VM entry. The KASSERT below enforces this. + */ + KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) != 0 || + (state->rflags & PSL_I) == 0 || ctrl->intr_shadow, + ("Bogus intr_window_exiting: eventinj (%#lx), " + "intr_shadow (%u), rflags (%#lx)", + ctrl->eventinj, ctrl->intr_shadow, state->rflags)); + enable_intr_window_exiting(sc, vcpu); + } else { + disable_intr_window_exiting(sc, vcpu); + } +} + +static __inline void +restore_host_tss(void) +{ +#ifdef __FreeBSD__ + struct system_segment_descriptor *tss_sd; + + /* + * The TSS descriptor was in use prior to launching the guest so it + * has been marked busy. + * + * 'ltr' requires the descriptor to be marked available so change the + * type to "64-bit available TSS". + */ + tss_sd = PCPU_GET(tss); + tss_sd->sd_type = SDT_SYSTSS; + ltr(GSEL(GPROC0_SEL, SEL_KPL)); +#else + system_desc_t *tss = (system_desc_t *)&CPU->cpu_gdt[GDT_KTSS]; + + tss->ssd_type = SDT_SYSTSS; + wr_tsr(KTSS_SEL); +#endif +} + +#ifdef __FreeBSD__ +static void +check_asid(struct svm_softc *sc, int vcpuid, pmap_t pmap, u_int thiscpu) +{ + struct svm_vcpu *vcpustate; + struct vmcb_ctrl *ctrl; + long eptgen; + bool alloc_asid; + + KASSERT(CPU_ISSET(thiscpu, &pmap->pm_active), ("%s: nested pmap not " + "active on cpu %u", __func__, thiscpu)); + + vcpustate = svm_get_vcpu(sc, vcpuid); + ctrl = svm_get_vmcb_ctrl(sc, vcpuid); + + /* + * The TLB entries associated with the vcpu's ASID are not valid + * if either of the following conditions is true: + * + * 1. The vcpu's ASID generation is different than the host cpu's + * ASID generation. This happens when the vcpu migrates to a new + * host cpu. It can also happen when the number of vcpus executing + * on a host cpu is greater than the number of ASIDs available. + * + * 2. The pmap generation number is different than the value cached in + * the 'vcpustate'. This happens when the host invalidates pages + * belonging to the guest. + * + * asidgen eptgen Action + * mismatch mismatch + * 0 0 (a) + * 0 1 (b1) or (b2) + * 1 0 (c) + * 1 1 (d) + * + * (a) There is no mismatch in eptgen or ASID generation and therefore + * no further action is needed. + * + * (b1) If the cpu supports FlushByAsid then the vcpu's ASID is + * retained and the TLB entries associated with this ASID + * are flushed by VMRUN. + * + * (b2) If the cpu does not support FlushByAsid then a new ASID is + * allocated. + * + * (c) A new ASID is allocated. + * + * (d) A new ASID is allocated. + */ + + alloc_asid = false; + eptgen = pmap->pm_eptgen; + ctrl->tlb_ctrl = VMCB_TLB_FLUSH_NOTHING; + + if (vcpustate->asid.gen != asid[thiscpu].gen) { + alloc_asid = true; /* (c) and (d) */ + } else if (vcpustate->eptgen != eptgen) { + if (flush_by_asid()) + ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST; /* (b1) */ + else + alloc_asid = true; /* (b2) */ + } else { + /* + * This is the common case (a). + */ + KASSERT(!alloc_asid, ("ASID allocation not necessary")); + KASSERT(ctrl->tlb_ctrl == VMCB_TLB_FLUSH_NOTHING, + ("Invalid VMCB tlb_ctrl: %#x", ctrl->tlb_ctrl)); + } + + if (alloc_asid) { + if (++asid[thiscpu].num >= nasid) { + asid[thiscpu].num = 1; + if (++asid[thiscpu].gen == 0) + asid[thiscpu].gen = 1; + /* + * If this cpu does not support "flush-by-asid" + * then flush the entire TLB on a generation + * bump. Subsequent ASID allocation in this + * generation can be done without a TLB flush. 
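+			 *
+			 * For instance, with nasid == 4 the allocator hands
+			 * out ASIDs 1, 2 and 3; the fourth allocation wraps
+			 * 'num' back to 1 and bumps the generation,
+			 * implicitly invalidating every ASID issued in the
+			 * previous generation.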
+ */ + if (!flush_by_asid()) + ctrl->tlb_ctrl = VMCB_TLB_FLUSH_ALL; + } + vcpustate->asid.gen = asid[thiscpu].gen; + vcpustate->asid.num = asid[thiscpu].num; + + ctrl->asid = vcpustate->asid.num; + svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID); + /* + * If this cpu supports "flush-by-asid" then the TLB + * was not flushed after the generation bump. The TLB + * is flushed selectively after every new ASID allocation. + */ + if (flush_by_asid()) + ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST; + } + vcpustate->eptgen = eptgen; + + KASSERT(ctrl->asid != 0, ("Guest ASID must be non-zero")); + KASSERT(ctrl->asid == vcpustate->asid.num, + ("ASID mismatch: %u/%u", ctrl->asid, vcpustate->asid.num)); +} +#else /* __FreeBSD__ */ +static void +check_asid(struct svm_softc *sc, int vcpuid, pmap_t pmap, u_int thiscpu) +{ + struct svm_vcpu *vcpustate = svm_get_vcpu(sc, vcpuid); + struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpuid); + long eptgen; + uint8_t flush; + + eptgen = pmap->pm_eptgen; + flush = hma_svm_asid_update(&vcpustate->hma_asid, flush_by_asid(), + vcpustate->eptgen == eptgen); + + if (flush != VMCB_TLB_FLUSH_NOTHING) { + ctrl->asid = vcpustate->hma_asid.hsa_asid; + svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID); + } + ctrl->tlb_ctrl = flush; + vcpustate->eptgen = eptgen; +} +#endif /* __FreeBSD__ */ + +static __inline void +disable_gintr(void) +{ + + __asm __volatile("clgi"); +} + +static __inline void +enable_gintr(void) +{ + + __asm __volatile("stgi"); +} + +static __inline void +svm_dr_enter_guest(struct svm_regctx *gctx) +{ + + /* Save host control debug registers. */ + gctx->host_dr7 = rdr7(); + gctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR); + + /* + * Disable debugging in DR7 and DEBUGCTL to avoid triggering + * exceptions in the host based on the guest DRx values. The + * guest DR6, DR7, and DEBUGCTL are saved/restored in the + * VMCB. + */ + load_dr7(0); + wrmsr(MSR_DEBUGCTLMSR, 0); + + /* Save host debug registers. */ + gctx->host_dr0 = rdr0(); + gctx->host_dr1 = rdr1(); + gctx->host_dr2 = rdr2(); + gctx->host_dr3 = rdr3(); + gctx->host_dr6 = rdr6(); + + /* Restore guest debug registers. */ + load_dr0(gctx->sctx_dr0); + load_dr1(gctx->sctx_dr1); + load_dr2(gctx->sctx_dr2); + load_dr3(gctx->sctx_dr3); +} + +static __inline void +svm_dr_leave_guest(struct svm_regctx *gctx) +{ + + /* Save guest debug registers. */ + gctx->sctx_dr0 = rdr0(); + gctx->sctx_dr1 = rdr1(); + gctx->sctx_dr2 = rdr2(); + gctx->sctx_dr3 = rdr3(); + + /* + * Restore host debug registers. Restore DR7 and DEBUGCTL + * last. + */ + load_dr0(gctx->host_dr0); + load_dr1(gctx->host_dr1); + load_dr2(gctx->host_dr2); + load_dr3(gctx->host_dr3); + load_dr6(gctx->host_dr6); + wrmsr(MSR_DEBUGCTLMSR, gctx->host_debugctl); + load_dr7(gctx->host_dr7); +} + +/* + * Start vcpu with specified RIP. 
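+ * Runs the VMRUN loop, handling each #VMEXIT in kernel context where
+ * possible and breaking out when an exit must be serviced by userland or
+ * the vcpu is asked to yield.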
+ */ +static int +svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap, + struct vm_eventinfo *evinfo) +{ + struct svm_regctx *gctx; + struct svm_softc *svm_sc; + struct svm_vcpu *vcpustate; + struct vmcb_state *state; + struct vmcb_ctrl *ctrl; + struct vm_exit *vmexit; + struct vlapic *vlapic; + struct vm *vm; + uint64_t vmcb_pa; + int handled; + uint16_t ldt_sel; + + svm_sc = arg; + vm = svm_sc->vm; + + vcpustate = svm_get_vcpu(svm_sc, vcpu); + state = svm_get_vmcb_state(svm_sc, vcpu); + ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); + vmexit = vm_exitinfo(vm, vcpu); + vlapic = vm_lapic(vm, vcpu); + + gctx = svm_get_guest_regctx(svm_sc, vcpu); + vmcb_pa = svm_sc->vcpu[vcpu].vmcb_pa; + + if (vcpustate->lastcpu != curcpu) { + /* + * Force new ASID allocation by invalidating the generation. + */ +#ifdef __FreeBSD__ + vcpustate->asid.gen = 0; +#else + vcpustate->hma_asid.hsa_gen = 0; +#endif + + /* + * Invalidate the VMCB state cache by marking all fields dirty. + */ + svm_set_dirty(svm_sc, vcpu, 0xffffffff); + + /* + * XXX + * Setting 'vcpustate->lastcpu' here is bit premature because + * we may return from this function without actually executing + * the VMRUN instruction. This could happen if an AST or yield + * condition is pending on the first time through the loop. + * + * This works for now but any new side-effects of vcpu + * migration should take this case into account. + */ + vcpustate->lastcpu = curcpu; + vmm_stat_incr(vm, vcpu, VCPU_MIGRATIONS, 1); + } + + svm_msr_guest_enter(svm_sc, vcpu); + +#ifndef __FreeBSD__ + VERIFY(!vcpustate->loaded && curthread->t_preempt != 0); + vcpustate->loaded = B_TRUE; +#endif + + /* Update Guest RIP */ + state->rip = rip; + + do { +#ifndef __FreeBSD__ + /* + * Interrupt injection may involve mutex contention which, on + * illumos bhyve, are blocking/non-spin. Doing so with global + * interrupts disabled is a recipe for deadlock, so it is + * performed here. + */ + svm_inj_interrupts(svm_sc, vcpu, vlapic); +#endif + + /* + * Disable global interrupts to guarantee atomicity during + * loading of guest state. This includes not only the state + * loaded by the "vmrun" instruction but also software state + * maintained by the hypervisor: suspended and rendezvous + * state, NPT generation number, vlapic interrupts etc. + */ + disable_gintr(); + + if (vcpu_suspended(evinfo)) { + enable_gintr(); + vm_exit_suspended(vm, vcpu, state->rip); + break; + } + + if (vcpu_runblocked(evinfo)) { + enable_gintr(); + vm_exit_runblock(vm, vcpu, state->rip); + break; + } + + if (vcpu_reqidle(evinfo)) { + enable_gintr(); + vm_exit_reqidle(vm, vcpu, state->rip); + break; + } + + /* We are asked to give the cpu by scheduler. */ + if (vcpu_should_yield(vm, vcpu)) { + enable_gintr(); + vm_exit_astpending(vm, vcpu, state->rip); + break; + } + + if (vcpu_debugged(vm, vcpu)) { + enable_gintr(); + vm_exit_debug(vm, vcpu, state->rip); + break; + } + + /* + * #VMEXIT resumes the host with the guest LDTR, so + * save the current LDT selector so it can be restored + * after an exit. The userspace hypervisor probably + * doesn't use a LDT, but save and restore it to be + * safe. + */ + ldt_sel = sldt(); + +#ifdef __FreeBSD__ + svm_inj_interrupts(svm_sc, vcpu, vlapic); +#endif + + /* Activate the nested pmap on 'curcpu' */ + CPU_SET_ATOMIC_ACQ(curcpu, &pmap->pm_active); + + /* + * Check the pmap generation and the ASID generation to + * ensure that the vcpu does not use stale TLB mappings. 
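+		 * (On illumos this defers to hma_svm_asid_update(); see the
+		 * non-FreeBSD variant of check_asid() above.)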
+ */ + check_asid(svm_sc, vcpu, pmap, curcpu); + + ctrl->vmcb_clean = vmcb_clean & ~vcpustate->dirty; + vcpustate->dirty = 0; + VCPU_CTR1(vm, vcpu, "vmcb clean %#x", ctrl->vmcb_clean); + + /* Launch Virtual Machine. */ + VCPU_CTR1(vm, vcpu, "Resume execution at %#lx", state->rip); + svm_dr_enter_guest(gctx); +#ifdef __FreeBSD__ + svm_launch(vmcb_pa, gctx, &__pcpu[curcpu]); +#else + svm_launch(vmcb_pa, gctx, CPU); +#endif + svm_dr_leave_guest(gctx); + + CPU_CLR_ATOMIC(curcpu, &pmap->pm_active); + + /* + * The host GDTR and IDTR is saved by VMRUN and restored + * automatically on #VMEXIT. However, the host TSS needs + * to be restored explicitly. + */ + restore_host_tss(); + + /* Restore host LDTR. */ + lldt(ldt_sel); + + /* #VMEXIT disables interrupts so re-enable them here. */ + enable_gintr(); + + /* Update 'nextrip' */ + vcpustate->nextrip = state->rip; + + /* Handle #VMEXIT and if required return to user space. */ + handled = svm_vmexit(svm_sc, vcpu, vmexit); + } while (handled); + + svm_msr_guest_exit(svm_sc, vcpu); + +#ifndef __FreeBSD__ + VERIFY(vcpustate->loaded && curthread->t_preempt != 0); + vcpustate->loaded = B_FALSE; +#endif + + return (0); +} + +static void +svm_vmcleanup(void *arg) +{ + struct svm_softc *sc = arg; + + contigfree(sc->iopm_bitmap, SVM_IO_BITMAP_SIZE, M_SVM); + contigfree(sc->msr_bitmap, SVM_MSR_BITMAP_SIZE, M_SVM); + free(sc, M_SVM); +} + +static register_t * +swctx_regptr(struct svm_regctx *regctx, int reg) +{ + + switch (reg) { + case VM_REG_GUEST_RBX: + return (®ctx->sctx_rbx); + case VM_REG_GUEST_RCX: + return (®ctx->sctx_rcx); + case VM_REG_GUEST_RDX: + return (®ctx->sctx_rdx); + case VM_REG_GUEST_RDI: + return (®ctx->sctx_rdi); + case VM_REG_GUEST_RSI: + return (®ctx->sctx_rsi); + case VM_REG_GUEST_RBP: + return (®ctx->sctx_rbp); + case VM_REG_GUEST_R8: + return (®ctx->sctx_r8); + case VM_REG_GUEST_R9: + return (®ctx->sctx_r9); + case VM_REG_GUEST_R10: + return (®ctx->sctx_r10); + case VM_REG_GUEST_R11: + return (®ctx->sctx_r11); + case VM_REG_GUEST_R12: + return (®ctx->sctx_r12); + case VM_REG_GUEST_R13: + return (®ctx->sctx_r13); + case VM_REG_GUEST_R14: + return (®ctx->sctx_r14); + case VM_REG_GUEST_R15: + return (®ctx->sctx_r15); + case VM_REG_GUEST_DR0: + return (®ctx->sctx_dr0); + case VM_REG_GUEST_DR1: + return (®ctx->sctx_dr1); + case VM_REG_GUEST_DR2: + return (®ctx->sctx_dr2); + case VM_REG_GUEST_DR3: + return (®ctx->sctx_dr3); + default: + return (NULL); + } +} + +static int +svm_getreg(void *arg, int vcpu, int ident, uint64_t *val) +{ + struct svm_softc *svm_sc; + register_t *reg; + + svm_sc = arg; + + if (ident == VM_REG_GUEST_INTR_SHADOW) { + return (svm_get_intr_shadow(svm_sc, vcpu, val)); + } + + if (vmcb_read(svm_sc, vcpu, ident, val) == 0) { + return (0); + } + + reg = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident); + + if (reg != NULL) { + *val = *reg; + return (0); + } + + VCPU_CTR1(svm_sc->vm, vcpu, "svm_getreg: unknown register %#x", ident); + return (EINVAL); +} + +static int +svm_setreg(void *arg, int vcpu, int ident, uint64_t val) +{ + struct svm_softc *svm_sc; + register_t *reg; + + svm_sc = arg; + + if (ident == VM_REG_GUEST_INTR_SHADOW) { + return (svm_modify_intr_shadow(svm_sc, vcpu, val)); + } + + if (vmcb_write(svm_sc, vcpu, ident, val) == 0) { + return (0); + } + + reg = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident); + + if (reg != NULL) { + *reg = val; + return (0); + } + + /* + * XXX deal with CR3 and invalidate TLB entries tagged with the + * vcpu's ASID. 
This needs to be treated differently depending on + * whether 'running' is true/false. + */ + + VCPU_CTR1(svm_sc->vm, vcpu, "svm_setreg: unknown register %#x", ident); + return (EINVAL); +} + +static int +svm_setcap(void *arg, int vcpu, int type, int val) +{ + struct svm_softc *sc; + int error; + + sc = arg; + error = 0; + switch (type) { + case VM_CAP_HALT_EXIT: + svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, + VMCB_INTCPT_HLT, val); + break; + case VM_CAP_PAUSE_EXIT: + svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, + VMCB_INTCPT_PAUSE, val); + break; + case VM_CAP_UNRESTRICTED_GUEST: + /* Unrestricted guest execution cannot be disabled in SVM */ + if (val == 0) + error = EINVAL; + break; + default: + error = ENOENT; + break; + } + return (error); +} + +static int +svm_getcap(void *arg, int vcpu, int type, int *retval) +{ + struct svm_softc *sc; + int error; + + sc = arg; + error = 0; + + switch (type) { + case VM_CAP_HALT_EXIT: + *retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, + VMCB_INTCPT_HLT); + break; + case VM_CAP_PAUSE_EXIT: + *retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, + VMCB_INTCPT_PAUSE); + break; + case VM_CAP_UNRESTRICTED_GUEST: + *retval = 1; /* unrestricted guest is always enabled */ + break; + default: + error = ENOENT; + break; + } + return (error); +} + +static struct vlapic * +svm_vlapic_init(void *arg, int vcpuid) +{ + struct svm_softc *svm_sc; + struct vlapic *vlapic; + + svm_sc = arg; + vlapic = malloc(sizeof(struct vlapic), M_SVM_VLAPIC, M_WAITOK | M_ZERO); + vlapic->vm = svm_sc->vm; + vlapic->vcpuid = vcpuid; + vlapic->apic_page = (struct LAPIC *)&svm_sc->apic_page[vcpuid]; + + vlapic_init(vlapic); + + return (vlapic); +} + +static void +svm_vlapic_cleanup(void *arg, struct vlapic *vlapic) +{ + + vlapic_cleanup(vlapic); + free(vlapic, M_SVM_VLAPIC); +} + +#ifndef __FreeBSD__ +static void +svm_savectx(void *arg, int vcpu) +{ + struct svm_softc *sc = arg; + + if (sc->vcpu[vcpu].loaded) { + svm_msr_guest_exit(sc, vcpu); + } +} + +static void +svm_restorectx(void *arg, int vcpu) +{ + struct svm_softc *sc = arg; + + if (sc->vcpu[vcpu].loaded) { + svm_msr_guest_enter(sc, vcpu); + } +} +#endif /* __FreeBSD__ */ + +struct vmm_ops vmm_ops_amd = { + svm_init, + svm_cleanup, + svm_restore, + svm_vminit, + svm_vmrun, + svm_vmcleanup, + svm_getreg, + svm_setreg, + vmcb_getdesc, + vmcb_setdesc, + svm_getcap, + svm_setcap, + svm_npt_alloc, + svm_npt_free, + svm_vlapic_init, + svm_vlapic_cleanup, + +#ifndef __FreeBSD__ + svm_savectx, + svm_restorectx, +#endif +}; diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm.h b/usr/src/uts/i86pc/io/vmm/amd/svm.h new file mode 100644 index 0000000000..c78f7eb067 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/svm.h @@ -0,0 +1,74 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SVM_H_ +#define _SVM_H_ + +/* + * Guest register state that is saved outside the VMCB. + */ +struct svm_regctx { + register_t sctx_rbp; + register_t sctx_rbx; + register_t sctx_rcx; + register_t sctx_rdx; + register_t sctx_rdi; + register_t sctx_rsi; + register_t sctx_r8; + register_t sctx_r9; + register_t sctx_r10; + register_t sctx_r11; + register_t sctx_r12; + register_t sctx_r13; + register_t sctx_r14; + register_t sctx_r15; + register_t sctx_dr0; + register_t sctx_dr1; + register_t sctx_dr2; + register_t sctx_dr3; + + register_t host_dr0; + register_t host_dr1; + register_t host_dr2; + register_t host_dr3; + register_t host_dr6; + register_t host_dr7; + uint64_t host_debugctl; +}; + +#ifdef __FreeBSD__ +struct pcpu; +void svm_launch(uint64_t pa, struct svm_regctx *gctx, struct pcpu *pcpu); +#else +struct cpu; +void svm_launch(uint64_t pa, struct svm_regctx *gctx, struct cpu *pcpu); +#endif + +#endif /* _SVM_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm_msr.c b/usr/src/uts/i86pc/io/vmm/amd/svm_msr.c new file mode 100644 index 0000000000..0c1ce0e4e0 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/svm_msr.c @@ -0,0 +1,199 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014, Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include +#include +#include + +#include "svm.h" +#include "vmcb.h" +#include "svm_softc.h" +#include "svm_msr.h" + +#ifndef MSR_AMDK8_IPM +#define MSR_AMDK8_IPM 0xc0010055 +#endif + +enum { + IDX_MSR_LSTAR, + IDX_MSR_CSTAR, + IDX_MSR_STAR, + IDX_MSR_SF_MASK, + HOST_MSR_NUM /* must be the last enumeration */ +}; + +#ifdef __FreeBSD__ +static uint64_t host_msrs[HOST_MSR_NUM]; + +void +svm_msr_init(void) +{ + /* + * It is safe to cache the values of the following MSRs because they + * don't change based on curcpu, curproc or curthread. + */ + host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR); + host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR); + host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); + host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); +} +#else + +CTASSERT(HOST_MSR_NUM == SVM_HOST_MSR_NUM); + +void +svm_msr_init(void) +{ + /* + * These MSRs do vary between CPUs on illumos, so saving system-wide + * values for them serves no purpose. + */ +} +#endif /* __FreeBSD__ */ + +void +svm_msr_guest_init(struct svm_softc *sc, int vcpu) +{ + /* + * All the MSRs accessible to the guest are either saved/restored by + * hardware on every #VMEXIT/VMRUN (e.g., G_PAT) or are saved/restored + * by VMSAVE/VMLOAD (e.g., MSR_GSBASE). + * + * There are no guest MSRs that are saved/restored "by hand" so nothing + * more to do here. + */ + return; +} + +void +svm_msr_guest_enter(struct svm_softc *sc, int vcpu) +{ + /* + * Save host MSRs (if any) and restore guest MSRs (if any). + */ +#ifndef __FreeBSD__ + uint64_t *host_msrs = sc->host_msrs[vcpu]; + + /* Save host MSRs */ + host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR); + host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR); + host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); + host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); +#endif /* __FreeBSD__ */ +} + +void +svm_msr_guest_exit(struct svm_softc *sc, int vcpu) +{ +#ifndef __FreeBSD__ + uint64_t *host_msrs = sc->host_msrs[vcpu]; +#endif + /* + * Save guest MSRs (if any) and restore host MSRs. + */ + wrmsr(MSR_LSTAR, host_msrs[IDX_MSR_LSTAR]); + wrmsr(MSR_CSTAR, host_msrs[IDX_MSR_CSTAR]); + wrmsr(MSR_STAR, host_msrs[IDX_MSR_STAR]); + wrmsr(MSR_SF_MASK, host_msrs[IDX_MSR_SF_MASK]); + + /* MSR_KGSBASE will be restored on the way back to userspace */ +} + +int +svm_rdmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t *result, + bool *retu) +{ + int error = 0; + + switch (num) { + case MSR_MCG_CAP: + case MSR_MCG_STATUS: + *result = 0; + break; + case MSR_MTRRcap: + case MSR_MTRRdefType: + case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8: + case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: + case MSR_MTRR64kBase: + case MSR_SYSCFG: + case MSR_AMDK8_IPM: + case MSR_EXTFEATURES: + *result = 0; + break; + default: + error = EINVAL; + break; + } + + return (error); +} + +int +svm_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val, bool *retu) +{ + int error = 0; + + switch (num) { + case MSR_MCG_CAP: + case MSR_MCG_STATUS: + break; /* ignore writes */ + case MSR_MTRRcap: + vm_inject_gp(sc->vm, vcpu); + break; + case MSR_MTRRdefType: + case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8: + case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: + case MSR_MTRR64kBase: + case MSR_SYSCFG: + break; /* Ignore writes */ + case MSR_AMDK8_IPM: + /* + * Ignore writes to the "Interrupt Pending Message" MSR. + */ + break; + case MSR_K8_UCODE_UPDATE: + /* + * Ignore writes to microcode update register. 
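+		 * The guest cannot be permitted to patch host microcode, so
+		 * the write is silently discarded rather than raising #GP.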
+ */ + break; + case MSR_EXTFEATURES: + break; + default: + error = EINVAL; + break; + } + + return (error); +} diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm_msr.h b/usr/src/uts/i86pc/io/vmm/amd/svm_msr.h new file mode 100644 index 0000000000..1dba8101ab --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/svm_msr.h @@ -0,0 +1,46 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SVM_MSR_H_ +#define _SVM_MSR_H_ + +struct svm_softc; + +void svm_msr_init(void); +void svm_msr_guest_init(struct svm_softc *sc, int vcpu); +void svm_msr_guest_enter(struct svm_softc *sc, int vcpu); +void svm_msr_guest_exit(struct svm_softc *sc, int vcpu); + +int svm_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val, + bool *retu); +int svm_rdmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t *result, + bool *retu); + +#endif /* _SVM_MSR_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm_softc.h b/usr/src/uts/i86pc/io/vmm/amd/svm_softc.h new file mode 100644 index 0000000000..b5ac1903e7 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/svm_softc.h @@ -0,0 +1,131 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SVM_SOFTC_H_ +#define _SVM_SOFTC_H_ + +#define SVM_IO_BITMAP_SIZE (3 * PAGE_SIZE) +#define SVM_MSR_BITMAP_SIZE (2 * PAGE_SIZE) + +#ifdef __FreeBSD__ +struct asid { + uint64_t gen; /* range is [1, ~0UL] */ + uint32_t num; /* range is [1, nasid - 1] */ +}; +#else +#include + +/* This must match HOST_MSR_NUM in svm_msr.c (where it is CTASSERTed) */ +#define SVM_HOST_MSR_NUM 4 +#endif /* __FreeBSD__ */ + +/* + * XXX separate out 'struct vmcb' from 'svm_vcpu' to avoid wasting space + * due to VMCB alignment requirements. + */ +struct svm_vcpu { + struct vmcb vmcb; /* hardware saved vcpu context */ + struct svm_regctx swctx; /* software saved vcpu context */ + uint64_t vmcb_pa; /* VMCB physical address */ + uint64_t nextrip; /* next instruction to be executed by guest */ + int lastcpu; /* host cpu that the vcpu last ran on */ + uint32_t dirty; /* state cache bits that must be cleared */ + long eptgen; /* pmap->pm_eptgen when the vcpu last ran */ +#ifdef __FreeBSD__ + struct asid asid; +#else + hma_svm_asid_t hma_asid; + boolean_t loaded; +#endif +} __aligned(PAGE_SIZE); + +/* + * SVM softc, one per virtual machine. + */ +struct svm_softc { + uint8_t apic_page[VM_MAXCPU][PAGE_SIZE]; + struct svm_vcpu vcpu[VM_MAXCPU]; + vm_offset_t nptp; /* nested page table */ + uint8_t *iopm_bitmap; /* shared by all vcpus */ + uint8_t *msr_bitmap; /* shared by all vcpus */ + struct vm *vm; +#ifndef __FreeBSD__ + uint64_t host_msrs[VM_MAXCPU][SVM_HOST_MSR_NUM]; +#endif +}; + +CTASSERT((offsetof(struct svm_softc, nptp) & PAGE_MASK) == 0); + +static __inline struct svm_vcpu * +svm_get_vcpu(struct svm_softc *sc, int vcpu) +{ + + return (&(sc->vcpu[vcpu])); +} + +static __inline struct vmcb * +svm_get_vmcb(struct svm_softc *sc, int vcpu) +{ + + return (&(sc->vcpu[vcpu].vmcb)); +} + +static __inline struct vmcb_state * +svm_get_vmcb_state(struct svm_softc *sc, int vcpu) +{ + + return (&(sc->vcpu[vcpu].vmcb.state)); +} + +static __inline struct vmcb_ctrl * +svm_get_vmcb_ctrl(struct svm_softc *sc, int vcpu) +{ + + return (&(sc->vcpu[vcpu].vmcb.ctrl)); +} + +static __inline struct svm_regctx * +svm_get_guest_regctx(struct svm_softc *sc, int vcpu) +{ + + return (&(sc->vcpu[vcpu].swctx)); +} + +static __inline void +svm_set_dirty(struct svm_softc *sc, int vcpu, uint32_t dirtybits) +{ + struct svm_vcpu *vcpustate; + + vcpustate = svm_get_vcpu(sc, vcpu); + + vcpustate->dirty |= dirtybits; +} + +#endif /* _SVM_SOFTC_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm_support.s b/usr/src/uts/i86pc/io/vmm/amd/svm_support.s new file mode 100644 index 0000000000..fad994b09c --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/svm_support.s @@ -0,0 +1,164 @@ +/*- + * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
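
The inline accessors at the bottom of svm_softc.h give callers a single way
to reach per-vCPU state, and svm_set_dirty() is the one funnel for
invalidating the hardware's VMCB-clean cache. A minimal usage sketch; the
function itself is hypothetical, while VMCB_CACHE_CR comes from vmcb.h in
this same patch:

	/* Illustrative only: mutate cached guest state, then mark it dirty. */
	static void
	example_set_guest_cr4(struct svm_softc *sc, int vcpu, uint64_t cr4)
	{
		struct vmcb_state *state = svm_get_vmcb_state(sc, vcpu);

		state->cr4 = cr4;
		/* The CPU must discard its cached copy of the CR fields. */
		svm_set_dirty(sc, vcpu, VMCB_CACHE_CR);
	}
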
Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#include + +#include "svm_assym.h" + +/* Porting note: This is named 'svm_support.S' upstream. */ + +#if defined(lint) + +struct svm_regctx; +struct cpu; + +/*ARGSUSED*/ +void +svm_launch(uint64_t pa, struct svm_regctx *gctx, struct cpu *cpu) +{} + +#else /* lint */ + +#define VMLOAD .byte 0x0f, 0x01, 0xda +#define VMRUN .byte 0x0f, 0x01, 0xd8 +#define VMSAVE .byte 0x0f, 0x01, 0xdb + + +/* + * Flush scratch registers to avoid lingering guest state being used for + * Spectre v1 attacks when returning from guest entry. + */ +#define SVM_GUEST_FLUSH_SCRATCH \ + xorl %edi, %edi; \ + xorl %esi, %esi; \ + xorl %edx, %edx; \ + xorl %ecx, %ecx; \ + xorl %r8d, %r8d; \ + xorl %r9d, %r9d; \ + xorl %r10d, %r10d; \ + xorl %r11d, %r11d; + +/* Stack layout (offset from %rsp) for svm_launch */ +#define SVMSTK_R15 0x00 /* callee saved %r15 */ +#define SVMSTK_R14 0x08 /* callee saved %r14 */ +#define SVMSTK_R13 0x10 /* callee saved %r13 */ +#define SVMSTK_R12 0x18 /* callee saved %r12 */ +#define SVMSTK_RBX 0x20 /* callee saved %rbx */ +#define SVMSTK_RDX 0x28 /* save-args %rdx (struct cpu *) */ +#define SVMSTK_RSI 0x30 /* save-args %rsi (struct svm_regctx *) */ +#define SVMSTK_RDI 0x38 /* save-args %rdi (uint64_t vmcb_pa) */ +#define SVMSTK_FP 0x40 /* frame pointer %rbp */ +#define SVMSTKSIZE SVMSTK_FP + +/* + * svm_launch(uint64_t vmcb, struct svm_regctx *gctx, struct pcpu *pcpu) + * %rdi: physical address of VMCB + * %rsi: pointer to guest context + * %rdx: pointer to the pcpu data + */ +ENTRY_NP(svm_launch) + pushq %rbp + movq %rsp, %rbp + subq $SVMSTKSIZE, %rsp + movq %r15, SVMSTK_R15(%rsp) + movq %r14, SVMSTK_R14(%rsp) + movq %r13, SVMSTK_R13(%rsp) + movq %r12, SVMSTK_R12(%rsp) + movq %rbx, SVMSTK_RBX(%rsp) + movq %rdx, SVMSTK_RDX(%rsp) + movq %rsi, SVMSTK_RSI(%rsp) + movq %rdi, SVMSTK_RDI(%rsp) + + /* VMLOAD and VMRUN expect the VMCB physaddr in %rax */ + movq %rdi, %rax + + /* Restore guest state. 
*/ + movq SCTX_R8(%rsi), %r8 + movq SCTX_R9(%rsi), %r9 + movq SCTX_R10(%rsi), %r10 + movq SCTX_R11(%rsi), %r11 + movq SCTX_R12(%rsi), %r12 + movq SCTX_R13(%rsi), %r13 + movq SCTX_R14(%rsi), %r14 + movq SCTX_R15(%rsi), %r15 + movq SCTX_RBP(%rsi), %rbp + movq SCTX_RBX(%rsi), %rbx + movq SCTX_RCX(%rsi), %rcx + movq SCTX_RDX(%rsi), %rdx + movq SCTX_RDI(%rsi), %rdi + movq SCTX_RSI(%rsi), %rsi /* %rsi must be restored last */ + + VMLOAD + VMRUN + VMSAVE + + /* Grab the svm_regctx pointer */ + movq SVMSTK_RSI(%rsp), %rax + + /* Save guest state. */ + movq %r8, SCTX_R8(%rax) + movq %r9, SCTX_R9(%rax) + movq %r10, SCTX_R10(%rax) + movq %r11, SCTX_R11(%rax) + movq %r12, SCTX_R12(%rax) + movq %r13, SCTX_R13(%rax) + movq %r14, SCTX_R14(%rax) + movq %r15, SCTX_R15(%rax) + movq %rbp, SCTX_RBP(%rax) + movq %rbx, SCTX_RBX(%rax) + movq %rcx, SCTX_RCX(%rax) + movq %rdx, SCTX_RDX(%rax) + movq %rdi, SCTX_RDI(%rax) + movq %rsi, SCTX_RSI(%rax) + + /* Restore callee-saved registers */ + movq SVMSTK_R15(%rsp), %r15 + movq SVMSTK_R14(%rsp), %r14 + movq SVMSTK_R13(%rsp), %r13 + movq SVMSTK_R12(%rsp), %r12 + movq SVMSTK_RBX(%rsp), %rbx + + /* Fix %gsbase to point back to the correct 'struct cpu *' */ + movq SVMSTK_RDX(%rsp), %rdx + movl %edx, %eax + shrq $32, %rdx + movl $MSR_GSBASE, %ecx + wrmsr + + SVM_GUEST_FLUSH_SCRATCH + + addq $SVMSTKSIZE, %rsp + popq %rbp + ret +SET_SIZE(svm_launch) + +#endif /* lint */ diff --git a/usr/src/uts/i86pc/io/vmm/amd/vmcb.c b/usr/src/uts/i86pc/io/vmm/amd/vmcb.c new file mode 100644 index 0000000000..5075b69867 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/vmcb.c @@ -0,0 +1,454 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include +#include + +#include "vmm_ktr.h" + +#include "vmcb.h" +#include "svm.h" +#include "svm_softc.h" + +/* + * The VMCB aka Virtual Machine Control Block is a 4KB aligned page + * in memory that describes the virtual machine. + * + * The VMCB contains: + * - instructions or events in the guest to intercept + * - control bits that modify execution environment of the guest + * - guest processor state (e.g. 
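
The tail of svm_launch() rewrites MSR_GSBASE from the saved 'struct cpu *'
because the VMLOAD/VMRUN/VMSAVE sequence disturbs the host %gs base, and
%gs-relative kernel accesses are unsafe until it is restored. The
movl/shrq pair is simply the EDX:EAX convention for wrmsr; in C terms it is
equivalent to the sketch below, assuming the usual kernel wrmsr() helper:

	/* Illustrative C equivalent of the MSR_GSBASE fixup in svm_launch(). */
	static inline void
	restore_host_gsbase(struct cpu *cpu)
	{
		wrmsr(MSR_GSBASE, (uint64_t)(uintptr_t)cpu);
	}
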
general purpose registers) + */ + +/* + * Return VMCB segment area. + */ +static struct vmcb_segment * +vmcb_segptr(struct vmcb *vmcb, int type) +{ + struct vmcb_state *state; + struct vmcb_segment *seg; + + state = &vmcb->state; + + switch (type) { + case VM_REG_GUEST_CS: + seg = &state->cs; + break; + + case VM_REG_GUEST_DS: + seg = &state->ds; + break; + + case VM_REG_GUEST_ES: + seg = &state->es; + break; + + case VM_REG_GUEST_FS: + seg = &state->fs; + break; + + case VM_REG_GUEST_GS: + seg = &state->gs; + break; + + case VM_REG_GUEST_SS: + seg = &state->ss; + break; + + case VM_REG_GUEST_GDTR: + seg = &state->gdt; + break; + + case VM_REG_GUEST_IDTR: + seg = &state->idt; + break; + + case VM_REG_GUEST_LDTR: + seg = &state->ldt; + break; + + case VM_REG_GUEST_TR: + seg = &state->tr; + break; + + default: + seg = NULL; + break; + } + + return (seg); +} + +static int +vmcb_access(struct svm_softc *softc, int vcpu, int write, int ident, + uint64_t *val) +{ + struct vmcb *vmcb; + int off, bytes; + char *ptr; + + vmcb = svm_get_vmcb(softc, vcpu); + off = VMCB_ACCESS_OFFSET(ident); + bytes = VMCB_ACCESS_BYTES(ident); + + if ((off + bytes) >= sizeof (struct vmcb)) + return (EINVAL); + + ptr = (char *)vmcb; + + if (!write) + *val = 0; + + switch (bytes) { + case 8: + case 4: + case 2: + if (write) + memcpy(ptr + off, val, bytes); + else + memcpy(val, ptr + off, bytes); + break; + default: + VCPU_CTR1(softc->vm, vcpu, + "Invalid size %d for VMCB access: %d", bytes); + return (EINVAL); + } + + /* Invalidate all VMCB state cached by h/w. */ + if (write) + svm_set_dirty(softc, vcpu, 0xffffffff); + + return (0); +} + +/* + * Read from segment selector, control and general purpose register of VMCB. + */ +int +vmcb_read(struct svm_softc *sc, int vcpu, int ident, uint64_t *retval) +{ + struct vmcb *vmcb; + struct vmcb_state *state; + struct vmcb_segment *seg; + int err; + + vmcb = svm_get_vmcb(sc, vcpu); + state = &vmcb->state; + err = 0; + + if (VMCB_ACCESS_OK(ident)) + return (vmcb_access(sc, vcpu, 0, ident, retval)); + + switch (ident) { + case VM_REG_GUEST_CR0: + *retval = state->cr0; + break; + + case VM_REG_GUEST_CR2: + *retval = state->cr2; + break; + + case VM_REG_GUEST_CR3: + *retval = state->cr3; + break; + + case VM_REG_GUEST_CR4: + *retval = state->cr4; + break; + + case VM_REG_GUEST_DR6: + *retval = state->dr6; + break; + + case VM_REG_GUEST_DR7: + *retval = state->dr7; + break; + + case VM_REG_GUEST_EFER: + *retval = state->efer; + break; + + case VM_REG_GUEST_RAX: + *retval = state->rax; + break; + + case VM_REG_GUEST_RFLAGS: + *retval = state->rflags; + break; + + case VM_REG_GUEST_RIP: + *retval = state->rip; + break; + + case VM_REG_GUEST_RSP: + *retval = state->rsp; + break; + + case VM_REG_GUEST_CS: + case VM_REG_GUEST_DS: + case VM_REG_GUEST_ES: + case VM_REG_GUEST_FS: + case VM_REG_GUEST_GS: + case VM_REG_GUEST_SS: + case VM_REG_GUEST_LDTR: + case VM_REG_GUEST_TR: + seg = vmcb_segptr(vmcb, ident); + KASSERT(seg != NULL, ("%s: unable to get segment %d from VMCB", + __func__, ident)); + *retval = seg->selector; + break; + + case VM_REG_GUEST_GDTR: + case VM_REG_GUEST_IDTR: + /* GDTR and IDTR don't have segment selectors */ + err = EINVAL; + break; + default: + err = EINVAL; + break; + } + + return (err); +} + +/* + * Write to segment selector, control and general purpose register of VMCB. 
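
vmcb_read() checks VMCB_ACCESS_OK() before its named-register switch, so
identifiers built with VMCB_ACCESS(offset, bytes) bypass the switch and
reach vmcb_access() for a raw byte-granular read. A sketch, assuming the
VMCB_OFF_* constants from vmcb.h later in this patch; the wrapper itself is
hypothetical:

	/* Illustrative only: read guest IA32_PAT via the raw-access encoding. */
	static int
	example_read_guest_pat(struct svm_softc *sc, int vcpu, uint64_t *pat)
	{
		return (vmcb_read(sc, vcpu,
		    VMCB_ACCESS(VMCB_OFF_GUEST_PAT, 8), pat));
	}
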
+ */ +int +vmcb_write(struct svm_softc *sc, int vcpu, int ident, uint64_t val) +{ + struct vmcb *vmcb; + struct vmcb_state *state; + struct vmcb_segment *seg; + int err, dirtyseg; + + vmcb = svm_get_vmcb(sc, vcpu); + state = &vmcb->state; + dirtyseg = 0; + err = 0; + + if (VMCB_ACCESS_OK(ident)) + return (vmcb_access(sc, vcpu, 1, ident, &val)); + + switch (ident) { + case VM_REG_GUEST_CR0: + state->cr0 = val; + svm_set_dirty(sc, vcpu, VMCB_CACHE_CR); + break; + + case VM_REG_GUEST_CR2: + state->cr2 = val; + svm_set_dirty(sc, vcpu, VMCB_CACHE_CR2); + break; + + case VM_REG_GUEST_CR3: + state->cr3 = val; + svm_set_dirty(sc, vcpu, VMCB_CACHE_CR); + break; + + case VM_REG_GUEST_CR4: + state->cr4 = val; + svm_set_dirty(sc, vcpu, VMCB_CACHE_CR); + break; + + case VM_REG_GUEST_DR6: + state->dr6 = val; + svm_set_dirty(sc, vcpu, VMCB_CACHE_DR); + break; + + case VM_REG_GUEST_DR7: + state->dr7 = val; + svm_set_dirty(sc, vcpu, VMCB_CACHE_DR); + break; + + case VM_REG_GUEST_EFER: + /* EFER_SVM must always be set when the guest is executing */ + state->efer = val | EFER_SVM; + svm_set_dirty(sc, vcpu, VMCB_CACHE_CR); + break; + + case VM_REG_GUEST_RAX: + state->rax = val; + break; + + case VM_REG_GUEST_RFLAGS: + state->rflags = val; + break; + + case VM_REG_GUEST_RIP: + state->rip = val; + break; + + case VM_REG_GUEST_RSP: + state->rsp = val; + break; + + case VM_REG_GUEST_CS: + case VM_REG_GUEST_DS: + case VM_REG_GUEST_ES: + case VM_REG_GUEST_SS: + dirtyseg = 1; /* FALLTHROUGH */ + case VM_REG_GUEST_FS: + case VM_REG_GUEST_GS: + case VM_REG_GUEST_LDTR: + case VM_REG_GUEST_TR: + seg = vmcb_segptr(vmcb, ident); + KASSERT(seg != NULL, ("%s: unable to get segment %d from VMCB", + __func__, ident)); + seg->selector = val; + if (dirtyseg) + svm_set_dirty(sc, vcpu, VMCB_CACHE_SEG); + break; + + case VM_REG_GUEST_GDTR: + case VM_REG_GUEST_IDTR: + /* GDTR and IDTR don't have segment selectors */ + err = EINVAL; + break; + default: + err = EINVAL; + break; + } + + return (err); +} + +int +vmcb_seg(struct vmcb *vmcb, int ident, struct vmcb_segment *seg2) +{ + struct vmcb_segment *seg; + + seg = vmcb_segptr(vmcb, ident); + if (seg != NULL) { + bcopy(seg, seg2, sizeof(struct vmcb_segment)); + return (0); + } else { + return (EINVAL); + } +} + +int +vmcb_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) +{ + struct vmcb *vmcb; + struct svm_softc *sc; + struct vmcb_segment *seg; + uint16_t attrib; + + sc = arg; + vmcb = svm_get_vmcb(sc, vcpu); + + seg = vmcb_segptr(vmcb, reg); + KASSERT(seg != NULL, ("%s: invalid segment descriptor %d", + __func__, reg)); + + seg->base = desc->base; + seg->limit = desc->limit; + if (reg != VM_REG_GUEST_GDTR && reg != VM_REG_GUEST_IDTR) { + /* + * Map seg_desc access to VMCB attribute format. + * + * SVM uses the 'P' bit in the segment attributes to indicate a + * NULL segment so clear it if the segment is marked unusable. 
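
Note that vmcb_write() forces EFER_SVM on: the guest must always execute
with SVM enabled in its in-VMCB EFER, so a caller that round-trips the
register never sees the bit dropped. A sketch under that assumption; the
function is hypothetical and EFER_NXE is assumed from the usual specialreg
definitions:

	/* Illustrative only: EFER_SVM survives a read-modify-write cycle. */
	static void
	example_clear_guest_nxe(struct svm_softc *sc, int vcpu)
	{
		uint64_t efer;

		(void) vmcb_read(sc, vcpu, VM_REG_GUEST_EFER, &efer);
		(void) vmcb_write(sc, vcpu, VM_REG_GUEST_EFER,
		    efer & ~EFER_NXE);
		/* The stored value still has EFER_SVM set. */
	}
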
+ */ + attrib = ((desc->access & 0xF000) >> 4) | (desc->access & 0xFF); + if (SEG_DESC_UNUSABLE(desc->access)) { + attrib &= ~0x80; + } + seg->attrib = attrib; + } + + VCPU_CTR4(sc->vm, vcpu, "Setting desc %d: base (%#lx), limit (%#x), " + "attrib (%#x)", reg, seg->base, seg->limit, seg->attrib); + + switch (reg) { + case VM_REG_GUEST_CS: + case VM_REG_GUEST_DS: + case VM_REG_GUEST_ES: + case VM_REG_GUEST_SS: + svm_set_dirty(sc, vcpu, VMCB_CACHE_SEG); + break; + case VM_REG_GUEST_GDTR: + case VM_REG_GUEST_IDTR: + svm_set_dirty(sc, vcpu, VMCB_CACHE_DT); + break; + default: + break; + } + + return (0); +} + +int +vmcb_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) +{ + struct vmcb *vmcb; + struct svm_softc *sc; + struct vmcb_segment *seg; + + sc = arg; + vmcb = svm_get_vmcb(sc, vcpu); + seg = vmcb_segptr(vmcb, reg); + KASSERT(seg != NULL, ("%s: invalid segment descriptor %d", + __func__, reg)); + + desc->base = seg->base; + desc->limit = seg->limit; + desc->access = 0; + + if (reg != VM_REG_GUEST_GDTR && reg != VM_REG_GUEST_IDTR) { + /* Map seg_desc access to VMCB attribute format */ + desc->access = ((seg->attrib & 0xF00) << 4) | + (seg->attrib & 0xFF); + + /* + * VT-x uses bit 16 to indicate a segment that has been loaded + * with a NULL selector (aka unusable). The 'desc->access' + * field is interpreted in the VT-x format by the + * processor-independent code. + * + * SVM uses the 'P' bit to convey the same information so + * convert it into the VT-x format. For more details refer to + * section "Segment State in the VMCB" in APMv2. + */ + if (reg != VM_REG_GUEST_CS && reg != VM_REG_GUEST_TR) { + if ((desc->access & 0x80) == 0) + desc->access |= 0x10000; /* Unusable segment */ + } + } + + return (0); +} diff --git a/usr/src/uts/i86pc/io/vmm/amd/vmcb.h b/usr/src/uts/i86pc/io/vmm/amd/vmcb.h new file mode 100644 index 0000000000..ec7caa91f9 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/vmcb.h @@ -0,0 +1,336 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
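
For readers tracing the attribute math in vmcb_setdesc()/vmcb_getdesc():
the VMCB stores segment access rights in a packed 12-bit form, dropping
bits 8-11 of the access word (reserved in this format). The two conversions
are exact inverses; pulled out as helpers here purely for illustration, not
as part of the patch:

	/* Illustrative inverses of the conversions used above. */
	static inline uint16_t
	access_to_attrib(uint32_t access)
	{
		return ((uint16_t)(((access & 0xF000) >> 4) | (access & 0xFF)));
	}

	static inline uint32_t
	attrib_to_access(uint16_t attrib)
	{
		return ((uint32_t)(((attrib & 0xF00) << 4) | (attrib & 0xFF)));
	}
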
+ * + * $FreeBSD$ + */ + +#ifndef _VMCB_H_ +#define _VMCB_H_ + +struct svm_softc; + +#define BIT(n) (1ULL << n) + +/* + * Secure Virtual Machine: AMD64 Programmer's Manual Vol2, Chapter 15 + * Layout of VMCB: AMD64 Programmer's Manual Vol2, Appendix B + */ + +/* vmcb_ctrl->intercept[] array indices */ +#define VMCB_CR_INTCPT 0 +#define VMCB_DR_INTCPT 1 +#define VMCB_EXC_INTCPT 2 +#define VMCB_CTRL1_INTCPT 3 +#define VMCB_CTRL2_INTCPT 4 + +/* intercept[VMCB_CTRL1_INTCPT] fields */ +#define VMCB_INTCPT_INTR BIT(0) +#define VMCB_INTCPT_NMI BIT(1) +#define VMCB_INTCPT_SMI BIT(2) +#define VMCB_INTCPT_INIT BIT(3) +#define VMCB_INTCPT_VINTR BIT(4) +#define VMCB_INTCPT_CR0_WRITE BIT(5) +#define VMCB_INTCPT_IDTR_READ BIT(6) +#define VMCB_INTCPT_GDTR_READ BIT(7) +#define VMCB_INTCPT_LDTR_READ BIT(8) +#define VMCB_INTCPT_TR_READ BIT(9) +#define VMCB_INTCPT_IDTR_WRITE BIT(10) +#define VMCB_INTCPT_GDTR_WRITE BIT(11) +#define VMCB_INTCPT_LDTR_WRITE BIT(12) +#define VMCB_INTCPT_TR_WRITE BIT(13) +#define VMCB_INTCPT_RDTSC BIT(14) +#define VMCB_INTCPT_RDPMC BIT(15) +#define VMCB_INTCPT_PUSHF BIT(16) +#define VMCB_INTCPT_POPF BIT(17) +#define VMCB_INTCPT_CPUID BIT(18) +#define VMCB_INTCPT_RSM BIT(19) +#define VMCB_INTCPT_IRET BIT(20) +#define VMCB_INTCPT_INTn BIT(21) +#define VMCB_INTCPT_INVD BIT(22) +#define VMCB_INTCPT_PAUSE BIT(23) +#define VMCB_INTCPT_HLT BIT(24) +#define VMCB_INTCPT_INVPG BIT(25) +#define VMCB_INTCPT_INVPGA BIT(26) +#define VMCB_INTCPT_IO BIT(27) +#define VMCB_INTCPT_MSR BIT(28) +#define VMCB_INTCPT_TASK_SWITCH BIT(29) +#define VMCB_INTCPT_FERR_FREEZE BIT(30) +#define VMCB_INTCPT_SHUTDOWN BIT(31) + +/* intercept[VMCB_CTRL2_INTCPT] fields */ +#define VMCB_INTCPT_VMRUN BIT(0) +#define VMCB_INTCPT_VMMCALL BIT(1) +#define VMCB_INTCPT_VMLOAD BIT(2) +#define VMCB_INTCPT_VMSAVE BIT(3) +#define VMCB_INTCPT_STGI BIT(4) +#define VMCB_INTCPT_CLGI BIT(5) +#define VMCB_INTCPT_SKINIT BIT(6) +#define VMCB_INTCPT_RDTSCP BIT(7) +#define VMCB_INTCPT_ICEBP BIT(8) +#define VMCB_INTCPT_WBINVD BIT(9) +#define VMCB_INTCPT_MONITOR BIT(10) +#define VMCB_INTCPT_MWAIT BIT(11) +#define VMCB_INTCPT_MWAIT_ARMED BIT(12) +#define VMCB_INTCPT_XSETBV BIT(13) + +/* VMCB TLB control */ +#define VMCB_TLB_FLUSH_NOTHING 0 /* Flush nothing */ +#define VMCB_TLB_FLUSH_ALL 1 /* Flush entire TLB */ +#define VMCB_TLB_FLUSH_GUEST 3 /* Flush all guest entries */ +#define VMCB_TLB_FLUSH_GUEST_NONGLOBAL 7 /* Flush guest non-PG entries */ + +/* VMCB state caching */ +#define VMCB_CACHE_NONE 0 /* No caching */ +#define VMCB_CACHE_I BIT(0) /* Intercept, TSC off, Pause filter */ +#define VMCB_CACHE_IOPM BIT(1) /* I/O and MSR permission */ +#define VMCB_CACHE_ASID BIT(2) /* ASID */ +#define VMCB_CACHE_TPR BIT(3) /* V_TPR to V_INTR_VECTOR */ +#define VMCB_CACHE_NP BIT(4) /* Nested Paging */ +#define VMCB_CACHE_CR BIT(5) /* CR0, CR3, CR4 & EFER */ +#define VMCB_CACHE_DR BIT(6) /* Debug registers */ +#define VMCB_CACHE_DT BIT(7) /* GDT/IDT */ +#define VMCB_CACHE_SEG BIT(8) /* User segments, CPL */ +#define VMCB_CACHE_CR2 BIT(9) /* page fault address */ +#define VMCB_CACHE_LBR BIT(10) /* Last branch */ + +/* VMCB control event injection */ +#define VMCB_EVENTINJ_EC_VALID BIT(11) /* Error Code valid */ +#define VMCB_EVENTINJ_VALID BIT(31) /* Event valid */ + +/* Event types that can be injected */ +#define VMCB_EVENTINJ_TYPE_INTR 0 +#define VMCB_EVENTINJ_TYPE_NMI 2 +#define VMCB_EVENTINJ_TYPE_EXCEPTION 3 +#define VMCB_EVENTINJ_TYPE_INTn 4 + +/* VMCB exit code, APM vol2 Appendix C */ +#define VMCB_EXIT_MC 0x52 +#define VMCB_EXIT_INTR 0x60 
+#define VMCB_EXIT_NMI 0x61
+#define VMCB_EXIT_VINTR 0x64
+#define VMCB_EXIT_PUSHF 0x70
+#define VMCB_EXIT_POPF 0x71
+#define VMCB_EXIT_CPUID 0x72
+#define VMCB_EXIT_IRET 0x74
+#define VMCB_EXIT_PAUSE 0x77
+#define VMCB_EXIT_HLT 0x78
+#define VMCB_EXIT_IO 0x7B
+#define VMCB_EXIT_MSR 0x7C
+#define VMCB_EXIT_SHUTDOWN 0x7F
+#define VMCB_EXIT_VMSAVE 0x83
+#define VMCB_EXIT_MONITOR 0x8A
+#define VMCB_EXIT_MWAIT 0x8B
+#define VMCB_EXIT_NPF 0x400
+#define VMCB_EXIT_INVALID -1
+
+/*
+ * Nested page fault.
+ * Bit definitions to decode EXITINFO1.
+ */
+#define VMCB_NPF_INFO1_P BIT(0) /* Nested page present. */
+#define VMCB_NPF_INFO1_W BIT(1) /* Access was write. */
+#define VMCB_NPF_INFO1_U BIT(2) /* Access was user access. */
+#define VMCB_NPF_INFO1_RSV BIT(3) /* Reserved bits present. */
+#define VMCB_NPF_INFO1_ID BIT(4) /* Code read. */
+
+#define VMCB_NPF_INFO1_GPA BIT(32) /* Guest physical address. */
+#define VMCB_NPF_INFO1_GPT BIT(33) /* Guest page table. */
+
+/*
+ * EXITINTINFO, Interrupt exit info for all intercepts.
+ * Section 15.7.2, Intercepts during IDT Interrupt Delivery.
+ */
+#define VMCB_EXITINTINFO_VECTOR(x) ((x) & 0xFF)
+#define VMCB_EXITINTINFO_TYPE(x) (((x) >> 8) & 0x7)
+#define VMCB_EXITINTINFO_EC_VALID(x) (((x) & BIT(11)) ? 1 : 0)
+#define VMCB_EXITINTINFO_VALID(x) (((x) & BIT(31)) ? 1 : 0)
+#define VMCB_EXITINTINFO_EC(x) (((x) >> 32) & 0xFFFFFFFF)
+
+/* Offset of various VMCB fields. */
+#define VMCB_OFF_CTRL(x) (x)
+#define VMCB_OFF_STATE(x) ((x) + 0x400)
+
+#define VMCB_OFF_CR_INTERCEPT VMCB_OFF_CTRL(0x0)
+#define VMCB_OFF_DR_INTERCEPT VMCB_OFF_CTRL(0x4)
+#define VMCB_OFF_EXC_INTERCEPT VMCB_OFF_CTRL(0x8)
+#define VMCB_OFF_INST1_INTERCEPT VMCB_OFF_CTRL(0xC)
+#define VMCB_OFF_INST2_INTERCEPT VMCB_OFF_CTRL(0x10)
+#define VMCB_OFF_IO_PERM VMCB_OFF_CTRL(0x40)
+#define VMCB_OFF_MSR_PERM VMCB_OFF_CTRL(0x48)
+#define VMCB_OFF_TSC_OFFSET VMCB_OFF_CTRL(0x50)
+#define VMCB_OFF_ASID VMCB_OFF_CTRL(0x58)
+#define VMCB_OFF_TLB_CTRL VMCB_OFF_CTRL(0x5C)
+#define VMCB_OFF_VIRQ VMCB_OFF_CTRL(0x60)
+#define VMCB_OFF_EXIT_REASON VMCB_OFF_CTRL(0x70)
+#define VMCB_OFF_EXITINFO1 VMCB_OFF_CTRL(0x78)
+#define VMCB_OFF_EXITINFO2 VMCB_OFF_CTRL(0x80)
+#define VMCB_OFF_EXITINTINFO VMCB_OFF_CTRL(0x88)
+#define VMCB_OFF_AVIC_BAR VMCB_OFF_CTRL(0x98)
+#define VMCB_OFF_NPT_BASE VMCB_OFF_CTRL(0xB0)
+#define VMCB_OFF_AVIC_PAGE VMCB_OFF_CTRL(0xE0)
+#define VMCB_OFF_AVIC_LT VMCB_OFF_CTRL(0xF0)
+#define VMCB_OFF_AVIC_PT VMCB_OFF_CTRL(0xF8)
+#define VMCB_OFF_SYSENTER_CS VMCB_OFF_STATE(0x228)
+#define VMCB_OFF_SYSENTER_ESP VMCB_OFF_STATE(0x230)
+#define VMCB_OFF_SYSENTER_EIP VMCB_OFF_STATE(0x238)
+#define VMCB_OFF_GUEST_PAT VMCB_OFF_STATE(0x268)
+
+/*
+ * Encode the VMCB offset and bytes that we want to read from VMCB.
+ */
+#define VMCB_ACCESS(o, w) (0x80000000 | (((w) & 0xF) << 16) | \
+ ((o) & 0xFFF))
+#define VMCB_ACCESS_OK(v) ((v) & 0x80000000)
+#define VMCB_ACCESS_BYTES(v) (((v) >> 16) & 0xF)
+#define VMCB_ACCESS_OFFSET(v) ((v) & 0xFFF)
+
+#ifdef _KERNEL
+/* VMCB save state area segment format */
+struct vmcb_segment {
+ uint16_t selector;
+ uint16_t attrib;
+ uint32_t limit;
+ uint64_t base;
+} __attribute__ ((__packed__));
+CTASSERT(sizeof(struct vmcb_segment) == 16);
+
+/* Code segment descriptor attribute in 12-bit format as saved by VMCB. */
+#define VMCB_CS_ATTRIB_L BIT(9) /* Long mode. */
+#define VMCB_CS_ATTRIB_D BIT(10) /* Operand size bit. 
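
The EXITINTINFO accessors make decoding a pending-event record mechanical;
svm.c consults this after a #VMEXIT so an event whose delivery was cut
short can be re-queued. A decoding sketch; the function and its cmn_err()
reporting are illustrative only:

	static void
	example_decode_exitintinfo(uint64_t intinfo)
	{
		int vector, type;
		uint32_t errcode;

		if (!VMCB_EXITINTINFO_VALID(intinfo))
			return;		/* no event was in flight */

		vector = VMCB_EXITINTINFO_VECTOR(intinfo);
		type = VMCB_EXITINTINFO_TYPE(intinfo);
		errcode = VMCB_EXITINTINFO_EC_VALID(intinfo) ?
		    VMCB_EXITINTINFO_EC(intinfo) : 0;

		cmn_err(CE_CONT, "event: vector=%d type=%d ec=%x\n",
		    vector, type, errcode);
	}
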
*/ + +/* + * The VMCB is divided into two areas - the first one contains various + * control bits including the intercept vector and the second one contains + * the guest state. + */ + +/* VMCB control area - padded up to 1024 bytes */ +struct vmcb_ctrl { + uint32_t intercept[5]; /* all intercepts */ + uint8_t pad1[0x28]; /* Offsets 0x14-0x3B are reserved. */ + uint16_t pause_filthresh; /* Offset 0x3C, PAUSE filter threshold */ + uint16_t pause_filcnt; /* Offset 0x3E, PAUSE filter count */ + uint64_t iopm_base_pa; /* 0x40: IOPM_BASE_PA */ + uint64_t msrpm_base_pa; /* 0x48: MSRPM_BASE_PA */ + uint64_t tsc_offset; /* 0x50: TSC_OFFSET */ + uint32_t asid; /* 0x58: Guest ASID */ + uint8_t tlb_ctrl; /* 0x5C: TLB_CONTROL */ + uint8_t pad2[3]; /* 0x5D-0x5F: Reserved. */ + uint8_t v_tpr; /* 0x60: V_TPR, guest CR8 */ + uint8_t v_irq:1; /* Is virtual interrupt pending? */ + uint8_t :7; /* Padding */ + uint8_t v_intr_prio:4; /* 0x62: Priority for virtual interrupt. */ + uint8_t v_ign_tpr:1; + uint8_t :3; + uint8_t v_intr_masking:1; /* Guest and host sharing of RFLAGS. */ + uint8_t :7; + uint8_t v_intr_vector; /* 0x64: Vector for virtual interrupt. */ + uint8_t pad3[3]; /* 0x65-0x67 Reserved. */ + uint64_t intr_shadow:1; /* 0x68: Interrupt shadow, section15.2.1 APM2 */ + uint64_t :63; + uint64_t exitcode; /* 0x70, Exitcode */ + uint64_t exitinfo1; /* 0x78, EXITINFO1 */ + uint64_t exitinfo2; /* 0x80, EXITINFO2 */ + uint64_t exitintinfo; /* 0x88, Interrupt exit value. */ + uint64_t np_enable:1; /* 0x90, Nested paging enable. */ + uint64_t :63; + uint8_t pad4[0x10]; /* 0x98-0xA7 reserved. */ + uint64_t eventinj; /* 0xA8, Event injection. */ + uint64_t n_cr3; /* B0, Nested page table. */ + uint64_t lbr_virt_en:1; /* Enable LBR virtualization. */ + uint64_t :63; + uint32_t vmcb_clean; /* 0xC0: VMCB clean bits for caching */ + uint32_t :32; /* 0xC4: Reserved */ + uint64_t nrip; /* 0xC8: Guest next nRIP. 
*/ + uint8_t inst_len; /* 0xD0: #NPF decode assist */ + uint8_t inst_bytes[15]; + uint8_t padd6[0x320]; +} __attribute__ ((__packed__)); +CTASSERT(sizeof(struct vmcb_ctrl) == 1024); + +struct vmcb_state { + struct vmcb_segment es; + struct vmcb_segment cs; + struct vmcb_segment ss; + struct vmcb_segment ds; + struct vmcb_segment fs; + struct vmcb_segment gs; + struct vmcb_segment gdt; + struct vmcb_segment ldt; + struct vmcb_segment idt; + struct vmcb_segment tr; + uint8_t pad1[0x2b]; /* Reserved: 0xA0-0xCA */ + uint8_t cpl; + uint8_t pad2[4]; + uint64_t efer; + uint8_t pad3[0x70]; /* Reserved: 0xd8-0x147 */ + uint64_t cr4; + uint64_t cr3; /* Guest CR3 */ + uint64_t cr0; + uint64_t dr7; + uint64_t dr6; + uint64_t rflags; + uint64_t rip; + uint8_t pad4[0x58]; /* Reserved: 0x180-0x1D7 */ + uint64_t rsp; + uint8_t pad5[0x18]; /* Reserved 0x1E0-0x1F7 */ + uint64_t rax; + uint64_t star; + uint64_t lstar; + uint64_t cstar; + uint64_t sfmask; + uint64_t kernelgsbase; + uint64_t sysenter_cs; + uint64_t sysenter_esp; + uint64_t sysenter_eip; + uint64_t cr2; + uint8_t pad6[0x20]; + uint64_t g_pat; + uint64_t dbgctl; + uint64_t br_from; + uint64_t br_to; + uint64_t int_from; + uint64_t int_to; + uint8_t pad7[0x968]; /* Reserved up to end of VMCB */ +} __attribute__ ((__packed__)); +CTASSERT(sizeof(struct vmcb_state) == 0xC00); + +struct vmcb { + struct vmcb_ctrl ctrl; + struct vmcb_state state; +} __attribute__ ((__packed__)); +CTASSERT(sizeof(struct vmcb) == PAGE_SIZE); +CTASSERT(offsetof(struct vmcb, state) == 0x400); + +int vmcb_read(struct svm_softc *sc, int vcpu, int ident, uint64_t *retval); +int vmcb_write(struct svm_softc *sc, int vcpu, int ident, uint64_t val); +int vmcb_setdesc(void *arg, int vcpu, int ident, struct seg_desc *desc); +int vmcb_getdesc(void *arg, int vcpu, int ident, struct seg_desc *desc); +int vmcb_seg(struct vmcb *vmcb, int ident, struct vmcb_segment *seg); + +#endif /* _KERNEL */ +#endif /* _VMCB_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/intel/ept.c b/usr/src/uts/i86pc/io/vmm/intel/ept.c index 5ae9ed2f6a..4915537b0a 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/ept.c +++ b/usr/src/uts/i86pc/io/vmm/intel/ept.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $FreeBSD: head/sys/amd64/vmm/intel/ept.c 252475 2013-07-01 20:05:43Z grehan $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -39,33 +41,35 @@ */ #include -__FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/ept.c 252475 2013-07-01 20:05:43Z grehan $"); +__FBSDID("$FreeBSD$"); +#include +#include #include -#include #include -#include #include +#include +#ifndef __FreeBSD__ +#include +#endif #include #include - -#include -#include -#include -#include +#include #include + #include "vmx_cpufunc.h" -#include "vmx.h" #include "ept.h" +#define EPT_SUPPORTS_EXEC_ONLY(cap) ((cap) & (1UL << 0)) #define EPT_PWL4(cap) ((cap) & (1UL << 6)) #define EPT_MEMORY_TYPE_WB(cap) ((cap) & (1UL << 14)) #define EPT_PDE_SUPERPAGE(cap) ((cap) & (1UL << 16)) /* 2MB pages */ #define EPT_PDPTE_SUPERPAGE(cap) ((cap) & (1UL << 17)) /* 1GB pages */ -#define INVVPID_SUPPORTED(cap) ((cap) & (1UL << 32)) #define INVEPT_SUPPORTED(cap) ((cap) & (1UL << 20)) +#define AD_BITS_SUPPORTED(cap) ((cap) & (1UL << 21)) +#define INVVPID_SUPPORTED(cap) ((cap) & (1UL << 32)) #define INVVPID_ALL_TYPES_MASK 0xF0000000000UL #define INVVPID_ALL_TYPES_SUPPORTED(cap) \ @@ -75,28 +79,22 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/ept.c 252475 2013-07-01 20:05:43Z g #define INVEPT_ALL_TYPES_SUPPORTED(cap) \ (((cap) & INVEPT_ALL_TYPES_MASK) == INVEPT_ALL_TYPES_MASK) -#define EPT_PG_RD (1 << 0) -#define EPT_PG_WR (1 << 1) -#define EPT_PG_EX (1 << 2) -#define EPT_PG_MEMORY_TYPE(x) ((x) << 3) -#define EPT_PG_IGNORE_PAT (1 << 6) -#define EPT_PG_SUPERPAGE (1 << 7) +#define EPT_PWLEVELS 4 /* page walk levels */ +#define EPT_ENABLE_AD_BITS (1 << 6) -#define EPT_ADDR_MASK ((uint64_t)-1 << 12) +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, ept, CTLFLAG_RW, NULL, NULL); -MALLOC_DECLARE(M_VMX); +static int ept_enable_ad_bits; -static uint64_t page_sizes_mask; - -/* - * Set this to 1 to have the EPT tables respect the guest PAT settings - */ -static int ept_pat_passthru; +static int ept_pmap_flags; +SYSCTL_INT(_hw_vmm_ept, OID_AUTO, pmap_flags, CTLFLAG_RD, + &ept_pmap_flags, 0, NULL); int -ept_init(void) +ept_init(int ipinum) { - int page_shift; + int use_hw_ad_bits, use_superpages, use_exec_only; uint64_t cap; cap = rdmsr(MSR_VMX_EPT_VPID_CAP); @@ -116,17 +114,24 @@ ept_init(void) !INVEPT_ALL_TYPES_SUPPORTED(cap)) return (EINVAL); - /* Set bits in 'page_sizes_mask' for each valid page size */ - page_shift = PAGE_SHIFT; - page_sizes_mask = 1UL << page_shift; /* 4KB page */ + ept_pmap_flags = ipinum & PMAP_NESTED_IPIMASK; - page_shift += 9; - if (EPT_PDE_SUPERPAGE(cap)) - page_sizes_mask |= 1UL << page_shift; /* 2MB superpage */ + use_superpages = 1; + TUNABLE_INT_FETCH("hw.vmm.ept.use_superpages", &use_superpages); + if (use_superpages && EPT_PDE_SUPERPAGE(cap)) + ept_pmap_flags |= PMAP_PDE_SUPERPAGE; /* 2MB superpage */ - page_shift += 9; - if (EPT_PDPTE_SUPERPAGE(cap)) - page_sizes_mask |= 1UL << page_shift; /* 1GB superpage */ + use_hw_ad_bits = 1; + TUNABLE_INT_FETCH("hw.vmm.ept.use_hw_ad_bits", &use_hw_ad_bits); + if (use_hw_ad_bits && AD_BITS_SUPPORTED(cap)) + ept_enable_ad_bits = 1; + else + ept_pmap_flags |= PMAP_EMULATE_AD_BITS; + + use_exec_only = 1; + TUNABLE_INT_FETCH("hw.vmm.ept.use_exec_only", &use_exec_only); + if (use_exec_only && EPT_SUPPORTS_EXEC_ONLY(cap)) + ept_pmap_flags |= PMAP_SUPPORTS_EXEC_ONLY; return (0); } @@ -165,288 +170,61 @@ ept_dump(uint64_t *ptp, int nlevels) } #endif -static size_t -ept_create_mapping(uint64_t *ptp, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, - 
vm_memattr_t attr, vm_prot_t prot, boolean_t spok) -{ - int spshift, ptpshift, ptpindex, nlevels; - - /* - * Compute the size of the mapping that we can accomodate. - * - * This is based on three factors: - * - super page sizes supported by the processor - * - alignment of the region starting at 'gpa' and 'hpa' - * - length of the region 'len' - */ - spshift = PAGE_SHIFT; - if (spok) - spshift += (EPT_PWLEVELS - 1) * 9; - while (spshift >= PAGE_SHIFT) { - uint64_t spsize = 1UL << spshift; - if ((page_sizes_mask & spsize) != 0 && - (gpa & (spsize - 1)) == 0 && - (hpa & (spsize - 1)) == 0 && - length >= spsize) { - break; - } - spshift -= 9; - } - - if (spshift < PAGE_SHIFT) { - panic("Invalid spshift for gpa 0x%016lx, hpa 0x%016lx, " - "length 0x%016lx, page_sizes_mask 0x%016lx", - gpa, hpa, length, page_sizes_mask); - } - - nlevels = EPT_PWLEVELS; - while (--nlevels >= 0) { - ptpshift = PAGE_SHIFT + nlevels * 9; - ptpindex = (gpa >> ptpshift) & 0x1FF; - - /* We have reached the leaf mapping */ - if (spshift >= ptpshift) - break; - - /* - * We are working on a non-leaf page table page. - * - * Create the next level page table page if necessary and point - * to it from the current page table. - */ - if (ptp[ptpindex] == 0) { -#ifdef __FreeBSD__ - void *nlp = malloc(PAGE_SIZE, M_VMX, M_WAITOK | M_ZERO); -#else - void *nlp = kmem_zalloc(PAGE_SIZE, KM_SLEEP); - ASSERT((((uintptr_t)nlp) & PAGE_MASK) == 0); -#endif - ptp[ptpindex] = vtophys(nlp); - ptp[ptpindex] |= EPT_PG_RD | EPT_PG_WR | EPT_PG_EX; - } - - /* Work our way down to the next level page table page */ -#ifdef __FreeBSD__ - ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & EPT_ADDR_MASK); -#else - ptp = (uint64_t *)hat_kpm_pfn2va(btop(ptp[ptpindex] & EPT_ADDR_MASK)); -#endif - } - - if ((gpa & ((1UL << ptpshift) - 1)) != 0) { - panic("ept_create_mapping: gpa 0x%016lx and ptpshift %d " - "mismatch\n", gpa, ptpshift); - } - - if (prot != VM_PROT_NONE) { - /* Do the mapping */ - ptp[ptpindex] = hpa; - - /* Apply the access controls */ - if (prot & VM_PROT_READ) - ptp[ptpindex] |= EPT_PG_RD; - if (prot & VM_PROT_WRITE) - ptp[ptpindex] |= EPT_PG_WR; - if (prot & VM_PROT_EXECUTE) - ptp[ptpindex] |= EPT_PG_EX; - - /* - * By default the PAT type is ignored - this appears to - * be how other hypervisors handle EPT. Allow this to be - * overridden. 
- */ - ptp[ptpindex] |= EPT_PG_MEMORY_TYPE(attr); - if (!ept_pat_passthru) - ptp[ptpindex] |= EPT_PG_IGNORE_PAT; - - if (nlevels > 0) - ptp[ptpindex] |= EPT_PG_SUPERPAGE; - } else { - /* Remove the mapping */ - ptp[ptpindex] = 0; - } - - return (1UL << ptpshift); -} - -static vm_paddr_t -ept_lookup_mapping(uint64_t *ptp, vm_paddr_t gpa) -{ - int nlevels, ptpshift, ptpindex; - uint64_t ptpval, hpabase, pgmask; - - nlevels = EPT_PWLEVELS; - while (--nlevels >= 0) { - ptpshift = PAGE_SHIFT + nlevels * 9; - ptpindex = (gpa >> ptpshift) & 0x1FF; - - ptpval = ptp[ptpindex]; - - /* Cannot make progress beyond this point */ - if ((ptpval & (EPT_PG_RD | EPT_PG_WR | EPT_PG_EX)) == 0) - break; - - if (nlevels == 0 || (ptpval & EPT_PG_SUPERPAGE)) { - pgmask = (1UL << ptpshift) - 1; - hpabase = ptpval & ~pgmask; - return (hpabase | (gpa & pgmask)); - } - - /* Work our way down to the next level page table page */ -#ifdef __FreBSD__ - ptp = (uint64_t *)PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK); -#else - ptp = (uint64_t *)hat_kpm_pfn2va(btop(ptpval & EPT_ADDR_MASK)); -#endif - } - - return ((vm_paddr_t)-1); -} - -static void -ept_free_pt_entry(pt_entry_t pte) -{ - if (pte == 0) - return; - - /* sanity check */ - if ((pte & EPT_PG_SUPERPAGE) != 0) - panic("ept_free_pt_entry: pte cannot have superpage bit"); - - return; -} - -static void -ept_free_pd_entry(pd_entry_t pde) -{ - pt_entry_t *pt; - int i; - - if (pde == 0) - return; - - if ((pde & EPT_PG_SUPERPAGE) == 0) { -#ifdef __FreeBSD__ - pt = (pt_entry_t *)PHYS_TO_DMAP(pde & EPT_ADDR_MASK); - for (i = 0; i < NPTEPG; i++) - ept_free_pt_entry(pt[i]); - free(pt, M_VMX); /* free the page table page */ -#else - page_t *pp; - pt = (pt_entry_t *)hat_kpm_pfn2va(btop(pde & EPT_ADDR_MASK)); - for (i = 0; i < NPTEPG; i++) - ept_free_pt_entry(pt[i]); - pp = page_numtopp_nolock(btop(pde & EPT_ADDR_MASK)); - kmem_free((void *)pp->p_offset, PAGE_SIZE); -#endif - } -} - +#ifdef __FreeBSD__ static void -ept_free_pdp_entry(pdp_entry_t pdpe) +invept_single_context(void *arg) { - pd_entry_t *pd; - int i; - - if (pdpe == 0) - return; + struct invept_desc desc = *(struct invept_desc *)arg; - if ((pdpe & EPT_PG_SUPERPAGE) == 0) { -#ifdef __FreeBSD__ - pd = (pd_entry_t *)PHYS_TO_DMAP(pdpe & EPT_ADDR_MASK); - for (i = 0; i < NPDEPG; i++) - ept_free_pd_entry(pd[i]); - free(pd, M_VMX); /* free the page directory page */ -#else - page_t *pp; - pd = (pd_entry_t *)hat_kpm_pfn2va(btop(pdpe & EPT_ADDR_MASK)); - for (i = 0; i < NPDEPG; i++) - ept_free_pd_entry(pd[i]); - pp = page_numtopp_nolock(btop(pdpe & EPT_ADDR_MASK)); - kmem_free((void *)pp->p_offset, PAGE_SIZE); -#endif - } + invept(INVEPT_TYPE_SINGLE_CONTEXT, desc); } -static void -ept_free_pml4_entry(pml4_entry_t pml4e) +void +ept_invalidate_mappings(u_long eptp) { - pdp_entry_t *pdp; - int i; + struct invept_desc invept_desc = { 0 }; - if (pml4e == 0) - return; + invept_desc.eptp = eptp; - if ((pml4e & EPT_PG_SUPERPAGE) == 0) { -#ifdef __FreeBSD__ - pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4e & EPT_ADDR_MASK); - for (i = 0; i < NPDPEPG; i++) - ept_free_pdp_entry(pdp[i]); - free(pdp, M_VMX); /* free the page directory ptr page */ -#else - page_t *pp; - pdp = (pdp_entry_t *)hat_kpm_pfn2va(btop(pml4e - & EPT_ADDR_MASK)); - for (i = 0; i < NPDPEPG; i++) - ept_free_pdp_entry(pdp[i]); - pp = page_numtopp_nolock(btop(pml4e & EPT_ADDR_MASK)); - kmem_free((void *)pp->p_offset, PAGE_SIZE); -#endif - } + smp_rendezvous(NULL, invept_single_context, NULL, &invept_desc); } - +#else /* __FreeBSD__ */ void -ept_vmcleanup(struct vmx *vmx) 
+ept_invalidate_mappings(u_long eptp) { - int i; - - for (i = 0; i < NPML4EPG; i++) - ept_free_pml4_entry(vmx->pml4ept[i]); + hma_vmx_invept_allcpus((uintptr_t)eptp); } +#endif /* __FreeBSD__ */ -int -ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t len, - vm_memattr_t attr, int prot, boolean_t spok) +static int +ept_pinit(pmap_t pmap) { - size_t n; - struct vmx *vmx = arg; - - while (len > 0) { - n = ept_create_mapping(vmx->pml4ept, gpa, hpa, len, attr, - prot, spok); - len -= n; - gpa += n; - hpa += n; - } - return (0); + return (pmap_pinit_type(pmap, PT_EPT, ept_pmap_flags)); } -vm_paddr_t -ept_vmmmap_get(void *arg, vm_paddr_t gpa) +struct vmspace * +ept_vmspace_alloc(vm_offset_t min, vm_offset_t max) { - vm_paddr_t hpa; - struct vmx *vmx; - vmx = arg; - hpa = ept_lookup_mapping(vmx->pml4ept, gpa); - return (hpa); + return (vmspace_alloc(min, max, ept_pinit)); } -static void -invept_single_context(void *arg) +void +ept_vmspace_free(struct vmspace *vmspace) { - struct invept_desc desc = *(struct invept_desc *)arg; - invept(INVEPT_TYPE_SINGLE_CONTEXT, desc); + vmspace_free(vmspace); } -void -ept_invalidate_mappings(u_long pml4ept) +uint64_t +eptp(uint64_t pml4) { - struct invept_desc invept_desc = { 0 }; + uint64_t eptp_val; - invept_desc.eptp = EPTP(pml4ept); + eptp_val = pml4 | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK; + if (ept_enable_ad_bits) + eptp_val |= EPT_ENABLE_AD_BITS; - smp_rendezvous(NULL, invept_single_context, NULL, &invept_desc); + return (eptp_val); } diff --git a/usr/src/uts/i86pc/io/vmm/intel/ept.h b/usr/src/uts/i86pc/io/vmm/intel/ept.h index d0bcce7ec3..4a029e8b22 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/ept.h +++ b/usr/src/uts/i86pc/io/vmm/intel/ept.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/intel/ept.h 245678 2013-01-20 03:42:49Z neel $ + * $FreeBSD$ */ #ifndef _EPT_H_ @@ -31,13 +33,9 @@ struct vmx; -#define EPT_PWLEVELS 4 /* page walk levels */ -#define EPTP(pml4) ((pml4) | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK) - -int ept_init(void); -int ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, - vm_memattr_t attr, int prot, boolean_t allow_superpage_mappings); -vm_paddr_t ept_vmmmap_get(void *arg, vm_paddr_t gpa); -void ept_invalidate_mappings(u_long ept_pml4); -void ept_vmcleanup(struct vmx *vmx); +int ept_init(int ipinum); +void ept_invalidate_mappings(u_long eptp); +struct vmspace *ept_vmspace_alloc(vm_offset_t min, vm_offset_t max); +void ept_vmspace_free(struct vmspace *vmspace); +uint64_t eptp(uint64_t pml4); #endif diff --git a/usr/src/uts/i86pc/io/vmm/intel/offsets.in b/usr/src/uts/i86pc/io/vmm/intel/offsets.in new file mode 100644 index 0000000000..d60a2d8f5f --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/offsets.in @@ -0,0 +1,62 @@ +/* + * COPYRIGHT 2014 Pluribus Networks Inc. + * + * All rights reserved. This copyright notice is Copyright Management + * Information under 17 USC 1202 and is included to protect this work and + * deter copyright infringement. Removal or alteration of this Copyright + * Management Information without the express written permission from + * Pluribus Networks Inc is prohibited, and any such unauthorized removal + * or alteration will be a violation of federal law. 
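
The new eptp() helper replaces the old EPTP() macro from ept.h and packs
the EPT pointer fields: bits 2:0 select the memory type used for the page
walk (write-back, 6), bits 5:3 hold the walk length minus one, and bit 6
enables hardware A/D tracking when ept_enable_ad_bits is set. A worked
example under those assumptions; the PML4 address is hypothetical:

	/*
	 * PML4 at PA 0x1234000, A/D assist on:
	 *   0x1234000 | ((4 - 1) << 3) | PAT_WRITE_BACK | EPT_ENABLE_AD_BITS
	 * = 0x1234000 | 0x18 | 0x6 | 0x40
	 * = 0x123405e
	 * The result is what ultimately lands in the VMCS EPTP field, e.g.:
	 *   error = vmwrite(VMCS_EPTP, eptp(vtophys(pml4_page)));
	 */
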
+ */ +#include +#include +#include +#include + +#include +#include + +#include "intel/vmx_cpufunc.h" +#include "intel/vmx.h" +#include "vm/vm_glue.h" + +vmxctx + guest_rdi VMXCTX_GUEST_RDI + guest_rsi VMXCTX_GUEST_RSI + guest_rdx VMXCTX_GUEST_RDX + guest_rcx VMXCTX_GUEST_RCX + guest_r8 VMXCTX_GUEST_R8 + guest_r9 VMXCTX_GUEST_R9 + guest_rax VMXCTX_GUEST_RAX + guest_rbx VMXCTX_GUEST_RBX + guest_rbp VMXCTX_GUEST_RBP + guest_r10 VMXCTX_GUEST_R10 + guest_r11 VMXCTX_GUEST_R11 + guest_r12 VMXCTX_GUEST_R12 + guest_r13 VMXCTX_GUEST_R13 + guest_r14 VMXCTX_GUEST_R14 + guest_r15 VMXCTX_GUEST_R15 + guest_cr2 VMXCTX_GUEST_CR2 + inst_fail_status VMXCTX_INST_FAIL_STATUS + pmap VMXCTX_PMAP + +vmx + eptgen VMX_EPTGEN + eptp VMX_EPTP + +pmap + pm_active PM_ACTIVE + pm_eptgen PM_EPTGEN + +cpu + cpu_id + +\#define VM_SUCCESS 0 +\#define VM_FAIL_INVALID 1 +\#define VM_FAIL_VALID 2 + +\#define VMX_GUEST_VMEXIT 0 +\#define VMX_VMRESUME_ERROR 1 +\#define VMX_VMLAUNCH_ERROR 2 +\#define VMX_INVEPT_ERROR 3 +\#define VMX_VMWRITE_ERROR 4 diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmcs.c b/usr/src/uts/i86pc/io/vmm/intel/vmcs.c index bbd2da2a34..d19f6bc262 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmcs.c +++ b/usr/src/uts/i86pc/io/vmm/intel/vmcs.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/intel/vmcs.c 266550 2014-05-22 17:22:37Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,6 +38,7 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2014 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #ifdef __FreeBSD__ @@ -43,9 +46,10 @@ #endif #include -__FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/vmcs.c 266550 2014-05-22 17:22:37Z neel $"); +__FBSDID("$FreeBSD$"); #include +#include #include #include @@ -64,6 +68,12 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/vmcs.c 266550 2014-05-22 17:22:37Z #include #endif +SYSCTL_DECL(_hw_vmm_vmx); + +static int no_flush_rsb; +SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, no_flush_rsb, CTLFLAG_RW, + &no_flush_rsb, 0, "Do not flush RSB upon vmexit"); + static uint64_t vmcs_fix_regval(uint32_t encoding, uint64_t val) { @@ -117,6 +127,14 @@ vmcs_field_encoding(int ident) return (VMCS_GUEST_LDTR_SELECTOR); case VM_REG_GUEST_EFER: return (VMCS_GUEST_IA32_EFER); + case VM_REG_GUEST_PDPTE0: + return (VMCS_GUEST_PDPTE0); + case VM_REG_GUEST_PDPTE1: + return (VMCS_GUEST_PDPTE1); + case VM_REG_GUEST_PDPTE2: + return (VMCS_GUEST_PDPTE2); + case VM_REG_GUEST_PDPTE3: + return (VMCS_GUEST_PDPTE3); default: return (-1); } @@ -332,40 +350,15 @@ done: return (error); } -#ifndef __FreeBSD__ -int -vmcs_set_host_msr_save(struct vmcs *vmcs, u_long h_area, u_int h_count) -{ - int error; - - VMPTRLD(vmcs); - - /* - * Host MSRs are loaded from the VM-exit MSR-load area. 
- */ - if ((error = vmwrite(VMCS_EXIT_MSR_LOAD, h_area)) != 0) - goto done; - if ((error = vmwrite(VMCS_EXIT_MSR_LOAD_COUNT, h_count)) != 0) - goto done; - - error = 0; -done: - VMCLEAR(vmcs); - return (error); -} -#endif - int -vmcs_set_defaults(struct vmcs *vmcs, - u_long host_rip, u_long host_rsp, u_long ept_pml4, - uint32_t pinbased_ctls, uint32_t procbased_ctls, - uint32_t procbased_ctls2, uint32_t exit_ctls, - uint32_t entry_ctls, u_long msr_bitmap, uint16_t vpid) +vmcs_init(struct vmcs *vmcs) { int error, codesel, datasel, tsssel; u_long cr0, cr4, efer; - uint64_t eptp, pat, fsbase, idtrbase; - uint32_t exc_bitmap; + uint64_t pat; +#ifdef __FreeBSD__ + uint64_t fsbase, idtrbase; +#endif codesel = vmm_get_host_codesel(); datasel = vmm_get_host_datasel(); @@ -376,34 +369,6 @@ vmcs_set_defaults(struct vmcs *vmcs, */ VMPTRLD(vmcs); - /* - * Load the VMX controls - */ - if ((error = vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls)) != 0) - goto done; - if ((error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls)) != 0) - goto done; - if ((error = vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2)) != 0) - goto done; - if ((error = vmwrite(VMCS_EXIT_CTLS, exit_ctls)) != 0) - goto done; - if ((error = vmwrite(VMCS_ENTRY_CTLS, entry_ctls)) != 0) - goto done; - - /* Guest state */ - - /* Initialize guest IA32_PAT MSR with the default value */ - pat = PAT_VALUE(0, PAT_WRITE_BACK) | - PAT_VALUE(1, PAT_WRITE_THROUGH) | - PAT_VALUE(2, PAT_UNCACHED) | - PAT_VALUE(3, PAT_UNCACHEABLE) | - PAT_VALUE(4, PAT_WRITE_BACK) | - PAT_VALUE(5, PAT_WRITE_THROUGH) | - PAT_VALUE(6, PAT_UNCACHED) | - PAT_VALUE(7, PAT_UNCACHEABLE); - if ((error = vmwrite(VMCS_GUEST_IA32_PAT, pat)) != 0) - goto done; - /* Host state */ /* Initialize host IA32_PAT MSR */ @@ -466,37 +431,35 @@ vmcs_set_defaults(struct vmcs *vmcs, fsbase = vmm_get_host_fsbase(); if ((error = vmwrite(VMCS_HOST_FS_BASE, fsbase)) != 0) goto done; -#endif idtrbase = vmm_get_host_idtrbase(); if ((error = vmwrite(VMCS_HOST_IDTR_BASE, idtrbase)) != 0) goto done; - /* instruction pointer */ - if ((error = vmwrite(VMCS_HOST_RIP, host_rip)) != 0) - goto done; - - /* stack pointer */ - if ((error = vmwrite(VMCS_HOST_RSP, host_rsp)) != 0) - goto done; - - /* eptp */ - eptp = EPTP(ept_pml4); - if ((error = vmwrite(VMCS_EPTP, eptp)) != 0) +#else /* __FreeBSD__ */ + /* + * Configure host sysenter MSRs to be restored on VM exit. + * The thread-specific MSR_INTC_SEP_ESP value is loaded in vmx_run. 
+ */ + if ((error = vmwrite(VMCS_HOST_IA32_SYSENTER_CS, KCS_SEL)) != 0) goto done; - - /* vpid */ - if ((error = vmwrite(VMCS_VPID, vpid)) != 0) + /* Natively defined as MSR_INTC_SEP_EIP */ + if ((error = vmwrite(VMCS_HOST_IA32_SYSENTER_EIP, + rdmsr(MSR_SYSENTER_EIP_MSR))) != 0) goto done; - /* msr bitmap */ - if ((error = vmwrite(VMCS_MSR_BITMAP, msr_bitmap)) != 0) - goto done; +#endif /* __FreeBSD__ */ - /* exception bitmap */ - exc_bitmap = 1 << IDT_MC; - if ((error = vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap)) != 0) - goto done; + /* instruction pointer */ + if (no_flush_rsb) { + if ((error = vmwrite(VMCS_HOST_RIP, + (u_long)vmx_exit_guest)) != 0) + goto done; + } else { + if ((error = vmwrite(VMCS_HOST_RIP, + (u_long)vmx_exit_guest_flush_rsb)) != 0) + goto done; + } /* link pointer */ if ((error = vmwrite(VMCS_LINK_POINTER, ~0)) != 0) diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmcs.h b/usr/src/uts/i86pc/io/vmm/intel/vmcs.h index 20e99e8184..edde5c6dd5 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmcs.h +++ b/usr/src/uts/i86pc/io/vmm/intel/vmcs.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,19 +25,40 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/intel/vmcs.h 276098 2014-12-23 02:14:49Z neel $ + * $FreeBSD$ + */ + +/* + * Copyright 2017 Joyent, Inc. */ #ifndef _VMCS_H_ #define _VMCS_H_ #ifdef _KERNEL +#ifndef _ASM struct vmcs { uint32_t identifier; uint32_t abort_code; char _impl_specific[PAGE_SIZE - sizeof(uint32_t) * 2]; +#ifndef __FreeBSD__ + /* + * Keep the physical address of the VMCS cached adjacent for the + * structure so it can be referenced in contexts which are too delicate + * for a call into the HAT. For the moment it means wasting a whole + * page on padding for the PA value to maintain alignment, but it + * allows the consumers of 'struct vmcs *' to easily access the value + * without a significant change to the interface. + */ + uint64_t vmcs_pa; + char _pa_pad[PAGE_SIZE - sizeof (vm_paddr_t)]; +#endif }; +#ifdef __FreeBSD__ CTASSERT(sizeof(struct vmcs) == PAGE_SIZE); +#else +CTASSERT(sizeof(struct vmcs) == (2*PAGE_SIZE)); +#endif /* MSR save region is composed of an array of 'struct msr_entry' */ struct msr_entry { @@ -47,15 +70,6 @@ struct msr_entry { int vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count); int vmcs_init(struct vmcs *vmcs); -#ifndef __FreeBSD__ -int vmcs_set_host_msr_save(struct vmcs *vmcs, u_long h_area, u_int h_count); -#endif -int vmcs_set_defaults(struct vmcs *vmcs, u_long host_rip, u_long host_rsp, - u_long ept_pml4, - uint32_t pinbased_ctls, uint32_t procbased_ctls, - uint32_t procbased_ctls2, uint32_t exit_ctls, - uint32_t entry_ctls, u_long msr_bitmap, - uint16_t vpid); int vmcs_getreg(struct vmcs *vmcs, int running, int ident, uint64_t *rv); int vmcs_setreg(struct vmcs *vmcs, int running, int ident, uint64_t val); int vmcs_getdesc(struct vmcs *vmcs, int running, int ident, @@ -86,6 +100,65 @@ vmcs_write(uint32_t encoding, uint64_t val) error = vmwrite(encoding, val); KASSERT(error == 0, ("vmcs_write(%u) error %d", encoding, error)); } + +#ifndef __FreeBSD__ +/* + * Due to header complexity combined with the need to cache the physical + * address for the VMCS, these must be defined here rather than vmx_cpufunc.h. 
+ */ +static __inline int +vmclear(struct vmcs *vmcs) +{ + int error; + uint64_t addr = vmcs->vmcs_pa; + + __asm __volatile("vmclear %[addr];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [addr] "m" (*(uint64_t *)&addr) + : "memory"); + return (error); +} + +static __inline int +vmptrld(struct vmcs *vmcs) +{ + int error; + uint64_t addr = vmcs->vmcs_pa; + + __asm __volatile("vmptrld %[addr];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [addr] "m" (*(uint64_t *)&addr) + : "memory"); + return (error); +} + +static __inline void +VMCLEAR(struct vmcs *vmcs) +{ + int err; + + err = vmclear(vmcs); + if (err != 0) + panic("%s: vmclear(%p) error %d", __func__, vmcs, err); + + critical_exit(); +} + +static __inline void +VMPTRLD(struct vmcs *vmcs) +{ + int err; + + critical_enter(); + + err = vmptrld(vmcs); + if (err != 0) + panic("%s: vmptrld(%p) error %d", __func__, vmcs, err); +} +#endif /* __FreeBSD__ */ + #endif /* _VMX_CPUFUNC_H_ */ #define vmexit_instruction_length() vmcs_read(VMCS_EXIT_INSTRUCTION_LENGTH) @@ -99,6 +172,7 @@ vmcs_write(uint32_t encoding, uint64_t val) #define vmcs_idt_vectoring_info() vmcs_read(VMCS_IDT_VECTORING_INFO) #define vmcs_idt_vectoring_err() vmcs_read(VMCS_IDT_VECTORING_ERROR) +#endif /* _ASM */ #endif /* _KERNEL */ #define VMCS_INITIAL 0xffffffffffffffff @@ -345,6 +419,14 @@ vmcs_write(uint32_t encoding, uint64_t val) #define EXIT_REASON_WBINVD 54 #define EXIT_REASON_XSETBV 55 #define EXIT_REASON_APIC_WRITE 56 +#define EXIT_REASON_RDRAND 57 +#define EXIT_REASON_INVPCID 58 +#define EXIT_REASON_VMFUNC 59 +#define EXIT_REASON_ENCLS 60 +#define EXIT_REASON_RDSEED 61 +#define EXIT_REASON_PM_LOG_FULL 62 +#define EXIT_REASON_XSAVES 63 +#define EXIT_REASON_XRSTORS 64 /* * NMI unblocking due to IRET. diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx.c b/usr/src/uts/i86pc/io/vmm/intel/vmx.c index 7ddf4e2a46..ce42ff8c9c 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmx.c +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx.c @@ -1,6 +1,9 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. + * Copyright (c) 2018 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -23,7 +26,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/intel/vmx.c 284174 2015-06-09 00:14:47Z tychon $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,10 +39,11 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2015 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. 
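
VMPTRLD() and VMCLEAR() above deliberately bracket a critical section: once
a VMCS is current on a CPU, the thread must not migrate until it is
vmclear'd, or VMREAD/VMWRITE would target the wrong (or no) VMCS. Every
access therefore happens between the pair, as in this sketch (the function
is illustrative; vmcs_read() is defined earlier in this header and
VMCS_GUEST_RIP in its encoding list):

	/* Illustrative only: safe VMCS access pattern on illumos. */
	static uint64_t
	example_read_guest_rip(struct vmcs *vmcs)
	{
		uint64_t rip;

		VMPTRLD(vmcs);		/* critical_enter() + vmptrld */
		rip = vmcs_read(VMCS_GUEST_RIP);
		VMCLEAR(vmcs);		/* vmclear + critical_exit() */
		return (rip);
	}
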
*/ #include -__FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/vmx.c 284174 2015-06-09 00:14:47Z tychon $"); +__FBSDID("$FreeBSD$"); #include #include @@ -50,12 +54,21 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/vmx.c 284174 2015-06-09 00:14:47Z t #include #include +#ifndef __FreeBSD__ +#include +#include +#include +#include +#include +#endif + #include #include #include #include #include +#include #include #include #include @@ -75,6 +88,7 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/vmx.c 284174 2015-06-09 00:14:47Z t #include "ept.h" #include "vmx_cpufunc.h" +#include "vmcs.h" #include "vmx.h" #include "vmx_msr.h" #include "x86.h" @@ -90,13 +104,30 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/vmx.c 284174 2015-06-09 00:14:47Z t (PROCBASED_INT_WINDOW_EXITING | \ PROCBASED_NMI_WINDOW_EXITING) +#ifdef __FreeBSD__ +#define PROCBASED_CTLS_ONE_SETTING \ + (PROCBASED_SECONDARY_CONTROLS | \ + PROCBASED_MWAIT_EXITING | \ + PROCBASED_MONITOR_EXITING | \ + PROCBASED_IO_EXITING | \ + PROCBASED_MSR_BITMAPS | \ + PROCBASED_CTLS_WINDOW_SETTING | \ + PROCBASED_CR8_LOAD_EXITING | \ + PROCBASED_CR8_STORE_EXITING) +#else +/* We consider TSC offset a necessity for unsynched TSC handling */ #define PROCBASED_CTLS_ONE_SETTING \ (PROCBASED_SECONDARY_CONTROLS | \ + PROCBASED_TSC_OFFSET | \ + PROCBASED_MWAIT_EXITING | \ + PROCBASED_MONITOR_EXITING | \ PROCBASED_IO_EXITING | \ PROCBASED_MSR_BITMAPS | \ PROCBASED_CTLS_WINDOW_SETTING | \ PROCBASED_CR8_LOAD_EXITING | \ PROCBASED_CR8_STORE_EXITING) +#endif /* __FreeBSD__ */ + #define PROCBASED_CTLS_ZERO_SETTING \ (PROCBASED_CR3_LOAD_EXITING | \ PROCBASED_CR3_STORE_EXITING | \ @@ -106,20 +137,21 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/vmx.c 284174 2015-06-09 00:14:47Z t #define PROCBASED_CTLS2_ZERO_SETTING 0 #define VM_EXIT_CTLS_ONE_SETTING \ - (VM_EXIT_HOST_LMA | \ + (VM_EXIT_SAVE_DEBUG_CONTROLS | \ + VM_EXIT_HOST_LMA | \ + VM_EXIT_LOAD_PAT | \ VM_EXIT_SAVE_EFER | \ VM_EXIT_LOAD_EFER | \ - VM_EXIT_LOAD_PAT | \ - VM_EXIT_SAVE_PAT | \ - VM_EXIT_LOAD_PAT) + VM_EXIT_ACKNOWLEDGE_INTERRUPT) -#define VM_EXIT_CTLS_ZERO_SETTING VM_EXIT_SAVE_DEBUG_CONTROLS +#define VM_EXIT_CTLS_ZERO_SETTING 0 -#define VM_ENTRY_CTLS_ONE_SETTING (VM_ENTRY_LOAD_EFER | VM_ENTRY_LOAD_PAT) +#define VM_ENTRY_CTLS_ONE_SETTING \ + (VM_ENTRY_LOAD_DEBUG_CONTROLS | \ + VM_ENTRY_LOAD_EFER) #define VM_ENTRY_CTLS_ZERO_SETTING \ - (VM_ENTRY_LOAD_DEBUG_CONTROLS | \ - VM_ENTRY_INTO_SMM | \ + (VM_ENTRY_INTO_SMM | \ VM_ENTRY_DEACTIVATE_DUAL_MONITOR) #define HANDLED 1 @@ -131,11 +163,10 @@ static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic"); SYSCTL_DECL(_hw_vmm); SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL); +#ifdef __FreeBSD__ int vmxon_enabled[MAXCPU]; static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); -#ifndef __FreeBSD__ -static vm_paddr_t vmxon_region_pa[MAXCPU]; -#endif +#endif /*__FreeBSD__ */ static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2; static uint32_t exit_ctls, entry_ctls; @@ -159,29 +190,135 @@ SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD, /* * Optional capabilities */ +SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap, CTLFLAG_RW, NULL, NULL); + static int cap_halt_exit; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, halt_exit, CTLFLAG_RD, &cap_halt_exit, 0, + "HLT triggers a VM-exit"); + static int cap_pause_exit; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit, + 0, "PAUSE triggers a VM-exit"); + static int cap_unrestricted_guest; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD, + 
&cap_unrestricted_guest, 0, "Unrestricted guests"); + static int cap_monitor_trap; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD, + &cap_monitor_trap, 0, "Monitor trap flag"); + static int cap_invpcid; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid, + 0, "Guests are allowed to use INVPCID"); static int virtual_interrupt_delivery; -SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD, +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD, &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support"); static int posted_interrupts; -SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupts, CTLFLAG_RD, +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts, CTLFLAG_RD, &posted_interrupts, 0, "APICv posted interrupt support"); -static int pirvec; +static int pirvec = -1; SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD, &pirvec, 0, "APICv posted interrupt vector"); +#ifdef __FreeBSD__ static struct unrhdr *vpid_unr; +#endif /* __FreeBSD__ */ static u_int vpid_alloc_failed; SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD, &vpid_alloc_failed, 0, NULL); +static int guest_l1d_flush; +SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush, CTLFLAG_RD, + &guest_l1d_flush, 0, NULL); +static int guest_l1d_flush_sw; +SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush_sw, CTLFLAG_RD, + &guest_l1d_flush_sw, 0, NULL); + +static struct msr_entry msr_load_list[1] __aligned(16); + +/* + * The definitions of SDT probes for VMX. + */ + +SDT_PROBE_DEFINE3(vmm, vmx, exit, entry, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, taskswitch, + "struct vmx *", "int", "struct vm_exit *", "struct vm_task_switch *"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, craccess, + "struct vmx *", "int", "struct vm_exit *", "uint64_t"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, rdmsr, + "struct vmx *", "int", "struct vm_exit *", "uint32_t"); + +SDT_PROBE_DEFINE5(vmm, vmx, exit, wrmsr, + "struct vmx *", "int", "struct vm_exit *", "uint32_t", "uint64_t"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, halt, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, mtrap, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, pause, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, intrwindow, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, interrupt, + "struct vmx *", "int", "struct vm_exit *", "uint32_t"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, nmiwindow, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, inout, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, cpuid, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE5(vmm, vmx, exit, exception, + "struct vmx *", "int", "struct vm_exit *", "uint32_t", "int"); + +SDT_PROBE_DEFINE5(vmm, vmx, exit, nestedfault, + "struct vmx *", "int", "struct vm_exit *", "uint64_t", "uint64_t"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, mmiofault, + "struct vmx *", "int", "struct vm_exit *", "uint64_t"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, eoi, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, apicaccess, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, apicwrite, + "struct vmx *", "int", "struct vm_exit *", "struct vlapic *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, xsetbv, + "struct vmx *", "int", "struct vm_exit 
*"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, monitor, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, mwait, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, vminsn, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, unknown, + "struct vmx *", "int", "struct vm_exit *", "uint32_t"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, return, + "struct vmx *", "int", "struct vm_exit *", "int"); + /* * Use the last page below 4GB as the APIC access address. This address is * occupied by the boot firmware so it is guaranteed that it will not conflict @@ -193,6 +330,9 @@ static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc); static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval); static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val); static void vmx_inject_pir(struct vlapic *vlapic); +#ifndef __FreeBSD__ +static int vmx_apply_tsc_adjust(struct vmx *, int); +#endif /* __FreeBSD__ */ #ifdef KTR static const char * @@ -279,8 +419,8 @@ exit_reason_to_str(int reason) return "monitor"; case EXIT_REASON_PAUSE: return "pause"; - case EXIT_REASON_MCE: - return "mce"; + case EXIT_REASON_MCE_DURING_ENTRY: + return "mce-during-entry"; case EXIT_REASON_TPR: return "tpr"; case EXIT_REASON_APIC_ACCESS: @@ -312,83 +452,6 @@ exit_reason_to_str(int reason) return (reasonbuf); } } - -#ifdef SETJMP_TRACE -static const char * -vmx_setjmp_rc2str(int rc) -{ - switch (rc) { - case VMX_RETURN_DIRECT: - return "direct"; - case VMX_RETURN_LONGJMP: - return "longjmp"; - case VMX_RETURN_VMRESUME: - return "vmresume"; - case VMX_RETURN_VMLAUNCH: - return "vmlaunch"; - case VMX_RETURN_AST: - return "ast"; - default: - return "unknown"; - } -} - -#define SETJMP_TRACE(vmx, vcpu, vmxctx, regname) \ - VMM_CTR1((vmx)->vm, (vcpu), "setjmp trace " #regname " 0x%016lx", \ - (vmxctx)->regname) - -static void -vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc) -{ - uint64_t host_rip, host_rsp; - - if (vmxctx != &vmx->ctx[vcpu]) - panic("vmx_setjmp_trace: invalid vmxctx %p; should be %p", - vmxctx, &vmx->ctx[vcpu]); - - VMM_CTR1((vmx)->vm, (vcpu), "vmxctx = %p", vmxctx); - VMM_CTR2((vmx)->vm, (vcpu), "setjmp return code %s(%d)", - vmx_setjmp_rc2str(rc), rc); - - host_rsp = host_rip = ~0; - vmread(VMCS_HOST_RIP, &host_rip); - vmread(VMCS_HOST_RSP, &host_rsp); - VMM_CTR2((vmx)->vm, (vcpu), "vmcs host_rip 0x%016lx, host_rsp 0x%016lx", - host_rip, host_rsp); - - SETJMP_TRACE(vmx, vcpu, vmxctx, host_r15); - SETJMP_TRACE(vmx, vcpu, vmxctx, host_r14); - SETJMP_TRACE(vmx, vcpu, vmxctx, host_r13); - SETJMP_TRACE(vmx, vcpu, vmxctx, host_r12); - SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbp); - SETJMP_TRACE(vmx, vcpu, vmxctx, host_rsp); - SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbx); - SETJMP_TRACE(vmx, vcpu, vmxctx, host_rip); - - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdi); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rsi); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdx); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rcx); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r8); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r9); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rax); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbx); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbp); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r10); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r11); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r12); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r13); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r14); - SETJMP_TRACE(vmx, vcpu, vmxctx, 
guest_r15); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_cr2); -} -#endif -#else -static void __inline -vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc) -{ - return; -} #endif /* KTR */ static int @@ -411,7 +474,7 @@ vmx_allow_x2apic_msrs(struct vmx *vmx) for (i = 0; i < 8; i++) error += guest_msr_ro(vmx, MSR_APIC_TMR0 + i); - + for (i = 0; i < 8; i++) error += guest_msr_ro(vmx, MSR_APIC_IRR0 + i); @@ -465,7 +528,11 @@ vpid_free(int vpid) */ if (vpid > VM_MAXCPU) +#ifdef __FreeBSD__ free_unr(vpid_unr, vpid); +#else + hma_vmx_vpid_free((uint16_t)vpid); +#endif } static void @@ -490,7 +557,14 @@ vpid_alloc(uint16_t *vpid, int num) * Allocate a unique VPID for each vcpu from the unit number allocator. */ for (i = 0; i < num; i++) { +#ifdef __FreeBSD__ x = alloc_unr(vpid_unr); +#else + uint16_t tmp; + + tmp = hma_vmx_vpid_alloc(); + x = (tmp == 0) ? -1 : tmp; +#endif if (x == -1) break; else @@ -519,6 +593,7 @@ vpid_alloc(uint16_t *vpid, int num) } } +#ifdef __FreeBSD__ static void vpid_init(void) { @@ -535,50 +610,6 @@ vpid_init(void) vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL); } -#ifndef __FreeBSD__ -static void -msr_save_area_init(struct msr_entry *g_area, int *g_count) -{ - int cnt; - - static struct msr_entry guest_msrs[] = { - { MSR_KGSBASE, 0, 0 }, - { MSR_LSTAR, 0, 0 }, - { MSR_CSTAR, 0, 0 }, - { MSR_STAR, 0, 0 }, - { MSR_SF_MASK, 0, 0 }, - }; - - cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]); - if (cnt > GUEST_MSR_MAX_ENTRIES) - panic("guest msr save area overrun"); - bcopy(guest_msrs, g_area, sizeof(guest_msrs)); - *g_count = cnt; -} - -static void -host_msr_save_area_init(struct msr_entry *h_area, int *h_count) -{ - int i, cnt; - - static struct msr_entry host_msrs[] = { - { MSR_LSTAR, 0, 0 }, - { MSR_CSTAR, 0, 0 }, - { MSR_STAR, 0, 0 }, - { MSR_SF_MASK, 0, 0 }, - }; - - cnt = sizeof(host_msrs) / sizeof(host_msrs[0]); - if (cnt > HOST_MSR_MAX_ENTRIES) - panic("host msr save area overrun"); - for (i = 0; i < cnt; i++) { - host_msrs[i].val = rdmsr(host_msrs[i].index); - } - bcopy(host_msrs, h_area, sizeof(host_msrs)); - *h_count = cnt; -} -#endif - static void vmx_disable(void *arg __unused) { @@ -603,17 +634,18 @@ vmx_disable(void *arg __unused) static int vmx_cleanup(void) { - -#ifdef __FreeBSD__ - if (pirvec != 0) - vmm_ipi_free(pirvec); -#endif + + if (pirvec >= 0) + lapic_ipi_free(pirvec); if (vpid_unr != NULL) { delete_unrhdr(vpid_unr); vpid_unr = NULL; } + if (nmi_flush_l1d_sw == 1) + nmi_flush_l1d_sw = 0; + smp_rendezvous(NULL, vmx_disable, NULL, NULL); return (0); @@ -636,40 +668,50 @@ vmx_enable(void *arg __unused) load_cr4(rcr4() | CR4_VMXE); *(uint32_t *)vmxon_region[curcpu] = vmx_revision(); -#ifdef __FreeBSD__ error = vmxon(vmxon_region[curcpu]); -#else - error = vmxon_pa(vmxon_region_pa[curcpu]); - ASSERT(error == 0); -#endif if (error == 0) vmxon_enabled[curcpu] = 1; } +static void +vmx_restore(void) +{ + + if (vmxon_enabled[curcpu]) + vmxon(vmxon_region[curcpu]); +} +#else /* __FreeBSD__ */ static int -vmx_init(void) +vmx_cleanup(void) { -#define X86FSET_VMX 35 - extern uchar_t x86_featureset[]; - extern boolean_t is_x86_feature(void *featureset, uint_t feature); - int error; - uint64_t fixed0, fixed1, feature_control; - uint32_t tmp; -#ifndef __FreeBSD__ - int i; + /* This is taken care of by the hma registration */ + return (0); +} + +static void +vmx_restore(void) +{ + /* No-op on illumos */ +} +#endif /* __FreeBSD__ */ + +static int +vmx_init(int ipinum) +{ + int error, use_tpr_shadow; +#ifdef __FreeBSD__ + uint64_t basic, fixed0, 
fixed1, feature_control; +#else + uint64_t fixed0, fixed1; #endif + uint32_t tmp, procbased2_vid_bits; +#ifdef __FreeBSD__ /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */ -#ifdef __FreeBSD__ if (!(cpu_feature2 & CPUID2_VMX)) { printf("vmx_init: processor does not support VMX operation\n"); return (ENXIO); } -#else - if (!is_x86_feature(x86_featureset, X86FSET_VMX)) { - cmn_err(CE_WARN, "vmx_init: processor does not support VMX operation\n"); - } -#endif /* * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits @@ -682,6 +724,18 @@ vmx_init(void) return (ENXIO); } + /* + * Verify capabilities MSR_VMX_BASIC: + * - bit 54 indicates support for INS/OUTS decoding + */ + basic = rdmsr(MSR_VMX_BASIC); + if ((basic & (1UL << 54)) == 0) { + printf("vmx_init: processor does not support desired basic " + "capabilities\n"); + return (EINVAL); + } +#endif /* __FreeBSD__ */ + /* Check support for primary processor-based VM-execution controls */ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_TRUE_PROCBASED_CTLS, @@ -769,13 +823,119 @@ vmx_init(void) PROCBASED2_UNRESTRICTED_GUEST, 0, &tmp) == 0); + cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, + MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0, + &tmp) == 0); + + /* + * Check support for virtual interrupt delivery. + */ + procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES | + PROCBASED2_VIRTUALIZE_X2APIC_MODE | + PROCBASED2_APIC_REGISTER_VIRTUALIZATION | + PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY); + + use_tpr_shadow = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0, + &tmp) == 0); + + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, + procbased2_vid_bits, 0, &tmp); + if (error == 0 && use_tpr_shadow) { + virtual_interrupt_delivery = 1; + TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid", + &virtual_interrupt_delivery); + } + + if (virtual_interrupt_delivery) { + procbased_ctls |= PROCBASED_USE_TPR_SHADOW; + procbased_ctls2 |= procbased2_vid_bits; + procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE; + + /* + * No need to emulate accesses to %CR8 if virtual + * interrupt delivery is enabled. + */ + procbased_ctls &= ~PROCBASED_CR8_LOAD_EXITING; + procbased_ctls &= ~PROCBASED_CR8_STORE_EXITING; + + /* + * Check for Posted Interrupts only if Virtual Interrupt + * Delivery is enabled. + */ + error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, + MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0, + &tmp); + if (error == 0) { +#ifdef __FreeBSD__ + pirvec = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) : + &IDTVEC(justreturn)); + if (pirvec < 0) { + if (bootverbose) { + printf("vmx_init: unable to allocate " + "posted interrupt vector\n"); + } + } else { + posted_interrupts = 1; + TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir", + &posted_interrupts); + } +#else + /* + * If the PSM-provided interfaces for requesting and + * using a PIR IPI vector are present, use them for + * posted interrupts. 
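+ *
+ * [Editorial sketch, not part of this patch: the vmx_set_ctlreg()
+ * probes above rely on the layout of the VMX capability MSRs, in
+ * which the low 32 bits report the allowed 0-settings (a bit set
+ * there must be 1 in the control) and the high 32 bits report the
+ * allowed 1-settings (a bit clear there must be 0).  A minimal
+ * decode, using a made-up rdmsr() value:
+ *
+ *	uint64_t cap = 0xfff9fffe0401e172ULL;	// illustrative value
+ *	uint32_t must_be_one = (uint32_t)cap;
+ *	uint32_t may_be_one = (uint32_t)(cap >> 32);
+ *	uint32_t want = 1u << 28;		// e.g. "use MSR bitmaps"
+ *	int ok = ((may_be_one & want) == want);
+ *	uint32_t ctls = must_be_one | (ok ? want : 0);
+ *
+ * One-settings refused this way are what gates the optional
+ * capabilities (INVPCID, virtual interrupt delivery, ...) above.]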
+ */ + if (psm_get_pir_ipivect != NULL && + psm_send_pir_ipi != NULL) { + pirvec = psm_get_pir_ipivect(); + posted_interrupts = 1; + } +#endif + } + } + + if (posted_interrupts) + pinbased_ctls |= PINBASED_POSTED_INTERRUPT; + /* Initialize EPT */ - error = ept_init(); + error = ept_init(ipinum); if (error) { printf("vmx_init: ept initialization failed (%d)\n", error); return (error); } +#ifdef __FreeBSD__ + guest_l1d_flush = (cpu_ia32_arch_caps & + IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) == 0; + TUNABLE_INT_FETCH("hw.vmm.l1d_flush", &guest_l1d_flush); + + /* + * L1D cache flush is enabled. Use IA32_FLUSH_CMD MSR when + * available. Otherwise fall back to the software flush + * method which loads enough data from the kernel text to + * flush existing L1D content, both on VMX entry and on NMI + * return. + */ + if (guest_l1d_flush) { + if ((cpu_stdext_feature3 & CPUID_STDEXT3_L1D_FLUSH) == 0) { + guest_l1d_flush_sw = 1; + TUNABLE_INT_FETCH("hw.vmm.l1d_flush_sw", + &guest_l1d_flush_sw); + } + if (guest_l1d_flush_sw) { + if (nmi_flush_l1d_sw <= 1) + nmi_flush_l1d_sw = 1; + } else { + msr_load_list[0].index = MSR_IA32_FLUSH_CMD; + msr_load_list[0].val = IA32_FLUSH_CMD_L1D; + } + } +#else + /* L1D flushing is taken care of by smt_acquire() and friends */ + guest_l1d_flush = 0; +#endif /* __FreeBSD__ */ + /* * Stash the cr0 and cr4 bits that must be fixed to 0 or 1 */ @@ -801,24 +961,52 @@ vmx_init(void) cr4_ones_mask = fixed0 & fixed1; cr4_zeros_mask = ~fixed0 & ~fixed1; -#ifndef __FreeBSD__ - for (i = 0; i < MAXCPU; i++) { - vmxon_region_pa[i] = vtophys(&vmxon_region[i]); - } -#endif - +#ifdef __FreeBSD__ vpid_init(); +#endif vmx_msr_init(); +#ifdef __FreeBSD__ /* enable VMX operation */ smp_rendezvous(NULL, vmx_enable, NULL, NULL); +#endif vmx_initialized = 1; return (0); } +static void +vmx_trigger_hostintr(int vector) +{ +#ifdef __FreeBSD__ + uintptr_t func; + struct gate_descriptor *gd; + + gd = &idt[vector]; + + KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: " + "invalid vector %d", vector)); + KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present", + vector)); + KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d " + "has invalid type %d", vector, gd->gd_type)); + KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d " + "has invalid dpl %d", vector, gd->gd_dpl)); + KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor " + "for vector %d has invalid selector %d", vector, gd->gd_selector)); + KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid " + "IST %d", vector, gd->gd_ist)); + + func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset); + vmx_call_isr(func); +#else + VERIFY(vector >= 32 && vector <= 255); + vmx_call_isr(vector - 32); +#endif /* __FreeBSD__ */ +} + static int vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial) { @@ -852,15 +1040,14 @@ vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial) #define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init)) static void * -vmx_vminit(struct vm *vm) +vmx_vminit(struct vm *vm, pmap_t pmap) { uint16_t vpid[VM_MAXCPU]; - int i, error, guest_msr_count; -#ifndef __FreeBSD__ - int host_msr_count; -#endif + int i, error; struct vmx *vmx; struct vmcs *vmcs; + uint32_t exc_bitmap; + uint16_t maxcpus; vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO); if ((uintptr_t)vmx & PAGE_MASK) { @@ -869,6 +1056,8 @@ vmx_vminit(struct vm *vm) } vmx->vm = vm; + vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4)); + /* * 
Clean up EPTP-tagged guest physical and combined mappings * @@ -878,7 +1067,7 @@ vmx_vminit(struct vm *vm) * * Combined mappings for this EP4TA are also invalidated for all VPIDs. */ - ept_invalidate_mappings(vtophys(vmx->pml4ept)); + ept_invalidate_mappings(vmx->eptp); msr_bitmap_initialize(vmx->msr_bitmap); @@ -896,10 +1085,6 @@ vmx_vminit(struct vm *vm) * VM exit and entry respectively. It is also restored from the * host VMCS area on a VM exit. * - * MSR_PAT is saved and restored in the guest VMCS are on a VM exit - * and entry respectively. It is also restored from the host VMCS - * area on a VM exit. - * * The TSC MSR is exposed read-only. Writes are disallowed as * that will impact the host TSC. If the guest does a write * the "use TSC offsetting" execution control is enabled and the @@ -912,15 +1097,36 @@ vmx_vminit(struct vm *vm) guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) || guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) || guest_msr_rw(vmx, MSR_EFER) || - guest_msr_rw(vmx, MSR_PAT) || guest_msr_ro(vmx, MSR_TSC)) panic("vmx_vminit: error setting guest msr access"); vpid_alloc(vpid, VM_MAXCPU); - for (i = 0; i < VM_MAXCPU; i++) { + if (virtual_interrupt_delivery) { + error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE, + APIC_ACCESS_ADDRESS); + /* XXX this should really return an error to the caller */ + KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error)); + } + + maxcpus = vm_get_maxcpus(vm); + for (i = 0; i < maxcpus; i++) { +#ifndef __FreeBSD__ + /* + * Cache physical address lookups for various components which + * may be required inside the critical_enter() section implied + * by VMPTRLD() below. + */ + vm_paddr_t msr_bitmap_pa = vtophys(vmx->msr_bitmap); + vm_paddr_t apic_page_pa = vtophys(&vmx->apic_page[i]); + vm_paddr_t pir_desc_pa = vtophys(&vmx->pir_desc[i]); +#endif /* __FreeBSD__ */ + vmcs = &vmx->vmcs[i]; vmcs->identifier = vmx_revision(); +#ifndef __FreeBSD__ + vmcs->vmcs_pa = (uint64_t)vtophys(vmcs); +#endif error = vmclear(vmcs); if (error != 0) { panic("vmx_vminit: vmclear error %d on vcpu %d\n", @@ -929,42 +1135,83 @@ vmx_vminit(struct vm *vm) vmx_msr_guest_init(vmx, i); - error = vmcs_set_defaults(vmcs, - (u_long)vmx_longjmp, - (u_long)&vmx->ctx[i], - vtophys(vmx->pml4ept), - pinbased_ctls, - procbased_ctls, - procbased_ctls2, - exit_ctls, entry_ctls, - vtophys(vmx->msr_bitmap), - vpid[i]); - - if (error != 0) - panic("vmx_vminit: vmcs_set_defaults error %d", error); + error = vmcs_init(vmcs); + KASSERT(error == 0, ("vmcs_init error %d", error)); - vmx->cap[i].set = 0; - vmx->cap[i].proc_ctls = procbased_ctls; + VMPTRLD(vmcs); + error = 0; +#ifdef __FreeBSD__ + /* + * The illumos vmx_enter_guest implementation avoids some of + * the %rsp-manipulation games which are present in the stock + * one from FreeBSD. 
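+ *
+ * [Editorial sketch, not part of this patch: the eptp() value
+ * cached in vmx->eptp above packs the PML4 physical address with
+ * the EPTP format bits from the Intel SDM; macro names here are
+ * illustrative:
+ *
+ *	#define	EPTP_MEMTYPE_WB	6ULL		// bits 2:0, write-back
+ *	#define	EPTP_PWL_4	(3ULL << 3)	// bits 5:3, walk length - 1
+ *
+ *	static inline uint64_t
+ *	eptp_sketch(uint64_t pml4_pa)
+ *	{
+ *		return (pml4_pa | EPTP_MEMTYPE_WB | EPTP_PWL_4);
+ *	}
+ *
+ * The low 12 bits are free for format bits because pml4_pa is
+ * page-aligned.]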
+ */ + error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]); +#endif + error += vmwrite(VMCS_EPTP, vmx->eptp); + error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls); + error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls); + error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2); + error += vmwrite(VMCS_EXIT_CTLS, exit_ctls); + error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls); +#ifdef __FreeBSD__ + error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap)); +#else + error += vmwrite(VMCS_MSR_BITMAP, msr_bitmap_pa); +#endif + error += vmwrite(VMCS_VPID, vpid[i]); + + if (guest_l1d_flush && !guest_l1d_flush_sw) { + vmcs_write(VMCS_ENTRY_MSR_LOAD, pmap_kextract( + (vm_offset_t)&msr_load_list[0])); + vmcs_write(VMCS_ENTRY_MSR_LOAD_COUNT, + nitems(msr_load_list)); + vmcs_write(VMCS_EXIT_MSR_STORE, 0); + vmcs_write(VMCS_EXIT_MSR_STORE_COUNT, 0); + } - vmx->state[i].lastcpu = -1; - vmx->state[i].vpid = vpid[i]; + /* exception bitmap */ + if (vcpu_trace_exceptions(vm, i)) + exc_bitmap = 0xffffffff; + else + exc_bitmap = 1 << IDT_MC; + error += vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap); -#ifndef __FreeBSD__ - msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count); + vmx->ctx[i].guest_dr6 = DBREG_DR6_RESERVED1; + error += vmwrite(VMCS_GUEST_DR7, DBREG_DR7_RESERVED1); - error = vmcs_set_msr_save(vmcs, vtophys(vmx->guest_msrs[i]), - guest_msr_count); - if (error != 0) - panic("vmcs_set_msr_save error %d", error); + if (virtual_interrupt_delivery) { + error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS); +#ifdef __FreeBSD__ + error += vmwrite(VMCS_VIRTUAL_APIC, + vtophys(&vmx->apic_page[i])); +#else + error += vmwrite(VMCS_VIRTUAL_APIC, apic_page_pa); +#endif + error += vmwrite(VMCS_EOI_EXIT0, 0); + error += vmwrite(VMCS_EOI_EXIT1, 0); + error += vmwrite(VMCS_EOI_EXIT2, 0); + error += vmwrite(VMCS_EOI_EXIT3, 0); + } + if (posted_interrupts) { + error += vmwrite(VMCS_PIR_VECTOR, pirvec); +#ifdef __FreeBSD__ + error += vmwrite(VMCS_PIR_DESC, + vtophys(&vmx->pir_desc[i])); +#else + error += vmwrite(VMCS_PIR_DESC, pir_desc_pa); +#endif + } + VMCLEAR(vmcs); + KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs")); - host_msr_save_area_init(vmx->host_msrs[i], &host_msr_count); + vmx->cap[i].set = 0; + vmx->cap[i].proc_ctls = procbased_ctls; + vmx->cap[i].proc_ctls2 = procbased_ctls2; - error = vmcs_set_host_msr_save(&vmx->vmcs[i], - vtophys(vmx->host_msrs[i]), - host_msr_count); - if (error != 0) - panic("vmcs_set_msr_save error %d", error); -#endif + vmx->state[i].nextrip = ~0; + vmx->state[i].lastcpu = NOCPU; + vmx->state[i].vpid = vpid[i]; /* * Set up the CR0/4 shadows, and init the read shadow @@ -979,6 +1226,8 @@ vmx_vminit(struct vm *vm) error = vmx_setup_cr4_shadow(vmcs, 0); if (error != 0) panic("vmx_setup_cr4_shadow %d", error); + + vmx->ctx[i].pmap = pmap; } return (vmx); @@ -987,9 +1236,13 @@ vmx_vminit(struct vm *vm) static int vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx) { +#ifdef __FreeBSD__ int handled, func; - + func = vmxctx->guest_rax; +#else + int handled; +#endif handled = x86_emulate_cpuid(vm, vcpu, (uint32_t*)(&vmxctx->guest_rax), @@ -1016,6 +1269,8 @@ vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason, handled ? 
"handled" : "unhandled", exit_reason_to_str(exit_reason), rip); #endif + DTRACE_PROBE3(vmm__vexit, int, vcpu, uint64_t, rip, + uint32_t, exit_reason); } static __inline void @@ -1026,36 +1281,40 @@ vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip) #endif } -static void -vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu) +static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved"); +static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done"); + +/* + * Invalidate guest mappings identified by its vpid from the TLB. + */ +static __inline void +vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running) { struct vmxstate *vmxstate; - struct invvpid_desc invvpid_desc = { 0 }; -#ifndef __FreeBSD__ - desctbr_t idtr, gdtr; -#endif + struct invvpid_desc invvpid_desc; vmxstate = &vmx->state[vcpu]; - vmcs_write(VMCS_HOST_FS_BASE, vmm_get_host_fsbase()); - if (vmxstate->lastcpu == curcpu) + if (vmxstate->vpid == 0) return; - vmxstate->lastcpu = curcpu; - - vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); - - vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); - vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); - vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); + if (!running) { + /* + * Set the 'lastcpu' to an invalid host cpu. + * + * This will invalidate TLB entries tagged with the vcpu's + * vpid the next time it runs via vmx_set_pcpu_defaults(). + */ + vmxstate->lastcpu = NOCPU; + return; + } -#ifndef __FreeBSD__ - vmcs_write(VMCS_HOST_IA32_SYSENTER_CS, rdmsr(MSR_SYSENTER_CS_MSR)); - vmcs_write(VMCS_HOST_IA32_SYSENTER_ESP, rdmsr(MSR_SYSENTER_ESP_MSR)); - vmcs_write(VMCS_HOST_IA32_SYSENTER_EIP, rdmsr(MSR_SYSENTER_EIP_MSR)); +#ifdef __FreeBSD__ + KASSERT(curthread->td_critnest > 0, ("%s: vcpu %d running outside " + "critical section", __func__, vcpu)); #endif /* - * If we are using VPIDs then invalidate all mappings tagged with 'vpid' + * Invalidate all mappings tagged with 'vpid' * * We do this because this vcpu was executing on a different host * cpu when it last ran. We do not track whether it invalidated @@ -1069,29 +1328,70 @@ vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu) * Note also that this will invalidate mappings tagged with 'vpid' * for "all" EP4TAs. */ - if (vmxstate->vpid != 0) { + if (pmap->pm_eptgen == vmx->eptgen[curcpu]) { + invvpid_desc._res1 = 0; + invvpid_desc._res2 = 0; invvpid_desc.vpid = vmxstate->vpid; + invvpid_desc.linear_addr = 0; invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); + vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_DONE, 1); + } else { + /* + * The invvpid can be skipped if an invept is going to + * be performed before entering the guest. The invept + * will invalidate combined mappings tagged with + * 'vmx->eptp' for all vpids. + */ + vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1); } } -static void -vm_exit_update_rip(struct vm_exit *vmexit) +static void +vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap) { - int error; - - error = vmwrite(VMCS_GUEST_RIP, vmexit->rip + vmexit->inst_length); - if (error) - panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error); -} + struct vmxstate *vmxstate; -/* - * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. - */ -CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0); +#ifndef __FreeBSD__ + /* + * Regardless of whether the VM appears to have migrated between CPUs, + * save the host sysenter stack pointer. 
As it points to the kernel + * stack of each thread, the correct value must be maintained for every + * trip into the critical section. + */ + vmcs_write(VMCS_HOST_IA32_SYSENTER_ESP, rdmsr(MSR_SYSENTER_ESP_MSR)); -static void __inline -vmx_set_int_window_exiting(struct vmx *vmx, int vcpu) + /* + * Perform any needed TSC_OFFSET adjustment based on TSC_MSR writes or + * migration between host CPUs with differing TSC values. + */ + VERIFY0(vmx_apply_tsc_adjust(vmx, vcpu)); +#endif + + vmxstate = &vmx->state[vcpu]; + if (vmxstate->lastcpu == curcpu) + return; + + vmxstate->lastcpu = curcpu; + + vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); + +#ifndef __FreeBSD__ + /* Load the per-CPU IDT address */ + vmcs_write(VMCS_HOST_IDTR_BASE, vmm_get_host_idtrbase()); +#endif + vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); + vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); + vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); + vmx_invvpid(vmx, vcpu, pmap, 1); +} + +/* + * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. + */ +CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0); + +static __inline void +vmx_set_int_window_exiting(struct vmx *vmx, int vcpu) { if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) { @@ -1101,23 +1401,18 @@ vmx_set_int_window_exiting(struct vmx *vmx, int vcpu) } } -static void __inline +static __inline void vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu) { -#ifdef __FreeBSD__ KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0, ("intr_window_exiting not set: %#x", vmx->cap[vcpu].proc_ctls)); -#else - KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0, - ("intr_window_exiting not set: %x", vmx->cap[vcpu].proc_ctls)); -#endif vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting"); } -static void __inline +static __inline void vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu) { @@ -1128,22 +1423,18 @@ vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu) } } -static void __inline +static __inline void vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu) { -#ifdef __FreeBSD__ KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0, ("nmi_window_exiting not set %#x", vmx->cap[vcpu].proc_ctls)); -#else - KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0, - ("nmi_window_exiting not set %x", vmx->cap[vcpu].proc_ctls)); -#endif vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting"); } +#ifdef __FreeBSD__ int vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset) { @@ -1159,34 +1450,55 @@ vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset) return (error); } +#else /* __FreeBSD__ */ +/* + * Set the TSC adjustment, taking into account the offsets measured between + * host physical CPUs. This is required even if the guest has not set a TSC + * offset since vCPUs inherit the TSC offset of whatever physical CPU it has + * migrated onto. Without this mitigation, un-synched host TSCs will convey + * the appearance of TSC time-travel to the guest as its vCPUs migrate. 
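+ *
+ * [Editorial sketch, not part of this patch: with the "use TSC
+ * offsetting" control set, a guest rdtsc observes
+ *
+ *	guest_tsc = host_tsc + VMCS_TSC_OFFSET
+ *
+ * so keeping the guest's view stable across migration means
+ * folding the destination CPU's measured delta into the offset:
+ *
+ *	uint64_t offset = vcpu_tsc_offset(vm, vcpu) +
+ *	    (uint64_t)tsc_gethrtime_tick_delta();
+ *
+ * which is the calculation vmx_apply_tsc_adjust() below performs,
+ * skipping the vmwrite when the active offset is unchanged.]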
+ */ +static int +vmx_apply_tsc_adjust(struct vmx *vmx, int vcpu) +{ + extern hrtime_t tsc_gethrtime_tick_delta(void); + const uint64_t target_offset = (vcpu_tsc_offset(vmx->vm, vcpu) + + (uint64_t)tsc_gethrtime_tick_delta()); + int error = 0; + + ASSERT(vmx->cap[vcpu].proc_ctls & PROCBASED_TSC_OFFSET); + + if (vmx->tsc_offset_active[vcpu] != target_offset) { + error = vmwrite(VMCS_TSC_OFFSET, target_offset); + vmx->tsc_offset_active[vcpu] = target_offset; + } + + return (error); +} +#endif /* __FreeBSD__ */ #define NMI_BLOCKING (VMCS_INTERRUPTIBILITY_NMI_BLOCKING | \ VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) #define HWINTR_BLOCKING (VMCS_INTERRUPTIBILITY_STI_BLOCKING | \ VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) +#ifndef __FreeBSD__ +static uint32_t +vmx_inject_nmi(struct vmx *vmx, int vcpu) +#else static void vmx_inject_nmi(struct vmx *vmx, int vcpu) +#endif { uint32_t gi, info; gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); -#ifdef __FreeBSD__ KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest " "interruptibility-state %#x", gi)); -#else - KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest " - "interruptibility-state %x", gi)); -#endif info = vmcs_read(VMCS_ENTRY_INTR_INFO); -#ifdef __FreeBSD__ KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid " "VM-entry interruption information %#x", info)); -#else - KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid " - "VM-entry interruption information %x", info)); -#endif /* * Inject the virtual NMI. The vector must be the NMI IDT entry @@ -1199,32 +1511,220 @@ vmx_inject_nmi(struct vmx *vmx, int vcpu) /* Clear the request */ vm_nmi_clear(vmx->vm, vcpu); + +#ifndef __FreeBSD__ + return (info); +#endif } +#ifndef __FreeBSD__ static void -vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic) +vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic, + uint64_t guestrip) { - int vector, need_nmi_exiting, extint_pending; - uint64_t rflags, entryinfo; + uint64_t entryinfo, rflags; uint32_t gi, info; + int vector; + boolean_t extint_pending = B_FALSE; + + vlapic_tmr_update(vlapic); + + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + info = vmcs_read(VMCS_ENTRY_INTR_INFO); + + if (vmx->state[vcpu].nextrip != guestrip && + (gi & HWINTR_BLOCKING) != 0) { + VCPU_CTR2(vmx->vm, vcpu, "Guest interrupt blocking " + "cleared due to rip change: %#lx/%#lx", + vmx->state[vcpu].nextrip, guestrip); + gi &= ~HWINTR_BLOCKING; + vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); + } + + /* + * It could be that an interrupt is already pending for injection from + * the VMCS. This would be the case if the vCPU exited for conditions + * such as an AST before a vm-entry delivered the injection. + */ + if ((info & VMCS_INTR_VALID) != 0) { + goto cantinject; + } if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) { -#ifdef __FreeBSD__ KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry " "intinfo is not valid: %#lx", __func__, entryinfo)); + + KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject " + "pending exception: %#lx/%#x", __func__, entryinfo, info)); + + info = entryinfo; + vector = info & 0xff; + if (vector == IDT_BP || vector == IDT_OF) { + /* + * VT-x requires #BP and #OF to be injected as software + * exceptions. 
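+ *
+ * [Editorial sketch, not part of this patch: the VM-entry
+ * interruption-information field edited here carries the vector
+ * in bits 7:0, the event type in bits 10:8 (0 external interrupt,
+ * 2 NMI, 3 hardware exception, 4 software interrupt, 6 software
+ * exception), deliver-error-code in bit 11, and valid in bit 31.
+ * Retyping #BP/#OF therefore amounts to:
+ *
+ *	uint32_t swexc(uint8_t vector)
+ *	{
+ *		return ((uint32_t)vector | (6u << 8) | (1u << 31));
+ *	}
+ *
+ * which is what the VMCS_INTR_T_MASK / VMCS_INTR_T_SWEXCEPTION
+ * manipulation below achieves.]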
+ */ + info &= ~VMCS_INTR_T_MASK; + info |= VMCS_INTR_T_SWEXCEPTION; + } + + if (info & VMCS_INTR_DEL_ERRCODE) + vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32); + + vmcs_write(VMCS_ENTRY_INTR_INFO, info); + } + + if (vm_nmi_pending(vmx->vm, vcpu)) { + int need_nmi_exiting = 1; + + /* + * If there are no conditions blocking NMI injection then + * inject it directly here otherwise enable "NMI window + * exiting" to inject it as soon as we can. + * + * We also check for STI_BLOCKING because some implementations + * don't allow NMI injection in this case. If we are running + * on a processor that doesn't have this restriction it will + * immediately exit and the NMI will be injected in the + * "NMI window exiting" handler. + */ + if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) { + if ((info & VMCS_INTR_VALID) == 0) { + info = vmx_inject_nmi(vmx, vcpu); + need_nmi_exiting = 0; + } else { + VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI " + "due to VM-entry intr info %#x", info); + } + } else { + VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to " + "Guest Interruptibility-state %#x", gi); + } + + if (need_nmi_exiting) { + vmx_set_nmi_window_exiting(vmx, vcpu); + return; + } + } + + /* Check the AT-PIC and APIC for interrupts. */ + if (vm_extint_pending(vmx->vm, vcpu)) { + /* Ask the legacy pic for a vector to inject */ + vatpic_pending_intr(vmx->vm, &vector); + extint_pending = B_TRUE; + + /* + * From the Intel SDM, Volume 3, Section "Maskable + * Hardware Interrupts": + * - maskable interrupt vectors [0,255] can be delivered + * through the INTR pin. + */ + KASSERT(vector >= 0 && vector <= 255, + ("invalid vector %d from INTR", vector)); + } else if (!virtual_interrupt_delivery) { + /* Ask the local apic for a vector to inject */ + if (!vlapic_pending_intr(vlapic, &vector)) + return; + + /* + * From the Intel SDM, Volume 3, Section "Maskable + * Hardware Interrupts": + * - maskable interrupt vectors [16,255] can be delivered + * through the local APIC. + */ + KASSERT(vector >= 16 && vector <= 255, + ("invalid vector %d from local APIC", vector)); + } else { + /* No further injection needed */ + return; + } + + /* + * Verify that the guest is interruptible and the above logic has not + * already queued an event for injection. + */ + if ((gi & HWINTR_BLOCKING) != 0) { + VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " + "Guest Interruptibility-state %#x", vector, gi); + goto cantinject; + } + if ((info & VMCS_INTR_VALID) != 0) { + VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " + "VM-entry intr info %#x", vector, info); + goto cantinject; + } + rflags = vmcs_read(VMCS_GUEST_RFLAGS); + if ((rflags & PSL_I) == 0) { + VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " + "rflags %#lx", vector, rflags); + goto cantinject; + } + + /* Inject the interrupt */ + info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID; + info |= vector; + vmcs_write(VMCS_ENTRY_INTR_INFO, info); + + if (extint_pending) { + vm_extint_clear(vmx->vm, vcpu); + vatpic_intr_accepted(vmx->vm, vector); + + /* + * After we accepted the current ExtINT the PIC may + * have posted another one. If that is the case, set + * the Interrupt Window Exiting execution control so + * we can inject that one too. + * + * Also, interrupt window exiting allows us to inject any + * pending APIC vector that was preempted by the ExtINT + * as soon as possible. This applies both for the software + * emulated vlapic and the hardware assisted virtual APIC.
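+ *
+ * [Editorial sketch, not part of this patch: the "interrupt
+ * window" is armed by setting one bit in the primary
+ * processor-based controls; once set, the CPU exits at the first
+ * instruction boundary where RFLAGS.IF is 1 and no STI/MOV-SS
+ * blocking is in effect:
+ *
+ *	uint32_t ctls = vmcs_read(VMCS_PRI_PROC_BASED_CTLS);
+ *	ctls |= PROCBASED_INT_WINDOW_EXITING;
+ *	vmcs_write(VMCS_PRI_PROC_BASED_CTLS, ctls);
+ *
+ * vmx_set_int_window_exiting() does this, caching the value in
+ * vmx->cap[vcpu].proc_ctls to avoid redundant vmwrites.]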
+ */ + vmx_set_int_window_exiting(vmx, vcpu); + } else { + /* Update the Local APIC ISR */ + vlapic_intr_accepted(vlapic, vector); + } + + VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector); + return; + +cantinject: + /* + * Set the Interrupt Window Exiting execution control so we can inject + * the interrupt as soon as blocking condition goes away. + */ + vmx_set_int_window_exiting(vmx, vcpu); +} #else +static void +vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic, + uint64_t guestrip) +{ + int vector, need_nmi_exiting, extint_pending; + uint64_t rflags, entryinfo; + uint32_t gi, info; + + vlapic_tmr_update(vlapic); + + if (vmx->state[vcpu].nextrip != guestrip) { + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + if (gi & HWINTR_BLOCKING) { + VCPU_CTR2(vmx->vm, vcpu, "Guest interrupt blocking " + "cleared due to rip change: %#lx/%#lx", + vmx->state[vcpu].nextrip, guestrip); + gi &= ~HWINTR_BLOCKING; + vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); + } + } + + if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) { KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry " - "intinfo is not valid: %lx", __func__, entryinfo)); -#endif + "intinfo is not valid: %#lx", __func__, entryinfo)); info = vmcs_read(VMCS_ENTRY_INTR_INFO); -#ifdef __FreeBSD__ KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject " "pending exception: %#lx/%#x", __func__, entryinfo, info)); -#else - KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject " - "pending exception: %lx/%x", __func__, entryinfo, info)); -#endif info = entryinfo; vector = info & 0xff; @@ -1277,12 +1777,10 @@ vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic) extint_pending = vm_extint_pending(vmx->vm, vcpu); -#ifdef __FreeBSD__ if (!extint_pending && virtual_interrupt_delivery) { vmx_inject_pir(vlapic); return; } -#endif /* * If interrupt-window exiting is already in effect then don't bother @@ -1388,6 +1886,7 @@ cantinject: */ vmx_set_int_window_exiting(vmx, vcpu); } +#endif /* __FreeBSD__ */ /* * If the Virtual NMIs execution control is '1' then the logical processor @@ -1420,6 +1919,92 @@ vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid) vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); } +static void +vmx_assert_nmi_blocking(struct vmx *vmx, int vcpuid) +{ + uint32_t gi; + + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING, + ("NMI blocking is not in effect %#x", gi)); +} + +static int +vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) +{ + struct vmxctx *vmxctx; + uint64_t xcrval; + const struct xsave_limits *limits; + + vmxctx = &vmx->ctx[vcpu]; + limits = vmm_get_xsave_limits(); + + /* + * Note that the processor raises a GP# fault on its own if + * xsetbv is executed for CPL != 0, so we do not have to + * emulate that fault here. + */ + + /* Only xcr0 is supported. */ + if (vmxctx->guest_rcx != 0) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + /* We only handle xcr0 if both the host and guest have XSAVE enabled. */ + if (!limits->xsave_enabled || !(vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE)) { + vm_inject_ud(vmx->vm, vcpu); + return (HANDLED); + } + + xcrval = vmxctx->guest_rdx << 32 | (vmxctx->guest_rax & 0xffffffff); + if ((xcrval & ~limits->xcr0_allowed) != 0) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + if (!(xcrval & XFEATURE_ENABLED_X87)) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + /* AVX (YMM_Hi128) requires SSE. 
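+ *
+ * [Editorial sketch, not part of this patch: xsetbv supplies the
+ * new XCR0 in EDX:EAX, and the checks in this function enforce
+ * the architectural dependencies among its bits (x87 = bit 0,
+ * SSE = bit 1, AVX = bit 2):
+ *
+ *	int xcr0_ok(uint32_t eax, uint32_t edx)
+ *	{
+ *		uint64_t x = ((uint64_t)edx << 32) | eax;
+ *		return ((x & 0x1) &&			// x87 always set
+ *		    (!(x & 0x4) || (x & 0x2)));		// AVX implies SSE
+ *	}
+ *
+ * Violations, including this AVX/SSE rule, are reflected to the
+ * guest as #GP.]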
*/ + if (xcrval & XFEATURE_ENABLED_AVX && + (xcrval & XFEATURE_AVX) != XFEATURE_AVX) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + /* + * AVX512 requires base AVX (YMM_Hi128) as well as OpMask, + * ZMM_Hi256, and Hi16_ZMM. + */ + if (xcrval & XFEATURE_AVX512 && + (xcrval & (XFEATURE_AVX512 | XFEATURE_AVX)) != + (XFEATURE_AVX512 | XFEATURE_AVX)) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + /* + * Intel MPX requires both bound register state flags to be + * set. + */ + if (((xcrval & XFEATURE_ENABLED_BNDREGS) != 0) != + ((xcrval & XFEATURE_ENABLED_BNDCSR) != 0)) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + /* + * This runs "inside" vmrun() with the guest's FPU state, so + * modifying xcr0 directly modifies the guest's xcr0, not the + * host's. + */ + load_xcr(0, xcrval); + return (HANDLED); +} + static uint64_t vmx_get_guest_reg(struct vmx *vmx, int vcpu, int ident) { @@ -1734,6 +2319,7 @@ vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla) paging = &vmexit->u.inst_emul.paging; vmexit->exitcode = VM_EXITCODE_INST_EMUL; + vmexit->inst_length = 0; vmexit->u.inst_emul.gpa = gpa; vmexit->u.inst_emul.gla = gla; vmx_paging_info(paging); @@ -1799,6 +2385,189 @@ ept_emulation_fault(uint64_t ept_qual) return (TRUE); } +static __inline int +apic_access_virtualization(struct vmx *vmx, int vcpuid) +{ + uint32_t proc_ctls2; + + proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; + return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) ? 1 : 0); +} + +static __inline int +x2apic_virtualization(struct vmx *vmx, int vcpuid) +{ + uint32_t proc_ctls2; + + proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; + return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE) ? 1 : 0); +} + +static int +vmx_handle_apic_write(struct vmx *vmx, int vcpuid, struct vlapic *vlapic, + uint64_t qual) +{ + int error, handled, offset; + uint32_t *apic_regs, vector; + bool retu; + + handled = HANDLED; + offset = APIC_WRITE_OFFSET(qual); + + if (!apic_access_virtualization(vmx, vcpuid)) { + /* + * In general there should not be any APIC write VM-exits + * unless APIC-access virtualization is enabled. + * + * However self-IPI virtualization can legitimately trigger + * an APIC-write VM-exit so treat it specially. + */ + if (x2apic_virtualization(vmx, vcpuid) && + offset == APIC_OFFSET_SELF_IPI) { + apic_regs = (uint32_t *)(vlapic->apic_page); + vector = apic_regs[APIC_OFFSET_SELF_IPI / 4]; + vlapic_self_ipi_handler(vlapic, vector); + return (HANDLED); + } else + return (UNHANDLED); + } + + switch (offset) { + case APIC_OFFSET_ID: + vlapic_id_write_handler(vlapic); + break; + case APIC_OFFSET_LDR: + vlapic_ldr_write_handler(vlapic); + break; + case APIC_OFFSET_DFR: + vlapic_dfr_write_handler(vlapic); + break; + case APIC_OFFSET_SVR: + vlapic_svr_write_handler(vlapic); + break; + case APIC_OFFSET_ESR: + vlapic_esr_write_handler(vlapic); + break; + case APIC_OFFSET_ICR_LOW: + retu = false; + error = vlapic_icrlo_write_handler(vlapic, &retu); + if (error != 0 || retu) + handled = UNHANDLED; + break; + case APIC_OFFSET_CMCI_LVT: + case APIC_OFFSET_TIMER_LVT ... 
APIC_OFFSET_ERROR_LVT: + vlapic_lvt_write_handler(vlapic, offset); + break; + case APIC_OFFSET_TIMER_ICR: + vlapic_icrtmr_write_handler(vlapic); + break; + case APIC_OFFSET_TIMER_DCR: + vlapic_dcr_write_handler(vlapic); + break; + default: + handled = UNHANDLED; + break; + } + return (handled); +} + +static bool +apic_access_fault(struct vmx *vmx, int vcpuid, uint64_t gpa) +{ + + if (apic_access_virtualization(vmx, vcpuid) && + (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE)) + return (true); + else + return (false); +} + +static int +vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) +{ + uint64_t qual; + int access_type, offset, allowed; + + if (!apic_access_virtualization(vmx, vcpuid)) + return (UNHANDLED); + + qual = vmexit->u.vmx.exit_qualification; + access_type = APIC_ACCESS_TYPE(qual); + offset = APIC_ACCESS_OFFSET(qual); + + allowed = 0; + if (access_type == 0) { + /* + * Read data access to the following registers is expected. + */ + switch (offset) { + case APIC_OFFSET_APR: + case APIC_OFFSET_PPR: + case APIC_OFFSET_RRR: + case APIC_OFFSET_CMCI_LVT: + case APIC_OFFSET_TIMER_CCR: + allowed = 1; + break; + default: + break; + } + } else if (access_type == 1) { + /* + * Write data access to the following registers is expected. + */ + switch (offset) { + case APIC_OFFSET_VER: + case APIC_OFFSET_APR: + case APIC_OFFSET_PPR: + case APIC_OFFSET_RRR: + case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: + case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: + case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: + case APIC_OFFSET_CMCI_LVT: + case APIC_OFFSET_TIMER_CCR: + allowed = 1; + break; + default: + break; + } + } + + if (allowed) { + vmexit_inst_emul(vmexit, DEFAULT_APIC_BASE + offset, + VIE_INVALID_GLA); + } + + /* + * Regardless of whether the APIC-access is allowed this handler + * always returns UNHANDLED: + * - if the access is allowed then it is handled by emulating the + * instruction that caused the VM-exit (outside the critical section) + * - if the access is not allowed then it will be converted to an + * exitcode of VM_EXITCODE_VMX and will be dealt with in userland. 
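+ *
+ * [Editorial sketch, not part of this patch: for APIC-access
+ * exits the qualification encodes the page offset and the access
+ * type per the Intel SDM, roughly what the APIC_ACCESS_OFFSET()
+ * and APIC_ACCESS_TYPE() macros used above expand to:
+ *
+ *	int offset = qual & 0xfff;	// bits 11:0, register offset
+ *	int type = (qual >> 12) & 0xf;	// 0 = data read, 1 = data write
+ *
+ * Only the whitelisted offsets (APR, PPR, RRR, CCR, ...) are fed
+ * to vmexit_inst_emul() for emulation.]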
+ */ + return (UNHANDLED); +} + +static enum task_switch_reason +vmx_task_switch_reason(uint64_t qual) +{ + int reason; + + reason = (qual >> 30) & 0x3; + switch (reason) { + case 0: + return (TSR_CALL); + case 1: + return (TSR_IRET); + case 2: + return (TSR_JMP); + case 3: + return (TSR_IDT_GATE); + default: + panic("%s: invalid reason %d", __func__, reason); + } +} + static int emulate_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu) { @@ -1839,31 +2608,150 @@ emulate_rdmsr(struct vmx *vmx, int vcpuid, u_int num, bool *retu) return (error); } +#ifndef __FreeBSD__ +#define __predict_false(x) (x) +#endif + static int vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) { - int error, handled, in; - struct vmcs *vmcs; + int error, errcode, errcode_valid, handled, in; struct vmxctx *vmxctx; + struct vlapic *vlapic; struct vm_inout_str *vis; - uint32_t eax, ecx, edx, idtvec_info, intr_info, inst_info; - uint64_t qual, gla, gpa, cr3; + struct vm_task_switch *ts; + uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info; + uint32_t intr_type, intr_vec, reason; + uint64_t exitintinfo, qual, gpa; bool retu; CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0); CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0); handled = UNHANDLED; - vmcs = &vmx->vmcs[vcpu]; vmxctx = &vmx->ctx[vcpu]; + qual = vmexit->u.vmx.exit_qualification; + reason = vmexit->u.vmx.exit_reason; vmexit->exitcode = VM_EXITCODE_BOGUS; vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1); + SDT_PROBE3(vmm, vmx, exit, entry, vmx, vcpu, vmexit); + + /* + * VM-entry failures during or after loading guest state. + * + * These VM-exits are uncommon but must be handled specially + * as most VM-exit fields are not populated as usual. + */ + if (__predict_false(reason == EXIT_REASON_MCE_DURING_ENTRY)) { + VCPU_CTR0(vmx->vm, vcpu, "Handling MCE during VM-entry"); +#ifdef __FreeBSD__ + __asm __volatile("int $18"); +#else + vmm_call_trap(T_MCE); +#endif + return (1); + } - switch (vmexit->u.vmx.exit_reason) { + /* + * VM exits that can be triggered during event delivery need to + * be handled specially by re-injecting the event if the IDT + * vectoring information field's valid bit is set. + * + * See "Information for VM Exits During Event Delivery" in Intel SDM + * for details. + */ + idtvec_info = vmcs_idt_vectoring_info(); + if (idtvec_info & VMCS_IDT_VEC_VALID) { + idtvec_info &= ~(1 << 12); /* clear undefined bit */ + exitintinfo = idtvec_info; + if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { + idtvec_err = vmcs_idt_vectoring_err(); + exitintinfo |= (uint64_t)idtvec_err << 32; + } + error = vm_exit_intinfo(vmx->vm, vcpu, exitintinfo); + KASSERT(error == 0, ("%s: vm_set_intinfo error %d", + __func__, error)); + + /* + * If 'virtual NMIs' are being used and the VM-exit + * happened while injecting an NMI during the previous + * VM-entry, then clear "blocking by NMI" in the + * Guest Interruptibility-State so the NMI can be + * reinjected on the subsequent VM-entry. + * + * However, if the NMI was being delivered through a task + * gate, then the new task must start execution with NMIs + * blocked so don't clear NMI blocking in this case. 
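+ *
+ * [Editorial sketch, not part of this patch: the 64-bit value
+ * handed to vm_exit_intinfo() below packs the 32-bit
+ * IDT-vectoring information together with its error code:
+ *
+ *	uint64_t exitintinfo = idtvec_info;
+ *	if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID)
+ *		exitintinfo |= (uint64_t)idtvec_err << 32;
+ *
+ * so the interrupted event survives the exit and is re-injected
+ * on the next VM entry.]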
+ */ + intr_type = idtvec_info & VMCS_INTR_T_MASK; + if (intr_type == VMCS_INTR_T_NMI) { + if (reason != EXIT_REASON_TASK_SWITCH) + vmx_clear_nmi_blocking(vmx, vcpu); + else + vmx_assert_nmi_blocking(vmx, vcpu); + } + + /* + * Update VM-entry instruction length if the event being + * delivered was a software interrupt or software exception. + */ + if (intr_type == VMCS_INTR_T_SWINTR || + intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION || + intr_type == VMCS_INTR_T_SWEXCEPTION) { + vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); + } + } + + switch (reason) { + case EXIT_REASON_TASK_SWITCH: + ts = &vmexit->u.task_switch; + ts->tsssel = qual & 0xffff; + ts->reason = vmx_task_switch_reason(qual); + ts->ext = 0; + ts->errcode_valid = 0; + vmx_paging_info(&ts->paging); + /* + * If the task switch was due to a CALL, JMP, IRET, software + * interrupt (INT n) or software exception (INT3, INTO), + * then the saved %rip references the instruction that caused + * the task switch. The instruction length field in the VMCS + * is valid in this case. + * + * In all other cases (e.g., NMI, hardware exception) the + * saved %rip is one that would have been saved in the old TSS + * had the task switch completed normally so the instruction + * length field is not needed in this case and is explicitly + * set to 0. + */ + if (ts->reason == TSR_IDT_GATE) { + KASSERT(idtvec_info & VMCS_IDT_VEC_VALID, + ("invalid idtvec_info %#x for IDT task switch", + idtvec_info)); + intr_type = idtvec_info & VMCS_INTR_T_MASK; + if (intr_type != VMCS_INTR_T_SWINTR && + intr_type != VMCS_INTR_T_SWEXCEPTION && + intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) { + /* Task switch triggered by external event */ + ts->ext = 1; + vmexit->inst_length = 0; + if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { + ts->errcode_valid = 1; + ts->errcode = vmcs_idt_vectoring_err(); + } + } + } + vmexit->exitcode = VM_EXITCODE_TASK_SWITCH; + SDT_PROBE4(vmm, vmx, exit, taskswitch, vmx, vcpu, vmexit, ts); + VCPU_CTR4(vmx->vm, vcpu, "task switch reason %d, tss 0x%04x, " + "%s errcode 0x%016lx", ts->reason, ts->tsssel, + ts->ext ? 
"external" : "internal", + ((uint64_t)ts->errcode << 32) | ts->errcode_valid); + break; case EXIT_REASON_CR_ACCESS: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1); + SDT_PROBE4(vmm, vmx, exit, craccess, vmx, vcpu, vmexit, qual); switch (qual & 0xf) { case 0: handled = vmx_emulate_cr0_access(vmx, vcpu, qual); @@ -1881,6 +2769,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) retu = false; ecx = vmxctx->guest_rcx; VCPU_CTR1(vmx->vm, vcpu, "rdmsr 0x%08x", ecx); + SDT_PROBE4(vmm, vmx, exit, rdmsr, vmx, vcpu, vmexit, ecx); error = emulate_rdmsr(vmx, vcpu, ecx, &retu); if (error) { vmexit->exitcode = VM_EXITCODE_RDMSR; @@ -1901,6 +2790,8 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) edx = vmxctx->guest_rdx; VCPU_CTR2(vmx->vm, vcpu, "wrmsr 0x%08x value 0x%016lx", ecx, (uint64_t)edx << 32 | eax); + SDT_PROBE5(vmm, vmx, exit, wrmsr, vmx, vmexit, vcpu, ecx, + (uint64_t)edx << 32 | eax); error = emulate_wrmsr(vmx, vcpu, ecx, (uint64_t)edx << 32 | eax, &retu); if (error) { @@ -1917,19 +2808,29 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) break; case EXIT_REASON_HLT: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1); + SDT_PROBE3(vmm, vmx, exit, halt, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_HLT; vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS); + if (virtual_interrupt_delivery) + vmexit->u.hlt.intr_status = + vmcs_read(VMCS_GUEST_INTR_STATUS); + else + vmexit->u.hlt.intr_status = 0; break; case EXIT_REASON_MTF: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1); + SDT_PROBE3(vmm, vmx, exit, mtrap, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_MTRAP; + vmexit->inst_length = 0; break; case EXIT_REASON_PAUSE: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1); + SDT_PROBE3(vmm, vmx, exit, pause, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_PAUSE; break; case EXIT_REASON_INTR_WINDOW: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1); + SDT_PROBE3(vmm, vmx, exit, intrwindow, vmx, vcpu, vmexit); vmx_clear_int_window_exiting(vmx, vcpu); return (1); case EXIT_REASON_EXT_INTR: @@ -1943,6 +2844,8 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) * this virtual interrupt during the subsequent VM enter. */ intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); + SDT_PROBE4(vmm, vmx, exit, interrupt, + vmx, vcpu, vmexit, intr_info); /* * XXX: Ignore this exit if VMCS_INTR_VALID is not set. @@ -1950,18 +2853,10 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) */ if (!(intr_info & VMCS_INTR_VALID)) return (1); -#ifdef __FreeBSD__ KASSERT((intr_info & VMCS_INTR_VALID) != 0 && (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR, ("VM exit interruption info invalid: %#x", intr_info)); -#else - KASSERT((intr_info & VMCS_INTR_VALID) != 0 && - (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR, - ("VM exit interruption info invalid: %x", intr_info)); -#endif -#if 0 /* XXX */ vmx_trigger_hostintr(intr_info & 0xff); -#endif /* * This is special. 
We want to treat this as an 'handled' @@ -1970,6 +2865,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1); return (1); case EXIT_REASON_NMI_WINDOW: + SDT_PROBE3(vmm, vmx, exit, nmiwindow, vmx, vcpu, vmexit); /* Exit to allow the pending virtual NMI to be injected */ if (vm_nmi_pending(vmx->vm, vcpu)) vmx_inject_nmi(vmx, vcpu); @@ -1997,21 +2893,21 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) vis->addrsize = inout_str_addrsize(inst_info); inout_str_seginfo(vmx, vcpu, inst_info, in, vis); } + SDT_PROBE3(vmm, vmx, exit, inout, vmx, vcpu, vmexit); break; case EXIT_REASON_CPUID: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1); + SDT_PROBE3(vmm, vmx, exit, cpuid, vmx, vcpu, vmexit); handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx); break; case EXIT_REASON_EXCEPTION: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1); intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); -#ifdef __FreeBSD__ KASSERT((intr_info & VMCS_INTR_VALID) != 0, ("VM exit interruption info invalid: %#x", intr_info)); -#else - KASSERT((intr_info & VMCS_INTR_VALID) != 0, - ("VM exit interruption info invalid: %x", intr_info)); -#endif + + intr_vec = intr_info & 0xff; + intr_type = intr_info & VMCS_INTR_T_MASK; /* * If Virtual NMIs control is 1 and the VM-exit is due to a @@ -2020,26 +2916,147 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) * the guest. * * See "Resuming Guest Software after Handling an Exception". + * See "Information for VM Exits Due to Vectored Events". */ if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && - (intr_info & 0xff) != IDT_DF && + (intr_vec != IDT_DF) && (intr_info & EXIT_QUAL_NMIUDTI) != 0) vmx_restore_nmi_blocking(vmx, vcpu); /* * The NMI has already been handled in vmx_exit_handle_nmi(). */ - if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) + if (intr_type == VMCS_INTR_T_NMI) return (1); - break; + + /* + * Call the machine check handler by hand. Also don't reflect + * the machine check back into the guest. + */ + if (intr_vec == IDT_MC) { + VCPU_CTR0(vmx->vm, vcpu, "Vectoring to MCE handler"); +#ifdef __FreeBSD__ + __asm __volatile("int $18"); +#else + vmm_call_trap(T_MCE); +#endif + return (1); + } + + if (intr_vec == IDT_PF) { + error = vmxctx_setreg(vmxctx, VM_REG_GUEST_CR2, qual); + KASSERT(error == 0, ("%s: vmxctx_setreg(cr2) error %d", + __func__, error)); + } + + /* + * Software exceptions exhibit trap-like behavior. This in + * turn requires populating the VM-entry instruction length + * so that the %rip in the trap frame is past the INT3/INTO + * instruction. + */ + if (intr_type == VMCS_INTR_T_SWEXCEPTION) + vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); + + /* Reflect all other exceptions back into the guest */ + errcode_valid = errcode = 0; + if (intr_info & VMCS_INTR_DEL_ERRCODE) { + errcode_valid = 1; + errcode = vmcs_read(VMCS_EXIT_INTR_ERRCODE); + } + VCPU_CTR2(vmx->vm, vcpu, "Reflecting exception %d/%#x into " + "the guest", intr_vec, errcode); + SDT_PROBE5(vmm, vmx, exit, exception, + vmx, vcpu, vmexit, intr_vec, errcode); + error = vm_inject_exception(vmx->vm, vcpu, intr_vec, + errcode_valid, errcode, 0); + KASSERT(error == 0, ("%s: vm_inject_exception error %d", + __func__, error)); + return (1); + case EXIT_REASON_EPT_FAULT: + /* + * If 'gpa' lies within the address space allocated to + * memory then this must be a nested page fault otherwise + * this must be an instruction that accesses MMIO space. 
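+ *
+ * [Editorial sketch, not part of this patch: ept_fault_type()
+ * below classifies the access from the low bits of the
+ * EPT-violation exit qualification (Intel SDM: bit 0 read, bit 1
+ * write, bit 2 instruction fetch), along the lines of:
+ *
+ *	int fault_type_sketch(uint64_t qual)
+ *	{
+ *		if (qual & (1UL << 1))
+ *			return (VM_PROT_WRITE);
+ *		if (qual & (1UL << 2))
+ *			return (VM_PROT_EXECUTE);
+ *		return (VM_PROT_READ);
+ *	}
+ *
+ * The gpa/qual pair then becomes a VM_EXITCODE_PAGING exit for
+ * mapped memory, or instruction emulation for MMIO.]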
+ */ gpa = vmcs_gpa(); - if (ept_emulation_fault(qual)) { + if (vm_mem_allocated(vmx->vm, vcpu, gpa) || + apic_access_fault(vmx, vcpu, gpa)) { + vmexit->exitcode = VM_EXITCODE_PAGING; + vmexit->inst_length = 0; + vmexit->u.paging.gpa = gpa; + vmexit->u.paging.fault_type = ept_fault_type(qual); + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1); + SDT_PROBE5(vmm, vmx, exit, nestedfault, + vmx, vcpu, vmexit, gpa, qual); + } else if (ept_emulation_fault(qual)) { vmexit_inst_emul(vmexit, gpa, vmcs_gla()); vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1); + SDT_PROBE4(vmm, vmx, exit, mmiofault, + vmx, vcpu, vmexit, gpa); } + /* + * If Virtual NMIs control is 1 and the VM-exit is due to an + * EPT fault during the execution of IRET then we must restore + * the state of "virtual-NMI blocking" before resuming. + * + * See description of "NMI unblocking due to IRET" in + * "Exit Qualification for EPT Violations". + */ + if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && + (qual & EXIT_QUAL_NMIUDTI) != 0) + vmx_restore_nmi_blocking(vmx, vcpu); + break; + case EXIT_REASON_VIRTUALIZED_EOI: + vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI; + vmexit->u.ioapic_eoi.vector = qual & 0xFF; + SDT_PROBE3(vmm, vmx, exit, eoi, vmx, vcpu, vmexit); + vmexit->inst_length = 0; /* trap-like */ + break; + case EXIT_REASON_APIC_ACCESS: + SDT_PROBE3(vmm, vmx, exit, apicaccess, vmx, vcpu, vmexit); + handled = vmx_handle_apic_access(vmx, vcpu, vmexit); + break; + case EXIT_REASON_APIC_WRITE: + /* + * APIC-write VM exit is trap-like so the %rip is already + * pointing to the next instruction. + */ + vmexit->inst_length = 0; + vlapic = vm_lapic(vmx->vm, vcpu); + SDT_PROBE4(vmm, vmx, exit, apicwrite, + vmx, vcpu, vmexit, vlapic); + handled = vmx_handle_apic_write(vmx, vcpu, vlapic, qual); + break; + case EXIT_REASON_XSETBV: + SDT_PROBE3(vmm, vmx, exit, xsetbv, vmx, vcpu, vmexit); + handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit); + break; + case EXIT_REASON_MONITOR: + SDT_PROBE3(vmm, vmx, exit, monitor, vmx, vcpu, vmexit); + vmexit->exitcode = VM_EXITCODE_MONITOR; + break; + case EXIT_REASON_MWAIT: + SDT_PROBE3(vmm, vmx, exit, mwait, vmx, vcpu, vmexit); + vmexit->exitcode = VM_EXITCODE_MWAIT; + break; + case EXIT_REASON_VMCALL: + case EXIT_REASON_VMCLEAR: + case EXIT_REASON_VMLAUNCH: + case EXIT_REASON_VMPTRLD: + case EXIT_REASON_VMPTRST: + case EXIT_REASON_VMREAD: + case EXIT_REASON_VMRESUME: + case EXIT_REASON_VMWRITE: + case EXIT_REASON_VMXOFF: + case EXIT_REASON_VMXON: + SDT_PROBE3(vmm, vmx, exit, vminsn, vmx, vcpu, vmexit); + vmexit->exitcode = VM_EXITCODE_VMINSN; break; default: + SDT_PROBE4(vmm, vmx, exit, unknown, + vmx, vcpu, vmexit, reason); vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1); break; } @@ -2055,17 +3072,9 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) * the one we just processed. Therefore we update the * guest rip in the VMCS and in 'vmexit'. */ - vm_exit_update_rip(vmexit); vmexit->rip += vmexit->inst_length; vmexit->inst_length = 0; - - /* - * Special case for spinning up an AP - exit to userspace to - * give the controlling process a chance to intercept and - * spin up a thread for the AP. 
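
The EPT-violation exit above classifies the fault from the exit qualification. A hedged sketch of what ept_fault_type() amounts to; the bit positions follow the SDM's "Exit Qualification for EPT Violations" and the VM_PROT_* constants come from the surrounding vmm headers:

    static int
    ept_fault_type_sketch(uint64_t ept_qual)
    {
            if (ept_qual & (1UL << 1))      /* data write */
                    return (VM_PROT_WRITE);
            if (ept_qual & (1UL << 2))      /* instruction fetch */
                    return (VM_PROT_EXECUTE);
            return (VM_PROT_READ);          /* data read */
    }
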
- */ - if (vmexit->exitcode == VM_EXITCODE_SPINUP_AP) - handled = 0; + vmcs_write(VMCS_GUEST_RIP, vmexit->rip); } else { if (vmexit->exitcode == VM_EXITCODE_BOGUS) { /* @@ -2083,91 +3092,340 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) */ } } - return (handled); + + SDT_PROBE4(vmm, vmx, exit, return, + vmx, vcpu, vmexit, handled); + return (handled); +} + +static void +vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit) +{ + + KASSERT(vmxctx->inst_fail_status != VM_SUCCESS, + ("vmx_exit_inst_error: invalid inst_fail_status %d", + vmxctx->inst_fail_status)); + + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_VMX; + vmexit->u.vmx.status = vmxctx->inst_fail_status; + vmexit->u.vmx.inst_error = vmcs_instruction_error(); + vmexit->u.vmx.exit_reason = ~0; + vmexit->u.vmx.exit_qualification = ~0; + + switch (rc) { + case VMX_VMRESUME_ERROR: + case VMX_VMLAUNCH_ERROR: + case VMX_INVEPT_ERROR: +#ifndef __FreeBSD__ + case VMX_VMWRITE_ERROR: +#endif + vmexit->u.vmx.inst_type = rc; + break; + default: + panic("vm_exit_inst_error: vmx_enter_guest returned %d", rc); + } +} + +/* + * If the NMI-exiting VM execution control is set to '1' then an NMI in + * non-root operation causes a VM-exit. NMI blocking is in effect so it is + * sufficient to simply vector to the NMI handler via a software interrupt. + * However, this must be done before maskable interrupts are enabled + * otherwise the "iret" issued by an interrupt handler will incorrectly + * clear NMI blocking. + */ +static __inline void +vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) +{ + uint32_t intr_info; + + KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled")); + + if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION) + return; + + intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); + KASSERT((intr_info & VMCS_INTR_VALID) != 0, + ("VM exit interruption info invalid: %#x", intr_info)); + + if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) { + KASSERT((intr_info & 0xff) == IDT_NMI, ("VM exit due " + "to NMI has invalid vector: %#x", intr_info)); + VCPU_CTR0(vmx->vm, vcpuid, "Vectoring to NMI handler"); +#ifdef __FreeBSD__ + __asm __volatile("int $2"); +#else + vmm_call_trap(T_NMIFLT); +#endif + } +} + +static __inline void +vmx_dr_enter_guest(struct vmxctx *vmxctx) +{ + register_t rflags; + + /* Save host control debug registers. */ + vmxctx->host_dr7 = rdr7(); + vmxctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR); + + /* + * Disable debugging in DR7 and DEBUGCTL to avoid triggering + * exceptions in the host based on the guest DRx values. The + * guest DR7 and DEBUGCTL are saved/restored in the VMCS. + */ + load_dr7(0); + wrmsr(MSR_DEBUGCTLMSR, 0); + + /* + * Disable single stepping the kernel to avoid corrupting the + * guest DR6. A debugger might still be able to corrupt the + * guest DR6 by setting a breakpoint after this point and then + * single stepping. + */ + rflags = read_rflags(); + vmxctx->host_tf = rflags & PSL_T; + write_rflags(rflags & ~PSL_T); + + /* Save host debug registers. */ + vmxctx->host_dr0 = rdr0(); + vmxctx->host_dr1 = rdr1(); + vmxctx->host_dr2 = rdr2(); + vmxctx->host_dr3 = rdr3(); + vmxctx->host_dr6 = rdr6(); + + /* Restore guest debug registers. */ + load_dr0(vmxctx->guest_dr0); + load_dr1(vmxctx->guest_dr1); + load_dr2(vmxctx->guest_dr2); + load_dr3(vmxctx->guest_dr3); + load_dr6(vmxctx->guest_dr6); +} + +static __inline void +vmx_dr_leave_guest(struct vmxctx *vmxctx) +{ + + /* Save guest debug registers. 
*/ + vmxctx->guest_dr0 = rdr0(); + vmxctx->guest_dr1 = rdr1(); + vmxctx->guest_dr2 = rdr2(); + vmxctx->guest_dr3 = rdr3(); + vmxctx->guest_dr6 = rdr6(); + + /* + * Restore host debug registers. Restore DR7, DEBUGCTL, and + * PSL_T last. + */ + load_dr0(vmxctx->host_dr0); + load_dr1(vmxctx->host_dr1); + load_dr2(vmxctx->host_dr2); + load_dr3(vmxctx->host_dr3); + load_dr6(vmxctx->host_dr6); + wrmsr(MSR_DEBUGCTLMSR, vmxctx->host_debugctl); + load_dr7(vmxctx->host_dr7); + write_rflags(read_rflags() | vmxctx->host_tf); } static int -vmx_run(void *arg, int vcpu, register_t rip) +vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap, + struct vm_eventinfo *evinfo) { - int error, vie, rc, handled, astpending; - uint32_t exit_reason; + int rc, handled, launched; struct vmx *vmx; struct vm *vm; struct vmxctx *vmxctx; struct vmcs *vmcs; struct vm_exit *vmexit; struct vlapic *vlapic; - + uint32_t exit_reason; +#ifdef __FreeBSD__ + struct region_descriptor gdtr, idtr; + uint16_t ldt_sel; +#endif + vmx = arg; vm = vmx->vm; vmcs = &vmx->vmcs[vcpu]; vmxctx = &vmx->ctx[vcpu]; vlapic = vm_lapic(vm, vcpu); - vmxctx->launched = 0; + vmexit = vm_exitinfo(vm, vcpu); + launched = 0; - astpending = 0; - vmexit = vm_exitinfo(vmx->vm, vcpu); + KASSERT(vmxctx->pmap == pmap, + ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap)); vmx_msr_guest_enter(vmx, vcpu); VMPTRLD(vmcs); +#ifndef __FreeBSD__ + VERIFY(vmx->vmcs_state[vcpu] == VS_NONE && curthread->t_preempt != 0); + vmx->vmcs_state[vcpu] = VS_LOADED; +#endif + /* * XXX * We do this every time because we may setup the virtual machine * from a different process than the one that actually runs it. * * If the life of a virtual machine was spent entirely in the context - * of a single process we could do this once in vmcs_set_defaults(). + * of a single process we could do this once in vmx_vminit(). */ vmcs_write(VMCS_HOST_CR3, rcr3()); vmcs_write(VMCS_GUEST_RIP, rip); - vmx_set_pcpu_defaults(vmx, vcpu); + vmx_set_pcpu_defaults(vmx, vcpu, pmap); do { - vmx_inject_interrupts(vmx, vcpu, vlapic); - vmx_run_trace(vmx, vcpu); - rc = vmx_setjmp(vmxctx); -#ifdef SETJMP_TRACE - vmx_setjmp_trace(vmx, vcpu, vmxctx, rc); -#endif - switch (rc) { - case VMX_RETURN_DIRECT: - if (vmxctx->launched == 0) { - vmxctx->launched = 1; - vmx_launch(vmxctx); - } else - vmx_resume(vmxctx); - panic("vmx_launch/resume should not return"); + KASSERT(vmcs_guest_rip() == rip, ("%s: vmcs guest rip mismatch " + "%#lx/%#lx", __func__, vmcs_guest_rip(), rip)); + + handled = UNHANDLED; + /* + * Interrupts are disabled from this point on until the + * guest starts executing. This is done for the following + * reasons: + * + * If an AST is asserted on this thread after the check below, + * then the IPI_AST notification will not be lost, because it + * will cause a VM exit due to external interrupt as soon as + * the guest state is loaded. + * + * A posted interrupt after 'vmx_inject_interrupts()' will + * not be "lost" because it will be held pending in the host + * APIC because interrupts are disabled. The pending interrupt + * will be recognized as soon as the guest state is loaded. + * + * The same reasoning applies to the IPI generated by + * pmap_invalidate_ept(). + */ +#ifdef __FreeBSD__ + disable_intr(); + vmx_inject_interrupts(vmx, vcpu, vlapic, rip); +#else + /* + * The bulk of guest interrupt injection is done without + * interrupts disabled on the host CPU. This is necessary + * since contended mutexes might force the thread to sleep. 
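
Condensing the reasoning above into control flow: injection happens while host interrupts are still enabled (it can block on contended mutexes), and only then does the thread commit with interrupts disabled, so a late AST or posted-interrupt IPI stays pending and forces an immediate external-interrupt VM exit rather than being lost. A sketch of the ordering only:

    vmx_inject_interrupts(vmx, vcpu, vlapic, rip);  /* may sleep */
    disable_intr();                                 /* commit point */
    if (vcpu_suspended(evinfo)) {
            enable_intr();          /* every bail-out re-enables first */
            vm_exit_suspended(vmx->vm, vcpu, rip);
    }
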
+ */ + vmx_inject_interrupts(vmx, vcpu, vlapic, rip); + disable_intr(); + if (virtual_interrupt_delivery) { + vmx_inject_pir(vlapic); + } +#endif /* __FreeBSD__ */ + + /* + * Check for vcpu suspension after injecting events because + * vmx_inject_interrupts() can suspend the vcpu due to a + * triple fault. + */ + if (vcpu_suspended(evinfo)) { + enable_intr(); + vm_exit_suspended(vmx->vm, vcpu, rip); + break; + } + + if (vcpu_runblocked(evinfo)) { + enable_intr(); + vm_exit_runblock(vmx->vm, vcpu, rip); + break; + } + + if (vcpu_reqidle(evinfo)) { + enable_intr(); + vm_exit_reqidle(vmx->vm, vcpu, rip); break; - case VMX_RETURN_LONGJMP: - break; /* vm exit */ - case VMX_RETURN_AST: - astpending = 1; + } + + if (vcpu_should_yield(vm, vcpu)) { + enable_intr(); + vm_exit_astpending(vmx->vm, vcpu, rip); + vmx_astpending_trace(vmx, vcpu, rip); + handled = HANDLED; break; - case VMX_RETURN_VMRESUME: - vie = vmcs_instruction_error(); - if (vmxctx->launch_error == VM_FAIL_INVALID || - vie != VMRESUME_WITH_NON_LAUNCHED_VMCS) { - printf("vmresume error %d vmcs inst error %d\n", - vmxctx->launch_error, vie); - goto err_exit; + } + + if (vcpu_debugged(vm, vcpu)) { + enable_intr(); + vm_exit_debug(vmx->vm, vcpu, rip); + break; + } + +#ifndef __FreeBSD__ + if ((rc = smt_acquire()) != 1) { + enable_intr(); + vmexit->rip = rip; + vmexit->inst_length = 0; + if (rc == -1) { + vmexit->exitcode = VM_EXITCODE_HT; + } else { + vmexit->exitcode = VM_EXITCODE_BOGUS; + handled = HANDLED; } - vmx_launch(vmxctx); /* try to launch the guest */ - panic("vmx_launch should not return"); break; - case VMX_RETURN_VMLAUNCH: - vie = vmcs_instruction_error(); -#if 1 - printf("vmlaunch error %d vmcs inst error %d\n", - vmxctx->launch_error, vie); -#endif - goto err_exit; - default: - panic("vmx_setjmp returned %d", rc); } - - /* collect some basic information for VM exit processing */ + + /* + * If this thread has gone off-cpu due to mutex operations + * during vmx_run, the VMCS will have been unloaded, forcing a + * re-VMLAUNCH as opposed to VMRESUME. + */ + launched = (vmx->vmcs_state[vcpu] & VS_LAUNCHED) != 0; + /* + * Restoration of the GDT limit is taken care of by + * vmx_savectx(). Since the maximum practical index for the + * IDT is 255, restoring its limits from the post-VMX-exit + * default of 0xffff is not a concern. + * + * Only 64-bit hypervisor callers are allowed, which forgoes + * the need to restore any LDT descriptor. Toss an error to + * anyone attempting to break that rule. + */ + if (curproc->p_model != DATAMODEL_LP64) { + smt_release(); + enable_intr(); + bzero(vmexit, sizeof (*vmexit)); + vmexit->rip = rip; + vmexit->exitcode = VM_EXITCODE_VMX; + vmexit->u.vmx.status = VM_FAIL_INVALID; + handled = UNHANDLED; + break; + } +#else + /* + * VM exits restore the base address but not the + * limits of GDTR and IDTR. The VMCS only stores the + * base address, so VM exits set the limits to 0xffff. + * Save and restore the full GDTR and IDTR to restore + * the limits. + * + * The VMCS does not save the LDTR at all, and VM + * exits clear LDTR as if a NULL selector were loaded. + * The userspace hypervisor probably doesn't use a + * LDT, but save and restore it to be safe. 
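
A quick worked check of the IDTR-limit claim above: a full 64-bit IDT holds 256 gate descriptors of 16 bytes each, so the largest limit it can ever need is 0xfff, safely below the post-VMX-exit default of 0xffff. The macro names here are illustrative:

    #define IDT_GATE_SIZE   16      /* 64-bit gate descriptor */
    #define IDT_NENTRIES    256     /* vectors 0 through 255 */
    ASSERT3U(IDT_NENTRIES * IDT_GATE_SIZE - 1, ==, 0xfff);
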
+ */ + sgdt(&gdtr); + sidt(&idtr); + ldt_sel = sldt(); +#endif + + vmx_run_trace(vmx, vcpu); + vmx_dr_enter_guest(vmxctx); + rc = vmx_enter_guest(vmxctx, vmx, launched); + vmx_dr_leave_guest(vmxctx); + +#ifndef __FreeBSD__ + vmx->vmcs_state[vcpu] |= VS_LAUNCHED; + smt_release(); +#else + bare_lgdt(&gdtr); + lidt(&idtr); + lldt(ldt_sel); +#endif + + /* Collect some information for VM exit processing */ vmexit->rip = rip = vmcs_guest_rip(); vmexit->inst_length = vmexit_instruction_length(); vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); @@ -2176,21 +3434,19 @@ vmx_run(void *arg, int vcpu, register_t rip) /* Update 'nextrip' */ vmx->state[vcpu].nextrip = rip; - /* enable interrupts */ - enable_intr(); - - if (astpending) { - handled = 1; - vmexit->inst_length = 0; - vmexit->exitcode = VM_EXITCODE_BOGUS; - vmx_astpending_trace(vmx, vcpu, rip); - vmm_stat_incr(vmx->vm, vcpu, VMEXIT_ASTPENDING, 1); - break; + if (rc == VMX_GUEST_VMEXIT) { + vmx_exit_handle_nmi(vmx, vcpu, vmexit); + enable_intr(); + handled = vmx_exit_process(vmx, vcpu, vmexit); + } else { + enable_intr(); + vmx_exit_inst_error(vmxctx, rc, vmexit); } - - handled = vmx_exit_process(vmx, vcpu, vmexit); +#ifdef __FreeBSD__ + launched = 1; +#endif vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled); - + rip = vmexit->rip; } while (handled); /* @@ -2204,44 +3460,36 @@ vmx_run(void *arg, int vcpu, register_t rip) } if (!handled) - vmm_stat_incr(vmx->vm, vcpu, VMEXIT_USERSPACE, 1); + vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1); - VCPU_CTR1(vmx->vm, vcpu, "goto userland: exitcode %d", + VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d", vmexit->exitcode); VMCLEAR(vmcs); vmx_msr_guest_exit(vmx, vcpu); - return (0); - -err_exit: - vmexit->exitcode = VM_EXITCODE_VMX; - vmexit->u.vmx.exit_reason = (uint32_t)-1; - vmexit->u.vmx.exit_qualification = (uint32_t)-1; - vmexit->u.vmx.status = ~0; - VMCLEAR(vmcs); - vmx_msr_guest_exit(vmx, vcpu); +#ifndef __FreeBSD__ + VERIFY(vmx->vmcs_state != VS_NONE && curthread->t_preempt != 0); + vmx->vmcs_state[vcpu] = VS_NONE; +#endif - return (ENOEXEC); + return (0); } static void vmx_vmcleanup(void *arg) { - int i, error; + int i; struct vmx *vmx = arg; + uint16_t maxcpus; - for (i = 0; i < VM_MAXCPU; i++) - vpid_free(vmx->state[i].vpid); + if (apic_access_virtualization(vmx, 0)) + vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); - /* - * XXXSMP we also need to clear the VMCS active on the other vcpus. - */ - error = vmclear(&vmx->vmcs[0]); - if (error != 0) - panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error); + maxcpus = vm_get_maxcpus(vmx->vm); + for (i = 0; i < maxcpus; i++) + vpid_free(vmx->state[i].vpid); - ept_vmcleanup(vmx); free(vmx, M_VMX); return; @@ -2284,6 +3532,16 @@ vmxctx_regptr(struct vmxctx *vmxctx, int reg) return (&vmxctx->guest_r15); case VM_REG_GUEST_CR2: return (&vmxctx->guest_cr2); + case VM_REG_GUEST_DR0: + return (&vmxctx->guest_dr0); + case VM_REG_GUEST_DR1: + return (&vmxctx->guest_dr1); + case VM_REG_GUEST_DR2: + return (&vmxctx->guest_dr2); + case VM_REG_GUEST_DR3: + return (&vmxctx->guest_dr3); + case VM_REG_GUEST_DR6: + return (&vmxctx->guest_dr6); default: break; } @@ -2314,6 +3572,46 @@ vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val) return (EINVAL); } +static int +vmx_get_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t *retval) +{ + uint64_t gi; + int error; + + error = vmcs_getreg(&vmx->vmcs[vcpu], running, + VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY), &gi); + *retval = (gi & HWINTR_BLOCKING) ? 
1 : 0; + return (error); +} + +static int +vmx_modify_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t val) +{ + struct vmcs *vmcs; + uint64_t gi; + int error, ident; + + /* + * Forcing the vcpu into an interrupt shadow is not supported. + */ + if (val) { + error = EINVAL; + goto done; + } + + vmcs = &vmx->vmcs[vcpu]; + ident = VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY); + error = vmcs_getreg(vmcs, running, ident, &gi); + if (error == 0) { + gi &= ~HWINTR_BLOCKING; + error = vmcs_setreg(vmcs, running, ident, gi); + } +done: + VCPU_CTR2(vmx->vm, vcpu, "Setting intr_shadow to %#lx %s", val, + error ? "failed" : "succeeded"); + return (error); +} + static int vmx_shadow_reg(int reg) { @@ -2324,8 +3622,8 @@ vmx_shadow_reg(int reg) switch (reg) { case VM_REG_GUEST_CR0: shreg = VMCS_CR0_SHADOW; - break; - case VM_REG_GUEST_CR4: + break; + case VM_REG_GUEST_CR4: shreg = VMCS_CR4_SHADOW; break; default: @@ -2345,6 +3643,9 @@ vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval) if (running && hostcpu != curcpu) panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu); + if (reg == VM_REG_GUEST_INTR_SHADOW) + return (vmx_get_intr_shadow(vmx, vcpu, running, retval)); + if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0) return (0); @@ -2356,12 +3657,16 @@ vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) { int error, hostcpu, running, shadow; uint64_t ctls; + pmap_t pmap; struct vmx *vmx = arg; running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); if (running && hostcpu != curcpu) panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu); + if (reg == VM_REG_GUEST_INTR_SHADOW) + return (vmx_modify_intr_shadow(vmx, vcpu, running, val)); + if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0) return (0); @@ -2389,10 +3694,22 @@ vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) if (shadow > 0) { /* * Store the unmodified value in the shadow - */ + */ error = vmcs_setreg(&vmx->vmcs[vcpu], running, VMCS_IDENT(shadow), val); } + + if (reg == VM_REG_GUEST_CR3) { + /* + * Invalidate the guest vcpu's TLB mappings to emulate + * the behavior of updating %cr3. + * + * XXX the processor retains global mappings when %cr3 + * is updated but vmx_invvpid() does not. 
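
The %cr3 hunk above leans on vmx_invvpid() to drop the vcpu's tagged TLB entries. A hedged sketch of the single-context invalidation involved; INVVPID_TYPE_SINGLE_CONTEXT is defined in vmx_cpufunc.h within this patch, while the descriptor field name is assumed from upstream bhyve:

    struct invvpid_desc desc = { 0 };
    desc.vpid = vmx->state[vcpu].vpid;      /* field name assumed */
    invvpid(INVVPID_TYPE_SINGLE_CONTEXT, desc);
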
+ */ + pmap = vmx->ctx[vcpu].pmap; + vmx_invvpid(vmx, vcpu, pmap, running); + } } return (error); @@ -2452,6 +3769,10 @@ vmx_getcap(void *arg, int vcpu, int type, int *retval) if (cap_unrestricted_guest) ret = 0; break; + case VM_CAP_ENABLE_INVPCID: + if (cap_invpcid) + ret = 0; + break; default: break; } @@ -2508,11 +3829,21 @@ vmx_setcap(void *arg, int vcpu, int type, int val) case VM_CAP_UNRESTRICTED_GUEST: if (cap_unrestricted_guest) { retval = 0; - baseval = procbased_ctls2; + pptr = &vmx->cap[vcpu].proc_ctls2; + baseval = *pptr; flag = PROCBASED2_UNRESTRICTED_GUEST; reg = VMCS_SEC_PROC_BASED_CTLS; } break; + case VM_CAP_ENABLE_INVPCID: + if (cap_invpcid) { + retval = 0; + pptr = &vmx->cap[vcpu].proc_ctls2; + baseval = *pptr; + flag = PROCBASED2_ENABLE_INVPCID; + reg = VMCS_SEC_PROC_BASED_CTLS; + } + break; default: break; } @@ -2546,15 +3877,18 @@ vmx_setcap(void *arg, int vcpu, int type, int val) } } - return (retval); + return (retval); } struct vlapic_vtx { struct vlapic vlapic; struct pir_desc *pir_desc; struct vmx *vmx; + u_int pending_prio; }; +#define VPR_PRIO_BIT(vpr) (1 << ((vpr) >> 4)) + #define VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg) \ do { \ VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d", \ @@ -2576,7 +3910,7 @@ vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level) struct vlapic_vtx *vlapic_vtx; struct pir_desc *pir_desc; uint64_t mask; - int idx, notify; + int idx, notify = 0; vlapic_vtx = (struct vlapic_vtx *)vlapic; pir_desc = vlapic_vtx->pir_desc; @@ -2589,7 +3923,37 @@ vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level) idx = vector / 64; mask = 1UL << (vector % 64); atomic_set_long(&pir_desc->pir[idx], mask); - notify = atomic_cmpset_long(&pir_desc->pending, 0, 1); + + /* + * A notification is required whenever the 'pending' bit makes a + * transition from 0->1. + * + * Even if the 'pending' bit is already asserted, notification about + * the incoming interrupt may still be necessary. For example, if a + * vCPU is HLTed with a high PPR, a low priority interrupt would cause + * the 0->1 'pending' transition with a notification, but the vCPU + * would ignore the interrupt for the time being. The same vCPU would + * need to then be notified if a high-priority interrupt arrived which + * satisfied the PPR. + * + * The priorities of interrupts injected while 'pending' is asserted + * are tracked in a custom bitfield 'pending_prio'. Should the + * to-be-injected interrupt exceed the priorities already present, the + * notification is sent. The priorities recorded in 'pending_prio' are + * cleared whenever the 'pending' bit makes another 0->1 transition. + */ + if (atomic_cmpset_long(&pir_desc->pending, 0, 1) != 0) { + notify = 1; + vlapic_vtx->pending_prio = 0; + } else { + const u_int old_prio = vlapic_vtx->pending_prio; + const u_int prio_bit = VPR_PRIO_BIT(vector & APIC_TPR_INT); + + if ((old_prio & prio_bit) == 0 && prio_bit > old_prio) { + atomic_set_int(&vlapic_vtx->pending_prio, prio_bit); + notify = 1; + } + } VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector, level, "vmx_set_intr_ready"); @@ -2616,8 +3980,27 @@ vmx_pending_intr(struct vlapic *vlapic, int *vecptr) pir_desc = vlapic_vtx->pir_desc; pending = atomic_load_acq_long(&pir_desc->pending); - if (!pending) - return (0); /* common case */ + if (!pending) { + /* + * While a virtual interrupt may have already been + * processed the actual delivery maybe pending the + * interruptibility of the guest. 
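
Worked examples of the VPR_PRIO_BIT() bookkeeping above, which collapses vectors into one bit per 16-vector priority class (APIC_TPR_INT masks the class nibble, taking over from the literal 0xf0 used previously):

    /* vector 0x2c -> priority class 2 -> bit 0x004 */
    ASSERT3U(VPR_PRIO_BIT(0x2c & APIC_TPR_INT), ==, 0x004);
    /* vector 0x81 -> priority class 8 -> bit 0x100 */
    ASSERT3U(VPR_PRIO_BIT(0x81 & APIC_TPR_INT), ==, 0x100);
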
Recognize a pending + * interrupt by reevaluating virtual interrupts + * following Section 29.2.1 in the Intel SDM Volume 3. + */ + struct vm_exit *vmexit; + uint8_t rvi, ppr; + + vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid); + rvi = vmexit->u.hlt.intr_status & APIC_TPR_INT; + lapic = vlapic->apic_page; + ppr = lapic->ppr & APIC_TPR_INT; + if (rvi > ppr) { + return (1); + } + + return (0); + } /* * If there is an interrupt pending then it will be recognized only @@ -2627,21 +4010,38 @@ vmx_pending_intr(struct vlapic *vlapic, int *vecptr) * interrupt will be recognized. */ lapic = vlapic->apic_page; - ppr = lapic->ppr & 0xf0; + ppr = lapic->ppr & APIC_TPR_INT; if (ppr == 0) return (1); VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d", lapic->ppr); + vpr = 0; for (i = 3; i >= 0; i--) { pirval = pir_desc->pir[i]; if (pirval != 0) { - vpr = (i * 64 + flsl(pirval) - 1) & 0xf0; - return (vpr > ppr); + vpr = (i * 64 + flsl(pirval) - 1) & APIC_TPR_INT; + break; } } - return (0); + + /* + * If the highest-priority pending interrupt falls short of the + * processor priority of this vCPU, ensure that 'pending_prio' does not + * have any stale bits which would preclude a higher-priority interrupt + * from incurring a notification later. + */ + if (vpr <= ppr) { + const u_int prio_bit = VPR_PRIO_BIT(vpr); + const u_int old = vlapic_vtx->pending_prio; + + if (old > prio_bit && (old & prio_bit) == 0) { + vlapic_vtx->pending_prio = prio_bit; + } + return (0); + } + return (1); } static void @@ -2652,37 +4052,65 @@ vmx_intr_accepted(struct vlapic *vlapic, int vector) } static void -vmx_set_tmr(struct vlapic *vlapic, int vector, bool level) +vmx_set_tmr(struct vlapic *vlapic, const uint32_t *masks) +{ + vmcs_write(VMCS_EOI_EXIT0, ((uint64_t)masks[1] << 32) | masks[0]); + vmcs_write(VMCS_EOI_EXIT1, ((uint64_t)masks[3] << 32) | masks[2]); + vmcs_write(VMCS_EOI_EXIT2, ((uint64_t)masks[5] << 32) | masks[4]); + vmcs_write(VMCS_EOI_EXIT3, ((uint64_t)masks[7] << 32) | masks[6]); +} + +static void +vmx_enable_x2apic_mode(struct vlapic *vlapic) { - struct vlapic_vtx *vlapic_vtx; struct vmx *vmx; struct vmcs *vmcs; - uint64_t mask, val; + uint32_t proc_ctls2; + int vcpuid, error; - KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); - KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL), - ("vmx_set_tmr: vcpu cannot be running")); + vcpuid = vlapic->vcpuid; + vmx = ((struct vlapic_vtx *)vlapic)->vmx; + vmcs = &vmx->vmcs[vcpuid]; - vlapic_vtx = (struct vlapic_vtx *)vlapic; - vmx = vlapic_vtx->vmx; - vmcs = &vmx->vmcs[vlapic->vcpuid]; - mask = 1UL << (vector % 64); + proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; + KASSERT((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0, + ("%s: invalid proc_ctls2 %#x", __func__, proc_ctls2)); + + proc_ctls2 &= ~PROCBASED2_VIRTUALIZE_APIC_ACCESSES; + proc_ctls2 |= PROCBASED2_VIRTUALIZE_X2APIC_MODE; + vmx->cap[vcpuid].proc_ctls2 = proc_ctls2; VMPTRLD(vmcs); - val = vmcs_read(VMCS_EOI_EXIT(vector)); - if (level) - val |= mask; - else - val &= ~mask; - vmcs_write(VMCS_EOI_EXIT(vector), val); + vmcs_write(VMCS_SEC_PROC_BASED_CTLS, proc_ctls2); VMCLEAR(vmcs); + + if (vlapic->vcpuid == 0) { + /* + * The nested page table mappings are shared by all vcpus + * so unmap the APIC access page just once. + */ + error = vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); + KASSERT(error == 0, ("%s: vm_unmap_mmio error %d", + __func__, error)); + + /* + * The MSR bitmap is shared by all vcpus so modify it only + * once in the context of vcpu 0. 
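
Distilled, the HLT re-evaluation above is the SDM's pending-virtual-interrupt test: an interrupt is recognizable only when the requesting vector's priority class strictly exceeds the processor-priority class. A sketch of the comparison alone:

    uint8_t rvi = vmexit->u.hlt.intr_status & APIC_TPR_INT; /* RVI[7:4] */
    uint8_t ppr = lapic->ppr & APIC_TPR_INT;                /* PPR[7:4] */
    int deliverable = (rvi > ppr);          /* SDM Vol. 3, 29.2.1 */
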
+ */ + error = vmx_allow_x2apic_msrs(vmx); + KASSERT(error == 0, ("%s: vmx_allow_x2apic_msrs error %d", + __func__, error)); + } } static void vmx_post_intr(struct vlapic *vlapic, int hostcpu) { - +#ifdef __FreeBSD__ ipi_cpu(hostcpu, pirvec); +#else + psm_send_pir_ipi(hostcpu); +#endif } /* @@ -2785,7 +4213,7 @@ vmx_vlapic_init(void *arg, int vcpuid) struct vmx *vmx; struct vlapic *vlapic; struct vlapic_vtx *vlapic_vtx; - + vmx = arg; vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO); @@ -2802,9 +4230,7 @@ vmx_vlapic_init(void *arg, int vcpuid) vlapic->ops.pending_intr = vmx_pending_intr; vlapic->ops.intr_accepted = vmx_intr_accepted; vlapic->ops.set_tmr = vmx_set_tmr; -#ifdef __FreeBSD__ vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode; -#endif } if (posted_interrupts) @@ -2823,20 +4249,129 @@ vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic) free(vlapic, M_VLAPIC); } +#ifndef __FreeBSD__ +static void +vmx_savectx(void *arg, int vcpu) +{ + struct vmx *vmx = arg; + struct vmcs *vmcs = &vmx->vmcs[vcpu]; + + if ((vmx->vmcs_state[vcpu] & VS_LOADED) != 0) { + VERIFY3U(vmclear(vmcs), ==, 0); + vmx_msr_guest_exit(vmx, vcpu); + /* + * Having VMCLEARed the VMCS, it can no longer be re-entered + * with VMRESUME, but must be VMLAUNCHed again. + */ + vmx->vmcs_state[vcpu] &= ~VS_LAUNCHED; + } + + reset_gdtr_limit(); +} + +static void +vmx_restorectx(void *arg, int vcpu) +{ + struct vmx *vmx = arg; + struct vmcs *vmcs = &vmx->vmcs[vcpu]; + + ASSERT0(vmx->vmcs_state[vcpu] & VS_LAUNCHED); + + if ((vmx->vmcs_state[vcpu] & VS_LOADED) != 0) { + vmx_msr_guest_enter(vmx, vcpu); + VERIFY3U(vmptrld(vmcs), ==, 0); + } +} +#endif /* __FreeBSD__ */ + struct vmm_ops vmm_ops_intel = { vmx_init, vmx_cleanup, + vmx_restore, vmx_vminit, vmx_run, vmx_vmcleanup, - ept_vmmmap_set, - ept_vmmmap_get, vmx_getreg, vmx_setreg, vmx_getdesc, vmx_setdesc, vmx_getcap, vmx_setcap, + ept_vmspace_alloc, + ept_vmspace_free, vmx_vlapic_init, vmx_vlapic_cleanup, + +#ifndef __FreeBSD__ + vmx_savectx, + vmx_restorectx, +#endif }; + +#ifndef __FreeBSD__ +/* Side-effect free HW validation derived from checks in vmx_init. 
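
The savectx/restorectx callbacks above bracket illumos context switches. A sketch of the pairing and its effect on the VMCS launch state; only the ordering is the point here:

    vmx_savectx(vmx, vcpu);    /* going off-cpu: VMCLEAR + save guest
                                * MSRs; VS_LAUNCHED is cleared, so the
                                * next entry must VMLAUNCH, not VMRESUME */
    /* ... migration or sleep ... */
    vmx_restorectx(vmx, vcpu); /* back on-cpu: load guest MSRs + VMPTRLD */
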
*/ +int +vmx_x86_supported(const char **msg) +{ + int error; + uint32_t tmp; + + ASSERT(msg != NULL); + + /* Check support for primary processor-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_CTLS_ONE_SETTING, + PROCBASED_CTLS_ZERO_SETTING, &tmp); + if (error) { + *msg = "processor does not support desired primary " + "processor-based controls"; + return (error); + } + + /* Check support for secondary processor-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, + MSR_VMX_PROCBASED_CTLS2, PROCBASED_CTLS2_ONE_SETTING, + PROCBASED_CTLS2_ZERO_SETTING, &tmp); + if (error) { + *msg = "processor does not support desired secondary " + "processor-based controls"; + return (error); + } + + /* Check support for pin-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, + MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_CTLS_ONE_SETTING, + PINBASED_CTLS_ZERO_SETTING, &tmp); + if (error) { + *msg = "processor does not support desired pin-based controls"; + return (error); + } + + /* Check support for VM-exit controls */ + error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS, + VM_EXIT_CTLS_ONE_SETTING, VM_EXIT_CTLS_ZERO_SETTING, &tmp); + if (error) { + *msg = "processor does not support desired exit controls"; + return (error); + } + + /* Check support for VM-entry controls */ + error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS, + VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING, &tmp); + if (error) { + *msg = "processor does not support desired entry controls"; + return (error); + } + + /* Unrestricted guest is nominally optional, but not for us. */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, + PROCBASED2_UNRESTRICTED_GUEST, 0, &tmp); + if (error) { + *msg = "processor does not support desired unrestricted guest " + "controls"; + return (error); + } + + return (0); +} +#endif diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx.h b/usr/src/uts/i86pc/io/vmm/intel/vmx.h index 50ca62b371..2d16799bdd 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmx.h +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,11 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/intel/vmx.h 284174 2015-06-09 00:14:47Z tychon $ + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. 
*/ #ifndef _VMX_H_ @@ -31,15 +37,9 @@ #include "vmcs.h" -#ifndef __FreeBSD__ -#define GUEST_MSR_MAX_ENTRIES 64 /* arbitrary */ -#define HOST_MSR_MAX_ENTRIES 64 /* arbitrary */ -#endif +struct pmap; struct vmxctx { - register_t tmpstk[32]; /* vmx_return() stack */ - register_t tmpstktop; - register_t guest_rdi; /* Guest state */ register_t guest_rsi; register_t guest_rdx; @@ -56,7 +56,13 @@ struct vmxctx { register_t guest_r14; register_t guest_r15; register_t guest_cr2; + register_t guest_dr0; + register_t guest_dr1; + register_t guest_dr2; + register_t guest_dr3; + register_t guest_dr6; +#ifdef __FreeBSD__ register_t host_r15; /* Host state */ register_t host_r14; register_t host_r13; @@ -64,13 +70,24 @@ struct vmxctx { register_t host_rbp; register_t host_rsp; register_t host_rbx; - register_t host_rip; +#endif /* __FreeBSD__ */ + + register_t host_dr0; + register_t host_dr1; + register_t host_dr2; + register_t host_dr3; + register_t host_dr6; + register_t host_dr7; + uint64_t host_debugctl; + int host_tf; + + int inst_fail_status; + /* - * XXX todo debug registers and fpu state + * The pmap needs to be deactivated in vmx_enter_guest() + * so keep a copy of the 'pmap' in each vmxctx. */ - - int launched; /* vmcs launch state */ - int launch_error; + struct pmap *pmap; }; struct vmxcap { @@ -105,52 +122,55 @@ enum { IDX_MSR_STAR, IDX_MSR_SF_MASK, IDX_MSR_KGSBASE, + IDX_MSR_PAT, GUEST_MSR_NUM /* must be the last enumeration */ }; +#ifndef __FreeBSD__ +typedef enum { + VS_NONE = 0x0, + VS_LAUNCHED = 0x1, + VS_LOADED = 0x2 +} vmcs_state_t; +#endif /* __FreeBSD__ */ + /* virtual machine softc */ struct vmx { - pml4_entry_t pml4ept[NPML4EPG]; struct vmcs vmcs[VM_MAXCPU]; /* one vmcs per virtual cpu */ struct apic_page apic_page[VM_MAXCPU]; /* one apic page per vcpu */ char msr_bitmap[PAGE_SIZE]; struct pir_desc pir_desc[VM_MAXCPU]; -#ifdef __FreeBSD__ uint64_t guest_msrs[VM_MAXCPU][GUEST_MSR_NUM]; -#else - struct msr_entry guest_msrs[VM_MAXCPU][GUEST_MSR_MAX_ENTRIES]; - struct msr_entry host_msrs[VM_MAXCPU][HOST_MSR_MAX_ENTRIES]; +#ifndef __FreeBSD__ + uint64_t host_msrs[VM_MAXCPU][GUEST_MSR_NUM]; + uint64_t tsc_offset_active[VM_MAXCPU]; + vmcs_state_t vmcs_state[VM_MAXCPU]; #endif struct vmxctx ctx[VM_MAXCPU]; struct vmxcap cap[VM_MAXCPU]; struct vmxstate state[VM_MAXCPU]; + uint64_t eptp; struct vm *vm; + long eptgen[MAXCPU]; /* cached pmap->pm_eptgen */ }; -CTASSERT((offsetof(struct vmx, pml4ept) & PAGE_MASK) == 0); CTASSERT((offsetof(struct vmx, vmcs) & PAGE_MASK) == 0); CTASSERT((offsetof(struct vmx, msr_bitmap) & PAGE_MASK) == 0); +CTASSERT((offsetof(struct vmx, pir_desc[0]) & 63) == 0); -#define VMX_RETURN_DIRECT 0 -#define VMX_RETURN_LONGJMP 1 -#define VMX_RETURN_VMRESUME 2 -#define VMX_RETURN_VMLAUNCH 3 -#define VMX_RETURN_AST 4 -/* - * vmx_setjmp() returns: - * - 0 when it returns directly - * - 1 when it returns from vmx_longjmp - * - 2 when it returns from vmx_resume (which would only be in the error case) - * - 3 when it returns from vmx_launch (which would only be in the error case) - * - 4 when it returns from vmx_resume or vmx_launch because of AST pending - */ -int vmx_setjmp(struct vmxctx *ctx); -void vmx_longjmp(void); /* returns via vmx_setjmp */ -void vmx_launch(struct vmxctx *ctx) __dead2; /* may return via vmx_setjmp */ -void vmx_resume(struct vmxctx *ctx) __dead2; /* may return via vmx_setjmp */ +#define VMX_GUEST_VMEXIT 0 +#define VMX_VMRESUME_ERROR 1 +#define VMX_VMLAUNCH_ERROR 2 +#define VMX_INVEPT_ERROR 3 +#define VMX_VMWRITE_ERROR 4 +int vmx_enter_guest(struct 
vmxctx *ctx, struct vmx *vmx, int launched); +void vmx_call_isr(uintptr_t entry); u_long vmx_fix_cr0(u_long cr0); u_long vmx_fix_cr4(u_long cr4); int vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset); +extern char vmx_exit_guest[]; +extern char vmx_exit_guest_flush_rsb[]; + #endif diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_controls.h b/usr/src/uts/i86pc/io/vmm/intel/vmx_controls.h index 08b1469f19..5408d129ad 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmx_controls.h +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_controls.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/intel/vmx_controls.h 260410 2014-01-07 21:04:49Z neel $ + * $FreeBSD$ */ #ifndef _VMX_CONTROLS_H_ diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_cpufunc.h b/usr/src/uts/i86pc/io/vmm/intel/vmx_cpufunc.h index 9513f6c70b..f0c5ba7691 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmx_cpufunc.h +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_cpufunc.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/intel/vmx_cpufunc.h 245678 2013-01-20 03:42:49Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,6 +38,7 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2014 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #ifndef _VMX_CPUFUNC_H_ @@ -71,7 +74,12 @@ vmxon(char *region) int error; uint64_t addr; +#ifdef __FreeBSD__ addr = vtophys(region); +#else + /* This is pre-translated in illumos */ + addr = (uint64_t)region; +#endif __asm __volatile("vmxon %[addr];" VMX_SET_ERROR_CODE : [error] "=r" (error) @@ -81,21 +89,7 @@ vmxon(char *region) return (error); } -/* returns 0 on success and non-zero on failure */ -static __inline int -vmxon_pa(vm_paddr_t addr) -{ - int error; - - __asm __volatile("vmxon %[addr];" - VMX_SET_ERROR_CODE - : [error] "=r" (error) - : [addr] "m" (*(uint64_t *)&addr) - : "memory"); - - return (error); -} - +#ifdef __FreeBSD__ /* returns 0 on success and non-zero on failure */ static __inline int vmclear(struct vmcs *vmcs) @@ -111,6 +105,7 @@ vmclear(struct vmcs *vmcs) : "memory"); return (error); } +#endif /* __FreeBSD__ */ static __inline void vmxoff(void) @@ -126,6 +121,7 @@ vmptrst(uint64_t *addr) __asm __volatile("vmptrst %[addr]" :: [addr]"m" (*addr) : "memory"); } +#ifdef __FreeBSD__ static __inline int vmptrld(struct vmcs *vmcs) { @@ -140,6 +136,7 @@ vmptrld(struct vmcs *vmcs) : "memory"); return (error); } +#endif /* __FreeBSD__ */ static __inline int vmwrite(uint64_t reg, uint64_t val) @@ -169,7 +166,8 @@ vmread(uint64_t r, uint64_t *addr) return (error); } -static void __inline +#ifdef __FreeBSD__ +static __inline void VMCLEAR(struct vmcs *vmcs) { int err; @@ -181,7 +179,7 @@ VMCLEAR(struct vmcs *vmcs) critical_exit(); } -static void __inline +static __inline void VMPTRLD(struct vmcs *vmcs) { int err; @@ -192,6 +190,7 @@ VMPTRLD(struct vmcs *vmcs) if (err != 0) panic("%s: vmptrld(%p) error %d", __func__, vmcs, err); } +#endif /* __FreeBSD__ */ #define INVVPID_TYPE_ADDRESS 0UL #define INVVPID_TYPE_SINGLE_CONTEXT 1UL @@ -205,7 +204,7 @@ struct invvpid_desc { }; CTASSERT(sizeof(struct invvpid_desc) 
== 16); -static void __inline +static __inline void invvpid(uint64_t type, struct invvpid_desc desc) { int error; @@ -228,7 +227,7 @@ struct invept_desc { }; CTASSERT(sizeof(struct invept_desc) == 16); -static void __inline +static __inline void invept(uint64_t type, struct invept_desc desc) { int error; diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.c b/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.c index 1ced311ca8..4a1a2cd358 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.c +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,26 +25,26 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/intel/vmx_msr.c 284174 2015-06-09 00:14:47Z tychon $ + * $FreeBSD$ + */ +/* + * Copyright 2017 Joyent, Inc. */ #include -__FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/vmx_msr.c 284174 2015-06-09 00:14:47Z tychon $"); +__FBSDID("$FreeBSD$"); #include #include -#include +#include #include #include #include +#include #include #include -#ifndef __FreeBSD__ -#include -#endif - #include "vmx.h" #include "vmx_msr.h" @@ -184,7 +186,9 @@ msr_bitmap_change_access(char *bitmap, u_int msr, int access) static uint64_t misc_enable; static uint64_t platform_info; static uint64_t turbo_ratio_limit; +#ifdef __FreeBSD__ static uint64_t host_msrs[GUEST_MSR_NUM]; +#endif /* __FreeBSD__ */ static bool nehalem_cpu(void) @@ -234,13 +238,33 @@ westmere_cpu(void) return (false); } +static bool +pat_valid(uint64_t val) +{ + int i, pa; + + /* + * From Intel SDM: Table "Memory Types That Can Be Encoded With PAT" + * + * Extract PA0 through PA7 and validate that each one encodes a + * valid memory type. + */ + for (i = 0; i < 8; i++) { + pa = (val >> (i * 8)) & 0xff; + if (pa == 2 || pa == 3 || pa >= 8) + return (false); + } + return (true); +} + void vmx_msr_init(void) { uint64_t bus_freq, ratio; int i; -#ifdef __FreeBSD__ +#ifdef __FreeBSD__ + /* XXXJOY: Do we want to do this caching? */ /* * It is safe to cache the values of the following MSRs because * they don't change based on curcpu, curproc or curthread. @@ -249,7 +273,7 @@ vmx_msr_init(void) host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR); host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); -#endif +#endif /* __FreeBSD__ */ /* * Initialize emulated MSRs @@ -308,6 +332,10 @@ vmx_msr_init(void) void vmx_msr_guest_init(struct vmx *vmx, int vcpuid) { + uint64_t *guest_msrs; + + guest_msrs = vmx->guest_msrs[vcpuid]; + /* * The permissions bitmap is shared between all vcpus so initialize it * once when initializing the vBSP. @@ -319,29 +347,55 @@ vmx_msr_guest_init(struct vmx *vmx, int vcpuid) guest_msr_rw(vmx, MSR_SF_MASK); guest_msr_rw(vmx, MSR_KGSBASE); } + + /* + * Initialize guest IA32_PAT MSR with default value after reset. 
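
A worked check of pat_valid() above against the power-on default PAT that vmx_msr_guest_init() installs (WB, WT, UC-, UC, repeated): each byte encodes type 6, 4, 7, or 0, all acceptable, while any byte encoding 2, 3, or 8 and above is rejected.

    uint64_t pat_default = 0x0007040600070406UL;    /* UC,UC-,WT,WB x2 */
    ASSERT(pat_valid(pat_default));
    ASSERT(!pat_valid(2));          /* PA0 = 2 is a reserved encoding */
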
+ */ + guest_msrs[IDX_MSR_PAT] = PAT_VALUE(0, PAT_WRITE_BACK) | + PAT_VALUE(1, PAT_WRITE_THROUGH) | + PAT_VALUE(2, PAT_UNCACHED) | + PAT_VALUE(3, PAT_UNCACHEABLE) | + PAT_VALUE(4, PAT_WRITE_BACK) | + PAT_VALUE(5, PAT_WRITE_THROUGH) | + PAT_VALUE(6, PAT_UNCACHED) | + PAT_VALUE(7, PAT_UNCACHEABLE); + return; } void vmx_msr_guest_enter(struct vmx *vmx, int vcpuid) { -#ifdef __FreeBSD__ uint64_t *guest_msrs = vmx->guest_msrs[vcpuid]; - /* Save host MSRs (if any) and restore guest MSRs */ +#ifndef __FreeBSD__ + uint64_t *host_msrs = vmx->host_msrs[vcpuid]; + + /* Save host MSRs */ + host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR); + host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR); + host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); + host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); +#endif /* __FreeBSD__ */ + + /* Save host MSRs (in particular, KGSBASE) and restore guest MSRs */ +#ifdef __FreeBSD__ + update_pcb_bases(curpcb); +#endif wrmsr(MSR_LSTAR, guest_msrs[IDX_MSR_LSTAR]); wrmsr(MSR_CSTAR, guest_msrs[IDX_MSR_CSTAR]); wrmsr(MSR_STAR, guest_msrs[IDX_MSR_STAR]); wrmsr(MSR_SF_MASK, guest_msrs[IDX_MSR_SF_MASK]); wrmsr(MSR_KGSBASE, guest_msrs[IDX_MSR_KGSBASE]); -#endif } void vmx_msr_guest_exit(struct vmx *vmx, int vcpuid) { -#ifdef __FreeBSD__ uint64_t *guest_msrs = vmx->guest_msrs[vcpuid]; +#ifndef __FreeBSD__ + uint64_t *host_msrs = vmx->host_msrs[vcpuid]; +#endif /* Save guest MSRs */ guest_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR); @@ -357,13 +411,16 @@ vmx_msr_guest_exit(struct vmx *vmx, int vcpuid) wrmsr(MSR_SF_MASK, host_msrs[IDX_MSR_SF_MASK]); /* MSR_KGSBASE will be restored on the way back to userspace */ -#endif } int vmx_rdmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t *val, bool *retu) { - int error = 0; + const uint64_t *guest_msrs; + int error; + + guest_msrs = vmx->guest_msrs[vcpuid]; + error = 0; switch (num) { case MSR_MCG_CAP: @@ -387,6 +444,9 @@ vmx_rdmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t *val, bool *retu) case MSR_TURBO_RATIO_LIMIT1: *val = turbo_ratio_limit; break; + case MSR_PAT: + *val = guest_msrs[IDX_MSR_PAT]; + break; default: error = EINVAL; break; @@ -397,10 +457,13 @@ vmx_rdmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t *val, bool *retu) int vmx_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu) { + uint64_t *guest_msrs; uint64_t changed; int error; + guest_msrs = vmx->guest_msrs[vcpuid]; error = 0; + switch (num) { case MSR_MCG_CAP: case MSR_MCG_STATUS: @@ -433,9 +496,17 @@ vmx_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu) error = EINVAL; break; + case MSR_PAT: + if (pat_valid(val)) + guest_msrs[IDX_MSR_PAT] = val; + else + vm_inject_gp(vmx->vm, vcpuid); + break; +#ifdef __FreeBSD__ case MSR_TSC: error = vmx_set_tsc_offset(vmx, vcpuid, val - rdtsc()); break; +#endif /* __FreeBSD__ */ default: error = EINVAL; break; diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.h b/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.h index 5300d14d9b..ac2adb0dd1 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.h +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $FreeBSD: head/sys/amd64/vmm/intel/vmx_msr.h 271888 2014-09-20 02:35:21Z neel $ + * $FreeBSD$ */ #ifndef _VMX_MSR_H_ diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s b/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s index d57dde1093..0130f88dd6 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s @@ -1,5 +1,6 @@ /*- * Copyright (c) 2011 NetApp, Inc. + * Copyright (c) 2013 Neel Natu * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -23,7 +24,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/intel/vmx_support.S 245678 2013-01-20 03:42:49Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,50 +37,41 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2013 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. */ -#include +#include +#include -#include "vmx_assym.s" +/* Porting note: This is named 'vmx_support.S' upstream. */ -/* - * Disable interrupts before updating %rsp in VMX_CHECK_AST or - * VMX_GUEST_RESTORE. - * - * The location that %rsp points to is a 'vmxctx' and not a - * real stack so we don't want an interrupt handler to trash it - */ -#define VMX_DISABLE_INTERRUPTS cli -/* - * If the thread hosting the vcpu has an ast pending then take care of it - * by returning from vmx_setjmp() with a return value of VMX_RETURN_AST. - * - * Assumes that %rdi holds a pointer to the 'vmxctx' and that interrupts - * are disabled. - */ -#ifdef __FreeBSD__ -#define VMX_CHECK_AST \ - movq PCPU(CURTHREAD),%rax; \ - testl $TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%rax); \ - je 9f; \ - movq $VMX_RETURN_AST,%rsi; \ - movq %rdi,%rsp; \ - addq $VMXCTX_TMPSTKTOP,%rsp; \ - callq vmx_return; \ -9: -#else -#define VMX_CHECK_AST \ - movq %gs:CPU_THREAD,%rax; \ - movl T_ASTFLAG(%rax),%eax; \ - test %al,%al; \ - je 9f; \ - movq $VMX_RETURN_AST,%rsi; \ - movq %rdi,%rsp; \ - addq $VMXCTX_TMPSTKTOP,%rsp; \ - callq vmx_return; \ -9: -#endif + +#if defined(lint) + +struct vmxctx; +struct vmx; + +/*ARGSUSED*/ +void +vmx_launch(struct vmxctx *ctx) +{} + +void +vmx_exit_guest() +{} + +/*ARGSUSED*/ +int +vmx_enter_guest(struct vmxctx *ctx, struct vmx *vmx, int launched) +{ + return (0); +} + +#else /* lint */ + +#include "vmx_assym.h" +#include "vmcs.h" /* * Assumes that %rdi holds a pointer to the 'vmxctx'. @@ -92,7 +84,6 @@ * host context in case of an error with 'vmlaunch' or 'vmresume'. */ #define VMX_GUEST_RESTORE \ - movq %rdi,%rsp; \ movq VMXCTX_GUEST_CR2(%rdi),%rsi; \ movq %rsi,%cr2; \ movq VMXCTX_GUEST_RSI(%rdi),%rsi; \ @@ -111,161 +102,283 @@ movq VMXCTX_GUEST_R15(%rdi),%r15; \ movq VMXCTX_GUEST_RDI(%rdi),%rdi; /* restore rdi the last */ -#define VM_INSTRUCTION_ERROR(reg) \ - jnc 1f; \ - movl $VM_FAIL_INVALID,reg; /* CF is set */ \ - jmp 3f; \ -1: jnz 2f; \ - movl $VM_FAIL_VALID,reg; /* ZF is set */ \ - jmp 3f; \ -2: movl $VM_SUCCESS,reg; \ -3: movl reg,VMXCTX_LAUNCH_ERROR(%rsp) - - .text -/* - * int vmx_setjmp(ctxp) - * %rdi = ctxp - * - * Return value is '0' when it returns directly from here. - * Return value is '1' when it returns after a vm exit through vmx_longjmp. 
- */ -ENTRY(vmx_setjmp) - movq (%rsp),%rax /* return address */ - movq %r15,VMXCTX_HOST_R15(%rdi) - movq %r14,VMXCTX_HOST_R14(%rdi) - movq %r13,VMXCTX_HOST_R13(%rdi) - movq %r12,VMXCTX_HOST_R12(%rdi) - movq %rbp,VMXCTX_HOST_RBP(%rdi) - movq %rsp,VMXCTX_HOST_RSP(%rdi) - movq %rbx,VMXCTX_HOST_RBX(%rdi) - movq %rax,VMXCTX_HOST_RIP(%rdi) +#define VMX_GUEST_SAVE \ + movq %rdi, VMXSTK_TMPRDI(%rsp); \ + movq VMXSTK_RDI(%rsp), %rdi; \ + movq %rbp, VMXCTX_GUEST_RBP(%rdi); \ + leaq VMXSTK_FP(%rsp), %rbp; \ + movq %rsi, VMXCTX_GUEST_RSI(%rdi); \ + movq %rdx, VMXCTX_GUEST_RDX(%rdi); \ + movq %rcx, VMXCTX_GUEST_RCX(%rdi); \ + movq %r8, VMXCTX_GUEST_R8(%rdi); \ + movq %r9, VMXCTX_GUEST_R9(%rdi); \ + movq %rax, VMXCTX_GUEST_RAX(%rdi); \ + movq %rbx, VMXCTX_GUEST_RBX(%rdi); \ + movq %r10, VMXCTX_GUEST_R10(%rdi); \ + movq %r11, VMXCTX_GUEST_R11(%rdi); \ + movq %r12, VMXCTX_GUEST_R12(%rdi); \ + movq %r13, VMXCTX_GUEST_R13(%rdi); \ + movq %r14, VMXCTX_GUEST_R14(%rdi); \ + movq %r15, VMXCTX_GUEST_R15(%rdi); \ + movq %cr2, %rbx; \ + movq %rbx, VMXCTX_GUEST_CR2(%rdi); \ + movq VMXSTK_TMPRDI(%rsp), %rdx; \ + movq %rdx, VMXCTX_GUEST_RDI(%rdi); - /* - * XXX save host debug registers - */ - movl $VMX_RETURN_DIRECT,%eax - ret -END(vmx_setjmp) /* - * void vmx_return(struct vmxctx *ctxp, int retval) - * %rdi = ctxp - * %rsi = retval - * Return to vmm context through vmx_setjmp() with a value of 'retval'. + * Flush scratch registers to avoid lingering guest state being used for + * Spectre v1 attacks when returning from guest entry. */ -ENTRY(vmx_return) - /* Restore host context. */ - movq VMXCTX_HOST_R15(%rdi),%r15 - movq VMXCTX_HOST_R14(%rdi),%r14 - movq VMXCTX_HOST_R13(%rdi),%r13 - movq VMXCTX_HOST_R12(%rdi),%r12 - movq VMXCTX_HOST_RBP(%rdi),%rbp - movq VMXCTX_HOST_RSP(%rdi),%rsp - movq VMXCTX_HOST_RBX(%rdi),%rbx - movq VMXCTX_HOST_RIP(%rdi),%rax - movq %rax,(%rsp) /* return address */ +#define VMX_GUEST_FLUSH_SCRATCH \ + xorl %edi, %edi; \ + xorl %esi, %esi; \ + xorl %edx, %edx; \ + xorl %ecx, %ecx; \ + xorl %r8d, %r8d; \ + xorl %r9d, %r9d; \ + xorl %r10d, %r10d; \ + xorl %r11d, %r11d; - /* - * XXX restore host debug registers - */ - movl %esi,%eax - ret -END(vmx_return) -/* - * void vmx_longjmp(void) - * %rsp points to the struct vmxctx - */ -ENTRY(vmx_longjmp) - /* - * Save guest state that is not automatically saved in the vmcs. 
- */ - movq %rdi,VMXCTX_GUEST_RDI(%rsp) - movq %rsi,VMXCTX_GUEST_RSI(%rsp) - movq %rdx,VMXCTX_GUEST_RDX(%rsp) - movq %rcx,VMXCTX_GUEST_RCX(%rsp) - movq %r8,VMXCTX_GUEST_R8(%rsp) - movq %r9,VMXCTX_GUEST_R9(%rsp) - movq %rax,VMXCTX_GUEST_RAX(%rsp) - movq %rbx,VMXCTX_GUEST_RBX(%rsp) - movq %rbp,VMXCTX_GUEST_RBP(%rsp) - movq %r10,VMXCTX_GUEST_R10(%rsp) - movq %r11,VMXCTX_GUEST_R11(%rsp) - movq %r12,VMXCTX_GUEST_R12(%rsp) - movq %r13,VMXCTX_GUEST_R13(%rsp) - movq %r14,VMXCTX_GUEST_R14(%rsp) - movq %r15,VMXCTX_GUEST_R15(%rsp) - - movq %cr2,%rdi - movq %rdi,VMXCTX_GUEST_CR2(%rsp) - - movq %rsp,%rdi - movq $VMX_RETURN_LONGJMP,%rsi - - addq $VMXCTX_TMPSTKTOP,%rsp - callq vmx_return -END(vmx_longjmp) +/* Stack layout (offset from %rsp) for vmx_enter_guest */ +#define VMXSTK_TMPRDI 0x00 /* temp store %rdi on vmexit */ +#define VMXSTK_R15 0x08 /* callee saved %r15 */ +#define VMXSTK_R14 0x10 /* callee saved %r14 */ +#define VMXSTK_R13 0x18 /* callee saved %r13 */ +#define VMXSTK_R12 0x20 /* callee saved %r12 */ +#define VMXSTK_RBX 0x28 /* callee saved %rbx */ +#define VMXSTK_RDX 0x30 /* save-args %rdx (int launched) */ +#define VMXSTK_RSI 0x38 /* save-args %rsi (struct vmx *vmx) */ +#define VMXSTK_RDI 0x40 /* save-args %rdi (struct vmxctx *ctx) */ +#define VMXSTK_FP 0x48 /* frame pointer %rbp */ +#define VMXSTKSIZE VMXSTK_FP /* - * void vmx_resume(struct vmxctx *ctxp) - * %rdi = ctxp - * - * Although the return type is a 'void' this function may return indirectly - * through vmx_setjmp() with a return value of 2. + * vmx_enter_guest(struct vmxctx *vmxctx, int launched) + * Interrupts must be disabled on entry. */ -ENTRY(vmx_resume) - VMX_DISABLE_INTERRUPTS +ENTRY_NP(vmx_enter_guest) + pushq %rbp + movq %rsp, %rbp + subq $VMXSTKSIZE, %rsp + movq %r15, VMXSTK_R15(%rsp) + movq %r14, VMXSTK_R14(%rsp) + movq %r13, VMXSTK_R13(%rsp) + movq %r12, VMXSTK_R12(%rsp) + movq %rbx, VMXSTK_RBX(%rsp) + movq %rdx, VMXSTK_RDX(%rsp) + movq %rsi, VMXSTK_RSI(%rsp) + movq %rdi, VMXSTK_RDI(%rsp) + + movq %rdi, %r12 /* vmxctx */ + movq %rsi, %r13 /* vmx */ + movl %edx, %r14d /* launch state */ + movq VMXCTX_PMAP(%rdi), %rbx - VMX_CHECK_AST + /* Activate guest pmap on this cpu. */ + leaq PM_ACTIVE(%rbx), %rdi + movl %gs:CPU_ID, %esi + call cpuset_atomic_add + movq %r12, %rdi /* - * Restore guest state that is not automatically loaded from the vmcs. + * If 'vmx->eptgen[curcpu]' is not identical to 'pmap->pm_eptgen' + * then we must invalidate all mappings associated with this EPTP. */ - VMX_GUEST_RESTORE + movq PM_EPTGEN(%rbx), %r10 + movl %gs:CPU_ID, %eax + cmpq %r10, VMX_EPTGEN(%r13, %rax, 8) + je guest_restore + + /* Refresh 'vmx->eptgen[curcpu]' */ + movq %r10, VMX_EPTGEN(%r13, %rax, 8) + + /* Setup the invept descriptor on the host stack */ + pushq $0x0 + pushq VMX_EPTP(%r13) + movl $0x1, %eax /* Single context invalidate */ + invept (%rsp), %rax + leaq 0x10(%rsp), %rsp + jbe invept_error /* Check invept instruction error */ +guest_restore: + /* Write the current %rsp into the VMCS to be restored on vmexit */ + movl $VMCS_HOST_RSP, %eax + vmwrite %rsp, %rax + jbe vmwrite_error + + /* Check if vmresume is adequate or a full vmlaunch is required */ + cmpl $0, %r14d + je do_launch + + VMX_GUEST_RESTORE vmresume + /* + * In the common case, 'vmresume' returns back to the host through + * 'vmx_exit_guest'. If there is an error we return VMX_VMRESUME_ERROR + * to the caller. 
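
A C-level rendering of the eptgen check done in assembly above, with field names assumed to match the headers in this patch: a stale per-cpu generation count means this CPU may still cache translations tagged with the EPTP, so a single-context INVEPT is issued before entry.

    if (vmx->eptgen[curcpu] != pmap->pm_eptgen) {
            struct invept_desc desc = { 0 };
            desc.eptp = vmx->eptp;                  /* field name assumed */
            vmx->eptgen[curcpu] = pmap->pm_eptgen;
            invept(1UL, desc);                      /* type 1: single context */
    }
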
+ */ + leaq VMXSTK_FP(%rsp), %rbp + movq VMXSTK_RDI(%rsp), %rdi + movl $VMX_VMRESUME_ERROR, %eax + jmp decode_inst_error +do_launch: + VMX_GUEST_RESTORE + vmlaunch /* - * Capture the reason why vmresume failed. + * In the common case, 'vmlaunch' returns back to the host through + * 'vmx_exit_guest'. If there is an error we return VMX_VMLAUNCH_ERROR + * to the caller. */ - VM_INSTRUCTION_ERROR(%eax) + leaq VMXSTK_FP(%rsp), %rbp + movq VMXSTK_RDI(%rsp), %rdi + movl $VMX_VMLAUNCH_ERROR, %eax + jmp decode_inst_error + +vmwrite_error: + movl $VMX_VMWRITE_ERROR, %eax + jmp decode_inst_error +invept_error: + movl $VMX_INVEPT_ERROR, %eax + jmp decode_inst_error +decode_inst_error: + movl $VM_FAIL_VALID, %r11d + jz inst_error + movl $VM_FAIL_INVALID, %r11d +inst_error: + movl %r11d, VMXCTX_INST_FAIL_STATUS(%rdi) - /* Return via vmx_setjmp with return value of VMX_RETURN_VMRESUME */ - movq %rsp,%rdi - movq $VMX_RETURN_VMRESUME,%rsi + movq VMXCTX_PMAP(%rdi), %rdi + leaq PM_ACTIVE(%rdi), %rdi + movl %gs:CPU_ID, %esi + movq %rax, %r12 + call cpuset_atomic_del + movq %r12, %rax - addq $VMXCTX_TMPSTKTOP,%rsp - callq vmx_return -END(vmx_resume) + movq VMXSTK_RBX(%rsp), %rbx + movq VMXSTK_R12(%rsp), %r12 + movq VMXSTK_R13(%rsp), %r13 + movq VMXSTK_R14(%rsp), %r14 + movq VMXSTK_R15(%rsp), %r15 + + VMX_GUEST_FLUSH_SCRATCH + + addq $VMXSTKSIZE, %rsp + popq %rbp + ret /* - * void vmx_launch(struct vmxctx *ctxp) - * %rdi = ctxp - * - * Although the return type is a 'void' this function may return indirectly - * through vmx_setjmp() with a return value of 3. + * Non-error VM-exit from the guest. Make this a label so it can + * be used by C code when setting up the VMCS. + * The VMCS-restored %rsp points to the struct vmxctx */ -ENTRY(vmx_launch) - VMX_DISABLE_INTERRUPTS +.align ASM_ENTRY_ALIGN; +ALTENTRY(vmx_exit_guest) + /* Save guest state that is not automatically saved in the vmcs. */ + VMX_GUEST_SAVE - VMX_CHECK_AST + /* Deactivate guest pmap on this cpu. */ + movq VMXCTX_PMAP(%rdi), %rdi + leaq PM_ACTIVE(%rdi), %rdi + movl %gs:CPU_ID, %esi + call cpuset_atomic_del /* - * Restore guest state that is not automatically loaded from the vmcs. + * This will return to the caller of 'vmx_enter_guest()' with a return + * value of VMX_GUEST_VMEXIT. */ - VMX_GUEST_RESTORE + movl $VMX_GUEST_VMEXIT, %eax + movq VMXSTK_RBX(%rsp), %rbx + movq VMXSTK_R12(%rsp), %r12 + movq VMXSTK_R13(%rsp), %r13 + movq VMXSTK_R14(%rsp), %r14 + movq VMXSTK_R15(%rsp), %r15 - vmlaunch + VMX_GUEST_FLUSH_SCRATCH + + addq $VMXSTKSIZE, %rsp + popq %rbp + ret +SET_SIZE(vmx_enter_guest) + + + +.align ASM_ENTRY_ALIGN; +ALTENTRY(vmx_exit_guest_flush_rsb) + /* Save guest state that is not automatically saved in the vmcs. */ + VMX_GUEST_SAVE + + /* Deactivate guest pmap on this cpu. */ + movq VMXCTX_PMAP(%rdi), %rdi + leaq PM_ACTIVE(%rdi), %rdi + movl %gs:CPU_ID, %esi + call cpuset_atomic_del + + VMX_GUEST_FLUSH_SCRATCH /* - * Capture the reason why vmlaunch failed. + * To prevent malicious branch target predictions from affecting the + * host, overwrite all entries in the RSB upon exiting a guest. */ - VM_INSTRUCTION_ERROR(%eax) + movl $16, %ecx /* 16 iterations, two calls per loop */ + movq %rsp, %rax +loop: + call 2f /* create an RSB entry. */ +1: + pause + call 1b /* capture rogue speculation. */ +2: + call 2f /* create an RSB entry. */ +1: + pause + call 1b /* capture rogue speculation. 
*/ +2: + subl $1, %ecx + jnz loop + movq %rax, %rsp - /* Return via vmx_setjmp with return value of VMX_RETURN_VMLAUNCH */ - movq %rsp,%rdi - movq $VMX_RETURN_VMLAUNCH,%rsi + /* + * This will return to the caller of 'vmx_enter_guest()' with a return + * value of VMX_GUEST_VMEXIT. + */ + movl $VMX_GUEST_VMEXIT, %eax + movq VMXSTK_RBX(%rsp), %rbx + movq VMXSTK_R12(%rsp), %r12 + movq VMXSTK_R13(%rsp), %r13 + movq VMXSTK_R14(%rsp), %r14 + movq VMXSTK_R15(%rsp), %r15 + + addq $VMXSTKSIZE, %rsp + popq %rbp + ret +SET_SIZE(vmx_exit_guest_flush_rsb) + +/* + * %rdi = trapno + * + * We need to do enough to convince cmnint - and its iretting tail - that we're + * a legit interrupt stack frame. + */ +ENTRY_NP(vmx_call_isr) + pushq %rbp + movq %rsp, %rbp + movq %rsp, %r11 + andq $~0xf, %rsp /* align stack */ + pushq $KDS_SEL /* %ss */ + pushq %r11 /* %rsp */ + pushfq /* %rflags */ + pushq $KCS_SEL /* %cs */ + leaq .iret_dest(%rip), %rcx + pushq %rcx /* %rip */ + pushq $0 /* err */ + pushq %rdi /* trapno */ + cli + jmp cmnint /* %rip (and call) */ +.iret_dest: + popq %rbp + ret +SET_SIZE(vmx_call_isr) - addq $VMXCTX_TMPSTKTOP,%rsp - callq vmx_return -END(vmx_launch) +#endif /* lint */ diff --git a/usr/src/uts/i86pc/io/vmm/intel/vtd.c b/usr/src/uts/i86pc/io/vmm/intel/vtd.c new file mode 100644 index 0000000000..9474b30fc6 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/vtd.c @@ -0,0 +1,690 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include + +#include "io/iommu.h" + +/* + * Documented in the "Intel Virtualization Technology for Directed I/O", + * Architecture Spec, September 2008. 
+ */ + +/* Section 10.4 "Register Descriptions" */ +struct vtdmap { + volatile uint32_t version; + volatile uint32_t res0; + volatile uint64_t cap; + volatile uint64_t ext_cap; + volatile uint32_t gcr; + volatile uint32_t gsr; + volatile uint64_t rta; + volatile uint64_t ccr; +}; + +#define VTD_CAP_SAGAW(cap) (((cap) >> 8) & 0x1F) +#define VTD_CAP_ND(cap) ((cap) & 0x7) +#define VTD_CAP_CM(cap) (((cap) >> 7) & 0x1) +#define VTD_CAP_SPS(cap) (((cap) >> 34) & 0xF) +#define VTD_CAP_RWBF(cap) (((cap) >> 4) & 0x1) + +#define VTD_ECAP_DI(ecap) (((ecap) >> 2) & 0x1) +#define VTD_ECAP_COHERENCY(ecap) ((ecap) & 0x1) +#define VTD_ECAP_IRO(ecap) (((ecap) >> 8) & 0x3FF) + +#define VTD_GCR_WBF (1 << 27) +#define VTD_GCR_SRTP (1 << 30) +#define VTD_GCR_TE (1U << 31) + +#define VTD_GSR_WBFS (1 << 27) +#define VTD_GSR_RTPS (1 << 30) +#define VTD_GSR_TES (1U << 31) + +#define VTD_CCR_ICC (1UL << 63) /* invalidate context cache */ +#define VTD_CCR_CIRG_GLOBAL (1UL << 61) /* global invalidation */ + +#define VTD_IIR_IVT (1UL << 63) /* invalidation IOTLB */ +#define VTD_IIR_IIRG_GLOBAL (1ULL << 60) /* global IOTLB invalidation */ +#define VTD_IIR_IIRG_DOMAIN (2ULL << 60) /* domain IOTLB invalidation */ +#define VTD_IIR_IIRG_PAGE (3ULL << 60) /* page IOTLB invalidation */ +#define VTD_IIR_DRAIN_READS (1ULL << 49) /* drain pending DMA reads */ +#define VTD_IIR_DRAIN_WRITES (1ULL << 48) /* drain pending DMA writes */ +#define VTD_IIR_DOMAIN_P 32 + +#define VTD_ROOT_PRESENT 0x1 +#define VTD_CTX_PRESENT 0x1 +#define VTD_CTX_TT_ALL (1UL << 2) + +#define VTD_PTE_RD (1UL << 0) +#define VTD_PTE_WR (1UL << 1) +#define VTD_PTE_SUPERPAGE (1UL << 7) +#define VTD_PTE_ADDR_M (0x000FFFFFFFFFF000UL) + +#define VTD_RID2IDX(rid) (((rid) & 0xff) * 2) + +struct domain { + uint64_t *ptp; /* first level page table page */ + int pt_levels; /* number of page table levels */ + int addrwidth; /* 'AW' field in context entry */ + int spsmask; /* supported super page sizes */ + u_int id; /* domain id */ + vm_paddr_t maxaddr; /* highest address to be mapped */ + SLIST_ENTRY(domain) next; +}; + +static SLIST_HEAD(, domain) domhead; + +#define DRHD_MAX_UNITS 8 +static int drhd_num; +static struct vtdmap *vtdmaps[DRHD_MAX_UNITS]; +static int max_domains; +typedef int (*drhd_ident_func_t)(void); + +static uint64_t root_table[PAGE_SIZE / sizeof(uint64_t)] __aligned(4096); +static uint64_t ctx_tables[256][PAGE_SIZE / sizeof(uint64_t)] __aligned(4096); + +static MALLOC_DEFINE(M_VTD, "vtd", "vtd"); + +static int +vtd_max_domains(struct vtdmap *vtdmap) +{ + int nd; + + nd = VTD_CAP_ND(vtdmap->cap); + + switch (nd) { + case 0: + return (16); + case 1: + return (64); + case 2: + return (256); + case 3: + return (1024); + case 4: + return (4 * 1024); + case 5: + return (16 * 1024); + case 6: + return (64 * 1024); + default: + panic("vtd_max_domains: invalid value of nd (0x%0x)", nd); + } +} + +static u_int +domain_id(void) +{ + u_int id; + struct domain *dom; + + /* Skip domain id 0 - it is reserved when Caching Mode field is set */ + for (id = 1; id < max_domains; id++) { + SLIST_FOREACH(dom, &domhead, next) { + if (dom->id == id) + break; + } + if (dom == NULL) + break; /* found it */ + } + + if (id >= max_domains) + panic("domain ids exhausted"); + + return (id); +} + +static void +vtd_wbflush(struct vtdmap *vtdmap) +{ + + if (VTD_ECAP_COHERENCY(vtdmap->ext_cap) == 0) + pmap_invalidate_cache(); + + if (VTD_CAP_RWBF(vtdmap->cap)) { + vtdmap->gcr = VTD_GCR_WBF; + while ((vtdmap->gsr & VTD_GSR_WBFS) != 0) + ; + } +} + +static void 
+vtd_ctx_global_invalidate(struct vtdmap *vtdmap) +{ + + vtdmap->ccr = VTD_CCR_ICC | VTD_CCR_CIRG_GLOBAL; + while ((vtdmap->ccr & VTD_CCR_ICC) != 0) + ; +} + +static void +vtd_iotlb_global_invalidate(struct vtdmap *vtdmap) +{ + int offset; + volatile uint64_t *iotlb_reg, val; + + vtd_wbflush(vtdmap); + + offset = VTD_ECAP_IRO(vtdmap->ext_cap) * 16; + iotlb_reg = (volatile uint64_t *)((caddr_t)vtdmap + offset + 8); + + *iotlb_reg = VTD_IIR_IVT | VTD_IIR_IIRG_GLOBAL | + VTD_IIR_DRAIN_READS | VTD_IIR_DRAIN_WRITES; + + while (1) { + val = *iotlb_reg; + if ((val & VTD_IIR_IVT) == 0) + break; + } +} + +static void +vtd_translation_enable(struct vtdmap *vtdmap) +{ + + vtdmap->gcr = VTD_GCR_TE; + while ((vtdmap->gsr & VTD_GSR_TES) == 0) + ; +} + +static void +vtd_translation_disable(struct vtdmap *vtdmap) +{ + + vtdmap->gcr = 0; + while ((vtdmap->gsr & VTD_GSR_TES) != 0) + ; +} + +static int +vtd_init(void) +{ + int i, units, remaining; + struct vtdmap *vtdmap; + vm_paddr_t ctx_paddr; + char *end, envname[32]; + unsigned long mapaddr; + ACPI_STATUS status; + ACPI_TABLE_DMAR *dmar; + ACPI_DMAR_HEADER *hdr; + ACPI_DMAR_HARDWARE_UNIT *drhd; + + /* + * Allow the user to override the ACPI DMAR table by specifying the + * physical address of each remapping unit. + * + * The following example specifies two remapping units at + * physical addresses 0xfed90000 and 0xfeda0000 respectively. + * set vtd.regmap.0.addr=0xfed90000 + * set vtd.regmap.1.addr=0xfeda0000 + */ + for (units = 0; units < DRHD_MAX_UNITS; units++) { + snprintf(envname, sizeof(envname), "vtd.regmap.%d.addr", units); + if (getenv_ulong(envname, &mapaddr) == 0) + break; + vtdmaps[units] = (struct vtdmap *)PHYS_TO_DMAP(mapaddr); + } + + if (units > 0) + goto skip_dmar; + + /* Search for DMAR table. */ + status = AcpiGetTable(ACPI_SIG_DMAR, 0, (ACPI_TABLE_HEADER **)&dmar); + if (ACPI_FAILURE(status)) + return (ENXIO); + + end = (char *)dmar + dmar->Header.Length; + remaining = dmar->Header.Length - sizeof(ACPI_TABLE_DMAR); + while (remaining > sizeof(ACPI_DMAR_HEADER)) { + hdr = (ACPI_DMAR_HEADER *)(end - remaining); + if (hdr->Length > remaining) + break; + /* + * From Intel VT-d arch spec, version 1.3: + * BIOS implementations must report mapping structures + * in numerical order, i.e. All remapping structures of + * type 0 (DRHD) enumerated before remapping structures of + * type 1 (RMRR) and so forth. 
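+		 *
+		 * That ordering is what makes the early 'break' below safe:
+		 * once the walk encounters a non-DRHD entry, no further DRHD
+		 * entries can follow. Schematically, the table being walked
+		 * is laid out as:
+		 *
+		 *	[DMAR header][DRHD][DRHD]...[RMRR]...[ATSR]...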
+		 */
+		if (hdr->Type != ACPI_DMAR_TYPE_HARDWARE_UNIT)
+			break;
+
+		drhd = (ACPI_DMAR_HARDWARE_UNIT *)hdr;
+		vtdmaps[units++] = (struct vtdmap *)PHYS_TO_DMAP(drhd->Address);
+		if (units >= DRHD_MAX_UNITS)
+			break;
+		remaining -= hdr->Length;
+	}
+
+	if (units <= 0)
+		return (ENXIO);
+
+skip_dmar:
+	drhd_num = units;
+	vtdmap = vtdmaps[0];
+
+	if (VTD_CAP_CM(vtdmap->cap) != 0)
+		panic("vtd_init: invalid caching mode");
+
+	max_domains = vtd_max_domains(vtdmap);
+
+	/*
+	 * Set up the root-table to point to the context-entry tables
+	 */
+	for (i = 0; i < 256; i++) {
+		ctx_paddr = vtophys(ctx_tables[i]);
+		if (ctx_paddr & PAGE_MASK)
+			panic("ctx table (0x%0lx) not page aligned", ctx_paddr);
+
+		root_table[i * 2] = ctx_paddr | VTD_ROOT_PRESENT;
+	}
+
+	return (0);
+}
+
+static void
+vtd_cleanup(void)
+{
+}
+
+static void
+vtd_enable(void)
+{
+	int i;
+	struct vtdmap *vtdmap;
+
+	for (i = 0; i < drhd_num; i++) {
+		vtdmap = vtdmaps[i];
+		vtd_wbflush(vtdmap);
+
+		/* Update the root table address */
+		vtdmap->rta = vtophys(root_table);
+		vtdmap->gcr = VTD_GCR_SRTP;
+		while ((vtdmap->gsr & VTD_GSR_RTPS) == 0)
+			;
+
+		vtd_ctx_global_invalidate(vtdmap);
+		vtd_iotlb_global_invalidate(vtdmap);
+
+		vtd_translation_enable(vtdmap);
+	}
+}
+
+static void
+vtd_disable(void)
+{
+	int i;
+	struct vtdmap *vtdmap;
+
+	for (i = 0; i < drhd_num; i++) {
+		vtdmap = vtdmaps[i];
+		vtd_translation_disable(vtdmap);
+	}
+}
+
+static void
+vtd_add_device(void *arg, uint16_t rid)
+{
+	int idx;
+	uint64_t *ctxp;
+	struct domain *dom = arg;
+	vm_paddr_t pt_paddr;
+	struct vtdmap *vtdmap;
+	uint8_t bus;
+
+	vtdmap = vtdmaps[0];
+	bus = PCI_RID2BUS(rid);
+	ctxp = ctx_tables[bus];
+	pt_paddr = vtophys(dom->ptp);
+	idx = VTD_RID2IDX(rid);
+
+	if (ctxp[idx] & VTD_CTX_PRESENT) {
+		panic("vtd_add_device: device %x is already owned by "
+		    "domain %d", rid,
+		    (uint16_t)(ctxp[idx + 1] >> 8));
+	}
+
+	/*
+	 * Order is important. The 'present' bit is set only after all fields
+	 * of the context pointer are initialized.
+	 */
+	ctxp[idx + 1] = dom->addrwidth | (dom->id << 8);
+
+	if (VTD_ECAP_DI(vtdmap->ext_cap))
+		ctxp[idx] = VTD_CTX_TT_ALL;
+	else
+		ctxp[idx] = 0;
+
+	ctxp[idx] |= pt_paddr | VTD_CTX_PRESENT;
+
+	/*
+	 * 'Not Present' entries are not cached in either the Context Cache
+	 * or in the IOTLB, so there is no need to invalidate either of them.
+	 */
+}
+
+static void
+vtd_remove_device(void *arg, uint16_t rid)
+{
+	int i, idx;
+	uint64_t *ctxp;
+	struct vtdmap *vtdmap;
+	uint8_t bus;
+
+	bus = PCI_RID2BUS(rid);
+	ctxp = ctx_tables[bus];
+	idx = VTD_RID2IDX(rid);
+
+	/*
+	 * Order is important. The 'present' bit must be cleared first.
+	 */
+	ctxp[idx] = 0;
+	ctxp[idx + 1] = 0;
+
+	/*
+	 * Invalidate the Context Cache and the IOTLB.
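+	 *
+	 * For reference, the 128-bit context entry cleared above is laid
+	 * out (low qword first) roughly as:
+	 *
+	 *	ctxp[idx]     = pt_paddr | translation-type | present
+	 *	ctxp[idx + 1] = addrwidth | (domain-id << 8)
+	 *
+	 * mirroring what vtd_add_device() assembled.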
+ * + * XXX use device-selective invalidation for Context Cache + * XXX use domain-selective invalidation for IOTLB + */ + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + vtd_ctx_global_invalidate(vtdmap); + vtd_iotlb_global_invalidate(vtdmap); + } +} + +#define CREATE_MAPPING 0 +#define REMOVE_MAPPING 1 + +static uint64_t +vtd_update_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len, + int remove) +{ + struct domain *dom; + int i, spshift, ptpshift, ptpindex, nlevels; + uint64_t spsize, *ptp; + + dom = arg; + ptpindex = 0; + ptpshift = 0; + + KASSERT(gpa + len > gpa, ("%s: invalid gpa range %#lx/%#lx", __func__, + gpa, len)); + KASSERT(gpa + len <= dom->maxaddr, ("%s: gpa range %#lx/%#lx beyond " + "domain maxaddr %#lx", __func__, gpa, len, dom->maxaddr)); + + if (gpa & PAGE_MASK) + panic("vtd_create_mapping: unaligned gpa 0x%0lx", gpa); + + if (hpa & PAGE_MASK) + panic("vtd_create_mapping: unaligned hpa 0x%0lx", hpa); + + if (len & PAGE_MASK) + panic("vtd_create_mapping: unaligned len 0x%0lx", len); + + /* + * Compute the size of the mapping that we can accommodate. + * + * This is based on three factors: + * - supported super page size + * - alignment of the region starting at 'gpa' and 'hpa' + * - length of the region 'len' + */ + spshift = 48; + for (i = 3; i >= 0; i--) { + spsize = 1UL << spshift; + if ((dom->spsmask & (1 << i)) != 0 && + (gpa & (spsize - 1)) == 0 && + (hpa & (spsize - 1)) == 0 && + (len >= spsize)) { + break; + } + spshift -= 9; + } + + ptp = dom->ptp; + nlevels = dom->pt_levels; + while (--nlevels >= 0) { + ptpshift = 12 + nlevels * 9; + ptpindex = (gpa >> ptpshift) & 0x1FF; + + /* We have reached the leaf mapping */ + if (spshift >= ptpshift) { + break; + } + + /* + * We are working on a non-leaf page table page. + * + * Create a downstream page table page if necessary and point + * to it from the current page table. + */ + if (ptp[ptpindex] == 0) { + void *nlp = malloc(PAGE_SIZE, M_VTD, M_WAITOK | M_ZERO); + ptp[ptpindex] = vtophys(nlp)| VTD_PTE_RD | VTD_PTE_WR; + } + + ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & VTD_PTE_ADDR_M); + } + + if ((gpa & ((1UL << ptpshift) - 1)) != 0) + panic("gpa 0x%lx and ptpshift %d mismatch", gpa, ptpshift); + + /* + * Update the 'gpa' -> 'hpa' mapping + */ + if (remove) { + ptp[ptpindex] = 0; + } else { + ptp[ptpindex] = hpa | VTD_PTE_RD | VTD_PTE_WR; + + if (nlevels > 0) + ptp[ptpindex] |= VTD_PTE_SUPERPAGE; + } + + return (1UL << ptpshift); +} + +static uint64_t +vtd_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len) +{ + + return (vtd_update_mapping(arg, gpa, hpa, len, CREATE_MAPPING)); +} + +static uint64_t +vtd_remove_mapping(void *arg, vm_paddr_t gpa, uint64_t len) +{ + + return (vtd_update_mapping(arg, gpa, 0, len, REMOVE_MAPPING)); +} + +static void +vtd_invalidate_tlb(void *dom) +{ + int i; + struct vtdmap *vtdmap; + + /* + * Invalidate the IOTLB. + * XXX use domain-selective invalidation for IOTLB + */ + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + vtd_iotlb_global_invalidate(vtdmap); + } +} + +static void * +vtd_create_domain(vm_paddr_t maxaddr) +{ + struct domain *dom; + vm_paddr_t addr; + int tmp, i, gaw, agaw, sagaw, res, pt_levels, addrwidth; + struct vtdmap *vtdmap; + + if (drhd_num <= 0) + panic("vtd_create_domain: no dma remapping hardware available"); + + vtdmap = vtdmaps[0]; + + /* + * Calculate AGAW. + * Section 3.4.2 "Adjusted Guest Address Width", Architecture Spec. 
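+	 *
+	 * Worked example (illustrative): for maxaddr = 4 GiB the loop
+	 * below computes gaw = 33; rounding (gaw - 12) up to a multiple
+	 * of 9 then yields agaw = 39, which corresponds to a 3-level
+	 * page table (12 + 3 * 9 = 39 bits).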
+ */ + addr = 0; + for (gaw = 0; addr < maxaddr; gaw++) + addr = 1ULL << gaw; + + res = (gaw - 12) % 9; + if (res == 0) + agaw = gaw; + else + agaw = gaw + 9 - res; + + if (agaw > 64) + agaw = 64; + + /* + * Select the smallest Supported AGAW and the corresponding number + * of page table levels. + */ + pt_levels = 2; + sagaw = 30; + addrwidth = 0; + tmp = VTD_CAP_SAGAW(vtdmap->cap); + for (i = 0; i < 5; i++) { + if ((tmp & (1 << i)) != 0 && sagaw >= agaw) + break; + pt_levels++; + addrwidth++; + sagaw += 9; + if (sagaw > 64) + sagaw = 64; + } + + if (i >= 5) { + panic("vtd_create_domain: SAGAW 0x%lx does not support AGAW %d", + VTD_CAP_SAGAW(vtdmap->cap), agaw); + } + + dom = malloc(sizeof(struct domain), M_VTD, M_ZERO | M_WAITOK); + dom->pt_levels = pt_levels; + dom->addrwidth = addrwidth; + dom->id = domain_id(); + dom->maxaddr = maxaddr; + dom->ptp = malloc(PAGE_SIZE, M_VTD, M_ZERO | M_WAITOK); + if ((uintptr_t)dom->ptp & PAGE_MASK) + panic("vtd_create_domain: ptp (%p) not page aligned", dom->ptp); + +#ifdef notyet + /* + * XXX superpage mappings for the iommu do not work correctly. + * + * By default all physical memory is mapped into the host_domain. + * When a VM is allocated wired memory the pages belonging to it + * are removed from the host_domain and added to the vm's domain. + * + * If the page being removed was mapped using a superpage mapping + * in the host_domain then we need to demote the mapping before + * removing the page. + * + * There is not any code to deal with the demotion at the moment + * so we disable superpage mappings altogether. + */ + dom->spsmask = VTD_CAP_SPS(vtdmap->cap); +#endif + + SLIST_INSERT_HEAD(&domhead, dom, next); + + return (dom); +} + +static void +vtd_free_ptp(uint64_t *ptp, int level) +{ + int i; + uint64_t *nlp; + + if (level > 1) { + for (i = 0; i < 512; i++) { + if ((ptp[i] & (VTD_PTE_RD | VTD_PTE_WR)) == 0) + continue; + if ((ptp[i] & VTD_PTE_SUPERPAGE) != 0) + continue; + nlp = (uint64_t *)PHYS_TO_DMAP(ptp[i] & VTD_PTE_ADDR_M); + vtd_free_ptp(nlp, level - 1); + } + } + + bzero(ptp, PAGE_SIZE); + free(ptp, M_VTD); +} + +static void +vtd_destroy_domain(void *arg) +{ + struct domain *dom; + + dom = arg; + + SLIST_REMOVE(&domhead, dom, domain, next); + vtd_free_ptp(dom->ptp, dom->pt_levels); + free(dom, M_VTD); +} + +struct iommu_ops iommu_ops_intel = { + vtd_init, + vtd_cleanup, + vtd_enable, + vtd_disable, + vtd_create_domain, + vtd_destroy_domain, + vtd_create_mapping, + vtd_remove_mapping, + vtd_add_device, + vtd_remove_device, + vtd_invalidate_tlb, +}; diff --git a/usr/src/uts/i86pc/io/vmm/io/iommu.h b/usr/src/uts/i86pc/io/vmm/io/iommu.h new file mode 100644 index 0000000000..f8003a5d45 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/iommu.h @@ -0,0 +1,76 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IO_IOMMU_H_ +#define _IO_IOMMU_H_ + +typedef int (*iommu_init_func_t)(void); +typedef void (*iommu_cleanup_func_t)(void); +typedef void (*iommu_enable_func_t)(void); +typedef void (*iommu_disable_func_t)(void); +typedef void *(*iommu_create_domain_t)(vm_paddr_t maxaddr); +typedef void (*iommu_destroy_domain_t)(void *domain); +typedef uint64_t (*iommu_create_mapping_t)(void *domain, vm_paddr_t gpa, + vm_paddr_t hpa, uint64_t len); +typedef uint64_t (*iommu_remove_mapping_t)(void *domain, vm_paddr_t gpa, + uint64_t len); +typedef void (*iommu_add_device_t)(void *domain, uint16_t rid); +typedef void (*iommu_remove_device_t)(void *dom, uint16_t rid); +typedef void (*iommu_invalidate_tlb_t)(void *dom); + +struct iommu_ops { + iommu_init_func_t init; /* module wide */ + iommu_cleanup_func_t cleanup; + iommu_enable_func_t enable; + iommu_disable_func_t disable; + + iommu_create_domain_t create_domain; /* domain-specific */ + iommu_destroy_domain_t destroy_domain; + iommu_create_mapping_t create_mapping; + iommu_remove_mapping_t remove_mapping; + iommu_add_device_t add_device; + iommu_remove_device_t remove_device; + iommu_invalidate_tlb_t invalidate_tlb; +}; + +extern struct iommu_ops iommu_ops_intel; +extern struct iommu_ops iommu_ops_amd; + +void iommu_cleanup(void); +void *iommu_host_domain(void); +void *iommu_create_domain(vm_paddr_t maxaddr); +void iommu_destroy_domain(void *dom); +void iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, + size_t len); +void iommu_remove_mapping(void *dom, vm_paddr_t gpa, size_t len); +void iommu_add_device(void *dom, uint16_t rid); +void iommu_remove_device(void *dom, uint16_t rid); +void iommu_invalidate_tlb(void *domain); +#endif diff --git a/usr/src/uts/i86pc/io/vmm/io/ppt.h b/usr/src/uts/i86pc/io/vmm/io/ppt.h new file mode 100644 index 0000000000..686b15db49 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/ppt.h @@ -0,0 +1,56 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IO_PPT_H_ +#define _IO_PPT_H_ + +int ppt_unassign_all(struct vm *vm); +int ppt_map_mmio(struct vm *vm, int bus, int slot, int func, + vm_paddr_t gpa, size_t len, vm_paddr_t hpa); +int ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func, + uint64_t addr, uint64_t msg, int numvec); +int ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func, + int idx, uint64_t addr, uint64_t msg, uint32_t vector_control); +int ppt_assigned_devices(struct vm *vm); +boolean_t ppt_is_mmio(struct vm *vm, vm_paddr_t gpa); + +/* + * Returns the number of devices sequestered by the ppt driver for assignment + * to virtual machines. + */ +int ppt_avail_devices(void); + +/* + * The following functions should never be called directly. + * Use 'vm_assign_pptdev()' and 'vm_unassign_pptdev()' instead. + */ +int ppt_assign_device(struct vm *vm, int bus, int slot, int func); +int ppt_unassign_device(struct vm *vm, int bus, int slot, int func); +#endif diff --git a/usr/src/uts/i86pc/io/vmm/io/sol_iommu.c b/usr/src/uts/i86pc/io/vmm/io/sol_iommu.c new file mode 100644 index 0000000000..989e88e17b --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/sol_iommu.c @@ -0,0 +1,86 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#include +#include +#include + +/* + * IOMMU Stub + * + * Until proper iommu support can be wired into bhyve, stub out all the + * functions to either fail, if reasonable, or panic. 
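+ *
+ * The split is intentional: creation and lookup paths return NULL so
+ * that callers can degrade gracefully, e.g. (illustrative caller only):
+ *
+ *	void *dom = iommu_create_domain(maxaddr);
+ *	if (dom == NULL)
+ *		return (ENXIO);
+ *
+ * whereas the teardown and mapping paths panic, since reaching them
+ * without ever having obtained a domain indicates a caller bug.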
+ */ + +void +iommu_cleanup(void) +{ +} + +void * +iommu_host_domain(void) +{ + return (NULL); +} + +/*ARGSUSED*/ +void * +iommu_create_domain(vm_paddr_t maxaddr) +{ + return (NULL); +} + +/*ARGSUSED*/ +void +iommu_destroy_domain(void *dom) +{ + panic("unimplemented"); +} + +/*ARGSUSED*/ +void +iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, size_t len) +{ + panic("unimplemented"); +} + +/*ARGSUSED*/ +void +iommu_remove_mapping(void *dom, vm_paddr_t gpa, size_t len) +{ + panic("unimplemented"); +} + +/*ARGSUSED*/ +void +iommu_add_device(void *dom, uint16_t rid) +{ + panic("unimplemented"); +} + +/*ARGSUSED*/ +void +iommu_remove_device(void *dom, uint16_t rid) +{ + panic("unimplemented"); +} + +/*ARGSUSED*/ +void +iommu_invalidate_tlb(void *domain) +{ + panic("unimplemented"); +} + diff --git a/usr/src/uts/i86pc/io/vmm/io/sol_ppt.c b/usr/src/uts/i86pc/io/vmm/io/sol_ppt.c new file mode 100644 index 0000000000..9d5b1f5cdc --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/sol_ppt.c @@ -0,0 +1,92 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#include +#include +#include + +#include + +/* + * PCI Pass-Through Stub + * + * Until proper passthrough support can be wired into bhyve, stub out all the + * functions to either fail or no-op. + */ + +int +ppt_unassign_all(struct vm *vm) +{ + return (0); +} + +/*ARGSUSED*/ +int +ppt_map_mmio(struct vm *vm, int bus, int slot, int func, vm_paddr_t gpa, + size_t len, vm_paddr_t hpa) +{ + return (ENXIO); +} + +/*ARGSUSED*/ +int +ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func, + uint64_t addr, uint64_t msg, int numvec) +{ + return (ENXIO); +} + +/*ARGSUSED*/ +int +ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func, int idx, + uint64_t addr, uint64_t msg, uint32_t vector_control) +{ + return (ENXIO); +} + +/*ARGSUSED*/ +int +ppt_assigned_devices(struct vm *vm) +{ + return (0); +} + +/*ARGSUSED*/ +boolean_t +ppt_is_mmio(struct vm *vm, vm_paddr_t gpa) +{ + return (B_FALSE); +} + +/*ARGSUSED*/ +int +ppt_avail_devices(void) +{ + return (0); +} + +/*ARGSUSED*/ +int +ppt_assign_device(struct vm *vm, int bus, int slot, int func) +{ + return (ENOENT); +} + +/*ARGSUSED*/ +int +ppt_unassign_device(struct vm *vm, int bus, int slot, int func) +{ + return (ENXIO); +} diff --git a/usr/src/uts/i86pc/io/vmm/io/vatpic.c b/usr/src/uts/i86pc/io/vmm/io/vatpic.c index a93b252c91..ba4cd7785e 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vatpic.c +++ b/usr/src/uts/i86pc/io/vmm/io/vatpic.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Tycho Nightingale * All rights reserved. 
* @@ -25,12 +27,11 @@ */ #include -__FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vatpic.c 279683 2015-03-06 02:05:45Z tychon $"); +__FBSDID("$FreeBSD$"); #include #include #include -#include #include #include #include diff --git a/usr/src/uts/i86pc/io/vmm/io/vatpic.h b/usr/src/uts/i86pc/io/vmm/io/vatpic.h index ef5e51b158..d4a1be1820 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vatpic.h +++ b/usr/src/uts/i86pc/io/vmm/io/vatpic.h @@ -23,7 +23,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/io/vatpic.h 273706 2014-10-26 19:03:06Z neel $ + * $FreeBSD$ */ #ifndef _VATPIC_H_ diff --git a/usr/src/uts/i86pc/io/vmm/io/vatpit.c b/usr/src/uts/i86pc/io/vmm/io/vatpit.c index ce17bdc92c..9b3e7376d5 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vatpit.c +++ b/usr/src/uts/i86pc/io/vmm/io/vatpit.c @@ -1,4 +1,5 @@ /*- + * Copyright (c) 2018 Joyent, Inc. * Copyright (c) 2014 Tycho Nightingale * Copyright (c) 2011 NetApp, Inc. * All rights reserved. @@ -26,12 +27,11 @@ */ #include -__FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vatpit.c 273706 2014-10-26 19:03:06Z neel $"); +__FBSDID("$FreeBSD$"); #include #include #include -#include #include #include #include @@ -79,7 +79,7 @@ struct vatpit_callout_arg { struct channel { int mode; uint16_t initial; /* initial counter value */ - sbintime_t now_sbt; /* uptime when counter was loaded */ + struct bintime now_bt; /* uptime when counter was loaded */ uint8_t cr[2]; uint8_t ol[2]; bool slatched; /* status latched */ @@ -88,7 +88,7 @@ struct channel { int olbyte; int frbyte; struct callout callout; - sbintime_t callout_sbt; /* target time */ + struct bintime callout_bt; /* target time */ struct vatpit_callout_arg callout_arg; }; @@ -96,26 +96,41 @@ struct vatpit { struct vm *vm; struct mtx mtx; - sbintime_t freq_sbt; + struct bintime freq_bt; struct channel channel[3]; }; static void pit_timer_start_cntr0(struct vatpit *vatpit); +static uint64_t +vatpit_delta_ticks(struct vatpit *vatpit, struct channel *c) +{ + struct bintime delta; + uint64_t result; + + binuptime(&delta); + bintime_sub(&delta, &c->now_bt); + + result = delta.sec * PIT_8254_FREQ; + result += delta.frac / vatpit->freq_bt.frac; + + return (result); +} + static int vatpit_get_out(struct vatpit *vatpit, int channel) { struct channel *c; - sbintime_t delta_ticks; + uint64_t delta_ticks; int out; c = &vatpit->channel[channel]; switch (c->mode) { case TIMER_INTTC: - delta_ticks = (sbinuptime() - c->now_sbt) / vatpit->freq_sbt; - out = ((c->initial - delta_ticks) <= 0); + delta_ticks = vatpit_delta_ticks(vatpit, c); + out = (delta_ticks >= c->initial); break; default: out = 0; @@ -165,24 +180,28 @@ static void pit_timer_start_cntr0(struct vatpit *vatpit) { struct channel *c; - sbintime_t now, delta, precision; c = &vatpit->channel[0]; if (c->initial != 0) { - delta = c->initial * vatpit->freq_sbt; - precision = delta >> tc_precexp; - c->callout_sbt = c->callout_sbt + delta; + sbintime_t precision; + struct bintime now, delta; + + delta.sec = 0; + delta.frac = vatpit->freq_bt.frac * c->initial; + bintime_add(&c->callout_bt, &delta); + precision = bttosbt(delta) >> tc_precexp; /* - * Reset 'callout_sbt' if the time that the callout - * was supposed to fire is more than 'c->initial' - * ticks in the past. + * Reset 'callout_bt' if the time that the callout was supposed + * to fire is more than 'c->initial' ticks in the past. 
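+		 *
+		 * (bintime refresher: FREQ2BT(PIT_8254_FREQ) leaves roughly
+		 * 2^64 / 1193182 in freq_bt.frac, i.e. the duration of one
+		 * PIT tick. Multiplying it by c->initial, as done for
+		 * 'delta' above, gives the interval until the next
+		 * interrupt; dividing an elapsed .frac by it converts time
+		 * back into ticks, as vatpit_delta_ticks() does.)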
*/ - now = sbinuptime(); - if (c->callout_sbt < now) - c->callout_sbt = now + delta; + binuptime(&now); + if (bintime_cmp(&c->callout_bt, &now, <)) { + c->callout_bt = now; + bintime_add(&c->callout_bt, &delta); + } - callout_reset_sbt(&c->callout, c->callout_sbt, + callout_reset_sbt(&c->callout, bttosbt(c->callout_bt), precision, vatpit_callout_handler, &c->callout_arg, C_ABSOLUTE); } @@ -192,7 +211,7 @@ static uint16_t pit_update_counter(struct vatpit *vatpit, struct channel *c, bool latch) { uint16_t lval; - sbintime_t delta_ticks; + uint64_t delta_ticks; /* cannot latch a new value until the old one has been consumed */ if (latch && c->olbyte != 0) @@ -208,12 +227,11 @@ pit_update_counter(struct vatpit *vatpit, struct channel *c, bool latch) * here. */ c->initial = TIMER_DIV(PIT_8254_FREQ, 100); - c->now_sbt = sbinuptime(); + binuptime(&c->now_bt); c->status &= ~TIMER_STS_NULLCNT; } - delta_ticks = (sbinuptime() - c->now_sbt) / vatpit->freq_sbt; - + delta_ticks = vatpit_delta_ticks(vatpit, c); lval = c->initial - delta_ticks % c->initial; if (latch) { @@ -384,10 +402,10 @@ vatpit_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, c->frbyte = 0; c->crbyte = 0; c->initial = c->cr[0] | (uint16_t)c->cr[1] << 8; - c->now_sbt = sbinuptime(); + binuptime(&c->now_bt); /* Start an interval timer for channel 0 */ if (port == TIMER_CNTR0) { - c->callout_sbt = c->now_sbt; + c->callout_bt = c->now_bt; pit_timer_start_cntr0(vatpit); } if (c->initial == 0) @@ -424,7 +442,6 @@ struct vatpit * vatpit_init(struct vm *vm) { struct vatpit *vatpit; - struct bintime bt; struct vatpit_callout_arg *arg; int i; @@ -433,11 +450,10 @@ vatpit_init(struct vm *vm) mtx_init(&vatpit->mtx, "vatpit lock", NULL, MTX_SPIN); - FREQ2BT(PIT_8254_FREQ, &bt); - vatpit->freq_sbt = bttosbt(bt); + FREQ2BT(PIT_8254_FREQ, &vatpit->freq_bt); for (i = 0; i < 3; i++) { - callout_init(&vatpit->channel[i].callout, true); + callout_init(&vatpit->channel[i].callout, 1); arg = &vatpit->channel[i].callout_arg; arg->vatpit = vatpit; arg->channel_num = i; @@ -456,3 +472,16 @@ vatpit_cleanup(struct vatpit *vatpit) free(vatpit, M_VATPIT); } + +#ifndef __FreeBSD__ +void +vatpit_localize_resources(struct vatpit *vatpit) +{ + for (uint_t i = 0; i < 3; i++) { + /* Only localize channels which might be running */ + if (vatpit->channel[i].mode != 0) { + vmm_glue_callout_localize(&vatpit->channel[i].callout); + } + } +} +#endif /* __FreeBSD */ diff --git a/usr/src/uts/i86pc/io/vmm/io/vatpit.h b/usr/src/uts/i86pc/io/vmm/io/vatpit.h index f20ad73e47..4bf9fe048d 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vatpit.h +++ b/usr/src/uts/i86pc/io/vmm/io/vatpit.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Tycho Nightingale * Copyright (c) 2011 NetApp, Inc. * All rights reserved. @@ -24,7 +26,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $FreeBSD: head/sys/amd64/vmm/io/vatpit.h 273706 2014-10-26 19:03:06Z neel $ + * $FreeBSD$ */ #ifndef _VATPIT_H_ @@ -42,4 +44,8 @@ int vatpit_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, int vatpit_nmisc_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *eax); +#ifndef __FreeBSD__ +void vatpit_localize_resources(struct vatpit *); +#endif + #endif /* _VATPIT_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/io/vhpet.c b/usr/src/uts/i86pc/io/vmm/io/vhpet.c index 25f6013da0..c82b4626bd 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vhpet.c +++ b/usr/src/uts/i86pc/io/vmm/io/vhpet.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Tycho Nightingale * Copyright (c) 2013 Neel Natu * All rights reserved. @@ -24,11 +26,15 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/io/vhpet.c 263035 2014-03-11 16:56:00Z tychon $ + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. */ #include -__FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vhpet.c 263035 2014-03-11 16:56:00Z tychon $"); +__FBSDID("$FreeBSD$"); #include #include @@ -36,7 +42,6 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vhpet.c 263035 2014-03-11 16:56:00Z ty #include #include #include -#include #include @@ -52,7 +57,7 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vhpet.c 263035 2014-03-11 16:56:00Z ty static MALLOC_DEFINE(M_VHPET, "vhpet", "bhyve virtual hpet"); -#define HPET_FREQ 10000000 /* 10.0 Mhz */ +#define HPET_FREQ 16777216 /* 16.7 (2^24) Mhz */ #define FS_PER_S 1000000000000000ul /* Timer N Configuration and Capabilities Register */ @@ -104,7 +109,6 @@ vhpet_capabilities(void) uint64_t cap = 0; cap |= 0x8086 << 16; /* vendor id */ - cap |= HPET_CAP_LEG_RT; /* legacy routing capable */ cap |= (VHPET_NUM_TIMERS - 1) << 8; /* number of timers */ cap |= 1; /* revision */ cap &= ~HPET_CAP_COUNT_SIZE; /* 32-bit timer */ @@ -127,15 +131,6 @@ vhpet_timer_msi_enabled(struct vhpet *vhpet, int n) { const uint64_t msi_enable = HPET_TCAP_FSB_INT_DEL | HPET_TCNF_FSB_EN; - /* - * LegacyReplacement Route configuration takes precedence over MSI - * for timers 0 and 1. - */ - if (n == 0 || n == 1) { - if (vhpet->config & HPET_CNF_LEG_RT) - return (false); - } - if ((vhpet->timer[n].cap_config & msi_enable) == msi_enable) return (true); else @@ -152,41 +147,9 @@ vhpet_timer_ioapic_pin(struct vhpet *vhpet, int n) if (vhpet_timer_msi_enabled(vhpet, n)) return (0); - if (vhpet->config & HPET_CNF_LEG_RT) { - /* - * In "legacy routing" timers 0 and 1 are connected to - * ioapic pins 2 and 8 respectively. - */ - switch (n) { - case 0: - return (2); - case 1: - return (8); - } - } - return ((vhpet->timer[n].cap_config & HPET_TCNF_INT_ROUTE) >> 9); } -static __inline int -vhpet_timer_atpic_pin(struct vhpet *vhpet, int n) -{ - if (vhpet->config & HPET_CNF_LEG_RT) { - /* - * In "legacy routing" timers 0 and 1 are connected to - * 8259 master pin 0 and slave pin 0 respectively. - */ - switch (n) { - case 0: - return (0); - case 1: - return (8); - } - } - - return (-1); -} - static uint32_t vhpet_counter(struct vhpet *vhpet, sbintime_t *nowptr) { @@ -211,7 +174,7 @@ vhpet_counter(struct vhpet *vhpet, sbintime_t *nowptr) /* * The sbinuptime corresponding to the 'countbase' is * meaningless when the counter is disabled. Make sure - * that the the caller doesn't want to use it. + * that the caller doesn't want to use it. 
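+		 *
+		 * (When counting is enabled, the value returned by this
+		 * function is roughly:
+		 *
+		 *	countbase + (sbinuptime() - countbase_sbt) / freq_sbt
+		 *
+		 * so both anchors are only meaningful while the counter
+		 * runs.)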
*/ KASSERT(nowptr == NULL, ("vhpet_counter: nowptr must be NULL")); } @@ -221,17 +184,12 @@ vhpet_counter(struct vhpet *vhpet, sbintime_t *nowptr) static void vhpet_timer_clear_isr(struct vhpet *vhpet, int n) { - int pin, legacy_pin; + int pin; if (vhpet->isr & (1 << n)) { pin = vhpet_timer_ioapic_pin(vhpet, n); KASSERT(pin != 0, ("vhpet timer %d irq incorrectly routed", n)); vioapic_deassert_irq(vhpet->vm, pin); - - legacy_pin = vhpet_timer_atpic_pin(vhpet, n); - if (legacy_pin != -1) - vatpic_deassert_irq(vhpet->vm, legacy_pin); - vhpet->isr &= ~(1 << n); } } @@ -257,12 +215,6 @@ vhpet_timer_edge_trig(struct vhpet *vhpet, int n) KASSERT(!vhpet_timer_msi_enabled(vhpet, n), ("vhpet_timer_edge_trig: " "timer %d is using MSI", n)); - /* The legacy replacement interrupts are always edge triggered */ - if (vhpet->config & HPET_CNF_LEG_RT) { - if (n == 0 || n == 1) - return (true); - } - if ((vhpet->timer[n].cap_config & HPET_TCNF_INT_TYPE) == 0) return (true); else @@ -272,7 +224,7 @@ vhpet_timer_edge_trig(struct vhpet *vhpet, int n) static void vhpet_timer_interrupt(struct vhpet *vhpet, int n) { - int pin, legacy_pin; + int pin; /* If interrupts are not enabled for this timer then just return. */ if (!vhpet_timer_interrupt_enabled(vhpet, n)) @@ -298,17 +250,11 @@ vhpet_timer_interrupt(struct vhpet *vhpet, int n) return; } - legacy_pin = vhpet_timer_atpic_pin(vhpet, n); - if (vhpet_timer_edge_trig(vhpet, n)) { vioapic_pulse_irq(vhpet->vm, pin); - if (legacy_pin != -1) - vatpic_pulse_irq(vhpet->vm, legacy_pin); } else { vhpet->isr |= 1 << n; vioapic_assert_irq(vhpet->vm, pin); - if (legacy_pin != -1) - vatpic_assert_irq(vhpet->vm, legacy_pin); } } @@ -402,10 +348,6 @@ vhpet_start_timer(struct vhpet *vhpet, int n, uint32_t counter, sbintime_t now) { sbintime_t delta, precision; - /* If interrupts are not enabled for this timer then just return. */ - if (!vhpet_timer_interrupt_enabled(vhpet, n)) - return; - if (vhpet->timer[n].comprate != 0) vhpet_adjust_compval(vhpet, n, counter); else { @@ -588,6 +530,13 @@ vhpet_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t val, int size, counter = vhpet_counter(vhpet, nowptr); oldval = vhpet->config; update_register(&vhpet->config, data, mask); + + /* + * LegacyReplacement Routing is not supported so clear the + * bit explicitly. + */ + vhpet->config &= ~HPET_CNF_LEG_RT; + if ((oldval ^ vhpet->config) & HPET_CNF_ENABLE) { if (vhpet_counter_enabled(vhpet)) { vhpet_start_counting(vhpet); @@ -777,8 +726,10 @@ vhpet_init(struct vm *vm) vhpet->freq_sbt = bttosbt(bt); pincount = vioapic_pincount(vm); - if (pincount >= 24) - allowed_irqs = 0x00f00000; /* irqs 20, 21, 22 and 23 */ + if (pincount >= 32) + allowed_irqs = 0xff000000; /* irqs 24-31 */ + else if (pincount >= 20) + allowed_irqs = 0xf << (pincount - 4); /* 4 upper irqs */ else allowed_irqs = 0; @@ -819,3 +770,12 @@ vhpet_getcap(struct vm_hpet_cap *cap) cap->capabilities = vhpet_capabilities(); return (0); } +#ifndef __FreeBSD__ +void +vhpet_localize_resources(struct vhpet *vhpet) +{ + for (uint_t i = 0; i < VHPET_NUM_TIMERS; i++) { + vmm_glue_callout_localize(&vhpet->timer[i].callout); + } +} +#endif /* __FreeBSD */ diff --git a/usr/src/uts/i86pc/io/vmm/io/vhpet.h b/usr/src/uts/i86pc/io/vmm/io/vhpet.h index 868809d166..8e28241b32 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vhpet.h +++ b/usr/src/uts/i86pc/io/vmm/io/vhpet.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Tycho Nightingale * Copyright (c) 2013 Neel Natu * All rights reserved. 
@@ -24,7 +26,11 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/io/vhpet.h 258579 2013-11-25 19:04:51Z neel $ + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. */ #ifndef _VHPET_H_ @@ -41,4 +47,8 @@ int vhpet_mmio_read(void *vm, int vcpuid, uint64_t gpa, uint64_t *val, int size, void *arg); int vhpet_getcap(struct vm_hpet_cap *cap); +#ifndef __FreeBSD__ +void vhpet_localize_resources(struct vhpet *vhpet); +#endif + #endif /* _VHPET_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/io/vioapic.c b/usr/src/uts/i86pc/io/vmm/io/vioapic.c index 5adf5de16d..dbd3420420 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vioapic.c +++ b/usr/src/uts/i86pc/io/vmm/io/vioapic.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Tycho Nightingale * Copyright (c) 2013 Neel Natu * All rights reserved. @@ -24,7 +26,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/io/vioapic.c 262139 2014-02-17 22:57:51Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -37,19 +39,20 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2014 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #include -__FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vioapic.c 262139 2014-02-17 22:57:51Z neel $"); +__FBSDID("$FreeBSD$"); #include #include -#include #include #include #include #include #include +#include #include #include @@ -62,7 +65,7 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vioapic.c 262139 2014-02-17 22:57:51Z #define IOREGSEL 0x00 #define IOWIN 0x10 -#define REDIR_ENTRIES 24 +#define REDIR_ENTRIES 32 #define RTBL_RO_BITS ((uint64_t)(IOART_REM_IRR | IOART_DELIVS)) struct vioapic { @@ -234,48 +237,139 @@ vioapic_pulse_irq(struct vm *vm, int irq) return (vioapic_set_irqstate(vm, irq, IRQSTATE_PULSE)); } +#define REDIR_IS_PHYS(reg) (((reg) & IOART_DESTMOD) == IOART_DESTPHY) +#define REDIR_IS_LOWPRIO(reg) (((reg) & IOART_DELMOD) == IOART_DELLOPRI) +/* Level-triggered interrupts only valid in fixed and low-priority modes */ +#define REDIR_IS_LVLTRIG(reg) \ + (((reg) & IOART_TRGRLVL) != 0 && \ + (((reg) & IOART_DELMOD) == IOART_DELFIXED || REDIR_IS_LOWPRIO(reg))) +#define REDIR_DEST(reg) ((reg) >> (32 + APIC_ID_SHIFT)) +#define REDIR_VECTOR(reg) ((reg) & IOART_INTVEC) + /* - * Reset the vlapic's trigger-mode register to reflect the ioapic pin - * configuration. + * Given a redirection entry, determine which vCPUs would be targeted. */ static void -vioapic_update_tmr(struct vm *vm, int vcpuid, void *arg) +vioapic_calcdest(struct vioapic *vioapic, uint64_t redir_ent, cpuset_t *dmask) { - struct vioapic *vioapic; - struct vlapic *vlapic; - uint32_t low, high, dest; - int delmode, pin, vector; - bool level, phys; - vlapic = vm_lapic(vm, vcpuid); - vioapic = vm_ioapic(vm); + /* + * When calculating interrupt destinations with vlapic_calcdest(), the + * legacy xAPIC format is assumed, since the system lacks interrupt + * redirection hardware. + * See vlapic_deliver_intr() for more details. + */ + vlapic_calcdest(vioapic->vm, dmask, REDIR_DEST(redir_ent), + REDIR_IS_PHYS(redir_ent), REDIR_IS_LOWPRIO(redir_ent), false); +} + +/* + * Across all redirection entries utilizing a specified vector, determine the + * set of vCPUs which would be targeted by a level-triggered interrupt. 
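+ *
+ * This exists to cope with vector sharing: if two level-triggered pins
+ * target the same vCPU with the same vector, reprogramming one pin must
+ * not clear a TMR bit the other pin still depends on. The deassertion
+ * logic in vioapic_update_tmrs() below consults this set for exactly
+ * that reason.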
+ */ +static void +vioapic_tmr_active(struct vioapic *vioapic, uint8_t vec, cpuset_t *result) +{ + u_int i; + + CPU_ZERO(result); + if (vec == 0) { + return; + } + + for (i = 0; i < REDIR_ENTRIES; i++) { + cpuset_t dest; + const uint64_t val = vioapic->rtbl[i].reg; + + if (!REDIR_IS_LVLTRIG(val) || REDIR_VECTOR(val) != vec) { + continue; + } + + CPU_ZERO(&dest); + vioapic_calcdest(vioapic, val, &dest); + CPU_OR(result, &dest); + } +} + +/* + * Update TMR state in vLAPICs after changes to vIOAPIC pin configuration + */ +static void +vioapic_update_tmrs(struct vioapic *vioapic, int vcpuid, uint64_t oldval, + uint64_t newval) +{ + cpuset_t active, allset, newset, oldset; + struct vm *vm; + uint8_t newvec, oldvec; + + vm = vioapic->vm; + CPU_ZERO(&allset); + CPU_ZERO(&newset); + CPU_ZERO(&oldset); + newvec = oldvec = 0; + + if (REDIR_IS_LVLTRIG(oldval)) { + vioapic_calcdest(vioapic, oldval, &oldset); + CPU_OR(&allset, &oldset); + oldvec = REDIR_VECTOR(oldval); + } + + if (REDIR_IS_LVLTRIG(newval)) { + vioapic_calcdest(vioapic, newval, &newset); + CPU_OR(&allset, &newset); + newvec = REDIR_VECTOR(newval); + } + + if (CPU_EMPTY(&allset) || + (CPU_CMP(&oldset, &newset) == 0 && oldvec == newvec)) { + return; + } - VIOAPIC_LOCK(vioapic); /* - * Reset all vectors to be edge-triggered. + * Since the write to the redirection table has already occurred, a + * scan of level-triggered entries referencing the old vector will find + * only entries which are now currently valid. */ - vlapic_reset_tmr(vlapic); - for (pin = 0; pin < REDIR_ENTRIES; pin++) { - low = vioapic->rtbl[pin].reg; - high = vioapic->rtbl[pin].reg >> 32; + vioapic_tmr_active(vioapic, oldvec, &active); - level = low & IOART_TRGRLVL ? true : false; - if (!level) + while (!CPU_EMPTY(&allset)) { + struct vlapic *vlapic; + u_int i; + + i = CPU_FFS(&allset) - 1; + CPU_CLR(i, &allset); + + if (oldvec == newvec && + CPU_ISSET(i, &oldset) && CPU_ISSET(i, &newset)) { continue; + } - /* - * For a level-triggered 'pin' let the vlapic figure out if - * an assertion on this 'pin' would result in an interrupt - * being delivered to it. If yes, then it will modify the - * TMR bit associated with this vector to level-triggered. - */ - phys = ((low & IOART_DESTMOD) == IOART_DESTPHY); - delmode = low & IOART_DELMOD; - vector = low & IOART_INTVEC; - dest = high >> APIC_ID_SHIFT; - vlapic_set_tmr_level(vlapic, dest, phys, delmode, vector); + if (i != vcpuid) { + vcpu_block_run(vm, i); + } + + vlapic = vm_lapic(vm, i); + if (CPU_ISSET(i, &oldset)) { + /* + * Perform the deassertion if no other level-triggered + * IOAPIC entries target this vCPU with the old vector + * + * Note: Sharing of vectors like that should be + * extremely rare in modern operating systems and was + * previously unsupported by the bhyve vIOAPIC. 
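+			 *
+			 * Example: pins 4 and 5 are both level-triggered
+			 * with vector 0x60 targeting vCPU 0. Retargeting
+			 * pin 4 elsewhere must leave vCPU 0's TMR bit for
+			 * 0x60 set, since pin 5 still routes there; the
+			 * 'active' check below ensures exactly that.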
+ */ + if (!CPU_ISSET(i, &active)) { + vlapic_tmr_set(vlapic, oldvec, false); + } + } + if (CPU_ISSET(i, &newset)) { + vlapic_tmr_set(vlapic, newvec, true); + } + + if (i != vcpuid) { + vcpu_unblock_run(vm, i); + } } - VIOAPIC_UNLOCK(vioapic); } static uint32_t @@ -319,7 +413,6 @@ vioapic_write(struct vioapic *vioapic, int vcpuid, uint32_t addr, uint32_t data) uint64_t data64, mask64; uint64_t last, changed; int regnum, pin, lshift; - cpuset_t allvcpus; regnum = addr & 0xff; switch (regnum) { @@ -355,20 +448,15 @@ vioapic_write(struct vioapic *vioapic, int vcpuid, uint32_t addr, uint32_t data) /* * If any fields in the redirection table entry (except mask - * or polarity) have changed then rendezvous all the vcpus - * to update their vlapic trigger-mode registers. + * or polarity) have changed then update the trigger-mode + * registers on all the vlapics. */ changed = last ^ vioapic->rtbl[pin].reg; if (changed & ~(IOART_INTMASK | IOART_INTPOL)) { VIOAPIC_CTR1(vioapic, "ioapic pin%d: recalculate " "vlapic trigger-mode register", pin); - VIOAPIC_UNLOCK(vioapic); -#if 0 /* XXX */ - allvcpus = vm_active_cpus(vioapic->vm); - vm_smp_rendezvous(vioapic->vm, vcpuid, allvcpus, - vioapic_update_tmr, NULL); -#endif - VIOAPIC_LOCK(vioapic); + vioapic_update_tmrs(vioapic, vcpuid, last, + vioapic->rtbl[pin].reg); } /* diff --git a/usr/src/uts/i86pc/io/vmm/io/vioapic.h b/usr/src/uts/i86pc/io/vmm/io/vioapic.h index 9479ebb10e..6bf3e80e05 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vioapic.h +++ b/usr/src/uts/i86pc/io/vmm/io/vioapic.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Tycho Nightingale * Copyright (c) 2013 Neel Natu * All rights reserved. @@ -24,7 +26,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/io/vioapic.h 258699 2013-11-27 22:18:08Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -45,10 +47,6 @@ #define VIOAPIC_BASE 0xFEC00000 #define VIOAPIC_SIZE 4096 -#include "vdev.h" - -struct vm; - struct vioapic *vioapic_init(struct vm *vm); void vioapic_cleanup(struct vioapic *vioapic); diff --git a/usr/src/uts/i86pc/io/vmm/io/vlapic.c b/usr/src/uts/i86pc/io/vmm/io/vlapic.c index 9a0a3058ea..4e58249c8d 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vlapic.c +++ b/usr/src/uts/i86pc/io/vmm/io/vlapic.c @@ -1,6 +1,9 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. + * Copyright (c) 2019 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -23,7 +26,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/io/vlapic.c 273375 2014-10-21 07:10:43Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,10 +39,11 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2014 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. 
*/ #include -__FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vlapic.c 273375 2014-10-21 07:10:43Z neel $"); +__FBSDID("$FreeBSD$"); #include #include @@ -57,7 +61,6 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vlapic.c 273375 2014-10-21 07:10:43Z n #include -#include "vmm_ipi.h" #include "vmm_lapic.h" #include "vmm_ktr.h" #include "vmm_stat.h" @@ -82,7 +85,15 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vlapic.c 273375 2014-10-21 07:10:43Z n #define VLAPIC_TIMER_UNLOCK(vlapic) mtx_unlock_spin(&((vlapic)->timer_mtx)) #define VLAPIC_TIMER_LOCKED(vlapic) mtx_owned(&((vlapic)->timer_mtx)) -#define VLAPIC_BUS_FREQ tsc_freq +/* + * APIC timer frequency: + * - arbitrary but chosen to be in the ballpark of contemporary hardware. + * - power-of-two to avoid loss of precision when converted to a bintime. + */ +#define VLAPIC_BUS_FREQ (128 * 1024 * 1024) + +static void vlapic_set_error(struct vlapic *, uint32_t, bool); +static void vlapic_tmr_reset(struct vlapic *); static __inline uint32_t vlapic_get_id(struct vlapic *vlapic) @@ -259,7 +270,6 @@ vlapic_dcr_write_handler(struct vlapic *vlapic) VLAPIC_TIMER_UNLOCK(vlapic); } - void vlapic_esr_write_handler(struct vlapic *vlapic) { @@ -287,7 +297,8 @@ vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level) } if (vector < 16) { - vlapic_set_error(vlapic, APIC_ESR_RECEIVE_ILLEGAL_VECTOR); + vlapic_set_error(vlapic, APIC_ESR_RECEIVE_ILLEGAL_VECTOR, + false); VLAPIC_CTR1(vlapic, "vlapic ignoring interrupt to vector %d", vector); return (1); @@ -449,20 +460,22 @@ vlapic_mask_lvts(struct vlapic *vlapic) } static int -vlapic_fire_lvt(struct vlapic *vlapic, uint32_t lvt) +vlapic_fire_lvt(struct vlapic *vlapic, u_int lvt) { - uint32_t vec, mode; + uint32_t mode, reg, vec; - if (lvt & APIC_LVT_M) - return (0); + reg = atomic_load_acq_32(&vlapic->lvt_last[lvt]); - vec = lvt & APIC_LVT_VECTOR; - mode = lvt & APIC_LVT_DM; + if (reg & APIC_LVT_M) + return (0); + vec = reg & APIC_LVT_VECTOR; + mode = reg & APIC_LVT_DM; switch (mode) { case APIC_LVT_DM_FIXED: if (vec < 16) { - vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR); + vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR, + lvt == APIC_LVT_ERROR); return (0); } if (vlapic_set_intr_ready(vlapic, vec, false)) @@ -566,6 +579,8 @@ vlapic_update_ppr(struct vlapic *vlapic) VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr); } +static VMM_STAT(VLAPIC_GRATUITOUS_EOI, "EOI without any in-service interrupt"); + static void vlapic_process_eoi(struct vlapic *vlapic) { @@ -576,11 +591,7 @@ vlapic_process_eoi(struct vlapic *vlapic) isrptr = &lapic->isr0; tmrptr = &lapic->tmr0; - /* - * The x86 architecture reserves the the first 32 vectors for use - * by the processor. 
- */ - for (i = 7; i > 0; i--) { + for (i = 7; i >= 0; i--) { idx = i * 4; bitpos = fls(isrptr[idx]); if (bitpos-- != 0) { @@ -589,17 +600,21 @@ vlapic_process_eoi(struct vlapic *vlapic) vlapic->isrvec_stk_top); } isrptr[idx] &= ~(1 << bitpos); + vector = i * 32 + bitpos; + VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "EOI vector %d", + vector); VLAPIC_CTR_ISR(vlapic, "vlapic_process_eoi"); vlapic->isrvec_stk_top--; vlapic_update_ppr(vlapic); if ((tmrptr[idx] & (1 << bitpos)) != 0) { - vector = i * 32 + bitpos; vioapic_process_eoi(vlapic->vm, vlapic->vcpuid, vector); } return; } } + VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "Gratuitous EOI"); + vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_GRATUITOUS_EOI, 1); } static __inline int @@ -621,22 +636,22 @@ vlapic_periodic_timer(struct vlapic *vlapic) static VMM_STAT(VLAPIC_INTR_ERROR, "error interrupts generated by vlapic"); -void -vlapic_set_error(struct vlapic *vlapic, uint32_t mask) +static void +vlapic_set_error(struct vlapic *vlapic, uint32_t mask, bool lvt_error) { - uint32_t lvt; vlapic->esr_pending |= mask; - if (vlapic->esr_firing) + + /* + * Avoid infinite recursion if the error LVT itself is configured with + * an illegal vector. + */ + if (lvt_error) return; - vlapic->esr_firing = 1; - // The error LVT always uses the fixed delivery mode. - lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_ERROR_LVT); - if (vlapic_fire_lvt(vlapic, lvt | APIC_LVT_DM_FIXED)) { + if (vlapic_fire_lvt(vlapic, APIC_LVT_ERROR)) { vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_ERROR, 1); } - vlapic->esr_firing = 0; } static VMM_STAT(VLAPIC_INTR_TIMER, "timer interrupts generated by vlapic"); @@ -644,13 +659,10 @@ static VMM_STAT(VLAPIC_INTR_TIMER, "timer interrupts generated by vlapic"); static void vlapic_fire_timer(struct vlapic *vlapic) { - uint32_t lvt; KASSERT(VLAPIC_TIMER_LOCKED(vlapic), ("vlapic_fire_timer not locked")); - - // The timer LVT always uses the fixed delivery mode. 
- lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); - if (vlapic_fire_lvt(vlapic, lvt | APIC_LVT_DM_FIXED)) { + + if (vlapic_fire_lvt(vlapic, APIC_LVT_TIMER)) { VLAPIC_CTR0(vlapic, "vlapic timer fired"); vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_TIMER, 1); } @@ -662,10 +674,8 @@ static VMM_STAT(VLAPIC_INTR_CMC, void vlapic_fire_cmci(struct vlapic *vlapic) { - uint32_t lvt; - lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_CMCI_LVT); - if (vlapic_fire_lvt(vlapic, lvt)) { + if (vlapic_fire_lvt(vlapic, APIC_LVT_CMCI)) { vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_CMC, 1); } } @@ -676,7 +686,6 @@ static VMM_STAT_ARRAY(LVTS_TRIGGERRED, VLAPIC_MAXLVT_INDEX + 1, int vlapic_trigger_lvt(struct vlapic *vlapic, int vector) { - uint32_t lvt; if (vlapic_enabled(vlapic) == false) { /* @@ -699,35 +708,20 @@ vlapic_trigger_lvt(struct vlapic *vlapic, int vector) switch (vector) { case APIC_LVT_LINT0: - lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_LINT0_LVT); - break; case APIC_LVT_LINT1: - lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_LINT1_LVT); - break; case APIC_LVT_TIMER: - lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); - lvt |= APIC_LVT_DM_FIXED; - break; case APIC_LVT_ERROR: - lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_ERROR_LVT); - lvt |= APIC_LVT_DM_FIXED; - break; case APIC_LVT_PMC: - lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_PERF_LVT); - break; case APIC_LVT_THERMAL: - lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_THERM_LVT); - break; case APIC_LVT_CMCI: - lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_CMCI_LVT); + if (vlapic_fire_lvt(vlapic, vector)) { + vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid, + LVTS_TRIGGERRED, vector, 1); + } break; default: return (EINVAL); } - if (vlapic_fire_lvt(vlapic, lvt)) { - vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid, - LVTS_TRIGGERRED, vector, 1); - } return (0); } @@ -831,11 +825,11 @@ vlapic_icrtmr_write_handler(struct vlapic *vlapic) /* * This function populates 'dmask' with the set of vcpus that match the * addressing specified by the (dest, phys, lowprio) tuple. - * + * * 'x2apic_dest' specifies whether 'dest' is interpreted as x2APIC (32-bit) * or xAPIC (8-bit) destination field. */ -static void +void vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys, bool lowprio, bool x2apic_dest) { @@ -860,12 +854,12 @@ vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys, */ CPU_ZERO(dmask); vcpuid = vm_apicid2vcpuid(vm, dest); - if (vcpuid < VM_MAXCPU) + if (vcpuid < vm_get_maxcpus(vm)) CPU_SET(vcpuid, dmask); } else { /* * In the "Flat Model" the MDA is interpreted as an 8-bit wide - * bitmask. This model is only avilable in the xAPIC mode. + * bitmask. This model is only available in the xAPIC mode. */ mda_flat_ldest = dest & 0xff; @@ -883,7 +877,7 @@ vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys, /* * Logical mode: match each APIC that has a bit set - * in it's LDR that matches a bit in the ldest. + * in its LDR that matches a bit in the ldest. 
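+		 *
+		 * For example, with an ldest of 0x03 any vCPU whose LDR
+		 * logical ID has bit 0 or bit 1 set is selected, since
+		 * delivery only requires a nonzero intersection of the
+		 * two bitmasks.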
*/ CPU_ZERO(dmask); amask = vm_active_cpus(vm); @@ -987,6 +981,7 @@ vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu) struct vlapic *vlapic2; struct vm_exit *vmexit; struct LAPIC *lapic; + uint16_t maxcpus; lapic = vlapic->apic_page; lapic->icr_lo &= ~APIC_DELSTAT_PEND; @@ -1000,7 +995,7 @@ vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu) mode = icrval & APIC_DELMODE_MASK; if (mode == APIC_DELMODE_FIXED && vec < 16) { - vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR); + vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR, false); VLAPIC_CTR1(vlapic, "Ignoring invalid IPI %d", vec); return (0); } @@ -1048,11 +1043,12 @@ vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu) return (0); /* handled completely in the kernel */ } + maxcpus = vm_get_maxcpus(vlapic->vm); if (mode == APIC_DELMODE_INIT) { if ((icrval & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT) return (0); - if (vlapic->vcpuid == 0 && dest != 0 && dest < VM_MAXCPU) { + if (vlapic->vcpuid == 0 && dest != 0 && dest < maxcpus) { vlapic2 = vm_lapic(vlapic->vm, dest); /* move from INIT to waiting-for-SIPI state */ @@ -1065,7 +1061,7 @@ vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu) } if (mode == APIC_DELMODE_STARTUP) { - if (vlapic->vcpuid == 0 && dest != 0 && dest < VM_MAXCPU) { + if (vlapic->vcpuid == 0 && dest != 0 && dest < maxcpus) { vlapic2 = vm_lapic(vlapic->vm, dest); /* @@ -1118,11 +1114,7 @@ vlapic_pending_intr(struct vlapic *vlapic, int *vecptr) irrptr = &lapic->irr0; - /* - * The x86 architecture reserves the the first 32 vectors for use - * by the processor. - */ - for (i = 7; i > 0; i--) { + for (i = 7; i >= 0; i--) { idx = i * 4; val = atomic_load_acq_int(&irrptr[idx]); bitpos = fls(val); @@ -1461,7 +1453,7 @@ vlapic_reset(struct vlapic *vlapic) lapic->dfr = 0xffffffff; lapic->svr = APIC_SVR_VECTOR; vlapic_mask_lvts(vlapic); - vlapic_reset_tmr(vlapic); + vlapic_tmr_reset(vlapic); lapic->dcr_timer = 0; vlapic_dcr_write_handler(vlapic); @@ -1478,7 +1470,8 @@ void vlapic_init(struct vlapic *vlapic) { KASSERT(vlapic->vm != NULL, ("vlapic_init: vm is not initialized")); - KASSERT(vlapic->vcpuid >= 0 && vlapic->vcpuid < VM_MAXCPU, + KASSERT(vlapic->vcpuid >= 0 && + vlapic->vcpuid < vm_get_maxcpus(vlapic->vm), ("vlapic_init: vcpuid is not initialized")); KASSERT(vlapic->apic_page != NULL, ("vlapic_init: apic_page is not " "initialized")); @@ -1628,60 +1621,85 @@ vlapic_enabled(struct vlapic *vlapic) } static void -vlapic_set_tmr(struct vlapic *vlapic, int vector, bool level) +vlapic_tmr_reset(struct vlapic *vlapic) { struct LAPIC *lapic; - uint32_t *tmrptr, mask; - int idx; lapic = vlapic->apic_page; - tmrptr = &lapic->tmr0; - idx = (vector / 32) * 4; - mask = 1 << (vector % 32); - if (level) - tmrptr[idx] |= mask; - else - tmrptr[idx] &= ~mask; - - if (vlapic->ops.set_tmr != NULL) - (*vlapic->ops.set_tmr)(vlapic, vector, level); + lapic->tmr0 = lapic->tmr1 = lapic->tmr2 = lapic->tmr3 = 0; + lapic->tmr4 = lapic->tmr5 = lapic->tmr6 = lapic->tmr7 = 0; + vlapic->tmr_pending = 1; } +/* + * Synchronize TMR designations into the LAPIC state. + * The vCPU must be in the VCPU_RUNNING state. 
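+ *
+ * The flow, roughly: vioapic_update_tmrs() records intended changes via
+ * vlapic_tmr_set(), filling tmr_vec_assert/tmr_vec_deassert and bumping
+ * tmr_pending, and the vCPU folds them into the APIC page on its way
+ * back into guest context:
+ *
+ *	tmr[i] = (tmr[i] & ~tmr_vec_deassert[i]) | tmr_vec_assert[i];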
+ */
 void
-vlapic_reset_tmr(struct vlapic *vlapic)
+vlapic_tmr_update(struct vlapic *vlapic)
 {
-	int vector;
+	struct LAPIC *lapic;
+	uint32_t *tmrptr;
+	uint32_t result[VLAPIC_TMR_CNT];
+	u_int i, tmr_idx;
+
+	if (vlapic->tmr_pending == 0) {
+		return;
+	}
+
+	lapic = vlapic->apic_page;
+	tmrptr = &lapic->tmr0;
+
+	VLAPIC_CTR0(vlapic, "synchronizing TMR");
+	for (i = 0; i < VLAPIC_TMR_CNT; i++) {
+		tmr_idx = i * 4;
 
-	VLAPIC_CTR0(vlapic, "vlapic resetting all vectors to edge-triggered");
+		tmrptr[tmr_idx] &= ~vlapic->tmr_vec_deassert[i];
+		tmrptr[tmr_idx] |= vlapic->tmr_vec_assert[i];
+		vlapic->tmr_vec_deassert[i] = 0;
+		vlapic->tmr_vec_assert[i] = 0;
+		result[i] = tmrptr[tmr_idx];
+	}
+	vlapic->tmr_pending = 0;
 
-	for (vector = 0; vector <= 255; vector++)
-		vlapic_set_tmr(vlapic, vector, false);
+	if (vlapic->ops.set_tmr != NULL) {
+		(*vlapic->ops.set_tmr)(vlapic, result);
+	}
 }
 
+/*
+ * Designate the TMR state for a given interrupt vector.
+ * The caller must hold the vIOAPIC lock and prevent the vCPU corresponding to
+ * this vLAPIC instance from being in or entering the VCPU_RUNNING state.
+ */
 void
-vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys,
-    int delmode, int vector)
+vlapic_tmr_set(struct vlapic *vlapic, uint8_t vector, bool active)
 {
-	cpuset_t dmask;
-	bool lowprio;
-
-	KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector));
+	const uint32_t idx = vector / 32;
+	const uint32_t mask = 1 << (vector % 32);
+
+	VLAPIC_CTR2(vlapic, "TMR for vector %u %sasserted", vector,
+	    active ? "" : "de");
+	if (active) {
+		vlapic->tmr_vec_assert[idx] |= mask;
+		vlapic->tmr_vec_deassert[idx] &= ~mask;
+	} else {
+		vlapic->tmr_vec_deassert[idx] |= mask;
+		vlapic->tmr_vec_assert[idx] &= ~mask;
+	}
 
 	/*
-	 * A level trigger is valid only for fixed and lowprio delivery modes.
+	 * Track the number of TMR changes between calls to vlapic_tmr_update.
+	 * While a simple boolean would suffice, this count may be useful when
+	 * tracing or debugging, and is cheap to calculate.
 	 */
-	if (delmode != APIC_DELMODE_FIXED && delmode != APIC_DELMODE_LOWPRIO) {
-		VLAPIC_CTR1(vlapic, "Ignoring level trigger-mode for "
-		    "delivery-mode %d", delmode);
-		return;
-	}
-
-	lowprio = (delmode == APIC_DELMODE_LOWPRIO);
-	vlapic_calcdest(vlapic->vm, &dmask, dest, phys, lowprio, false);
-
-	if (!CPU_ISSET(vlapic->vcpuid, &dmask))
-		return;
+	vlapic->tmr_pending = MIN(UINT32_MAX - 1, vlapic->tmr_pending) + 1;
+}
 
-	VLAPIC_CTR1(vlapic, "vector %d set to level-triggered", vector);
-	vlapic_set_tmr(vlapic, vector, true);
+#ifndef __FreeBSD__
+void
+vlapic_localize_resources(struct vlapic *vlapic)
+{
+	vmm_glue_callout_localize(&vlapic->callout);
 }
+#endif /* __FreeBSD__ */
diff --git a/usr/src/uts/i86pc/io/vmm/io/vlapic.h b/usr/src/uts/i86pc/io/vmm/io/vlapic.h
index 3fa705d818..e1a52551a9 100644
--- a/usr/src/uts/i86pc/io/vmm/io/vlapic.h
+++ b/usr/src/uts/i86pc/io/vmm/io/vlapic.h
@@ -1,4 +1,6 @@
 /*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
@@ -23,7 +25,11 @@
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
- * $FreeBSD: head/sys/amd64/vmm/io/vlapic.h 262281 2014-02-21 06:03:54Z neel $
+ * $FreeBSD$
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
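
A condensed model of the deassert-then-assert merge performed by vlapic_tmr_update() above; this is illustrative only, and again uses a packed array in place of the 16-byte-strided TMR registers:

#include <stdint.h>

#define TMR_CNT	8

static void
tmr_apply(uint32_t tmr[TMR_CNT], uint32_t deasserted[TMR_CNT],
    uint32_t asserted[TMR_CNT])
{
	int i;

	for (i = 0; i < TMR_CNT; i++) {
		/* Clear deassertions first, then apply assertions. */
		tmr[i] = (tmr[i] & ~deasserted[i]) | asserted[i];
		deasserted[i] = 0;
		asserted[i] = 0;
	}
}

Buffering intent in separate assert/deassert sets lets the vIOAPIC record trigger-mode changes without racing a running vCPU, which merges them at a safe point before injection.
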
*/ #ifndef _VLAPIC_H_ @@ -69,7 +75,6 @@ int vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level); */ void vlapic_post_intr(struct vlapic *vlapic, int hostcpu, int ipinum); -void vlapic_set_error(struct vlapic *vlapic, uint32_t mask); void vlapic_fire_cmci(struct vlapic *vlapic); int vlapic_trigger_lvt(struct vlapic *vlapic, int vector); @@ -81,16 +86,11 @@ bool vlapic_enabled(struct vlapic *vlapic); void vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys, int delmode, int vec); -/* Reset the trigger-mode bits for all vectors to be edge-triggered */ -void vlapic_reset_tmr(struct vlapic *vlapic); +void vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys, + bool lowprio, bool x2apic_dest); -/* - * Set the trigger-mode bit associated with 'vector' to level-triggered if - * the (dest,phys,delmode) tuple resolves to an interrupt being delivered to - * this 'vlapic'. - */ -void vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys, - int delmode, int vector); +void vlapic_tmr_update(struct vlapic *vlapic); +void vlapic_tmr_set(struct vlapic *vlapic, uint8_t vector, bool active); void vlapic_set_cr8(struct vlapic *vlapic, uint64_t val); uint64_t vlapic_get_cr8(struct vlapic *vlapic); @@ -106,4 +106,9 @@ void vlapic_icrtmr_write_handler(struct vlapic *vlapic); void vlapic_dcr_write_handler(struct vlapic *vlapic); void vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset); void vlapic_self_ipi_handler(struct vlapic *vlapic, uint64_t val); + +#ifndef __FreeBSD__ +void vlapic_localize_resources(struct vlapic *vlapic); +#endif + #endif /* _VLAPIC_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h b/usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h index f9bd2e0e8b..5795d48d52 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h +++ b/usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Neel Natu * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/io/vlapic_priv.h 263211 2014-03-15 23:09:34Z tychon $ + * $FreeBSD$ */ #ifndef _VLAPIC_PRIV_H_ @@ -136,6 +138,8 @@ enum boot_state { #define VLAPIC_MAXLVT_INDEX APIC_LVT_CMCI +#define VLAPIC_TMR_CNT 8 + struct vlapic; struct vlapic_ops { @@ -143,7 +147,7 @@ struct vlapic_ops { int (*pending_intr)(struct vlapic *vlapic, int *vecptr); void (*intr_accepted)(struct vlapic *vlapic, int vector); void (*post_intr)(struct vlapic *vlapic, int hostcpu); - void (*set_tmr)(struct vlapic *vlapic, int vector, bool level); + void (*set_tmr)(struct vlapic *vlapic, const uint32_t *result); void (*enable_x2apic_mode)(struct vlapic *vlapic); }; @@ -154,7 +158,7 @@ struct vlapic { struct vlapic_ops ops; uint32_t esr_pending; - int esr_firing; + uint32_t tmr_pending; struct callout callout; /* vlapic timer */ struct bintime timer_fire_bt; /* callout expiry time */ @@ -182,6 +186,19 @@ struct vlapic { */ uint32_t svr_last; uint32_t lvt_last[VLAPIC_MAXLVT_INDEX + 1]; + + /* + * Store intended modifications to the trigger-mode register state. + * Along with the tmr_pending counter above, these are protected by the + * vIOAPIC lock and can only be modified under specific conditions: + * + * 1. When holding the vIOAPIC lock, and the vCPU to which the vLAPIC + * belongs is prevented from entering the VCPU_RUNNING state. + * 2. 
When the owning vCPU is in the VCPU_RUNNING state, and is + * applying the TMR modifications prior to interrupt injection. + */ + uint32_t tmr_vec_deassert[VLAPIC_TMR_CNT]; + uint32_t tmr_vec_assert[VLAPIC_TMR_CNT]; }; void vlapic_init(struct vlapic *vlapic); diff --git a/usr/src/uts/i86pc/io/vmm/io/vpmtmr.c b/usr/src/uts/i86pc/io/vmm/io/vpmtmr.c new file mode 100644 index 0000000000..4df909777d --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vpmtmr.c @@ -0,0 +1,105 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014, Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include + +#include + +#include "vpmtmr.h" + +/* + * The ACPI Power Management timer is a free-running 24- or 32-bit + * timer with a frequency of 3.579545MHz + * + * This implementation will be 32-bits + */ + +#define PMTMR_FREQ 3579545 /* 3.579545MHz */ + +struct vpmtmr { + sbintime_t freq_sbt; + sbintime_t baseuptime; + uint32_t baseval; +}; + +static MALLOC_DEFINE(M_VPMTMR, "vpmtmr", "bhyve virtual acpi timer"); + +struct vpmtmr * +vpmtmr_init(struct vm *vm) +{ + struct vpmtmr *vpmtmr; + struct bintime bt; + + vpmtmr = malloc(sizeof(struct vpmtmr), M_VPMTMR, M_WAITOK | M_ZERO); + vpmtmr->baseuptime = sbinuptime(); + vpmtmr->baseval = 0; + + FREQ2BT(PMTMR_FREQ, &bt); + vpmtmr->freq_sbt = bttosbt(bt); + + return (vpmtmr); +} + +void +vpmtmr_cleanup(struct vpmtmr *vpmtmr) +{ + + free(vpmtmr, M_VPMTMR); +} + +int +vpmtmr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *val) +{ + struct vpmtmr *vpmtmr; + sbintime_t now, delta; + + if (!in || bytes != 4) + return (-1); + + vpmtmr = vm_pmtmr(vm); + + /* + * No locking needed because 'baseuptime' and 'baseval' are + * written only during initialization. 
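
For the tick arithmetic that follows in vpmtmr_handler(), a user-space model may help. This sketch is an assumption-laden illustration: pmtmr_read() and the CLOCK_MONOTONIC base are stand-ins for the handler's sbintime_t fields, and only PMTMR_FREQ comes from the patch:

#include <stdint.h>
#include <time.h>

#define PMTMR_FREQ	3579545		/* 3.579545 MHz, as above */

/* Free-running 32-bit counter since 'base'; wraps like the hardware. */
static uint32_t
pmtmr_read(const struct timespec *base)
{
	struct timespec now;
	int64_t dsec, dnsec;

	(void) clock_gettime(CLOCK_MONOTONIC, &now);
	dsec = (int64_t)now.tv_sec - base->tv_sec;
	dnsec = now.tv_nsec - base->tv_nsec;

	/* Whole seconds first, then the sub-second remainder, to avoid
	 * 64-bit overflow for long uptimes. */
	return ((uint32_t)(dsec * PMTMR_FREQ +
	    dnsec * PMTMR_FREQ / 1000000000LL));
}
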
+ */ + now = sbinuptime(); + delta = now - vpmtmr->baseuptime; + KASSERT(delta >= 0, ("vpmtmr_handler: uptime went backwards: " + "%#lx to %#lx", vpmtmr->baseuptime, now)); + *val = vpmtmr->baseval + delta / vpmtmr->freq_sbt; + + return (0); +} diff --git a/usr/src/uts/i86pc/io/vmm/io/vpmtmr.h b/usr/src/uts/i86pc/io/vmm/io/vpmtmr.h new file mode 100644 index 0000000000..e6562da5c0 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vpmtmr.h @@ -0,0 +1,44 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VPMTMR_H_ +#define _VPMTMR_H_ + +#define IO_PMTMR 0x408 + +struct vpmtmr; + +struct vpmtmr *vpmtmr_init(struct vm *vm); +void vpmtmr_cleanup(struct vpmtmr *pmtmr); + +int vpmtmr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *val); + +#endif diff --git a/usr/src/uts/i86pc/io/vmm/io/vrtc.c b/usr/src/uts/i86pc/io/vmm/io/vrtc.c new file mode 100644 index 0000000000..f12d22fc26 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vrtc.c @@ -0,0 +1,1061 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014, Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include "vmm_ktr.h" +#include "vatpic.h" +#include "vioapic.h" +#include "vrtc.h" + +/* Register layout of the RTC */ +struct rtcdev { + uint8_t sec; + uint8_t alarm_sec; + uint8_t min; + uint8_t alarm_min; + uint8_t hour; + uint8_t alarm_hour; + uint8_t day_of_week; + uint8_t day_of_month; + uint8_t month; + uint8_t year; + uint8_t reg_a; + uint8_t reg_b; + uint8_t reg_c; + uint8_t reg_d; + uint8_t nvram[36]; + uint8_t century; + uint8_t nvram2[128 - 51]; +} __packed; +CTASSERT(sizeof(struct rtcdev) == 128); +CTASSERT(offsetof(struct rtcdev, century) == RTC_CENTURY); + +struct vrtc { + struct vm *vm; + struct mtx mtx; + struct callout callout; + u_int addr; /* RTC register to read or write */ + sbintime_t base_uptime; + time_t base_rtctime; + struct rtcdev rtcdev; +}; + +#define VRTC_LOCK(vrtc) mtx_lock(&((vrtc)->mtx)) +#define VRTC_UNLOCK(vrtc) mtx_unlock(&((vrtc)->mtx)) +#define VRTC_LOCKED(vrtc) mtx_owned(&((vrtc)->mtx)) + +/* + * RTC time is considered "broken" if: + * - RTC updates are halted by the guest + * - RTC date/time fields have invalid values + */ +#define VRTC_BROKEN_TIME ((time_t)-1) + +#define RTC_IRQ 8 +#define RTCSB_BIN 0x04 +#define RTCSB_ALL_INTRS (RTCSB_UINTR | RTCSB_AINTR | RTCSB_PINTR) +#define rtc_halted(vrtc) ((vrtc->rtcdev.reg_b & RTCSB_HALT) != 0) +#define aintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_AINTR) != 0) +#define pintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_PINTR) != 0) +#define uintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_UINTR) != 0) + +static void vrtc_callout_handler(void *arg); +static void vrtc_set_reg_c(struct vrtc *vrtc, uint8_t newval); + +static MALLOC_DEFINE(M_VRTC, "vrtc", "bhyve virtual rtc"); + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, vrtc, CTLFLAG_RW, NULL, NULL); + +static int rtc_flag_broken_time = 1; +SYSCTL_INT(_hw_vmm_vrtc, OID_AUTO, flag_broken_time, CTLFLAG_RDTUN, + &rtc_flag_broken_time, 0, "Stop guest when invalid RTC time is detected"); + +static __inline bool +divider_enabled(int reg_a) +{ + /* + * The RTC is counting only when dividers are not held in reset. 
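
The register encoding applied by rtcset() just above (and undone by rtcget() further down) is packed BCD unless RTCSB_BIN is set. A stand-alone sketch of the two conversions, with hypothetical helper names:

#include <stdint.h>

/* Encode 0-99 as packed BCD, e.g. 59 -> 0x59. */
static uint8_t
bin_to_bcd(uint8_t val)
{
	return ((uint8_t)(((val / 10) << 4) | (val % 10)));
}

/* Decode packed BCD; returns -1 if either nibble exceeds 9. */
static int
bcd_to_bin(uint8_t val)
{
	uint8_t lower = val & 0xf;
	uint8_t upper = (val >> 4) & 0xf;

	if (lower > 9 || upper > 9)
		return (-1);
	return (upper * 10 + lower);
}
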
+ */ + return ((reg_a & 0x70) == 0x20); +} + +static __inline bool +update_enabled(struct vrtc *vrtc) +{ + /* + * RTC date/time can be updated only if: + * - divider is not held in reset + * - guest has not disabled updates + * - the date/time fields have valid contents + */ + if (!divider_enabled(vrtc->rtcdev.reg_a)) + return (false); + + if (rtc_halted(vrtc)) + return (false); + + if (vrtc->base_rtctime == VRTC_BROKEN_TIME) + return (false); + + return (true); +} + +static time_t +vrtc_curtime(struct vrtc *vrtc, sbintime_t *basetime) +{ + sbintime_t now, delta; + time_t t, secs; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + t = vrtc->base_rtctime; + *basetime = vrtc->base_uptime; + if (update_enabled(vrtc)) { + now = sbinuptime(); + delta = now - vrtc->base_uptime; + KASSERT(delta >= 0, ("vrtc_curtime: uptime went backwards: " + "%#lx to %#lx", vrtc->base_uptime, now)); + secs = delta / SBT_1S; + t += secs; + *basetime += secs * SBT_1S; + } + return (t); +} + +static __inline uint8_t +rtcset(struct rtcdev *rtc, int val) +{ + + KASSERT(val >= 0 && val < 100, ("%s: invalid bin2bcd index %d", + __func__, val)); + + return ((rtc->reg_b & RTCSB_BIN) ? val : bin2bcd_data[val]); +} + +static void +secs_to_rtc(time_t rtctime, struct vrtc *vrtc, int force_update) +{ + struct clocktime ct; + struct timespec ts; + struct rtcdev *rtc; + int hour; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + if (rtctime < 0) { + KASSERT(rtctime == VRTC_BROKEN_TIME, + ("%s: invalid vrtc time %#lx", __func__, rtctime)); + return; + } + + /* + * If the RTC is halted then the guest has "ownership" of the + * date/time fields. Don't update the RTC date/time fields in + * this case (unless forced). + */ + if (rtc_halted(vrtc) && !force_update) + return; + + ts.tv_sec = rtctime; + ts.tv_nsec = 0; + clock_ts_to_ct(&ts, &ct); + + KASSERT(ct.sec >= 0 && ct.sec <= 59, ("invalid clocktime sec %d", + ct.sec)); + KASSERT(ct.min >= 0 && ct.min <= 59, ("invalid clocktime min %d", + ct.min)); + KASSERT(ct.hour >= 0 && ct.hour <= 23, ("invalid clocktime hour %d", + ct.hour)); + KASSERT(ct.dow >= 0 && ct.dow <= 6, ("invalid clocktime wday %d", + ct.dow)); + KASSERT(ct.day >= 1 && ct.day <= 31, ("invalid clocktime mday %d", + ct.day)); + KASSERT(ct.mon >= 1 && ct.mon <= 12, ("invalid clocktime month %d", + ct.mon)); + KASSERT(ct.year >= POSIX_BASE_YEAR, ("invalid clocktime year %d", + ct.year)); + + rtc = &vrtc->rtcdev; + rtc->sec = rtcset(rtc, ct.sec); + rtc->min = rtcset(rtc, ct.min); + + if (rtc->reg_b & RTCSB_24HR) { + hour = ct.hour; + } else { + /* + * Convert to the 12-hour format. 
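
Before the switch that follows, note the shape of the mapping: hours 0 and 12 both present as 12, and PM is flagged in the high bit. An illustrative helper (the device model applies the PM bit after BCD conversion; this sketch folds it into the binary value for brevity):

#include <stdint.h>

/* Map a 0-23 hour to the RTC 12-hour form; PM sets bit 7. */
static uint8_t
hour24_to_rtc12(int hour24)
{
	int hour12 = hour24 % 12;

	if (hour12 == 0)
		hour12 = 12;	/* 0 and 12 both read as 12 */
	return ((uint8_t)(hour12 | (hour24 >= 12 ? 0x80 : 0x00)));
}
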
+ */ + switch (ct.hour) { + case 0: /* 12 AM */ + case 12: /* 12 PM */ + hour = 12; + break; + default: + /* + * The remaining 'ct.hour' values are interpreted as: + * [1 - 11] -> 1 - 11 AM + * [13 - 23] -> 1 - 11 PM + */ + hour = ct.hour % 12; + break; + } + } + + rtc->hour = rtcset(rtc, hour); + + if ((rtc->reg_b & RTCSB_24HR) == 0 && ct.hour >= 12) + rtc->hour |= 0x80; /* set MSB to indicate PM */ + + rtc->day_of_week = rtcset(rtc, ct.dow + 1); + rtc->day_of_month = rtcset(rtc, ct.day); + rtc->month = rtcset(rtc, ct.mon); + rtc->year = rtcset(rtc, ct.year % 100); + rtc->century = rtcset(rtc, ct.year / 100); +} + +static int +rtcget(struct rtcdev *rtc, int val, int *retval) +{ + uint8_t upper, lower; + + if (rtc->reg_b & RTCSB_BIN) { + *retval = val; + return (0); + } + + lower = val & 0xf; + upper = (val >> 4) & 0xf; + + if (lower > 9 || upper > 9) + return (-1); + + *retval = upper * 10 + lower; + return (0); +} + +static time_t +rtc_to_secs(struct vrtc *vrtc) +{ + struct clocktime ct; + struct timespec ts; + struct rtcdev *rtc; +#ifdef __FreeBSD__ + struct vm *vm; +#endif + int century, error, hour, pm, year; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + +#ifdef __FreeBSD__ + vm = vrtc->vm; +#endif + rtc = &vrtc->rtcdev; + + bzero(&ct, sizeof(struct clocktime)); + + error = rtcget(rtc, rtc->sec, &ct.sec); + if (error || ct.sec < 0 || ct.sec > 59) { +#ifdef __FreeBSD__ + VM_CTR2(vm, "Invalid RTC sec %#x/%d", rtc->sec, ct.sec); +#endif + goto fail; + } + + error = rtcget(rtc, rtc->min, &ct.min); + if (error || ct.min < 0 || ct.min > 59) { +#ifdef __FreeBSD__ + VM_CTR2(vm, "Invalid RTC min %#x/%d", rtc->min, ct.min); +#endif + goto fail; + } + + pm = 0; + hour = rtc->hour; + if ((rtc->reg_b & RTCSB_24HR) == 0) { + if (hour & 0x80) { + hour &= ~0x80; + pm = 1; + } + } + error = rtcget(rtc, hour, &ct.hour); + if ((rtc->reg_b & RTCSB_24HR) == 0) { + if (ct.hour >= 1 && ct.hour <= 12) { + /* + * Convert from 12-hour format to internal 24-hour + * representation as follows: + * + * 12-hour format ct.hour + * 12 AM 0 + * 1 - 11 AM 1 - 11 + * 12 PM 12 + * 1 - 11 PM 13 - 23 + */ + if (ct.hour == 12) + ct.hour = 0; + if (pm) + ct.hour += 12; + } else { +#ifdef __FreeBSD__ + VM_CTR2(vm, "Invalid RTC 12-hour format %#x/%d", + rtc->hour, ct.hour); +#endif + goto fail; + } + } + + if (error || ct.hour < 0 || ct.hour > 23) { +#ifdef __FreeBSD__ + VM_CTR2(vm, "Invalid RTC hour %#x/%d", rtc->hour, ct.hour); +#endif + goto fail; + } + + /* + * Ignore 'rtc->dow' because some guests like Linux don't bother + * setting it at all while others like OpenBSD/i386 set it incorrectly. + * + * clock_ct_to_ts() does not depend on 'ct.dow' anyways so ignore it. 
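
The 12-hour decode earlier in rtc_to_secs() inverts that mapping. An equivalent stand-alone sketch, operating on the binary hour after BCD decoding:

#include <stdint.h>

/* Map a 12-hour value with the PM flag in bit 7 back to 0-23. */
static int
rtc12_to_hour24(uint8_t val)
{
	int pm = (val & 0x80) != 0;
	int hour = val & 0x7f;		/* expected range 1-12 */

	if (hour < 1 || hour > 12)
		return (-1);
	if (hour == 12)
		hour = 0;		/* 12 AM is midnight */
	return (pm ? hour + 12 : hour);
}
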
+ */ + ct.dow = -1; + + error = rtcget(rtc, rtc->day_of_month, &ct.day); + if (error || ct.day < 1 || ct.day > 31) { +#ifdef __FreeBSD__ + VM_CTR2(vm, "Invalid RTC mday %#x/%d", rtc->day_of_month, + ct.day); +#endif + goto fail; + } + + error = rtcget(rtc, rtc->month, &ct.mon); + if (error || ct.mon < 1 || ct.mon > 12) { +#ifdef __FreeBSD__ + VM_CTR2(vm, "Invalid RTC month %#x/%d", rtc->month, ct.mon); +#endif + goto fail; + } + + error = rtcget(rtc, rtc->year, &year); + if (error || year < 0 || year > 99) { +#ifdef __FreeBSD__ + VM_CTR2(vm, "Invalid RTC year %#x/%d", rtc->year, year); +#endif + goto fail; + } + + error = rtcget(rtc, rtc->century, ¢ury); + ct.year = century * 100 + year; + if (error || ct.year < POSIX_BASE_YEAR) { +#ifdef __FreeBSD__ + VM_CTR2(vm, "Invalid RTC century %#x/%d", rtc->century, + ct.year); +#endif + goto fail; + } + + error = clock_ct_to_ts(&ct, &ts); + if (error || ts.tv_sec < 0) { +#ifdef __FreeBSD__ + VM_CTR3(vm, "Invalid RTC clocktime.date %04d-%02d-%02d", + ct.year, ct.mon, ct.day); + VM_CTR3(vm, "Invalid RTC clocktime.time %02d:%02d:%02d", + ct.hour, ct.min, ct.sec); +#endif + goto fail; + } + return (ts.tv_sec); /* success */ +fail: + /* + * Stop updating the RTC if the date/time fields programmed by + * the guest are invalid. + */ +#ifdef __FreeBSD__ + VM_CTR0(vrtc->vm, "Invalid RTC date/time programming detected"); +#endif + return (VRTC_BROKEN_TIME); +} + +static int +vrtc_time_update(struct vrtc *vrtc, time_t newtime, sbintime_t newbase) +{ + struct rtcdev *rtc; +#ifdef __FreeBSD__ + sbintime_t oldbase; +#endif + time_t oldtime; + uint8_t alarm_sec, alarm_min, alarm_hour; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + rtc = &vrtc->rtcdev; + alarm_sec = rtc->alarm_sec; + alarm_min = rtc->alarm_min; + alarm_hour = rtc->alarm_hour; + + oldtime = vrtc->base_rtctime; + VM_CTR2(vrtc->vm, "Updating RTC secs from %#lx to %#lx", + oldtime, newtime); + +#ifdef __FreeBSD__ + oldbase = vrtc->base_uptime; + VM_CTR2(vrtc->vm, "Updating RTC base uptime from %#lx to %#lx", + oldbase, newbase); +#endif + vrtc->base_uptime = newbase; + + if (newtime == oldtime) + return (0); + + /* + * If 'newtime' indicates that RTC updates are disabled then just + * record that and return. There is no need to do alarm interrupt + * processing in this case. + */ + if (newtime == VRTC_BROKEN_TIME) { + vrtc->base_rtctime = VRTC_BROKEN_TIME; + return (0); + } + + /* + * Return an error if RTC updates are halted by the guest. + */ + if (rtc_halted(vrtc)) { + VM_CTR0(vrtc->vm, "RTC update halted by guest"); + return (EBUSY); + } + + do { + /* + * If the alarm interrupt is enabled and 'oldtime' is valid + * then visit all the seconds between 'oldtime' and 'newtime' + * to check for the alarm condition. + * + * Otherwise move the RTC time forward directly to 'newtime'. + */ + if (aintr_enabled(vrtc) && oldtime != VRTC_BROKEN_TIME) + vrtc->base_rtctime++; + else + vrtc->base_rtctime = newtime; + + if (aintr_enabled(vrtc)) { + /* + * Update the RTC date/time fields before checking + * if the alarm conditions are satisfied. 
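
The comparison coming up relies on the MC146818 "don't care" convention: an alarm byte with both top bits set (0xC0 or above) matches any value. As a one-function sketch:

#include <stdbool.h>
#include <stdint.h>

/* One alarm field matches when it is a wildcard or equals the current value. */
static bool
alarm_field_matches(uint8_t alarm, uint8_t cur)
{
	return (alarm >= 0xc0 || alarm == cur);
}
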
+ */ + secs_to_rtc(vrtc->base_rtctime, vrtc, 0); + + if ((alarm_sec >= 0xC0 || alarm_sec == rtc->sec) && + (alarm_min >= 0xC0 || alarm_min == rtc->min) && + (alarm_hour >= 0xC0 || alarm_hour == rtc->hour)) { + vrtc_set_reg_c(vrtc, rtc->reg_c | RTCIR_ALARM); + } + } + } while (vrtc->base_rtctime != newtime); + + if (uintr_enabled(vrtc)) + vrtc_set_reg_c(vrtc, rtc->reg_c | RTCIR_UPDATE); + + return (0); +} + +static sbintime_t +vrtc_freq(struct vrtc *vrtc) +{ + int ratesel; + + static sbintime_t pf[16] = { + 0, + SBT_1S / 256, + SBT_1S / 128, + SBT_1S / 8192, + SBT_1S / 4096, + SBT_1S / 2048, + SBT_1S / 1024, + SBT_1S / 512, + SBT_1S / 256, + SBT_1S / 128, + SBT_1S / 64, + SBT_1S / 32, + SBT_1S / 16, + SBT_1S / 8, + SBT_1S / 4, + SBT_1S / 2, + }; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + /* + * If both periodic and alarm interrupts are enabled then use the + * periodic frequency to drive the callout. The minimum periodic + * frequency (2 Hz) is higher than the alarm frequency (1 Hz) so + * piggyback the alarm on top of it. The same argument applies to + * the update interrupt. + */ + if (pintr_enabled(vrtc) && divider_enabled(vrtc->rtcdev.reg_a)) { + ratesel = vrtc->rtcdev.reg_a & 0xf; + return (pf[ratesel]); + } else if (aintr_enabled(vrtc) && update_enabled(vrtc)) { + return (SBT_1S); + } else if (uintr_enabled(vrtc) && update_enabled(vrtc)) { + return (SBT_1S); + } else { + return (0); + } +} + +static void +vrtc_callout_reset(struct vrtc *vrtc, sbintime_t freqsbt) +{ + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + if (freqsbt == 0) { + if (callout_active(&vrtc->callout)) { + VM_CTR0(vrtc->vm, "RTC callout stopped"); + callout_stop(&vrtc->callout); + } + return; + } + VM_CTR1(vrtc->vm, "RTC callout frequency %d hz", SBT_1S / freqsbt); + callout_reset_sbt(&vrtc->callout, freqsbt, 0, vrtc_callout_handler, + vrtc, 0); +} + +static void +vrtc_callout_handler(void *arg) +{ + struct vrtc *vrtc = arg; + sbintime_t freqsbt, basetime; + time_t rtctime; + int error; + + VM_CTR0(vrtc->vm, "vrtc callout fired"); + + VRTC_LOCK(vrtc); + if (callout_pending(&vrtc->callout)) /* callout was reset */ + goto done; + + if (!callout_active(&vrtc->callout)) /* callout was stopped */ + goto done; + + callout_deactivate(&vrtc->callout); + + KASSERT((vrtc->rtcdev.reg_b & RTCSB_ALL_INTRS) != 0, + ("gratuitous vrtc callout")); + + if (pintr_enabled(vrtc)) + vrtc_set_reg_c(vrtc, vrtc->rtcdev.reg_c | RTCIR_PERIOD); + + if (aintr_enabled(vrtc) || uintr_enabled(vrtc)) { + rtctime = vrtc_curtime(vrtc, &basetime); + error = vrtc_time_update(vrtc, rtctime, basetime); + KASSERT(error == 0, ("%s: vrtc_time_update error %d", + __func__, error)); + } + + freqsbt = vrtc_freq(vrtc); + KASSERT(freqsbt != 0, ("%s: vrtc frequency cannot be zero", __func__)); + vrtc_callout_reset(vrtc, freqsbt); +done: + VRTC_UNLOCK(vrtc); +} + +static __inline void +vrtc_callout_check(struct vrtc *vrtc, sbintime_t freq) +{ + int active; + + active = callout_active(&vrtc->callout) ? 1 : 0; + KASSERT((freq == 0 && !active) || (freq != 0 && active), + ("vrtc callout %s with frequency %#lx", + active ? 
"active" : "inactive", freq)); +} + +static void +vrtc_set_reg_c(struct vrtc *vrtc, uint8_t newval) +{ + struct rtcdev *rtc; + int oldirqf, newirqf; + uint8_t oldval, changed; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + rtc = &vrtc->rtcdev; + newval &= RTCIR_ALARM | RTCIR_PERIOD | RTCIR_UPDATE; + + oldirqf = rtc->reg_c & RTCIR_INT; + if ((aintr_enabled(vrtc) && (newval & RTCIR_ALARM) != 0) || + (pintr_enabled(vrtc) && (newval & RTCIR_PERIOD) != 0) || + (uintr_enabled(vrtc) && (newval & RTCIR_UPDATE) != 0)) { + newirqf = RTCIR_INT; + } else { + newirqf = 0; + } + + oldval = rtc->reg_c; + rtc->reg_c = newirqf | newval; + changed = oldval ^ rtc->reg_c; + if (changed) { + VM_CTR2(vrtc->vm, "RTC reg_c changed from %#x to %#x", + oldval, rtc->reg_c); + } + + if (!oldirqf && newirqf) { + VM_CTR1(vrtc->vm, "RTC irq %d asserted", RTC_IRQ); + vatpic_pulse_irq(vrtc->vm, RTC_IRQ); + vioapic_pulse_irq(vrtc->vm, RTC_IRQ); + } else if (oldirqf && !newirqf) { + VM_CTR1(vrtc->vm, "RTC irq %d deasserted", RTC_IRQ); + } +} + +static int +vrtc_set_reg_b(struct vrtc *vrtc, uint8_t newval) +{ + struct rtcdev *rtc; + sbintime_t oldfreq, newfreq, basetime; + time_t curtime, rtctime; + int error; + uint8_t oldval, changed; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + rtc = &vrtc->rtcdev; + oldval = rtc->reg_b; + oldfreq = vrtc_freq(vrtc); + + rtc->reg_b = newval; + changed = oldval ^ newval; + if (changed) { + VM_CTR2(vrtc->vm, "RTC reg_b changed from %#x to %#x", + oldval, newval); + } + + if (changed & RTCSB_HALT) { + if ((newval & RTCSB_HALT) == 0) { + rtctime = rtc_to_secs(vrtc); + basetime = sbinuptime(); + if (rtctime == VRTC_BROKEN_TIME) { + if (rtc_flag_broken_time) + return (-1); + } + } else { + curtime = vrtc_curtime(vrtc, &basetime); + KASSERT(curtime == vrtc->base_rtctime, ("%s: mismatch " + "between vrtc basetime (%#lx) and curtime (%#lx)", + __func__, vrtc->base_rtctime, curtime)); + + /* + * Force a refresh of the RTC date/time fields so + * they reflect the time right before the guest set + * the HALT bit. + */ + secs_to_rtc(curtime, vrtc, 1); + + /* + * Updates are halted so mark 'base_rtctime' to denote + * that the RTC date/time is in flux. + */ + rtctime = VRTC_BROKEN_TIME; + rtc->reg_b &= ~RTCSB_UINTR; + } + error = vrtc_time_update(vrtc, rtctime, basetime); + KASSERT(error == 0, ("vrtc_time_update error %d", error)); + } + + /* + * Side effect of changes to the interrupt enable bits. + */ + if (changed & RTCSB_ALL_INTRS) + vrtc_set_reg_c(vrtc, vrtc->rtcdev.reg_c); + + /* + * Change the callout frequency if it has changed. + */ + newfreq = vrtc_freq(vrtc); + if (newfreq != oldfreq) + vrtc_callout_reset(vrtc, newfreq); + else + vrtc_callout_check(vrtc, newfreq); + + /* + * The side effect of bits that control the RTC date/time format + * is handled lazily when those fields are actually read. + */ + return (0); +} + +static void +vrtc_set_reg_a(struct vrtc *vrtc, uint8_t newval) +{ + sbintime_t oldfreq, newfreq; + uint8_t oldval, changed; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + newval &= ~RTCSA_TUP; + oldval = vrtc->rtcdev.reg_a; + oldfreq = vrtc_freq(vrtc); + + if (divider_enabled(oldval) && !divider_enabled(newval)) { + VM_CTR2(vrtc->vm, "RTC divider held in reset at %#lx/%#lx", + vrtc->base_rtctime, vrtc->base_uptime); + } else if (!divider_enabled(oldval) && divider_enabled(newval)) { + /* + * If the dividers are coming out of reset then update + * 'base_uptime' before this happens. 
This is done to + * maintain the illusion that the RTC date/time was frozen + * while the dividers were disabled. + */ + vrtc->base_uptime = sbinuptime(); + VM_CTR2(vrtc->vm, "RTC divider out of reset at %#lx/%#lx", + vrtc->base_rtctime, vrtc->base_uptime); + } else { + /* NOTHING */ + } + + vrtc->rtcdev.reg_a = newval; + changed = oldval ^ newval; + if (changed) { + VM_CTR2(vrtc->vm, "RTC reg_a changed from %#x to %#x", + oldval, newval); + } + + /* + * Side effect of changes to rate select and divider enable bits. + */ + newfreq = vrtc_freq(vrtc); + if (newfreq != oldfreq) + vrtc_callout_reset(vrtc, newfreq); + else + vrtc_callout_check(vrtc, newfreq); +} + +int +vrtc_set_time(struct vm *vm, time_t secs) +{ + struct vrtc *vrtc; + int error; + + vrtc = vm_rtc(vm); + VRTC_LOCK(vrtc); + error = vrtc_time_update(vrtc, secs, sbinuptime()); + VRTC_UNLOCK(vrtc); + + if (error) { + VM_CTR2(vrtc->vm, "Error %d setting RTC time to %#lx", error, + secs); + } else { + VM_CTR1(vrtc->vm, "RTC time set to %#lx", secs); + } + + return (error); +} + +time_t +vrtc_get_time(struct vm *vm) +{ + struct vrtc *vrtc; + sbintime_t basetime; + time_t t; + + vrtc = vm_rtc(vm); + VRTC_LOCK(vrtc); + t = vrtc_curtime(vrtc, &basetime); + VRTC_UNLOCK(vrtc); + + return (t); +} + +int +vrtc_nvram_write(struct vm *vm, int offset, uint8_t value) +{ + struct vrtc *vrtc; + uint8_t *ptr; + + vrtc = vm_rtc(vm); + + /* + * Don't allow writes to RTC control registers or the date/time fields. + */ + if (offset < offsetof(struct rtcdev, nvram[0]) || + offset == RTC_CENTURY || offset >= sizeof(struct rtcdev)) { + VM_CTR1(vrtc->vm, "RTC nvram write to invalid offset %d", + offset); + return (EINVAL); + } + + VRTC_LOCK(vrtc); + ptr = (uint8_t *)(&vrtc->rtcdev); + ptr[offset] = value; + VM_CTR2(vrtc->vm, "RTC nvram write %#x to offset %#x", value, offset); + VRTC_UNLOCK(vrtc); + + return (0); +} + +int +vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval) +{ + struct vrtc *vrtc; + sbintime_t basetime; + time_t curtime; + uint8_t *ptr; + + /* + * Allow all offsets in the RTC to be read. + */ + if (offset < 0 || offset >= sizeof(struct rtcdev)) + return (EINVAL); + + vrtc = vm_rtc(vm); + VRTC_LOCK(vrtc); + + /* + * Update RTC date/time fields if necessary. + */ + if (offset < 10 || offset == RTC_CENTURY) { + curtime = vrtc_curtime(vrtc, &basetime); + secs_to_rtc(curtime, vrtc, 0); + } + + ptr = (uint8_t *)(&vrtc->rtcdev); + *retval = ptr[offset]; + + VRTC_UNLOCK(vrtc); + return (0); +} + +int +vrtc_addr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *val) +{ + struct vrtc *vrtc; + + vrtc = vm_rtc(vm); + + if (bytes != 1) + return (-1); + + if (in) { + *val = 0xff; + return (0); + } + + VRTC_LOCK(vrtc); + vrtc->addr = *val & 0x7f; + VRTC_UNLOCK(vrtc); + + return (0); +} + +int +vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *val) +{ + struct vrtc *vrtc; + struct rtcdev *rtc; + sbintime_t basetime; + time_t curtime; + int error, offset; + + vrtc = vm_rtc(vm); + rtc = &vrtc->rtcdev; + + if (bytes != 1) + return (-1); + + VRTC_LOCK(vrtc); + offset = vrtc->addr; + if (offset >= sizeof(struct rtcdev)) { + VRTC_UNLOCK(vrtc); + return (-1); + } + + error = 0; + curtime = vrtc_curtime(vrtc, &basetime); + vrtc_time_update(vrtc, curtime, basetime); + + /* + * Update RTC date/time fields if necessary. + * + * This is not just for reads of the RTC. The side-effect of writing + * the century byte requires other RTC date/time fields (e.g. sec) + * to be updated here. 
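
The vrtc_addr_handler()/vrtc_data_handler() pair here implements the classic index/data protocol on ports 0x70/0x71. A sketch of the guest-side access pattern being emulated; the fake port array and guest_outb()/guest_inb() helpers are stand-ins so the example is self-contained:

#include <stdint.h>

#define RTC_ADDR_PORT	0x70	/* emulated by vrtc_addr_handler() */
#define RTC_DATA_PORT	0x71	/* emulated by vrtc_data_handler() */

/* Stand-in port space; real guests execute in/out instructions. */
static uint8_t fake_ports[0x10000];

static void
guest_outb(uint16_t port, uint8_t val)
{
	fake_ports[port] = val;
}

static uint8_t
guest_inb(uint16_t port)
{
	return (fake_ports[port]);
}

/* Select a register via the index port, then read it via the data port. */
static uint8_t
rtc_read_reg(uint8_t reg)
{
	guest_outb(RTC_ADDR_PORT, reg & 0x7f);	/* bit 7 is the NMI mask */
	return (guest_inb(RTC_DATA_PORT));
}
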
+ */ + if (offset < 10 || offset == RTC_CENTURY) + secs_to_rtc(curtime, vrtc, 0); + + if (in) { + if (offset == 12) { + /* + * XXX + * reg_c interrupt flags are updated only if the + * corresponding interrupt enable bit in reg_b is set. + */ + *val = vrtc->rtcdev.reg_c; + vrtc_set_reg_c(vrtc, 0); + } else { + *val = *((uint8_t *)rtc + offset); + } + VCPU_CTR2(vm, vcpuid, "Read value %#x from RTC offset %#x", + *val, offset); + } else { + switch (offset) { + case 10: + VCPU_CTR1(vm, vcpuid, "RTC reg_a set to %#x", *val); + vrtc_set_reg_a(vrtc, *val); + break; + case 11: + VCPU_CTR1(vm, vcpuid, "RTC reg_b set to %#x", *val); + error = vrtc_set_reg_b(vrtc, *val); + break; + case 12: + VCPU_CTR1(vm, vcpuid, "RTC reg_c set to %#x (ignored)", + *val); + break; + case 13: + VCPU_CTR1(vm, vcpuid, "RTC reg_d set to %#x (ignored)", + *val); + break; + case 0: + /* + * High order bit of 'seconds' is readonly. + */ + *val &= 0x7f; + /* FALLTHRU */ + default: + VCPU_CTR2(vm, vcpuid, "RTC offset %#x set to %#x", + offset, *val); + *((uint8_t *)rtc + offset) = *val; + break; + } + + /* + * XXX some guests (e.g. OpenBSD) write the century byte + * outside of RTCSB_HALT so re-calculate the RTC date/time. + */ + if (offset == RTC_CENTURY && !rtc_halted(vrtc)) { + curtime = rtc_to_secs(vrtc); + error = vrtc_time_update(vrtc, curtime, sbinuptime()); + KASSERT(!error, ("vrtc_time_update error %d", error)); + if (curtime == VRTC_BROKEN_TIME && rtc_flag_broken_time) + error = -1; + } + } + VRTC_UNLOCK(vrtc); + return (error); +} + +void +vrtc_reset(struct vrtc *vrtc) +{ + struct rtcdev *rtc; + + VRTC_LOCK(vrtc); + + rtc = &vrtc->rtcdev; + vrtc_set_reg_b(vrtc, rtc->reg_b & ~(RTCSB_ALL_INTRS | RTCSB_SQWE)); + vrtc_set_reg_c(vrtc, 0); + KASSERT(!callout_active(&vrtc->callout), ("rtc callout still active")); + + VRTC_UNLOCK(vrtc); +} + +struct vrtc * +vrtc_init(struct vm *vm) +{ + struct vrtc *vrtc; + struct rtcdev *rtc; + time_t curtime; + + vrtc = malloc(sizeof(struct vrtc), M_VRTC, M_WAITOK | M_ZERO); + vrtc->vm = vm; + mtx_init(&vrtc->mtx, "vrtc lock", NULL, MTX_DEF); + callout_init(&vrtc->callout, 1); + + /* Allow dividers to keep time but disable everything else */ + rtc = &vrtc->rtcdev; + rtc->reg_a = 0x20; + rtc->reg_b = RTCSB_24HR; + rtc->reg_c = 0; + rtc->reg_d = RTCSD_PWR; + + /* Reset the index register to a safe value. */ + vrtc->addr = RTC_STATUSD; + + /* + * Initialize RTC time to 00:00:00 Jan 1, 1970. + */ + curtime = 0; + + VRTC_LOCK(vrtc); + vrtc->base_rtctime = VRTC_BROKEN_TIME; + vrtc_time_update(vrtc, curtime, sbinuptime()); + secs_to_rtc(curtime, vrtc, 0); + VRTC_UNLOCK(vrtc); + + return (vrtc); +} + +void +vrtc_cleanup(struct vrtc *vrtc) +{ + + callout_drain(&vrtc->callout); + free(vrtc, M_VRTC); +} + +#ifndef __FreeBSD__ +void +vrtc_localize_resources(struct vrtc *vrtc) +{ + vmm_glue_callout_localize(&vrtc->callout); +} +#endif /* __FreeBSD */ diff --git a/usr/src/uts/i86pc/io/vmm/io/vrtc.h b/usr/src/uts/i86pc/io/vmm/io/vrtc.h new file mode 100644 index 0000000000..13abbedeb9 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vrtc.h @@ -0,0 +1,60 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#ifndef _VRTC_H_ +#define _VRTC_H_ + +#include + +struct vrtc; + +struct vrtc *vrtc_init(struct vm *vm); +void vrtc_cleanup(struct vrtc *vrtc); +void vrtc_reset(struct vrtc *vrtc); + +time_t vrtc_get_time(struct vm *vm); +int vrtc_set_time(struct vm *vm, time_t secs); +int vrtc_nvram_write(struct vm *vm, int offset, uint8_t value); +int vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval); + +int vrtc_addr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *val); +int vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *val); + +#ifndef __FreeBSD__ +void vrtc_localize_resources(struct vrtc *); +#endif + +#endif diff --git a/usr/src/uts/i86pc/io/vmm/offsets.in b/usr/src/uts/i86pc/io/vmm/offsets.in deleted file mode 100644 index 4b1fe1d6b6..0000000000 --- a/usr/src/uts/i86pc/io/vmm/offsets.in +++ /dev/null @@ -1,72 +0,0 @@ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - */ - -/* - * Copyright 2014 Pluribus Networks Inc. 
- */ - -#include -#include -#include -#include - -#include - -#include -#include "intel/vmx_cpufunc.h" -#include "intel/vmx.h" - -vmxctx - tmpstktop VMXCTX_TMPSTKTOP - guest_rdi VMXCTX_GUEST_RDI - guest_rsi VMXCTX_GUEST_RSI - guest_rdx VMXCTX_GUEST_RDX - guest_rcx VMXCTX_GUEST_RCX - guest_r8 VMXCTX_GUEST_R8 - guest_r9 VMXCTX_GUEST_R9 - guest_rax VMXCTX_GUEST_RAX - guest_rbx VMXCTX_GUEST_RBX - guest_rbp VMXCTX_GUEST_RBP - guest_r10 VMXCTX_GUEST_R10 - guest_r11 VMXCTX_GUEST_R11 - guest_r12 VMXCTX_GUEST_R12 - guest_r13 VMXCTX_GUEST_R13 - guest_r14 VMXCTX_GUEST_R14 - guest_r15 VMXCTX_GUEST_R15 - guest_cr2 VMXCTX_GUEST_CR2 - host_r15 VMXCTX_HOST_R15 - host_r14 VMXCTX_HOST_R14 - host_r13 VMXCTX_HOST_R13 - host_r12 VMXCTX_HOST_R12 - host_rbp VMXCTX_HOST_RBP - host_rsp VMXCTX_HOST_RSP - host_rbx VMXCTX_HOST_RBX - host_rip VMXCTX_HOST_RIP - launch_error VMXCTX_LAUNCH_ERROR - -vmx VMX_SIZE - -\#define VM_SUCCESS 0 -\#define VM_FAIL_INVALID 1 -\#define VM_FAIL_VALID 2 - -\#define VMX_RETURN_DIRECT 0 -\#define VMX_RETURN_LONGJMP 1 -\#define VMX_RETURN_VMRESUME 2 -\#define VMX_RETURN_VMLAUNCH 3 -\#define VMX_RETURN_AST 4 - -cpu - cpu_thread - -_kthread - t_lwp - _tu._ts._t_astflag T_ASTFLAG diff --git a/usr/src/uts/i86pc/io/vmm/vm/pmap.h b/usr/src/uts/i86pc/io/vmm/vm/pmap.h new file mode 100644 index 0000000000..512fc4acee --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vm/pmap.h @@ -0,0 +1,27 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _PMAP_VM_ +#define _PMAP_VM_ + +#include +#include "vm_glue.h" + +void pmap_invalidate_cache(void); +void pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num); +int pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype); +long pmap_wired_count(pmap_t pmap); + +#endif /* _PMAP_VM_ */ diff --git a/usr/src/uts/i86pc/io/vmm/vm/vm_extern.h b/usr/src/uts/i86pc/io/vmm/vm/vm_extern.h new file mode 100644 index 0000000000..92a959960a --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vm/vm_extern.h @@ -0,0 +1,35 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. 
+ */ + +#ifndef _VM_EXTERN_H_ +#define _VM_EXTERN_H_ + +#include +#include + +struct vmspace; +struct pmap; + +typedef int (*pmap_pinit_t)(struct pmap *pmap); + +struct vmspace *vmspace_alloc(vm_offset_t, vm_offset_t, pmap_pinit_t); +void vmspace_free(struct vmspace *); + +int vm_fault(vm_map_t, vm_offset_t, vm_prot_t, int); +int vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len, + vm_prot_t prot, vm_page_t *ma, int max_count); + + +#endif /* _VM_EXTERN_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/vm/vm_glue.h b/usr/src/uts/i86pc/io/vmm/vm/vm_glue.h new file mode 100644 index 0000000000..600872c321 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vm/vm_glue.h @@ -0,0 +1,99 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#ifndef _VM_GLUE_ +#define _VM_GLUE_ + +#include +#include +#include + +struct vmspace; +struct vm_map; +struct pmap; +struct vm_object; +struct vmm_pt_ops; + +struct vm_map { + struct vmspace *vmm_space; +}; + +struct pmap { + void *pm_pml4; + cpuset_t pm_active; + long pm_eptgen; + + /* Implementation private */ + enum pmap_type pm_type; + struct vmm_pt_ops *pm_ops; + void *pm_impl; +}; + +struct vmspace { + struct vm_map vm_map; + + /* Implementation private */ + kmutex_t vms_lock; + boolean_t vms_map_changing; + struct pmap vms_pmap; + uintptr_t vms_size; /* fixed after creation */ + + list_t vms_maplist; +}; + +typedef pfn_t (*vm_pager_fn_t)(vm_object_t, uintptr_t, pfn_t *, uint_t *); + +struct vm_object { + uint_t vmo_refcnt; /* manipulated with atomic ops */ + + /* This group of fields are fixed at creation time */ + objtype_t vmo_type; + size_t vmo_size; + vm_pager_fn_t vmo_pager; + void *vmo_data; + + kmutex_t vmo_lock; /* protects fields below */ + vm_memattr_t vmo_attr; +}; + +struct vm_page { + kmutex_t vmp_lock; + pfn_t vmp_pfn; + struct vm_object *vmp_obj_held; +}; + +/* Illumos-specific functions for setup and operation */ +int vm_segmap_obj(struct vmspace *, vm_object_t, struct as *, caddr_t *, + uint_t, uint_t, uint_t); +int vm_segmap_space(struct vmspace *, off_t, struct as *, caddr_t *, off_t, + uint_t, uint_t, uint_t); +void *vmspace_find_kva(struct vmspace *, uintptr_t, size_t); +void vmm_arena_init(void); +void vmm_arena_fini(void); + +struct vmm_pt_ops { + void * (*vpo_init)(uint64_t *); + void (*vpo_free)(void *); + uint64_t (*vpo_wired_cnt)(void *); + int (*vpo_is_wired)(void *, uint64_t, uint_t *); + int (*vpo_map)(void *, uint64_t, pfn_t, uint_t, uint_t, uint8_t); + uint64_t (*vpo_unmap)(void *, uint64_t, uint64_t); +}; + +extern struct vmm_pt_ops ept_ops; +extern struct vmm_pt_ops rvi_ops; + + +#endif /* _VM_GLUE_ */ diff --git a/usr/src/uts/i86pc/io/vmm/vm/vm_map.h b/usr/src/uts/i86pc/io/vmm/vm/vm_map.h new file mode 100644 index 0000000000..70826ac8f1 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vm/vm_map.h @@ -0,0 +1,63 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. 
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _VM_MAP_ +#define _VM_MAP_ + +#include "vm_glue.h" + +/* + * vm_map_wire and vm_map_unwire option flags + */ +#define VM_MAP_WIRE_SYSTEM 0 /* wiring in a kernel map */ +#define VM_MAP_WIRE_USER 1 /* wiring in a user map */ + +#define VM_MAP_WIRE_NOHOLES 0 /* region must not have holes */ +#define VM_MAP_WIRE_HOLESOK 2 /* region may have holes */ + +#define VM_MAP_WIRE_WRITE 4 /* Validate writable. */ + +/* + * The following "find_space" options are supported by vm_map_find(). + * + * For VMFS_ALIGNED_SPACE, the desired alignment is specified to + * the macro argument as log base 2 of the desired alignment. + */ +#define VMFS_NO_SPACE 0 /* don't find; use the given range */ +#define VMFS_ANY_SPACE 1 /* find range with any alignment */ +#define VMFS_OPTIMAL_SPACE 2 /* find range with optimal alignment */ +#define VMFS_SUPER_SPACE 3 /* find superpage-aligned range */ +#define VMFS_ALIGNED_SPACE(x) ((x) << 8) /* find range with fixed alignment */ + +/* + * vm_fault option flags + */ +#define VM_FAULT_NORMAL 0 /* Nothing special */ +#define VM_FAULT_WIRE 1 /* Wire the mapped page */ +#define VM_FAULT_DIRTY 2 /* Dirty the page; use w/VM_PROT_COPY */ + + + +pmap_t vmspace_pmap(struct vmspace *); + +int vm_map_find(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *, vm_size_t, + vm_offset_t, int, vm_prot_t, vm_prot_t, int); +int vm_map_remove(vm_map_t, vm_offset_t, vm_offset_t); +int vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags); + +long vmspace_resident_count(struct vmspace *vmspace); + + +#endif /* _VM_MAP_ */ diff --git a/usr/src/uts/i86pc/io/vmm/vm/vm_object.h b/usr/src/uts/i86pc/io/vmm/vm/vm_object.h new file mode 100644 index 0000000000..1f16fa9b83 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vm/vm_object.h @@ -0,0 +1,31 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _VM_OBJECT_ +#define _VM_OBJECT_ + +#include "vm_glue.h" + +vm_object_t vm_object_allocate(objtype_t, vm_pindex_t); +void vm_object_deallocate(vm_object_t); +void vm_object_reference(vm_object_t); +int vm_object_set_memattr(vm_object_t, vm_memattr_t); +void vm_object_clear(vm_object_t); + + +#define VM_OBJECT_WLOCK(vmo) mutex_enter(&(vmo)->vmo_lock) +#define VM_OBJECT_WUNLOCK(vmo) mutex_exit(&(vmo)->vmo_lock) + +#endif /* _VM_OBJECT_ */ diff --git a/usr/src/uts/i86pc/io/vmm/vm/vm_page.h b/usr/src/uts/i86pc/io/vmm/vm/vm_page.h new file mode 100644 index 0000000000..4559fe6d4c --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vm/vm_page.h @@ -0,0 +1,28 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + + +#ifndef _VM_PAGE_ +#define _VM_PAGE_ + +#include "vm_glue.h" + +void vm_page_lock(vm_page_t); +void vm_page_unhold(vm_page_t); +void vm_page_unlock(vm_page_t); + +#define VM_PAGE_TO_PHYS(page) (mmu_ptob((uintptr_t)((page)->vmp_pfn))) + +#endif /* _VM_PAGE_ */ diff --git a/usr/src/uts/i86pc/io/vmm/vm/vm_pager.h b/usr/src/uts/i86pc/io/vmm/vm/vm_pager.h new file mode 100644 index 0000000000..11aa344f61 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vm/vm_pager.h @@ -0,0 +1,23 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _VM_PAGER_ +#define _VM_PAGER_ + +vm_object_t vm_pager_allocate(objtype_t, void *, vm_ooffset_t, vm_prot_t, + vm_ooffset_t, void *); + + +#endif /* _VM_PAGER_ */ diff --git a/usr/src/uts/i86pc/io/vmm/vmm.c b/usr/src/uts/i86pc/io/vmm/vmm.c index 7081368f4a..6df094b50e 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm.c +++ b/usr/src/uts/i86pc/io/vmm/vmm.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/vmm.c 280929 2015-04-01 00:15:31Z tychon $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,10 +38,11 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2015 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. 
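
VM_PAGE_TO_PHYS() above is a pfn-to-byte-address conversion via mmu_ptob(). As an illustration, under the assumption of 4 KiB pages (PAGESHIFT of 12; the real macro takes the shift from the platform):

#include <stdint.h>

#define PAGESHIFT	12	/* 4 KiB pages (assumption for this sketch) */

static inline uint64_t
pfn_to_phys(uint64_t pfn)
{
	return (pfn << PAGESHIFT);
}
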
*/ #include -__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm.c 280929 2015-04-01 00:15:31Z tychon $"); +__FBSDID("$FreeBSD$"); #include #include @@ -51,16 +54,26 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm.c 280929 2015-04-01 00:15:31Z tychon #include #include #include +#include #include #include -#include #include #include - -#include +#include +#include +#include +#include +#include +#include + +#ifdef __FreeBSD__ +#include +#endif #include #include +#include +#include #include #include @@ -77,83 +90,132 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm.c 280929 2015-04-01 00:15:31Z tychon #include "vhpet.h" #include "vioapic.h" #include "vlapic.h" -#include "vmm_ipi.h" +#include "vpmtmr.h" +#include "vrtc.h" #include "vmm_stat.h" #include "vmm_lapic.h" -#ifdef __FreeBSD__ #include "io/ppt.h" #include "io/iommu.h" -#endif -struct vhpet; -struct vioapic; struct vlapic; +/* + * Initialization: + * (a) allocated when vcpu is created + * (i) initialized when vcpu is created and when it is reinitialized + * (o) initialized the first time the vcpu is created + * (x) initialized before use + */ struct vcpu { - int flags; - enum vcpu_state state; - struct mtx mtx; - int hostcpu; /* host cpuid this vcpu last ran on */ - struct vlapic *vlapic; - int vcpuid; - struct savefpu *guestfpu; /* guest fpu state */ - void *stats; - struct vm_exit exitinfo; + struct mtx mtx; /* (o) protects 'state' and 'hostcpu' */ + enum vcpu_state state; /* (o) vcpu state */ +#ifndef __FreeBSD__ + kcondvar_t vcpu_cv; /* (o) cpu waiter cv */ + kcondvar_t state_cv; /* (o) IDLE-transition cv */ +#endif /* __FreeBSD__ */ + int hostcpu; /* (o) vcpu's current host cpu */ +#ifndef __FreeBSD__ + int lastloccpu; /* (o) last host cpu localized to */ +#endif + u_int runblock; /* (i) block vcpu from run state */ + int reqidle; /* (i) request vcpu to idle */ + struct vlapic *vlapic; /* (i) APIC device model */ + enum x2apic_state x2apic_state; /* (i) APIC mode */ + uint64_t exitintinfo; /* (i) events pending at VM exit */ + int nmi_pending; /* (i) NMI pending */ + int extint_pending; /* (i) INTR pending */ + int exception_pending; /* (i) exception pending */ + int exc_vector; /* (x) exception collateral */ + int exc_errcode_valid; + uint32_t exc_errcode; + struct savefpu *guestfpu; /* (a,i) guest fpu state */ + uint64_t guest_xcr0; /* (i) guest %xcr0 register */ + void *stats; /* (a,i) statistics */ + struct vm_exit exitinfo; /* (x) exit reason and collateral */ uint64_t nextrip; /* (x) next instruction to execute */ - enum x2apic_state x2apic_state; - uint64_t exitintinfo; - int nmi_pending; - int extint_pending; - struct vm_exception exception; - int exception_pending; +#ifndef __FreeBSD__ + uint64_t tsc_offset; /* (x) offset from host TSC */ +#endif }; +#define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx)) #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN) #define vcpu_lock(v) mtx_lock_spin(&((v)->mtx)) #define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx)) #define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED) -#define VM_MAX_MEMORY_SEGMENTS 8 - -struct vm { - void *cookie; /* processor-specific data */ - void *iommu; /* iommu-specific data */ - struct vcpu vcpu[VM_MAXCPU]; - struct vhpet *vhpet; - struct vioapic *vioapic; /* virtual ioapic */ - struct vatpic *vatpic; /* virtual atpic */ - struct vatpit *vatpit; /* virtual atpit */ - int num_mem_segs; - struct vm_memory_segment mem_segs[VM_MAX_MEMORY_SEGMENTS]; - char name[VM_MAX_NAMELEN]; +struct mem_seg { + size_t len; + bool sysmem; + struct 
vm_object *object; +}; +#ifdef __FreeBSD__ +#define VM_MAX_MEMSEGS 3 +#else +#define VM_MAX_MEMSEGS 4 +#endif - /* - * Set of active vcpus. - * An active vcpu is one that has been started implicitly (BSP) or - * explicitly (AP) by sending it a startup ipi. - */ - cpuset_t active_cpus; +struct mem_map { + vm_paddr_t gpa; + size_t len; + vm_ooffset_t segoff; + int segid; + int prot; + int flags; +}; +#define VM_MAX_MEMMAPS 4 - vm_rendezvous_func_t rendezvous_func; +/* + * Initialization: + * (o) initialized the first time the VM is created + * (i) initialized when VM is created and when it is reinitialized + * (x) initialized before use + */ +struct vm { + void *cookie; /* (i) cpu-specific data */ + void *iommu; /* (x) iommu-specific data */ + struct vhpet *vhpet; /* (i) virtual HPET */ + struct vioapic *vioapic; /* (i) virtual ioapic */ + struct vatpic *vatpic; /* (i) virtual atpic */ + struct vatpit *vatpit; /* (i) virtual atpit */ + struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */ + struct vrtc *vrtc; /* (o) virtual RTC */ + volatile cpuset_t active_cpus; /* (i) active vcpus */ + volatile cpuset_t debug_cpus; /* (i) vcpus stopped for debug */ + int suspend; /* (i) stop VM execution */ + volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ + volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */ + struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */ + struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */ + struct vmspace *vmspace; /* (o) guest's address space */ + char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */ + struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */ + /* The following describe the vm cpu topology */ + uint16_t sockets; /* (o) num of sockets */ + uint16_t cores; /* (o) num of cores/socket */ + uint16_t threads; /* (o) num of threads/core */ + uint16_t maxcpus; /* (o) max pluggable cpus */ +#ifndef __FreeBSD__ + list_t ioport_hooks; +#endif /* __FreeBSD__ */ }; static int vmm_initialized; static struct vmm_ops *ops; -#define VMM_INIT() (ops != NULL ? (*ops->init)() : 0) +#define VMM_INIT(num) (ops != NULL ? (*ops->init)(num) : 0) #define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0) +#define VMM_RESUME() (ops != NULL ? (*ops->resume)() : 0) -#define VMINIT(vm) (ops != NULL ? (*ops->vminit)(vm): NULL) -#define VMRUN(vmi, vcpu, rip) \ - (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip) : ENXIO) +#define VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL) +#define VMRUN(vmi, vcpu, rip, pmap, evinfo) \ + (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, evinfo) : ENXIO) #define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL) -#define VMMMAP_SET(vmi, gpa, hpa, len, attr, prot, spm) \ - (ops != NULL ? \ - (*ops->vmmmap_set)(vmi, gpa, hpa, len, attr, prot, spm) : \ - ENXIO) -#define VMMMAP_GET(vmi, gpa) \ - (ops != NULL ? (*ops->vmmmap_get)(vmi, gpa) : ENXIO) +#define VMSPACE_ALLOC(min, max) \ + (ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL) +#define VMSPACE_FREE(vmspace) \ + (ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO) #define VMGETREG(vmi, vcpu, num, retval) \ (ops != NULL ? 
(*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO) #define VMSETREG(vmi, vcpu, num, val) \ @@ -174,45 +236,134 @@ static struct vmm_ops *ops; #define fpu_start_emulating() load_cr0(rcr0() | CR0_TS) #define fpu_stop_emulating() clts() +SDT_PROVIDER_DEFINE(vmm); + static MALLOC_DEFINE(M_VM, "vm", "vm"); /* statistics */ static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); +SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL); + +/* + * Halt the guest if all vcpus are executing a HLT instruction with + * interrupts disabled. + */ +static int halt_detection_enabled = 1; +SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN, + &halt_detection_enabled, 0, + "Halt VM if all vcpus execute HLT with interrupts disabled"); + static int vmm_ipinum; SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0, "IPI vector used for vcpu notifications"); +static int trace_guest_exceptions; +SYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN, + &trace_guest_exceptions, 0, + "Trap into hypervisor on all guest exceptions and reflect them back"); + +static void vm_free_memmap(struct vm *vm, int ident); +static bool sysmem_mapping(struct vm *vm, struct mem_map *mm); +static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr); + +#ifndef __FreeBSD__ +static void vm_clear_memseg(struct vm *, int); + +typedef struct vm_ioport_hook { + list_node_t vmih_node; + uint_t vmih_ioport; + void *vmih_arg; + vmm_rmem_cb_t vmih_rmem_cb; + vmm_wmem_cb_t vmih_wmem_cb; +} vm_ioport_hook_t; + +/* Flags for vtc_status */ +#define VTCS_FPU_RESTORED 1 /* guest FPU restored, host FPU saved */ +#define VTCS_FPU_CTX_CRITICAL 2 /* in ctx where FPU restore cannot be lazy */ + +typedef struct vm_thread_ctx { + struct vm *vtc_vm; + int vtc_vcpuid; + uint_t vtc_status; +} vm_thread_ctx_t; +#endif /* __FreeBSD__ */ + +#ifdef KTR +static const char * +vcpu_state2str(enum vcpu_state state) +{ + + switch (state) { + case VCPU_IDLE: + return ("idle"); + case VCPU_FROZEN: + return ("frozen"); + case VCPU_RUNNING: + return ("running"); + case VCPU_SLEEPING: + return ("sleeping"); + default: + return ("unknown"); + } +} +#endif + static void -vcpu_cleanup(struct vm *vm, int i) +vcpu_cleanup(struct vm *vm, int i, bool destroy) { struct vcpu *vcpu = &vm->vcpu[i]; VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic); -#ifdef __FreeBSD__ - vmm_stat_free(vcpu->stats); -#endif - fpu_save_area_free(vcpu->guestfpu); + if (destroy) { + vmm_stat_free(vcpu->stats); + fpu_save_area_free(vcpu->guestfpu); + } } static void -vcpu_init(struct vm *vm, uint32_t vcpu_id) +vcpu_init(struct vm *vm, int vcpu_id, bool create) { struct vcpu *vcpu; - + + KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus, + ("vcpu_init: invalid vcpu %d", vcpu_id)); + vcpu = &vm->vcpu[vcpu_id]; - vcpu_lock_init(vcpu); - vcpu->hostcpu = NOCPU; - vcpu->vcpuid = vcpu_id; + if (create) { +#ifdef __FreeBSD__ + KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already " + "initialized", vcpu_id)); +#endif + vcpu_lock_init(vcpu); + vcpu->state = VCPU_IDLE; + vcpu->hostcpu = NOCPU; +#ifndef __FreeBSD__ + vcpu->lastloccpu = NOCPU; +#endif + vcpu->guestfpu = fpu_save_area_alloc(); + vcpu->stats = vmm_stat_alloc(); + } + vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id); vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED); + vcpu->runblock = 0; + vcpu->reqidle = 0; vcpu->exitintinfo = 0; - vcpu->guestfpu = fpu_save_area_alloc(); + vcpu->nmi_pending = 0; + vcpu->extint_pending = 0; + vcpu->exception_pending = 0; + vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; 
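+	/*
+	 * Architecturally, %xcr0 comes out of reset with only the x87 bit
+	 * set; the guest enables any further XSAVE state components itself
+	 * via xsetbv, which is why X87 alone is seeded above.
+	 */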
fpu_save_area_reset(vcpu->guestfpu); -#ifdef __FreeBSD__ - vcpu->stats = vmm_stat_alloc(); -#endif + vmm_stat_init(vcpu->stats); +} + +int +vcpu_trace_exceptions(struct vm *vm, int vcpuid) +{ + + return (trace_guest_exceptions); } struct vm_exit * @@ -220,7 +371,7 @@ vm_exitinfo(struct vm *vm, int cpuid) { struct vcpu *vcpu; - if (cpuid < 0 || cpuid >= VM_MAXCPU) + if (cpuid < 0 || cpuid >= vm->maxcpus) panic("vm_exitinfo: invalid cpuid %d", cpuid); vcpu = &vm->vcpu[cpuid]; @@ -228,24 +379,35 @@ vm_exitinfo(struct vm *vm, int cpuid) return (&vcpu->exitinfo); } +#ifdef __FreeBSD__ +static void +vmm_resume(void) +{ + VMM_RESUME(); +} +#endif + static int vmm_init(void) { int error; -#ifndef __FreeBSD__ - vmm_sol_glue_init(); -#endif - vmm_host_state_init(); -#ifdef __FreeBSD__ - vmm_ipi_init(); + +#ifdef __FreeBSD__ + vmm_ipinum = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) : + &IDTVEC(justreturn)); + if (vmm_ipinum < 0) + vmm_ipinum = IPI_AST; +#else + /* We use cpu_poke() for IPIs */ + vmm_ipinum = 0; #endif error = vmm_mem_init(); if (error) return (error); - + if (vmm_is_intel()) ops = &vmm_ops_intel; else if (vmm_is_amd()) @@ -253,10 +415,15 @@ vmm_init(void) else return (ENXIO); - return (VMM_INIT()); +#ifdef __FreeBSD__ + vmm_resume_p = vmm_resume; +#endif + + return (VMM_INIT(vmm_ipinum)); } -#ifdef __FreeBSD__ +#ifdef __FreeBSD__ + static int vmm_handler(module_t mod, int what, void *arg) { @@ -265,8 +432,6 @@ vmm_handler(module_t mod, int what, void *arg) switch (what) { case MOD_LOAD: vmmdev_init(); - if (ppt_num_devices() > 0) - iommu_init(); error = vmm_init(); if (error == 0) vmm_initialized = 1; @@ -274,11 +439,12 @@ vmm_handler(module_t mod, int what, void *arg) case MOD_UNLOAD: error = vmmdev_cleanup(); if (error == 0) { -#ifndef __FreeBSD__ - vmm_sol_glue_cleanup(); -#endif + vmm_resume_p = NULL; iommu_cleanup(); - vmm_ipi_cleanup(); +#ifdef __FreeBSD__ + if (vmm_ipinum != IPI_AST) + lapic_ipi_free(vmm_ipinum); +#endif error = VMM_CLEANUP(); /* * Something bad happened - prevent new @@ -304,23 +470,19 @@ static moduledata_t vmm_kmod = { /* * vmm initialization has the following dependencies: * - * - iommu initialization must happen after the pci passthru driver has had - * a chance to attach to any passthru devices (after SI_SUB_CONFIGURE). - * * - VT-x initialization requires smp_rendezvous() and therefore must happen * after SMP is fully functional (after SI_SUB_SMP). 
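 *
 * The DECLARE_MODULE() below therefore registers the handler at
 * SI_SUB_SMP + 1, once that prerequisite has been satisfied.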
*/ DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY); MODULE_VERSION(vmm, 1); -SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL); -#else +#else /* __FreeBSD__ */ + int vmm_mod_load() { int error; - vmmdev_init(); error = vmm_init(); if (error == 0) vmm_initialized = 1; @@ -333,9 +495,6 @@ vmm_mod_unload() { int error; - error = vmmdev_cleanup(); - if (error) - return (error); error = VMM_CLEANUP(); if (error) return (error); @@ -343,16 +502,63 @@ vmm_mod_unload() return (0); } + +#endif /* __FreeBSD__ */ + +static void +vm_init(struct vm *vm, bool create) +{ + int i; +#ifndef __FreeBSD__ + uint64_t tsc_off; #endif + vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace)); + vm->iommu = NULL; + vm->vioapic = vioapic_init(vm); + vm->vhpet = vhpet_init(vm); + vm->vatpic = vatpic_init(vm); + vm->vatpit = vatpit_init(vm); + vm->vpmtmr = vpmtmr_init(vm); + if (create) + vm->vrtc = vrtc_init(vm); +#ifndef __FreeBSD__ + if (create) { + list_create(&vm->ioport_hooks, sizeof (vm_ioport_hook_t), + offsetof (vm_ioport_hook_t, vmih_node)); + } else { + VERIFY(list_is_empty(&vm->ioport_hooks)); + } +#endif /* __FreeBSD__ */ + + CPU_ZERO(&vm->active_cpus); + CPU_ZERO(&vm->debug_cpus); + + vm->suspend = 0; + CPU_ZERO(&vm->suspended_cpus); + + for (i = 0; i < vm->maxcpus; i++) + vcpu_init(vm, i, create); + +#ifndef __FreeBSD__ + tsc_off = (uint64_t)(-(int64_t)rdtsc()); + for (i = 0; i < vm->maxcpus; i++) { + vm->vcpu[i].tsc_offset = tsc_off; + } +#endif /* __FreeBSD__ */ +} + +/* + * The default CPU topology is a single thread per package. + */ +u_int cores_per_package = 1; +u_int threads_per_core = 1; + int vm_create(const char *name, struct vm **retvm) { - int i; struct vm *vm; - vm_paddr_t maxaddr; - - const int BSP = 0; + struct vmspace *vmspace; /* * If vmm.ko could not be successfully initialized then don't attempt @@ -364,269 +570,587 @@ vm_create(const char *name, struct vm **retvm) if (name == NULL || strlen(name) >= VM_MAX_NAMELEN) return (EINVAL); + vmspace = VMSPACE_ALLOC(0, VM_MAXUSER_ADDRESS); + if (vmspace == NULL) + return (ENOMEM); + vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO); strcpy(vm->name, name); - vm->cookie = VMINIT(vm); - - vm->vioapic = vioapic_init(vm); - vm->vhpet = vhpet_init(vm); - vm->vatpic = vatpic_init(vm); - vm->vatpit = vatpit_init(vm); + vm->vmspace = vmspace; - for (i = 0; i < VM_MAXCPU; i++) { - vcpu_init(vm, i); - } + vm->sockets = 1; + vm->cores = cores_per_package; /* XXX backwards compatibility */ + vm->threads = threads_per_core; /* XXX backwards compatibility */ + vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */ -#ifdef __FreeBSD__ - maxaddr = vmm_mem_maxaddr(); - vm->iommu = iommu_create_domain(maxaddr); -#endif + vm_init(vm, true); *retvm = vm; return (0); } -static void -vm_free_mem_seg(struct vm *vm, struct vm_memory_segment *seg) +void +vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, + uint16_t *threads, uint16_t *maxcpus) { - size_t len; - vm_paddr_t hpa; - void *host_domain; - -#ifdef __FreeBSD__ - host_domain = iommu_host_domain(); -#endif - - len = 0; - while (len < seg->len) { - hpa = vm_gpa2hpa(vm, seg->gpa + len, PAGE_SIZE); - if (hpa == (vm_paddr_t)-1) { - panic("vm_free_mem_segs: cannot free hpa " - "associated with gpa 0x%016lx", seg->gpa + len); - } - -#ifdef __FreeBSD__ - /* - * Remove the 'gpa' to 'hpa' mapping in VMs domain. - * And resurrect the 1:1 mapping for 'hpa' in 'host_domain'. 
- */ - iommu_remove_mapping(vm->iommu, seg->gpa + len, PAGE_SIZE); - iommu_create_mapping(host_domain, hpa, hpa, PAGE_SIZE); -#endif - - vmm_mem_free(hpa, PAGE_SIZE); - - len += PAGE_SIZE; - } + *sockets = vm->sockets; + *cores = vm->cores; + *threads = vm->threads; + *maxcpus = vm->maxcpus; +} -#ifdef __FreeBSD__ - /* - * Invalidate cached translations associated with 'vm->iommu' since - * we have now moved some pages from it. - */ - iommu_invalidate_tlb(vm->iommu); -#endif +uint16_t +vm_get_maxcpus(struct vm *vm) +{ + return (vm->maxcpus); +} - bzero(seg, sizeof(struct vm_memory_segment)); +int +vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, + uint16_t threads, uint16_t maxcpus) +{ + if (maxcpus != 0) + return (EINVAL); /* XXX remove when supported */ + if ((sockets * cores * threads) > vm->maxcpus) + return (EINVAL); + /* XXX need to check sockets * cores * threads == vCPU, how? */ + vm->sockets = sockets; + vm->cores = cores; + vm->threads = threads; + vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */ + return(0); } -void -vm_destroy(struct vm *vm) +static void +vm_cleanup(struct vm *vm, bool destroy) { + struct mem_map *mm; int i; -#ifdef __FreeBSD__ ppt_unassign_all(vm); -#endif - - for (i = 0; i < vm->num_mem_segs; i++) - vm_free_mem_seg(vm, &vm->mem_segs[i]); - vm->num_mem_segs = 0; - - for (i = 0; i < VM_MAXCPU; i++) - vcpu_cleanup(vm, i); + if (vm->iommu != NULL) + iommu_destroy_domain(vm->iommu); + if (destroy) + vrtc_cleanup(vm->vrtc); + else + vrtc_reset(vm->vrtc); + vpmtmr_cleanup(vm->vpmtmr); vatpit_cleanup(vm->vatpit); vhpet_cleanup(vm->vhpet); vatpic_cleanup(vm->vatpic); vioapic_cleanup(vm->vioapic); -#ifdef __FreeBSD__ - iommu_destroy_domain(vm->iommu); -#endif + for (i = 0; i < vm->maxcpus; i++) + vcpu_cleanup(vm, i, destroy); VMCLEANUP(vm->cookie); + /* + * System memory is removed from the guest address space only when + * the VM is destroyed. This is because the mapping remains the same + * across VM reset. + * + * Device memory can be relocated by the guest (e.g. using PCI BARs) + * so those mappings are removed on a VM reset. + */ + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (destroy || !sysmem_mapping(vm, mm)) + vm_free_memmap(vm, i); +#ifndef __FreeBSD__ + else { + /* + * We need to reset the IOMMU flag so this mapping can + * be reused when a VM is rebooted. Since the IOMMU + * domain has already been destroyed we can just reset + * the flag here. + */ + mm->flags &= ~VM_MEMMAP_F_IOMMU; + } +#endif + } + + if (destroy) { + for (i = 0; i < VM_MAX_MEMSEGS; i++) + vm_free_memseg(vm, i); + + VMSPACE_FREE(vm->vmspace); + vm->vmspace = NULL; + } +#ifndef __FreeBSD__ + else { + /* + * Clear the first memory segment (low mem), old memory contents + * could confuse the UEFI firmware. + */ + vm_clear_memseg(vm, 0); + } +#endif +} + +void +vm_destroy(struct vm *vm) +{ + vm_cleanup(vm, true); free(vm, M_VM); } +int +vm_reinit(struct vm *vm) +{ + int error; + + /* + * A virtual machine can be reset only if all vcpus are suspended. 
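To make the gate concrete: a hypothetical userland sketch of the decision, with cpuset_t reduced to a uint64_t bitmask and the vm_cleanup()/vm_init() calls stubbed out (all toy_* names are invented for illustration):

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

struct toy_vm {
	uint64_t	active_cpus;	/* vcpus started via vm_activate_cpu() */
	uint64_t	suspended_cpus;	/* vcpus parked in vm_handle_suspend() */
};

static int
toy_vm_reinit(struct toy_vm *vm)
{
	/* Refuse to reset while any active vcpu has yet to suspend. */
	if (vm->suspended_cpus != vm->active_cpus)
		return (EBUSY);

	/* vm_cleanup(vm, false) and vm_init(vm, false) would run here. */
	return (0);
}

int
main(void)
{
	struct toy_vm vm = { .active_cpus = 0x3, .suspended_cpus = 0x1 };

	printf("partial suspend: %d\n", toy_vm_reinit(&vm));	/* EBUSY */
	vm.suspended_cpus = 0x3;
	printf("full suspend: %d\n", toy_vm_reinit(&vm));	/* 0 */
	return (0);
}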
+ */ + if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { + vm_cleanup(vm, false); + vm_init(vm, false); + error = 0; + } else { + error = EBUSY; + } + + return (error); +} + const char * vm_name(struct vm *vm) { return (vm->name); } -#ifdef __FreeBSD__ int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) { - const boolean_t spok = TRUE; /* superpage mappings are ok */ + vm_object_t obj; - return (VMMMAP_SET(vm->cookie, gpa, hpa, len, VM_MEMATTR_UNCACHEABLE, - VM_PROT_RW, spok)); + if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL) + return (ENOMEM); + else + return (0); } int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) { - const boolean_t spok = TRUE; /* superpage mappings are ok */ - return (VMMMAP_SET(vm->cookie, gpa, 0, len, 0, - VM_PROT_NONE, spok)); + vmm_mmio_free(vm->vmspace, gpa, len); + return (0); } -#endif /* - * Returns TRUE if 'gpa' is available for allocation and FALSE otherwise + * Return 'true' if 'gpa' is allocated in the guest address space. + * + * This function is called in the context of a running vcpu which acts as + * an implicit lock on 'vm->mem_maps[]'. */ -static boolean_t -vm_gpa_available(struct vm *vm, vm_paddr_t gpa) +bool +vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa) { + struct mem_map *mm; int i; - vm_paddr_t gpabase, gpalimit; - if (gpa & PAGE_MASK) - panic("vm_gpa_available: gpa (0x%016lx) not page aligned", gpa); +#ifdef INVARIANTS + int hostcpu, state; + state = vcpu_get_state(vm, vcpuid, &hostcpu); + KASSERT(state == VCPU_RUNNING && hostcpu == curcpu, + ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu)); +#endif - for (i = 0; i < vm->num_mem_segs; i++) { - gpabase = vm->mem_segs[i].gpa; - gpalimit = gpabase + vm->mem_segs[i].len; - if (gpa >= gpabase && gpa < gpalimit) - return (FALSE); + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len) + return (true); /* 'gpa' is sysmem or devmem */ } - return (TRUE); + if (ppt_is_mmio(vm, gpa)) + return (true); /* 'gpa' is pci passthru mmio */ + + return (false); } int -vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len) +vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem) { - int error, available, allocated; - struct vm_memory_segment *seg; - vm_paddr_t g, hpa; - void *host_domain; + struct mem_seg *seg; + vm_object_t obj; - const boolean_t spok = TRUE; /* superpage mappings are ok */ +#ifndef __FreeBSD__ + extern pgcnt_t get_max_page_get(void); +#endif - if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0) + if (ident < 0 || ident >= VM_MAX_MEMSEGS) return (EINVAL); - - available = allocated = 0; - g = gpa; - while (g < gpa + len) { - if (vm_gpa_available(vm, g)) - available++; - else - allocated++; - - g += PAGE_SIZE; - } - /* - * If there are some allocated and some available pages in the address - * range then it is an error. - */ - if (allocated && available) + if (len == 0 || (len & PAGE_MASK)) return (EINVAL); - /* - * If the entire address range being requested has already been - * allocated then there isn't anything more to do. 
- */ - if (allocated && available == 0) - return (0); - - if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS) - return (E2BIG); - -#ifdef __FreeBSD__ - host_domain = iommu_host_domain(); +#ifndef __FreeBSD__ + if (len > ptob(get_max_page_get())) + return (EINVAL); #endif - seg = &vm->mem_segs[vm->num_mem_segs]; + seg = &vm->mem_segs[ident]; + if (seg->object != NULL) { + if (seg->len == len && seg->sysmem == sysmem) + return (EEXIST); + else + return (EINVAL); + } - error = 0; - seg->gpa = gpa; - seg->len = 0; - while (seg->len < len) { - hpa = vmm_mem_alloc(PAGE_SIZE); - if (hpa == 0) { - error = ENOMEM; - break; - } + obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT); + if (obj == NULL) + return (ENOMEM); - error = VMMMAP_SET(vm->cookie, gpa + seg->len, hpa, PAGE_SIZE, - VM_MEMATTR_WRITE_BACK, VM_PROT_ALL, spok); - if (error) - break; + seg->len = len; + seg->object = obj; + seg->sysmem = sysmem; + return (0); +} -#ifdef __FreeBSD__ - /* - * Remove the 1:1 mapping for 'hpa' from the 'host_domain'. - * Add mapping for 'gpa + seg->len' to 'hpa' in the VMs domain. - */ - iommu_remove_mapping(host_domain, hpa, PAGE_SIZE); - iommu_create_mapping(vm->iommu, gpa + seg->len, hpa, PAGE_SIZE); -#endif +int +vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, + vm_object_t *objptr) +{ + struct mem_seg *seg; - seg->len += PAGE_SIZE; - } + if (ident < 0 || ident >= VM_MAX_MEMSEGS) + return (EINVAL); - if (error) { - vm_free_mem_seg(vm, seg); - return (error); - } + seg = &vm->mem_segs[ident]; + if (len) + *len = seg->len; + if (sysmem) + *sysmem = seg->sysmem; + if (objptr) + *objptr = seg->object; + return (0); +} -#ifdef __FreeBSD__ - /* - * Invalidate cached translations associated with 'host_domain' since - * we have now moved some pages from it. 
- */ - iommu_invalidate_tlb(host_domain); -#endif +#ifndef __FreeBSD__ +static void +vm_clear_memseg(struct vm *vm, int ident) +{ + struct mem_seg *seg; - vm->num_mem_segs++; + KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS, + ("%s: invalid memseg ident %d", __func__, ident)); - return (0); + seg = &vm->mem_segs[ident]; + + if (seg->object != NULL) + vm_object_clear(seg->object); } +#endif -vm_paddr_t -vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t len) +void +vm_free_memseg(struct vm *vm, int ident) { - vm_paddr_t nextpage; + struct mem_seg *seg; - nextpage = rounddown(gpa + PAGE_SIZE, PAGE_SIZE); - if (len > nextpage - gpa) - panic("vm_gpa2hpa: invalid gpa/len: 0x%016lx/%lu", gpa, len); + KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS, + ("%s: invalid memseg ident %d", __func__, ident)); - return (VMMMAP_GET(vm->cookie, gpa)); + seg = &vm->mem_segs[ident]; + if (seg->object != NULL) { + vm_object_deallocate(seg->object); + bzero(seg, sizeof(struct mem_seg)); + } } -void * -vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot, - void **cookie) +int +vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first, + size_t len, int prot, int flags) { -#ifdef __FreeBSD__ - int count, pageoff; - vm_page_t m; + struct mem_seg *seg; + struct mem_map *m, *map; + vm_ooffset_t last; + int i, error; + if (prot == 0 || (prot & ~(VM_PROT_ALL)) != 0) + return (EINVAL); + + if (flags & ~VM_MEMMAP_F_WIRED) + return (EINVAL); + + if (segid < 0 || segid >= VM_MAX_MEMSEGS) + return (EINVAL); + + seg = &vm->mem_segs[segid]; + if (seg->object == NULL) + return (EINVAL); + + last = first + len; + if (first < 0 || first >= last || last > seg->len) + return (EINVAL); + + if ((gpa | first | last) & PAGE_MASK) + return (EINVAL); + + map = NULL; + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + m = &vm->mem_maps[i]; + if (m->len == 0) { + map = m; + break; + } + } + + if (map == NULL) + return (ENOSPC); + + error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa, + len, 0, VMFS_NO_SPACE, prot, prot, 0); + if (error != KERN_SUCCESS) + return (EFAULT); + + vm_object_reference(seg->object); + + if ((flags & VM_MEMMAP_F_WIRED) != 0) { + error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len, + VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); + if (error != KERN_SUCCESS) { + vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len); + return (error == KERN_RESOURCE_SHORTAGE ? 
ENOMEM : + EFAULT); + } + } + + map->gpa = gpa; + map->len = len; + map->segoff = first; + map->segid = segid; + map->prot = prot; + map->flags = flags; + return (0); +} + +int +vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, + vm_ooffset_t *segoff, size_t *len, int *prot, int *flags) +{ + struct mem_map *mm, *mmnext; + int i; + + mmnext = NULL; + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (mm->len == 0 || mm->gpa < *gpa) + continue; + if (mmnext == NULL || mm->gpa < mmnext->gpa) + mmnext = mm; + } + + if (mmnext != NULL) { + *gpa = mmnext->gpa; + if (segid) + *segid = mmnext->segid; + if (segoff) + *segoff = mmnext->segoff; + if (len) + *len = mmnext->len; + if (prot) + *prot = mmnext->prot; + if (flags) + *flags = mmnext->flags; + return (0); + } else { + return (ENOENT); + } +} + +static void +vm_free_memmap(struct vm *vm, int ident) +{ + struct mem_map *mm; + int error; + + mm = &vm->mem_maps[ident]; + if (mm->len) { + error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa, + mm->gpa + mm->len); + KASSERT(error == KERN_SUCCESS, ("%s: vm_map_remove error %d", + __func__, error)); + bzero(mm, sizeof(struct mem_map)); + } +} + +static __inline bool +sysmem_mapping(struct vm *vm, struct mem_map *mm) +{ + + if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem) + return (true); + else + return (false); +} + +vm_paddr_t +vmm_sysmem_maxaddr(struct vm *vm) +{ + struct mem_map *mm; + vm_paddr_t maxaddr; + int i; + + maxaddr = 0; + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (sysmem_mapping(vm, mm)) { + if (maxaddr < mm->gpa + mm->len) + maxaddr = mm->gpa + mm->len; + } + } + return (maxaddr); +} + +static void +vm_iommu_modify(struct vm *vm, boolean_t map) +{ + int i, sz; + vm_paddr_t gpa, hpa; + struct mem_map *mm; +#ifdef __FreeBSD__ + void *vp, *cookie, *host_domain; +#else + void *vp, *cookie, *host_domain __unused; +#endif + + sz = PAGE_SIZE; + host_domain = iommu_host_domain(); + + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (!sysmem_mapping(vm, mm)) + continue; + + if (map) { + KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0, + ("iommu map found invalid memmap %#lx/%#lx/%#x", + mm->gpa, mm->len, mm->flags)); + if ((mm->flags & VM_MEMMAP_F_WIRED) == 0) + continue; + mm->flags |= VM_MEMMAP_F_IOMMU; + } else { + if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0) + continue; + mm->flags &= ~VM_MEMMAP_F_IOMMU; + KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0, + ("iommu unmap found invalid memmap %#lx/%#lx/%#x", + mm->gpa, mm->len, mm->flags)); + } + + gpa = mm->gpa; + while (gpa < mm->gpa + mm->len) { + vp = vm_gpa_hold(vm, -1, gpa, PAGE_SIZE, VM_PROT_WRITE, + &cookie); + KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx", + vm_name(vm), gpa)); + + vm_gpa_release(cookie); + + hpa = DMAP_TO_PHYS((uintptr_t)vp); + if (map) { + iommu_create_mapping(vm->iommu, gpa, hpa, sz); + iommu_remove_mapping(host_domain, hpa, sz); + } else { + iommu_remove_mapping(vm->iommu, gpa, sz); + iommu_create_mapping(host_domain, hpa, hpa, sz); + } + + gpa += PAGE_SIZE; + } + } + + /* + * Invalidate the cached translations associated with the domain + * from which pages were removed. 
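+	 *
+	 * A map operation pulls pages out of the host domain, so that is
+	 * the domain which must be flushed; an unmap pulls them out of the
+	 * VM's domain, so the flush targets vm->iommu instead.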
+ */ + if (map) + iommu_invalidate_tlb(host_domain); + else + iommu_invalidate_tlb(vm->iommu); +} + +#define vm_iommu_unmap(vm) vm_iommu_modify((vm), FALSE) +#define vm_iommu_map(vm) vm_iommu_modify((vm), TRUE) + +int +vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func) +{ + int error; + + error = ppt_unassign_device(vm, bus, slot, func); + if (error) + return (error); + + if (ppt_assigned_devices(vm) == 0) + vm_iommu_unmap(vm); + + return (0); +} + +int +vm_assign_pptdev(struct vm *vm, int bus, int slot, int func) +{ + int error; + vm_paddr_t maxaddr; + + /* Set up the IOMMU to do the 'gpa' to 'hpa' translation */ + if (ppt_assigned_devices(vm) == 0) { + KASSERT(vm->iommu == NULL, + ("vm_assign_pptdev: iommu must be NULL")); + maxaddr = vmm_sysmem_maxaddr(vm); + vm->iommu = iommu_create_domain(maxaddr); + if (vm->iommu == NULL) + return (ENXIO); + vm_iommu_map(vm); + } + + error = ppt_assign_device(vm, bus, slot, func); + return (error); +} + +void * +vm_gpa_hold(struct vm *vm, int vcpuid, vm_paddr_t gpa, size_t len, int reqprot, + void **cookie) +{ + int i, count, pageoff; + struct mem_map *mm; + vm_page_t m; +#ifdef INVARIANTS + /* + * All vcpus are frozen by ioctls that modify the memory map + * (e.g. VM_MMAP_MEMSEG). Therefore 'vm->memmap[]' stability is + * guaranteed if at least one vcpu is in the VCPU_FROZEN state. + */ + int state; + KASSERT(vcpuid >= -1 && vcpuid < vm->maxcpus, ("%s: invalid vcpuid %d", + __func__, vcpuid)); + for (i = 0; i < vm->maxcpus; i++) { + if (vcpuid != -1 && vcpuid != i) + continue; + state = vcpu_get_state(vm, i, NULL); + KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d", + __func__, state)); + } +#endif pageoff = gpa & PAGE_MASK; if (len > PAGE_SIZE - pageoff) panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len); - count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map, - trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1); + count = 0; + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (sysmem_mapping(vm, mm) && gpa >= mm->gpa && + gpa < mm->gpa + mm->len) { + count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map, + trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1); + break; + } + } if (count == 1) { *cookie = m; @@ -635,54 +1159,23 @@ vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot, *cookie = NULL; return (NULL); } -#else - int pageoff; - vm_paddr_t hpa; - - pageoff = gpa & PAGE_MASK; - if (len > PAGE_SIZE - pageoff) - panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len); - - hpa = vm_gpa2hpa(vm, gpa, len); - if (hpa == (vm_paddr_t)-1) - return (NULL); - - return (hat_kpm_pfn2va(btop(hpa)) + pageoff); -#endif } void vm_gpa_release(void *cookie) { -#ifdef __FreeBSD__ vm_page_t m = cookie; vm_page_lock(m); vm_page_unhold(m); vm_page_unlock(m); -#endif -} - -int -vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase, - struct vm_memory_segment *seg) -{ - int i; - - for (i = 0; i < vm->num_mem_segs; i++) { - if (gpabase == vm->mem_segs[i].gpa) { - *seg = vm->mem_segs[i]; - return (0); - } - } - return (-1); } int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval) { - if (vcpu < 0 || vcpu >= VM_MAXCPU) + if (vcpu < 0 || vcpu >= vm->maxcpus) return (EINVAL); if (reg >= VM_REG_LAST) @@ -697,7 +1190,7 @@ vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val) struct vcpu *vcpu; int error; - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); if (reg >= VM_REG_LAST) @@ -751,7 +1244,7 @@ vm_get_seg_desc(struct vm *vm, int 
vcpu, int reg, struct seg_desc *desc) { - if (vcpu < 0 || vcpu >= VM_MAXCPU) + if (vcpu < 0 || vcpu >= vm->maxcpus) return (EINVAL); if (!is_segment_register(reg) && !is_descriptor_table(reg)) @@ -764,7 +1257,7 @@ int vm_set_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc) { - if (vcpu < 0 || vcpu >= VM_MAXCPU) + if (vcpu < 0 || vcpu >= vm->maxcpus) return (EINVAL); if (!is_segment_register(reg) && !is_descriptor_table(reg)) @@ -784,6 +1277,10 @@ restore_guest_fpustate(struct vcpu *vcpu) fpu_stop_emulating(); fpurestore(vcpu->guestfpu); + /* restore guest XCR0 if XSAVE is enabled in the host */ + if (rcr4() & CR4_XSAVE) + load_xcr(0, vcpu->guest_xcr0); + /* * The FPU is now "dirty" with the guest's state so turn on emulation * to trap any access to the FPU by the host. @@ -798,20 +1295,35 @@ save_guest_fpustate(struct vcpu *vcpu) if ((rcr0() & CR0_TS) == 0) panic("fpu emulation not enabled in host!"); + /* save guest XCR0 and restore host XCR0 */ + if (rcr4() & CR4_XSAVE) { + vcpu->guest_xcr0 = rxcr(0); + load_xcr(0, vmm_get_host_xcr0()); + } + /* save guest FPU state */ fpu_stop_emulating(); fpusave(vcpu->guestfpu); +#ifdef __FreeBSD__ fpu_start_emulating(); +#else + /* + * When the host state has been restored, we should not re-enable + * CR0.TS on illumos for eager FPU. + */ +#endif } static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle"); static int -vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, +vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate, bool from_idle) { + struct vcpu *vcpu; int error; + vcpu = &vm->vcpu[vcpuid]; vcpu_assert_locked(vcpu); /* @@ -820,8 +1332,17 @@ vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, * ioctl() operating on a vcpu at any point. 
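The reqidle handshake implemented below can be modeled with an ordinary monitor. What follows is a hypothetical pthreads sketch, not the kernel code: toy_wait_for_idle() plays the ioctl side of vcpu_set_state_locked(), and the condition-variable broadcast stands in for the vcpu_notify_event_locked() nudge (all toy_* names are invented):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

enum toy_state { TS_IDLE, TS_RUNNING };

struct toy_vcpu {
	pthread_mutex_t	lock;
	pthread_cond_t	state_cv;	/* broadcast on state transitions */
	enum toy_state	state;
	bool		reqidle;
};

/* Requester side: insist the vcpu drop to idle before proceeding. */
static void
toy_wait_for_idle(struct toy_vcpu *v)
{
	(void) pthread_mutex_lock(&v->lock);
	while (v->state != TS_IDLE) {
		v->reqidle = true;
		/* the broadcast stands in for the IPI/wakeup notification */
		(void) pthread_cond_broadcast(&v->state_cv);
		(void) pthread_cond_wait(&v->state_cv, &v->lock);
	}
	(void) pthread_mutex_unlock(&v->lock);
}

/* Vcpu side: notice the request, go idle, and wake any waiters. */
static void *
toy_vcpu_loop(void *arg)
{
	struct toy_vcpu *v = arg;

	(void) pthread_mutex_lock(&v->lock);
	while (!v->reqidle)
		(void) pthread_cond_wait(&v->state_cv, &v->lock);
	v->reqidle = false;
	v->state = TS_IDLE;
	(void) pthread_cond_broadcast(&v->state_cv);
	(void) pthread_mutex_unlock(&v->lock);
	return (NULL);
}

int
main(void)
{
	struct toy_vcpu v = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER,
		TS_RUNNING, false
	};
	pthread_t tid;

	(void) pthread_create(&tid, NULL, toy_vcpu_loop, &v);
	toy_wait_for_idle(&v);
	printf("vcpu is idle; the ioctl may now claim it\n");
	(void) pthread_join(tid, NULL);
	return (0);
}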
*/ if (from_idle) { - while (vcpu->state != VCPU_IDLE) + while (vcpu->state != VCPU_IDLE) { + vcpu->reqidle = 1; + vcpu_notify_event_locked(vcpu, false); + VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to " + "idle requested", vcpu_state2str(vcpu->state)); +#ifdef __FreeBSD__ msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); +#else + cv_wait(&vcpu->state_cv, &vcpu->mtx.m); +#endif + } } else { KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " "vcpu idle state")); @@ -855,17 +1376,36 @@ vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, break; } + if (newstate == VCPU_RUNNING) { + while (vcpu->runblock != 0) { +#ifdef __FreeBSD__ + msleep_spin(&vcpu->state, &vcpu->mtx, "vcpublk", 0); +#else + cv_wait(&vcpu->state_cv, &vcpu->mtx.m); +#endif + } + } + if (error) return (EBUSY); + VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s", + vcpu_state2str(vcpu->state), vcpu_state2str(newstate)); + vcpu->state = newstate; if (newstate == VCPU_RUNNING) vcpu->hostcpu = curcpu; else vcpu->hostcpu = NOCPU; - if (newstate == VCPU_IDLE) + if (newstate == VCPU_IDLE || + (newstate == VCPU_FROZEN && vcpu->runblock != 0)) { +#ifdef __FreeBSD__ wakeup(&vcpu->state); +#else + cv_broadcast(&vcpu->state_cv); +#endif + } return (0); } @@ -880,11 +1420,11 @@ vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate) } static void -vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate) +vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate) { int error; - if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0) + if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0) panic("Error %d setting state to %d", error, newstate); } @@ -894,60 +1434,139 @@ vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate) static int vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu) { - struct vm_exit *vmexit; struct vcpu *vcpu; - int t, timo, spindown; +#ifdef __FreeBSD__ + const char *wmesg; +#else + const char *wmesg __unused; +#endif + int t, vcpu_halted, vm_halted; + + KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted")); vcpu = &vm->vcpu[vcpuid]; - spindown = 0; + vcpu_halted = 0; + vm_halted = 0; vcpu_lock(vcpu); + while (1) { + /* + * Do a final check for pending NMI or interrupts before + * really putting this thread to sleep. Also check for + * software events that would cause this vcpu to wakeup. + * + * These interrupts/events could have happened after the + * vcpu returned from VMRUN() and before it acquired the + * vcpu lock above. + */ + if (vm->suspend || vcpu->reqidle) + break; + if (vm_nmi_pending(vm, vcpuid)) + break; + if (!intr_disabled) { + if (vm_extint_pending(vm, vcpuid) || + vlapic_pending_intr(vcpu->vlapic, NULL)) { + break; + } + } - /* - * Do a final check for pending NMI or interrupts before - * really putting this thread to sleep. - * - * These interrupts could have happened any time after we - * returned from VMRUN() and before we grabbed the vcpu lock. - */ - if (vm->rendezvous_func == NULL && - !vm_nmi_pending(vm, vcpuid) && - (intr_disabled || !vlapic_pending_intr(vcpu->vlapic, NULL))) { - t = ticks; - vcpu_require_state_locked(vcpu, VCPU_SLEEPING); - if (vlapic_enabled(vcpu->vlapic)) { - /* - * XXX msleep_spin() is not interruptible so use the - * 'timo' to put an upper bound on the sleep time. 
- */ - timo = hz; - msleep_spin(vcpu, &vcpu->mtx, "vmidle", timo); + /* Don't go to sleep if the vcpu thread needs to yield */ + if (vcpu_should_yield(vm, vcpuid)) + break; + + if (vcpu_debugged(vm, vcpuid)) + break; + + /* + * Some Linux guests implement "halt" by having all vcpus + * execute HLT with interrupts disabled. 'halted_cpus' keeps + * track of the vcpus that have entered this state. When all + * vcpus enter the halted state the virtual machine is halted. + */ + if (intr_disabled) { + wmesg = "vmhalt"; + VCPU_CTR0(vm, vcpuid, "Halted"); + if (!vcpu_halted && halt_detection_enabled) { + vcpu_halted = 1; + CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus); + } + if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) { + vm_halted = 1; + break; + } } else { - /* - * Spindown the vcpu if the apic is disabled and it - * had entered the halted state. - */ - spindown = 1; + wmesg = "vmidle"; } - vcpu_require_state_locked(vcpu, VCPU_FROZEN); + + t = ticks; + vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); +#ifdef __FreeBSD__ + /* + * XXX msleep_spin() cannot be interrupted by signals so + * wake up periodically to check pending signals. + */ + msleep_spin(vcpu, &vcpu->mtx, wmesg, hz); +#else + /* + * Fortunately, cv_wait_sig can be interrupted by signals, so + * there is no need to periodically wake up. + */ + (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m); +#endif + vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t); } + + if (vcpu_halted) + CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus); + vcpu_unlock(vcpu); -#ifdef __FreeBSD__ - /* - * Since 'vm_deactivate_cpu()' grabs a sleep mutex we must call it - * outside the confines of the vcpu spinlock. - */ - if (spindown) { - *retu = true; - vmexit = vm_exitinfo(vm, vcpuid); - vmexit->exitcode = VM_EXITCODE_SPINDOWN_CPU; - vm_deactivate_cpu(vm, vcpuid); - VCPU_CTR0(vm, vcpuid, "spinning down cpu"); + if (vm_halted) + vm_suspend(vm, VM_SUSPEND_HALT); + + return (0); +} + +static int +vm_handle_paging(struct vm *vm, int vcpuid, bool *retu) +{ + int rv, ftype; + struct vm_map *map; + struct vcpu *vcpu; + struct vm_exit *vme; + + vcpu = &vm->vcpu[vcpuid]; + vme = &vcpu->exitinfo; + + KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", + __func__, vme->inst_length)); + + ftype = vme->u.paging.fault_type; + KASSERT(ftype == VM_PROT_READ || + ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE, + ("vm_handle_paging: invalid fault_type %d", ftype)); + + if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) { + rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace), + vme->u.paging.gpa, ftype); + if (rv == 0) { + VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %#lx", + ftype == VM_PROT_READ ? 
"accessed" : "dirty", + vme->u.paging.gpa); + goto done; + } } -#endif + map = &vm->vmspace->vm_map; + rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL); + + VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, " + "ftype = %d", rv, vme->u.paging.gpa, ftype); + + if (rv != KERN_SUCCESS) + return (EFAULT); +done: return (0); } @@ -962,11 +1581,14 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu) mem_region_read_t mread; mem_region_write_t mwrite; enum vm_cpu_mode cpu_mode; - int cs_d, error, length; + int cs_d, error, fault; vcpu = &vm->vcpu[vcpuid]; vme = &vcpu->exitinfo; + KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", + __func__, vme->inst_length)); + gla = vme->u.inst_emul.gla; gpa = vme->u.inst_emul.gpa; cs_base = vme->u.inst_emul.cs_base; @@ -979,37 +1601,31 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu) /* Fetch, decode and emulate the faulting instruction */ if (vie->num_valid == 0) { - /* - * If the instruction length is not known then assume a - * maximum size instruction. - */ - length = vme->inst_length ? vme->inst_length : VIE_INST_SIZE; error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip + - cs_base, length, vie); + cs_base, VIE_INST_SIZE, vie, &fault); } else { /* * The instruction bytes have already been copied into 'vie' */ - error = 0; + error = fault = 0; } - if (error == 1) - return (0); /* Resume guest to handle page fault */ - else if (error == -1) - return (EFAULT); - else if (error != 0) - panic("%s: vmm_fetch_instruction error %d", __func__, error); + if (error || fault) + return (error); - if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0) - return (EFAULT); + if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0) { + VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %#lx", + vme->rip + cs_base); + *retu = true; /* dump instruction bytes in userspace */ + return (0); + } /* - * If the instruction length was not specified then update it now - * along with 'nextrip'. + * Update 'nextrip' based on the length of the emulated instruction. */ - if (vme->inst_length == 0) { - vme->inst_length = vie->num_processed; - vcpu->nextrip += vie->num_processed; - } + vme->inst_length = vie->num_processed; + vcpu->nextrip += vie->num_processed; + VCPU_CTR1(vm, vcpuid, "nextrip updated to %#lx after instruction " + "decoding", vcpu->nextrip); /* return to userland unless this is an in-kernel emulated device */ if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { @@ -1032,47 +1648,394 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu) return (error); } +static int +vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu) +{ +#ifdef __FreeBSD__ + int i, done; + struct vcpu *vcpu; + + done = 0; +#else + int i; + struct vcpu *vcpu; +#endif + vcpu = &vm->vcpu[vcpuid]; + + CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus); + + /* + * Wait until all 'active_cpus' have suspended themselves. + */ + vcpu_lock(vcpu); + while (1) { + if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { + VCPU_CTR0(vm, vcpuid, "All vcpus suspended"); + break; + } + + VCPU_CTR0(vm, vcpuid, "Sleeping during suspend"); + vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); +#ifdef __FreeBSD__ + msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz); +#else + /* + * To prevent vm_handle_suspend from becoming stuck in the + * kernel if the bhyve process driving its vCPUs is killed, + * offer a bail-out, even though not all the vCPUs have reached + * the suspended state. 
+ */ + if (cv_reltimedwait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m, + hz, TR_CLOCK_TICK) <= 0) { + if ((curproc->p_flag & SEXITING) != 0) { + vcpu_require_state_locked(vm, vcpuid, + VCPU_FROZEN); + break; + } + } +#endif + vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); + } + vcpu_unlock(vcpu); + + /* + * Wakeup the other sleeping vcpus and return to userspace. + */ + for (i = 0; i < vm->maxcpus; i++) { + if (CPU_ISSET(i, &vm->suspended_cpus)) { + vcpu_notify_event(vm, i, false); + } + } + + *retu = true; + return (0); +} + +static int +vm_handle_reqidle(struct vm *vm, int vcpuid, bool *retu) +{ + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle)); + vcpu->reqidle = 0; + vcpu_unlock(vcpu); + *retu = true; + return (0); +} + +#ifndef __FreeBSD__ +static int +vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme) +{ + struct vcpu *cpu = &vm->vcpu[vcpuid]; + const uint32_t code = vme->u.msr.code; + const uint64_t val = vme->u.msr.wval; + + switch (code) { + case MSR_TSC: + cpu->tsc_offset = val - rdtsc(); + return (0); + } + + return (-1); +} +#endif /* __FreeBSD__ */ + +int +vm_suspend(struct vm *vm, enum vm_suspend_how how) +{ + int i; + + if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST) + return (EINVAL); + + if (atomic_cmpset_int((uint_t *)&vm->suspend, 0, how) == 0) { + VM_CTR2(vm, "virtual machine already suspended %d/%d", + vm->suspend, how); + return (EALREADY); + } + + VM_CTR1(vm, "virtual machine successfully suspended %d", how); + + /* + * Notify all active vcpus that they are now suspended. + */ + for (i = 0; i < vm->maxcpus; i++) { + if (CPU_ISSET(i, &vm->active_cpus)) + vcpu_notify_event(vm, i, false); + } + + return (0); +} + +void +vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip) +{ + struct vm_exit *vmexit; + + KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST, + ("vm_exit_suspended: invalid suspend type %d", vm->suspend)); + + vmexit = vm_exitinfo(vm, vcpuid); + vmexit->rip = rip; + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_SUSPENDED; + vmexit->u.suspended.how = vm->suspend; +} + +void +vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip) +{ + struct vm_exit *vmexit; + + vmexit = vm_exitinfo(vm, vcpuid); + vmexit->rip = rip; + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_DEBUG; +} + +void +vm_exit_runblock(struct vm *vm, int vcpuid, uint64_t rip) +{ + struct vm_exit *vmexit; + + vmexit = vm_exitinfo(vm, vcpuid); + vmexit->rip = rip; + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_RUNBLOCK; + vmm_stat_incr(vm, vcpuid, VMEXIT_RUNBLOCK, 1); +} + +void +vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip) +{ + struct vm_exit *vmexit; + + vmexit = vm_exitinfo(vm, vcpuid); + vmexit->rip = rip; + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_REQIDLE; + vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1); +} + +void +vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip) +{ + struct vm_exit *vmexit; + + vmexit = vm_exitinfo(vm, vcpuid); + vmexit->rip = rip; + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_BOGUS; + vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1); +} + +#ifndef __FreeBSD__ +/* + * Some vmm resources, such as the lapic, may have CPU-specific resources + * allocated to them which would benefit from migration onto the host CPU which + * is processing the vcpu state. 
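+ * The vlapic's cyclic-backed timer is the prime example: re-homing it
+ * to the CPU now running the vcpu keeps its callouts from having to
+ * cross-call another CPU.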
+ */ +static void +vm_localize_resources(struct vm *vm, struct vcpu *vcpu) +{ + /* + * Localizing cyclic resources requires acquisition of cpu_lock, and + * doing so with kpreempt disabled is a recipe for deadlock disaster. + */ + VERIFY(curthread->t_preempt == 0); + + /* + * Do not bother with localization if this vCPU is about to return to + * the host CPU it was last localized to. + */ + if (vcpu->lastloccpu == curcpu) + return; + + /* + * Localize system-wide resources to the primary boot vCPU. While any + * of the other vCPUs may access them, it keeps the potential interrupt + * footprint constrained to CPUs involved with this instance. + */ + if (vcpu == &vm->vcpu[0]) { + vhpet_localize_resources(vm->vhpet); + vrtc_localize_resources(vm->vrtc); + vatpit_localize_resources(vm->vatpit); + } + + vlapic_localize_resources(vcpu->vlapic); + + vcpu->lastloccpu = curcpu; +} + +static void +vmm_savectx(void *arg) +{ + vm_thread_ctx_t *vtc = arg; + struct vm *vm = vtc->vtc_vm; + const int vcpuid = vtc->vtc_vcpuid; + + if (ops->vmsavectx != NULL) { + ops->vmsavectx(vm->cookie, vcpuid); + } + + /* + * If the CPU holds the restored guest FPU state, save it and restore + * the host FPU state before this thread goes off-cpu. + */ + if ((vtc->vtc_status & VTCS_FPU_RESTORED) != 0) { + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + + save_guest_fpustate(vcpu); + vtc->vtc_status &= ~VTCS_FPU_RESTORED; + } +} + +static void +vmm_restorectx(void *arg) +{ + vm_thread_ctx_t *vtc = arg; + struct vm *vm = vtc->vtc_vm; + const int vcpuid = vtc->vtc_vcpuid; + + /* + * When coming back on-cpu, only restore the guest FPU status if the + * thread is in a context marked as requiring it. This should be rare, + * occurring only when a future logic error results in a voluntary + * sleep during the VMRUN critical section. + * + * The common case will result in elision of the guest FPU state + * restoration, deferring that action until it is clearly necessary + * during vm_run. + */ + VERIFY((vtc->vtc_status & VTCS_FPU_RESTORED) == 0); + if ((vtc->vtc_status & VTCS_FPU_CTX_CRITICAL) != 0) { + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + + restore_guest_fpustate(vcpu); + vtc->vtc_status |= VTCS_FPU_RESTORED; + } + + if (ops->vmrestorectx != NULL) { + ops->vmrestorectx(vm->cookie, vcpuid); + } + +} + +/* + * If we're in removectx(), we might still have state to tidy up. 
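+ * removectx() fires the free callback for a context it tears down, so
+ * routing it through vmm_savectx() ensures the host FPU state is put
+ * back even if the thread is reaped with guest FPU state still live.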
+ */ +static void +vmm_freectx(void *arg, int isexec) +{ + vmm_savectx(arg); +} + +#endif /* __FreeBSD */ + int vm_run(struct vm *vm, struct vm_run *vmrun) { + struct vm_eventinfo evinfo; int error, vcpuid; struct vcpu *vcpu; +#ifdef __FreeBSD__ struct pcb *pcb; +#endif uint64_t tscval; struct vm_exit *vme; bool retu, intr_disabled; + pmap_t pmap; +#ifndef __FreeBSD__ + vm_thread_ctx_t vtc; + int affinity_type = CPU_CURRENT; +#endif vcpuid = vmrun->cpuid; - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + if (!CPU_ISSET(vcpuid, &vm->active_cpus)) + return (EINVAL); + + if (CPU_ISSET(vcpuid, &vm->suspended_cpus)) return (EINVAL); + pmap = vmspace_pmap(vm->vmspace); vcpu = &vm->vcpu[vcpuid]; vme = &vcpu->exitinfo; + evinfo.rptr = &vcpu->runblock; + evinfo.sptr = &vm->suspend; + evinfo.iptr = &vcpu->reqidle; + +#ifndef __FreeBSD__ + vtc.vtc_vm = vm; + vtc.vtc_vcpuid = vcpuid; + vtc.vtc_status = 0; + + installctx(curthread, &vtc, vmm_savectx, vmm_restorectx, NULL, NULL, + NULL, vmm_freectx); +#endif + restart: +#ifndef __FreeBSD__ + thread_affinity_set(curthread, affinity_type); + /* + * Resource localization should happen after the CPU affinity for the + * thread has been set to ensure that access from restricted contexts, + * such as VMX-accelerated APIC operations, can occur without inducing + * cyclic cross-calls. + * + * This must be done prior to disabling kpreempt via critical_enter(). + */ + vm_localize_resources(vm, vcpu); + + affinity_type = CPU_CURRENT; +#endif + critical_enter(); + KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), + ("vm_run: absurd pm_active")); + tscval = rdtsc(); #ifdef __FreeBSD__ pcb = PCPU_GET(curpcb); set_pcb_flags(pcb, PCB_FULL_IRET); +#else + /* Force a trip through update_sregs to reload %fs/%gs and friends */ + PCB_SET_UPDATE_SEGS(&ttolwp(curthread)->lwp_pcb); #endif -#ifndef __FreeBSD__ - installctx(curthread, vcpu, save_guest_fpustate, - restore_guest_fpustate, NULL, NULL, NULL, NULL); -#endif +#ifdef __FreeBSD__ restore_guest_fpustate(vcpu); +#else + if ((vtc.vtc_status & VTCS_FPU_RESTORED) == 0) { + restore_guest_fpustate(vcpu); + vtc.vtc_status |= VTCS_FPU_RESTORED; + } + vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL; +#endif vcpu_require_state(vm, vcpuid, VCPU_RUNNING); - error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip); + error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap, &evinfo); vcpu_require_state(vm, vcpuid, VCPU_FROZEN); +#ifdef __FreeBSD__ save_guest_fpustate(vcpu); +#else + vtc.vtc_status &= ~VTCS_FPU_CTX_CRITICAL; +#endif + #ifndef __FreeBSD__ - removectx(curthread, vcpu, save_guest_fpustate, - restore_guest_fpustate, NULL, NULL, NULL, NULL); + /* + * Once clear of the delicate contexts comprising the VM_RUN handler, + * thread CPU affinity can be loosened while other processing occurs. 
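Stripped of platform detail, the vm_run() body below is a run/dispatch loop: enter the guest, classify the exit, consume it in-kernel when possible, and bounce to userland otherwise. A hypothetical stub-level sketch of that shape (the toy_* names and exit selection are invented; only the paging case is handled "kernel-side" here):

#include <stdbool.h>
#include <stdio.h>

enum toy_exit { TX_PAGING, TX_HLT, TX_INOUT, TX_USERLAND };

/* Stand-in for VMRUN(): pretend the guest exited for some reason. */
static int
toy_vmrun(enum toy_exit *why, int iter)
{
	*why = (iter < 2) ? TX_PAGING : TX_USERLAND;
	return (0);
}

int
main(void)
{
	enum toy_exit why;
	bool retu;
	int error, iter = 0;

	do {
		retu = false;
		error = toy_vmrun(&why, iter++);
		if (error != 0)
			break;
		switch (why) {
		case TX_PAGING:
			/* consumed in-kernel; loop straight back in */
			printf("paging exit handled, resuming guest\n");
			break;
		default:
			retu = true;	/* hand the exit to userland */
			break;
		}
	} while (error == 0 && !retu);

	printf("returning to userland: error=%d\n", error);
	return (0);
}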
+ */ + thread_affinity_clear(curthread); #endif vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval); @@ -1083,10 +2046,25 @@ restart: retu = false; vcpu->nextrip = vme->rip + vme->inst_length; switch (vme->exitcode) { + case VM_EXITCODE_REQIDLE: + error = vm_handle_reqidle(vm, vcpuid, &retu); + break; + case VM_EXITCODE_SUSPENDED: + error = vm_handle_suspend(vm, vcpuid, &retu); + break; + case VM_EXITCODE_IOAPIC_EOI: + vioapic_process_eoi(vm, vcpuid, + vme->u.ioapic_eoi.vector); + break; + case VM_EXITCODE_RUNBLOCK: + break; case VM_EXITCODE_HLT: intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu); break; + case VM_EXITCODE_PAGING: + error = vm_handle_paging(vm, vcpuid, &retu); + break; case VM_EXITCODE_INST_EMUL: error = vm_handle_inst_emul(vm, vcpuid, &retu); break; @@ -1094,18 +2072,42 @@ restart: case VM_EXITCODE_INOUT_STR: error = vm_handle_inout(vm, vcpuid, vme, &retu); break; + case VM_EXITCODE_MONITOR: + case VM_EXITCODE_MWAIT: + case VM_EXITCODE_VMINSN: + vm_inject_ud(vm, vcpuid); + break; +#ifndef __FreeBSD__ + case VM_EXITCODE_WRMSR: + if (vm_handle_wrmsr(vm, vcpuid, vme) != 0) { + retu = true; + } + break; + + case VM_EXITCODE_HT: { + affinity_type = CPU_BEST; + break; + } + +#endif default: retu = true; /* handled in userland */ break; } } - if (error == 0 && retu == false) { + if (error == 0 && retu == false) goto restart; - } + +#ifndef __FreeBSD__ + removectx(curthread, &vtc, vmm_savectx, vmm_restorectx, NULL, NULL, + NULL, vmm_freectx); +#endif + + VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode); /* copy the exit information */ - bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit)); + bcopy(vme, &vmrun->vm_exit, sizeof (struct vm_exit)); return (error); } @@ -1119,7 +2121,7 @@ vm_restart_instruction(void *arg, int vcpuid) int error; vm = arg; - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; @@ -1158,7 +2160,7 @@ vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info) struct vcpu *vcpu; int type, vector; - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; @@ -1262,9 +2264,7 @@ nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2, if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) { VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)", info1, info2); -#ifdef __FreeBSD__ vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT); -#endif *retinfo = 0; return (0); } @@ -1293,11 +2293,11 @@ vcpu_exception_intinfo(struct vcpu *vcpu) uint64_t info = 0; if (vcpu->exception_pending) { - info = vcpu->exception.vector & 0xff; + info = vcpu->exc_vector & 0xff; info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; - if (vcpu->exception.error_code_valid) { + if (vcpu->exc_errcode_valid) { info |= VM_INTINFO_DEL_ERRCODE; - info |= (uint64_t)vcpu->exception.error_code << 32; + info |= (uint64_t)vcpu->exc_errcode << 32; } } return (info); @@ -1310,7 +2310,8 @@ vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo) uint64_t info1, info2; int valid; - KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid)); + KASSERT(vcpuid >= 0 && + vcpuid < vm->maxcpus, ("invalid vcpu %d", vcpuid)); vcpu = &vm->vcpu[vcpuid]; @@ -1322,7 +2323,7 @@ vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo) info2 = vcpu_exception_intinfo(vcpu); vcpu->exception_pending = 0; VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx", - 
vcpu->exception.vector, info2); + vcpu->exc_vector, info2); } if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) { @@ -1346,76 +2347,93 @@ vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo) } int -vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception) +vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2) { struct vcpu *vcpu; - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + *info1 = vcpu->exitintinfo; + *info2 = vcpu_exception_intinfo(vcpu); + return (0); +} + +int +vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid, + uint32_t errcode, int restart_instruction) +{ + struct vcpu *vcpu; + uint64_t regval; + int error; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + if (vector < 0 || vector >= 32) return (EINVAL); - if (exception->vector < 0 || exception->vector >= 32) + /* + * A double fault exception should never be injected directly into + * the guest. It is a derived exception that results from specific + * combinations of nested faults. + */ + if (vector == IDT_DF) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; if (vcpu->exception_pending) { VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to " - "pending exception %d", exception->vector, - vcpu->exception.vector); + "pending exception %d", vector, vcpu->exc_vector); return (EBUSY); } - vcpu->exception_pending = 1; - vcpu->exception = *exception; - VCPU_CTR1(vm, vcpuid, "Exception %d pending", exception->vector); - return (0); -} + if (errcode_valid) { + /* + * Exceptions don't deliver an error code in real mode. + */ + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, ®val); + KASSERT(!error, ("%s: error %d getting CR0", __func__, error)); + if (!(regval & CR0_PE)) + errcode_valid = 0; + } -int -vm_exception_pending(struct vm *vm, int vcpuid, struct vm_exception *exception) -{ - struct vcpu *vcpu; - int pending; + /* + * From section 26.6.1 "Interruptibility State" in Intel SDM: + * + * Event blocking by "STI" or "MOV SS" is cleared after guest executes + * one instruction or incurs an exception. 
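+	 *
+	 * Injecting the exception stands in for the guest "incurring" one,
+	 * so the interrupt shadow is cleared here to match the SDM rule
+	 * quoted above.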
+ */ + error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0); + KASSERT(error == 0, ("%s: error %d clearing interrupt shadow", + __func__, error)); - KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid)); + if (restart_instruction) + vm_restart_instruction(vm, vcpuid); - vcpu = &vm->vcpu[vcpuid]; - pending = vcpu->exception_pending; - if (pending) { - vcpu->exception_pending = 0; - *exception = vcpu->exception; - VCPU_CTR1(vm, vcpuid, "Exception %d delivered", - exception->vector); - } - return (pending); + vcpu->exception_pending = 1; + vcpu->exc_vector = vector; + vcpu->exc_errcode = errcode; + vcpu->exc_errcode_valid = errcode_valid; + VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector); + return (0); } void vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid, int errcode) { - struct vm_exception exception; - struct vm_exit *vmexit; struct vm *vm; - int error; + int error, restart_instruction; vm = vmarg; + restart_instruction = 1; - exception.vector = vector; - exception.error_code = errcode; - exception.error_code_valid = errcode_valid; - error = vm_inject_exception(vm, vcpuid, &exception); + error = vm_inject_exception(vm, vcpuid, vector, errcode_valid, + errcode, restart_instruction); KASSERT(error == 0, ("vm_inject_exception error %d", error)); - - /* - * A fault-like exception allows the instruction to be restarted - * after the exception handler returns. - * - * By setting the inst_length to 0 we ensure that the instruction - * pointer remains at the faulting instruction. - */ - vmexit = vm_exitinfo(vm, vcpuid); - vmexit->inst_length = 0; } void @@ -1441,14 +2459,13 @@ vm_inject_nmi(struct vm *vm, int vcpuid) { struct vcpu *vcpu; - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; vcpu->nmi_pending = 1; vcpu_notify_event(vm, vcpuid, false); - return (0); } @@ -1457,7 +2474,7 @@ vm_nmi_pending(struct vm *vm, int vcpuid) { struct vcpu *vcpu; - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; @@ -1470,7 +2487,7 @@ vm_nmi_clear(struct vm *vm, int vcpuid) { struct vcpu *vcpu; - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; @@ -1489,14 +2506,13 @@ vm_inject_extint(struct vm *vm, int vcpuid) { struct vcpu *vcpu; - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; vcpu->extint_pending = 1; vcpu_notify_event(vm, vcpuid, false); - return (0); } @@ -1505,7 +2521,7 @@ vm_extint_pending(struct vm *vm, int vcpuid) { struct vcpu *vcpu; - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) panic("vm_extint_pending: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; @@ -1518,7 +2534,7 @@ vm_extint_clear(struct vm *vm, int vcpuid) { struct vcpu *vcpu; - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) panic("vm_extint_pending: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; @@ -1533,7 +2549,7 @@ vm_extint_clear(struct vm *vm, int vcpuid) int vm_get_capability(struct vm *vm, int vcpu, int type, int *retval) { - if (vcpu < 0 || vcpu >= VM_MAXCPU) + if (vcpu < 0 || vcpu >= vm->maxcpus) return (EINVAL); if (type < 0 || type >= VM_CAP_MAX) @@ -1545,7 +2561,7 @@ vm_get_capability(struct vm *vm, int vcpu, int type, 
int *retval) int vm_set_capability(struct vm *vm, int vcpu, int type, int val) { - if (vcpu < 0 || vcpu >= VM_MAXCPU) + if (vcpu < 0 || vcpu >= vm->maxcpus) return (EINVAL); if (type < 0 || type >= VM_CAP_MAX) @@ -1554,22 +2570,24 @@ vm_set_capability(struct vm *vm, int vcpu, int type, int val) return (VMSETCAP(vm->cookie, vcpu, type, val)); } -struct vhpet * -vm_hpet(struct vm *vm) +struct vlapic * +vm_lapic(struct vm *vm, int cpu) { - return (vm->vhpet); + return (vm->vcpu[cpu].vlapic); } struct vioapic * vm_ioapic(struct vm *vm) { + return (vm->vioapic); } -struct vlapic * -vm_lapic(struct vm *vm, int cpu) +struct vhpet * +vm_hpet(struct vm *vm) { - return (vm->vcpu[cpu].vlapic); + + return (vm->vhpet); } #ifdef __FreeBSD__ @@ -1594,7 +2612,7 @@ vmm_is_pptdev(int bus, int slot, int func) /* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */ found = 0; for (i = 0; names[i] != NULL && !found; i++) { - cp = val = getenv(names[i]); + cp = val = kern_getenv(names[i]); while (cp != NULL && *cp != '\0') { if ((cp2 = strchr(cp, ' ')) != NULL) *cp2 = '\0'; @@ -1630,13 +2648,13 @@ vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate, int error; struct vcpu *vcpu; - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) panic("vm_set_run_state: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; vcpu_lock(vcpu); - error = vcpu_set_state_locked(vcpu, newstate, from_idle); + error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle); vcpu_unlock(vcpu); return (error); @@ -1648,7 +2666,7 @@ vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu) struct vcpu *vcpu; enum vcpu_state state; - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) panic("vm_get_run_state: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; @@ -1662,11 +2680,67 @@ vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu) return (state); } +void +vcpu_block_run(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vcpu_block_run: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + vcpu->runblock++; + if (vcpu->runblock == 1 && vcpu->state == VCPU_RUNNING) { + vcpu_notify_event_locked(vcpu, false); + } + while (vcpu->state == VCPU_RUNNING) { +#ifdef __FreeBSD__ + msleep_spin(&vcpu->state, &vcpu->mtx, "vcpublk", 0); +#else + cv_wait(&vcpu->state_cv, &vcpu->mtx.m); +#endif + } + vcpu_unlock(vcpu); +} + +void +vcpu_unblock_run(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vcpu_block_run: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + KASSERT(vcpu->runblock != 0, ("expected non-zero runblock")); + vcpu->runblock--; + if (vcpu->runblock == 0) { +#ifdef __FreeBSD__ + wakeup(&vcpu->state); +#else + cv_broadcast(&vcpu->state_cv); +#endif + } + vcpu_unlock(vcpu); +} + +#ifndef __FreeBSD__ +uint64_t +vcpu_tsc_offset(struct vm *vm, int vcpuid) +{ + return (vm->vcpu[vcpuid].tsc_offset); +} +#endif /* __FreeBSD__ */ + int vm_activate_cpu(struct vm *vm, int vcpuid) { - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); if (CPU_ISSET(vcpuid, &vm->active_cpus)) @@ -1677,6 +2751,55 @@ vm_activate_cpu(struct vm *vm, int vcpuid) return (0); } +int +vm_suspend_cpu(struct vm *vm, int vcpuid) +{ + int i; + + if (vcpuid < -1 || vcpuid >= vm->maxcpus) + return (EINVAL); + + if (vcpuid == -1) { + vm->debug_cpus = vm->active_cpus; + for (i = 0; i < vm->maxcpus; i++) { 
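+			/*
+			 * Every active vcpu has just been flagged for
+			 * debug; poke each one so it notices and parks.
+			 */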
+ if (CPU_ISSET(i, &vm->active_cpus)) + vcpu_notify_event(vm, i, false); + } + } else { + if (!CPU_ISSET(vcpuid, &vm->active_cpus)) + return (EINVAL); + + CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus); + vcpu_notify_event(vm, vcpuid, false); + } + return (0); +} + +int +vm_resume_cpu(struct vm *vm, int vcpuid) +{ + + if (vcpuid < -1 || vcpuid >= vm->maxcpus) + return (EINVAL); + + if (vcpuid == -1) { + CPU_ZERO(&vm->debug_cpus); + } else { + if (!CPU_ISSET(vcpuid, &vm->debug_cpus)) + return (EINVAL); + + CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus); + } + return (0); +} + +int +vcpu_debugged(struct vm *vm, int vcpuid) +{ + + return (CPU_ISSET(vcpuid, &vm->debug_cpus)); +} + cpuset_t vm_active_cpus(struct vm *vm) { @@ -1684,6 +2807,20 @@ vm_active_cpus(struct vm *vm) return (vm->active_cpus); } +cpuset_t +vm_debug_cpus(struct vm *vm) +{ + + return (vm->debug_cpus); +} + +cpuset_t +vm_suspended_cpus(struct vm *vm) +{ + + return (vm->suspended_cpus); +} + void * vcpu_stats(struct vm *vm, int vcpuid) { @@ -1694,7 +2831,7 @@ vcpu_stats(struct vm *vm, int vcpuid) int vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state) { - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); *state = vm->vcpu[vcpuid].x2apic_state; @@ -1705,7 +2842,7 @@ vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state) int vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) { - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); if (state >= X2APIC_STATE_LAST) @@ -1725,15 +2862,11 @@ vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) * - If the vcpu is running on a different host_cpu then an IPI will be directed * to the host_cpu to cause the vcpu to trap into the hypervisor. 
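
The suspend/resume pair above treats a vcpuid of -1 as "all vCPUs". A hypothetical consumer (this helper is illustrative and assumes the vmm declarations above are in scope) might bracket a debugger-style inspection like so:

static int
inspect_all_vcpus(struct vm *vm)
{
	int error;

	/* -1 copies active_cpus into debug_cpus and kicks each vCPU */
	if ((error = vm_suspend_cpu(vm, -1)) != 0)
		return (error);

	/* ... walk vm_debug_cpus(vm) and read guest state here ... */

	/* -1 clears the entire debug set */
	return (vm_resume_cpu(vm, -1));
}
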
*/ -void -vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr) +static void +vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr) { int hostcpu; - struct vcpu *vcpu; - vcpu = &vm->vcpu[vcpuid]; - - vcpu_lock(vcpu); hostcpu = vcpu->hostcpu; if (vcpu->state == VCPU_RUNNING) { KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); @@ -1755,12 +2888,33 @@ vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr) } else { KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent " "with hostcpu %d", vcpu->state, hostcpu)); - if (vcpu->state == VCPU_SLEEPING) + if (vcpu->state == VCPU_SLEEPING) { +#ifdef __FreeBSD__ wakeup_one(vcpu); +#else + cv_signal(&vcpu->vcpu_cv); +#endif + } } +} + +void +vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr) +{ + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + vcpu_notify_event_locked(vcpu, lapic_intr); vcpu_unlock(vcpu); } +struct vmspace * +vm_get_vmspace(struct vm *vm) +{ + + return (vm->vmspace); +} + int vm_apicid2vcpuid(struct vm *vm, int apicid) { @@ -1782,6 +2936,20 @@ vm_atpit(struct vm *vm) return (vm->vatpit); } +struct vpmtmr * +vm_pmtmr(struct vm *vm) +{ + + return (vm->vpmtmr); +} + +struct vrtc * +vm_rtc(struct vm *vm) +{ + + return (vm->vrtc); +} + enum vm_reg_name vm_segment_name(int seg) { @@ -1805,19 +2973,17 @@ vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, { int idx; -#ifdef __FreeBSD__ for (idx = 0; idx < num_copyinfo; idx++) { if (copyinfo[idx].cookie != NULL) vm_gpa_release(copyinfo[idx].cookie); } -#endif bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo)); } int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, - int num_copyinfo) + int num_copyinfo, int *fault) { int error, idx, nused; size_t n, off, remaining; @@ -1830,8 +2996,8 @@ vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, remaining = len; while (remaining > 0) { KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo")); - error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa); - if (error) + error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault); + if (error || *fault) return (error); off = gpa & PAGE_MASK; n = min(remaining, PAGE_SIZE - off); @@ -1843,8 +3009,8 @@ vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, } for (idx = 0; idx < nused; idx++) { - hva = vm_gpa_hold(vm, copyinfo[idx].gpa, copyinfo[idx].len, - prot, &cookie); + hva = vm_gpa_hold(vm, vcpuid, copyinfo[idx].gpa, + copyinfo[idx].len, prot, &cookie); if (hva == NULL) break; copyinfo[idx].hva = hva; @@ -1853,8 +3019,9 @@ vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, if (idx != nused) { vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo); - return (-1); + return (EFAULT); } else { + *fault = 0; return (0); } } @@ -1892,3 +3059,125 @@ vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, idx++; } } + +/* + * Return the amount of in-use and wired memory for the VM. 
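
The reworked vm_copy_setup() above establishes a three-way convention: a non-zero return (EFAULT) is a true emulation failure, while a zero return with *fault set means an exception was injected and the guest should simply be resumed. A sketch of the resulting caller discipline (the helper itself is hypothetical; the called functions are those from the patch):

static int
copy_from_guest(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, void *buf, size_t len)
{
	struct vm_copyinfo copyinfo[2];
	int error, fault;

	error = vm_copy_setup(vm, vcpuid, paging, gla, len, PROT_READ,
	    copyinfo, nitems(copyinfo), &fault);
	if (error != 0 || fault != 0)
		return (error);	/* on fault: resume guest, not an error */

	vm_copyin(vm, vcpuid, copyinfo, buf, len);
	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
	return (0);
}
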
Since
+ * these are global stats, only return the values for vCPU 0
+ */
+VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
+VMM_STAT_DECLARE(VMM_MEM_WIRED);
+
+static void
+vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
+{
+
+	if (vcpu == 0) {
+		vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
+		    PAGE_SIZE * vmspace_resident_count(vm->vmspace));
+	}
+}
+
+static void
+vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
+{
+
+	if (vcpu == 0) {
+		vmm_stat_set(vm, vcpu, VMM_MEM_WIRED,
+		    PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace)));
+	}
+}
+
+VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
+VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);
+
+#ifndef __FreeBSD__
+int
+vm_ioport_hook(struct vm *vm, uint_t ioport, vmm_rmem_cb_t rfunc,
+    vmm_wmem_cb_t wfunc, void *arg, void **cookie)
+{
+	list_t *ih = &vm->ioport_hooks;
+	vm_ioport_hook_t *hook, *node;
+
+	if (ioport == 0) {
+		return (EINVAL);
+	}
+
+	/*
+	 * Find the node position in the list which this region should be
+	 * inserted behind to maintain sorted order.
+	 */
+	for (node = list_tail(ih); node != NULL; node = list_prev(ih, node)) {
+		if (ioport == node->vmih_ioport) {
+			/* Reject duplicate port hook */
+			return (EEXIST);
+		} else if (ioport > node->vmih_ioport) {
+			break;
+		}
+	}
+
+	hook = kmem_alloc(sizeof (*hook), KM_SLEEP);
+	hook->vmih_ioport = ioport;
+	hook->vmih_arg = arg;
+	hook->vmih_rmem_cb = rfunc;
+	hook->vmih_wmem_cb = wfunc;
+	if (node == NULL) {
+		list_insert_head(ih, hook);
+	} else {
+		list_insert_after(ih, node, hook);
+	}
+
+	*cookie = (void *)hook;
+	return (0);
+}
+
+void
+vm_ioport_unhook(struct vm *vm, void **cookie)
+{
+	vm_ioport_hook_t *hook;
+	list_t *ih = &vm->ioport_hooks;
+
+	hook = *cookie;
+	list_remove(ih, hook);
+	kmem_free(hook, sizeof (*hook));
+	*cookie = NULL;
+}
+
+int
+vm_ioport_handle_hook(struct vm *vm, int cpuid, bool in, int port, int bytes,
+    uint32_t *val)
+{
+	vm_ioport_hook_t *hook;
+	list_t *ih = &vm->ioport_hooks;
+	int err = 0;
+
+	for (hook = list_head(ih); hook != NULL; hook = list_next(ih, hook)) {
+		if (hook->vmih_ioport == port) {
+			break;
+		}
+	}
+	if (hook == NULL) {
+		return (ENOENT);
+	}
+
+	if (in) {
+		uint64_t tval;
+
+		if (hook->vmih_rmem_cb == NULL) {
+			return (ENOENT);
+		}
+		err = hook->vmih_rmem_cb(hook->vmih_arg, (uintptr_t)port,
+		    (uint_t)bytes, &tval);
+		*val = (uint32_t)tval;
+	} else {
+		if (hook->vmih_wmem_cb == NULL) {
+			return (ENOENT);
+		}
+		err = hook->vmih_wmem_cb(hook->vmih_arg, (uintptr_t)port,
+		    (uint_t)bytes, (uint64_t)*val);
+	}
+
+	return (err);
+}
+
+
+#endif /* __FreeBSD__ */
diff --git a/usr/src/uts/i86pc/io/vmm/vmm.mapfile b/usr/src/uts/i86pc/io/vmm/vmm.mapfile
new file mode 100644
index 0000000000..83c14de895
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/vmm.mapfile
@@ -0,0 +1,62 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2019 Joyent, Inc.
+#
+
+#
+# MAPFILE HEADER START
+#
+# WARNING: STOP NOW. DO NOT MODIFY THIS FILE.
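
To make the hook interface above concrete, here is a hypothetical consumer emulating a one-byte scratch register (the port number, callback names, and callback signatures are inferred from the call sites above and are illustrative only):

static uint64_t scratch_val;
static void *scratch_cookie;

static int
scratch_rd(void *arg, uintptr_t port, uint_t bytes, uint64_t *valp)
{
	*valp = scratch_val;
	return (0);
}

static int
scratch_wr(void *arg, uintptr_t port, uint_t bytes, uint64_t val)
{
	scratch_val = val;
	return (0);
}

static int
scratch_attach(struct vm *vm)
{
	/* sorted insertion rejects duplicate hooks on the same port */
	return (vm_ioport_hook(vm, 0x510, scratch_rd, scratch_wr, NULL,
	    &scratch_cookie));
}
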
+# Object versioning must comply with the rules detailed in +# +# usr/src/lib/README.mapfiles +# +# You should not be making modifications here until you've read the most current +# copy of that file. If you need help, contact a gatekeeper for guidance. +# +# MAPFILE HEADER END +# + +$mapfile_version 2 + +SYMBOL_VERSION ILLUMOSprivate { + global: + # DDI Interfaces + _fini; + _init; + _info; + + # bhyve driver API + vmm_drv_hold; + vmm_drv_rele; + vmm_drv_release_reqd; + vmm_drv_lease_sign; + vmm_drv_lease_break; + vmm_drv_lease_expired; + vmm_drv_gpa2kva; + vmm_drv_ioport_hook; + vmm_drv_ioport_unhook; + vmm_drv_msi; + + # IOMMU API for PCI pass-thru + iommu_add_device; + iommu_host_domain; + iommu_remove_device; + lapic_intr_msi; + vm_iommu_domain; + vm_map_mmio; + vm_unmap_mmio; + + local: + *; +}; diff --git a/usr/src/uts/i86pc/io/vmm/vmm_host.c b/usr/src/uts/i86pc/io/vmm/vmm_host.c index b94caf4009..9e390c93dd 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_host.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_host.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/vmm_host.c 242275 2012-10-29 01:51:24Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -39,7 +41,7 @@ */ #include -__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_host.c 242275 2012-10-29 01:51:24Z neel $"); +__FBSDID("$FreeBSD$"); #include #include @@ -50,11 +52,14 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_host.c 242275 2012-10-29 01:51:24Z ne #include "vmm_host.h" -static uint64_t vmm_host_efer, vmm_host_pat, vmm_host_cr0, vmm_host_cr4; +static uint64_t vmm_host_efer, vmm_host_pat, vmm_host_cr0, vmm_host_cr4, + vmm_host_xcr0; +static struct xsave_limits vmm_xsave_limits; void vmm_host_state_init(void) { + unsigned int regs[4]; vmm_host_efer = rdmsr(MSR_EFER); vmm_host_pat = rdmsr(MSR_PAT); @@ -68,7 +73,36 @@ vmm_host_state_init(void) */ vmm_host_cr0 = rcr0() | CR0_TS; - vmm_host_cr4 = rcr4(); + /* + * On non-PCID or PCID but without INVPCID support machines, + * we flush kernel i.e. global TLB entries, by temporary + * clearing the CR4.PGE bit, see invltlb_glob(). If + * preemption occurs at the wrong time, cached vmm_host_cr4 + * might store the value with CR4.PGE cleared. Since FreeBSD + * requires support for PG_G on amd64, just set it + * unconditionally. + */ + vmm_host_cr4 = rcr4() | CR4_PGE; + + /* + * Only permit a guest to use XSAVE if the host is using + * XSAVE. Only permit a guest to use XSAVE features supported + * by the host. This ensures that the FPU state used by the + * guest is always a subset of the saved guest FPU state. + * + * In addition, only permit known XSAVE features where the + * rules for which features depend on other features is known + * to properly emulate xsetbv. 
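
The xsave_max_size probe below relies on CPUID leaf 0xD, sub-leaf 0, where EBX reports the XSAVE area size required by the features currently enabled in XCR0. A standalone userspace sketch of the same query (gcc/clang on x86_64; illustrative only):

#include <cpuid.h>
#include <stdio.h>

int
main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (__get_cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx)) {
		printf("xsave size for current XCR0: %u bytes\n", ebx);
		printf("supported XCR0 low bits: 0x%x\n", eax);
	}
	return (0);
}
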
+ */ + if (vmm_host_cr4 & CR4_XSAVE) { + vmm_xsave_limits.xsave_enabled = 1; + vmm_host_xcr0 = rxcr(0); + vmm_xsave_limits.xcr0_allowed = vmm_host_xcr0 & + (XFEATURE_AVX | XFEATURE_MPX | XFEATURE_AVX512); + + cpuid_count(0xd, 0x0, regs); + vmm_xsave_limits.xsave_max_size = regs[1]; + } } uint64_t @@ -99,6 +133,13 @@ vmm_get_host_cr4(void) return (vmm_host_cr4); } +uint64_t +vmm_get_host_xcr0(void) +{ + + return (vmm_host_xcr0); +} + uint64_t vmm_get_host_datasel(void) { @@ -122,7 +163,6 @@ vmm_get_host_codesel(void) #endif } - uint64_t vmm_get_host_tsssel(void) { @@ -158,3 +198,10 @@ vmm_get_host_idtrbase(void) return (idtr.dtr_base); #endif } + +const struct xsave_limits * +vmm_get_xsave_limits(void) +{ + + return (&vmm_xsave_limits); +} diff --git a/usr/src/uts/i86pc/io/vmm/vmm_host.h b/usr/src/uts/i86pc/io/vmm/vmm_host.h index 5de015a228..f12047819d 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_host.h +++ b/usr/src/uts/i86pc/io/vmm/vmm_host.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/vmm_host.h 242275 2012-10-29 01:51:24Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,6 +38,7 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2013 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #ifndef _VMM_HOST_H_ @@ -46,20 +49,28 @@ #endif #ifndef _KERNEL -#error "no user-servicable parts inside" +#error "no user-serviceable parts inside" #endif +struct xsave_limits { + int xsave_enabled; + uint64_t xcr0_allowed; + uint32_t xsave_max_size; +}; + void vmm_host_state_init(void); uint64_t vmm_get_host_pat(void); uint64_t vmm_get_host_efer(void); uint64_t vmm_get_host_cr0(void); uint64_t vmm_get_host_cr4(void); +uint64_t vmm_get_host_xcr0(void); uint64_t vmm_get_host_datasel(void); uint64_t vmm_get_host_codesel(void); uint64_t vmm_get_host_tsssel(void); uint64_t vmm_get_host_fsbase(void); uint64_t vmm_get_host_idtrbase(void); +const struct xsave_limits *vmm_get_xsave_limits(void); /* * Inline access to host state that is used on every VM entry @@ -89,8 +100,10 @@ vmm_get_host_gdtrbase(void) #endif } +#ifdef __FreeBSD__ struct pcpu; extern struct pcpu __pcpu[]; +#endif static __inline uint64_t vmm_get_host_gsbase(void) diff --git a/usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c b/usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c index 72c7056e26..ea96cd8db0 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 Sandvine, Inc. * Copyright (c) 2012 NetApp, Inc. * All rights reserved. @@ -24,7 +26,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/vmm_instruction_emul.c 281987 2015-04-25 19:02:06Z tychon $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -37,15 +39,17 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2015 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. 
*/ #include -__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_instruction_emul.c 281987 2015-04-25 19:02:06Z tychon $"); +__FBSDID("$FreeBSD$"); #ifdef _KERNEL #include #include #include +#include #include #include @@ -84,6 +88,9 @@ enum { VIE_OP_TYPE_MOVS, VIE_OP_TYPE_GROUP1, VIE_OP_TYPE_STOS, + VIE_OP_TYPE_BITTEST, + VIE_OP_TYPE_TWOB_GRP15, + VIE_OP_TYPE_ADD, VIE_OP_TYPE_LAST }; @@ -94,7 +101,12 @@ enum { #define VIE_OP_F_NO_MODRM (1 << 3) #define VIE_OP_F_NO_GLA_VERIFICATION (1 << 4) +#ifdef _KERNEL static const struct vie_op two_byte_opcodes[256] = { + [0xAE] = { + .op_byte = 0xAE, + .op_type = VIE_OP_TYPE_TWOB_GRP15, + }, [0xB6] = { .op_byte = 0xB6, .op_type = VIE_OP_TYPE_MOVZX, @@ -103,6 +115,11 @@ static const struct vie_op two_byte_opcodes[256] = { .op_byte = 0xB7, .op_type = VIE_OP_TYPE_MOVZX, }, + [0xBA] = { + .op_byte = 0xBA, + .op_type = VIE_OP_TYPE_BITTEST, + .op_flags = VIE_OP_F_IMM8, + }, [0xBE] = { .op_byte = 0xBE, .op_type = VIE_OP_TYPE_MOVSX, @@ -110,14 +127,26 @@ static const struct vie_op two_byte_opcodes[256] = { }; static const struct vie_op one_byte_opcodes[256] = { + [0x03] = { + .op_byte = 0x03, + .op_type = VIE_OP_TYPE_ADD, + }, [0x0F] = { .op_byte = 0x0F, .op_type = VIE_OP_TYPE_TWO_BYTE }, + [0x0B] = { + .op_byte = 0x0B, + .op_type = VIE_OP_TYPE_OR, + }, [0x2B] = { .op_byte = 0x2B, .op_type = VIE_OP_TYPE_SUB, }, + [0x39] = { + .op_byte = 0x39, + .op_type = VIE_OP_TYPE_CMP, + }, [0x3B] = { .op_byte = 0x3B, .op_type = VIE_OP_TYPE_CMP, @@ -183,14 +212,20 @@ static const struct vie_op one_byte_opcodes[256] = { .op_byte = 0x23, .op_type = VIE_OP_TYPE_AND, }, + [0x80] = { + /* Group 1 extended opcode */ + .op_byte = 0x80, + .op_type = VIE_OP_TYPE_GROUP1, + .op_flags = VIE_OP_F_IMM8, + }, [0x81] = { - /* XXX Group 1 extended opcode */ + /* Group 1 extended opcode */ .op_byte = 0x81, .op_type = VIE_OP_TYPE_GROUP1, .op_flags = VIE_OP_F_IMM, }, [0x83] = { - /* XXX Group 1 extended opcode */ + /* Group 1 extended opcode */ .op_byte = 0x83, .op_type = VIE_OP_TYPE_GROUP1, .op_flags = VIE_OP_F_IMM8, @@ -206,6 +241,7 @@ static const struct vie_op one_byte_opcodes[256] = { .op_type = VIE_OP_TYPE_PUSH, } }; +#endif /* struct vie.mod */ #define VIE_MOD_INDIRECT 0 @@ -394,6 +430,41 @@ getcc(int opsize, uint64_t x, uint64_t y) return (getcc64(x, y)); } +/* + * Macro creation of functions getaddflags{8,16,32,64} + */ +#define GETADDFLAGS(sz) \ +static u_long \ +getaddflags##sz(uint##sz##_t x, uint##sz##_t y) \ +{ \ + u_long rflags; \ + \ + __asm __volatile("add %2,%1; pushfq; popq %0" : \ + "=r" (rflags), "+r" (x) : "m" (y)); \ + return (rflags); \ +} struct __hack + +GETADDFLAGS(8); +GETADDFLAGS(16); +GETADDFLAGS(32); +GETADDFLAGS(64); + +static u_long +getaddflags(int opsize, uint64_t x, uint64_t y) +{ + KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8, + ("getaddflags: invalid operand size %d", opsize)); + + if (opsize == 1) + return (getaddflags8(x, y)); + else if (opsize == 2) + return (getaddflags16(x, y)); + else if (opsize == 4) + return (getaddflags32(x, y)); + else + return (getaddflags64(x, y)); +} + static int emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) @@ -596,13 +667,11 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, /* * Helper function to calculate and validate a linear address. - * - * Returns 0 on success and 1 if an exception was injected into the guest. 
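
The GETADDFLAGS macros below capture the hardware's own flag computation: perform the add, then snapshot RFLAGS with pushfq/popq. A standalone demo of the technique for the 8-bit case (x86_64 with gcc/clang inline asm; illustrative only):

#include <stdint.h>
#include <stdio.h>

#define	PSL_C	0x001	/* carry flag */
#define	PSL_Z	0x040	/* zero flag */

static unsigned long
addflags8(uint8_t x, uint8_t y)
{
	unsigned long rflags;

	/* add, then capture the resulting RFLAGS via the stack */
	__asm__ __volatile__("addb %2,%1; pushfq; popq %0"
	    : "=r" (rflags), "+q" (x) : "m" (y));
	return (rflags);
}

int
main(void)
{
	unsigned long fl = addflags8(0xff, 0x01);	/* wraps to 0 */

	printf("CF=%d ZF=%d\n", (fl & PSL_C) != 0, (fl & PSL_Z) != 0);
	return (0);
}
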
*/ static int get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging, int opsize, int addrsize, int prot, enum vm_reg_name seg, - enum vm_reg_name gpr, uint64_t *gla) + enum vm_reg_name gpr, uint64_t *gla, int *fault) { struct seg_desc desc; uint64_t cr0, val, rflags; @@ -628,7 +697,7 @@ get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging, vm_inject_ss(vm, vcpuid, 0); else vm_inject_gp(vm, vcpuid); - return (1); + goto guest_fault; } if (vie_canonical_check(paging->cpu_mode, *gla)) { @@ -636,14 +705,19 @@ get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging, vm_inject_ss(vm, vcpuid, 0); else vm_inject_gp(vm, vcpuid); - return (1); + goto guest_fault; } if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) { vm_inject_ac(vm, vcpuid, 0); - return (1); + goto guest_fault; } + *fault = 0; + return (0); + +guest_fault: + *fault = 1; return (0); } @@ -659,7 +733,7 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, #endif uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val; uint64_t rcx, rdi, rsi, rflags; - int error, opsize, seg, repeat; + int error, fault, opsize, seg, repeat; opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize; val = 0; @@ -682,8 +756,10 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, * The count register is %rcx, %ecx or %cx depending on the * address size of the instruction. */ - if ((rcx & vie_size2mask(vie->addrsize)) == 0) - return (0); + if ((rcx & vie_size2mask(vie->addrsize)) == 0) { + error = 0; + goto done; + } } /* @@ -704,13 +780,16 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS; error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize, - PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr); - if (error) + PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr, &fault); + if (error || fault) goto done; error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ, - copyinfo, nitems(copyinfo)); + copyinfo, nitems(copyinfo), &fault); if (error == 0) { + if (fault) + goto done; /* Resume guest to handle fault */ + /* * case (2): read from system memory and write to mmio. */ @@ -719,11 +798,6 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, error = memwrite(vm, vcpuid, gpa, val, opsize, arg); if (error) goto done; - } else if (error > 0) { - /* - * Resume guest execution to handle fault. - */ - goto done; } else { /* * 'vm_copy_setup()' is expected to fail for cases (3) and (4) @@ -731,13 +805,17 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, */ error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize, - PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr); - if (error) + PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr, + &fault); + if (error || fault) goto done; error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize, - PROT_WRITE, copyinfo, nitems(copyinfo)); + PROT_WRITE, copyinfo, nitems(copyinfo), &fault); if (error == 0) { + if (fault) + goto done; /* Resume guest to handle fault */ + /* * case (3): read from MMIO and write to system memory. * @@ -753,27 +831,29 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, vm_copyout(vm, vcpuid, &val, copyinfo, opsize); vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); - } else if (error > 0) { - /* - * Resume guest execution to handle fault. - */ - goto done; } else { /* * Case (4): read from and write to mmio. 
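
The REP bookkeeping in emulate_movs() above masks %rcx down to the instruction's address size before testing it. A minimal restatement of that rule (mask values per the vie code; illustrative only):

#include <stdint.h>

static uint64_t
size2mask(int addrsize)
{
	switch (addrsize) {
	case 2:
		return (0xffff);		/* %cx */
	case 4:
		return (0xffffffff);		/* %ecx */
	default:
		return (~0UL);			/* %rcx */
	}
}

static int
rep_done(uint64_t rcx, int addrsize)
{
	/* A masked count of zero means MOVS iterates zero times. */
	return ((rcx & size2mask(addrsize)) == 0);
}
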
+ * + * Commit to the MMIO read/write (with potential + * side-effects) only after we are sure that the + * instruction is not going to be restarted due + * to address translation faults. */ error = vm_gla2gpa(vm, vcpuid, paging, srcaddr, - PROT_READ, &srcgpa); - if (error) - goto done; - error = memread(vm, vcpuid, srcgpa, &val, opsize, arg); - if (error) + PROT_READ, &srcgpa, &fault); + if (error || fault) goto done; error = vm_gla2gpa(vm, vcpuid, paging, dstaddr, - PROT_WRITE, &dstgpa); + PROT_WRITE, &dstgpa, &fault); + if (error || fault) + goto done; + + error = memread(vm, vcpuid, srcgpa, &val, opsize, arg); if (error) goto done; + error = memwrite(vm, vcpuid, dstgpa, val, opsize, arg); if (error) goto done; @@ -818,10 +898,9 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, vm_restart_instruction(vm, vcpuid); } done: - if (error < 0) - return (EFAULT); - else - return (0); + KASSERT(error == 0 || error == EFAULT, ("%s: unexpected error %d", + __func__, error)); + return (error); } static int @@ -979,12 +1058,38 @@ emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { int error, size; - uint64_t val1, result, rflags, rflags2; + enum vm_reg_name reg; + uint64_t result, rflags, rflags2, val1, val2; size = vie->opsize; error = EINVAL; switch (vie->op.op_byte) { + case 0x0B: + /* + * OR reg (ModRM:reg) and mem (ModRM:r/m) and store the + * result in reg. + * + * 0b/r or r16, r/m16 + * 0b/r or r32, r/m32 + * REX.W + 0b/r or r64, r/m64 + */ + + /* get the first operand */ + reg = gpr_map[vie->reg]; + error = vie_read_register(vm, vcpuid, reg, &val1); + if (error) + break; + + /* get the second operand */ + error = memread(vm, vcpuid, gpa, &val2, size, arg); + if (error) + break; + + /* perform the operation and write the result */ + result = val1 | val2; + error = vie_update_register(vm, vcpuid, reg, result, size); + break; case 0x81: case 0x83: /* @@ -1041,39 +1146,55 @@ emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { int error, size; - uint64_t op1, op2, rflags, rflags2; + uint64_t regop, memop, op1, op2, rflags, rflags2; enum vm_reg_name reg; size = vie->opsize; switch (vie->op.op_byte) { + case 0x39: case 0x3B: /* + * 39/r CMP r/m16, r16 + * 39/r CMP r/m32, r32 + * REX.W 39/r CMP r/m64, r64 + * * 3B/r CMP r16, r/m16 * 3B/r CMP r32, r/m32 * REX.W + 3B/r CMP r64, r/m64 * - * Compare first operand (reg) with second operand (r/m) and + * Compare the first operand with the second operand and * set status flags in EFLAGS register. The comparison is * performed by subtracting the second operand from the first * operand and then setting the status flags. 
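
The OR emulation just below follows the standard read-modify-write shape: fetch the register operand, fetch the memory operand through the MMIO read callback, combine, and write the result back to the register. A distilled sketch with a simplified callback type (illustrative only; the real code also updates RFLAGS afterward):

#include <stdint.h>

typedef int (*mem_read_t)(void *arg, uint64_t gpa, uint64_t *val, int size);

static int
emulate_or_sketch(uint64_t *reg, uint64_t gpa, int size,
    mem_read_t memread, void *arg)
{
	uint64_t val2;
	int error;

	if ((error = memread(arg, gpa, &val2, size)) != 0)
		return (error);
	*reg |= val2;		/* result lands in the register operand */
	return (0);		/* caller still owes an RFLAGS update */
}
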
*/ - /* Get the first operand */ + /* Get the register operand */ reg = gpr_map[vie->reg]; - error = vie_read_register(vm, vcpuid, reg, &op1); + error = vie_read_register(vm, vcpuid, reg, ®op); if (error) return (error); - /* Get the second operand */ - error = memread(vm, vcpuid, gpa, &op2, size, arg); + /* Get the memory operand */ + error = memread(vm, vcpuid, gpa, &memop, size, arg); if (error) return (error); + if (vie->op.op_byte == 0x3B) { + op1 = regop; + op2 = memop; + } else { + op1 = memop; + op2 = regop; + } rflags2 = getcc(size, op1, op2); break; + case 0x80: case 0x81: case 0x83: /* + * 80 /7 cmp r/m8, imm8 + * REX + 80 /7 cmp r/m8, imm8 + * * 81 /7 cmp r/m16, imm16 * 81 /7 cmp r/m32, imm32 * REX.W + 81 /7 cmp r/m64, imm32 sign-extended to 64 @@ -1089,6 +1210,8 @@ emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, * the status flags. * */ + if (vie->op.op_byte == 0x80) + size = 1; /* get the first operand */ error = memread(vm, vcpuid, gpa, &op1, size, arg); @@ -1110,6 +1233,62 @@ emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, return (error); } +static int +emulate_add(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +{ + int error, size; + uint64_t nval, rflags, rflags2, val1, val2; + enum vm_reg_name reg; + + size = vie->opsize; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0x03: + /* + * ADD r/m to r and store the result in r + * + * 03/r ADD r16, r/m16 + * 03/r ADD r32, r/m32 + * REX.W + 03/r ADD r64, r/m64 + */ + + /* get the first operand */ + reg = gpr_map[vie->reg]; + error = vie_read_register(vm, vcpuid, reg, &val1); + if (error) + break; + + /* get the second operand */ + error = memread(vm, vcpuid, gpa, &val2, size, arg); + if (error) + break; + + /* perform the operation and write the result */ + nval = val1 + val2; + error = vie_update_register(vm, vcpuid, reg, nval, size); + break; + default: + break; + } + + if (!error) { + rflags2 = getaddflags(size, val1, val2); + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, + &rflags); + if (error) + return (error); + + rflags &= ~RFLAGS_STATUS_BITS; + rflags |= rflags2 & RFLAGS_STATUS_BITS; + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, + rflags, 8); + } + + return (error); +} + static int emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) @@ -1178,7 +1357,7 @@ emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, #endif struct seg_desc ss_desc; uint64_t cr0, rflags, rsp, stack_gla, val; - int error, size, stackaddrsize, pushop; + int error, fault, size, stackaddrsize, pushop; val = 0; size = vie->opsize; @@ -1201,7 +1380,7 @@ emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, size = vie->opsize_override ? 2 : 8; } else { /* - * In protected or compability mode the 'B' flag in the + * In protected or compatibility mode the 'B' flag in the * stack-segment descriptor determines the size of the * stack pointer. */ @@ -1244,18 +1423,10 @@ emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, } error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size, - pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo)); - if (error == -1) { - /* - * XXX cannot return a negative error value here because it - * ends up being the return value of the VM_RUN() ioctl and - * is interpreted as a pseudo-error (for e.g. ERESTART). 
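
The 0x39 vs. 0x3B handling above is purely about operand direction: for 0x3B the register is the minuend, for 0x39 the memory operand is. A hypothetical helper capturing that selection (illustrative only):

#include <stdint.h>

static void
cmp_operands(uint8_t opbyte, uint64_t regop, uint64_t memop,
    uint64_t *op1, uint64_t *op2)
{
	if (opbyte == 0x3B) {		/* CMP r, r/m: reg is minuend */
		*op1 = regop;
		*op2 = memop;
	} else {			/* 0x39 CMP r/m, r: mem is minuend */
		*op1 = memop;
		*op2 = regop;
	}
}
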
- */ - return (EFAULT); - } else if (error == 1) { - /* Resume guest execution to handle page fault */ - return (0); - } + pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo), + &fault); + if (error || fault) + return (error); if (pushop) { error = memread(vm, vcpuid, mmio_gpa, &val, size, arg); @@ -1346,6 +1517,79 @@ emulate_group1(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, return (error); } +static int +emulate_bittest(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *memarg) +{ + uint64_t val, rflags; + int error, bitmask, bitoff; + + /* + * 0F BA is a Group 8 extended opcode. + * + * Currently we only emulate the 'Bit Test' instruction which is + * identified by a ModR/M:reg encoding of 100b. + */ + if ((vie->reg & 7) != 4) + return (EINVAL); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + error = memread(vm, vcpuid, gpa, &val, vie->opsize, memarg); + if (error) + return (error); + + /* + * Intel SDM, Vol 2, Table 3-2: + * "Range of Bit Positions Specified by Bit Offset Operands" + */ + bitmask = vie->opsize * 8 - 1; + bitoff = vie->immediate & bitmask; + + /* Copy the bit into the Carry flag in %rflags */ + if (val & (1UL << bitoff)) + rflags |= PSL_C; + else + rflags &= ~PSL_C; + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); + KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error)); + + return (0); +} + +static int +emulate_twob_group15(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *memarg) +{ + int error; + uint64_t buf; + + switch (vie->reg & 7) { + case 0x7: /* CLFLUSH, CLFLUSHOPT, and SFENCE */ + if (vie->mod == 0x3) { + /* + * SFENCE. Ignore it, VM exit provides enough + * barriers on its own. + */ + error = 0; + } else { + /* + * CLFLUSH, CLFLUSHOPT. Only check for access + * rights. 
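
The BT emulation above reduces to: mask the immediate to the operand width, then copy the selected bit into CF. A standalone restatement (illustrative only):

#include <stdint.h>

static int
bt_carry(uint64_t val, int opsize, uint8_t imm)
{
	/* SDM Vol 2, Table 3-2: offset is imm modulo operand width */
	int bitoff = imm & (opsize * 8 - 1);

	return ((val >> bitoff) & 1);	/* new CF value */
}
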
+ */ + error = memread(vm, vcpuid, gpa, &buf, 1, memarg); + } + break; + default: + error = EINVAL; + break; + } + + return (error); +} + int vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, struct vm_guest_paging *paging, mem_region_read_t memread, @@ -1402,6 +1646,18 @@ vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, error = emulate_sub(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; + case VIE_OP_TYPE_BITTEST: + error = emulate_bittest(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_TWOB_GRP15: + error = emulate_twob_group15(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_ADD: + error = emulate_add(vm, vcpuid, gpa, vie, memread, + memwrite, memarg); + break; default: error = EINVAL; break; @@ -1623,27 +1879,31 @@ ptp_release(void **cookie) } static void * -ptp_hold(struct vm *vm, vm_paddr_t ptpphys, size_t len, void **cookie) +ptp_hold(struct vm *vm, int vcpu, vm_paddr_t ptpphys, size_t len, void **cookie) { void *ptr; ptp_release(cookie); - ptr = vm_gpa_hold(vm, ptpphys, len, VM_PROT_RW, cookie); + ptr = vm_gpa_hold(vm, vcpu, ptpphys, len, VM_PROT_RW, cookie); return (ptr); } -int -vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, - uint64_t gla, int prot, uint64_t *gpa) +static int +_vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *guest_fault, bool check_only) { - int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable; + int nlevels, pfcode, retval, usermode, writable; + int ptpshift = 0, ptpindex = 0; + uint64_t ptpphys; + uint64_t *ptpbase = NULL, pte = 0, pgsize = 0; #ifdef __FreeBSD__ -#endif u_int retries; - uint64_t *ptpbase, ptpphys, pte, pgsize; +#endif uint32_t *ptpbase32, pte32; void *cookie; + *guest_fault = 0; + usermode = (paging->cpl == 3 ? 1 : 0); writable = prot & VM_PROT_WRITE; cookie = NULL; @@ -1664,7 +1924,8 @@ restart: * XXX assuming a non-stack reference otherwise a stack fault * should be generated. */ - vm_inject_gp(vm, vcpuid); + if (!check_only) + vm_inject_gp(vm, vcpuid); goto fault; } @@ -1679,7 +1940,8 @@ restart: /* Zero out the lower 12 bits. */ ptpphys &= ~0xfff; - ptpbase32 = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie); + ptpbase32 = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, + &cookie); if (ptpbase32 == NULL) goto error; @@ -1693,9 +1955,11 @@ restart: if ((pte32 & PG_V) == 0 || (usermode && (pte32 & PG_U) == 0) || (writable && (pte32 & PG_RW) == 0)) { - pfcode = pf_error_code(usermode, prot, 0, - pte32); - vm_inject_pf(vm, vcpuid, pfcode, gla); + if (!check_only) { + pfcode = pf_error_code(usermode, prot, 0, + pte32); + vm_inject_pf(vm, vcpuid, pfcode, gla); + } goto fault; } @@ -1706,7 +1970,7 @@ restart: * is only set at the last level providing the guest * physical address. 
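
The check_only distinction threaded through _vm_gla2gpa() below means a probing translation must neither inject #PF nor dirty the guest page tables. The accessed-bit update illustrates the pattern: skip it entirely when probing, otherwise set it with a compare-and-swap and restart the walk on contention. A minimal sketch with C11 atomics standing in for atomic_cmpset (illustrative only):

#include <stdatomic.h>
#include <stdint.h>

#define	PG_A	0x20	/* accessed bit */

static void
set_accessed(_Atomic uint64_t *ptep, int check_only)
{
	uint64_t pte = atomic_load(ptep);

	if (check_only || (pte & PG_A) != 0)
		return;
	/* On contention the real walker restarts from the top level. */
	(void) atomic_compare_exchange_strong(ptep, &pte, pte | PG_A);
}
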
*/ - if ((pte32 & PG_A) == 0) { + if (!check_only && (pte32 & PG_A) == 0) { if (atomic_cmpset_32(&ptpbase32[ptpindex], pte32, pte32 | PG_A) == 0) { goto restart; @@ -1721,7 +1985,7 @@ restart: } /* Set the dirty bit in the page table entry if necessary */ - if (writable && (pte32 & PG_M) == 0) { + if (!check_only && writable && (pte32 & PG_M) == 0) { if (atomic_cmpset_32(&ptpbase32[ptpindex], pte32, pte32 | PG_M) == 0) { goto restart; @@ -1738,7 +2002,8 @@ restart: /* Zero out the lower 5 bits and the upper 32 bits */ ptpphys &= 0xffffffe0UL; - ptpbase = ptp_hold(vm, ptpphys, sizeof(*ptpbase) * 4, &cookie); + ptpbase = ptp_hold(vm, vcpuid, ptpphys, sizeof(*ptpbase) * 4, + &cookie); if (ptpbase == NULL) goto error; @@ -1747,8 +2012,10 @@ restart: pte = ptpbase[ptpindex]; if ((pte & PG_V) == 0) { - pfcode = pf_error_code(usermode, prot, 0, pte); - vm_inject_pf(vm, vcpuid, pfcode, gla); + if (!check_only) { + pfcode = pf_error_code(usermode, prot, 0, pte); + vm_inject_pf(vm, vcpuid, pfcode, gla); + } goto fault; } @@ -1761,7 +2028,7 @@ restart: /* Zero out the lower 12 bits and the upper 12 bits */ ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12; - ptpbase = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie); + ptpbase = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, &cookie); if (ptpbase == NULL) goto error; @@ -1774,13 +2041,15 @@ restart: if ((pte & PG_V) == 0 || (usermode && (pte & PG_U) == 0) || (writable && (pte & PG_RW) == 0)) { - pfcode = pf_error_code(usermode, prot, 0, pte); - vm_inject_pf(vm, vcpuid, pfcode, gla); + if (!check_only) { + pfcode = pf_error_code(usermode, prot, 0, pte); + vm_inject_pf(vm, vcpuid, pfcode, gla); + } goto fault; } /* Set the accessed bit in the page table entry */ - if ((pte & PG_A) == 0) { + if (!check_only && (pte & PG_A) == 0) { if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_A) == 0) { goto restart; @@ -1789,8 +2058,11 @@ restart: if (nlevels > 0 && (pte & PG_PS) != 0) { if (pgsize > 1 * GB) { - pfcode = pf_error_code(usermode, prot, 1, pte); - vm_inject_pf(vm, vcpuid, pfcode, gla); + if (!check_only) { + pfcode = pf_error_code(usermode, prot, 1, + pte); + vm_inject_pf(vm, vcpuid, pfcode, gla); + } goto fault; } break; @@ -1800,7 +2072,7 @@ restart: } /* Set the dirty bit in the page table entry if necessary */ - if (writable && (pte & PG_M) == 0) { + if (!check_only && writable && (pte & PG_M) == 0) { if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0) goto restart; } @@ -1810,18 +2082,38 @@ restart: *gpa = pte | (gla & (pgsize - 1)); done: ptp_release(&cookie); + KASSERT(retval == 0 || retval == EFAULT, ("%s: unexpected retval %d", + __func__, retval)); return (retval); error: - retval = -1; + retval = EFAULT; goto done; fault: - retval = 1; + *guest_fault = 1; goto done; } +int +vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *guest_fault) +{ + + return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault, + false)); +} + +int +vm_gla2gpa_nofault(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *guest_fault) +{ + + return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault, + true)); +} + int vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, - uint64_t rip, int inst_length, struct vie *vie) + uint64_t rip, int inst_length, struct vie *vie, int *faultptr) { struct vm_copyinfo copyinfo[2]; int error, prot; @@ -1831,13 +2123,14 @@ vmm_fetch_instruction(struct vm *vm, int vcpuid, 
struct vm_guest_paging *paging, prot = PROT_READ | PROT_EXEC; error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot, - copyinfo, nitems(copyinfo)); - if (error == 0) { - vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length); - vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); - vie->num_valid = inst_length; - } - return (error); + copyinfo, nitems(copyinfo), faultptr); + if (error || *faultptr) + return (error); + + vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length); + vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); + vie->num_valid = inst_length; + return (0); } static int @@ -2261,28 +2554,18 @@ decode_moffset(struct vie *vie) return (0); } -/* - * Verify that all the bytes in the instruction buffer were consumed. - */ -static int -verify_inst_length(struct vie *vie) -{ - - if (vie->num_processed) - return (0); - else - return (-1); -} - /* * Verify that the 'guest linear address' provided as collateral of the nested * page table fault matches with our instruction decoding. */ static int -verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie) +verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie, + enum vm_cpu_mode cpu_mode) { int error; - uint64_t base, idx, gla2; + uint64_t base, segbase, idx, gla2; + enum vm_reg_name seg; + struct seg_desc desc; /* Skip 'gla' verification */ if (gla == VIE_INVALID_GLA) @@ -2302,7 +2585,7 @@ verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie) * instruction */ if (vie->base_register == VM_REG_GUEST_RIP) - base += vie->num_valid; + base += vie->num_processed; } idx = 0; @@ -2315,14 +2598,48 @@ verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie) } } - /* XXX assuming that the base address of the segment is 0 */ - gla2 = base + vie->scale * idx + vie->displacement; + /* + * From "Specifying a Segment Selector", Intel SDM, Vol 1 + * + * In 64-bit mode, segmentation is generally (but not + * completely) disabled. The exceptions are the FS and GS + * segments. + * + * In legacy IA-32 mode, when the ESP or EBP register is used + * as the base, the SS segment is the default segment. For + * other data references, except when relative to stack or + * string destination the DS segment is the default. These + * can be overridden to allow other segments to be accessed. 
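
The verify_gla() rework below recomputes the effective address from the decoded parts and compares it against the hardware-reported GLA, now accounting for the segment base (which is zero in 64-bit mode except for FS/GS). The check, condensed (illustrative only):

#include <stdint.h>

static int
gla_matches(uint64_t gla, uint64_t segbase, uint64_t base, int scale,
    uint64_t idx, int64_t disp, uint64_t addrmask)
{
	uint64_t gla2 = (segbase + base + scale * idx + disp) & addrmask;

	return (gla == gla2);
}
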
+ */ + if (vie->segment_override) + seg = vie->segment_register; + else if (vie->base_register == VM_REG_GUEST_RSP || + vie->base_register == VM_REG_GUEST_RBP) + seg = VM_REG_GUEST_SS; + else + seg = VM_REG_GUEST_DS; + if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS && + seg != VM_REG_GUEST_GS) { + segbase = 0; + } else { + error = vm_get_seg_desc(vm, cpuid, seg, &desc); + if (error) { + printf("verify_gla: error %d getting segment" + " descriptor %d", error, + vie->segment_register); + return (-1); + } + segbase = desc.base; + } + + gla2 = segbase + base + vie->scale * idx + vie->displacement; gla2 &= size2mask[vie->addrsize]; if (gla != gla2) { - printf("verify_gla mismatch: " + printf("verify_gla mismatch: segbase(0x%0lx)" "base(0x%0lx), scale(%d), index(0x%0lx), " "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n", - base, vie->scale, idx, vie->displacement, gla, gla2); + segbase, base, vie->scale, idx, vie->displacement, + gla, gla2); return (-1); } @@ -2355,11 +2672,8 @@ vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, if (decode_moffset(vie)) return (-1); - if (verify_inst_length(vie)) - return (-1); - if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) { - if (verify_gla(vm, cpuid, gla, vie)) + if (verify_gla(vm, cpuid, gla, vie, cpu_mode)) return (-1); } diff --git a/usr/src/uts/i86pc/io/vmm/vmm_ioport.c b/usr/src/uts/i86pc/io/vmm/vmm_ioport.c index bea750f162..3d08fd5e85 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_ioport.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_ioport.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Tycho Nightingale * All rights reserved. * @@ -25,12 +27,9 @@ */ #include -__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_ioport.c 277168 2015-01-14 07:18:51Z neel $"); +__FBSDID("$FreeBSD$"); #include -#include -#include -#include #include #include @@ -38,6 +37,8 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_ioport.c 277168 2015-01-14 07:18:51Z #include "vatpic.h" #include "vatpit.h" +#include "vpmtmr.h" +#include "vrtc.h" #include "vmm_ioport.h" #include "vmm_ktr.h" @@ -55,6 +56,9 @@ ioport_handler_func_t ioport_handler[MAX_IOPORTS] = { [IO_ICU2 + ICU_IMR_OFFSET] = vatpic_slave_handler, [IO_ELCR1] = vatpic_elc_handler, [IO_ELCR2] = vatpic_elc_handler, + [IO_PMTMR] = vpmtmr_handler, + [IO_RTC] = vrtc_addr_handler, + [IO_RTC + 1] = vrtc_data_handler, }; #ifdef KTR @@ -103,6 +107,7 @@ emulate_inout_port(struct vm *vm, int vcpuid, struct vm_exit *vmexit, uint32_t mask, val; int error; +#ifdef __FreeBSD__ /* * If there is no handler for the I/O port then punt to userspace. 
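
The illumos inout path below layers its dispatch: consult the fixed handler table first, then the per-VM hook list, and only then punt to userspace. The table step, reduced to its essentials (types simplified; illustrative only):

typedef int (*port_handler_t)(void *vm, int vcpuid, int in, int port,
    int bytes, unsigned int *val);

static int
lookup_port_handler(port_handler_t *table, int nports, int port,
    port_handler_t *out)
{
	if (port < nports && table[port] != NULL) {
		*out = table[port];
		return (0);
	}
	/* caller tries vm_ioport_handle_hook(), then defers to userspace */
	return (-1);
}
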
*/ @@ -111,6 +116,28 @@ emulate_inout_port(struct vm *vm, int vcpuid, struct vm_exit *vmexit, *retu = true; return (0); } +#else /* __FreeBSD__ */ + handler = NULL; + if (vmexit->u.inout.port < MAX_IOPORTS) { + handler = ioport_handler[vmexit->u.inout.port]; + } + /* Look for hooks, if a standard handler is not present */ + if (handler == NULL) { + mask = vie_size2mask(vmexit->u.inout.bytes); + if (!vmexit->u.inout.in) { + val = vmexit->u.inout.eax & mask; + } + error = vm_ioport_handle_hook(vm, vcpuid, vmexit->u.inout.in, + vmexit->u.inout.port, vmexit->u.inout.bytes, &val); + if (error == 0) { + goto finish; + } + + *retu = true; + return (0); + } + +#endif /* __FreeBSD__ */ mask = vie_size2mask(vmexit->u.inout.bytes); @@ -131,6 +158,9 @@ emulate_inout_port(struct vm *vm, int vcpuid, struct vm_exit *vmexit, return (EIO); } +#ifndef __FreeBSD__ +finish: +#endif /* __FreeBSD__ */ if (vmexit->u.inout.in) { vmexit->u.inout.eax &= ~mask; vmexit->u.inout.eax |= val & mask; diff --git a/usr/src/uts/i86pc/io/vmm/vmm_ioport.h b/usr/src/uts/i86pc/io/vmm/vmm_ioport.h index 624dd8f1d8..14e315f400 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_ioport.h +++ b/usr/src/uts/i86pc/io/vmm/vmm_ioport.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Tycho Nightingale * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/vmm_ioport.h 273706 2014-10-26 19:03:06Z neel $ + * $FreeBSD$ */ #ifndef _VMM_IOPORT_H_ diff --git a/usr/src/uts/i86pc/io/vmm/vmm_ipi.h b/usr/src/uts/i86pc/io/vmm/vmm_ipi.h deleted file mode 100644 index 4dff03ba1f..0000000000 --- a/usr/src/uts/i86pc/io/vmm/vmm_ipi.h +++ /dev/null @@ -1,37 +0,0 @@ -/*- - * Copyright (c) 2011 NetApp, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * $FreeBSD: head/sys/amd64/vmm/vmm_ipi.h 260466 2014-01-09 03:25:54Z neel $ - */ - -#ifndef _VMM_IPI_H_ -#define _VMM_IPI_H_ - -#ifdef __FreeBSD__ -int vmm_ipi_alloc(void); -void vmm_ipi_free(int num); -#endif - -#endif diff --git a/usr/src/uts/i86pc/io/vmm/vmm_ktr.h b/usr/src/uts/i86pc/io/vmm/vmm_ktr.h index 917c7f83a4..414d0341cc 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_ktr.h +++ b/usr/src/uts/i86pc/io/vmm/vmm_ktr.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/vmm_ktr.h 258699 2013-11-27 22:18:08Z neel $ + * $FreeBSD$ */ #ifndef _VMM_KTR_H_ diff --git a/usr/src/uts/i86pc/io/vmm/vmm_lapic.c b/usr/src/uts/i86pc/io/vmm/vmm_lapic.c index 3215c74a44..43b2bebe97 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_lapic.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_lapic.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/vmm_lapic.c 264509 2014-04-15 17:06:26Z tychon $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -39,7 +41,7 @@ */ #include -__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_lapic.c 264509 2014-04-15 17:06:26Z tychon $"); +__FBSDID("$FreeBSD$"); #include #include @@ -49,7 +51,6 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_lapic.c 264509 2014-04-15 17:06:26Z t #include #include -#include "vmm_ipi.h" #include "vmm_ktr.h" #include "vmm_lapic.h" #include "vlapic.h" @@ -67,10 +68,14 @@ lapic_set_intr(struct vm *vm, int cpu, int vector, bool level) { struct vlapic *vlapic; - if (cpu < 0 || cpu >= VM_MAXCPU) + if (cpu < 0 || cpu >= vm_get_maxcpus(vm)) return (EINVAL); - if (vector < 32 || vector > 255) + /* + * According to section "Maskable Hardware Interrupts" in Intel SDM + * vectors 16 through 255 can be delivered through the local APIC. + */ + if (vector < 16 || vector > 255) return (EINVAL); vlapic = vm_lapic(vm, cpu); @@ -86,7 +91,7 @@ lapic_set_local_intr(struct vm *vm, int cpu, int vector) cpuset_t dmask; int error; - if (cpu < -1 || cpu >= VM_MAXCPU) + if (cpu < -1 || cpu >= vm_get_maxcpus(vm)) return (EINVAL); if (cpu == -1) diff --git a/usr/src/uts/i86pc/io/vmm/vmm_lapic.h b/usr/src/uts/i86pc/io/vmm/vmm_lapic.h index ee47ee7783..da3b0ff660 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_lapic.h +++ b/usr/src/uts/i86pc/io/vmm/vmm_lapic.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/vmm_lapic.h 259863 2013-12-25 06:46:31Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the diff --git a/usr/src/uts/i86pc/io/vmm/vmm_mem.c b/usr/src/uts/i86pc/io/vmm/vmm_mem.c new file mode 100644 index 0000000000..a736d94bba --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_mem.c @@ -0,0 +1,124 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "vmm_mem.h" + +int +vmm_mem_init(void) +{ + + return (0); +} + +vm_object_t +vmm_mmio_alloc(struct vmspace *vmspace, vm_paddr_t gpa, size_t len, + vm_paddr_t hpa) +{ + int error; + vm_object_t obj; + struct sglist *sg; + + sg = sglist_alloc(1, M_WAITOK); + error = sglist_append_phys(sg, hpa, len); + KASSERT(error == 0, ("error %d appending physaddr to sglist", error)); + + obj = vm_pager_allocate(OBJT_SG, sg, len, VM_PROT_RW, 0, NULL); + if (obj != NULL) { + /* + * VT-x ignores the MTRR settings when figuring out the + * memory type for translations obtained through EPT. + * + * Therefore we explicitly force the pages provided by + * this object to be mapped as uncacheable. + */ + VM_OBJECT_WLOCK(obj); + error = vm_object_set_memattr(obj, VM_MEMATTR_UNCACHEABLE); + VM_OBJECT_WUNLOCK(obj); + if (error != KERN_SUCCESS) { + panic("vmm_mmio_alloc: vm_object_set_memattr error %d", + error); + } + error = vm_map_find(&vmspace->vm_map, obj, 0, &gpa, len, 0, + VMFS_NO_SPACE, VM_PROT_RW, VM_PROT_RW, 0); + if (error != KERN_SUCCESS) { + vm_object_deallocate(obj); + obj = NULL; + } + } + + /* + * Drop the reference on the sglist. + * + * If the scatter/gather object was successfully allocated then it + * has incremented the reference count on the sglist. Dropping the + * initial reference count ensures that the sglist will be freed + * when the object is deallocated. + * + * If the object could not be allocated then we end up freeing the + * sglist. + */ + sglist_free(sg); + + return (obj); +} + +void +vmm_mmio_free(struct vmspace *vmspace, vm_paddr_t gpa, size_t len) +{ + + vm_map_remove(&vmspace->vm_map, gpa, gpa + len); +} + +vm_paddr_t +vmm_mem_maxaddr(void) +{ + + return (ptoa(Maxmem)); +} diff --git a/usr/src/uts/i86pc/io/vmm/vmm_mem.h b/usr/src/uts/i86pc/io/vmm/vmm_mem.h index 05dc37fb9a..e6f88fb222 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_mem.h +++ b/usr/src/uts/i86pc/io/vmm/vmm_mem.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. 
* @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/vmm_mem.h 245678 2013-01-20 03:42:49Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -41,9 +43,13 @@ #ifndef _VMM_MEM_H_ #define _VMM_MEM_H_ +struct vmspace; +struct vm_object; + int vmm_mem_init(void); -vm_paddr_t vmm_mem_alloc(size_t size); -void vmm_mem_free(vm_paddr_t start, size_t size); +struct vm_object *vmm_mmio_alloc(struct vmspace *, vm_paddr_t gpa, size_t len, + vm_paddr_t hpa); +void vmm_mmio_free(struct vmspace *, vm_paddr_t gpa, size_t size); vm_paddr_t vmm_mem_maxaddr(void); #endif diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c index 79e1cb1a44..2b612b20e9 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c @@ -11,6 +11,7 @@ /* * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. */ #include @@ -20,246 +21,401 @@ #include #include #include -/* - * struct modctl in contains "void *__unused". - * Do this ugly workaround to avoid it. - */ -#undef __unused +#include #include #include +#include +#include +#include +#include + +#include +#include +#include #include #include #include #include +#include #include #include #include "io/vatpic.h" #include "io/vioapic.h" +#include "io/vrtc.h" +#include "io/vhpet.h" #include "vmm_lapic.h" +#include "vmm_stat.h" +#include "vmm_util.h" +#include "vm/vm_glue.h" -static dev_info_t *vmm_dip; -static void *vmm_statep; +/* + * Locking details: + * + * Driver-wide data (vmmdev_*) , including HMA and sdev registration, is + * protected by vmmdev_mtx. The list of vmm_softc_t instances and related data + * (vmm_*) are protected by vmm_mtx. Actions requiring both locks must acquire + * vmmdev_mtx before vmm_mtx. The sdev plugin functions must not attempt to + * acquire vmmdev_mtx, as they could deadlock with plugin unregistration. + */ -static SLIST_HEAD(, vmm_softc) head; +static kmutex_t vmmdev_mtx; +static dev_info_t *vmmdev_dip; +static hma_reg_t *vmmdev_hma_reg; +static sdev_plugin_hdl_t vmmdev_sdev_hdl; -static kmutex_t vmmdev_mtx; +static kmutex_t vmm_mtx; +static list_t vmm_list; +static list_t vmm_destroy_list; +static id_space_t *vmm_minors; +static void *vmm_statep; -/* - * vmm trace ring - */ -int vmm_dmsg_ring_size = VMM_DMSG_RING_SIZE; -static vmm_trace_rbuf_t *vmm_debug_rbuf; -static vmm_trace_dmsg_t *vmm_trace_dmsg_alloc(void); -static void vmm_trace_dmsg_free(void); -static void vmm_trace_rbuf_alloc(void); -static void vmm_trace_rbuf_free(void); +static const char *vmmdev_hvm_name = "bhyve"; -/* - * This routine is used to manage debug messages - * on ring buffer. 
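
The locking comment below fixes an ordering: paths needing both driver-wide and instance-list state must take vmmdev_mtx before vmm_mtx. In miniature (assumes the kmutex declarations from this file; illustrative only):

static void
both_locks_example(void)
{
	mutex_enter(&vmmdev_mtx);	/* driver-wide state first */
	mutex_enter(&vmm_mtx);		/* then the instance list */
	/* ... mutate both ... */
	mutex_exit(&vmm_mtx);
	mutex_exit(&vmmdev_mtx);
}
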
- */ -static vmm_trace_dmsg_t * -vmm_trace_dmsg_alloc(void) +/* For sdev plugin (/dev) */ +#define VMM_SDEV_ROOT "/dev/vmm" + +/* From uts/i86pc/io/vmm/intel/vmx.c */ +extern int vmx_x86_supported(const char **); + +/* Holds and hooks from drivers external to vmm */ +struct vmm_hold { + list_node_t vmh_node; + vmm_softc_t *vmh_sc; + boolean_t vmh_release_req; + uint_t vmh_ioport_hook_cnt; +}; + +struct vmm_lease { + list_node_t vml_node; + struct vm *vml_vm; + boolean_t vml_expired; + boolean_t (*vml_expire_func)(void *); + void *vml_expire_arg; + list_node_t vml_expire_node; + struct vmm_hold *vml_hold; +}; + +static int vmm_drv_block_hook(vmm_softc_t *, boolean_t); +static void vmm_lease_break_locked(vmm_softc_t *, vmm_lease_t *); + +static int +vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg) { - vmm_trace_dmsg_t *dmsg_alloc, *dmsg = vmm_debug_rbuf->dmsgp; + int error; + bool sysmem; - if (vmm_debug_rbuf->looped == TRUE) { - vmm_debug_rbuf->dmsgp = dmsg->next; - return (vmm_debug_rbuf->dmsgp); - } + error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem, + NULL); + if (error || mseg->len == 0) + return (error); - /* - * If we're looping for the first time, - * connect the ring. - */ - if (((vmm_debug_rbuf->size + (sizeof (vmm_trace_dmsg_t))) > - vmm_debug_rbuf->maxsize) && (vmm_debug_rbuf->dmsgh != NULL)) { - dmsg->next = vmm_debug_rbuf->dmsgh; - vmm_debug_rbuf->dmsgp = vmm_debug_rbuf->dmsgh; - vmm_debug_rbuf->looped = TRUE; - return (vmm_debug_rbuf->dmsgp); - } - - /* If we've gotten this far then memory allocation is needed */ - dmsg_alloc = kmem_zalloc(sizeof (vmm_trace_dmsg_t), KM_NOSLEEP); - if (dmsg_alloc == NULL) { - vmm_debug_rbuf->allocfailed++; - return (dmsg_alloc); - } else { - vmm_debug_rbuf->size += sizeof (vmm_trace_dmsg_t); - } + if (!sysmem) { + vmm_devmem_entry_t *de; + list_t *dl = &sc->vmm_devmem_list; - if (vmm_debug_rbuf->dmsgp != NULL) { - dmsg->next = dmsg_alloc; - vmm_debug_rbuf->dmsgp = dmsg->next; - return (vmm_debug_rbuf->dmsgp); - } else { - /* - * We should only be here if we're initializing - * the ring buffer. - */ - if (vmm_debug_rbuf->dmsgh == NULL) { - vmm_debug_rbuf->dmsgh = dmsg_alloc; - } else { - /* Something is wrong */ - kmem_free(dmsg_alloc, sizeof (vmm_trace_dmsg_t)); - return (NULL); + for (de = list_head(dl); de != NULL; de = list_next(dl, de)) { + if (de->vde_segid == mseg->segid) { + break; + } } - - vmm_debug_rbuf->dmsgp = dmsg_alloc; - return (vmm_debug_rbuf->dmsgp); + if (de != NULL) { + (void) strlcpy(mseg->name, de->vde_name, + sizeof (mseg->name)); + } + } else { + bzero(mseg->name, sizeof (mseg->name)); } + + return (error); } /* - * Free all messages on debug ring buffer. + * The 'devmem' hack: + * + * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments + * in the vm which appear with their own name related to the vm under /dev. + * Since this would be a hassle from an sdev perspective and would require a + * new cdev interface (or complicate the existing one), we choose to implement + * this in a different manner. When 'devmem' mappings are created, an + * identifying off_t is communicated back out to userspace. That off_t, + * residing above the normal guest memory space, can be used to mmap the + * 'devmem' mapping from the already-open vm device. 
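
The devmem offset scheme described above amounts to a bump allocator over off_t: new segments are appended after the previous tail, starting at VM_DEVMEM_START, with wraparound refused. Its core, extracted (illustrative only):

#include <errno.h>
#include <sys/types.h>

static int
next_devmem_off(off_t tail_off, off_t tail_len, off_t *offp)
{
	off_t off = tail_off + tail_len;

	if (off < tail_off)	/* do not tolerate overflow */
		return (ERANGE);
	*offp = off;
	return (0);
}
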
*/ -static void -vmm_trace_dmsg_free(void) -{ - vmm_trace_dmsg_t *dmsg_next, *dmsg = vmm_debug_rbuf->dmsgh; - while (dmsg != NULL) { - dmsg_next = dmsg->next; - kmem_free(dmsg, sizeof (vmm_trace_dmsg_t)); +static int +vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name) +{ + off_t map_offset; + vmm_devmem_entry_t *entry; + if (list_is_empty(&sc->vmm_devmem_list)) { + map_offset = VM_DEVMEM_START; + } else { + entry = list_tail(&sc->vmm_devmem_list); + map_offset = entry->vde_off + entry->vde_len; + if (map_offset < entry->vde_off) { + /* Do not tolerate overflow */ + return (ERANGE); + } /* - * If we've looped around the ring than we're done. + * XXXJOY: We could choose to search the list for duplicate + * names and toss an error. Since we're using the offset + * method for now, it does not make much of a difference. */ - if (dmsg_next == vmm_debug_rbuf->dmsgh) { - break; - } else { - dmsg = dmsg_next; - } } + + entry = kmem_zalloc(sizeof (*entry), KM_SLEEP); + entry->vde_segid = mseg->segid; + entry->vde_len = mseg->len; + entry->vde_off = map_offset; + (void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name)); + list_insert_tail(&sc->vmm_devmem_list, entry); + + return (0); } -static void -vmm_trace_rbuf_alloc(void) +static boolean_t +vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp) { - vmm_debug_rbuf = kmem_zalloc(sizeof (vmm_trace_rbuf_t), KM_SLEEP); + list_t *dl = &sc->vmm_devmem_list; + vmm_devmem_entry_t *de = NULL; - mutex_init(&vmm_debug_rbuf->lock, NULL, MUTEX_DRIVER, NULL); + VERIFY(off >= VM_DEVMEM_START); - if (vmm_dmsg_ring_size > 0) { - vmm_debug_rbuf->maxsize = vmm_dmsg_ring_size; + for (de = list_head(dl); de != NULL; de = list_next(dl, de)) { + /* XXX: Only hit on direct offset/length matches for now */ + if (de->vde_off == off && de->vde_len == len) { + break; + } + } + if (de == NULL) { + return (B_FALSE); } -} + *segidp = de->vde_segid; + return (B_TRUE); +} static void -vmm_trace_rbuf_free(void) +vmmdev_devmem_purge(vmm_softc_t *sc) { - vmm_trace_dmsg_free(); - mutex_destroy(&vmm_debug_rbuf->lock); - kmem_free(vmm_debug_rbuf, sizeof (vmm_trace_rbuf_t)); + vmm_devmem_entry_t *entry; + + while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) { + kmem_free(entry, sizeof (*entry)); + } } -static void -vmm_vtrace_log(const char *fmt, va_list ap) +static int +vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg) { - vmm_trace_dmsg_t *dmsg; + int error; + bool sysmem = true; - if (vmm_debug_rbuf == NULL) { - return; + if (VM_MEMSEG_NAME(mseg)) { + sysmem = false; } + error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem); - /* - * If max size of ring buffer is smaller than size - * required for one debug message then just return - * since we have no room for the debug message. - */ - if (vmm_debug_rbuf->maxsize < (sizeof (vmm_trace_dmsg_t))) { - return; + if (error == 0 && VM_MEMSEG_NAME(mseg)) { + /* + * Rather than create a whole fresh device from which userspace + * can mmap this segment, instead make it available at an + * offset above where the main guest memory resides. 
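+ * The offset handed back is later recognized by vmm_segmap(),
+ * which treats any offset at or above VM_DEVMEM_START as a
+ * devmem segment rather than guest-physical space.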
+ */ + error = vmmdev_devmem_create(sc, mseg, mseg->name); + if (error != 0) { + vm_free_memseg(sc->vmm_vm, mseg->segid); + } } + return (error); +} - mutex_enter(&vmm_debug_rbuf->lock); - - /* alloc or reuse on ring buffer */ - dmsg = vmm_trace_dmsg_alloc(); +/* + * Resource Locking and Exclusion + * + * Much of bhyve depends on key portions of VM state, such as the guest memory + * map, to remain unchanged while the guest is running. As ported from + * FreeBSD, the initial strategy for this resource exclusion hinged on gating + * access to the instance vCPUs. Threads acting on a single vCPU, like those + * performing the work of actually running the guest in VMX/SVM, would lock + * only that vCPU during ioctl() entry. For ioctls which would change VM-wide + * state, all of the vCPUs would be first locked, ensuring that the + * operation(s) could complete without any other threads stumbling into + * intermediate states. + * + * This approach is largely effective for bhyve. Common operations, such as + * running the vCPUs, steer clear of lock contention. The model begins to + * break down for operations which do not occur in the context of a specific + * vCPU. LAPIC MSI delivery, for example, may be initiated from a worker + * thread in the bhyve process. In order to properly protect those vCPU-less + * operations from encountering invalid states, additional locking is required. + * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU. + * It does mean that class of operations will be serialized on locking the + * specific vCPU and that instances sized at VM_MAXCPU will potentially see + * undue contention on the VM_MAXCPU-1 vCPU. + * + * In order to address the shortcomings of this model, the concept of a + * read/write lock has been added to bhyve. Operations which change + * fundamental aspects of a VM (such as the memory map) must acquire the write + * lock, which also implies locking all of the vCPUs and waiting for all read + * lock holders to release. While it increases the cost and waiting time for + * those few operations, it allows most hot-path operations on the VM (which + * depend on its configuration remaining stable) to occur with minimal locking. + * + * Consumers of the Driver API (see below) are a special case when it comes to + * this locking, since they may hold a read lock via the drv_lease mechanism + * for an extended period of time. Rather than forcing those consumers to + * continuously poll for a write lock attempt, the lease system forces them to + * provide a release callback to trigger their clean-up (and potential later + * reacquisition) of the read lock. + */ - if (dmsg == NULL) { - /* resource allocation failed */ - mutex_exit(&vmm_debug_rbuf->lock); - return; - } +static void +vcpu_lock_one(vmm_softc_t *sc, int vcpu) +{ + ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU); - gethrestime(&dmsg->timestamp); + /* + * Since this state transition is utilizing from_idle=true, it should + * not fail, but rather block until it can be successful. + */ + VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true)); +} - (void) vsnprintf(dmsg->buf, sizeof (dmsg->buf), fmt, ap); +static void +vcpu_unlock_one(vmm_softc_t *sc, int vcpu) +{ + ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU); - mutex_exit(&vmm_debug_rbuf->lock); + VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN); + vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false); } -void -vmm_trace_log(const char *fmt, ...) 
+static void +vmm_read_lock(vmm_softc_t *sc) { - va_list ap; - - va_start(ap, fmt); - vmm_vtrace_log(fmt, ap); - va_end(ap); + rw_enter(&sc->vmm_rwlock, RW_READER); } -void -vmmdev_init(void) +static void +vmm_read_unlock(vmm_softc_t *sc) { - vmm_trace_rbuf_alloc(); + rw_exit(&sc->vmm_rwlock); } -int -vmmdev_cleanup(void) +static void +vmm_write_lock(vmm_softc_t *sc) { - int error; + int maxcpus; - if (SLIST_EMPTY(&head)) - error = 0; - else - error = EBUSY; + /* First lock all the vCPUs */ + maxcpus = vm_get_maxcpus(sc->vmm_vm); + for (int vcpu = 0; vcpu < maxcpus; vcpu++) { + vcpu_lock_one(sc, vcpu); + } - if (error == 0) - vmm_trace_dmsg_free(); + mutex_enter(&sc->vmm_lease_lock); + VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX); + sc->vmm_lease_blocker++; + if (sc->vmm_lease_blocker == 1) { + list_t *list = &sc->vmm_lease_list; + vmm_lease_t *lease = list_head(list); - return (error); + while (lease != NULL) { + boolean_t sync_break = B_FALSE; + + if (!lease->vml_expired) { + void *arg = lease->vml_expire_arg; + lease->vml_expired = B_TRUE; + sync_break = lease->vml_expire_func(arg); + } + + if (sync_break) { + vmm_lease_t *next; + + /* + * These leases which are synchronously broken + * result in vmm_read_unlock() calls from a + * different thread than the corresponding + * vmm_read_lock(). This is acceptable, given + * that the rwlock underpinning the whole + * mechanism tolerates the behavior. This + * flexibility is _only_ afforded to VM read + * lock (RW_READER) holders. + */ + next = list_next(list, lease); + vmm_lease_break_locked(sc, lease); + lease = next; + } else { + lease = list_next(list, lease); + } + } + } + mutex_exit(&sc->vmm_lease_lock); + + rw_enter(&sc->vmm_rwlock, RW_WRITER); + /* + * For now, the 'maxcpus' value for an instance is fixed at the + * compile-time constant of VM_MAXCPU at creation. If this changes in + * the future, allowing for dynamic vCPU resource sizing, acquisition + * of the write lock will need to be wary of such changes. + */ + VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm)); } -int -vmmdev_do_ioctl(struct vmm_softc *sc, int cmd, intptr_t arg, int mode, - cred_t *credp, int *rvalp) +static void +vmm_write_unlock(vmm_softc_t *sc) { - int error, vcpu, state_changed; - struct vm_memory_segment seg; - struct vm_register vmreg; - struct vm_seg_desc vmsegdesc; - struct vm_run vmrun; - struct vm_exception vmexc; - struct vm_lapic_irq vmirq; - struct vm_lapic_msi vmmsi; - struct vm_ioapic_irq ioapic_irq; - struct vm_isa_irq isa_irq; - struct vm_capability vmcap; - struct vm_nmi vmnmi; - struct vm_x2apic x2apic; - struct vm_gla2gpa gg; - struct vm_activate_cpu vac; - int pincount; - int i; - - vcpu = -1; - state_changed = 0; + int maxcpus; + + mutex_enter(&sc->vmm_lease_lock); + VERIFY3U(sc->vmm_lease_blocker, !=, 0); + sc->vmm_lease_blocker--; + if (sc->vmm_lease_blocker == 0) { + cv_broadcast(&sc->vmm_lease_cv); + } + mutex_exit(&sc->vmm_lease_lock); /* - * Some VMM ioctls can operate only on vcpus that are not running. + * The VM write lock _must_ be released from the same thread it was + * acquired in, unlike the read lock. 
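+ * Read holds are exempt because leases broken synchronously during
+ * vmm_write_lock() already perform vmm_read_unlock() from the
+ * blocking thread; that flexibility applies only to RW_READER
+ * holders, hence the rw_write_held() assertion below.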
*/ + VERIFY(rw_write_held(&sc->vmm_rwlock)); + rw_exit(&sc->vmm_rwlock); + + /* Unlock all the vCPUs */ + maxcpus = vm_get_maxcpus(sc->vmm_vm); + for (int vcpu = 0; vcpu < maxcpus; vcpu++) { + vcpu_unlock_one(sc, vcpu); + } +} + +static int +vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, + cred_t *credp, int *rvalp) +{ + int error = 0, vcpu = -1; + void *datap = (void *)arg; + enum vm_lock_type { + LOCK_NONE = 0, + LOCK_VCPU, + LOCK_READ_HOLD, + LOCK_WRITE_HOLD + } lock_type = LOCK_NONE; + + /* Acquire any exclusion resources needed for the operation. */ switch (cmd) { case VM_RUN: case VM_GET_REGISTER: case VM_SET_REGISTER: case VM_GET_SEGMENT_DESCRIPTOR: case VM_SET_SEGMENT_DESCRIPTOR: + case VM_GET_REGISTER_SET: + case VM_SET_REGISTER_SET: case VM_INJECT_EXCEPTION: case VM_GET_CAPABILITY: case VM_SET_CAPABILITY: @@ -267,494 +423,1320 @@ vmmdev_do_ioctl(struct vmm_softc *sc, int cmd, intptr_t arg, int mode, case VM_PPTDEV_MSIX: case VM_SET_X2APIC_STATE: case VM_GLA2GPA: + case VM_GLA2GPA_NOFAULT: case VM_ACTIVATE_CPU: + case VM_SET_INTINFO: + case VM_GET_INTINFO: case VM_RESTART_INSTRUCTION: /* - * XXX fragile, handle with care - * Assumes that the first field of the ioctl data is the vcpu. + * Copy in the ID of the vCPU chosen for this operation. + * Since a nefarious caller could update their struct between + * this locking and when the rest of the ioctl data is copied + * in, it is _critical_ that this local 'vcpu' variable be used + * rather than the in-struct one when performing the ioctl. */ - if (ddi_copyin((void *)arg, &vcpu, sizeof (vcpu), mode)) { + if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { return (EFAULT); } - if (vcpu < 0 || vcpu >= VM_MAXCPU) { - error = EINVAL; - goto done; + if (vcpu < 0 || vcpu > vm_get_maxcpus(sc->vmm_vm)) { + return (EINVAL); } - - error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true); - if (error) - goto done; - - state_changed = 1; + vcpu_lock_one(sc, vcpu); + lock_type = LOCK_VCPU; break; - case VM_MAP_MEMORY: - /* - * ioctls that operate on the entire virtual machine must - * prevent all vcpus from running. - */ - error = 0; - for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) { - error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true); - if (error) - break; - } - if (error) { - while (--vcpu >= 0) - vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false); - goto done; - } + case VM_REINIT: + case VM_BIND_PPTDEV: + case VM_UNBIND_PPTDEV: + case VM_MAP_PPTDEV_MMIO: + case VM_ALLOC_MEMSEG: + case VM_MMAP_MEMSEG: + case VM_WRLOCK_CYCLE: + vmm_write_lock(sc); + lock_type = LOCK_WRITE_HOLD; + break; - state_changed = 2; + case VM_GET_GPA_PMAP: + case VM_GET_MEMSEG: + case VM_MMAP_GETNEXT: + case VM_LAPIC_IRQ: + case VM_INJECT_NMI: + case VM_IOAPIC_ASSERT_IRQ: + case VM_IOAPIC_DEASSERT_IRQ: + case VM_IOAPIC_PULSE_IRQ: + case VM_LAPIC_MSI: + case VM_LAPIC_LOCAL_IRQ: + case VM_GET_X2APIC_STATE: + case VM_RTC_READ: + case VM_RTC_WRITE: + case VM_RTC_SETTIME: + case VM_RTC_GETTIME: +#ifndef __FreeBSD__ + case VM_DEVMEM_GETOFFSET: +#endif + vmm_read_lock(sc); + lock_type = LOCK_READ_HOLD; break; + case VM_IOAPIC_PINCOUNT: default: break; } - switch(cmd) { - case VM_RUN: - if (ddi_copyin((void *)arg, &vmrun, - sizeof (struct vm_run), mode)) { - return (EFAULT); + /* Execute the primary logic for the ioctl. 
*/ + switch (cmd) { + case VM_RUN: { + struct vm_run vmrun; + + if (ddi_copyin(datap, &vmrun, sizeof (vmrun), md)) { + error = EFAULT; + break; } - error = vm_run(sc->vm, &vmrun); - if (ddi_copyout(&vmrun, (void *)arg, - sizeof (struct vm_run), mode)) { - return (EFAULT); + vmrun.cpuid = vcpu; + + if (!(curthread->t_schedflag & TS_VCPU)) + smt_mark_as_vcpu(); + + error = vm_run(sc->vmm_vm, &vmrun); + /* + * XXXJOY: I think it's necessary to do copyout, even in the + * face of errors, since the exit state is communicated out. + */ + if (ddi_copyout(&vmrun, datap, sizeof (vmrun), md)) { + error = EFAULT; + break; } break; - case VM_LAPIC_IRQ: - if (ddi_copyin((void *)arg, &vmirq, - sizeof (struct vm_lapic_irq), mode)) { - return (EFAULT); - } - error = lapic_intr_edge(sc->vm, vmirq.cpuid, vmirq.vector); - if (ddi_copyout(&vmirq, (void *)arg, - sizeof (struct vm_lapic_irq), mode)) { - return (EFAULT); + } + case VM_SUSPEND: { + struct vm_suspend vmsuspend; + + if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) { + error = EFAULT; + break; } + error = vm_suspend(sc->vmm_vm, vmsuspend.how); break; - case VM_LAPIC_LOCAL_IRQ: - if (ddi_copyin((void *)arg, &vmirq, - sizeof (struct vm_lapic_irq), mode)) { - return (EFAULT); - } - error = lapic_set_local_intr(sc->vm, vmirq.cpuid, - vmirq.vector); - if (ddi_copyout(&vmirq, (void *)arg, - sizeof (struct vm_lapic_irq), mode)) { - return (EFAULT); + } + case VM_REINIT: + if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) { + /* + * The VM instance should be free of driver-attached + * hooks during the reinitialization process. + */ + break; } + error = vm_reinit(sc->vmm_vm); + (void) vmm_drv_block_hook(sc, B_FALSE); break; - case VM_LAPIC_MSI: - if (ddi_copyin((void *)arg, &vmmsi, - sizeof (struct vm_lapic_msi), mode)) { - return (EFAULT); - } - error = lapic_intr_msi(sc->vm, vmmsi.addr, vmmsi.msg); - if (ddi_copyout(&vmmsi, (void *)arg, - sizeof (struct vm_lapic_msi), mode)) { - return (EFAULT); - } - case VM_IOAPIC_ASSERT_IRQ: - if (ddi_copyin((void *)arg, &ioapic_irq, - sizeof (struct vm_ioapic_irq), mode)) { - return (EFAULT); + case VM_STAT_DESC: { + struct vm_stat_desc statdesc; + + if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) { + error = EFAULT; + break; } - error = vioapic_assert_irq(sc->vm, ioapic_irq.irq);; - if (ddi_copyout(&ioapic_irq, (void *)arg, - sizeof (struct vm_ioapic_irq), mode)) { - return (EFAULT); + error = vmm_stat_desc_copy(statdesc.index, statdesc.desc, + sizeof (statdesc.desc)); + if (error == 0 && + ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) { + error = EFAULT; + break; } break; - case VM_IOAPIC_DEASSERT_IRQ: - if (ddi_copyin((void *)arg, &ioapic_irq, - sizeof (struct vm_ioapic_irq), mode)) { - return (EFAULT); + } + case VM_STATS_IOC: { + struct vm_stats vmstats; + + CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS); + if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) { + error = EFAULT; + break; } - error = vioapic_deassert_irq(sc->vm, ioapic_irq.irq); - if (ddi_copyout(&ioapic_irq, (void *)arg, - sizeof (struct vm_ioapic_irq), mode)) { - return (EFAULT); + hrt2tv(gethrtime(), &vmstats.tv); + error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid, + &vmstats.num_entries, vmstats.statbuf); + if (error == 0 && + ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) { + error = EFAULT; + break; } break; - case VM_IOAPIC_PULSE_IRQ: - if (ddi_copyin((void *)arg, &ioapic_irq, - sizeof (struct vm_ioapic_irq), mode)) { - return (EFAULT); + } + + /* XXXJOY: punt on these for now */ + case VM_PPTDEV_MSI: { + 
struct vm_pptdev_msi pptmsi; + + if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) { + error = EFAULT; + break; } - error = vioapic_pulse_irq(sc->vm, ioapic_irq.irq); - if (ddi_copyout(&ioapic_irq, (void *)arg, - sizeof (struct vm_ioapic_irq), mode)) { - return (EFAULT); + return (ENOTTY); + } + case VM_PPTDEV_MSIX: { + struct vm_pptdev_msix pptmsix; + + if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) { + error = EFAULT; + break; } - break; - case VM_IOAPIC_PINCOUNT: - error = 0; - pincount = vioapic_pincount(sc->vm); - if (ddi_copyout(&pincount, (void *)arg, sizeof (int), mode)) { - return (EFAULT); + return (ENOTTY); + } + case VM_MAP_PPTDEV_MMIO: { + struct vm_pptdev_mmio pptmmio; + + if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) { + error = EFAULT; + break; } - break; - case VM_ISA_ASSERT_IRQ: - if (ddi_copyin((void *)arg, &isa_irq, - sizeof (struct vm_isa_irq), mode)) { - return (EFAULT); + return (ENOTTY); + } + case VM_BIND_PPTDEV: + case VM_UNBIND_PPTDEV: { + struct vm_pptdev pptdev; + + if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) { + error = EFAULT; + break; } - error = vatpic_assert_irq(sc->vm, isa_irq.atpic_irq); - if (error == 0 && isa_irq.ioapic_irq != -1) - error = vioapic_assert_irq(sc->vm, - isa_irq.ioapic_irq); - if (ddi_copyout(&isa_irq, (void *)arg, - sizeof (struct vm_isa_irq), mode)) { - return (EFAULT); - + return (ENOTTY); + } + + case VM_INJECT_EXCEPTION: { + struct vm_exception vmexc; + + if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) { + error = EFAULT; + break; } + error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector, + vmexc.error_code_valid, vmexc.error_code, + vmexc.restart_instruction); break; - case VM_ISA_DEASSERT_IRQ: - if (ddi_copyin((void *)arg, &isa_irq, - sizeof (struct vm_isa_irq), mode)) { - return (EFAULT); - } - error = vatpic_deassert_irq(sc->vm, isa_irq.atpic_irq); - if (error == 0 && isa_irq.ioapic_irq != -1) - error = vioapic_deassert_irq(sc->vm, - isa_irq.ioapic_irq); - if (ddi_copyout(&isa_irq, (void *)arg, - sizeof (struct vm_isa_irq), mode)) { - return (EFAULT); - + } + case VM_INJECT_NMI: { + struct vm_nmi vmnmi; + + if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) { + error = EFAULT; + break; } + error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid); break; - case VM_ISA_PULSE_IRQ: - if (ddi_copyin((void *)arg, &isa_irq, - sizeof (struct vm_isa_irq), mode)) { - return (EFAULT); - } - error = vatpic_pulse_irq(sc->vm, isa_irq.atpic_irq); - if (error == 0 && isa_irq.ioapic_irq != -1) - error = vioapic_pulse_irq(sc->vm, isa_irq.ioapic_irq); - if (ddi_copyout(&isa_irq, (void *)arg, - sizeof (struct vm_isa_irq), mode)) { - return (EFAULT); - + } + case VM_LAPIC_IRQ: { + struct vm_lapic_irq vmirq; + + if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) { + error = EFAULT; + break; } + error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector); break; - case VM_MAP_MEMORY: - if (ddi_copyin((void *)arg, &seg, - sizeof (struct vm_memory_segment), mode)) { - return (EFAULT); + } + case VM_LAPIC_LOCAL_IRQ: { + struct vm_lapic_irq vmirq; + + if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) { + error = EFAULT; + break; } - error = vm_malloc(sc->vm, seg.gpa, seg.len); + error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid, + vmirq.vector); break; - case VM_GET_MEMORY_SEG: - if (ddi_copyin((void *)arg, &seg, - sizeof (struct vm_memory_segment), mode)) { - return (EFAULT); - } - seg.len = 0; - (void)vm_gpabase2memseg(sc->vm, seg.gpa, &seg); - if (ddi_copyout(&seg, (void *)arg, - sizeof (struct vm_memory_segment), 
mode)) { - return (EFAULT); + } + case VM_LAPIC_MSI: { + struct vm_lapic_msi vmmsi; + + if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) { + error = EFAULT; + break; } - error = 0; + error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg); break; - case VM_GET_REGISTER: - if (ddi_copyin((void *)arg, &vmreg, - sizeof (struct vm_register), mode)) { - return (EFAULT); - } - error = vm_get_register(sc->vm, vmreg.cpuid, vmreg.regnum, - &vmreg.regval); - if (!error) { - if (ddi_copyout(&vmreg, (void *)arg, - sizeof (struct vm_register), mode)) { - return (EFAULT); - } + } + + case VM_IOAPIC_ASSERT_IRQ: { + struct vm_ioapic_irq ioapic_irq; + + if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { + error = EFAULT; + break; } + error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq); break; - case VM_SET_REGISTER: - if (ddi_copyin((void *)arg, &vmreg, - sizeof (struct vm_register), mode)) { - return (EFAULT); + } + case VM_IOAPIC_DEASSERT_IRQ: { + struct vm_ioapic_irq ioapic_irq; + + if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { + error = EFAULT; + break; } - error = vm_set_register(sc->vm, vmreg.cpuid, vmreg.regnum, - vmreg.regval); + error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq); break; - case VM_SET_SEGMENT_DESCRIPTOR: - if (ddi_copyin((void *)arg, &vmsegdesc, - sizeof (struct vm_seg_desc), mode)) { - return (EFAULT); + } + case VM_IOAPIC_PULSE_IRQ: { + struct vm_ioapic_irq ioapic_irq; + + if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { + error = EFAULT; + break; } - error = vm_set_seg_desc(sc->vm, vmsegdesc.cpuid, - vmsegdesc.regnum, - &vmsegdesc.desc); + error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq); break; - case VM_GET_SEGMENT_DESCRIPTOR: - if (ddi_copyin((void *)arg, &vmsegdesc, - sizeof (struct vm_seg_desc), mode)) { - return (EFAULT); + } + case VM_IOAPIC_PINCOUNT: { + int pincount; + + pincount = vioapic_pincount(sc->vmm_vm); + if (ddi_copyout(&pincount, datap, sizeof (int), md)) { + error = EFAULT; + break; } - error = vm_get_seg_desc(sc->vm, vmsegdesc.cpuid, - vmsegdesc.regnum, - &vmsegdesc.desc); - if (!error) { - if (ddi_copyout(&vmsegdesc, (void *)arg, - sizeof (struct vm_seg_desc), mode)) { - return (EFAULT); + break; + } + + case VM_ISA_ASSERT_IRQ: { + struct vm_isa_irq isa_irq; + + if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { + error = EFAULT; + break; + } + error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq); + if (error == 0 && isa_irq.ioapic_irq != -1) { + error = vioapic_assert_irq(sc->vmm_vm, + isa_irq.ioapic_irq); + } + break; + } + case VM_ISA_DEASSERT_IRQ: { + struct vm_isa_irq isa_irq; + + if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { + error = EFAULT; + break; + } + error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq); + if (error == 0 && isa_irq.ioapic_irq != -1) { + error = vioapic_deassert_irq(sc->vmm_vm, + isa_irq.ioapic_irq); + } + break; + } + case VM_ISA_PULSE_IRQ: { + struct vm_isa_irq isa_irq; + + if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { + error = EFAULT; + break; + } + error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq); + if (error == 0 && isa_irq.ioapic_irq != -1) { + error = vioapic_pulse_irq(sc->vmm_vm, + isa_irq.ioapic_irq); + } + break; + } + case VM_ISA_SET_IRQ_TRIGGER: { + struct vm_isa_irq_trigger isa_irq_trigger; + + if (ddi_copyin(datap, &isa_irq_trigger, + sizeof (isa_irq_trigger), md)) { + error = EFAULT; + break; + } + error = vatpic_set_irq_trigger(sc->vmm_vm, + isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger); + break; + } + 
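+	/*
+	 * The memseg/memmap cases below depend on the exclusion taken
+	 * during ioctl entry: VM_MMAP_GETNEXT and VM_GET_MEMSEG were
+	 * classified as LOCK_READ_HOLD, while VM_ALLOC_MEMSEG and
+	 * VM_MMAP_MEMSEG hold the write lock, so the guest memory map
+	 * cannot shift beneath them.
+	 */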
+ case VM_MMAP_GETNEXT: { + struct vm_memmap mm; + + if (ddi_copyin(datap, &mm, sizeof (mm), md)) { + error = EFAULT; + break; + } + error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid, + &mm.segoff, &mm.len, &mm.prot, &mm.flags); + if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) { + error = EFAULT; + break; + } + break; + } + case VM_MMAP_MEMSEG: { + struct vm_memmap mm; + + if (ddi_copyin(datap, &mm, sizeof (mm), md)) { + error = EFAULT; + break; + } + error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff, + mm.len, mm.prot, mm.flags); + break; + } + case VM_ALLOC_MEMSEG: { + struct vm_memseg vmseg; + + if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) { + error = EFAULT; + break; + } + error = vmmdev_alloc_memseg(sc, &vmseg); + break; + } + case VM_GET_MEMSEG: { + struct vm_memseg vmseg; + + if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) { + error = EFAULT; + break; + } + error = vmmdev_get_memseg(sc, &vmseg); + if (error == 0 && + ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) { + error = EFAULT; + break; + } + break; + } + case VM_GET_REGISTER: { + struct vm_register vmreg; + + if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) { + error = EFAULT; + break; + } + error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum, + &vmreg.regval); + if (error == 0 && + ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) { + error = EFAULT; + break; + } + break; + } + case VM_SET_REGISTER: { + struct vm_register vmreg; + + if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) { + error = EFAULT; + break; + } + error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum, + vmreg.regval); + break; + } + case VM_SET_SEGMENT_DESCRIPTOR: { + struct vm_seg_desc vmsegd; + + if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) { + error = EFAULT; + break; + } + error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum, + &vmsegd.desc); + break; + } + case VM_GET_SEGMENT_DESCRIPTOR: { + struct vm_seg_desc vmsegd; + + if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) { + error = EFAULT; + break; + } + error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum, + &vmsegd.desc); + if (error == 0 && + ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) { + error = EFAULT; + break; + } + break; + } + case VM_GET_REGISTER_SET: { + struct vm_register_set vrs; + int regnums[VM_REG_LAST]; + uint64_t regvals[VM_REG_LAST]; + + if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) { + error = EFAULT; + break; + } + if (vrs.count > VM_REG_LAST || vrs.count == 0) { + error = EINVAL; + break; + } + if (ddi_copyin(vrs.regnums, regnums, + sizeof (int) * vrs.count, md)) { + error = EFAULT; + break; + } + + error = 0; + for (uint_t i = 0; i < vrs.count && error == 0; i++) { + if (regnums[i] < 0) { + error = EINVAL; + break; } + error = vm_get_register(sc->vmm_vm, vcpu, regnums[i], + ®vals[i]); + } + if (error == 0 && ddi_copyout(regvals, vrs.regvals, + sizeof (uint64_t) * vrs.count, md)) { + error = EFAULT; } break; - case VM_GET_CAPABILITY: - if (ddi_copyin((void *)arg, &vmcap, - sizeof (struct vm_capability), mode)) { - return (EFAULT); + } + case VM_SET_REGISTER_SET: { + struct vm_register_set vrs; + int regnums[VM_REG_LAST]; + uint64_t regvals[VM_REG_LAST]; + + if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) { + error = EFAULT; + break; } - error = vm_get_capability(sc->vm, vmcap.cpuid, - vmcap.captype, - &vmcap.capval); - if (!error) { - if (ddi_copyout(&vmcap, (void *)arg, - sizeof (struct vm_capability), mode)) { - return (EFAULT); + if (vrs.count > VM_REG_LAST || vrs.count == 0) { + error = EINVAL; + break; + } 
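+		/*
+		 * Bounding vrs.count by VM_REG_LAST above also keeps the
+		 * copyins below within the fixed-size regnums[] and
+		 * regvals[] stack arrays.
+		 */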
+ if (ddi_copyin(vrs.regnums, regnums, + sizeof (int) * vrs.count, md)) { + error = EFAULT; + break; + } + if (ddi_copyin(vrs.regvals, regvals, + sizeof (uint64_t) * vrs.count, md)) { + error = EFAULT; + break; + } + + error = 0; + for (uint_t i = 0; i < vrs.count && error == 0; i++) { + /* + * Setting registers in a set is not atomic, since a + * failure in the middle of the set will cause a + * bail-out and inconsistent register state. Callers + * should be wary of this. + */ + if (regnums[i] < 0) { + error = EINVAL; + break; } + error = vm_set_register(sc->vmm_vm, vcpu, regnums[i], + regvals[i]); } break; - case VM_SET_CAPABILITY: - if (ddi_copyin((void *)arg, &vmcap, - sizeof (struct vm_capability), mode)) { - return (EFAULT); + } + + case VM_GET_CAPABILITY: { + struct vm_capability vmcap; + + if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) { + error = EFAULT; + break; + } + error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype, + &vmcap.capval); + if (error == 0 && + ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) { + error = EFAULT; + break; } - error = vm_set_capability(sc->vm, vmcap.cpuid, - vmcap.captype, - vmcap.capval); break; - case VM_SET_X2APIC_STATE: - if (ddi_copyin((void *)arg, &x2apic, - sizeof (struct vm_x2apic), mode)) { - return (EFAULT); + } + case VM_SET_CAPABILITY: { + struct vm_capability vmcap; + + if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) { + error = EFAULT; + break; } - error = vm_set_x2apic_state(sc->vm, - x2apic.cpuid, x2apic.state); + error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype, + vmcap.capval); break; - case VM_GET_X2APIC_STATE: - if (ddi_copyin((void *)arg, &x2apic, - sizeof (struct vm_x2apic), mode)) { - return (EFAULT); + } + case VM_SET_X2APIC_STATE: { + struct vm_x2apic x2apic; + + if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) { + error = EFAULT; + break; } - error = vm_get_x2apic_state(sc->vm, - x2apic.cpuid, &x2apic.state); - if (!error) { - if (ddi_copyout(&x2apic, (void *)arg, - sizeof (struct vm_x2apic), mode)) { - return (EFAULT); - } + error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state); + break; + } + case VM_GET_X2APIC_STATE: { + struct vm_x2apic x2apic; + + if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) { + error = EFAULT; + break; + } + error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid, + &x2apic.state); + if (error == 0 && + ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) { + error = EFAULT; + break; + } + break; + } + case VM_GET_GPA_PMAP: { + struct vm_gpa_pte gpapte; + + if (ddi_copyin(datap, &gpapte, sizeof (gpapte), md)) { + error = EFAULT; + break; + } +#ifdef __FreeBSD__ + /* XXXJOY: add function? 
*/ + pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vmm_vm)), + gpapte.gpa, gpapte.pte, &gpapte.ptenum); +#endif + error = 0; + break; + } + case VM_GET_HPET_CAPABILITIES: { + struct vm_hpet_cap hpetcap; + + error = vhpet_getcap(&hpetcap); + if (error == 0 && + ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) { + error = EFAULT; + break; } break; + } case VM_GLA2GPA: { + struct vm_gla2gpa gg; + CTASSERT(PROT_READ == VM_PROT_READ); CTASSERT(PROT_WRITE == VM_PROT_WRITE); CTASSERT(PROT_EXEC == VM_PROT_EXECUTE); - if (ddi_copyin((void *)arg, &gg, - sizeof (struct vm_gla2gpa), mode)) { - return (EFAULT); + + if (ddi_copyin(datap, &gg, sizeof (gg), md)) { + error = EFAULT; + break; } - error = vm_gla2gpa(sc->vm, gg.vcpuid, &gg.paging, gg.gla, - gg.prot, &gg.gpa); - KASSERT(error == 0 || error == 1 || error == -1, - ("%s: vm_gla2gpa unknown error %d", __func__, error)); - if (error >= 0) { - /* - * error = 0: the translation was successful - * error = 1: a fault was injected into the guest - */ - gg.fault = error; - error = 0; - if (ddi_copyout(&gg, (void *)arg, - sizeof (struct vm_gla2gpa), mode)) { - return (EFAULT); - } + gg.vcpuid = vcpu; + error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla, + gg.prot, &gg.gpa, &gg.fault); + if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) { + error = EFAULT; + break; + } + break; + } + case VM_GLA2GPA_NOFAULT: { + struct vm_gla2gpa gg; + + CTASSERT(PROT_READ == VM_PROT_READ); + CTASSERT(PROT_WRITE == VM_PROT_WRITE); + CTASSERT(PROT_EXEC == VM_PROT_EXECUTE); + + if (ddi_copyin(datap, &gg, sizeof (gg), md)) { + error = EFAULT; + break; + } + gg.vcpuid = vcpu; + error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging, + gg.gla, gg.prot, &gg.gpa, &gg.fault); + if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) { + error = EFAULT; + break; + } + break; + } + + case VM_ACTIVATE_CPU: + error = vm_activate_cpu(sc->vmm_vm, vcpu); + break; + + case VM_SUSPEND_CPU: + if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { + error = EFAULT; + } else { + error = vm_suspend_cpu(sc->vmm_vm, vcpu); + } + break; + + case VM_RESUME_CPU: + if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { + error = EFAULT; + } else { + error = vm_resume_cpu(sc->vmm_vm, vcpu); + } + break; + + case VM_GET_CPUS: { + struct vm_cpuset vm_cpuset; + cpuset_t tempset; + void *srcp = &tempset; + int size; + + if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) { + error = EFAULT; + break; + } + + /* Be more generous about sizing since our cpuset_t is large. */ + size = vm_cpuset.cpusetsize; + if (size <= 0 || size > sizeof (cpuset_t)) { + error = ERANGE; + } + /* + * If they want a ulong_t or less, make sure they receive the + * low bits with all the useful information. 
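+	 * A caller passing cpusetsize equal to sizeof (ulong_t), for
+	 * example, is served from &tempset.cpub[0] rather than from the
+	 * start of the full (much larger) cpuset_t.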
+ */ + if (size <= sizeof (tempset.cpub[0])) { + srcp = &tempset.cpub[0]; + } + + if (vm_cpuset.which == VM_ACTIVE_CPUS) { + tempset = vm_active_cpus(sc->vmm_vm); + } else if (vm_cpuset.which == VM_SUSPENDED_CPUS) { + tempset = vm_suspended_cpus(sc->vmm_vm); + } else if (vm_cpuset.which == VM_DEBUG_CPUS) { + tempset = vm_debug_cpus(sc->vmm_vm); } else { + error = EINVAL; + } + + ASSERT(size > 0 && size <= sizeof (tempset)); + if (error == 0 && + ddi_copyout(srcp, vm_cpuset.cpus, size, md)) { + error = EFAULT; + break; + } + break; + } + case VM_SET_INTINFO: { + struct vm_intinfo vmii; + + if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) { + error = EFAULT; + break; + } + error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1); + break; + } + case VM_GET_INTINFO: { + struct vm_intinfo vmii; + + vmii.vcpuid = vcpu; + error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1, + &vmii.info2); + if (error == 0 && + ddi_copyout(&vmii, datap, sizeof (vmii), md)) { + error = EFAULT; + break; + } + break; + } + case VM_RTC_WRITE: { + struct vm_rtc_data rtcdata; + + if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) { + error = EFAULT; + break; + } + error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset, + rtcdata.value); + break; + } + case VM_RTC_READ: { + struct vm_rtc_data rtcdata; + + if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) { + error = EFAULT; + break; + } + error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset, + &rtcdata.value); + if (error == 0 && + ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) { + error = EFAULT; + break; + } + break; + } + case VM_RTC_SETTIME: { + struct vm_rtc_time rtctime; + + if (ddi_copyin(datap, &rtctime, sizeof (rtctime), md)) { + error = EFAULT; + break; + } + error = vrtc_set_time(sc->vmm_vm, rtctime.secs); + break; + } + case VM_RTC_GETTIME: { + struct vm_rtc_time rtctime; + + rtctime.secs = vrtc_get_time(sc->vmm_vm); + if (ddi_copyout(&rtctime, datap, sizeof (rtctime), md)) { + error = EFAULT; + break; + } + break; + } + + case VM_RESTART_INSTRUCTION: + error = vm_restart_instruction(sc->vmm_vm, vcpu); + break; + + case VM_SET_TOPOLOGY: { + struct vm_cpu_topology topo; + + if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) { + error = EFAULT; + break; + } + error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores, + topo.threads, topo.maxcpus); + break; + } + case VM_GET_TOPOLOGY: { + struct vm_cpu_topology topo; + + vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores, + &topo.threads, &topo.maxcpus); + if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) { + error = EFAULT; + break; + } + break; + } + +#ifndef __FreeBSD__ + case VM_DEVMEM_GETOFFSET: { + struct vm_devmem_offset vdo; + list_t *dl = &sc->vmm_devmem_list; + vmm_devmem_entry_t *de = NULL; + + if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) { error = EFAULT; + break; + } + + for (de = list_head(dl); de != NULL; de = list_next(dl, de)) { + if (de->vde_segid == vdo.segid) { + break; + } + } + if (de != NULL) { + vdo.offset = de->vde_off; + if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) { + error = EFAULT; + } + } else { + error = ENOENT; } break; } - case VM_ACTIVATE_CPU: - if (ddi_copyin((void *)arg, &vac, - sizeof (struct vm_activate_cpu), mode)) { - return (EFAULT); - } - error = vm_activate_cpu(sc->vm, vac.vcpuid); - break; - case VM_RESTART_INSTRUCTION: - error = vm_restart_instruction(sc->vm, vcpu); - break; - default: - error = ENOTTY; - break; + case VM_WRLOCK_CYCLE: { + /* + * Present a test mechanism to acquire/release the write lock + * on the VM without any 
other effects. + */ + break; + } +#endif + default: + error = ENOTTY; + break; + } + + /* Release exclusion resources */ + switch (lock_type) { + case LOCK_NONE: + break; + case LOCK_VCPU: + vcpu_unlock_one(sc, vcpu); + break; + case LOCK_READ_HOLD: + vmm_read_unlock(sc); + break; + case LOCK_WRITE_HOLD: + vmm_write_unlock(sc); + break; + default: + panic("unexpected lock type"); + break; + } + + return (error); +} + +static vmm_softc_t * +vmm_lookup(const char *name) +{ + list_t *vml = &vmm_list; + vmm_softc_t *sc; + + ASSERT(MUTEX_HELD(&vmm_mtx)); + + for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) { + if (strcmp(sc->vmm_name, name) == 0) { + break; + } + } + + return (sc); +} + +static int +vmmdev_do_vm_create(char *name, cred_t *cr) +{ + vmm_softc_t *sc = NULL; + minor_t minor; + int error = ENOMEM; + + if (strnlen(name, VM_MAX_NAMELEN) >= VM_MAX_NAMELEN) { + return (EINVAL); + } + + mutex_enter(&vmm_mtx); + + /* Look for duplicates names */ + if (vmm_lookup(name) != NULL) { + mutex_exit(&vmm_mtx); + return (EEXIST); + } + + /* Allow only one instance per non-global zone. */ + if (!INGLOBALZONE(curproc)) { + for (sc = list_head(&vmm_list); sc != NULL; + sc = list_next(&vmm_list, sc)) { + if (sc->vmm_zone == curzone) { + mutex_exit(&vmm_mtx); + return (EINVAL); + } + } + } + + minor = id_alloc(vmm_minors); + if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) { + goto fail; + } else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) { + ddi_soft_state_free(vmm_statep, minor); + goto fail; + } else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor, + DDI_PSEUDO, 0) != DDI_SUCCESS) { + goto fail; + } + + error = vm_create(name, &sc->vmm_vm); + if (error == 0) { + /* Complete VM intialization and report success. */ + (void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name)); + sc->vmm_minor = minor; + list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t), + offsetof(vmm_devmem_entry_t, vde_node)); + + list_create(&sc->vmm_holds, sizeof (vmm_hold_t), + offsetof(vmm_hold_t, vmh_node)); + cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL); + + mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t), + offsetof(vmm_lease_t, vml_node)); + cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL); + rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL); + + sc->vmm_zone = crgetzone(cr); + zone_hold(sc->vmm_zone); + vmm_zsd_add_vm(sc); + + list_insert_tail(&vmm_list, sc); + mutex_exit(&vmm_mtx); + return (0); + } + + ddi_remove_minor_node(vmmdev_dip, name); +fail: + id_free(vmm_minors, minor); + if (sc != NULL) { + ddi_soft_state_free(vmm_statep, minor); + } + mutex_exit(&vmm_mtx); + + return (error); +} + +/* + * Bhyve 'Driver' Interface + * + * While many devices are emulated in the bhyve userspace process, there are + * others with performance constraints which require that they run mostly or + * entirely in-kernel. For those not integrated directly into bhyve, an API is + * needed so they can query/manipulate the portions of VM state needed to + * fulfill their purpose. + * + * This includes: + * - Translating guest-physical addresses to host-virtual pointers + * - Injecting MSIs + * - Hooking IO port addresses + * + * The vmm_drv interface exists to provide that functionality to its consumers. 
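+ *
+ * A minimal consumer sketch (hypothetical client code; 'fp', 'cr',
+ * 'my_expire_cb', 'arg', 'gpa', and 'sz' stand in for the consumer's
+ * own state):
+ *
+ *	vmm_hold_t *hold;
+ *	vmm_lease_t *lease;
+ *
+ *	if (vmm_drv_hold(fp, cr, &hold) == 0) {
+ *		lease = vmm_drv_lease_sign(hold, my_expire_cb, arg);
+ *		if (lease != NULL) {
+ *			void *kva = vmm_drv_gpa2kva(lease, gpa, sz);
+ *			... operate on guest memory ...
+ *			vmm_drv_lease_break(hold, lease);
+ *		}
+ *		vmm_drv_rele(hold);
+ *	}
+ *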
+ * (At this time, 'viona' is the only user) + */ +int +vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp) +{ + vnode_t *vp = fp->f_vnode; + const dev_t dev = vp->v_rdev; + vmm_softc_t *sc; + vmm_hold_t *hold; + int err = 0; + + if (vp->v_type != VCHR) { + return (ENXIO); + } + const major_t major = getmajor(dev); + const minor_t minor = getminor(dev); + + mutex_enter(&vmmdev_mtx); + if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) { + mutex_exit(&vmmdev_mtx); + return (ENOENT); } + mutex_enter(&vmm_mtx); + mutex_exit(&vmmdev_mtx); - if (state_changed == 1) { - vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false); - } else if (state_changed == 2) { - for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) - vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false); + if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) { + err = ENOENT; + goto out; } + /* XXXJOY: check cred permissions against instance */ -done: - /* Make sure that no handler returns a bogus value like ERESTART */ - KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error)); - return (error); + if ((sc->vmm_flags & (VMM_CLEANUP|VMM_PURGED|VMM_DESTROY)) != 0) { + err = EBUSY; + goto out; + } + + hold = kmem_zalloc(sizeof (*hold), KM_SLEEP); + hold->vmh_sc = sc; + hold->vmh_release_req = B_FALSE; + + list_insert_tail(&sc->vmm_holds, hold); + sc->vmm_flags |= VMM_HELD; + *holdp = hold; + +out: + mutex_exit(&vmm_mtx); + return (err); } -static -minor_t vmm_find_free_minor(void) +void +vmm_drv_rele(vmm_hold_t *hold) { - minor_t minor; - - for (minor = 1; ; minor++) { - if (ddi_get_soft_state(vmm_statep, minor) == NULL) - break; + vmm_softc_t *sc; + + ASSERT(hold != NULL); + ASSERT(hold->vmh_sc != NULL); + VERIFY(hold->vmh_ioport_hook_cnt == 0); + + mutex_enter(&vmm_mtx); + sc = hold->vmh_sc; + list_remove(&sc->vmm_holds, hold); + if (list_is_empty(&sc->vmm_holds)) { + sc->vmm_flags &= ~VMM_HELD; + cv_broadcast(&sc->vmm_cv); } + mutex_exit(&vmm_mtx); + kmem_free(hold, sizeof (*hold)); +} + +boolean_t +vmm_drv_release_reqd(vmm_hold_t *hold) +{ + ASSERT(hold != NULL); - return (minor); + return (hold->vmh_release_req); } -int -vmmdev_do_vm_create(dev_info_t *dip, char *name) +vmm_lease_t * +vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg) { - struct vmm_softc *sc; - minor_t minor; - int error; + vmm_softc_t *sc = hold->vmh_sc; + vmm_lease_t *lease; - mutex_enter(&vmmdev_mtx); + ASSERT3P(expiref, !=, NULL); - if (strlen(name) >= VM_MAX_NAMELEN) { - mutex_exit(&vmmdev_mtx); - return (EINVAL); + if (hold->vmh_release_req) { + return (NULL); } - minor = vmm_find_free_minor(); - if (ddi_soft_state_zalloc(vmm_statep, minor) == DDI_FAILURE) { - mutex_exit(&vmmdev_mtx); - return (DDI_FAILURE); + lease = kmem_alloc(sizeof (*lease), KM_SLEEP); + list_link_init(&lease->vml_node); + lease->vml_expire_func = expiref; + lease->vml_expire_arg = arg; + lease->vml_expired = B_FALSE; + lease->vml_hold = hold; + /* cache the VM pointer for one less pointer chase */ + lease->vml_vm = sc->vmm_vm; + + mutex_enter(&sc->vmm_lease_lock); + while (sc->vmm_lease_blocker != 0) { + cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock); } + list_insert_tail(&sc->vmm_lease_list, lease); + vmm_read_lock(sc); + mutex_exit(&sc->vmm_lease_lock); - if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) { - ddi_soft_state_free(vmm_statep, minor); - mutex_exit(&vmmdev_mtx); - return (DDI_FAILURE); - } - strcpy(sc->name, name); - sc->minor = minor; + return (lease); +} - if (ddi_create_minor_node(dip, name, S_IFCHR, minor, - DDI_PSEUDO, 
0) == DDI_FAILURE) { - ddi_soft_state_free(vmm_statep, minor); - mutex_exit(&vmmdev_mtx); - return (DDI_FAILURE); - } +static void +vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease) +{ + ASSERT(MUTEX_HELD(&sc->vmm_lease_lock)); - error = vm_create(name, &sc->vm); - if (error != 0) { - ddi_soft_state_free(vmm_statep, minor); - ddi_remove_minor_node(dip, name); - mutex_exit(&vmmdev_mtx); - return (error); + list_remove(&sc->vmm_lease_list, lease); + vmm_read_unlock(sc); + kmem_free(lease, sizeof (*lease)); +} + +void +vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease) +{ + vmm_softc_t *sc = hold->vmh_sc; + + VERIFY3P(hold, ==, lease->vml_hold); + + mutex_enter(&sc->vmm_lease_lock); + vmm_lease_break_locked(sc, lease); + mutex_exit(&sc->vmm_lease_lock); +} + +boolean_t +vmm_drv_lease_expired(vmm_lease_t *lease) +{ + return (lease->vml_expired); +} + +void * +vmm_drv_gpa2kva(vmm_lease_t *lease, uintptr_t gpa, size_t sz) +{ + ASSERT(lease != NULL); + + return (vmspace_find_kva(vm_get_vmspace(lease->vml_vm), gpa, sz)); +} + +int +vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg) +{ + ASSERT(lease != NULL); + + return (lapic_intr_msi(lease->vml_vm, addr, msg)); +} + +int +vmm_drv_ioport_hook(vmm_hold_t *hold, uint_t ioport, vmm_drv_rmem_cb_t rfunc, + vmm_drv_wmem_cb_t wfunc, void *arg, void **cookie) +{ + vmm_softc_t *sc; + int err; + + ASSERT(hold != NULL); + ASSERT(cookie != NULL); + + sc = hold->vmh_sc; + mutex_enter(&vmm_mtx); + /* Confirm that hook installation is not blocked */ + if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) { + mutex_exit(&vmm_mtx); + return (EBUSY); + } + /* + * Optimistically record an installed hook which will prevent a block + * from being asserted while the mutex is dropped. + */ + hold->vmh_ioport_hook_cnt++; + mutex_exit(&vmm_mtx); + + vmm_write_lock(sc); + err = vm_ioport_hook(sc->vmm_vm, ioport, (vmm_rmem_cb_t)rfunc, + (vmm_wmem_cb_t)wfunc, arg, cookie); + vmm_write_unlock(sc); + + if (err != 0) { + mutex_enter(&vmm_mtx); + /* Walk back optimism about the hook installation */ + hold->vmh_ioport_hook_cnt--; + mutex_exit(&vmm_mtx); } - SLIST_INSERT_HEAD(&head, sc, link); + return (err); +} - mutex_exit(&vmmdev_mtx); +void +vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie) +{ + vmm_softc_t *sc; - return (0); + ASSERT(hold != NULL); + ASSERT(cookie != NULL); + ASSERT(hold->vmh_ioport_hook_cnt != 0); + + sc = hold->vmh_sc; + vmm_write_lock(sc); + vm_ioport_unhook(sc->vmm_vm, cookie); + vmm_write_unlock(sc); + + mutex_enter(&vmm_mtx); + hold->vmh_ioport_hook_cnt--; + mutex_exit(&vmm_mtx); } -static struct vmm_softc * -vmm_lookup(char *name) +static int +vmm_drv_purge(vmm_softc_t *sc) { - struct vmm_softc *sc; + ASSERT(MUTEX_HELD(&vmm_mtx)); - SLIST_FOREACH(sc, &head, link) { - if (strcmp(sc->name, name) == 0) { - break; + if ((sc->vmm_flags & VMM_HELD) != 0) { + vmm_hold_t *hold; + + sc->vmm_flags |= VMM_CLEANUP; + for (hold = list_head(&sc->vmm_holds); hold != NULL; + hold = list_next(&sc->vmm_holds, hold)) { + hold->vmh_release_req = B_TRUE; } + while ((sc->vmm_flags & VMM_HELD) != 0) { + if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) { + return (EINTR); + } + } + sc->vmm_flags &= ~VMM_CLEANUP; } - return (sc); - + VERIFY(list_is_empty(&sc->vmm_holds)); + sc->vmm_flags |= VMM_PURGED; + return (0); } -struct vm * -vm_lookup_by_name(char *name) +static int +vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block) { - struct vmm_softc *sc; + int err = 0; - mutex_enter(&vmmdev_mtx); + mutex_enter(&vmm_mtx); + if (!enable_block) { + 
VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0); - if ((sc = vmm_lookup(name)) == NULL) { - mutex_exit(&vmmdev_mtx); - return (NULL); + sc->vmm_flags &= ~VMM_BLOCK_HOOK; + goto done; } - mutex_exit(&vmmdev_mtx); + /* If any holds have hooks installed, the block is a failure */ + if (!list_is_empty(&sc->vmm_holds)) { + vmm_hold_t *hold; + + for (hold = list_head(&sc->vmm_holds); hold != NULL; + hold = list_next(&sc->vmm_holds, hold)) { + if (hold->vmh_ioport_hook_cnt != 0) { + err = EBUSY; + goto done; + } + } + } + sc->vmm_flags |= VMM_BLOCK_HOOK; - return (sc->vm); +done: + mutex_exit(&vmm_mtx); + return (err); } -int -vmmdev_do_vm_destroy(dev_info_t *dip, char *name) +static int +vmm_do_vm_destroy_locked(vmm_softc_t *sc, boolean_t clean_zsd) { - struct vmm_softc *sc; - dev_info_t *pdip = ddi_get_parent(dip); + dev_info_t *pdip = ddi_get_parent(vmmdev_dip); + minor_t minor; - mutex_enter(&vmmdev_mtx); + ASSERT(MUTEX_HELD(&vmm_mtx)); - if ((sc = vmm_lookup(name)) == NULL) { - mutex_exit(&vmmdev_mtx); - return (ENOENT); + if (clean_zsd) { + vmm_zsd_rem_vm(sc); } - if (sc->open) { - mutex_exit(&vmmdev_mtx); - return (EBUSY); + if (vmm_drv_purge(sc) != 0) { + return (EINTR); } - vm_destroy(sc->vm); - SLIST_REMOVE(&head, sc, vmm_softc, link); - ddi_remove_minor_node(dip, name); - ddi_soft_state_free(vmm_statep, sc->minor); - (void) devfs_clean(pdip, NULL, DV_CLEAN_FORCE); + /* Clean up devmem entries */ + vmmdev_devmem_purge(sc); - mutex_exit(&vmmdev_mtx); + list_remove(&vmm_list, sc); + ddi_remove_minor_node(vmmdev_dip, sc->vmm_name); + minor = sc->vmm_minor; + zone_rele(sc->vmm_zone); + if (sc->vmm_is_open) { + list_insert_tail(&vmm_destroy_list, sc); + sc->vmm_flags |= VMM_DESTROY; + } else { + vm_destroy(sc->vmm_vm); + ddi_soft_state_free(vmm_statep, minor); + id_free(vmm_minors, minor); + } + (void) devfs_clean(pdip, NULL, DV_CLEAN_FORCE); return (0); } int -vmmdev_do_vm_mmap(struct vmm_softc *vmm_sc, off_t off, int nprot) +vmm_do_vm_destroy(vmm_softc_t *sc, boolean_t clean_zsd) { - vm_paddr_t paddr; + int err; - mutex_enter(&vmmdev_mtx); + mutex_enter(&vmm_mtx); + err = vmm_do_vm_destroy_locked(sc, clean_zsd); + mutex_exit(&vmm_mtx); - paddr = vm_gpa2hpa(vmm_sc->vm, (vm_paddr_t)off, PAGE_SIZE); - if (paddr == -1) { - return (-1); - } + return (err); +} - mutex_exit(&vmmdev_mtx); +/* ARGSUSED */ +static int +vmmdev_do_vm_destroy(const char *name, cred_t *cr) +{ + vmm_softc_t *sc; + int err; + + if (crgetuid(cr) != 0) + return (EPERM); + + mutex_enter(&vmm_mtx); + + if ((sc = vmm_lookup(name)) == NULL) { + mutex_exit(&vmm_mtx); + return (ENOENT); + } + /* + * We don't check this in vmm_lookup() since that function is also used + * for validation during create and currently vmm names must be unique. 
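+	 * Instead the ownership check is made here: a caller outside the
+	 * global zone may only destroy a VM created in its own zone.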
+ */ + if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) { + mutex_exit(&vmm_mtx); + return (EPERM); + } + err = vmm_do_vm_destroy_locked(sc, B_TRUE); + mutex_exit(&vmm_mtx); - return (btop(paddr)); + return (err); } static int vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp) { - minor_t minor; - struct vmm_softc *sc; + minor_t minor; + vmm_softc_t *sc; minor = getminor(*devp); if (minor == VMM_CTL_MINOR) { @@ -768,19 +1750,15 @@ vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp) return (0); } - mutex_enter(&vmmdev_mtx); + mutex_enter(&vmm_mtx); sc = ddi_get_soft_state(vmm_statep, minor); if (sc == NULL) { - mutex_exit(&vmmdev_mtx); + mutex_exit(&vmm_mtx); return (ENXIO); } - if (sc->open) { - mutex_exit(&vmmdev_mtx); - return (EBUSY); - } - sc->open = B_TRUE; - mutex_exit(&vmmdev_mtx); + sc->vmm_is_open = B_TRUE; + mutex_exit(&vmm_mtx); return (0); } @@ -788,170 +1766,360 @@ vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp) static int vmm_close(dev_t dev, int flag, int otyp, cred_t *credp) { - minor_t minor; - struct vmm_softc *sc; + minor_t minor; + vmm_softc_t *sc; minor = getminor(dev); if (minor == VMM_CTL_MINOR) return (0); - mutex_enter(&vmmdev_mtx); + mutex_enter(&vmm_mtx); sc = ddi_get_soft_state(vmm_statep, minor); if (sc == NULL) { - mutex_exit(&vmmdev_mtx); + mutex_exit(&vmm_mtx); return (ENXIO); } - sc->open = B_FALSE; - mutex_exit(&vmmdev_mtx); + VERIFY(sc->vmm_is_open); + sc->vmm_is_open = B_FALSE; + + if (sc->vmm_flags & VMM_DESTROY) { + list_remove(&vmm_destroy_list, sc); + vm_destroy(sc->vmm_vm); + ddi_soft_state_free(vmm_statep, minor); + id_free(vmm_minors, minor); + } + mutex_exit(&vmm_mtx); return (0); } +static int +vmm_is_supported(intptr_t arg) +{ + int r; + const char *msg; + + if (vmm_is_intel()) { + r = vmx_x86_supported(&msg); + } else if (vmm_is_amd()) { + /* + * HMA already ensured that the features necessary for SVM + * operation were present and online during vmm_attach(). 
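+	 * (Contrast the Intel path above, which calls
+	 * vmx_x86_supported() so a human-readable reason can be copied
+	 * out to the caller on failure.)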
+ */ + r = 0; + } else { + r = ENXIO; + msg = "Unsupported CPU vendor"; + } + + if (r != 0 && arg != (intptr_t)NULL) { + if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0) + return (EFAULT); + } + return (r); +} + static int vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) { - struct vmm_softc *sc; - struct vmm_ioctl kvi; - minor_t minor; + vmm_softc_t *sc; + minor_t minor; minor = getminor(dev); if (minor == VMM_CTL_MINOR) { - if (ddi_copyin((void *)arg, &kvi, sizeof (struct vmm_ioctl), - mode)) { - return (EFAULT); + void *argp = (void *)arg; + char name[VM_MAX_NAMELEN] = { 0 }; + size_t len = 0; + + if ((mode & FKIOCTL) != 0) { + len = strlcpy(name, argp, sizeof (name)); + } else { + if (copyinstr(argp, name, sizeof (name), &len) != 0) { + return (EFAULT); + } } + if (len >= VM_MAX_NAMELEN) { + return (ENAMETOOLONG); + } + switch (cmd) { case VMM_CREATE_VM: if ((mode & FWRITE) == 0) return (EPERM); - return (vmmdev_do_vm_create(vmm_dip, kvi.vmm_name)); + return (vmmdev_do_vm_create(name, credp)); case VMM_DESTROY_VM: if ((mode & FWRITE) == 0) return (EPERM); - return (vmmdev_do_vm_destroy(vmm_dip, kvi.vmm_name)); + return (vmmdev_do_vm_destroy(name, credp)); + case VMM_VM_SUPPORTED: + return (vmm_is_supported(arg)); default: - break; + /* No other actions are legal on ctl device */ + return (ENOTTY); } } sc = ddi_get_soft_state(vmm_statep, minor); ASSERT(sc); + if (sc->vmm_flags & VMM_DESTROY) + return (ENXIO); + return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp)); } static int -vmm_mmap(dev_t dev, off_t off, int prot) +vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len, + unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp) { - struct vmm_softc *sc; + vmm_softc_t *sc; + const minor_t minor = getminor(dev); + struct vm *vm; + int err; + vm_object_t vmo = NULL; + struct vmspace *vms; + + if (minor == VMM_CTL_MINOR) { + return (ENODEV); + } + if (off < 0 || (off + len) <= 0) { + return (EINVAL); + } + if ((prot & PROT_USER) == 0) { + return (EACCES); + } - sc = ddi_get_soft_state(vmm_statep, getminor(dev)); + sc = ddi_get_soft_state(vmm_statep, minor); ASSERT(sc); - return (vmmdev_do_vm_mmap(sc, off, prot)); -} + if (sc->vmm_flags & VMM_DESTROY) + return (ENXIO); -static int -vmm_segmap(dev_t dev, off_t off, struct as *as, - caddr_t *addrp, off_t len, unsigned int prot, - unsigned int maxprot, unsigned int flags, cred_t *credp) -{ - struct segdev_crargs dev_a; - int error; + /* Grab read lock on the VM to prevent any changes to the memory map */ + vmm_read_lock(sc); - as_rangelock(as); + vm = sc->vmm_vm; + vms = vm_get_vmspace(vm); + if (off >= VM_DEVMEM_START) { + int segid; - error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); - if (error != 0) { - as_rangeunlock(as); - return (error); + /* Mapping a devmem "device" */ + if (!vmmdev_devmem_segid(sc, off, len, &segid)) { + err = ENODEV; + goto out; + } + err = vm_get_memseg(vm, segid, NULL, NULL, &vmo); + if (err != 0) { + goto out; + } + err = vm_segmap_obj(vms, vmo, as, addrp, prot, maxprot, flags); + } else { + /* Mapping a part of the guest physical space */ + err = vm_segmap_space(vms, off, as, addrp, len, prot, maxprot, + flags); } - dev_a.mapfunc = vmm_mmap; - dev_a.dev = dev; - dev_a.offset = off; - dev_a.type = (flags & MAP_TYPE); - dev_a.prot = (uchar_t)prot; - dev_a.maxprot = (uchar_t)maxprot; - dev_a.hat_attr = 0; - dev_a.hat_flags = HAT_LOAD_NOCONSIST; - dev_a.devmap_data = NULL; - - error = as_map(as, *addrp, len, 
segdev_create, &dev_a); - - as_rangeunlock(as); - return (error); +out: + vmm_read_unlock(sc); + return (err); } -static int -vmm_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) +static sdev_plugin_validate_t +vmm_sdev_validate(sdev_ctx_t ctx) { - return (0); + const char *name = sdev_ctx_name(ctx); + vmm_softc_t *sc; + sdev_plugin_validate_t ret; + minor_t minor; + + if (sdev_ctx_vtype(ctx) != VCHR) + return (SDEV_VTOR_INVALID); + + VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0); + + mutex_enter(&vmm_mtx); + if ((sc = vmm_lookup(name)) == NULL) + ret = SDEV_VTOR_INVALID; + else if (sc->vmm_minor != minor) + ret = SDEV_VTOR_STALE; + else + ret = SDEV_VTOR_VALID; + mutex_exit(&vmm_mtx); + + return (ret); } static int -vmm_probe(dev_info_t *dip) +vmm_sdev_filldir(sdev_ctx_t ctx) { - if (driver_installed(ddi_name_to_major("kvm"))) { - cmn_err(CE_WARN, "kvm is installed\n"); - return (DDI_PROBE_FAILURE); + vmm_softc_t *sc; + int ret; + + if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) { + cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__, + sdev_ctx_path(ctx), VMM_SDEV_ROOT); + return (EINVAL); + } + + mutex_enter(&vmm_mtx); + ASSERT(vmmdev_dip != NULL); + for (sc = list_head(&vmm_list); sc != NULL; + sc = list_next(&vmm_list, sc)) { + if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) { + ret = sdev_plugin_mknod(ctx, sc->vmm_name, + S_IFCHR | 0600, + makedevice(ddi_driver_major(vmmdev_dip), + sc->vmm_minor)); + } else { + continue; + } + if (ret != 0 && ret != EEXIST) + goto out; } - return (DDI_PROBE_SUCCESS); + ret = 0; + +out: + mutex_exit(&vmm_mtx); + return (ret); +} + +/* ARGSUSED */ +static void +vmm_sdev_inactive(sdev_ctx_t ctx) +{ } +static sdev_plugin_ops_t vmm_sdev_ops = { + .spo_version = SDEV_PLUGIN_VERSION, + .spo_flags = SDEV_PLUGIN_SUBDIR, + .spo_validate = vmm_sdev_validate, + .spo_filldir = vmm_sdev_filldir, + .spo_inactive = vmm_sdev_inactive +}; + +/* ARGSUSED */ static int -vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) { + int error; + switch (cmd) { - case DDI_ATTACH: + case DDI_INFO_DEVT2DEVINFO: + *result = (void *)vmmdev_dip; + error = DDI_SUCCESS; + break; + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)0; + error = DDI_SUCCESS; break; default: + error = DDI_FAILURE; + break; + } + return (error); +} + +static int +vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + sdev_plugin_hdl_t sph; + hma_reg_t *reg = NULL; + boolean_t vmm_loaded = B_FALSE; + + if (cmd != DDI_ATTACH) { return (DDI_FAILURE); } - if (vmm_mod_load()) { + mutex_enter(&vmmdev_mtx); + /* Ensure we are not already attached. */ + if (vmmdev_dip != NULL) { + mutex_exit(&vmmdev_mtx); return (DDI_FAILURE); } - vmm_dip = dip; + vmm_sol_glue_init(); + vmm_arena_init(); - /* - * Create control node. Other nodes will be created on demand. - */ - if (ddi_create_minor_node(dip, VMM_CTL_MINOR_NODE, S_IFCHR, + if ((reg = hma_register(vmmdev_hvm_name)) == NULL) { + goto fail; + } else if (vmm_mod_load() != 0) { + goto fail; + } + vmm_loaded = B_TRUE; + + /* Create control node. Other nodes will be created on demand. 
*/ + if (ddi_create_minor_node(dip, "ctl", S_IFCHR, VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) { - return (DDI_FAILURE); + goto fail; } - ddi_report_dev(dip); + if ((sph = sdev_plugin_register("vmm", &vmm_sdev_ops, NULL)) == + (sdev_plugin_hdl_t)NULL) { + ddi_remove_minor_node(dip, NULL); + goto fail; + } + ddi_report_dev(dip); + vmmdev_hma_reg = reg; + vmmdev_sdev_hdl = sph; + vmmdev_dip = dip; + mutex_exit(&vmmdev_mtx); return (DDI_SUCCESS); + +fail: + if (vmm_loaded) { + VERIFY0(vmm_mod_unload()); + } + if (reg != NULL) { + hma_unregister(reg); + } + vmm_arena_fini(); + vmm_sol_glue_cleanup(); + mutex_exit(&vmmdev_mtx); + return (DDI_FAILURE); } static int vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) { - switch (cmd) { - case DDI_DETACH: - break; - default: + if (cmd != DDI_DETACH) { + return (DDI_FAILURE); + } + + /* Ensure that all resources have been cleaned up */ + mutex_enter(&vmmdev_mtx); + + mutex_enter(&vmm_mtx); + if (!list_is_empty(&vmm_list) || !list_is_empty(&vmm_destroy_list)) { + mutex_exit(&vmm_mtx); + mutex_exit(&vmmdev_mtx); return (DDI_FAILURE); } + mutex_exit(&vmm_mtx); - if (vmm_mod_unload()) {; + VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL); + if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) { + mutex_exit(&vmmdev_mtx); return (DDI_FAILURE); } + vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL; - /* - * Remove the control node. - */ - ddi_remove_minor_node(dip, VMM_CTL_MINOR_NODE); - vmm_dip = NULL; + /* Remove the control node. */ + ddi_remove_minor_node(dip, "ctl"); + vmmdev_dip = NULL; + + VERIFY0(vmm_mod_unload()); + hma_unregister(vmmdev_hma_reg); + vmmdev_hma_reg = NULL; + vmm_arena_fini(); + vmm_sol_glue_cleanup(); + + mutex_exit(&vmmdev_mtx); return (DDI_SUCCESS); } @@ -966,7 +2134,7 @@ static struct cb_ops vmm_cb_ops = { nodev, /* write */ vmm_ioctl, nodev, /* devmap */ - vmm_mmap, + nodev, /* mmap */ vmm_segmap, nochpoll, /* poll */ ddi_prop_op, @@ -977,9 +2145,9 @@ static struct cb_ops vmm_cb_ops = { static struct dev_ops vmm_ops = { DEVO_REV, 0, - ddi_no_info, + vmm_info, nulldev, /* identify */ - vmm_probe, + nulldev, /* probe */ vmm_attach, vmm_detach, nodev, /* reset */ @@ -989,7 +2157,7 @@ static struct dev_ops vmm_ops = { static struct modldrv modldrv = { &mod_driverops, - "vmm", + "bhyve vmm", &vmm_ops }; @@ -1004,16 +2172,27 @@ _init(void) { int error; - mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL); + sysinit(); - error = ddi_soft_state_init(&vmm_statep, sizeof (struct vmm_softc), 0); + mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL); + mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL); + list_create(&vmm_list, sizeof (vmm_softc_t), + offsetof(vmm_softc_t, vmm_node)); + list_create(&vmm_destroy_list, sizeof (vmm_softc_t), + offsetof(vmm_softc_t, vmm_node)); + vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32); + + error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0); if (error) { return (error); } + vmm_zsd_init(); + error = mod_install(&modlinkage); if (error) { ddi_soft_state_fini(&vmm_statep); + vmm_zsd_fini(); } return (error); @@ -1028,6 +2207,9 @@ _fini(void) if (error) { return (error); } + + vmm_zsd_fini(); + ddi_soft_state_fini(&vmm_statep); return (0); diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_ept.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_ept.c new file mode 100644 index 0000000000..c26e763805 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_ept.c @@ -0,0 +1,268 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 
1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#include +#include +#include +#include + +#include +#include + + +struct ept_map { + gipt_map_t em_gipt; + uint64_t em_wired_page_count; +}; +typedef struct ept_map ept_map_t; + +#define EPT_LOCK(m) (&(m)->em_gipt.giptm_lock) + +#define EPT_MAX_LEVELS 4 + +CTASSERT(EPT_MAX_LEVELS <= GIPT_MAX_LEVELS); + +#define EPT_R (0x1 << 0) +#define EPT_W (0x1 << 1) +#define EPT_X (0x1 << 2) +#define EPT_RWX (EPT_R | EPT_W | EPT_X) +#define EPT_LGPG (0x1 << 7) + +#define EPT_PA_MASK (0x000ffffffffff000ull) + +CTASSERT(EPT_R == PROT_READ); +CTASSERT(EPT_W == PROT_WRITE); +CTASSERT(EPT_X == PROT_EXEC); + + +#define EPT_PAT(attr) (((attr) & 0x7) << 3) +#define EPT_PADDR(addr) ((addr) & EPT_PA_MASK) + +#define EPT_IS_ABSENT(pte) (((pte) & EPT_RWX) == 0) +#define EPT_PTE_PFN(pte) mmu_btop(EPT_PADDR(pte)) +#define EPT_PTE_PROT(pte) ((pte) & EPT_RWX) +#define EPT_MAPS_PAGE(pte, lvl) \ + (EPT_PTE_PROT(pte) != 0 && (((pte) & EPT_LGPG) != 0 || (lvl) == 0)) + +/* + * Only assign EPT_LGPG for levels higher than 0. Although this bit is defined + * as being ignored at level 0, some versions of VMWare fail to honor this and + * report such a PTE as an EPT mis-configuration. + */ +#define EPT_PTE_ASSIGN_PAGE(lvl, pfn, prot, attr) \ + (EPT_PADDR(pfn_to_pa(pfn)) | \ + (((lvl) != 0) ? EPT_LGPG : 0) | \ + EPT_PAT(attr) | ((prot) & EPT_RWX)) +#define EPT_PTE_ASSIGN_TABLE(pfn) (EPT_PADDR(pfn_to_pa(pfn)) | EPT_RWX) + + +static gipt_pte_type_t +ept_pte_type(uint64_t pte, uint_t level) +{ + if (EPT_IS_ABSENT(pte)) { + return (PTET_EMPTY); + } else if (EPT_MAPS_PAGE(pte, level)) { + return (PTET_PAGE); + } else { + return (PTET_LINK); + } +} + +static uint64_t +ept_pte_map(uint64_t pfn) +{ + return (EPT_PTE_ASSIGN_TABLE(pfn)); +} + +static void * +ept_create(uintptr_t *pml4_kaddr) +{ + ept_map_t *emap; + gipt_map_t *map; + gipt_t *root; + struct gipt_cbs cbs = { + .giptc_pte_type = ept_pte_type, + .giptc_pte_map = ept_pte_map, + }; + + emap = kmem_zalloc(sizeof (*emap), KM_SLEEP); + map = &emap->em_gipt; + root = gipt_alloc(); + root->gipt_level = EPT_MAX_LEVELS - 1; + gipt_map_init(map, EPT_MAX_LEVELS, GIPT_HASH_SIZE_DEFAULT, &cbs, root); + + *pml4_kaddr = (uintptr_t)root->gipt_kva; + return (emap); +} + +static void +ept_destroy(void *arg) +{ + ept_map_t *emap = arg; + + if (emap != NULL) { + gipt_map_t *map = &emap->em_gipt; + + gipt_map_fini(map); + kmem_free(emap, sizeof (*emap)); + } +} + +static uint64_t +ept_wired_count(void *arg) +{ + ept_map_t *emap = arg; + uint64_t res; + + mutex_enter(EPT_LOCK(emap)); + res = emap->em_wired_page_count; + mutex_exit(EPT_LOCK(emap)); + + return (res); +} + +static int +ept_is_wired(void *arg, uint64_t va, uint_t *protp) +{ + ept_map_t *emap = arg; + gipt_t *pt; + int rv = -1; + + mutex_enter(EPT_LOCK(emap)); + pt = gipt_map_lookup_deepest(&emap->em_gipt, va); + if (pt != NULL) { + const uint64_t pte = GIPT_VA2PTE(pt, va); + + if (EPT_MAPS_PAGE(pte, pt->gipt_level)) { + *protp = EPT_PTE_PROT(pte); + rv = 0; + } + } + mutex_exit(EPT_LOCK(emap)); + + return (rv); +} + +static int +ept_map(void *arg, uint64_t va, pfn_t pfn, uint_t lvl, uint_t prot, + uint8_t attr) +{ + ept_map_t *emap = arg; + gipt_map_t *map = &emap->em_gipt; + gipt_t *pt; + uint64_t *ptep, pte; 
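Before the body of ept_map() continues below, it may help to see what the EPT_PTE_ASSIGN_PAGE() encoding produces. A freestanding sketch with the relevant constants restated locally (the addresses are placeholders); note the large-page bit is set only for levels above 0, per the VMware workaround described earlier.

#include <assert.h>
#include <stdint.h>

#define	EX_EPT_R	(1ULL << 0)	/* readable */
#define	EX_EPT_W	(1ULL << 1)	/* writable */
#define	EX_EPT_X	(1ULL << 2)	/* executable */
#define	EX_EPT_LGPG	(1ULL << 7)	/* maps a large page */
#define	EX_EPT_PAT(a)	(((uint64_t)(a) & 0x7) << 3)
#define	EX_MTRR_WB	6		/* write-back memory type */

int
main(void)
{
	const uint64_t pa_4k = 0x123456000ULL;	/* 4 KiB-aligned PA */
	const uint64_t pa_2m = 0x140000000ULL;	/* 2 MiB-aligned PA */

	/* Level 0 leaf: protection bits + memory type, no EPT_LGPG */
	uint64_t leaf = pa_4k | EX_EPT_PAT(EX_MTRR_WB) | EX_EPT_R | EX_EPT_W;
	assert((leaf & EX_EPT_LGPG) == 0);

	/* Level 1 leaf (2 MiB): EPT_LGPG distinguishes page from table link */
	uint64_t lpg = pa_2m | EX_EPT_LGPG | EX_EPT_PAT(EX_MTRR_WB) |
	    EX_EPT_R | EX_EPT_W | EX_EPT_X;
	assert((lpg & EX_EPT_LGPG) != 0);

	return (0);
}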
+ + ASSERT((prot & EPT_RWX) != 0 && (prot & ~EPT_RWX) == 0); + ASSERT3U(lvl, <, EPT_MAX_LEVELS); + + mutex_enter(EPT_LOCK(emap)); + pt = gipt_map_lookup(map, va, lvl); + if (pt == NULL) { + /* + * A table at the appropriate VA/level that would house this + * mapping does not currently exist. Try to walk down to that + * point, creating any necessary parent(s). + */ + pt = gipt_map_create_parents(map, va, lvl); + + /* + * There was a large page mapping in the way of creating the + * necessary parent table(s). + */ + if (pt == NULL) { + panic("unexpected large page @ %08lx", va); + } + } + ptep = GIPT_VA2PTEP(pt, va); + + pte = *ptep; + if (!EPT_IS_ABSENT(pte)) { + if (!EPT_MAPS_PAGE(pte, lvl)) { + panic("unexpected PT link @ %08lx in %p", va, pt); + } else { + panic("unexpected page mapped @ %08lx in %p", va, pt); + } + } + + pte = EPT_PTE_ASSIGN_PAGE(lvl, pfn, prot, attr); + *ptep = pte; + pt->gipt_valid_cnt++; + emap->em_wired_page_count += gipt_level_count[lvl]; + + mutex_exit(EPT_LOCK(emap)); + return (0); +} + +static uint64_t +ept_unmap(void *arg, uint64_t va, uint64_t end_va) +{ + ept_map_t *emap = arg; + gipt_map_t *map = &emap->em_gipt; + gipt_t *pt; + uint64_t cur_va = va; + uint64_t unmapped = 0; + + mutex_enter(EPT_LOCK(emap)); + + pt = gipt_map_lookup_deepest(map, cur_va); + if (pt == NULL) { + mutex_exit(EPT_LOCK(emap)); + return (0); + } + if (!EPT_MAPS_PAGE(GIPT_VA2PTE(pt, cur_va), pt->gipt_level)) { + cur_va = gipt_map_next_page(map, cur_va, end_va, &pt); + if (cur_va == 0) { + mutex_exit(EPT_LOCK(emap)); + return (0); + } + } + + while (cur_va < end_va) { + uint64_t *ptep = GIPT_VA2PTEP(pt, cur_va); + const uint_t lvl = pt->gipt_level; + + ASSERT(EPT_MAPS_PAGE(*ptep, lvl)); + *ptep = 0; + pt->gipt_valid_cnt--; + unmapped += gipt_level_count[pt->gipt_level]; + + gipt_t *next_pt = pt; + uint64_t next_va; + next_va = gipt_map_next_page(map, cur_va, end_va, &next_pt); + + if (pt->gipt_valid_cnt == 0) { + gipt_map_clean_parents(map, pt); + } + if (next_va == 0) { + break; + } + pt = next_pt; + cur_va = next_va; + } + emap->em_wired_page_count -= unmapped; + + mutex_exit(EPT_LOCK(emap)); + + return (unmapped); +} + +struct vmm_pt_ops ept_ops = { + .vpo_init = ept_create, + .vpo_free = ept_destroy, + .vpo_wired_cnt = ept_wired_count, + .vpo_is_wired = ept_is_wired, + .vpo_map = ept_map, + .vpo_unmap = ept_unmap, +}; diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c index 6588f5a46d..a8d94ea024 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c @@ -1,31 +1,4 @@ /* - * Copyright (c) 2004 John Baldwin - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD: head/sys/kern/subr_sleepqueue.c 261520 2014-02-05 18:13:27Z jhb $ - */ -/*- * Copyright (c) 2004 Poul-Henning Kamp * All rights reserved. * @@ -63,6 +36,7 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2014 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. */ #include @@ -73,22 +47,62 @@ #include #include #include +#include +#include +#include +#include +#include +#include #include #include #include +#include #include #include #include +#include #include #include +SET_DECLARE(sysinit_set, struct sysinit); + +void +sysinit(void) +{ + struct sysinit **si; + + SET_FOREACH(si, sysinit_set) + (*si)->func((*si)->data); +} + +u_char const bin2bcd_data[] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99 +}; + vm_paddr_t pmap_kextract(vm_offset_t va) { pfn_t pfn; + /* + * Since hat_getpfnum() may block on an htable mutex, this is not at + * all safe to run from a critical_enter/kpreempt_disable context. + * The FreeBSD analog does not have the same locking constraints, so + * close attention must be paid wherever this is called. + */ + ASSERT(curthread->t_preempt == 0); + pfn = hat_getpfnum(kas.a_hat, (caddr_t)va); ASSERT(pfn != PFN_INVALID); return (pfn << PAGE_SHIFT) | ((uintptr_t)va & PAGE_MASK); @@ -97,45 +111,72 @@ pmap_kextract(vm_offset_t va) int cpusetobj_ffs(const cpuset_t *set) { -#if CPUSET_WORDS > 1 - int i, cbit; + uint_t large, small; - cbit = 0; - for (i = 0; i < CPUSET_WORDS; i++) { - if (set->cpub[i] != 0) { - cbit = ffsl(set->cpub[i]); - cbit += i * sizeof (set->cpub[0]); - break; - } - } - return (cbit); -#else - return(ffsl(*set)); -#endif -} + /* + * Rather than reaching into the cpuset_t ourselves, leave that task to + * cpuset_bounds(). The simplicity is worth the extra wasted work to + * find the upper bound. 
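For readers unfamiliar with the linker-set trick behind sysinit(): FreeBSD code declares startup hooks with SYSINIT(), which deposits pointers into the sysinit_set section that the loop above walks at _init() time. A rough sketch of what one registration amounts to, assuming the DATA_SET() macro is available and that struct sysinit carries just the func/data pair consumed above:

/* layout inferred from the (*si)->func((*si)->data) call above */
struct sysinit {
	void	(*func)(void *);
	void	*data;
};

static void
example_setup(void *data)
{
	/* one-time initialization; runs when sysinit() walks the set */
}

static struct sysinit example_sysinit = {
	.func = example_setup,
	.data = NULL,
};

/* What a FreeBSD-style SYSINIT() ultimately boils down to here */
DATA_SET(sysinit_set, example_sysinit);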
+	 */
+	cpuset_bounds(set, &small, &large);
 
-void
-smp_rendezvous(void (* setup_func)(void *),
-	       void (* action_func)(void *),
-	       void (* teardown_func)(void *),
-	       void *arg)
-{
-	cpuset_t cpuset;
+	if (small == CPUSET_NOTINSET) {
+		/* The FreeBSD version returns 0 if it finds nothing */
+		return (0);
+	}
 
-	ASSERT(setup_func == NULL);
-	ASSERT(teardown_func == NULL);
+	ASSERT3U(small, <=, INT_MAX);
 
-	CPUSET_ALL(cpuset);
-	xc_sync((xc_arg_t)arg, 0, 0, CPUSET2BV(cpuset), (xc_func_t)action_func);
+	/* Least significant bit index starts at 1 for valid results */
+	return (small + 1);
 }
 
 struct kmem_item {
 	void			*addr;
 	size_t			size;
-	LIST_ENTRY(kmem_item)	next;
 };
 static kmutex_t kmem_items_lock;
-static LIST_HEAD(, kmem_item) kmem_items;
+
+static mod_hash_t *vmm_alloc_hash;
+uint_t vmm_alloc_hash_nchains = 16381;
+uint_t vmm_alloc_hash_size = PAGESIZE;
+
+static void
+vmm_alloc_hash_valdtor(mod_hash_val_t val)
+{
+	struct kmem_item *i = (struct kmem_item *)val;
+
+	kmem_free(i->addr, i->size);
+	kmem_free(i, sizeof (struct kmem_item));
+}
+
+static void
+vmm_alloc_init(void)
+{
+	vmm_alloc_hash = mod_hash_create_ptrhash("vmm_alloc_hash",
+	    vmm_alloc_hash_nchains, vmm_alloc_hash_valdtor,
+	    vmm_alloc_hash_size);
+
+	VERIFY(vmm_alloc_hash != NULL);
+}
+
+static uint_t
+vmm_alloc_check(mod_hash_key_t key, mod_hash_val_t *val, void *unused)
+{
+	struct kmem_item *i = (struct kmem_item *)val;
+
+	cmn_err(CE_PANIC, "!vmm_alloc_check: hash not empty: %p, %lu", i->addr,
+	    i->size);
+
+	return (MH_WALK_TERMINATE);
+}
+
+static void
+vmm_alloc_cleanup(void)
+{
+	mod_hash_walk(vmm_alloc_hash, vmm_alloc_check, NULL);
+	mod_hash_destroy_ptrhash(vmm_alloc_hash);
+}
 
 void *
 malloc(unsigned long size, struct malloc_type *mtp, int flags)
@@ -148,17 +189,28 @@ malloc(unsigned long size, struct malloc_type *mtp, int flags)
 		kmem_flag = KM_NOSLEEP;
 
 	if (flags & M_ZERO) {
-		p = kmem_zalloc(size + sizeof(struct kmem_item), kmem_flag);
+		p = kmem_zalloc(size, kmem_flag);
 	} else {
-		p = kmem_alloc(size + sizeof(struct kmem_item), kmem_flag);
+		p = kmem_alloc(size, kmem_flag);
+	}
+
+	if (p == NULL)
+		return (NULL);
+
+	i = kmem_zalloc(sizeof (struct kmem_item), kmem_flag);
+
+	if (i == NULL) {
+		kmem_free(p, size);
+		return (NULL);
 	}
 
 	mutex_enter(&kmem_items_lock);
-	i = p + size;
 	i->addr = p;
 	i->size = size;
-	LIST_INSERT_HEAD(&kmem_items, i, next);
+	VERIFY(mod_hash_insert(vmm_alloc_hash,
+	    (mod_hash_key_t)PHYS_TO_DMAP(vtophys(p)), (mod_hash_val_t)i) == 0);
+
 	mutex_exit(&kmem_items_lock);
 
 	return (p);
@@ -167,29 +219,66 @@ malloc(unsigned long size, struct malloc_type *mtp, int flags)
 void
 free(void *addr, struct malloc_type *mtp)
 {
-	struct kmem_item *i;
-
 	mutex_enter(&kmem_items_lock);
-	LIST_FOREACH(i, &kmem_items, next) {
-		if (i->addr == addr)
-			break;
-	}
-	ASSERT(i != NULL);
-	LIST_REMOVE(i, next);
+	VERIFY(mod_hash_destroy(vmm_alloc_hash,
+	    (mod_hash_key_t)PHYS_TO_DMAP(vtophys(addr))) == 0);
 	mutex_exit(&kmem_items_lock);
+}
+
+extern void *contig_alloc(size_t, ddi_dma_attr_t *, uintptr_t, int);
+extern void contig_free(void *, size_t);
 
-	kmem_free(addr, i->size + sizeof(struct kmem_item));
+void *
+contigmalloc(unsigned long size, struct malloc_type *type, int flags,
+    vm_paddr_t low, vm_paddr_t high, unsigned long alignment,
+    vm_paddr_t boundary)
+{
+	ddi_dma_attr_t attr = {
+		/* Using fastboot_dma_attr as a guide...
*/ + DMA_ATTR_V0, + low, /* dma_attr_addr_lo */ + high, /* dma_attr_addr_hi */ + 0x00000000FFFFFFFFULL, /* dma_attr_count_max */ + alignment, /* dma_attr_align */ + 1, /* dma_attr_burstsize */ + 1, /* dma_attr_minxfer */ + 0x00000000FFFFFFFFULL, /* dma_attr_maxxfer */ + 0x00000000FFFFFFFFULL, /* dma_attr_seg: any */ + 1, /* dma_attr_sgllen */ + alignment, /* dma_attr_granular */ + 0, /* dma_attr_flags */ + }; + int cansleep = (flags & M_WAITOK); + void *result; + + ASSERT(alignment == PAGESIZE); + + result = contig_alloc((size_t)size, &attr, alignment, cansleep); + + if (result != NULL && (flags & M_ZERO) != 0) { + bzero(result, size); + } + return (result); +} + +void +contigfree(void *addr, unsigned long size, struct malloc_type *type) +{ + contig_free(addr, size); } void mtx_init(struct mtx *mtx, char *name, const char *type_name, int opts) { - if (opts & MTX_SPIN) { - mutex_init(&mtx->m, name, MUTEX_SPIN, - (ddi_iblock_cookie_t)ipltospl(DISP_LEVEL)); - } else { - mutex_init(&mtx->m, name, MUTEX_DRIVER, NULL); - } + /* + * Requests that a mutex be initialized to the MTX_SPIN type are + * ignored. The limitations which may have required spinlocks on + * FreeBSD do not apply to how bhyve has been structured here. + * + * Adaptive mutexes are required to avoid deadlocks when certain + * cyclics behavior interacts with interrupts and contended locks. + */ + mutex_init(&mtx->m, name, MUTEX_ADAPTIVE, NULL); } void @@ -202,130 +291,14 @@ void critical_enter(void) { kpreempt_disable(); - thread_affinity_set(curthread, CPU_CURRENT); } void critical_exit(void) { - thread_affinity_clear(curthread); kpreempt_enable(); } -struct unr { - u_int item; - struct unr *link; -}; - -#define UNR_HASHSIZE 8 - -struct unrhdr { - struct mtx *mtx; - struct unr *hash[UNR_HASHSIZE]; - u_int min; - u_int max; - u_int next; -}; - -#define HASH_UNR(uh, i) ((uh)->hash[(i) & ((UNR_HASHSIZE) - 1)]) - -static struct mtx unr_mtx; - -/* - * Allocate a new unrheader set. - * - * Highest and lowest valid values given as parameters. 
- */ -struct unrhdr * -new_unrhdr(int low, int high, struct mtx *mtx) -{ - struct unrhdr *uh; - - uh = kmem_zalloc(sizeof (struct unrhdr), KM_SLEEP); - if (mtx) { - uh->mtx = mtx; - } else { - uh->mtx = &unr_mtx; - } - uh->min = low; - uh->max = high; - uh->next = uh->min; - - return (uh); -} - -void -delete_unrhdr(struct unrhdr *uh) -{ - kmem_free(uh, sizeof (struct unrhdr)); -} - -static struct unr * -unr_lookup(struct unrhdr *uh, int item) -{ - struct unr *unr; - - ASSERT(MUTEX_HELD(&uh->mtx->m)); - - for (unr = HASH_UNR(uh, item); unr != NULL; unr = unr->link) { - if (unr->item == item) - break; - } - - return (unr); -} - -int -alloc_unr(struct unrhdr *uh) -{ - struct unr *unr; - int item, start; - - mutex_enter(&uh->mtx->m); - start = uh->next; - for (;;) { - item = uh->next; - if (++uh->next == uh->max) { - uh->next = uh->min; - } - - if (unr_lookup(uh, item) == NULL) { - unr = kmem_zalloc(sizeof (struct unr), KM_SLEEP); - unr->item = item; - unr->link = HASH_UNR(uh, item); - HASH_UNR(uh, item) = unr; - break; - } - - if (item == start) { - item = -1; - break; - } - } - mutex_exit(&uh->mtx->m); - - return (item); -} - -void -free_unr(struct unrhdr *uh, u_int item) -{ - struct unr *unr, **unrp; - - mutex_enter(&uh->mtx->m); - unrp = &HASH_UNR(uh, item); - for (;;) { - ASSERT(*unrp != NULL); - if ((*unrp)->item == item) - break; - unrp = &(*unrp)->link; - } - unr = *unrp; - *unrp = unr->link; - mutex_exit(&uh->mtx->m); - kmem_free(unr, sizeof(struct unr)); -} - static void vmm_glue_callout_handler(void *arg) @@ -351,25 +324,43 @@ vmm_glue_callout_init(struct callout *c, int mpsafe) when.cyt_interval = CY_INFINITY; mutex_enter(&cpu_lock); - c->c_cyc_id = cyclic_add(&hdlr, &when); +#if 0 + /* + * XXXJOY: according to the freebsd sources, callouts do not begin + * their life in the ACTIVE state. + */ c->c_flags |= CALLOUT_ACTIVE; +#else + bzero(c, sizeof (*c)); +#endif + c->c_cyc_id = cyclic_add(&hdlr, &when); mutex_exit(&cpu_lock); } +static __inline hrtime_t +sbttohrtime(sbintime_t sbt) +{ + return (((sbt >> 32) * NANOSEC) + + (((uint64_t)NANOSEC * (uint32_t)sbt) >> 32)); +} + int vmm_glue_callout_reset_sbt(struct callout *c, sbintime_t sbt, sbintime_t pr, void (*func)(void *), void *arg, int flags) { + hrtime_t target = sbttohrtime(sbt); + ASSERT(c->c_cyc_id != CYCLIC_NONE); c->c_func = func; c->c_arg = arg; c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING); - if (flags & C_ABSOLUTE) - cyclic_reprogram(c->c_cyc_id, sbt); - else - cyclic_reprogram(c->c_cyc_id, sbt + gethrtime()); + if (flags & C_ABSOLUTE) { + cyclic_reprogram(c->c_cyc_id, target); + } else { + cyclic_reprogram(c->c_cyc_id, target + gethrtime()); + } return (0); } @@ -397,201 +388,24 @@ vmm_glue_callout_drain(struct callout *c) return (0); } -static int -ipi_cpu_justreturn(xc_arg_t a1, xc_arg_t a2, xc_arg_t a3) -{ - return (0); -} - -void -ipi_cpu(int cpu, u_int ipi) -{ - cpuset_t set; - - CPUSET_ONLY(set, cpu); - xc_call_nowait(NULL, NULL, NULL, CPUSET2BV(set), - ipi_cpu_justreturn); -} - -#define SC_TABLESIZE 256 /* Must be power of 2. */ -#define SC_MASK (SC_TABLESIZE - 1) -#define SC_SHIFT 8 -#define SC_HASH(wc) ((((uintptr_t)(wc) >> SC_SHIFT) ^ (uintptr_t)(wc)) & \ - SC_MASK) -#define SC_LOOKUP(wc) &sleepq_chains[SC_HASH(wc)] - -struct sleepqueue { - u_int sq_blockedcnt; /* Num. of blocked threads. */ - LIST_ENTRY(sleepqueue) sq_hash; /* Chain. */ - void *sq_wchan; /* Wait channel. */ - kcondvar_t sq_cv; -}; - -struct sleepqueue_chain { - LIST_HEAD(, sleepqueue) sc_queues; /* List of sleep queues. 
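A note on sbttohrtime() above: sbintime_t is FreeBSD's 32.32 fixed-point count of seconds, so the conversion takes whole seconds from the high word and scales the fractional low word to nanoseconds. A standalone sketch with spot checks:

#include <assert.h>
#include <stdint.h>

#define	EX_NANOSEC	1000000000LL

static int64_t
ex_sbttohrtime(int64_t sbt)
{
	/* same arithmetic as the glue above */
	return (((sbt >> 32) * EX_NANOSEC) +
	    (((uint64_t)EX_NANOSEC * (uint32_t)sbt) >> 32));
}

int
main(void)
{
	assert(ex_sbttohrtime(1LL << 32) == EX_NANOSEC);	/* 1 s */
	assert(ex_sbttohrtime(1LL << 31) == EX_NANOSEC / 2);	/* 0.5 s */
	assert(ex_sbttohrtime((3LL << 32) | (1LL << 31)) ==
	    3 * EX_NANOSEC + EX_NANOSEC / 2);			/* 3.5 s */
	return (0);
}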
*/ - struct mtx sc_lock; /* Spin lock for this chain. */ -}; - -static struct sleepqueue_chain sleepq_chains[SC_TABLESIZE]; - -#define SLEEPQ_CACHE_SZ (64) -static kmem_cache_t *vmm_sleepq_cache; - -static int -vmm_sleepq_cache_init(void *buf, void *user_arg, int kmflags) -{ - struct sleepqueue *sq = (struct sleepqueue *)buf; - - bzero(sq, sizeof (struct sleepqueue)); - cv_init(&sq->sq_cv, NULL, CV_DRIVER, NULL); - - return (0); -} - -static void -vmm_sleepq_cache_fini(void *buf, void *user_arg) -{ - struct sleepqueue *sq = (struct sleepqueue *)buf; - cv_destroy(&sq->sq_cv); -} - -static void -init_sleepqueues(void) -{ - int i; - - for (i = 0; i < SC_TABLESIZE; i++) { - LIST_INIT(&sleepq_chains[i].sc_queues); - mtx_init(&sleepq_chains[i].sc_lock, "sleepq chain", NULL, - MTX_SPIN); - } - - vmm_sleepq_cache = kmem_cache_create("vmm_sleepq_cache", - sizeof (struct sleepqueue), SLEEPQ_CACHE_SZ, vmm_sleepq_cache_init, - vmm_sleepq_cache_fini, NULL, NULL, NULL, 0); - -} - -/* - * Lock the sleep queue chain associated with the specified wait channel. - */ -static void -sleepq_lock(void *wchan) -{ - struct sleepqueue_chain *sc; - - sc = SC_LOOKUP(wchan); - mtx_lock_spin(&sc->sc_lock); -} - -/* - * Look up the sleep queue associated with a given wait channel in the hash - * table locking the associated sleep queue chain. If no queue is found in - * the table, NULL is returned. - */ -static struct sleepqueue * -sleepq_lookup(void *wchan) -{ - struct sleepqueue_chain *sc; - struct sleepqueue *sq; - - KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__)); - sc = SC_LOOKUP(wchan); - mtx_assert(&sc->sc_lock, MA_OWNED); - LIST_FOREACH(sq, &sc->sc_queues, sq_hash) - if (sq->sq_wchan == wchan) - return (sq); - return (NULL); -} - -/* - * Unlock the sleep queue chain associated with a given wait channel. - */ -static void -sleepq_release(void *wchan) -{ - struct sleepqueue_chain *sc; - - sc = SC_LOOKUP(wchan); - mtx_unlock_spin(&sc->sc_lock); -} - -struct sleepqueue * -sleepq_add(void *wchan) -{ - struct sleepqueue_chain *sc; - struct sleepqueue *sq; - - sc = SC_LOOKUP(wchan); - - /* Look up the sleep queue associated with the wait channel 'wchan'. 
*/ - sq = sleepq_lookup(wchan); - - if (sq == NULL) { - sq = kmem_cache_alloc(vmm_sleepq_cache, KM_SLEEP); - LIST_INSERT_HEAD(&sc->sc_queues, sq, sq_hash); - sq->sq_wchan = wchan; - } - - sq->sq_blockedcnt++; - - return (sq); -} - -void -sleepq_remove(struct sleepqueue *sq) -{ - sq->sq_blockedcnt--; - - if (sq->sq_blockedcnt == 0) { - LIST_REMOVE(sq, sq_hash); - kmem_cache_free(vmm_sleepq_cache, sq); - } -} - -int -msleep_spin(void *chan, struct mtx *mtx, const char *wmesg, int ticks) -{ - struct sleepqueue *sq; - int error; - - sleepq_lock(chan); - sq = sleepq_add(chan); - sleepq_release(chan); - - cv_reltimedwait(&sq->sq_cv, &mtx->m, ticks, TR_CLOCK_TICK); - - sleepq_lock(chan); - sleepq_remove(sq); - sleepq_release(chan); - - return (error); -} - void -wakeup(void *chan) +vmm_glue_callout_localize(struct callout *c) { - struct sleepqueue *sq; - - sleepq_lock(chan); - sq = sleepq_lookup(chan); - if (sq != NULL) { - cv_broadcast(&sq->sq_cv); - } - sleepq_release(chan); + mutex_enter(&cpu_lock); + cyclic_move_here(c->c_cyc_id); + mutex_exit(&cpu_lock); } void -wakeup_one(void *chan) +ipi_cpu(int cpu, u_int ipi) { - struct sleepqueue *sq; - - sleepq_lock(chan); - sq = sleepq_lookup(chan); - if (sq != NULL) { - cv_signal(&sq->sq_cv); - } - sleepq_release(chan); + /* + * This was previously implemented as an invocation of asynchronous + * no-op crosscalls to interrupt the target CPU. Since even nowait + * crosscalls can block in certain circumstances, a direct poke_cpu() + * is safer when called from delicate contexts. + */ + poke_cpu(cpu); } u_int cpu_high; /* Highest arg to CPUID */ @@ -618,162 +432,257 @@ vmm_cpuid_init(void) cpu_exthigh = regs[0]; } -struct savefpu { - fpu_ctx_t fsa_fp_ctx; -}; - -static vmem_t *fpu_save_area_arena; - -static void -fpu_save_area_init(void) -{ - fpu_save_area_arena = vmem_create("fpu_save_area", - NULL, 0, XSAVE_AREA_ALIGN, - segkmem_alloc, segkmem_free, heap_arena, 0, VM_BESTFIT | VM_SLEEP); -} - -static void -fpu_save_area_cleanup(void) -{ - vmem_destroy(fpu_save_area_arena); -} - +/* + * FreeBSD uses the struct savefpu for managing the FPU state. That is mimicked + * by our hypervisor multiplexor framework structure. + */ struct savefpu * fpu_save_area_alloc(void) { - return (vmem_alloc(fpu_save_area_arena, sizeof (struct savefpu), - VM_SLEEP)); + return ((struct savefpu *)hma_fpu_alloc(KM_SLEEP)); } void fpu_save_area_free(struct savefpu *fsa) { - vmem_free(fpu_save_area_arena, fsa, sizeof (struct savefpu)); + hma_fpu_t *fpu = (hma_fpu_t *)fsa; + hma_fpu_free(fpu); } void fpu_save_area_reset(struct savefpu *fsa) { - extern const struct fxsave_state sse_initial; - extern const struct xsave_state avx_initial; - struct fpu_ctx *fp; - struct fxsave_state *fx; - struct xsave_state *xs; - - fp = &fsa->fsa_fp_ctx; - - fp->fpu_regs.kfpu_status = 0; - fp->fpu_regs.kfpu_xstatus = 0; - - switch (fp_save_mech) { - case FP_FXSAVE: - fx = &fp->fpu_regs.kfpu_u.kfpu_fx; - bcopy(&sse_initial, fx, sizeof (*fx)); - break; - case FP_XSAVE: - fp->fpu_xsave_mask = (XFEATURE_ENABLED_X87 | - XFEATURE_ENABLED_SSE | XFEATURE_ENABLED_AVX); - xs = &fp->fpu_regs.kfpu_u.kfpu_xs; - bcopy(&avx_initial, xs, sizeof (*xs)); - break; - default: - panic("Invalid fp_save_mech"); - /*NOTREACHED*/ - } + hma_fpu_t *fpu = (hma_fpu_t *)fsa; + hma_fpu_init(fpu); } +/* + * This glue function is supposed to save the host's FPU state. This is always + * paired in the general bhyve code with a call to fpusave. 
Therefore, we treat + * this as a nop and do all the work in fpusave(), which will have the context + * argument that we want anyways. + */ void fpuexit(kthread_t *td) { - fp_save(&curthread->t_lwp->lwp_pcb.pcb_fpu); } -static __inline void -vmm_fxrstor(struct fxsave_state *addr) +/* + * This glue function is supposed to restore the guest's FPU state from the save + * area back to the host. In FreeBSD, it is assumed that the host state has + * already been saved by a call to fpuexit(); however, we do both here. + */ +void +fpurestore(void *arg) { - __asm __volatile("fxrstor %0" : : "m" (*(addr))); -} + hma_fpu_t *fpu = arg; -static __inline void -vmm_fxsave(struct fxsave_state *addr) -{ - __asm __volatile("fxsave %0" : "=m" (*(addr))); + hma_fpu_start_guest(fpu); } -static __inline void -vmm_xrstor(struct xsave_state *addr, uint64_t mask) +/* + * This glue function is supposed to save the guest's FPU state. The host's FPU + * state is not expected to be restored necessarily due to the use of FPU + * emulation through CR0.TS. However, we can and do restore it here. + */ +void +fpusave(void *arg) { - uint32_t low, hi; + hma_fpu_t *fpu = arg; - low = mask; - hi = mask >> 32; - __asm __volatile("xrstor %0" : : "m" (*addr), "a" (low), "d" (hi)); + hma_fpu_stop_guest(fpu); } -static __inline void -vmm_xsave(struct xsave_state *addr, uint64_t mask) +void +vmm_sol_glue_init(void) { - uint32_t low, hi; - - low = mask; - hi = mask >> 32; - __asm __volatile("xsave %0" : "=m" (*addr) : "a" (low), "d" (hi) : - "memory"); + vmm_alloc_init(); + vmm_cpuid_init(); } void -fpurestore(void *arg) +vmm_sol_glue_cleanup(void) { - struct savefpu *fsa = (struct savefpu *)arg; - struct fpu_ctx *fp; - - fp = &fsa->fsa_fp_ctx; - - switch (fp_save_mech) { - case FP_FXSAVE: - vmm_fxrstor(&fp->fpu_regs.kfpu_u.kfpu_fx); - break; - case FP_XSAVE: - vmm_xrstor(&fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask); - break; - default: - panic("Invalid fp_save_mech"); - /*NOTREACHED*/ - } + vmm_alloc_cleanup(); } -void -fpusave(void *arg) + +/* From FreeBSD's sys/kern/subr_clock.c */ + +/*- + * Copyright (c) 1988 University of Utah. + * Copyright (c) 1982, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
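Taken together, the FPU glue above implies the following per-vCPU sequence around guest execution. This is a sketch of the intended ordering rather than code from this patch, with placeholder variable names.

struct savefpu *guest_fpu;

guest_fpu = fpu_save_area_alloc();	/* hma_fpu_alloc(KM_SLEEP) */
fpu_save_area_reset(guest_fpu);		/* hma_fpu_init(): pristine state */

fpuexit(curthread);	/* nop here; host save is deferred to fpurestore() */
fpurestore(guest_fpu);	/* hma_fpu_start_guest(): save host, load guest */

/* ... run the guest (VMLAUNCH/VMRESUME or VMRUN) ... */

fpusave(guest_fpu);	/* hma_fpu_stop_guest(): save guest, restore host */

fpu_save_area_free(guest_fpu);	/* hma_fpu_free() at vCPU teardown */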
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: Utah $Hdr: clock.c 1.18 91/01/21$ + * from: @(#)clock.c 8.2 (Berkeley) 1/12/94 + * from: NetBSD: clock_subr.c,v 1.6 2001/07/07 17:04:02 thorpej Exp + * and + * from: src/sys/i386/isa/clock.c,v 1.176 2001/09/04 + */ + +#include + +/*--------------------------------------------------------------------* + * Generic routines to convert between a POSIX date + * (seconds since 1/1/1970) and yr/mo/day/hr/min/sec + * Derived from NetBSD arch/hp300/hp300/clock.c + */ + +#define FEBRUARY 2 +#define days_in_year(y) (leapyear(y) ? 366 : 365) +#define days_in_month(y, m) \ + (month_days[(m) - 1] + (m == FEBRUARY ? leapyear(y) : 0)) +/* Day of week. Days are counted from 1/1/1970, which was a Thursday */ +#define day_of_week(days) (((days) + 4) % 7) + +static const int month_days[12] = { + 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 +}; + + +/* + * This inline avoids some unnecessary modulo operations + * as compared with the usual macro: + * ( ((year % 4) == 0 && + * (year % 100) != 0) || + * ((year % 400) == 0) ) + * It is otherwise equivalent. + */ +static int +leapyear(int year) { - struct savefpu *fsa = (struct savefpu *)arg; - struct fpu_ctx *fp; - - fp = &fsa->fsa_fp_ctx; - - switch (fp_save_mech) { - case FP_FXSAVE: - vmm_fxsave(&fp->fpu_regs.kfpu_u.kfpu_fx); - break; - case FP_XSAVE: - vmm_xsave(&fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask); - break; - default: - panic("Invalid fp_save_mech"); - /*NOTREACHED*/ + int rv = 0; + + if ((year & 3) == 0) { + rv = 1; + if ((year % 100) == 0) { + rv = 0; + if ((year % 400) == 0) + rv = 1; + } } + return (rv); } -void -vmm_sol_glue_init(void) +int +clock_ct_to_ts(struct clocktime *ct, struct timespec *ts) { - vmm_cpuid_init(); - fpu_save_area_init(); - init_sleepqueues(); + int i, year, days; + + year = ct->year; + +#ifdef __FreeBSD__ + if (ct_debug) { + printf("ct_to_ts("); + print_ct(ct); + printf(")"); + } +#endif + + /* Sanity checks. */ + if (ct->mon < 1 || ct->mon > 12 || ct->day < 1 || + ct->day > days_in_month(year, ct->mon) || + ct->hour > 23 || ct->min > 59 || ct->sec > 59 || + (sizeof(time_t) == 4 && year > 2037)) { /* time_t overflow */ +#ifdef __FreeBSD__ + if (ct_debug) + printf(" = EINVAL\n"); +#endif + return (EINVAL); + } + + /* + * Compute days since start of time + * First from years, then from months. 
+ */ + days = 0; + for (i = POSIX_BASE_YEAR; i < year; i++) + days += days_in_year(i); + + /* Months */ + for (i = 1; i < ct->mon; i++) + days += days_in_month(year, i); + days += (ct->day - 1); + + ts->tv_sec = (((time_t)days * 24 + ct->hour) * 60 + ct->min) * 60 + + ct->sec; + ts->tv_nsec = ct->nsec; + +#ifdef __FreeBSD__ + if (ct_debug) + printf(" = %ld.%09ld\n", (long)ts->tv_sec, (long)ts->tv_nsec); +#endif + return (0); } void -vmm_sol_glue_cleanup(void) -{ - fpu_save_area_cleanup(); - kmem_cache_destroy(vmm_sleepq_cache); +clock_ts_to_ct(struct timespec *ts, struct clocktime *ct) +{ + int i, year, days; + time_t rsec; /* remainder seconds */ + time_t secs; + + secs = ts->tv_sec; + days = secs / SECDAY; + rsec = secs % SECDAY; + + ct->dow = day_of_week(days); + + /* Subtract out whole years, counting them in i. */ + for (year = POSIX_BASE_YEAR; days >= days_in_year(year); year++) + days -= days_in_year(year); + ct->year = year; + + /* Subtract out whole months, counting them in i. */ + for (i = 1; days >= days_in_month(year, i); i++) + days -= days_in_month(year, i); + ct->mon = i; + + /* Days are what is left over (+1) from all that. */ + ct->day = days + 1; + + /* Hours, minutes, seconds are easy */ + ct->hour = rsec / 3600; + rsec = rsec % 3600; + ct->min = rsec / 60; + rsec = rsec % 60; + ct->sec = rsec; + ct->nsec = ts->tv_nsec; +#ifdef __FreeBSD__ + if (ct_debug) { + printf("ts_to_ct(%ld.%09ld) = ", + (long)ts->tv_sec, (long)ts->tv_nsec); + print_ct(ct); + printf("\n"); + } +#endif } diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_mem.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_mem.c deleted file mode 100644 index 3bb5412d16..0000000000 --- a/usr/src/uts/i86pc/io/vmm/vmm_sol_mem.c +++ /dev/null @@ -1,111 +0,0 @@ -/*- - * Copyright (c) 2011 NetApp, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD: head/sys/amd64/vmm/vmm_mem.c 245678 2013-01-20 03:42:49Z neel $ - */ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. 
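A quick worked example of the two conversions above (kernel-only, since struct clocktime and the prototypes come from sys/clock.h): 2019-01-01T00:00:00Z lies 49 years past the epoch, 12 of them leap years (1972 through 2016, with 2000 counting via the divisible-by-400 rule), giving (49 * 365 + 12) = 17897 days.

struct clocktime ct = {
	.year = 2019, .mon = 1, .day = 1,
	.hour = 0, .min = 0, .sec = 0, .nsec = 0,
};
struct timespec ts;

/* 17897 days * 86400 s = 1546300800 */
VERIFY0(clock_ct_to_ts(&ct, &ts));
ASSERT3S(ts.tv_sec, ==, 1546300800);

/* Round-trip; (17897 + 4) % 7 == 2, so the computed dow is Tuesday */
clock_ts_to_ct(&ts, &ct);
ASSERT3S(ct.year, ==, 2019);
ASSERT3S(ct.dow, ==, 2);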
A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * Copyright 2013 Pluribus Networks Inc. - */ - -#include -__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_mem.c 245678 2013-01-20 03:42:49Z neel $"); - -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -#include "vmm_util.h" -#include "vmm_mem.h" - -int -vmm_mem_init(void) -{ - return (0); -} - -vm_paddr_t -vmm_mem_alloc(size_t size) -{ - clock_t usec = 2 * 1000000; - vm_paddr_t pa; - caddr_t addr; - - if (size != PAGE_SIZE) - panic("vmm_mem_alloc: invalid allocation size %lu", size); - - while (usec > 0) { - if ((addr = kmem_zalloc(PAGE_SIZE, KM_NOSLEEP)) != NULL) { - ASSERT(((uintptr_t)addr & PAGE_MASK) == 0); - pa = vtophys((vm_offset_t)addr); - return (pa); - } - delay(drv_usectohz((clock_t)500000)); - usec -= 500000; - } - - return (NULL); -} - -void -vmm_mem_free(vm_paddr_t base, size_t length) -{ - page_t *pp; - - if (base & PAGE_MASK) { - panic("vmm_mem_free: base 0x%0lx must be aligned on a " - "0x%0x boundary\n", base, PAGE_SIZE); - } - - if (length != PAGE_SIZE) { - panic("vmm_mem_free: invalid length %lu", length); - } - - pp = page_numtopp_nolock(btop(base)); - kmem_free((void *)pp->p_offset, PAGE_SIZE); -} - -vm_paddr_t -vmm_mem_maxaddr(void) -{ - - return (ptob(physmax + 1)); -} diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c new file mode 100644 index 0000000000..d630d32630 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c @@ -0,0 +1,297 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#include +#include +#include +#include +#include + +#include +#include + + +struct rvi_map { + gipt_map_t rm_gipt; + uint64_t rm_wired_page_count; +}; +typedef struct rvi_map rvi_map_t; + +#define RVI_LOCK(m) (&(m)->rm_gipt.giptm_lock) + +#define RVI_MAX_LEVELS 4 + +CTASSERT(RVI_MAX_LEVELS <= GIPT_MAX_LEVELS); + +#define RVI_PRESENT PT_VALID +#define RVI_WRITABLE PT_WRITABLE +#define RVI_ACCESSED PT_REF +#define RVI_DIRTY PT_MOD +#define RVI_LGPG PT_PAGESIZE +#define RVI_NX PT_NX +#define RVI_USER PT_USER +#define RVI_PWT PT_WRITETHRU +#define RVI_PCD PT_NOCACHE + +#define RVI_PA_MASK PT_PADDR + +#define RVI_PAT(attr) rvi_attr_to_pat(attr) +#define RVI_PADDR(addr) ((addr) & RVI_PA_MASK) +#define RVI_PROT(prot) \ + ((((prot) & PROT_WRITE) != 0 ? RVI_WRITABLE : 0) | \ + (((prot) & PROT_EXEC) == 0 ? RVI_NX : 0)) + +#define RVI_IS_ABSENT(pte) (((pte) & RVI_PRESENT) == 0) +#define RVI_PTE_PFN(pte) mmu_btop(RVI_PADDR(pte)) +#define RVI_MAPS_PAGE(pte, lvl) \ + (!RVI_IS_ABSENT(pte) && (((pte) & RVI_LGPG) != 0 || (lvl) == 0)) +#define RVI_PTE_PROT(pte) \ + (RVI_IS_ABSENT(pte) ? 0 : ( \ + PROT_READ | \ + (((pte) & RVI_NX) == 0 ? PROT_EXEC : 0) | \ + (((pte) & RVI_WRITABLE) != 0 ? PROT_WRITE : 0))) + +#define RVI_PTE_ASSIGN_PAGE(lvl, pfn, prot, attr) \ + (RVI_PADDR(pfn_to_pa(pfn)) | \ + (((lvl) != 0) ? 
RVI_LGPG : 0) | \ + RVI_USER | RVI_ACCESSED | RVI_PRESENT | \ + RVI_PAT(attr) | \ + RVI_PROT(prot)) + +#define RVI_PTE_ASSIGN_TABLE(pfn) \ + (RVI_PADDR(pfn_to_pa(pfn)) | \ + RVI_USER | RVI_ACCESSED | RVI_PRESENT | \ + RVI_PAT(MTRR_TYPE_WB) | \ + RVI_PROT(PROT_READ | PROT_WRITE | PROT_EXEC)) + + +/* Make sure that PAT indexes line up as expected */ +CTASSERT((PAT_DEFAULT_ATTRIBUTE & 0xf) == MTRR_TYPE_WB); +CTASSERT(((PAT_DEFAULT_ATTRIBUTE >> 24) & 0xf) == MTRR_TYPE_UC); + +static inline uint64_t +rvi_attr_to_pat(const uint8_t attr) +{ + if (attr == MTRR_TYPE_UC) { + /* !PAT + PCD + PWT -> PAT3 -> MTRR_TYPE_UC */ + return (RVI_PCD|RVI_PWT); + } else if (attr == MTRR_TYPE_WB) { + /* !PAT + !PCD + !PWT -> PAT0 -> MTRR_TYPE_WB */ + return (0); + } + + panic("unexpected memattr %x", attr); + return (0); +} + +static gipt_pte_type_t +rvi_pte_type(uint64_t pte, uint_t level) +{ + if (RVI_IS_ABSENT(pte)) { + return (PTET_EMPTY); + } else if (RVI_MAPS_PAGE(pte, level)) { + return (PTET_PAGE); + } else { + return (PTET_LINK); + } +} + +static uint64_t +rvi_pte_map(uint64_t pfn) +{ + return (RVI_PTE_ASSIGN_TABLE(pfn)); +} + +static void * +rvi_create(uintptr_t *pml4_kaddr) +{ + rvi_map_t *rmap; + gipt_map_t *map; + gipt_t *root; + struct gipt_cbs cbs = { + .giptc_pte_type = rvi_pte_type, + .giptc_pte_map = rvi_pte_map, + }; + + rmap = kmem_zalloc(sizeof (*rmap), KM_SLEEP); + map = &rmap->rm_gipt; + root = gipt_alloc(); + root->gipt_level = RVI_MAX_LEVELS - 1; + gipt_map_init(map, RVI_MAX_LEVELS, GIPT_HASH_SIZE_DEFAULT, &cbs, root); + + *pml4_kaddr = (uintptr_t)root->gipt_kva; + return (rmap); +} + +static void +rvi_destroy(void *arg) +{ + rvi_map_t *rmap = arg; + + if (rmap != NULL) { + gipt_map_t *map = &rmap->rm_gipt; + + gipt_map_fini(map); + kmem_free(rmap, sizeof (*rmap)); + } +} + +static uint64_t +rvi_wired_count(void *arg) +{ + rvi_map_t *rmap = arg; + uint64_t res; + + mutex_enter(RVI_LOCK(rmap)); + res = rmap->rm_wired_page_count; + mutex_exit(RVI_LOCK(rmap)); + + return (res); +} + +static int +rvi_is_wired(void *arg, uint64_t va, uint_t *protp) +{ + rvi_map_t *rmap = arg; + gipt_t *pt; + int rv = -1; + + mutex_enter(RVI_LOCK(rmap)); + pt = gipt_map_lookup_deepest(&rmap->rm_gipt, va); + if (pt != NULL) { + const uint64_t pte = GIPT_VA2PTE(pt, va); + + if (RVI_MAPS_PAGE(pte, pt->gipt_level)) { + *protp = RVI_PTE_PROT(pte); + rv = 0; + } + } + mutex_exit(RVI_LOCK(rmap)); + + return (rv); +} + +static int +rvi_map(void *arg, uint64_t va, pfn_t pfn, uint_t lvl, uint_t prot, + uint8_t attr) +{ + rvi_map_t *rmap = arg; + gipt_map_t *map = &rmap->rm_gipt; + gipt_t *pt; + uint64_t *ptep, pte; + + ASSERT((prot & PROT_READ) != 0); + ASSERT3U((prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)), ==, 0); + ASSERT3U(lvl, <, RVI_MAX_LEVELS); + + mutex_enter(RVI_LOCK(rmap)); + pt = gipt_map_lookup(map, va, lvl); + if (pt == NULL) { + /* + * A table at the appropriate VA/level that would house this + * mapping does not currently exist. Try to walk down to that + * point, creating any necessary parent(s). + */ + pt = gipt_map_create_parents(map, va, lvl); + + /* + * There was a large page mapping in the way of creating the + * necessary parent table(s). 
+ */ + if (pt == NULL) { + panic("unexpected large page @ %08lx", va); + } + } + ptep = GIPT_VA2PTEP(pt, va); + + pte = *ptep; + if (!RVI_IS_ABSENT(pte)) { + if (!RVI_MAPS_PAGE(pte, lvl)) { + panic("unexpected PT link @ %08lx in %p", va, pt); + } else { + panic("unexpected page mapped @ %08lx in %p", va, pt); + } + } + + pte = RVI_PTE_ASSIGN_PAGE(lvl, pfn, prot, attr); + *ptep = pte; + pt->gipt_valid_cnt++; + rmap->rm_wired_page_count += gipt_level_count[lvl]; + + mutex_exit(RVI_LOCK(rmap)); + return (0); +} + +static uint64_t +rvi_unmap(void *arg, uint64_t va, uint64_t end_va) +{ + rvi_map_t *rmap = arg; + gipt_map_t *map = &rmap->rm_gipt; + gipt_t *pt; + uint64_t cur_va = va; + uint64_t unmapped = 0; + + mutex_enter(RVI_LOCK(rmap)); + + pt = gipt_map_lookup_deepest(map, cur_va); + if (pt == NULL) { + mutex_exit(RVI_LOCK(rmap)); + return (0); + } + if (!RVI_MAPS_PAGE(GIPT_VA2PTE(pt, cur_va), pt->gipt_level)) { + cur_va = gipt_map_next_page(map, cur_va, end_va, &pt); + if (cur_va == 0) { + mutex_exit(RVI_LOCK(rmap)); + return (0); + } + } + + while (cur_va < end_va) { + uint64_t *ptep = GIPT_VA2PTEP(pt, cur_va); + const uint_t lvl = pt->gipt_level; + + ASSERT(RVI_MAPS_PAGE(*ptep, lvl)); + *ptep = 0; + pt->gipt_valid_cnt--; + unmapped += gipt_level_count[pt->gipt_level]; + + gipt_t *next_pt = pt; + uint64_t next_va; + next_va = gipt_map_next_page(map, cur_va, end_va, &next_pt); + + if (pt->gipt_valid_cnt == 0) { + gipt_map_clean_parents(map, pt); + } + if (next_va == 0) { + break; + } + pt = next_pt; + cur_va = next_va; + } + rmap->rm_wired_page_count -= unmapped; + + mutex_exit(RVI_LOCK(rmap)); + + return (unmapped); +} + +struct vmm_pt_ops rvi_ops = { + .vpo_init = rvi_create, + .vpo_free = rvi_destroy, + .vpo_wired_cnt = rvi_wired_count, + .vpo_is_wired = rvi_is_wired, + .vpo_map = rvi_map, + .vpo_unmap = rvi_unmap, +}; diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c new file mode 100644 index 0000000000..66a67d9529 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c @@ -0,0 +1,1016 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. 
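With rvi_ops in place alongside ept_ops, both page-table backends present the same vmm_pt_ops surface. A sketch of driving either one generically; use_ept, gpa, and pfn are placeholders for decisions and values made elsewhere (see pmap_pinit_type() below).

struct vmm_pt_ops *ops = use_ept ? &ept_ops : &rvi_ops;
uintptr_t pml4_kva;
void *impl;

impl = ops->vpo_init(&pml4_kva);	/* allocate root table + tracking */

/* Wire a single 4 KiB page, read/write, write-back cacheable */
VERIFY0(ops->vpo_map(impl, gpa, pfn, 0, PROT_READ | PROT_WRITE,
    MTRR_TYPE_WB));
ASSERT3U(ops->vpo_wired_cnt(impl), ==, 1);

/* Unmap returns the number of pages torn down */
VERIFY3U(ops->vpo_unmap(impl, gpa, gpa + PAGESIZE), ==, 1);

ops->vpo_free(impl);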
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "vm/vm_glue.h" + +#define PMAP_TO_VMMAP(pm) ((vm_map_t) \ + ((caddr_t)(pm) - offsetof(struct vmspace, vms_pmap))) +#define VMMAP_TO_VMSPACE(vmmap) ((struct vmspace *) \ + ((caddr_t)(vmmap) - offsetof(struct vmspace, vm_map))) + + +struct vmspace_mapping { + list_node_t vmsm_node; + vm_object_t vmsm_object; + uintptr_t vmsm_addr; + size_t vmsm_len; + off_t vmsm_offset; + uint_t vmsm_prot; +}; +typedef struct vmspace_mapping vmspace_mapping_t; + +#define VMSM_OFFSET(vmsm, addr) ( \ + (vmsm)->vmsm_offset + \ + ((addr) - (uintptr_t)(vmsm)->vmsm_addr)) + + +/* Private glue interfaces */ +static void pmap_free(pmap_t); +static vmspace_mapping_t *vm_mapping_find(struct vmspace *, uintptr_t, size_t, + boolean_t); +static void vm_mapping_remove(struct vmspace *, vmspace_mapping_t *); + +static vmem_t *vmm_alloc_arena = NULL; + +static void * +vmm_arena_alloc(vmem_t *vmp, size_t size, int vmflag) +{ + return (segkmem_xalloc(vmp, NULL, size, vmflag, 0, + segkmem_page_create, &kvps[KV_VVP])); +} + +static void +vmm_arena_free(vmem_t *vmp, void *inaddr, size_t size) +{ + segkmem_xfree(vmp, inaddr, size, &kvps[KV_VVP], NULL); +} + +void +vmm_arena_init(void) +{ + vmm_alloc_arena = vmem_create("vmm_alloc_arena", NULL, 0, 1024 * 1024, + vmm_arena_alloc, vmm_arena_free, kvmm_arena, 0, VM_SLEEP); + + ASSERT(vmm_alloc_arena != NULL); +} + +void +vmm_arena_fini(void) +{ + VERIFY(vmem_size(vmm_alloc_arena, VMEM_ALLOC) == 0); + vmem_destroy(vmm_alloc_arena); + vmm_alloc_arena = NULL; +} + +struct vmspace * +vmspace_alloc(vm_offset_t start, vm_offset_t end, pmap_pinit_t pinit) +{ + struct vmspace *vms; + const uintptr_t size = end + 1; + + /* + * This whole mess is built on the assumption that a 64-bit address + * space is available to work with for the various pagetable tricks. + */ + VERIFY(ttoproc(curthread)->p_model == DATAMODEL_LP64); + VERIFY(start == 0 && size > 0 && (size & PAGEOFFSET) == 0 && + size <= (uintptr_t)USERLIMIT); + + vms = kmem_zalloc(sizeof (*vms), KM_SLEEP); + vms->vms_size = size; + list_create(&vms->vms_maplist, sizeof (vmspace_mapping_t), + offsetof(vmspace_mapping_t, vmsm_node)); + + if (pinit(&vms->vms_pmap) == 0) { + kmem_free(vms, sizeof (*vms)); + return (NULL); + } + + return (vms); +} + +void +vmspace_free(struct vmspace *vms) +{ + VERIFY(list_is_empty(&vms->vms_maplist)); + + pmap_free(&vms->vms_pmap); + kmem_free(vms, sizeof (*vms)); +} + +pmap_t +vmspace_pmap(struct vmspace *vms) +{ + return (&vms->vms_pmap); +} + +long +vmspace_resident_count(struct vmspace *vms) +{ + /* XXXJOY: finish */ + return (0); +} + +void * +vmspace_find_kva(struct vmspace *vms, uintptr_t addr, size_t size) +{ + vmspace_mapping_t *vmsm; + void *result = NULL; + + /* + * Since vmspace_find_kva is provided so that vmm_drv consumers can do + * GPA2KVA translations, it is expected to be called when there is a + * read lock preventing vmspace alterations. As such, it can do the + * lockless vm_mapping_find() lookup. 
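Before the find/wire helpers below, a sketch of the vmspace life cycle this file provides. ept_pinit here is a hypothetical callback matching the pmap_pinit_t hook taken by vmspace_alloc(); pmap_pinit_type() itself appears a little further down.

static int
ept_pinit(pmap_t pmap)
{
	/* nonzero return indicates success, per vmspace_alloc() above */
	return (pmap_pinit_type(pmap, PT_EPT, 0));
}

struct vmspace *vms;

/* [0, 1 GiB) of guest-physical space; `end` is inclusive */
vms = vmspace_alloc(0, (1UL << 30) - 1, ept_pinit);
VERIFY(vms != NULL);

/* ... populate with vm_map_find(), fault pages in with vm_fault() ... */

vmspace_free(vms);	/* requires every mapping to be removed first */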
+ */ + vmsm = vm_mapping_find(vms, addr, size, B_TRUE); + if (vmsm != NULL) { + struct vm_object *vmo = vmsm->vmsm_object; + + switch (vmo->vmo_type) { + case OBJT_DEFAULT: + result = (void *)((uintptr_t)vmo->vmo_data + + VMSM_OFFSET(vmsm, addr)); + break; + default: + break; + } + } + + return (result); +} + +static int +vmspace_pmap_iswired(struct vmspace *vms, uintptr_t addr, uint_t *prot) +{ + pmap_t pmap = &vms->vms_pmap; + int rv; + + ASSERT(MUTEX_HELD(&vms->vms_lock)); + + rv = pmap->pm_ops->vpo_is_wired(pmap->pm_impl, addr, prot); + return (rv); +} + +static void +pmap_free(pmap_t pmap) +{ + void *pmi = pmap->pm_impl; + struct vmm_pt_ops *ops = pmap->pm_ops; + + pmap->pm_pml4 = NULL; + pmap->pm_impl = NULL; + pmap->pm_ops = NULL; + + ops->vpo_free(pmi); +} + +int +pmap_pinit_type(pmap_t pmap, enum pmap_type type, int flags) +{ + /* For use in vmm only */ + pmap->pm_type = type; + switch (type) { + case PT_EPT: { + struct vmm_pt_ops *ops = &ept_ops; + void *pml4, *pmi; + + pmi = ops->vpo_init((uintptr_t *)&pml4); + + pmap->pm_ops = ops; + pmap->pm_impl = pmi; + pmap->pm_pml4 = pml4; + return (1); + } + case PT_RVI: { + struct vmm_pt_ops *ops = &rvi_ops; + void *pml4, *pmi; + + pmi = ops->vpo_init((uintptr_t *)&pml4); + + pmap->pm_ops = ops; + pmap->pm_impl = pmi; + pmap->pm_pml4 = pml4; + return (1); + } + default: + panic("unsupported pmap type: %x", type); + break; + } + + return (1); +} + +long +pmap_wired_count(pmap_t pmap) +{ + long val; + + val = pmap->pm_ops->vpo_wired_cnt(pmap->pm_impl); + VERIFY3S(val, >=, 0); + + return (val); +} + +int +pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype) +{ + /* Allow the fallback to vm_fault to handle this */ + return (-1); +} + + + +struct sglist_ent { + vm_paddr_t sge_pa; + size_t sge_len; +}; +struct sglist { + kmutex_t sg_lock; + uint_t sg_refcnt; + uint_t sg_len; + uint_t sg_next; + struct sglist_ent sg_entries[]; +}; + +#define SG_SIZE(cnt) (sizeof (struct sglist) + \ + (sizeof (struct sglist_ent) * (cnt))) + +struct sglist * +sglist_alloc(int nseg, int flags) +{ + const size_t sz = SG_SIZE(nseg); + const int flag = (flags & M_WAITOK) ? 
KM_SLEEP : KM_NOSLEEP; + struct sglist *sg; + + ASSERT(nseg > 0); + + sg = kmem_zalloc(sz, flag); + if (sg != NULL) { + sg->sg_len = nseg; + sg->sg_refcnt = 1; + } + return (sg); +} + +void +sglist_free(struct sglist *sg) +{ + size_t sz; + + mutex_enter(&sg->sg_lock); + if (sg->sg_refcnt > 1) { + sg->sg_refcnt--; + mutex_exit(&sg->sg_lock); + return; + } + + VERIFY(sg->sg_refcnt == 1); + sg->sg_refcnt = 0; + sz = SG_SIZE(sg->sg_len); + mutex_exit(&sg->sg_lock); + kmem_free(sg, sz); +} + +int +sglist_append_phys(struct sglist *sg, vm_paddr_t pa, size_t len) +{ + uint_t idx; + struct sglist_ent *ent; + + /* Restrict to page-aligned entries */ + if ((pa & PAGEOFFSET) != 0 || (len & PAGEOFFSET) != 0 || len == 0) { + return (EINVAL); + } + + mutex_enter(&sg->sg_lock); + idx = sg->sg_next; + if (idx >= sg->sg_len) { + mutex_exit(&sg->sg_lock); + return (ENOSPC); + } + + ent = &sg->sg_entries[idx]; + ASSERT(ent->sge_pa == 0 && ent->sge_len == 0); + ent->sge_pa = pa; + ent->sge_len = len; + sg->sg_next++; + + mutex_exit(&sg->sg_lock); + return (0); +} + + +static pfn_t +vm_object_pager_none(vm_object_t vmo, uintptr_t off, pfn_t *lpfn, uint_t *lvl) +{ + panic("bad vm_object pager"); + return (PFN_INVALID); +} + +static pfn_t +vm_object_pager_heap(vm_object_t vmo, uintptr_t off, pfn_t *lpfn, uint_t *lvl) +{ + const uintptr_t kaddr = ALIGN2PAGE((uintptr_t)vmo->vmo_data + off); + uint_t idx, level; + htable_t *ht; + x86pte_t pte; + pfn_t top_pfn, pfn; + + ASSERT(vmo->vmo_type == OBJT_DEFAULT); + ASSERT(off < vmo->vmo_size); + + ht = htable_getpage(kas.a_hat, kaddr, &idx); + if (ht == NULL) { + return (PFN_INVALID); + } + pte = x86pte_get(ht, idx); + if (!PTE_ISPAGE(pte, ht->ht_level)) { + htable_release(ht); + return (PFN_INVALID); + } + + pfn = top_pfn = PTE2PFN(pte, ht->ht_level); + level = ht->ht_level; + if (ht->ht_level > 0) { + pfn += mmu_btop(kaddr & LEVEL_OFFSET((uint_t)ht->ht_level)); + } + htable_release(ht); + + if (lpfn != NULL) { + *lpfn = top_pfn; + } + if (lvl != NULL) { + *lvl = level; + } + return (pfn); +} + +static pfn_t +vm_object_pager_sg(vm_object_t vmo, uintptr_t off, pfn_t *lpfn, uint_t *lvl) +{ + const uintptr_t aoff = ALIGN2PAGE(off); + uint_t level = 0; + uintptr_t pos = 0; + struct sglist *sg; + struct sglist_ent *ent; + pfn_t pfn = PFN_INVALID; + + ASSERT(vmo->vmo_type == OBJT_SG); + ASSERT(off < vmo->vmo_size); + + sg = vmo->vmo_data; + if (sg == NULL) { + return (PFN_INVALID); + } + + ent = &sg->sg_entries[0]; + for (uint_t i = 0; i < sg->sg_next; i++, ent++) { + if (aoff >= pos && aoff < (pos + ent->sge_len)) { + /* XXXJOY: Punt on large pages for now */ + level = 0; + pfn = mmu_btop(ent->sge_pa + (aoff - pos)); + break; + } + pos += ent->sge_len; + } + + if (lpfn != 0) { + *lpfn = pfn; + } + if (lvl != 0) { + *lvl = level; + } + return (pfn); +} + +static void +vm_reserve_pages(size_t npages) +{ + uint_t retries = 60; + int rc; + + mutex_enter(&freemem_lock); + if (availrmem < npages) { + mutex_exit(&freemem_lock); + + /* + * Set needfree and wait for the ZFS ARC reap thread to free up + * some memory. 
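As a usage sketch of the scatter-gather pieces above: a single physical range (fb_pa and fb_len are placeholders, page-aligned as sglist_append_phys() requires) gets wrapped in an sglist and handed to vm_pager_allocate(), after which vm_object_pager_sg() resolves object offsets to PFNs.

struct sglist *sg;
vm_object_t vmo;

sg = sglist_alloc(1, M_WAITOK);
VERIFY0(sglist_append_phys(sg, fb_pa, fb_len));

/* vm_pager_allocate() takes its own hold on sg (sg_refcnt becomes 2) */
vmo = vm_pager_allocate(OBJT_SG, sg, fb_len, PROT_READ | PROT_WRITE,
    0, NULL);

/* ... expose to the guest by mapping vmo with vm_map_find() ... */

vm_object_deallocate(vmo);	/* drops the object and its sglist hold */
sglist_free(sg);		/* drops the caller's original reference */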
+		 */
+		page_needfree(npages);
+
+		mutex_enter(&freemem_lock);
+		while ((availrmem < npages) && retries-- > 0) {
+			mutex_exit(&freemem_lock);
+			rc = delay_sig(drv_usectohz(1 * MICROSEC));
+			mutex_enter(&freemem_lock);
+
+			if (rc == EINTR)
+				break;
+		}
+		mutex_exit(&freemem_lock);
+
+		page_needfree(-npages);
+	} else {
+		mutex_exit(&freemem_lock);
+	}
+}
+
+void
+vm_object_clear(vm_object_t vmo)
+{
+	ASSERT(vmo->vmo_type == OBJT_DEFAULT);
+
+	/* XXXJOY: Better zeroing approach? */
+	bzero(vmo->vmo_data, vmo->vmo_size);
+}
+
+vm_object_t
+vm_object_allocate(objtype_t type, vm_pindex_t psize)
+{
+	vm_object_t vmo;
+	const size_t size = ptob((size_t)psize);
+
+	vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP);
+	mutex_init(&vmo->vmo_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	/* For now, these are to stay fixed after allocation */
+	vmo->vmo_type = type;
+	vmo->vmo_size = size;
+	vmo->vmo_attr = VM_MEMATTR_DEFAULT;
+
+	switch (type) {
+	case OBJT_DEFAULT: {
+		vm_reserve_pages(psize);
+
+		/* XXXJOY: opt-in to larger pages? */
+		vmo->vmo_data = vmem_alloc(vmm_alloc_arena, size, KM_NOSLEEP);
+		if (vmo->vmo_data == NULL) {
+			mutex_destroy(&vmo->vmo_lock);
+			kmem_free(vmo, sizeof (*vmo));
+			return (NULL);
+		}
+		vm_object_clear(vmo);
+		vmo->vmo_pager = vm_object_pager_heap;
+	}
+		break;
+	case OBJT_SG:
+		vmo->vmo_data = NULL;
+		vmo->vmo_pager = vm_object_pager_sg;
+		break;
+	default:
+		panic("Unsupported vm_object type");
+		break;
+	}
+
+	vmo->vmo_refcnt = 1;
+	return (vmo);
+}
+
+vm_object_t
+vm_pager_allocate(objtype_t type, void *handle, vm_ooffset_t size,
+    vm_prot_t prot, vm_ooffset_t off, void *cred)
+{
+	struct vm_object *vmo;
+	struct sglist *sg = (struct sglist *)handle;
+
+	/* XXXJOY: be very restrictive for now */
+	VERIFY(type == OBJT_SG);
+	VERIFY(off == 0);
+
+	vmo = vm_object_allocate(type, size);
+	vmo->vmo_data = sg;
+
+	mutex_enter(&sg->sg_lock);
+	VERIFY(sg->sg_refcnt++ >= 1);
+	mutex_exit(&sg->sg_lock);
+
+	return (vmo);
+}
+
+void
+vm_object_deallocate(vm_object_t vmo)
+{
+	ASSERT(vmo != NULL);
+
+	uint_t ref = atomic_dec_uint_nv(&vmo->vmo_refcnt);
+	/* underflow would be a deadly serious mistake */
+	VERIFY3U(ref, !=, UINT_MAX);
+	if (ref != 0) {
+		return;
+	}
+
+	switch (vmo->vmo_type) {
+	case OBJT_DEFAULT:
+		vmem_free(vmm_alloc_arena, vmo->vmo_data, vmo->vmo_size);
+		break;
+	case OBJT_SG:
+		sglist_free((struct sglist *)vmo->vmo_data);
+		break;
+	default:
+		panic("Unsupported vm_object type");
+		break;
+	}
+
+	vmo->vmo_pager = vm_object_pager_none;
+	vmo->vmo_data = NULL;
+	vmo->vmo_size = 0;
+	mutex_destroy(&vmo->vmo_lock);
+	kmem_free(vmo, sizeof (*vmo));
+}
+
+CTASSERT(VM_MEMATTR_UNCACHEABLE == MTRR_TYPE_UC);
+CTASSERT(VM_MEMATTR_WRITE_BACK == MTRR_TYPE_WB);
+int
+vm_object_set_memattr(vm_object_t vmo, vm_memattr_t attr)
+{
+	ASSERT(MUTEX_HELD(&vmo->vmo_lock));
+
+	switch (attr) {
+	case VM_MEMATTR_UNCACHEABLE:
+	case VM_MEMATTR_WRITE_BACK:
+		vmo->vmo_attr = attr;
+		return (0);
+	default:
+		break;
+	}
+	return (EINVAL);
+}
+
+void
+vm_object_reference(vm_object_t vmo)
+{
+	ASSERT(vmo != NULL);
+
+	uint_t ref = atomic_inc_uint_nv(&vmo->vmo_refcnt);
+	/* overflow would be a deadly serious mistake */
+	VERIFY3U(ref, !=, 0);
+}
+
+static vmspace_mapping_t *
+vm_mapping_find(struct vmspace *vms, uintptr_t addr, size_t size,
+    boolean_t no_lock)
+{
+	vmspace_mapping_t *vmsm;
+	list_t *ml = &vms->vms_maplist;
+	const uintptr_t range_end = addr + size;
+
+	ASSERT(addr <= range_end);
+
+	if (no_lock) {
+		/*
+		 * This check should be superfluous with the protections
+		 * promised by the bhyve logic
which calls into the VM shim. + * All the same, it is cheap to be paranoid. + */ + VERIFY(!vms->vms_map_changing); + } else { + VERIFY(MUTEX_HELD(&vms->vms_lock)); + } + + if (addr >= vms->vms_size) { + return (NULL); + } + for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) { + const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len; + + if (addr >= vmsm->vmsm_addr && addr < seg_end) { + if (range_end <= seg_end) { + return (vmsm); + } else { + return (NULL); + } + } + } + return (NULL); +} + +static boolean_t +vm_mapping_gap(struct vmspace *vms, uintptr_t addr, size_t size) +{ + vmspace_mapping_t *vmsm; + list_t *ml = &vms->vms_maplist; + const uintptr_t range_end = addr + size; + + ASSERT(MUTEX_HELD(&vms->vms_lock)); + + for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) { + const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len; + + if ((vmsm->vmsm_addr >= addr && vmsm->vmsm_addr < range_end) || + (seg_end > addr && seg_end < range_end)) { + return (B_FALSE); + } + } + return (B_TRUE); +} + +static void +vm_mapping_remove(struct vmspace *vms, vmspace_mapping_t *vmsm) +{ + list_t *ml = &vms->vms_maplist; + + ASSERT(MUTEX_HELD(&vms->vms_lock)); + ASSERT(vms->vms_map_changing); + + list_remove(ml, vmsm); + vm_object_deallocate(vmsm->vmsm_object); + kmem_free(vmsm, sizeof (*vmsm)); +} + +int +vm_fault(vm_map_t map, vm_offset_t off, vm_prot_t type, int flag) +{ + struct vmspace *vms = VMMAP_TO_VMSPACE(map); + pmap_t pmap = &vms->vms_pmap; + void *pmi = pmap->pm_impl; + const uintptr_t addr = off; + vmspace_mapping_t *vmsm; + struct vm_object *vmo; + uint_t prot, map_lvl; + pfn_t pfn; + uintptr_t map_addr; + + mutex_enter(&vms->vms_lock); + if (vmspace_pmap_iswired(vms, addr, &prot) == 0) { + int err = 0; + + /* + * It is possible that multiple vCPUs will race to fault-in a + * given address. In such cases, the race loser(s) will + * encounter the already-mapped page, needing to do nothing + * more than consider it a success. + * + * If the fault exceeds protection, it is an obvious error. + */ + if ((prot & type) != type) { + err = FC_PROT; + } + + mutex_exit(&vms->vms_lock); + return (err); + } + + /* Try to wire up the address */ + if ((vmsm = vm_mapping_find(vms, addr, 0, B_FALSE)) == NULL) { + mutex_exit(&vms->vms_lock); + return (FC_NOMAP); + } + vmo = vmsm->vmsm_object; + prot = vmsm->vmsm_prot; + + /* XXXJOY: punt on large pages for now */ + pfn = vmo->vmo_pager(vmo, VMSM_OFFSET(vmsm, addr), NULL, NULL); + map_lvl = 0; + map_addr = P2ALIGN((uintptr_t)addr, LEVEL_SIZE(map_lvl)); + VERIFY(pfn != PFN_INVALID); + + /* + * If pmap failure is to be handled, the previously acquired page locks + * would need to be released. + */ + VERIFY0(pmap->pm_ops->vpo_map(pmi, map_addr, pfn, map_lvl, prot, + vmo->vmo_attr)); + pmap->pm_eptgen++; + + mutex_exit(&vms->vms_lock); + return (0); +} + +int +vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len, + vm_prot_t prot, vm_page_t *ma, int max_count) +{ + struct vmspace *vms = VMMAP_TO_VMSPACE(map); + const uintptr_t vaddr = addr; + vmspace_mapping_t *vmsm; + struct vm_object *vmo; + vm_page_t vmp; + + ASSERT0(addr & PAGEOFFSET); + ASSERT(len == PAGESIZE); + ASSERT(max_count == 1); + + /* + * Unlike practically all of the other logic that queries or + * manipulates vmspace objects, vm_fault_quick_hold_pages() does so + * without holding vms_lock. This is safe because bhyve ensures that + * changes to the vmspace map occur only when all other threads have + * been excluded from running. 
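+	 * (In the no-lock path, vm_mapping_find() VERIFYs that
+	 * vms_map_changing is clear, so a violation of that exclusion
+	 * trips an assertion rather than silently corrupting the walk.)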
+ * + * Since this task can count on vms_maplist remaining static and does + * not need to modify the pmap (like vm_fault might), it can proceed + * without the lock. The vm_object has independent refcount and lock + * protection, while the vmo_pager methods do not rely on vms_lock for + * safety. + * + * Performing this work without locks is critical in cases where + * multiple vCPUs require simultaneous instruction emulation, such as + * for frequent guest APIC accesses on a host that lacks hardware + * acceleration for that behavior. + */ + if ((vmsm = vm_mapping_find(vms, vaddr, PAGESIZE, B_TRUE)) == NULL || + (prot & ~vmsm->vmsm_prot) != 0) { + return (-1); + } + + vmp = kmem_zalloc(sizeof (struct vm_page), KM_SLEEP); + + vmo = vmsm->vmsm_object; + vm_object_reference(vmo); + vmp->vmp_obj_held = vmo; + vmp->vmp_pfn = vmo->vmo_pager(vmo, VMSM_OFFSET(vmsm, vaddr), NULL, + NULL); + + *ma = vmp; + return (1); +} + +/* + * Find a suitable location for a mapping (and install it). + */ +int +vm_map_find(vm_map_t map, vm_object_t vmo, vm_ooffset_t off, vm_offset_t *addr, + vm_size_t len, vm_offset_t max_addr, int find_flags, vm_prot_t prot, + vm_prot_t prot_max, int cow) +{ + struct vmspace *vms = VMMAP_TO_VMSPACE(map); + const size_t size = (size_t)len; + const uintptr_t uoff = (uintptr_t)off; + uintptr_t base = *addr; + vmspace_mapping_t *vmsm; + int res = 0; + + /* For use in vmm only */ + VERIFY(find_flags == VMFS_NO_SPACE); /* essentially MAP_FIXED */ + VERIFY(max_addr == 0); + + if (size == 0 || off < 0 || + uoff >= (uoff + size) || vmo->vmo_size < (uoff + size)) { + return (EINVAL); + } + + if (*addr >= vms->vms_size) { + return (ENOMEM); + } + + vmsm = kmem_alloc(sizeof (*vmsm), KM_SLEEP); + + mutex_enter(&vms->vms_lock); + vms->vms_map_changing = B_TRUE; + if (!vm_mapping_gap(vms, base, size)) { + res = ENOMEM; + goto out; + } + + if (res == 0) { + vmsm->vmsm_object = vmo; + vmsm->vmsm_addr = base; + vmsm->vmsm_len = len; + vmsm->vmsm_offset = (off_t)uoff; + vmsm->vmsm_prot = prot; + list_insert_tail(&vms->vms_maplist, vmsm); + + /* Communicate out the chosen address. 
*/ + *addr = (vm_offset_t)base; + } +out: + vms->vms_map_changing = B_FALSE; + mutex_exit(&vms->vms_lock); + if (res != 0) { + kmem_free(vmsm, sizeof (*vmsm)); + } + return (res); +} + +int +vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end) +{ + struct vmspace *vms = VMMAP_TO_VMSPACE(map); + pmap_t pmap = &vms->vms_pmap; + void *pmi = pmap->pm_impl; + const uintptr_t addr = start; + const size_t size = (size_t)(end - start); + vmspace_mapping_t *vmsm; + + ASSERT(start < end); + + mutex_enter(&vms->vms_lock); + vms->vms_map_changing = B_TRUE; + /* expect to match existing mapping exactly */ + if ((vmsm = vm_mapping_find(vms, addr, size, B_FALSE)) == NULL || + vmsm->vmsm_addr != addr || vmsm->vmsm_len != size) { + vms->vms_map_changing = B_FALSE; + mutex_exit(&vms->vms_lock); + return (ENOENT); + } + + (void) pmap->pm_ops->vpo_unmap(pmi, addr, end); + pmap->pm_eptgen++; + + vm_mapping_remove(vms, vmsm); + vms->vms_map_changing = B_FALSE; + mutex_exit(&vms->vms_lock); + return (0); +} + +int +vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags) +{ + struct vmspace *vms = VMMAP_TO_VMSPACE(map); + pmap_t pmap = &vms->vms_pmap; + void *pmi = pmap->pm_impl; + const uintptr_t addr = start; + const size_t size = end - start; + vmspace_mapping_t *vmsm; + struct vm_object *vmo; + uint_t prot; + + mutex_enter(&vms->vms_lock); + + /* For the time being, only exact-match mappings are expected */ + if ((vmsm = vm_mapping_find(vms, addr, size, B_FALSE)) == NULL) { + mutex_exit(&vms->vms_lock); + return (FC_NOMAP); + } + vmo = vmsm->vmsm_object; + prot = vmsm->vmsm_prot; + + for (uintptr_t pos = addr; pos < end; ) { + pfn_t pfn; + uintptr_t pg_size, map_addr; + uint_t map_lvl = 0; + + /* XXXJOY: punt on large pages for now */ + pfn = vmo->vmo_pager(vmo, VMSM_OFFSET(vmsm, pos), NULL, NULL); + pg_size = LEVEL_SIZE(map_lvl); + map_addr = P2ALIGN(pos, pg_size); + VERIFY(pfn != PFN_INVALID); + + VERIFY0(pmap->pm_ops->vpo_map(pmi, map_addr, pfn, map_lvl, + prot, vmo->vmo_attr)); + vms->vms_pmap.pm_eptgen++; + + pos += pg_size; + } + + mutex_exit(&vms->vms_lock); + + return (0); +} + +/* Provided custom for bhyve 'devmem' segment mapping */ +int +vm_segmap_obj(struct vmspace *vms, vm_object_t vmo, struct as *as, + caddr_t *addrp, uint_t prot, uint_t maxprot, uint_t flags) +{ + const size_t size = vmo->vmo_size; + int err; + + if (vmo->vmo_type != OBJT_DEFAULT) { + /* Only support default objects for now */ + return (ENOTSUP); + } + + as_rangelock(as); + + err = choose_addr(as, addrp, size, 0, ADDR_VACALIGN, flags); + if (err == 0) { + segvmm_crargs_t svma; + + svma.kaddr = vmo->vmo_data; + svma.prot = prot; + svma.cookie = vmo; + svma.hold = (segvmm_holdfn_t)vm_object_reference; + svma.rele = (segvmm_relefn_t)vm_object_deallocate; + + err = as_map(as, *addrp, size, segvmm_create, &svma); + } + + as_rangeunlock(as); + return (err); +} + +int +vm_segmap_space(struct vmspace *vms, off_t off, struct as *as, caddr_t *addrp, + off_t len, uint_t prot, uint_t maxprot, uint_t flags) +{ + const uintptr_t addr = (uintptr_t)off; + const size_t size = (uintptr_t)len; + vmspace_mapping_t *vmsm; + vm_object_t vmo; + int err; + + if (off < 0 || len <= 0 || + (addr & PAGEOFFSET) != 0 || (size & PAGEOFFSET) != 0) { + return (EINVAL); + } + + mutex_enter(&vms->vms_lock); + if ((vmsm = vm_mapping_find(vms, addr, size, B_FALSE)) == NULL) { + mutex_exit(&vms->vms_lock); + return (ENXIO); + } + if ((prot & ~(vmsm->vmsm_prot | PROT_USER)) != 0) { + mutex_exit(&vms->vms_lock); + return (EACCES); + } + 
vmo = vmsm->vmsm_object; + if (vmo->vmo_type != OBJT_DEFAULT) { + /* Only support default objects for now */ + mutex_exit(&vms->vms_lock); + return (ENOTSUP); + } + + as_rangelock(as); + + err = choose_addr(as, addrp, size, off, ADDR_VACALIGN, flags); + if (err == 0) { + segvmm_crargs_t svma; + const uintptr_t addroff = addr - vmsm->vmsm_addr; + const uintptr_t mapoff = addroff + vmsm->vmsm_offset; + + VERIFY(addroff < vmsm->vmsm_len); + VERIFY((vmsm->vmsm_len - addroff) >= size); + VERIFY(mapoff < vmo->vmo_size); + VERIFY((mapoff + size) <= vmo->vmo_size); + + svma.kaddr = (void *)((uintptr_t)vmo->vmo_data + mapoff); + svma.prot = prot; + svma.cookie = vmo; + svma.hold = (segvmm_holdfn_t)vm_object_reference; + svma.rele = (segvmm_relefn_t)vm_object_deallocate; + + err = as_map(as, *addrp, len, segvmm_create, &svma); + } + + as_rangeunlock(as); + mutex_exit(&vms->vms_lock); + return (err); +} + +void +vm_page_lock(vm_page_t vmp) +{ + ASSERT(!MUTEX_HELD(&vmp->vmp_lock)); + + mutex_enter(&vmp->vmp_lock); +} + +void +vm_page_unlock(vm_page_t vmp) +{ + boolean_t purge = (vmp->vmp_pfn == PFN_INVALID); + + ASSERT(MUTEX_HELD(&vmp->vmp_lock)); + + mutex_exit(&vmp->vmp_lock); + + if (purge) { + mutex_destroy(&vmp->vmp_lock); + kmem_free(vmp, sizeof (*vmp)); + } +} + +void +vm_page_unhold(vm_page_t vmp) +{ + ASSERT(MUTEX_HELD(&vmp->vmp_lock)); + VERIFY(vmp->vmp_pfn != PFN_INVALID); + + vm_object_deallocate(vmp->vmp_obj_held); + vmp->vmp_obj_held = NULL; + vmp->vmp_pfn = PFN_INVALID; +} diff --git a/usr/src/uts/i86pc/io/vmm/vmm_stat.c b/usr/src/uts/i86pc/io/vmm/vmm_stat.c new file mode 100644 index 0000000000..2cbcce9590 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_stat.c @@ -0,0 +1,172 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include "vmm_util.h" +#include "vmm_stat.h" + +/* + * 'vst_num_elems' is the total number of addressable statistic elements + * 'vst_num_types' is the number of unique statistic types + * + * It is always true that 'vst_num_elems' is greater than or equal to + * 'vst_num_types'. 
This is because a stat type may represent more than + * one element (for e.g. VMM_STAT_ARRAY). + */ +static int vst_num_elems, vst_num_types; +static struct vmm_stat_type *vsttab[MAX_VMM_STAT_ELEMS]; + +static MALLOC_DEFINE(M_VMM_STAT, "vmm stat", "vmm stat"); + +#define vst_size ((size_t)vst_num_elems * sizeof(uint64_t)) + +void +vmm_stat_register(void *arg) +{ + struct vmm_stat_type *vst = arg; + + /* We require all stats to identify themselves with a description */ + if (vst->desc == NULL) + return; + + if (vst->scope == VMM_STAT_SCOPE_INTEL && !vmm_is_intel()) + return; + + if (vst->scope == VMM_STAT_SCOPE_AMD && !vmm_is_amd()) + return; + + if (vst_num_elems + vst->nelems >= MAX_VMM_STAT_ELEMS) { + printf("Cannot accommodate vmm stat type \"%s\"!\n", vst->desc); + return; + } + + vst->index = vst_num_elems; + vst_num_elems += vst->nelems; + + vsttab[vst_num_types++] = vst; +} + +int +vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf) +{ + struct vmm_stat_type *vst; + uint64_t *stats; + int i; + + if (vcpu < 0 || vcpu >= vm_get_maxcpus(vm)) + return (EINVAL); + + /* Let stats functions update their counters */ + for (i = 0; i < vst_num_types; i++) { + vst = vsttab[i]; + if (vst->func != NULL) + (*vst->func)(vm, vcpu, vst); + } + + /* Copy over the stats */ + stats = vcpu_stats(vm, vcpu); + for (i = 0; i < vst_num_elems; i++) + buf[i] = stats[i]; + *num_stats = vst_num_elems; + return (0); +} + +void * +vmm_stat_alloc(void) +{ + + return (malloc(vst_size, M_VMM_STAT, M_WAITOK)); +} + +void +vmm_stat_init(void *vp) +{ + + bzero(vp, vst_size); +} + +void +vmm_stat_free(void *vp) +{ + free(vp, M_VMM_STAT); +} + +int +vmm_stat_desc_copy(int index, char *buf, int bufsize) +{ + int i; + struct vmm_stat_type *vst; + + for (i = 0; i < vst_num_types; i++) { + vst = vsttab[i]; + if (index >= vst->index && index < vst->index + vst->nelems) { + if (vst->nelems > 1) { + snprintf(buf, bufsize, "%s[%d]", + vst->desc, index - vst->index); + } else { + strlcpy(buf, vst->desc, bufsize); + } + return (0); /* found it */ + } + } + + return (EINVAL); +} + +/* global statistics */ +VMM_STAT(VCPU_MIGRATIONS, "vcpu migration across host cpus"); +VMM_STAT(VMEXIT_COUNT, "total number of vm exits"); +VMM_STAT(VMEXIT_EXTINT, "vm exits due to external interrupt"); +VMM_STAT(VMEXIT_HLT, "number of times hlt was intercepted"); +VMM_STAT(VMEXIT_CR_ACCESS, "number of times %cr access was intercepted"); +VMM_STAT(VMEXIT_RDMSR, "number of times rdmsr was intercepted"); +VMM_STAT(VMEXIT_WRMSR, "number of times wrmsr was intercepted"); +VMM_STAT(VMEXIT_MTRAP, "number of monitor trap exits"); +VMM_STAT(VMEXIT_PAUSE, "number of times pause was intercepted"); +VMM_STAT(VMEXIT_INTR_WINDOW, "vm exits due to interrupt window opening"); +VMM_STAT(VMEXIT_NMI_WINDOW, "vm exits due to nmi window opening"); +VMM_STAT(VMEXIT_INOUT, "number of times in/out was intercepted"); +VMM_STAT(VMEXIT_CPUID, "number of times cpuid was intercepted"); +VMM_STAT(VMEXIT_NESTED_FAULT, "vm exits due to nested page fault"); +VMM_STAT(VMEXIT_INST_EMUL, "vm exits for instruction emulation"); +VMM_STAT(VMEXIT_UNKNOWN, "number of vm exits for unknown reason"); +VMM_STAT(VMEXIT_ASTPENDING, "number of times astpending at exit"); +VMM_STAT(VMEXIT_REQIDLE, "number of times idle requested at exit"); +VMM_STAT(VMEXIT_USERSPACE, "number of vm exits handled in userspace"); +VMM_STAT(VMEXIT_RUNBLOCK, "number of times runblock at exit"); +VMM_STAT(VMEXIT_EXCEPTION, "number of vm exits due to exceptions"); diff --git 
a/usr/src/uts/i86pc/io/vmm/vmm_stat.h b/usr/src/uts/i86pc/io/vmm/vmm_stat.h index 9bf7a60e0b..3232e23888 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_stat.h +++ b/usr/src/uts/i86pc/io/vmm/vmm_stat.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-3-Clause + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -10,7 +12,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 4. Neither the name of the University nor the names of its contributors + * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * @@ -26,15 +28,24 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/vmm_stat.h 250427 2013-05-10 02:59:49Z neel $ + * $FreeBSD$ + */ +/* + * Copyright 2018 Joyent, Inc. */ #ifndef _VMM_STAT_H_ #define _VMM_STAT_H_ +#include + struct vm; -#define MAX_VMM_STAT_ELEMS 64 /* arbitrary */ +#ifdef __FreeBSD__ +#define MAX_VMM_STAT_ELEMS 64 /* arbitrary */ +#else +#define MAX_VMM_STAT_ELEMS (64 + VM_MAXCPU) /* arbitrary */ +#endif enum vmm_stat_scope { VMM_STAT_SCOPE_ANY, @@ -42,20 +53,28 @@ enum vmm_stat_scope { VMM_STAT_SCOPE_AMD, /* AMD SVM specific statistic */ }; +struct vmm_stat_type; +typedef void (*vmm_stat_func_t)(struct vm *vm, int vcpu, + struct vmm_stat_type *stat); + struct vmm_stat_type { int index; /* position in the stats buffer */ int nelems; /* standalone or array */ const char *desc; /* description of statistic */ + vmm_stat_func_t func; enum vmm_stat_scope scope; }; -void vmm_stat_init(void *arg); +void vmm_stat_register(void *arg); -#define VMM_STAT_DEFINE(type, nelems, desc, scope) \ +#define VMM_STAT_FDEFINE(type, nelems, desc, func, scope) \ struct vmm_stat_type type[1] = { \ - { -1, nelems, desc, scope } \ + { -1, nelems, desc, func, scope } \ }; \ - SYSINIT(type##_stat, SI_SUB_KLD, SI_ORDER_ANY, vmm_stat_init, type) + SYSINIT(type##_stat, SI_SUB_KLD, SI_ORDER_ANY, vmm_stat_register, type) + +#define VMM_STAT_DEFINE(type, nelems, desc, scope) \ + VMM_STAT_FDEFINE(type, nelems, desc, NULL, scope) #define VMM_STAT_DECLARE(type) \ extern struct vmm_stat_type type[1] @@ -67,10 +86,14 @@ void vmm_stat_init(void *arg); #define VMM_STAT_AMD(type, desc) \ VMM_STAT_DEFINE(type, 1, desc, VMM_STAT_SCOPE_AMD) +#define VMM_STAT_FUNC(type, desc, func) \ + VMM_STAT_FDEFINE(type, 1, desc, func, VMM_STAT_SCOPE_ANY) + #define VMM_STAT_ARRAY(type, nelems, desc) \ VMM_STAT_DEFINE(type, nelems, desc, VMM_STAT_SCOPE_ANY) void *vmm_stat_alloc(void); +void vmm_stat_init(void *vp); void vmm_stat_free(void *vp); /* @@ -79,7 +102,7 @@ void vmm_stat_free(void *vp); int vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf); int vmm_stat_desc_copy(int index, char *buf, int buflen); -static void __inline +static __inline void vmm_stat_array_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, int statidx, uint64_t x) { @@ -92,9 +115,22 @@ vmm_stat_array_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, stats[vst->index + statidx] += x; #endif } - -static void __inline +static __inline void +vmm_stat_array_set(struct vm *vm, int vcpu, struct vmm_stat_type *vst, + int statidx, uint64_t val) +{ +#ifdef VMM_KEEP_STATS + uint64_t *stats; + + stats = vcpu_stats(vm, vcpu); + + if (vst->index >= 0 && 
statidx < vst->nelems) + stats[vst->index + statidx] = val; +#endif +} + +static __inline void vmm_stat_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t x) { @@ -103,6 +139,15 @@ vmm_stat_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t x) #endif } +static __inline void +vmm_stat_set(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t val) +{ + +#ifdef VMM_KEEP_STATS + vmm_stat_array_set(vm, vcpu, vst, 0, val); +#endif +} + VMM_STAT_DECLARE(VCPU_MIGRATIONS); VMM_STAT_DECLARE(VMEXIT_COUNT); VMM_STAT_DECLARE(VMEXIT_EXTINT); @@ -121,7 +166,7 @@ VMM_STAT_DECLARE(VMEXIT_INST_EMUL); VMM_STAT_DECLARE(VMEXIT_UNKNOWN); VMM_STAT_DECLARE(VMEXIT_ASTPENDING); VMM_STAT_DECLARE(VMEXIT_USERSPACE); -VMM_STAT_DECLARE(VMEXIT_RENDEZVOUS); -VMM_STAT_DECLARE(VMEXIT_USERSPACE); +VMM_STAT_DECLARE(VMEXIT_RUNBLOCK); VMM_STAT_DECLARE(VMEXIT_EXCEPTION); +VMM_STAT_DECLARE(VMEXIT_REQIDLE); #endif diff --git a/usr/src/uts/i86pc/io/vmm/vmm_support.s b/usr/src/uts/i86pc/io/vmm/vmm_support.s new file mode 100644 index 0000000000..5777d46959 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_support.s @@ -0,0 +1,54 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2019 Joyent, Inc. + */ + +#include +#include + +/* + * %rdi = trapno + * + * This variant is for any explicit exception injection that we need: in this + * case, we can't just, for example, do a direct "int $2", as that will then + * trash our %cr3 via tr_nmiint due to KPTI, so we have to fake a trap frame. + * Both NMIs and MCEs don't push an 'err' into the frame. + */ +ENTRY_NP(vmm_call_trap) + pushq %rbp + movq %rsp, %rbp + movq %rsp, %r11 + andq $~0xf, %rsp /* align stack */ + pushq $KDS_SEL /* %ss */ + pushq %r11 /* %rsp */ + pushfq /* %rflags */ + pushq $KCS_SEL /* %cs */ + leaq .trap_iret_dest(%rip), %rcx + pushq %rcx /* %rip */ + cli + cmpq $T_NMIFLT, %rdi + je nmiint + cmpq $T_MCE, %rdi + je mcetrap + + pushq %rdi /* save our bad trapno... */ + leaq __vmm_call_bad_trap(%rip), %rdi + xorl %eax, %eax + call panic + /*NOTREACHED*/ + +.trap_iret_dest: + popq %rbp + ret +SET_SIZE(vmm_call_trap) + +__vmm_call_bad_trap: + .string "bad trapno for vmm_call_trap()" diff --git a/usr/src/uts/i86pc/io/vmm/vmm_util.c b/usr/src/uts/i86pc/io/vmm/vmm_util.c index fabd42e13c..3eadfe57e5 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_util.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_util.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
 *
- * $FreeBSD: head/sys/amd64/vmm/vmm_util.c 245678 2013-01-20 03:42:49Z neel $
+ * $FreeBSD$
 */
 /*
  * This file and its contents are supplied under the terms of the
@@ -39,7 +41,7 @@
 */
 
 #include 
-__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_util.c 245678 2013-01-20 03:42:49Z neel $");
+__FBSDID("$FreeBSD$");
 
 #include 
 #include 
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_util.h b/usr/src/uts/i86pc/io/vmm/vmm_util.h
index fe1c1c9449..fc7e7364c7 100644
--- a/usr/src/uts/i86pc/io/vmm/vmm_util.h
+++ b/usr/src/uts/i86pc/io/vmm/vmm_util.h
@@ -1,4 +1,6 @@
 /*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
@@ -23,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/amd64/vmm/vmm_util.h 245678 2013-01-20 03:42:49Z neel $
+ * $FreeBSD$
 */
 
 #ifndef _VMM_UTIL_H_
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_zsd.c b/usr/src/uts/i86pc/io/vmm/vmm_zsd.c
new file mode 100644
index 0000000000..0271cc339e
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/vmm_zsd.c
@@ -0,0 +1,218 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2018, Joyent, Inc.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/*
+ * zone specific data
+ *
+ * Zone specific data is used to keep an association between zones and the vmm
+ * instances that may be running in them. This is used to ensure that vmm
+ * instances do not outlive their parent zone.
+ *
+ * Locking strategy
+ *
+ * The global vmm_zsd_lock is held while modifying vmm_zsd_list.
+ *
+ * The per zone vz_lock in vmm_zsd_t is held while reading or writing anything
+ * within a vmm_zsd_t instance. This is important to ensure that there is no
+ * accidental VM creation as a zone is going down.
+ */
+
+/*
+ * One of these per zone.
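+ *
+ * A rough sketch of the lifecycle this structure tracks (illustrative
+ * only; the callback ordering is established by zone_key_create() in
+ * vmm_zsd_init() below):
+ *
+ *	vmm_zsd_create(zid)		zone boots or key is registered
+ *	  vmm_zsd_add_vm(sc)		VM created within the zone
+ *	  vmm_zsd_rem_vm(sc)		VM destroyed normally
+ *	vmm_zsd_shutdown(zid, zsd)	zone halts: power off lingering VMs
+ *	vmm_zsd_destroy(zid, zsd)	zone destroyed: reap and free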
+ */
+struct vmm_zsd {
+	list_t		vz_vmms;	/* vmm instances in the zone */
+	list_node_t	vz_linkage;	/* link to other zones */
+	boolean_t	vz_active;	/* B_FALSE early in shutdown callback */
+	zoneid_t	vz_zoneid;
+	kmutex_t	vz_lock;
+};
+
+static kmutex_t vmm_zsd_lock;	/* Protects vmm_zsd_list */
+static list_t vmm_zsd_list;	/* Linkage between all zsd instances */
+
+static zone_key_t vmm_zsd_key;
+
+int
+vmm_zsd_add_vm(vmm_softc_t *sc)
+{
+	vmm_zsd_t *zsd;
+
+	ASSERT(sc->vmm_zone != NULL);
+
+	mutex_enter(&vmm_zsd_lock);
+
+	for (zsd = list_head(&vmm_zsd_list); zsd != NULL;
+	    zsd = list_next(&vmm_zsd_list, zsd)) {
+		if (zsd->vz_zoneid == sc->vmm_zone->zone_id) {
+			break;
+		}
+	}
+
+	VERIFY(zsd != NULL);
+	mutex_exit(&vmm_zsd_lock);
+
+	mutex_enter(&zsd->vz_lock);
+	if (!zsd->vz_active) {
+		mutex_exit(&zsd->vz_lock);
+		return (ENOSYS);
+	}
+
+	sc->vmm_zsd = zsd;
+	list_insert_tail(&zsd->vz_vmms, sc);
+
+	mutex_exit(&zsd->vz_lock);
+
+	return (0);
+}
+
+void
+vmm_zsd_rem_vm(vmm_softc_t *sc)
+{
+	vmm_zsd_t *zsd = sc->vmm_zsd;
+
+	mutex_enter(&zsd->vz_lock);
+
+	list_remove(&zsd->vz_vmms, sc);
+	sc->vmm_zsd = NULL;
+
+	mutex_exit(&zsd->vz_lock);
+}
+
+static void *
+vmm_zsd_create(zoneid_t zid)
+{
+	vmm_zsd_t *zsd;
+	zone_t *zone;
+
+	zsd = kmem_zalloc(sizeof (*zsd), KM_SLEEP);
+
+	list_create(&zsd->vz_vmms, sizeof (vmm_softc_t),
+	    offsetof(vmm_softc_t, vmm_zsd_linkage));
+
+	zsd->vz_zoneid = zid;
+
+	mutex_init(&zsd->vz_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	/*
+	 * If the vmm module is loaded while this zone is in the midst of
+	 * shutting down, vmm_zsd_destroy() may be called without
+	 * vmm_zsd_shutdown() ever being called. If it is shutting down, there
+	 * is no sense in letting any in-flight VM creation succeed, so set
+	 * vz_active accordingly.
+	 *
+	 * zone_find_by_id_nolock() is used rather than zone_find_by_id()
+	 * so that the zone is returned regardless of state.
+	 */
+	zone = zone_find_by_id_nolock(zid);
+	VERIFY(zone != NULL);
+	zsd->vz_active = zone_status_get(zone) < ZONE_IS_SHUTTING_DOWN;
+
+	mutex_enter(&vmm_zsd_lock);
+	list_insert_tail(&vmm_zsd_list, zsd);
+	mutex_exit(&vmm_zsd_lock);
+
+	return (zsd);
+}
+
+/*
+ * Tells all running VMs in the zone to power off. This does not reclaim
+ * guest resources (memory, etc.).
+ */
+static void
+vmm_zsd_shutdown(zoneid_t zid, void *data)
+{
+	vmm_zsd_t *zsd = data;
+	vmm_softc_t *sc;
+
+	mutex_enter(&zsd->vz_lock);
+
+	/*
+	 * This may already be B_FALSE. See comment in vmm_zsd_create(). If it
+	 * is already B_FALSE we will take a quick trip through the empty list.
+	 */
+	zsd->vz_active = B_FALSE;
+
+	for (sc = list_head(&zsd->vz_vmms); sc != NULL;
+	    sc = list_next(&zsd->vz_vmms, sc)) {
+		/* Send a poweroff to the VM, whether running or not. */
+		(void) vm_suspend(sc->vmm_vm, VM_SUSPEND_POWEROFF);
+	}
+	mutex_exit(&zsd->vz_lock);
+}
+
+/*
+ * Reap all VMs that remain and free up guest resources.
+ */
+static void
+vmm_zsd_destroy(zoneid_t zid, void *data)
+{
+	vmm_zsd_t *zsd = data;
+	vmm_softc_t *sc;
+
+	mutex_enter(&vmm_zsd_lock);
+	list_remove(&vmm_zsd_list, zsd);
+	mutex_exit(&vmm_zsd_lock);
+
+	mutex_enter(&zsd->vz_lock);
+	ASSERT(!zsd->vz_active);
+
+	while ((sc = list_remove_head(&zsd->vz_vmms)) != NULL) {
+		int err;
+
+		/*
+		 * This frees all resources associated with the vm, including
+		 * sc.
+ */ + err = vmm_do_vm_destroy(sc, B_FALSE); + ASSERT3S(err, ==, 0); + } + + mutex_exit(&zsd->vz_lock); + mutex_destroy(&zsd->vz_lock); + + kmem_free(zsd, sizeof (*zsd)); +} + +void +vmm_zsd_init(void) +{ + mutex_init(&vmm_zsd_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&vmm_zsd_list, sizeof (vmm_zsd_t), + offsetof(vmm_zsd_t, vz_linkage)); + zone_key_create(&vmm_zsd_key, vmm_zsd_create, vmm_zsd_shutdown, + vmm_zsd_destroy); +} + +void +vmm_zsd_fini(void) +{ + /* Calls vmm_zsd_destroy() on all zones. */ + zone_key_delete(vmm_zsd_key); + ASSERT(list_is_empty(&vmm_zsd_list)); + + list_destroy(&vmm_zsd_list); + mutex_destroy(&vmm_zsd_lock); +} diff --git a/usr/src/uts/i86pc/io/vmm/vmx_assym.s b/usr/src/uts/i86pc/io/vmm/vmx_assym.s deleted file mode 100644 index d84ca30275..0000000000 --- a/usr/src/uts/i86pc/io/vmm/vmx_assym.s +++ /dev/null @@ -1 +0,0 @@ -#include "vmx_assym.h" diff --git a/usr/src/uts/i86pc/io/vmm/x86.c b/usr/src/uts/i86pc/io/vmm/x86.c index 02222ef5e7..d74f866013 100644 --- a/usr/src/uts/i86pc/io/vmm/x86.c +++ b/usr/src/uts/i86pc/io/vmm/x86.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/x86.c 255645 2013-09-17 17:56:53Z grehan $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,38 +38,69 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2014 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. */ #include -__FBSDID("$FreeBSD: head/sys/amd64/vmm/x86.c 255645 2013-09-17 17:56:53Z grehan $"); +__FBSDID("$FreeBSD$"); #include -#include +#include #include -#include +#include +#include #include #include #include +#include #include #include +#include "vmm_host.h" +#include "vmm_ktr.h" +#include "vmm_util.h" #include "x86.h" +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, topology, CTLFLAG_RD, 0, NULL); + #define CPUID_VM_HIGH 0x40000000 static const char bhyve_id[12] = "bhyve bhyve "; static uint64_t bhyve_xcpuids; +SYSCTL_ULONG(_hw_vmm, OID_AUTO, bhyve_xcpuids, CTLFLAG_RW, &bhyve_xcpuids, 0, + "Number of times an unknown cpuid leaf was accessed"); + +static int cpuid_leaf_b = 1; +SYSCTL_INT(_hw_vmm_topology, OID_AUTO, cpuid_leaf_b, CTLFLAG_RDTUN, + &cpuid_leaf_b, 0, NULL); + +/* + * Round up to the next power of two, if necessary, and then take log2. + * Returns -1 if argument is zero. + */ +static __inline int +log2(u_int x) +{ + + return (fls(x << (1 - powerof2(x))) - 1); +} int x86_emulate_cpuid(struct vm *vm, int vcpu_id, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) { - int error; - unsigned int func, regs[4]; + const struct xsave_limits *limits; + uint64_t cr4; + int error, enable_invpcid, level, width = 0, x2apic_id = 0; + unsigned int func, regs[4], logical_cpus = 0; enum x2apic_state x2apic_state; + uint16_t cores, maxcpus, sockets, threads; + + VCPU_CTR2(vm, vcpu_id, "cpuid %#x,%#x", *eax, *ecx); /* * Requests for invalid CPUID levels should map to the highest @@ -102,26 +135,108 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id, case CPUID_8000_0003: case CPUID_8000_0004: case CPUID_8000_0006: + cpuid_count(*eax, *ecx, regs); + break; case CPUID_8000_0008: cpuid_count(*eax, *ecx, regs); + if (vmm_is_amd()) { + /* + * As on Intel (0000_0007:0, EDX), mask out + * unsupported or unsafe AMD extended features + * (8000_0008 EBX). 
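+			 * Only CLZERO, IRPERF and XSAVEERPTR survive the
+			 * mask applied below.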
+			 */
+			regs[1] &= (AMDFEID_CLZERO | AMDFEID_IRPERF |
+			    AMDFEID_XSAVEERPTR);
+
+			vm_get_topology(vm, &sockets, &cores, &threads,
+			    &maxcpus);
+			/*
+			 * Here, width is ApicIdCoreIdSize, present on
+			 * at least Family 15h and newer. It
+			 * represents the "number of bits in the
+			 * initial apicid that indicate thread id
+			 * within a package."
+			 *
+			 * Our topo_probe_amd() uses it for
+			 * pkg_id_shift and other OSes may rely on it.
+			 */
+			width = MIN(0xF, log2(threads * cores));
+			if (width < 0x4)
+				width = 0;
+			logical_cpus = MIN(0xFF, threads * cores - 1);
+			regs[2] = (width << AMDID_COREID_SIZE_SHIFT) | logical_cpus;
+		}
 		break;
 	case CPUID_8000_0001:
+		cpuid_count(*eax, *ecx, regs);
+
+		/*
+		 * Hide SVM from guest.
+		 */
+		regs[2] &= ~AMDID2_SVM;
+
+		/*
+		 * Don't advertise extended performance counter MSRs
+		 * to the guest.
+		 */
+		regs[2] &= ~AMDID2_PCXC;
+		regs[2] &= ~AMDID2_PNXC;
+		regs[2] &= ~AMDID2_PTSCEL2I;
+
+		/*
+		 * Don't advertise Instruction Based Sampling feature.
+		 */
+		regs[2] &= ~AMDID2_IBS;
+
+		/* NodeID MSR not available */
+		regs[2] &= ~AMDID2_NODE_ID;
+
+		/* Don't advertise the OS visible workaround feature */
+		regs[2] &= ~AMDID2_OSVW;
+
+		/* Hide mwaitx/monitorx capability from the guest */
+		regs[2] &= ~AMDID2_MWAITX;
+
+#ifndef __FreeBSD__
+		/*
+		 * Detection routines for TCE and FFXSR are missing
+		 * from our vm_cpuid_capability() detection logic
+		 * today. Mask them out until that is remedied.
+		 * They do not appear to be in common usage, so their
+		 * absence should not cause undue trouble.
+		 */
+		regs[2] &= ~AMDID2_TCE;
+		regs[3] &= ~AMDID_FFXSR;
+#endif
+
 		/*
 		 * Hide rdtscp/ia32_tsc_aux until we know how
 		 * to deal with them.
 		 */
-		cpuid_count(*eax, *ecx, regs);
 		regs[3] &= ~AMDID_RDTSCP;
 		break;
 	case CPUID_8000_0007:
-		cpuid_count(*eax, *ecx, regs);
-#ifdef __FreeBSD__
 		/*
-		 * If the host TSCs are not synchronized across
-		 * physical cpus then we cannot advertise an
-		 * invariant tsc to a vcpu.
+		 * AMD uses this leaf to advertise the processor's
+		 * power monitoring and RAS capabilities. These
+		 * features are hardware-specific and exposing
+		 * them to a guest doesn't make a lot of sense.
+		 *
+		 * Intel uses this leaf only to advertise the
+		 * "Invariant TSC" feature with all other bits
+		 * being reserved (set to zero).
+		 */
+		regs[0] = 0;
+		regs[1] = 0;
+		regs[2] = 0;
+		regs[3] = 0;
+
+		/*
+		 * "Invariant TSC" can be advertised to the guest if:
+		 * - host TSC frequency is invariant
+		 * - host TSCs are synchronized across physical cpus
 		 *
 		 * XXX This still falls short because the vcpu
 		 * can observe the TSC moving backwards as it
@@ -129,9 +244,73 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id,
 		 * it should discourage the guest from using the
 		 * TSC to keep track of time.
 		 */
-		if (!smp_tsc)
-			regs[3] &= ~AMDPM_TSC_INVARIANT;
-#endif
+#ifdef __FreeBSD__
+		/* XXXJOY: Wire up with our own TSC logic */
+		if (tsc_is_invariant && smp_tsc)
+			regs[3] |= AMDPM_TSC_INVARIANT;
+#endif /* __FreeBSD__ */
+		break;
+
+	case CPUID_8000_001D:
+		/* AMD Cache topology, like 0000_0004 for Intel. */
+		if (!vmm_is_amd())
+			goto default_leaf;
+
+		/*
+		 * Similar to Intel, generate a fictitious cache
+		 * topology for the guest with L3 shared by the
+		 * package, and L1 and L2 local to a core.
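+		 *
+		 * A worked example (values illustrative, not from real
+		 * hardware): with a guest topology of 2 threads and 4
+		 * cores, the *ecx == 2 (L3) case below computes
+		 * logical_cpus = 2 * 4 - 1 = 7, level = 3 and func = 3,
+		 * so EAX = (7 << 14) | (1 << 8) | (3 << 5) | 3 = 0x1c163.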
+ */ + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + switch (*ecx) { + case 0: + logical_cpus = threads; + level = 1; + func = 1; /* data cache */ + break; + case 1: + logical_cpus = threads; + level = 2; + func = 3; /* unified cache */ + break; + case 2: + logical_cpus = threads * cores; + level = 3; + func = 3; /* unified cache */ + break; + default: + logical_cpus = 0; + level = 0; + func = 0; + break; + } + + logical_cpus = MIN(0xfff, logical_cpus - 1); + regs[0] = (logical_cpus << 14) | (1 << 8) | + (level << 5) | func; + regs[1] = (func > 0) ? (CACHE_LINE_SIZE - 1) : 0; + regs[2] = 0; + regs[3] = 0; + break; + + case CPUID_8000_001E: + /* AMD Family 16h+ additional identifiers */ + if (!vmm_is_amd() || CPUID_TO_FAMILY(cpu_id) < 0x16) + goto default_leaf; + + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + regs[0] = vcpu_id; + threads = MIN(0xFF, threads - 1); + regs[1] = (threads << 8) | + (vcpu_id >> log2(threads + 1)); + /* + * XXX Bhyve topology cannot yet represent >1 node per + * processor. + */ + regs[2] = 0; + regs[3] = 0; break; case CPUID_0000_0001: @@ -150,22 +329,41 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id, regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT); /* - * Don't expose VMX, SpeedStep or TME capability. + * Don't expose VMX, SpeedStep, TME or SMX capability. * Advertise x2APIC capability and Hypervisor guest. */ regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2); + regs[2] &= ~(CPUID2_SMX); regs[2] |= CPUID2_HV; if (x2apic_state != X2APIC_DISABLED) regs[2] |= CPUID2_X2APIC; + else + regs[2] &= ~CPUID2_X2APIC; /* - * Hide xsave/osxsave/avx until the FPU save/restore - * issues are resolved + * Only advertise CPUID2_XSAVE in the guest if + * the host is using XSAVE. */ - regs[2] &= ~(CPUID2_XSAVE | CPUID2_OSXSAVE | - CPUID2_AVX); + if (!(regs[2] & CPUID2_OSXSAVE)) + regs[2] &= ~CPUID2_XSAVE; + + /* + * If CPUID2_XSAVE is being advertised and the + * guest has set CR4_XSAVE, set + * CPUID2_OSXSAVE. + */ + regs[2] &= ~CPUID2_OSXSAVE; + if (regs[2] & CPUID2_XSAVE) { + error = vm_get_register(vm, vcpu_id, + VM_REG_GUEST_CR4, &cr4); + if (error) + panic("x86_emulate_cpuid: error %d " + "fetching %%cr4", error); + if (cr4 & CR4_XSAVE) + regs[2] |= CPUID2_OSXSAVE; + } /* * Hide monitor/mwait until we know how to deal with @@ -177,7 +375,7 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id, * Hide the performance and debug features. */ regs[2] &= ~CPUID2_PDCM; - + /* * No TSC deadline support in the APIC yet */ @@ -187,48 +385,95 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id, * Hide thermal monitoring */ regs[3] &= ~(CPUID_ACPI | CPUID_TM); - + /* - * Machine check handling is done in the host. + * Hide the debug store capability. */ - regs[3] &= ~(CPUID_MCA | CPUID_MCE); - - /* - * Hide the debug store capability. - */ regs[3] &= ~CPUID_DS; /* - * Disable multi-core. + * Advertise the Machine Check and MTRR capability. + * + * Some guest OSes (e.g. Windows) will not boot if + * these features are absent. */ + regs[3] |= (CPUID_MCA | CPUID_MCE | CPUID_MTRR); + + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + logical_cpus = threads * cores; regs[1] &= ~CPUID_HTT_CORES; - regs[3] &= ~CPUID_HTT; + regs[1] |= (logical_cpus & 0xff) << 16; + regs[3] |= CPUID_HTT; break; case CPUID_0000_0004: - do_cpuid(4, regs); + cpuid_count(*eax, *ecx, regs); - /* - * Do not expose topology. 
- */ - regs[0] &= 0xffff8000; - /* - * The maximum number of processor cores in - * this physical processor package and the - * maximum number of threads sharing this - * cache are encoded with "plus 1" encoding. - * Adding one to the value in this register - * field to obtains the actual value. - * - * Therefore 0 for both indicates 1 core - * per package and no cache sharing. - */ + if (regs[0] || regs[1] || regs[2] || regs[3]) { + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + regs[0] &= 0x3ff; + regs[0] |= (cores - 1) << 26; + /* + * Cache topology: + * - L1 and L2 are shared only by the logical + * processors in a single core. + * - L3 and above are shared by all logical + * processors in the package. + */ + logical_cpus = threads; + level = (regs[0] >> 5) & 0x7; + if (level >= 3) + logical_cpus *= cores; + regs[0] |= (logical_cpus - 1) << 14; + } break; - case CPUID_0000_0006: case CPUID_0000_0007: + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + + /* leaf 0 */ + if (*ecx == 0) { + cpuid_count(*eax, *ecx, regs); + + /* Only leaf 0 is supported */ + regs[0] = 0; + + /* + * Expose known-safe features. + */ + regs[1] &= (CPUID_STDEXT_FSGSBASE | + CPUID_STDEXT_BMI1 | CPUID_STDEXT_HLE | + CPUID_STDEXT_AVX2 | CPUID_STDEXT_BMI2 | + CPUID_STDEXT_ERMS | CPUID_STDEXT_RTM | + CPUID_STDEXT_AVX512F | + CPUID_STDEXT_RDSEED | + CPUID_STDEXT_AVX512PF | + CPUID_STDEXT_AVX512ER | + CPUID_STDEXT_AVX512CD | CPUID_STDEXT_SHA); + regs[2] = 0; + regs[3] &= CPUID_STDEXT3_MD_CLEAR; + + /* Advertise INVPCID if it is enabled. */ + error = vm_get_capability(vm, vcpu_id, + VM_CAP_ENABLE_INVPCID, &enable_invpcid); + if (error == 0 && enable_invpcid) + regs[1] |= CPUID_STDEXT_INVPCID; + } + break; + + case CPUID_0000_0006: + regs[0] = CPUTPM1_ARAT; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + case CPUID_0000_000A: - case CPUID_0000_000D: /* * Handle the access, but report 0 for * all options @@ -241,12 +486,93 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id, case CPUID_0000_000B: /* - * Processor topology enumeration + * Intel processor topology enumeration */ - regs[0] = 0; - regs[1] = 0; - regs[2] = *ecx & 0xff; - regs[3] = vcpu_id; + if (vmm_is_intel()) { + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + if (*ecx == 0) { + logical_cpus = threads; + width = log2(logical_cpus); + level = CPUID_TYPE_SMT; + x2apic_id = vcpu_id; + } + + if (*ecx == 1) { + logical_cpus = threads * cores; + width = log2(logical_cpus); + level = CPUID_TYPE_CORE; + x2apic_id = vcpu_id; + } + + if (!cpuid_leaf_b || *ecx >= 2) { + width = 0; + logical_cpus = 0; + level = 0; + x2apic_id = 0; + } + + regs[0] = width & 0x1f; + regs[1] = logical_cpus & 0xffff; + regs[2] = (level << 8) | (*ecx & 0xff); + regs[3] = x2apic_id; + } else { + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + } + break; + + case CPUID_0000_000D: + limits = vmm_get_xsave_limits(); + if (!limits->xsave_enabled) { + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + } + + cpuid_count(*eax, *ecx, regs); + switch (*ecx) { + case 0: + /* + * Only permit the guest to use bits + * that are active in the host in + * %xcr0. Also, claim that the + * maximum save area size is + * equivalent to the host's current + * save area size. Since this runs + * "inside" of vmrun(), it runs with + * the guest's xcr0, so the current + * save area size is correct as-is. 
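+			 *
+			 * A worked illustration (host values assumed):
+			 * with xcr0_allowed = 0x7 (x87 | SSE | AVX), a
+			 * guest leaf-0 query sees EAX masked to 0x7,
+			 * EDX cleared, and ECX holding the host's
+			 * current save area size.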
+ */ + regs[0] &= limits->xcr0_allowed; + regs[2] = limits->xsave_max_size; + regs[3] &= (limits->xcr0_allowed >> 32); + break; + case 1: + /* Only permit XSAVEOPT. */ + regs[0] &= CPUID_EXTSTATE_XSAVEOPT; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + default: + /* + * If the leaf is for a permitted feature, + * pass through as-is, otherwise return + * all zeroes. + */ + if (!(limits->xcr0_allowed & (1ul << *ecx))) { + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + } + break; + } break; case 0x40000000: @@ -257,6 +583,7 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id, break; default: +default_leaf: /* * The leaf value has already been clamped so * simply pass this through, keeping count of @@ -274,3 +601,45 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id, return (1); } + +bool +vm_cpuid_capability(struct vm *vm, int vcpuid, enum vm_cpuid_capability cap) +{ + bool rv; + + KASSERT(cap > 0 && cap < VCC_LAST, ("%s: invalid vm_cpu_capability %d", + __func__, cap)); + + /* + * Simply passthrough the capabilities of the host cpu for now. + */ + rv = false; + switch (cap) { +#ifdef __FreeBSD__ + case VCC_NO_EXECUTE: + if (amd_feature & AMDID_NX) + rv = true; + break; + case VCC_FFXSR: + if (amd_feature & AMDID_FFXSR) + rv = true; + break; + case VCC_TCE: + if (amd_feature2 & AMDID2_TCE) + rv = true; + break; +#else + case VCC_NO_EXECUTE: + if (is_x86_feature(x86_featureset, X86FSET_NX)) + rv = true; + break; + /* XXXJOY: No kernel detection for FFXR or TCE at present, so ignore */ + case VCC_FFXSR: + case VCC_TCE: + break; +#endif + default: + panic("%s: unknown vm_cpu_capability %d", __func__, cap); + } + return (rv); +} diff --git a/usr/src/uts/i86pc/io/vmm/x86.h b/usr/src/uts/i86pc/io/vmm/x86.h index db2340b37b..0d70c04fd8 100644 --- a/usr/src/uts/i86pc/io/vmm/x86.h +++ b/usr/src/uts/i86pc/io/vmm/x86.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/x86.h 255287 2013-09-06 05:16:10Z grehan $ + * $FreeBSD$ */ #ifndef _X86_H_ @@ -47,6 +49,8 @@ #define CPUID_8000_0006 (0x80000006) #define CPUID_8000_0007 (0x80000007) #define CPUID_8000_0008 (0x80000008) +#define CPUID_8000_001D (0x8000001D) +#define CPUID_8000_001E (0x8000001E) /* * CPUID instruction Fn0000_0001: @@ -62,4 +66,17 @@ int x86_emulate_cpuid(struct vm *vm, int vcpu_id, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx); +enum vm_cpuid_capability { + VCC_NONE, + VCC_NO_EXECUTE, + VCC_FFXSR, + VCC_TCE, + VCC_LAST +}; + +/* + * Return 'true' if the capability 'cap' is enabled in this virtual cpu + * and 'false' otherwise. + */ +bool vm_cpuid_capability(struct vm *vm, int vcpuid, enum vm_cpuid_capability); #endif diff --git a/usr/src/uts/i86pc/os/gipt.c b/usr/src/uts/i86pc/os/gipt.c new file mode 100644 index 0000000000..ace7e03438 --- /dev/null +++ b/usr/src/uts/i86pc/os/gipt.c @@ -0,0 +1,566 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/*
+ * Generic Indexed Page Table
+ *
+ * There are several applications, such as hardware virtualization or IOMMU
+ * control, which require construction of a page table tree to represent a
+ * virtual address space. Many features of the existing htable system would be
+ * convenient for this, but its tight coupling to the VM system makes it
+ * undesirable for independent consumers. The GIPT interface exists to provide
+ * page table allocation and indexing on top of which a table hierarchy
+ * (EPT, VT-d, etc) can be built by upstack logic.
+ *
+ * Types:
+ *
+ * gipt_t - Represents a single page table with a physical backing page and
+ * associated metadata.
+ * gipt_map_t - The workhorse of this facility, it contains a hash table to
+ * index all of the gipt_t entries which make up the page table tree.
+ * struct gipt_cbs - Callbacks used by the gipt_map_t:
+ * gipt_pte_type_cb_t - Given a PTE, emit the type (empty/page/table)
+ * gipt_pte_map_cb_t - Given a PFN, emit a (child) table mapping
+ */
+
+/*
+ * For now, the level shifts are hard-coded to match standard 4-level
+ * 64-bit paging structures.
+ */
+
+#define	GIPT_HASH(map, va, lvl) \
+	((((va) >> 12) + ((va) >> 28) + (lvl)) & ((map)->giptm_table_cnt - 1))
+
+const uint_t gipt_level_shift[GIPT_MAX_LEVELS+1] = {
+	12,	/* 4K */
+	21,	/* 2M */
+	30,	/* 1G */
+	39,	/* 512G */
+	48	/* MAX */
+};
+const uint64_t gipt_level_mask[GIPT_MAX_LEVELS+1] = {
+	0xfffffffffffff000ull,	/* 4K */
+	0xffffffffffe00000ull,	/* 2M */
+	0xffffffffc0000000ull,	/* 1G */
+	0xffffff8000000000ull,	/* 512G */
+	0xffff000000000000ull	/* MAX */
+};
+const uint64_t gipt_level_size[GIPT_MAX_LEVELS+1] = {
+	0x0000000000001000ull,	/* 4K */
+	0x0000000000200000ull,	/* 2M */
+	0x0000000040000000ull,	/* 1G */
+	0x0000008000000000ull,	/* 512G */
+	0x0001000000000000ull	/* MAX */
+};
+const uint64_t gipt_level_count[GIPT_MAX_LEVELS+1] = {
+	0x0000000000000001ull,	/* 4K */
+	0x0000000000000200ull,	/* 2M */
+	0x0000000000040000ull,	/* 1G */
+	0x0000000008000000ull,	/* 512G */
+	0x0000001000000000ull	/* MAX */
+};
+
+/*
+ * Allocate a gipt_t structure with a corresponding page of memory to hold the
+ * PTEs which it contains.
+ */
+gipt_t *
+gipt_alloc(void)
+{
+	gipt_t *pt;
+	void *page;
+
+	pt = kmem_zalloc(sizeof (*pt), KM_SLEEP);
+	page = kmem_zalloc(PAGESIZE, KM_SLEEP);
+	pt->gipt_kva = page;
+	pt->gipt_pfn = hat_getpfnum(kas.a_hat, page);
+
+	return (pt);
+}
+
+/*
+ * Free a gipt_t structure along with its page of PTE storage.
+ */
+void
+gipt_free(gipt_t *pt)
+{
+	void *page = pt->gipt_kva;
+
+	ASSERT(pt->gipt_pfn != PFN_INVALID);
+	ASSERT(pt->gipt_kva != NULL);
+
+	pt->gipt_pfn = PFN_INVALID;
+	pt->gipt_kva = NULL;
+
+	kmem_free(page, PAGESIZE);
+	kmem_free(pt, sizeof (*pt));
+}
+
+/*
+ * Initialize a gipt_map_t with a max level (must be >= 1), allocating its
+ * hash table based on a provided size (must be a power of 2).
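+ *
+ * A minimal consumer sketch (the callback names are hypothetical):
+ *
+ *	struct gipt_cbs cbs;
+ *	gipt_t *root;
+ *
+ *	cbs.giptc_pte_type = my_pte_type;
+ *	cbs.giptc_pte_map = my_pte_map;
+ *	root = gipt_alloc();
+ *	root->gipt_level = levels - 1;
+ *	gipt_map_init(&map, levels, GIPT_HASH_SIZE_DEFAULT, &cbs, root);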
+ */ +void +gipt_map_init(gipt_map_t *map, uint_t levels, uint_t hash_table_size, + const struct gipt_cbs *cbs, gipt_t *root) +{ + VERIFY(map->giptm_root == NULL); + VERIFY(map->giptm_hash == NULL); + VERIFY3U(levels, >, 0); + VERIFY3U(levels, <=, GIPT_MAX_LEVELS); + VERIFY(ISP2(hash_table_size)); + VERIFY(root != NULL); + + mutex_init(&map->giptm_lock, NULL, MUTEX_DEFAULT, NULL); + map->giptm_table_cnt = hash_table_size; + bcopy(cbs, &map->giptm_cbs, sizeof (*cbs)); + map->giptm_hash = kmem_alloc(sizeof (list_t) * map->giptm_table_cnt, + KM_SLEEP); + for (uint_t i = 0; i < hash_table_size; i++) { + list_create(&map->giptm_hash[i], sizeof (gipt_t), + offsetof(gipt_t, gipt_node)); + } + map->giptm_levels = levels; + + /* + * Insert the table root into the hash. It will be held in existence + * with an extra "valid" reference. This will prevent its clean-up + * during gipt_map_clean_parents() calls, even if it has no children. + */ + mutex_enter(&map->giptm_lock); + gipt_map_insert(map, root); + map->giptm_root = root; + root->gipt_valid_cnt++; + mutex_exit(&map->giptm_lock); +} + +/* + * Clean up a gipt_map_t by removing any lingering gipt_t entries referenced by + * it, and freeing its hash table. + */ +void +gipt_map_fini(gipt_map_t *map) +{ + const uint_t cnt = map->giptm_table_cnt; + const size_t sz = sizeof (list_t) * cnt; + + mutex_enter(&map->giptm_lock); + /* Clean up any lingering tables */ + for (uint_t i = 0; i < cnt; i++) { + list_t *list = &map->giptm_hash[i]; + gipt_t *pt; + + while ((pt = list_remove_head(list)) != NULL) { + gipt_free(pt); + } + ASSERT(list_is_empty(list)); + } + + kmem_free(map->giptm_hash, sz); + map->giptm_hash = NULL; + map->giptm_root = NULL; + map->giptm_levels = 0; + mutex_exit(&map->giptm_lock); + mutex_destroy(&map->giptm_lock); +} + +/* + * Look in the map for a gipt_t containing a given VA which is located at a + * specified level. + */ +gipt_t * +gipt_map_lookup(gipt_map_t *map, uint64_t va, uint_t lvl) +{ + gipt_t *pt; + + ASSERT(MUTEX_HELD(&map->giptm_lock)); + ASSERT3U(lvl, <=, GIPT_MAX_LEVELS); + + /* + * Lookup gipt_t at the VA aligned to the next level up. For example, + * level 0 corresponds to a page table containing 512 PTEs which cover + * 4k each, spanning a total 2MB. As such, the base VA of that table + * must be aligned to the same 2MB. + */ + const uint64_t masked_va = va & gipt_level_mask[lvl + 1]; + const uint_t hash = GIPT_HASH(map, masked_va, lvl); + + /* Only the root is expected to be at the top level. */ + if (lvl == (map->giptm_levels - 1) && map->giptm_root != NULL) { + pt = map->giptm_root; + + ASSERT3U(pt->gipt_level, ==, lvl); + + /* + * It may be so that the VA in question is not covered by the + * range of the table root. + */ + if (pt->gipt_vaddr != masked_va) { + return (NULL); + } + + return (pt); + } + + list_t *list = &map->giptm_hash[hash]; + for (pt = list_head(list); pt != NULL; pt = list_next(list, pt)) { + if (pt->gipt_vaddr == masked_va && pt->gipt_level == lvl) + break; + } + return (pt); +} + +/* + * Look in the map for the deepest (lowest level) gipt_t which contains a given + * VA. This could still fail if the VA is outside the range of the table root. 
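+ *
+ * For instance (sketch): if 'va' is covered only by a 2MB large-page
+ * PTE, the level-0 lookup misses and the level-1 gipt_t holding that
+ * PTE is returned:
+ *
+ *	pt = gipt_map_lookup_deepest(map, va);
+ *	if (pt != NULL && pt->gipt_level == 1)
+ *		pte = GIPT_VA2PTE(pt, va);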
+ */ +gipt_t * +gipt_map_lookup_deepest(gipt_map_t *map, uint64_t va) +{ + gipt_t *pt = NULL; + uint_t lvl; + + ASSERT(MUTEX_HELD(&map->giptm_lock)); + + for (lvl = 0; lvl < map->giptm_levels; lvl++) { + pt = gipt_map_lookup(map, va, lvl); + if (pt != NULL) { + break; + } + } + return (pt); +} + +/* + * Given a VA inside a gipt_t, calculate (based on the level of that PT) the VA + * corresponding to the next entry in the table. It returns 0 if that VA would + * fall beyond the bounds of the table. + */ +static __inline__ uint64_t +gipt_next_va(gipt_t *pt, uint64_t va) +{ + const uint_t lvl = pt->gipt_level; + const uint64_t masked = va & gipt_level_mask[lvl]; + const uint64_t max = pt->gipt_vaddr + gipt_level_size[lvl+1]; + const uint64_t next = masked + gipt_level_size[lvl]; + + ASSERT3U(masked, >=, pt->gipt_vaddr); + ASSERT3U(masked, <, max); + + /* + * If the "next" VA would be outside this table, including cases where + * it overflowed, indicate an error result. + */ + if (next >= max || next <= masked) { + return (0); + } + return (next); +} + +/* + * For a given VA, find the next VA which corresponds to a valid page mapping. + * The gipt_t containing that VA will be indicated via 'ptp'. (The gipt_t of + * the starting VA can be passed in via 'ptp' for a minor optimization). If + * there is no valid mapping higher than 'va' but contained within 'max_va', + * then this will indicate failure with 0 returned. + */ +uint64_t +gipt_map_next_page(gipt_map_t *map, uint64_t va, uint64_t max_va, gipt_t **ptp) +{ + gipt_t *pt = *ptp; + uint64_t cur_va = va; + gipt_pte_type_cb_t pte_type = map->giptm_cbs.giptc_pte_type; + + ASSERT(MUTEX_HELD(&map->giptm_lock)); + ASSERT3U(max_va, !=, 0); + ASSERT3U(ptp, !=, NULL); + + /* + * If a starting table is not provided, search the map for the deepest + * table which contains the VA. If for some reason that VA is beyond + * coverage of the map root, indicate failure. + */ + if (pt == NULL) { + pt = gipt_map_lookup_deepest(map, cur_va); + if (pt == NULL) { + goto fail; + } + } + + /* + * From the starting table (at whatever level that may reside), walk + * forward through the PTEs looking for a valid page mapping. + */ + while (cur_va < max_va) { + const uint64_t next_va = gipt_next_va(pt, cur_va); + if (next_va == 0) { + /* + * The end of this table has been reached. Ascend one + * level to continue the walk if possible. If already + * at the root, the end of the table means failure. + */ + if (pt->gipt_level >= map->giptm_levels) { + goto fail; + } + pt = gipt_map_lookup(map, cur_va, pt->gipt_level + 1); + if (pt == NULL) { + goto fail; + } + continue; + } else if (next_va >= max_va) { + /* + * Terminate the walk with a failure if the VA + * corresponding to the next PTE is beyond the max. + */ + goto fail; + } + cur_va = next_va; + + const uint64_t pte = GIPT_VA2PTE(pt, cur_va); + const gipt_pte_type_t ptet = pte_type(pte, pt->gipt_level); + if (ptet == PTET_EMPTY) { + continue; + } else if (ptet == PTET_PAGE) { + /* A valid page mapping: success. */ + *ptp = pt; + return (cur_va); + } else if (ptet == PTET_LINK) { + /* + * A child page table is present at this PTE. Look it + * up from the map. + */ + ASSERT3U(pt->gipt_level, >, 0); + pt = gipt_map_lookup(map, cur_va, pt->gipt_level - 1); + ASSERT3P(pt, !=, NULL); + break; + } else { + panic("unexpected PTE type %x @ va %p", ptet, cur_va); + } + } + + /* + * By this point, the above loop has located a table structure to + * descend into in order to find the next page. 
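+	 * The invariant below is that 'pt' is the table covering 'cur_va'
+	 * at level pt->gipt_level; descent continues until a page mapping
+	 * is found or the range is exhausted.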
+ */ + while (cur_va < max_va) { + const uint64_t pte = GIPT_VA2PTE(pt, cur_va); + const gipt_pte_type_t ptet = pte_type(pte, pt->gipt_level); + + if (ptet == PTET_EMPTY) { + const uint64_t next_va = gipt_next_va(pt, cur_va); + if (next_va == 0 || next_va >= max_va) { + goto fail; + } + cur_va = next_va; + continue; + } else if (ptet == PTET_PAGE) { + /* A valid page mapping: success. */ + *ptp = pt; + return (cur_va); + } else if (ptet == PTET_LINK) { + /* + * A child page table is present at this PTE. Look it + * up from the map. + */ + ASSERT3U(pt->gipt_level, >, 0); + pt = gipt_map_lookup(map, cur_va, pt->gipt_level - 1); + ASSERT3P(pt, !=, NULL); + } else { + panic("unexpected PTE type %x @ va %p", ptet, cur_va); + } + } + +fail: + *ptp = NULL; + return (0); +} + +/* + * Insert a gipt_t into the map based on its VA and level. It is up to the + * caller to ensure that a duplicate entry does not already exist in the map. + */ +void +gipt_map_insert(gipt_map_t *map, gipt_t *pt) +{ + const uint_t hash = GIPT_HASH(map, pt->gipt_vaddr, pt->gipt_level); + + ASSERT(MUTEX_HELD(&map->giptm_lock)); + ASSERT(gipt_map_lookup(map, pt->gipt_vaddr, pt->gipt_level) == NULL); + VERIFY3U(pt->gipt_level, <, map->giptm_levels); + + list_insert_head(&map->giptm_hash[hash], pt); +} + +/* + * Remove a gipt_t from the map. + */ +void +gipt_map_remove(gipt_map_t *map, gipt_t *pt) +{ + const uint_t hash = GIPT_HASH(map, pt->gipt_vaddr, pt->gipt_level); + + ASSERT(MUTEX_HELD(&map->giptm_lock)); + + list_remove(&map->giptm_hash[hash], pt); +} + +/* + * Given a VA, create any missing gipt_t entries from the specified level all + * the way up to (but not including) the root. This is done from lowest level + * to highest, and stops when an existing table covering that VA is found. + * References to any created gipt_t tables, plus the final "found" gipt_t are + * stored in 'pts'. The number of gipt_t pointers stored to 'pts' serves as + * the return value (1 <= val <= root level). It is up to the caller to + * populate linking PTEs to the newly created empty tables. + */ +static uint_t +gipt_map_ensure_chain(gipt_map_t *map, uint64_t va, uint_t lvl, gipt_t **pts) +{ + const uint_t root_lvl = map->giptm_root->gipt_level; + uint_t clvl = lvl, count = 0; + gipt_t *child_pt = NULL; + + ASSERT(MUTEX_HELD(&map->giptm_lock)); + ASSERT3U(lvl, <, root_lvl); + ASSERT3P(map->giptm_root, !=, NULL); + + do { + const uint64_t pva = (va & gipt_level_mask[clvl + 1]); + gipt_t *pt; + + pt = gipt_map_lookup(map, pva, clvl); + if (pt != NULL) { + ASSERT3U(pva, ==, pt->gipt_vaddr); + + if (child_pt != NULL) { + child_pt->gipt_parent = pt; + } + pts[count++] = pt; + return (count); + } + + pt = gipt_alloc(); + pt->gipt_vaddr = pva; + pt->gipt_level = clvl; + if (child_pt != NULL) { + child_pt->gipt_parent = pt; + } + + gipt_map_insert(map, pt); + child_pt = pt; + pts[count++] = pt; + clvl++; + } while (clvl <= root_lvl); + + return (count); +} + +/* + * Ensure that a page table covering a VA at a specified level exists. This + * will create any necessary tables chaining up to the root as well. 
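+ *
+ * A sketch of intended use when establishing a 4K mapping (the
+ * my_page_pte() encoder is hypothetical, left to the consumer):
+ *
+ *	pt = gipt_map_create_parents(map, va, 0);
+ *	if (pt != NULL) {
+ *		ptep = GIPT_VA2PTEP(pt, va);
+ *		*ptep = my_page_pte(pfn, prot);
+ *		pt->gipt_valid_cnt++;
+ *	}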
+ */
+gipt_t *
+gipt_map_create_parents(gipt_map_t *map, uint64_t va, uint_t lvl)
+{
+ gipt_t *pt, *pts[GIPT_MAX_LEVELS] = { 0 };
+ gipt_pte_type_cb_t pte_type = map->giptm_cbs.giptc_pte_type;
+ gipt_pte_map_cb_t pte_map = map->giptm_cbs.giptc_pte_map;
+ uint64_t *ptep;
+ uint_t i, count;
+
+ ASSERT(MUTEX_HELD(&map->giptm_lock));
+
+ count = gipt_map_ensure_chain(map, va, lvl, pts);
+ if (count == 1) {
+ /* Table already exists in the hierarchy */
+ return (pts[0]);
+ }
+ ASSERT3U(count, >, 1);
+
+ /* Make sure there is not already a large page mapping at the top */
+ pt = pts[count - 1];
+ if (pte_type(GIPT_VA2PTE(pt, va), pt->gipt_level) == PTET_PAGE) {
+ const uint_t end = count - 1;
+
+ /*
+ * Nuke those gipt_t entries which were optimistically created
+ * for what was found to be a conflicted mapping.
+ */
+ for (i = 0; i < end; i++) {
+ gipt_map_remove(map, pts[i]);
+ gipt_free(pts[i]);
+ }
+ return (NULL);
+ }
+
+ /* Initialize the appropriate tables from bottom to top */
+ for (i = 1; i < count; i++) {
+ pt = pts[i];
+ ptep = GIPT_VA2PTEP(pt, va);
+
+ /*
+ * Since gipt_map_ensure_chain() creates missing tables until
+ * it finds a valid one, and that existing table has been
+ * checked for the existence of a large page, nothing should
+ * occupy this PTE.
+ */
+ ASSERT3U(pte_type(*ptep, pt->gipt_level), ==, PTET_EMPTY);
+
+ *ptep = pte_map(pts[i - 1]->gipt_pfn);
+ pt->gipt_valid_cnt++;
+ }
+
+ return (pts[0]);
+}
+
+/*
+ * If a page table is empty, free it from the map, as well as any parent tables
+ * that would subsequently become empty as part of the clean-up. As noted in
+ * gipt_map_init(), the table root is a special case and will remain in the
+ * map, even when empty.
+ */
+void
+gipt_map_clean_parents(gipt_map_t *map, gipt_t *pt)
+{
+ ASSERT(MUTEX_HELD(&map->giptm_lock));
+
+ while (pt->gipt_valid_cnt == 0) {
+ gipt_t *parent = pt->gipt_parent;
+ uint64_t *ptep = GIPT_VA2PTEP(parent, pt->gipt_vaddr);
+
+ ASSERT3S(map->giptm_cbs.giptc_pte_type(*ptep,
+ parent->gipt_level), ==, PTET_LINK);
+
+ /*
+ * For now, it is assumed that all gipt consumers consider PTE
+ * zeroing as an adequate action for table unmap.
+ */
+ *ptep = 0;
+
+ parent->gipt_valid_cnt--;
+ gipt_map_remove(map, pt);
+ gipt_free(pt);
+ pt = parent;
+ }
+}
diff --git a/usr/src/uts/i86pc/sys/gipt.h b/usr/src/uts/i86pc/sys/gipt.h
new file mode 100644
index 0000000000..4d7d523726
--- /dev/null
+++ b/usr/src/uts/i86pc/sys/gipt.h
@@ -0,0 +1,92 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2019 Joyent, Inc.
+ */
+
+#ifndef _GIPT_H_
+#define _GIPT_H_
+
+#include <sys/types.h>
+#include <sys/mutex.h>
+#include <sys/param.h>
+#include <sys/list.h>
+
+struct gipt {
+ list_node_t gipt_node;
+ uint64_t gipt_vaddr;
+ uint64_t gipt_pfn;
+ uint16_t gipt_level;
+ uint16_t gipt_valid_cnt;
+ uint32_t _gipt_pad;
+ struct gipt *gipt_parent;
+ uint64_t *gipt_kva;
+ uint64_t _gipt_pad2;
+};
+typedef struct gipt gipt_t;
+
+typedef enum {
+ PTET_EMPTY = 0,
+ PTET_PAGE = 1,
+ PTET_LINK = 2,
+} gipt_pte_type_t;
+
+/* Given a PTE and its level, determine the type of that PTE */
+typedef gipt_pte_type_t (*gipt_pte_type_cb_t)(uint64_t, uint_t);
+/* Given the PFN of a child table, emit a PTE that references it */
+typedef uint64_t (*gipt_pte_map_cb_t)(uint64_t);
+
+struct gipt_cbs {
+ gipt_pte_type_cb_t giptc_pte_type;
+ gipt_pte_map_cb_t giptc_pte_map;
+};
+
+struct gipt_map {
+ kmutex_t giptm_lock;
+ gipt_t *giptm_root;
+ list_t *giptm_hash;
+ struct gipt_cbs giptm_cbs;
+ size_t giptm_table_cnt;
+ uint_t giptm_levels;
+};
+typedef struct gipt_map gipt_map_t;
+
+#define GIPT_HASH_SIZE_DEFAULT 0x2000
+#define GIPT_MAX_LEVELS 4
+
+#define GIPT_VA2IDX(pt, va) \
+ (((va) - (pt)->gipt_vaddr) >> \
+ gipt_level_shift[(pt)->gipt_level])
+
+#define GIPT_VA2PTE(pt, va) ((pt)->gipt_kva[GIPT_VA2IDX(pt, va)])
+#define GIPT_VA2PTEP(pt, va) (&(pt)->gipt_kva[GIPT_VA2IDX(pt, va)])
+
+extern const uint_t gipt_level_shift[GIPT_MAX_LEVELS+1];
+extern const uint64_t gipt_level_mask[GIPT_MAX_LEVELS+1];
+extern const uint64_t gipt_level_size[GIPT_MAX_LEVELS+1];
+extern const uint64_t gipt_level_count[GIPT_MAX_LEVELS+1];
+
+extern gipt_t *gipt_alloc(void);
+extern void gipt_free(gipt_t *);
+extern void gipt_map_init(gipt_map_t *, uint_t, uint_t,
+    const struct gipt_cbs *, gipt_t *);
+extern void gipt_map_fini(gipt_map_t *);
+extern gipt_t *gipt_map_lookup(gipt_map_t *, uint64_t, uint_t);
+extern gipt_t *gipt_map_lookup_deepest(gipt_map_t *, uint64_t);
+extern uint64_t gipt_map_next_page(gipt_map_t *, uint64_t, uint64_t,
+    gipt_t **);
+extern void gipt_map_insert(gipt_map_t *, gipt_t *);
+extern void gipt_map_remove(gipt_map_t *, gipt_t *);
+extern gipt_t *gipt_map_create_parents(gipt_map_t *, uint64_t, uint_t);
+extern void gipt_map_clean_parents(gipt_map_t *, gipt_t *);
+
+#endif /* _GIPT_H_ */
diff --git a/usr/src/uts/i86pc/sys/viona_io.h b/usr/src/uts/i86pc/sys/viona_io.h
index a4fb0f2527..a26cc00a55 100644
--- a/usr/src/uts/i86pc/sys/viona_io.h
+++ b/usr/src/uts/i86pc/sys/viona_io.h
@@ -11,6 +11,7 @@
 
 /*
  * Copyright 2013 Pluribus Networks Inc.
+ * Copyright 2017 Joyent, Inc.
  */
 
 #ifndef _VIONA_IO_H_
@@ -27,8 +28,8 @@
 #define VNA_IOC_TX_RING_KICK (VNA_IOC | 8)
 #define VNA_IOC_RX_INTR_CLR (VNA_IOC | 9)
 #define VNA_IOC_TX_INTR_CLR (VNA_IOC | 10)
-#define VNA_IOC_SET_FEATURES (VNA_IOC | 11)
-#define VNA_IOC_GET_FEATURES (VNA_IOC | 12)
+#define VNA_IOC_SET_FEATURES (VNA_IOC | 11)
+#define VNA_IOC_GET_FEATURES (VNA_IOC | 12)
 
 typedef struct vioc_create {
 datalink_id_t c_linkid;
diff --git a/usr/src/uts/i86pc/sys/vmm.h b/usr/src/uts/i86pc/sys/vmm.h
index e876ce748f..8a35d123c7 100644
--- a/usr/src/uts/i86pc/sys/vmm.h
+++ b/usr/src/uts/i86pc/sys/vmm.h
@@ -1,4 +1,6 @@
 /*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
@@ -23,7 +25,7 @@
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
* - * $FreeBSD: head/sys/amd64/include/vmm.h 273375 2014-10-21 07:10:43Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,18 +38,25 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. */ #ifndef _VMM_H_ #define _VMM_H_ +#include #include +#ifdef _KERNEL +SDT_PROVIDER_DECLARE(vmm); +#endif + enum vm_suspend_how { VM_SUSPEND_NONE, VM_SUSPEND_RESET, VM_SUSPEND_POWEROFF, VM_SUSPEND_HALT, + VM_SUSPEND_TRIPLEFAULT, VM_SUSPEND_LAST }; @@ -89,6 +98,16 @@ enum vm_reg_name { VM_REG_GUEST_GDTR, VM_REG_GUEST_EFER, VM_REG_GUEST_CR2, + VM_REG_GUEST_PDPTE0, + VM_REG_GUEST_PDPTE1, + VM_REG_GUEST_PDPTE2, + VM_REG_GUEST_PDPTE3, + VM_REG_GUEST_INTR_SHADOW, + VM_REG_GUEST_DR0, + VM_REG_GUEST_DR1, + VM_REG_GUEST_DR2, + VM_REG_GUEST_DR3, + VM_REG_GUEST_DR6, VM_REG_LAST }; @@ -108,31 +127,37 @@ enum x2apic_state { #define VM_INTINFO_HWEXCEPTION (3 << 8) #define VM_INTINFO_SWINTR (4 << 8) + #define VM_MAX_NAMELEN 32 #ifdef _KERNEL struct vm; struct vm_exception; -struct vm_memory_segment; struct seg_desc; struct vm_exit; struct vm_run; struct vhpet; struct vioapic; struct vlapic; +struct vmspace; +struct vm_object; struct vm_guest_paging; +struct pmap; -typedef int (*vmm_init_func_t)(void); +struct vm_eventinfo { + u_int *rptr; /* runblock cookie */ + int *sptr; /* suspend cookie */ + int *iptr; /* reqidle cookie */ +}; + +typedef int (*vmm_init_func_t)(int ipinum); typedef int (*vmm_cleanup_func_t)(void); -typedef void * (*vmi_init_func_t)(struct vm *vm); /* instance specific apis */ -typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip); +typedef void (*vmm_resume_func_t)(void); +typedef void * (*vmi_init_func_t)(struct vm *vm, struct pmap *pmap); +typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip, + struct pmap *pmap, struct vm_eventinfo *info); typedef void (*vmi_cleanup_func_t)(void *vmi); -typedef int (*vmi_mmap_set_func_t)(void *vmi, vm_paddr_t gpa, - vm_paddr_t hpa, size_t length, - vm_memattr_t attr, int prot, - boolean_t superpages_ok); -typedef vm_paddr_t (*vmi_mmap_get_func_t)(void *vmi, vm_paddr_t gpa); typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num, uint64_t *retval); typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num, @@ -143,26 +168,38 @@ typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num, struct seg_desc *desc); typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval); typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val); +typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max); +typedef void (*vmi_vmspace_free)(struct vmspace *vmspace); typedef struct vlapic * (*vmi_vlapic_init)(void *vmi, int vcpu); typedef void (*vmi_vlapic_cleanup)(void *vmi, struct vlapic *vlapic); +#ifndef __FreeBSD__ +typedef void (*vmi_savectx)(void *vmi, int vcpu); +typedef void (*vmi_restorectx)(void *vmi, int vcpu); +#endif struct vmm_ops { vmm_init_func_t init; /* module wide initialization */ vmm_cleanup_func_t cleanup; + vmm_resume_func_t resume; vmi_init_func_t vminit; /* vm-specific initialization */ vmi_run_func_t vmrun; vmi_cleanup_func_t vmcleanup; - vmi_mmap_set_func_t vmmmap_set; - vmi_mmap_get_func_t vmmmap_get; vmi_get_register_t vmgetreg; vmi_set_register_t vmsetreg; vmi_get_desc_t vmgetdesc; vmi_set_desc_t vmsetdesc; vmi_get_cap_t vmgetcap; vmi_set_cap_t vmsetcap; + vmi_vmspace_alloc vmspace_alloc; + vmi_vmspace_free vmspace_free; vmi_vlapic_init vlapic_init; 
vmi_vlapic_cleanup vlapic_cleanup; + +#ifndef __FreeBSD__ + vmi_savectx vmsavectx; + vmi_restorectx vmrestorectx; +#endif }; extern struct vmm_ops vmm_ops_intel; @@ -170,20 +207,41 @@ extern struct vmm_ops vmm_ops_amd; int vm_create(const char *name, struct vm **retvm); void vm_destroy(struct vm *vm); +int vm_reinit(struct vm *vm); const char *vm_name(struct vm *vm); -int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len); -#ifdef __FreeBSD__ +uint16_t vm_get_maxcpus(struct vm *vm); +void vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, + uint16_t *threads, uint16_t *maxcpus); +int vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, + uint16_t threads, uint16_t maxcpus); + +/* + * APIs that modify the guest memory map require all vcpus to be frozen. + */ +int vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t off, + size_t len, int prot, int flags); +int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem); +void vm_free_memseg(struct vm *vm, int ident); int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); -#endif int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len); -#ifndef __FreeBSD__ -vm_paddr_t vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t size); -#endif -void *vm_gpa_hold(struct vm *, vm_paddr_t gpa, size_t len, int prot, - void **cookie); +int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func); +int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func); + +/* + * APIs that inspect the guest memory map require only a *single* vcpu to + * be frozen. This acts like a read lock on the guest memory map since any + * modification requires *all* vcpus to be frozen. + */ +int vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, + vm_ooffset_t *segoff, size_t *len, int *prot, int *flags); +int vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, + struct vm_object **objptr); +vm_paddr_t vmm_sysmem_maxaddr(struct vm *vm); +void *vm_gpa_hold(struct vm *, int vcpuid, vm_paddr_t gpa, size_t len, + int prot, void **cookie); void vm_gpa_release(void *cookie); -int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase, - struct vm_memory_segment *seg); +bool vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa); + int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval); int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val); int vm_get_seg_desc(struct vm *vm, int vcpu, int reg, @@ -191,6 +249,7 @@ int vm_get_seg_desc(struct vm *vm, int vcpu, int reg, int vm_set_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc); int vm_run(struct vm *vm, struct vm_run *vmrun); +int vm_suspend(struct vm *vm, enum vm_suspend_how how); int vm_inject_nmi(struct vm *vm, int vcpu); int vm_nmi_pending(struct vm *vm, int vcpuid); void vm_nmi_clear(struct vm *vm, int vcpuid); @@ -206,10 +265,43 @@ int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state); int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state); int vm_apicid2vcpuid(struct vm *vm, int apicid); int vm_activate_cpu(struct vm *vm, int vcpu); -cpuset_t vm_active_cpus(struct vm *vm); +int vm_suspend_cpu(struct vm *vm, int vcpu); +int vm_resume_cpu(struct vm *vm, int vcpu); struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid); +void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip); +void vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip); +void vm_exit_runblock(struct vm *vm, int vcpuid, uint64_t rip); +void 
vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip); +void vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip); + +#ifdef _SYS__CPUSET_H_ +cpuset_t vm_active_cpus(struct vm *vm); +cpuset_t vm_debug_cpus(struct vm *vm); +cpuset_t vm_suspended_cpus(struct vm *vm); +#endif /* _SYS__CPUSET_H_ */ + +static __inline int +vcpu_runblocked(struct vm_eventinfo *info) +{ -typedef void (*vm_rendezvous_func_t)(struct vm *vm, int vcpuid, void *arg); + return (*info->rptr != 0); +} + +static __inline int +vcpu_suspended(struct vm_eventinfo *info) +{ + + return (*info->sptr); +} + +static __inline int +vcpu_reqidle(struct vm_eventinfo *info) +{ + + return (*info->iptr); +} + +int vcpu_debugged(struct vm *vm, int vcpuid); /* * Return 1 if device indicated by bus/slot/func is supposed to be a @@ -231,21 +323,43 @@ enum vcpu_state { int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state, bool from_idle); enum vcpu_state vcpu_get_state(struct vm *vm, int vcpu, int *hostcpu); +void vcpu_block_run(struct vm *, int); +void vcpu_unblock_run(struct vm *, int); + +#ifndef __FreeBSD__ +uint64_t vcpu_tsc_offset(struct vm *vm, int vcpuid); +#endif -static int __inline +static __inline int vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu) { return (vcpu_get_state(vm, vcpu, hostcpu) == VCPU_RUNNING); } +#ifdef _SYS_THREAD_H +static __inline int +vcpu_should_yield(struct vm *vm, int vcpu) +{ + + if (curthread->t_astflag) + return (1); + else if (CPU->cpu_runrun) + return (1); + else + return (0); +} +#endif /* _SYS_THREAD_H */ + void *vcpu_stats(struct vm *vm, int vcpu); -void vm_interrupt_hostcpu(struct vm *vm, int vcpu); void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr); +struct vmspace *vm_get_vmspace(struct vm *vm); struct vatpic *vm_atpic(struct vm *vm); struct vatpit *vm_atpit(struct vm *vm); +struct vpmtmr *vm_pmtmr(struct vm *vm); +struct vrtc *vm_rtc(struct vm *vm); /* - * Inject exception 'vme' into the guest vcpu. This function returns 0 on + * Inject exception 'vector' into the guest vcpu. This function returns 0 on * success and non-zero on failure. * * Wrapper functions like 'vm_inject_gp()' should be preferred to calling @@ -255,7 +369,8 @@ struct vatpit *vm_atpit(struct vm *vm); * This function should only be called in the context of the thread that is * executing this vcpu. */ -int vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *vme); +int vm_inject_exception(struct vm *vm, int vcpuid, int vector, int err_valid, + uint32_t errcode, int restart_instruction); /* * This function is called after a VM-exit that occurred during exception or @@ -298,9 +413,10 @@ struct vm_copyinfo { * at 'gla' and 'len' bytes long. The 'prot' should be set to PROT_READ for * a copyin or PROT_WRITE for a copyout. * - * Returns 0 on success. - * Returns 1 if an exception was injected into the guest. - * Returns -1 otherwise. + * retval is_fault Interpretation + * 0 0 Success + * 0 1 An exception was injected into the guest + * EFAULT N/A Unrecoverable error * * The 'copyinfo[]' can be passed to 'vm_copyin()' or 'vm_copyout()' only if * the return value is 0. 
The 'copyinfo[]' resources should be freed by calling @@ -308,16 +424,18 @@ struct vm_copyinfo { */ int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, - int num_copyinfo); + int num_copyinfo, int *is_fault); void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, int num_copyinfo); void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr, size_t len); void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, struct vm_copyinfo *copyinfo, size_t len); + +int vcpu_trace_exceptions(struct vm *vm, int vcpuid); #endif /* KERNEL */ -#define VM_MAXCPU 16 /* maximum virtual cpus */ +#define VM_MAXCPU 32 /* maximum virtual cpus */ /* * Identifiers for optional vmm capabilities @@ -348,7 +466,6 @@ struct seg_desc { uint32_t limit; uint32_t access; }; - #define SEG_DESC_TYPE(access) ((access) & 0x001f) #define SEG_DESC_DPL(access) (((access) >> 5) & 0x3) #define SEG_DESC_PRESENT(access) (((access) & 0x0080) ? 1 : 0) @@ -443,7 +560,20 @@ enum vm_exitcode { VM_EXITCODE_INST_EMUL, VM_EXITCODE_SPINUP_AP, VM_EXITCODE_DEPRECATED1, /* used to be SPINDOWN_CPU */ + VM_EXITCODE_RUNBLOCK, + VM_EXITCODE_IOAPIC_EOI, + VM_EXITCODE_SUSPENDED, VM_EXITCODE_INOUT_STR, + VM_EXITCODE_TASK_SWITCH, + VM_EXITCODE_MONITOR, + VM_EXITCODE_MWAIT, + VM_EXITCODE_SVM, + VM_EXITCODE_REQIDLE, + VM_EXITCODE_DEBUG, + VM_EXITCODE_VMINSN, +#ifndef __FreeBSD__ + VM_EXITCODE_HT, +#endif VM_EXITCODE_MAX }; @@ -468,6 +598,22 @@ struct vm_inout_str { struct seg_desc seg_desc; }; +enum task_switch_reason { + TSR_CALL, + TSR_IRET, + TSR_JMP, + TSR_IDT_GATE, /* task gate in IDT */ +}; + +struct vm_task_switch { + uint16_t tsssel; /* new TSS selector */ + int ext; /* task switch due to external event */ + uint32_t errcode; + int errcode_valid; /* push 'errcode' on the new stack */ + enum task_switch_reason reason; + struct vm_guest_paging paging; +}; + struct vm_exit { enum vm_exitcode exitcode; int inst_length; /* 0 means unknown */ @@ -506,6 +652,14 @@ struct vm_exit { int inst_type; int inst_error; } vmx; + /* + * SVM specific payload. + */ + struct { + uint64_t exitcode; + uint64_t exitinfo1; + uint64_t exitinfo2; + } svm; struct { uint32_t code; /* ecx value */ uint64_t wval; @@ -516,7 +670,15 @@ struct vm_exit { } spinup_ap; struct { uint64_t rflags; + uint64_t intr_status; } hlt; + struct { + int vector; + } ioapic_eoi; + struct { + enum vm_suspend_how how; + } suspended; + struct vm_task_switch task_switch; } u; }; @@ -554,12 +716,28 @@ int vm_restart_instruction(void *vm, int vcpuid); #ifndef __FreeBSD__ #ifdef _KERNEL -extern void vmm_sol_glue_init(void); -extern void vmm_sol_glue_cleanup(void); -extern int vmm_mod_load(void); -extern int vmm_mod_unload(void); -#endif -#endif +void vmm_sol_glue_init(void); +void vmm_sol_glue_cleanup(void); + +int vmm_mod_load(void); +int vmm_mod_unload(void); + +void vmm_call_trap(uint64_t); + +/* + * Because of tangled headers, these are mirrored by vmm_drv.h to present the + * interface to driver consumers. 
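+ *
+ * A sketch of a hypothetical hook consumer (names assumed; error handling
+ * elided):
+ *
+ *	static int
+ *	my_rmem(void *arg, uintptr_t addr, uint_t size, uint64_t *valp)
+ *	{
+ *		*valp = 0;
+ *		return (0);
+ *	}
+ *
+ *	(void) vm_ioport_hook(vm, port, my_rmem, my_wmem, arg, &cookie);
+ *	...
+ *	vm_ioport_unhook(vm, &cookie);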
+ */ +typedef int (*vmm_rmem_cb_t)(void *, uintptr_t, uint_t, uint64_t *); +typedef int (*vmm_wmem_cb_t)(void *, uintptr_t, uint_t, uint64_t); + +int vm_ioport_hook(struct vm *, uint_t, vmm_rmem_cb_t, vmm_wmem_cb_t, void *, + void **); +void vm_ioport_unhook(struct vm *, void **); +int vm_ioport_handle_hook(struct vm *, int, bool, int, int, uint32_t *); + +#endif /* _KERNEL */ +#endif /* __FreeBSD */ #endif /* _VMM_H_ */ diff --git a/usr/src/uts/i86pc/sys/vmm_dev.h b/usr/src/uts/i86pc/sys/vmm_dev.h index 3e74eb8786..58e581a60d 100644 --- a/usr/src/uts/i86pc/sys/vmm_dev.h +++ b/usr/src/uts/i86pc/sys/vmm_dev.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/include/vmm_dev.h 268889 2014-07-19 20:59:08Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,20 +38,30 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. */ #ifndef _VMM_DEV_H_ #define _VMM_DEV_H_ -#ifdef _KERNEL -void vmmdev_init(void); -int vmmdev_cleanup(void); -#endif +#include + +struct vm_memmap { + vm_paddr_t gpa; + int segid; /* memory segment */ + vm_ooffset_t segoff; /* offset into memory segment */ + size_t len; /* mmap length */ + int prot; /* RWX */ + int flags; +}; +#define VM_MEMMAP_F_WIRED 0x01 +#define VM_MEMMAP_F_IOMMU 0x02 -struct vm_memory_segment { - vm_paddr_t gpa; /* in */ +#define VM_MEMSEG_NAME(m) ((m)->name[0] != '\0' ? (m)->name : NULL) +struct vm_memseg { + int segid; size_t len; - int wired; + char name[SPECNAMELEN + 1]; }; struct vm_register { @@ -64,6 +76,13 @@ struct vm_seg_desc { /* data or code segment */ struct seg_desc desc; }; +struct vm_register_set { + int cpuid; + unsigned int count; + const int *regnums; /* enum vm_reg_name */ + uint64_t *regvals; +}; + struct vm_run { int cpuid; struct vm_exit vm_exit; @@ -129,7 +148,7 @@ struct vm_pptdev_msi { int slot; int func; int numvec; /* 0 means disabled */ - uint32_t msg; + uint64_t msg; uint64_t addr; }; @@ -139,7 +158,7 @@ struct vm_pptdev_msix { int slot; int func; int idx; - uint32_t msg; + uint64_t msg; uint32_t vector_control; uint64_t addr; }; @@ -148,7 +167,12 @@ struct vm_nmi { int cpuid; }; +#ifdef __FreeBSD__ #define MAX_VM_STATS 64 +#else +#define MAX_VM_STATS (64 + VM_MAXCPU) +#endif + struct vm_stats { int cpuid; /* in */ int num_entries; /* out */ @@ -176,8 +200,8 @@ struct vm_hpet_cap { uint32_t capabilities; /* lower 32 bits of HPET capabilities */ }; -struct vm_activate_cpu { - int vcpuid; +struct vm_suspend { + enum vm_suspend_how how; }; struct vm_gla2gpa { @@ -189,13 +213,51 @@ struct vm_gla2gpa { uint64_t gpa; }; +struct vm_activate_cpu { + int vcpuid; +}; + struct vm_cpuset { int which; int cpusetsize; +#ifndef _KERNEL cpuset_t *cpus; +#else + void *cpus; +#endif }; #define VM_ACTIVE_CPUS 0 #define VM_SUSPENDED_CPUS 1 +#define VM_DEBUG_CPUS 2 + +struct vm_intinfo { + int vcpuid; + uint64_t info1; + uint64_t info2; +}; + +struct vm_rtc_time { + time_t secs; +}; + +struct vm_rtc_data { + int offset; + uint8_t value; +}; + +#ifndef __FreeBSD__ +struct vm_devmem_offset { + int segid; + off_t offset; +}; +#endif + +struct vm_cpu_topology { + uint16_t sockets; + uint16_t cores; + uint16_t threads; + uint16_t maxcpus; +}; enum { /* general routines */ @@ -203,20 +265,31 @@ enum { IOCNUM_RUN = 1, 
IOCNUM_SET_CAPABILITY = 2, IOCNUM_GET_CAPABILITY = 3, + IOCNUM_SUSPEND = 4, + IOCNUM_REINIT = 5, /* memory apis */ - IOCNUM_MAP_MEMORY = 10, - IOCNUM_GET_MEMORY_SEG = 11, + IOCNUM_MAP_MEMORY = 10, /* deprecated */ + IOCNUM_GET_MEMORY_SEG = 11, /* deprecated */ IOCNUM_GET_GPA_PMAP = 12, IOCNUM_GLA2GPA = 13, + IOCNUM_ALLOC_MEMSEG = 14, + IOCNUM_GET_MEMSEG = 15, + IOCNUM_MMAP_MEMSEG = 16, + IOCNUM_MMAP_GETNEXT = 17, + IOCNUM_GLA2GPA_NOFAULT = 18, /* register/state accessors */ IOCNUM_SET_REGISTER = 20, IOCNUM_GET_REGISTER = 21, IOCNUM_SET_SEGMENT_DESCRIPTOR = 22, IOCNUM_GET_SEGMENT_DESCRIPTOR = 23, + IOCNUM_SET_REGISTER_SET = 24, + IOCNUM_GET_REGISTER_SET = 25, /* interrupt injection */ + IOCNUM_GET_INTINFO = 28, + IOCNUM_SET_INTINFO = 29, IOCNUM_INJECT_EXCEPTION = 30, IOCNUM_LAPIC_IRQ = 31, IOCNUM_INJECT_NMI = 32, @@ -244,6 +317,10 @@ enum { IOCNUM_GET_X2APIC_STATE = 61, IOCNUM_GET_HPET_CAPABILITIES = 62, + /* CPU Topology */ + IOCNUM_SET_TOPOLOGY = 63, + IOCNUM_GET_TOPOLOGY = 64, + /* legacy interrupt injection */ IOCNUM_ISA_ASSERT_IRQ = 80, IOCNUM_ISA_DEASSERT_IRQ = 81, @@ -253,14 +330,36 @@ enum { /* vm_cpuset */ IOCNUM_ACTIVATE_CPU = 90, IOCNUM_GET_CPUSET = 91, + IOCNUM_SUSPEND_CPU = 92, + IOCNUM_RESUME_CPU = 93, + + /* RTC */ + IOCNUM_RTC_READ = 100, + IOCNUM_RTC_WRITE = 101, + IOCNUM_RTC_SETTIME = 102, + IOCNUM_RTC_GETTIME = 103, + +#ifndef __FreeBSD__ + /* illumos-custom ioctls */ + IOCNUM_DEVMEM_GETOFFSET = 256, + IOCNUM_WRLOCK_CYCLE = 257, +#endif }; #define VM_RUN \ _IOWR('v', IOCNUM_RUN, struct vm_run) -#define VM_MAP_MEMORY \ - _IOWR('v', IOCNUM_MAP_MEMORY, struct vm_memory_segment) -#define VM_GET_MEMORY_SEG \ - _IOWR('v', IOCNUM_GET_MEMORY_SEG, struct vm_memory_segment) +#define VM_SUSPEND \ + _IOW('v', IOCNUM_SUSPEND, struct vm_suspend) +#define VM_REINIT \ + _IO('v', IOCNUM_REINIT) +#define VM_ALLOC_MEMSEG \ + _IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg) +#define VM_GET_MEMSEG \ + _IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg) +#define VM_MMAP_MEMSEG \ + _IOW('v', IOCNUM_MMAP_MEMSEG, struct vm_memmap) +#define VM_MMAP_GETNEXT \ + _IOWR('v', IOCNUM_MMAP_GETNEXT, struct vm_memmap) #define VM_SET_REGISTER \ _IOW('v', IOCNUM_SET_REGISTER, struct vm_register) #define VM_GET_REGISTER \ @@ -269,6 +368,10 @@ enum { _IOW('v', IOCNUM_SET_SEGMENT_DESCRIPTOR, struct vm_seg_desc) #define VM_GET_SEGMENT_DESCRIPTOR \ _IOWR('v', IOCNUM_GET_SEGMENT_DESCRIPTOR, struct vm_seg_desc) +#define VM_SET_REGISTER_SET \ + _IOW('v', IOCNUM_SET_REGISTER_SET, struct vm_register_set) +#define VM_GET_REGISTER_SET \ + _IOWR('v', IOCNUM_GET_REGISTER_SET, struct vm_register_set) #define VM_INJECT_EXCEPTION \ _IOW('v', IOCNUM_INJECT_EXCEPTION, struct vm_exception) #define VM_LAPIC_IRQ \ @@ -309,10 +412,8 @@ enum { _IOW('v', IOCNUM_PPTDEV_MSIX, struct vm_pptdev_msix) #define VM_INJECT_NMI \ _IOW('v', IOCNUM_INJECT_NMI, struct vm_nmi) -#ifdef __FreeBSD__ -#define VM_STATS \ +#define VM_STATS_IOC \ _IOWR('v', IOCNUM_VM_STATS, struct vm_stats) -#endif #define VM_STAT_DESC \ _IOWR('v', IOCNUM_VM_STAT_DESC, struct vm_stat_desc) #define VM_SET_X2APIC_STATE \ @@ -321,14 +422,52 @@ enum { _IOWR('v', IOCNUM_GET_X2APIC_STATE, struct vm_x2apic) #define VM_GET_HPET_CAPABILITIES \ _IOR('v', IOCNUM_GET_HPET_CAPABILITIES, struct vm_hpet_cap) +#define VM_SET_TOPOLOGY \ + _IOW('v', IOCNUM_SET_TOPOLOGY, struct vm_cpu_topology) +#define VM_GET_TOPOLOGY \ + _IOR('v', IOCNUM_GET_TOPOLOGY, struct vm_cpu_topology) #define VM_GET_GPA_PMAP \ _IOWR('v', IOCNUM_GET_GPA_PMAP, struct vm_gpa_pte) #define VM_GLA2GPA \ _IOWR('v', 
IOCNUM_GLA2GPA, struct vm_gla2gpa) +#define VM_GLA2GPA_NOFAULT \ + _IOWR('v', IOCNUM_GLA2GPA_NOFAULT, struct vm_gla2gpa) #define VM_ACTIVATE_CPU \ _IOW('v', IOCNUM_ACTIVATE_CPU, struct vm_activate_cpu) #define VM_GET_CPUS \ _IOW('v', IOCNUM_GET_CPUSET, struct vm_cpuset) +#define VM_SUSPEND_CPU \ + _IOW('v', IOCNUM_SUSPEND_CPU, struct vm_activate_cpu) +#define VM_RESUME_CPU \ + _IOW('v', IOCNUM_RESUME_CPU, struct vm_activate_cpu) +#define VM_SET_INTINFO \ + _IOW('v', IOCNUM_SET_INTINFO, struct vm_intinfo) +#define VM_GET_INTINFO \ + _IOWR('v', IOCNUM_GET_INTINFO, struct vm_intinfo) +#define VM_RTC_WRITE \ + _IOW('v', IOCNUM_RTC_WRITE, struct vm_rtc_data) +#define VM_RTC_READ \ + _IOWR('v', IOCNUM_RTC_READ, struct vm_rtc_data) +#define VM_RTC_SETTIME \ + _IOW('v', IOCNUM_RTC_SETTIME, struct vm_rtc_time) +#define VM_RTC_GETTIME \ + _IOR('v', IOCNUM_RTC_GETTIME, struct vm_rtc_time) #define VM_RESTART_INSTRUCTION \ _IOW('v', IOCNUM_RESTART_INSTRUCTION, int) + +#ifndef __FreeBSD__ +#define VM_DEVMEM_GETOFFSET \ + _IOW('v', IOCNUM_DEVMEM_GETOFFSET, struct vm_devmem_offset) +#define VM_WRLOCK_CYCLE _IO('v', IOCNUM_WRLOCK_CYCLE) + +/* ioctls used against ctl device for vm create/destroy */ +#define VMM_IOC_BASE (('V' << 16) | ('M' << 8)) +#define VMM_CREATE_VM (VMM_IOC_BASE | 0x01) +#define VMM_DESTROY_VM (VMM_IOC_BASE | 0x02) +#define VMM_VM_SUPPORTED (VMM_IOC_BASE | 0x03) + +#define VMM_CTL_DEV "/dev/vmmctl" + +#endif + #endif diff --git a/usr/src/uts/i86pc/sys/vmm_drv.h b/usr/src/uts/i86pc/sys/vmm_drv.h new file mode 100644 index 0000000000..33fefc10ea --- /dev/null +++ b/usr/src/uts/i86pc/sys/vmm_drv.h @@ -0,0 +1,50 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#ifndef _VMM_DRV_H_ +#define _VMM_DRV_H_ + +#ifdef _KERNEL +struct vmm_hold; +typedef struct vmm_hold vmm_hold_t; + +struct vmm_lease; +typedef struct vmm_lease vmm_lease_t; + +/* + * Because of tangled headers, these definitions mirror their vmm_[rw]mem_cb_t + * counterparts in vmm.h. + */ +typedef int (*vmm_drv_rmem_cb_t)(void *, uintptr_t, uint_t, uint64_t *); +typedef int (*vmm_drv_wmem_cb_t)(void *, uintptr_t, uint_t, uint64_t); + +extern int vmm_drv_hold(file_t *, cred_t *, vmm_hold_t **); +extern void vmm_drv_rele(vmm_hold_t *); +extern boolean_t vmm_drv_release_reqd(vmm_hold_t *); + +extern vmm_lease_t *vmm_drv_lease_sign(vmm_hold_t *, boolean_t (*)(void *), + void *); +extern void vmm_drv_lease_break(vmm_hold_t *, vmm_lease_t *); +extern boolean_t vmm_drv_lease_expired(vmm_lease_t *); + +extern void *vmm_drv_gpa2kva(vmm_lease_t *, uintptr_t, size_t); +extern int vmm_drv_msi(vmm_lease_t *, uint64_t, uint64_t); + +extern int vmm_drv_ioport_hook(vmm_hold_t *, uint_t, vmm_drv_rmem_cb_t, + vmm_drv_wmem_cb_t, void *, void **); +extern void vmm_drv_ioport_unhook(vmm_hold_t *, void **); +#endif /* _KERNEL */ + +#endif /* _VMM_DRV_H_ */ diff --git a/usr/src/uts/i86pc/sys/vmm_impl.h b/usr/src/uts/i86pc/sys/vmm_impl.h index 1602fa286d..cdc56cc464 100644 --- a/usr/src/uts/i86pc/sys/vmm_impl.h +++ b/usr/src/uts/i86pc/sys/vmm_impl.h @@ -11,76 +11,79 @@ /* * Copyright 2014 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. 
 */
 
 #ifndef _VMM_IMPL_H_
-#define _VMM_IMPL_H_
+#define _VMM_IMPL_H_
 
 #include <sys/mutex.h>
 #include <sys/queue.h>
 #include <sys/varargs.h>
+#include <sys/zone.h>
+
+#ifdef _KERNEL
+
+#define VMM_CTL_MINOR 0
 
 /*
- * /dev names:
- * /dev/vmmctl - control device
- * /dev/vmm/ - vm devices
+ * Rather than creating whole character devices for devmem mappings, they are
+ * available by mmap(2)ing the vmm handle at a specific offset. These offsets
+ * begin just above the maximum allowed guest physical address.
 */
-#define VMM_DRIVER_NAME "vmm"
+#include
+#define VM_DEVMEM_START (VM_MAXUSER_ADDRESS + 1)
 
-#define VMM_CTL_MINOR_NODE "ctl"
-#define VMM_CTL_MINOR_NAME VMM_DRIVER_NAME VMM_CTL_NODE
-#define VMM_CTL_MINOR 0
-
-#define VMM_IOC_BASE (('V' << 16) | ('M' << 8))
+struct vmm_devmem_entry {
+ list_node_t vde_node;
+ int vde_segid;
+ char vde_name[SPECNAMELEN + 1];
+ size_t vde_len;
+ off_t vde_off;
+};
+typedef struct vmm_devmem_entry vmm_devmem_entry_t;
 
-#define VMM_CREATE_VM (VMM_IOC_BASE | 0x01)
-#define VMM_DESTROY_VM (VMM_IOC_BASE | 0x02)
+typedef struct vmm_zsd vmm_zsd_t;
 
-struct vmm_ioctl {
- char vmm_name[VM_MAX_NAMELEN];
+enum vmm_softc_state {
+ VMM_HELD = 1, /* external driver(s) possess hold on the VM */
+ VMM_CLEANUP = 2, /* request that holds are released */
+ VMM_PURGED = 4, /* all holds have been released */
+ VMM_BLOCK_HOOK = 8, /* mem hook install temporarily blocked */
+ VMM_DESTROY = 16 /* VM is destroyed, softc still around */
 };
 
-#ifdef _KERNEL
 struct vmm_softc {
- boolean_t open;
- minor_t minor;
- struct vm *vm;
- char name[VM_MAX_NAMELEN];
- SLIST_ENTRY(vmm_softc) link;
-};
-#endif
+ list_node_t vmm_node;
+ struct vm *vmm_vm;
+ minor_t vmm_minor;
+ char vmm_name[VM_MAX_NAMELEN];
+ list_t vmm_devmem_list;
 
-/*
- * VMM trace ring buffer constants
- */
-#define VMM_DMSG_RING_SIZE 0x100000 /* 1MB */
-#define VMM_DMSG_BUF_SIZE 256
+ kcondvar_t vmm_cv;
+ list_t vmm_holds;
+ uint_t vmm_flags;
+ boolean_t vmm_is_open;
 
-/*
- * VMM trace ring buffer content
- */
-typedef struct vmm_trace_dmsg {
- timespec_t timestamp;
- char buf[VMM_DMSG_BUF_SIZE];
- struct vmm_trace_dmsg *next;
-} vmm_trace_dmsg_t;
+ kmutex_t vmm_lease_lock;
+ list_t vmm_lease_list;
+ uint_t vmm_lease_blocker;
+ kcondvar_t vmm_lease_cv;
+ krwlock_t vmm_rwlock;
 
-/*
- * VMM trace ring buffer header
- */
-typedef struct vmm_trace_rbuf {
- kmutex_t lock; /* lock to avoid clutter */
- int looped; /* completed ring */
- int allocfailed; /* dmsg mem alloc failed */
- size_t size; /* current size */
- size_t maxsize; /* max size */
- vmm_trace_dmsg_t *dmsgh; /* messages head */
- vmm_trace_dmsg_t *dmsgp; /* ptr to last message */
-} vmm_trace_rbuf_t;
+ /* For zone specific data */
+ list_node_t vmm_zsd_linkage;
+ zone_t *vmm_zone;
+ vmm_zsd_t *vmm_zsd;
+};
+typedef struct vmm_softc vmm_softc_t;
 
-/*
- * VMM trace ring buffer interfaces
- */
-void vmm_trace_log(const char *fmt, ...);
+void vmm_zsd_init(void);
+void vmm_zsd_fini(void);
+int vmm_zsd_add_vm(vmm_softc_t *sc);
+void vmm_zsd_rem_vm(vmm_softc_t *sc);
+int vmm_do_vm_destroy(vmm_softc_t *, boolean_t);
+
+#endif /* _KERNEL */
 
 #endif /* _VMM_IMPL_H_ */
diff --git a/usr/src/uts/i86pc/sys/vmm_instruction_emul.h b/usr/src/uts/i86pc/sys/vmm_instruction_emul.h
index 8138890a2c..f10f407164 100644
--- a/usr/src/uts/i86pc/sys/vmm_instruction_emul.h
+++ b/usr/src/uts/i86pc/sys/vmm_instruction_emul.h
@@ -1,4 +1,6 @@
 /*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
 * Copyright (c) 2012 NetApp, Inc.
 * All rights reserved.
 *
@@ -23,7 +25,7 @@
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
* - * $FreeBSD: head/sys/amd64/include/vmm_instruction_emul.h 276479 2014-12-31 20:31:32Z dim $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -93,17 +95,26 @@ int vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, */ int vmm_fetch_instruction(struct vm *vm, int cpuid, struct vm_guest_paging *guest_paging, - uint64_t rip, int inst_length, struct vie *vie); + uint64_t rip, int inst_length, struct vie *vie, + int *is_fault); /* * Translate the guest linear address 'gla' to a guest physical address. * - * Returns 0 on success and '*gpa' contains the result of the translation. - * Returns 1 if an exception was injected into the guest. - * Returns -1 otherwise. + * retval is_fault Interpretation + * 0 0 'gpa' contains result of the translation + * 0 1 An exception was injected into the guest + * EFAULT N/A An unrecoverable hypervisor error occurred */ int vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, - uint64_t gla, int prot, uint64_t *gpa); + uint64_t gla, int prot, uint64_t *gpa, int *is_fault); + +/* + * Like vm_gla2gpa, but no exceptions are injected into the guest and + * PTEs are not changed. + */ +int vm_gla2gpa_nofault(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *is_fault); void vie_init(struct vie *vie, const char *inst_bytes, int inst_length); diff --git a/usr/src/uts/i86pc/viona/Makefile b/usr/src/uts/i86pc/viona/Makefile index c2b8bd8dcf..4ede5bbd84 100644 --- a/usr/src/uts/i86pc/viona/Makefile +++ b/usr/src/uts/i86pc/viona/Makefile @@ -11,6 +11,7 @@ # # Copyright 2013 Pluribus Networks Inc. +# Copyright 2017 Joyent, Inc. # # @@ -22,7 +23,7 @@ UTSBASE = ../.. # Define the module and object file sets. # MODULE = viona -OBJECTS = $(VIONA_OBJS:%=$(OBJS_DIR)/%) +OBJECTS = $(VIONA_OBJS:%=$(OBJS_DIR)/%) LINTS = $(VIONA_OBJS:%.o=$(LINTS_DIR)/%.ln) ROOTMODULE = $(USR_DRV_DIR)/$(MODULE) CONF_SRCDIR = $(UTSBASE)/i86pc/io/viona @@ -42,6 +43,12 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) # # Overrides # +LINTTAGS += -erroff=E_ASSIGN_NARROW_CONV +LINTTAGS += -erroff=E_FUNC_ARG_UNUSED +LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN +LINTTAGS += -erroff=E_FUNC_RET_MAYBE_IGNORED2 +LINTTAGS += -erroff=E_FUNC_RET_ALWAYS_IGNOR2 + CFLAGS += $(CCVERBOSE) LDFLAGS += -dy -Ndrv/dld -Nmisc/mac -Nmisc/dls -Ndrv/vmm diff --git a/usr/src/uts/i86pc/vmm/Makefile b/usr/src/uts/i86pc/vmm/Makefile index b3ab735781..5b93db987b 100644 --- a/usr/src/uts/i86pc/vmm/Makefile +++ b/usr/src/uts/i86pc/vmm/Makefile @@ -11,6 +11,7 @@ # # Copyright 2013 Pluribus Networks Inc. +# Copyright 2019 Joyent, Inc. # # @@ -26,6 +27,7 @@ OBJECTS = $(VMM_OBJS:%=$(OBJS_DIR)/%) LINTS = $(VMM_OBJS:%.o=$(LINTS_DIR)/%.ln) ROOTMODULE = $(USR_DRV_DIR)/$(MODULE) CONF_SRCDIR = $(UTSBASE)/i86pc/io/vmm +MAPFILE = $(UTSBASE)/i86pc/io/vmm/vmm.mapfile # # Include common rules. 
@@ -42,11 +44,52 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) # # Overrides and additions # +LINTTAGS += -erroff=E_EMPTY_DECLARATION +LINTTAGS += -erroff=E_OPERANDS_INCOMPATIBLE_TYPES +LINTTAGS += -erroff=E_VOID_CANT_RETURN_VALUE +LINTTAGS += -erroff=E_YACC_ERROR +LINTTAGS += -erroff=E_STATIC_UNUSED +LINTTAGS += -erroff=E_FUNC_RET_MAYBE_IGNORED2 +LINTTAGS += -erroff=E_FUNC_RET_ALWAYS_IGNOR2 +LINTTAGS += -erroff=E_BAD_FORMAT_ARG_TYPE2 +LINTTAGS += -erroff=E_FUNC_ARG_UNUSED +LINTTAGS += -erroff=E_FUNC_SET_NOT_USED +LINTTAGS += -erroff=E_ASSIGN_NARROW_CONV +LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN +LINTTAGS += -erroff=E_CONSTANT_CONDITION +LINTTAGS += -erroff=E_PTR_TO_VOID_IN_ARITHMETIC +LINTTAGS += -erroff=E_CONST_TRUNCATED_BY_ASSIGN +LINTTAGS += -erroff=E_NOP_ELSE_STMT +LINTTAGS += -erroff=E_FUNC_NO_RET_VAL +LINTTAGS += -erroff=E_OLD_STYLE_DECL_OR_BAD_TYPE +LINTTAGS += -erroff=E_VAR_USED_BEFORE_SET +LINTTAGS += -erroff=E_INTEGER_OVERFLOW_DETECTED +LINTTAGS += -erroff=E_STMT_NOT_REACHED +LINTTAGS += -erroff=E_FUNC_NO_RET_VAL +LINTTAGS += -erroff=E_USELESS_DECLARATION +LINTTAGS += -erroff=E_EXPR_NULL_EFFECT +LINTTAGS += -erroff=E_CASE_FALLTHRU +LINTTAGS += -erroff=E_FUNC_DECL_VAR_ARG2 +LINTTAGS += -erroff=E_ASM_IMPOSSIBLE_CONSTRAINT +LINTTAGS += -erroff=E_ASM_UNUSED_PARAM +LINTTAGS += -erroff=E_NOP_IF_STMT +LINTTAGS += -erroff=E_ZERO_OR_NEGATIVE_SUBSCRIPT + +CERRWARN += -_gcc=-Wno-empty-body + +# 3rd party code +SMOFF += all_func_returns + +# needs work +$(OBJS_DIR)/vmm_sol_dev.o := SMOFF += signed_integer_overflow_check + +# a can't happen: vmx_setcap() warn: variable dereferenced before check 'pptr' +$(OBJS_DIR)/vmx.o := SMOFF += deref_check # These sources only compile with gcc. Workaround a confluence of cruft # regarding dmake and shadow compilation by neutering the sun compiler. 
-amd64_CC = $(ONBLD_TOOLS)/bin/$(MACH)/cw -_gcc -CFLAGS += -_cc=-xdryrun +#amd64_CC = $(ONBLD_TOOLS)/bin/$(MACH)/cw -_gcc +#CFLAGS += -_cc=-xdryrun ALL_BUILDS = $(ALL_BUILDSONLY64) DEF_BUILDS = $(DEF_BUILDSONLY64) @@ -56,9 +99,23 @@ INC_PATH += -I$(UTSBASE)/i86pc/io/vmm -I$(UTSBASE)/i86pc/io/vmm/io AS_INC_PATH += -I$(UTSBASE)/i86pc/io/vmm -I$(OBJS_DIR) CFLAGS += -_gcc=-Wimplicit-function-declaration +# The FreeBSD %# notation makes gcc gripe +CFLAGS += -_gcc=-Wno-format +# enable collection of VMM statistics +CFLAGS += -DVMM_KEEP_STATS -OFFSETS_SRC = $(CONF_SRCDIR)/offsets.in -ASSYM_H = $(OBJS_DIR)/vmx_assym.h +LDFLAGS += -Nfs/dev + +$(OBJS_DIR)/vmm.o := CERRWARN += -_gcc=-Wno-pointer-sign -_gcc=-Wno-type-limits +$(OBJS_DIR)/svm.o := CERRWARN += -_gcc=-Wno-pointer-sign -_gcc=-Wno-type-limits + +LDFLAGS += -z type=kmod -M $(MAPFILE) + +OFFSETS_VMX = $(CONF_SRCDIR)/intel/offsets.in +OFFSETS_SVM = $(CONF_SRCDIR)/amd/offsets.in +ASSYM_VMX = $(OBJS_DIR)/vmx_assym.h +ASSYM_SVM = $(OBJS_DIR)/svm_assym.h +ASSYM_H = $(ASSYM_VMX) $(ASSYM_SVM) CLEANFILES += $(ASSYM_H) @@ -88,7 +145,10 @@ install: $(INSTALL_DEPS) # include $(UTSBASE)/i86pc/Makefile.targ -$(OBJECTS): $(ASSYM_H) +$(ASSYM_VMX): $(OFFSETS_VMX) $(GENASSYM) + $(OFFSETS_CREATE) -I../../i86pc/io/vmm < $(OFFSETS_VMX) >$@ +$(ASSYM_SVM): $(OFFSETS_SVM) $(GENASSYM) + $(OFFSETS_CREATE) -I../../i86pc/io/vmm < $(OFFSETS_SVM) >$@ -$(ASSYM_H): $(OFFSETS_SRC) $(GENASSYM) - $(OFFSETS_CREATE) -I../../i86pc/io/vmm < $(OFFSETS_SRC) >$@ +$(OBJS_DIR)/vmx_support.o: $(ASSYM_VMX) +$(OBJS_DIR)/svm_support.o: $(ASSYM_SVM) diff --git a/usr/src/uts/req.flg b/usr/src/uts/req.flg index ffbaa3f643..15085a486d 100644 --- a/usr/src/uts/req.flg +++ b/usr/src/uts/req.flg @@ -22,6 +22,7 @@ # # Copyright 2007 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. +# Copyright 2018 Joyent, Inc. # # @@ -37,6 +38,7 @@ echo_file usr/src/Makefile # For full builds (open and closed), we want both etc/certs and # etc/keys. For an open source build, there's no etc/keys directory. 
+find_files "s.*" usr/contrib/freebsd find_files "s.*" usr/src/cmd/cmd-crypto/etc find_files "s.*" usr/src/common/acl find_files "s.*" usr/src/common/atomic @@ -56,4 +58,5 @@ find_files "s.*" usr/src/common/smbios find_files "s.*" usr/src/common/tsol find_files "s.*" usr/src/common/util find_files "s.*" usr/src/common/zfs +find_files "s.*" usr/src/compat/freebsd find_files "s.*" usr/src/psm/promif -- cgit v1.2.3 From b0c683a03f7c07cd7ab962acc1dcfef6ba4f78a3 Mon Sep 17 00:00:00 2001 From: Jerry Jelinek Date: Mon, 18 May 2020 17:30:05 +0000 Subject: 12665 want modern bhyve [fix broken build] --- exception_lists/copyright | 1 - exception_lists/cstyle | 1 - exception_lists/hdrchk | 1 - exception_lists/wscheck | 1 - usr/src/head/Makefile | 3 +-- usr/src/uts/i86pc/Makefile.files | 2 -- usr/src/uts/i86pc/Makefile.i86pc | 1 - 7 files changed, 1 insertion(+), 9 deletions(-) (limited to 'usr/src/uts/i86pc') diff --git a/exception_lists/copyright b/exception_lists/copyright index 87c8de6ba4..354f7fb236 100644 --- a/exception_lists/copyright +++ b/exception_lists/copyright @@ -552,7 +552,6 @@ usr/src/compat/freebsd/amd64/machine/*.h usr/contrib/freebsd/*/*.h usr/contrib/freebsd/*/*/*.h usr/contrib/freebsd/lib/libutil/*.c -usr/src/head/bhyve.h usr/src/lib/libvmmapi/common/vmmapi.[ch] usr/src/tools/scripts/gensetdefs.pl usr/src/uts/i86pc/io/vmm/amd/*.[chs] diff --git a/exception_lists/cstyle b/exception_lists/cstyle index 4da7cad3df..9a76360653 100644 --- a/exception_lists/cstyle +++ b/exception_lists/cstyle @@ -1421,7 +1421,6 @@ usr/src/compat/freebsd/amd64/machine/*.h usr/contrib/freebsd/*/*.h usr/contrib/freebsd/*/*/*.h usr/contrib/freebsd/lib/libutil/*.c -usr/src/head/bhyve.h usr/src/lib/libvmmapi/common/vmmapi.[ch] usr/src/uts/i86pc/io/vmm/amd/*.[ch] usr/src/uts/i86pc/io/vmm/intel/*.[chs] diff --git a/exception_lists/hdrchk b/exception_lists/hdrchk index f4e1cf14c3..08a3179039 100644 --- a/exception_lists/hdrchk +++ b/exception_lists/hdrchk @@ -421,7 +421,6 @@ usr/src/compat/freebsd/*/*.h usr/src/compat/freebsd/amd64/machine/*.h usr/contrib/freebsd/*/*.h usr/contrib/freebsd/*/*/*.h -usr/src/head/bhyve.h usr/src/lib/libvmmapi/common/vmmapi.h usr/src/uts/i86pc/io/vmm/intel/*.h usr/src/uts/i86pc/io/vmm/io/*.h diff --git a/exception_lists/wscheck b/exception_lists/wscheck index 0a66384f9c..0e2f1164c5 100644 --- a/exception_lists/wscheck +++ b/exception_lists/wscheck @@ -83,7 +83,6 @@ usr/src/data/perfmon/readme.txt usr/contrib/freebsd/*/*.h usr/contrib/freebsd/*/*/*.h usr/contrib/freebsd/lib/libutil/*.c -usr/src/head/bhyve.h usr/src/lib/libvmmapi/common/vmmapi.[ch] usr/src/uts/i86pc/io/vmm/amd/*.[ch] usr/src/uts/i86pc/io/vmm/intel/*.[chs] diff --git a/usr/src/head/Makefile b/usr/src/head/Makefile index 562664bde5..75e2788897 100644 --- a/usr/src/head/Makefile +++ b/usr/src/head/Makefile @@ -33,8 +33,7 @@ include ../Makefile.master sparc_HDRS= i386_HDRS= \ - stack_unwind.h \ - bhyve.h + stack_unwind.h KRB5HDRS= \ mit_copyright.h \ diff --git a/usr/src/uts/i86pc/Makefile.files b/usr/src/uts/i86pc/Makefile.files index fe848a2f26..50b429dfe6 100644 --- a/usr/src/uts/i86pc/Makefile.files +++ b/usr/src/uts/i86pc/Makefile.files @@ -270,8 +270,6 @@ VMM_OBJS += vmm.o \ vmcb.o \ svm_support.o \ amdv.o \ - sol_iommu.o \ - sol_ppt.o \ gipt.o \ vmm_sol_vm.o \ vmm_sol_glue.o \ diff --git a/usr/src/uts/i86pc/Makefile.i86pc b/usr/src/uts/i86pc/Makefile.i86pc index fa72c89dbe..cec9d91ac8 100644 --- a/usr/src/uts/i86pc/Makefile.i86pc +++ b/usr/src/uts/i86pc/Makefile.i86pc @@ -250,7 +250,6 @@ DRV_KMODS += vmm DRV_KMODS 
+= viona DRV_KMODS += ppt DRV_KMODS += imc imcstub -DRV_KMODS += vmm DRV_KMODS += cpudrv -- cgit v1.2.3 From 109b65249647da8f2f4306cd9b3d2800b05fd59b Mon Sep 17 00:00:00 2001 From: Toomas Soome Date: Sat, 16 May 2020 17:31:10 +0300 Subject: 12744 gfx_private: bitmap_cons_clear 8-bit mode is using wrong color Reviewed by: Yuri Pankov Reviewed by: Andy Fiddaman Approved by: Robert Mustacchi --- usr/src/uts/i86pc/io/gfx_private/gfxp_bitmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'usr/src/uts/i86pc') diff --git a/usr/src/uts/i86pc/io/gfx_private/gfxp_bitmap.c b/usr/src/uts/i86pc/io/gfx_private/gfxp_bitmap.c index 2fed9f162c..1a11d7ff0f 100644 --- a/usr/src/uts/i86pc/io/gfx_private/gfxp_bitmap.c +++ b/usr/src/uts/i86pc/io/gfx_private/gfxp_bitmap.c @@ -450,10 +450,10 @@ bitmap_cons_clear(struct gfxp_fb_softc *softc, struct vis_consclear *ca) for (i = 0; i < console->fb.screen.y; i++) { if (softc->mode == KD_TEXT) { fb = console->fb.fb + i * pitch; - (void) memset(fb, ca->bg_color, pitch); + (void) memset(fb, data, pitch); } fb = console->fb.shadow_fb + i * pitch; - (void) memset(fb, ca->bg_color, pitch); + (void) memset(fb, data, pitch); } break; case 15: -- cgit v1.2.3 From b22a70abf81f995ecc990b8444e63308bc389d5c Mon Sep 17 00:00:00 2001 From: Patrick Mooney Date: Wed, 3 Jan 2018 21:11:35 +0000 Subject: 12679 want viona driver for bhyve Portions contributed by: Ryan Zezeski Portions contributed by: John Levon Portions contributed by: Jason King Portions contributed by: Robert Mustacchi Portions contributed by: Bryan Cantrill Reviewed by: Ryan Zezeski Approved by: Dan McDonald --- usr/src/cmd/bhyve/Makefile | 3 +- usr/src/cmd/bhyve/pci_emul.c | 5 + usr/src/cmd/bhyve/pci_emul.h | 7 + usr/src/cmd/bhyve/pci_virtio_viona.c | 494 +++++--- usr/src/cmd/devfsadm/i386/misc_link_i386.c | 6 + usr/src/man/man9e/mac.9e | 22 +- usr/src/pkg/manifests/system-bhyve.mf | 3 + usr/src/uts/common/inet/ip/ip6_output.c | 13 +- usr/src/uts/common/inet/ip/ip_output.c | 8 + usr/src/uts/common/inet/ipf/ip_fil_solaris.c | 335 +++++- usr/src/uts/common/inet/ipf/netinet/ipf_stack.h | 16 +- usr/src/uts/common/io/hook.c | 2 +- usr/src/uts/common/sys/dlpi.h | 7 +- usr/src/uts/common/sys/hook_impl.h | 4 +- usr/src/uts/common/sys/neti.h | 5 +- usr/src/uts/i86pc/Makefile.files | 6 +- usr/src/uts/i86pc/Makefile.i86pc | 1 + usr/src/uts/i86pc/io/viona/viona.c | 1409 ----------------------- usr/src/uts/i86pc/io/viona/viona.mapfile | 41 + usr/src/uts/i86pc/io/viona/viona_hook.c | 438 +++++++ usr/src/uts/i86pc/io/viona/viona_impl.h | 326 ++++++ usr/src/uts/i86pc/io/viona/viona_main.c | 991 ++++++++++++++++ usr/src/uts/i86pc/io/viona/viona_ring.c | 638 ++++++++++ usr/src/uts/i86pc/io/viona/viona_rx.c | 718 ++++++++++++ usr/src/uts/i86pc/io/viona/viona_tx.c | 756 ++++++++++++ usr/src/uts/i86pc/sys/viona_io.h | 49 +- usr/src/uts/i86pc/sys/vmm_drv.h | 3 + usr/src/uts/i86pc/viona/Makefile | 13 +- usr/src/uts/intel/ipf/ipf.global-objs.debug64 | 9 +- 29 files changed, 4689 insertions(+), 1639 deletions(-) delete mode 100644 usr/src/uts/i86pc/io/viona/viona.c create mode 100644 usr/src/uts/i86pc/io/viona/viona.mapfile create mode 100644 usr/src/uts/i86pc/io/viona/viona_hook.c create mode 100644 usr/src/uts/i86pc/io/viona/viona_impl.h create mode 100644 usr/src/uts/i86pc/io/viona/viona_main.c create mode 100644 usr/src/uts/i86pc/io/viona/viona_ring.c create mode 100644 usr/src/uts/i86pc/io/viona/viona_rx.c create mode 100644 usr/src/uts/i86pc/io/viona/viona_tx.c (limited to 'usr/src/uts/i86pc') diff --git 
a/usr/src/cmd/bhyve/Makefile b/usr/src/cmd/bhyve/Makefile index e96868e006..2301e6c8a6 100644 --- a/usr/src/cmd/bhyve/Makefile +++ b/usr/src/cmd/bhyve/Makefile @@ -58,6 +58,7 @@ SRCS = acpi.c \ pci_virtio_console.c \ pci_virtio_net.c \ pci_virtio_rnd.c \ + pci_virtio_viona.c \ pci_xhci.c \ pm.c \ post.c \ @@ -120,7 +121,7 @@ CSTD= $(CSTD_GNU99) C99MODE= -xc99=%all C99LMODE= -Xc99=%all -$(PROG) := LDLIBS += -lsocket -lnsl -ldlpi -lmd -luuid -lvmmapi -lz +$(PROG) := LDLIBS += -lsocket -lnsl -ldlpi -ldladm -lmd -luuid -lvmmapi -lz $(MEVENT_TEST_PROG) := LDLIBS += -lsocket .KEEP_STATE: diff --git a/usr/src/cmd/bhyve/pci_emul.c b/usr/src/cmd/bhyve/pci_emul.c index 5118b31534..a71cc528aa 100644 --- a/usr/src/cmd/bhyve/pci_emul.c +++ b/usr/src/cmd/bhyve/pci_emul.c @@ -1597,6 +1597,11 @@ pci_lintr_update(struct pci_devinst *pi) pci_irq_assert(pi); } pthread_mutex_unlock(&pi->pi_lintr.lock); +#ifndef __FreeBSD__ + if (pi->pi_d->pe_lintrupdate != NULL) { + pi->pi_d->pe_lintrupdate(pi); + } +#endif /* __FreeBSD__ */ } int diff --git a/usr/src/cmd/bhyve/pci_emul.h b/usr/src/cmd/bhyve/pci_emul.h index 853badaadb..0053caed99 100644 --- a/usr/src/cmd/bhyve/pci_emul.h +++ b/usr/src/cmd/bhyve/pci_emul.h @@ -27,6 +27,9 @@ * * $FreeBSD$ */ +/* + * Copyright 2018 Joyent, Inc. + */ #ifndef _PCI_EMUL_H_ #define _PCI_EMUL_H_ @@ -71,6 +74,10 @@ struct pci_devemu { uint64_t (*pe_barread)(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size); + +#ifndef __FreeBSD__ + void (*pe_lintrupdate)(struct pci_devinst *pi); +#endif /* __FreeBSD__ */ }; #define PCI_EMUL_SET(x) DATA_SET(pci_devemu_set, x); diff --git a/usr/src/cmd/bhyve/pci_virtio_viona.c b/usr/src/cmd/bhyve/pci_virtio_viona.c index e5a5cb584f..9cafa7b111 100644 --- a/usr/src/cmd/bhyve/pci_virtio_viona.c +++ b/usr/src/cmd/bhyve/pci_virtio_viona.c @@ -34,7 +34,7 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2015 Pluribus Networks Inc. - * Copyright 2017 Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ #include @@ -84,18 +84,6 @@ #define VIONA_REGSZ VIONA_R_MAX+1 -/* - * Host capabilities - */ -#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */ -#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */ -#define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */ - -#define VIONA_S_HOSTCAPS \ - (VIRTIO_NET_F_MAC | \ - VIRTIO_NET_F_MRG_RXBUF | \ - VIRTIO_NET_F_STATUS) - /* * Queue definitions. */ @@ -108,7 +96,7 @@ /* * Debug printf */ -static int pci_viona_debug; +static volatile int pci_viona_debug; #define DPRINTF(params) if (pci_viona_debug) printf params #define WPRINTF(params) printf params @@ -124,26 +112,20 @@ struct pci_viona_softc { int vsc_isr; datalink_id_t vsc_linkid; - char vsc_linkname[MAXLINKNAMELEN]; int vsc_vnafd; + /* Configurable parameters */ + char vsc_linkname[MAXLINKNAMELEN]; + uint32_t vsc_feature_mask; + uint16_t vsc_vq_size; + uint32_t vsc_features; uint8_t vsc_macaddr[6]; uint64_t vsc_pfn[VIONA_MAXQ]; uint16_t vsc_msix_table_idx[VIONA_MAXQ]; - /* - * Flag to see if host is already sending data out. - * If it is, no need to wait for lock and send interrupt to host - * for new data. 
- */ - boolean_t vsc_tx_kick_lock_held; - - pthread_t tx_tid; - pthread_mutex_t tx_mtx; - pthread_cond_t tx_cond; + boolean_t vsc_msix_active; }; -#define viona_ctx(sc) ((sc)->vsc_pi->pi_vmctx) /* * Return the size of IO BAR that maps virtio header and device specific @@ -160,47 +142,44 @@ pci_viona_iosize(struct pci_devinst *pi) } static uint16_t -pci_viona_qsize(int qnum) +pci_viona_qsize(struct pci_viona_softc *sc, int qnum) { /* XXX no ctl queue currently */ if (qnum == VIONA_CTLQ) { return (0); } - /* XXX fixed currently. Maybe different for tx/rx/ctl */ - return (VIONA_RINGSZ); + return (sc->vsc_vq_size); } static void pci_viona_ring_reset(struct pci_viona_softc *sc, int ring) { - int error; - assert(ring < VIONA_MAXQ); switch (ring) { case VIONA_RXQ: - error = ioctl(sc->vsc_vnafd, VNA_IOC_RX_RING_RESET); - if (error != 0) { - WPRINTF(("ioctl viona rx ring reset failed %d\n", - error)); - } else { - sc->vsc_pfn[VIONA_RXQ] = 0; - } - break; case VIONA_TXQ: - error = ioctl(sc->vsc_vnafd, VNA_IOC_TX_RING_RESET); - if (error != 0) { - WPRINTF(("ioctl viona tx ring reset failed %d\n", - error)); - } else { - sc->vsc_pfn[VIONA_TXQ] = 0; - } break; case VIONA_CTLQ: default: - break; + return; + } + + for (;;) { + int res; + + res = ioctl(sc->vsc_vnafd, VNA_IOC_RING_RESET, ring); + if (res == 0) { + break; + } else if (errno != EINTR) { + WPRINTF(("ioctl viona ring %d reset failed %d\n", + ring, errno)); + return; + } } + + sc->vsc_pfn[ring] = 0; } static void @@ -220,11 +199,11 @@ static void * pci_viona_poll_thread(void *param) { struct pci_viona_softc *sc = param; - pollfd_t pollset; - int error; + pollfd_t pollset; + const int fd = sc->vsc_vnafd; - pollset.fd = sc->vsc_vnafd; - pollset.events = POLLIN | POLLOUT; + pollset.fd = fd; + pollset.events = POLLRDBAND; for (;;) { if (poll(&pollset, 1, -1) < 0) { @@ -236,23 +215,35 @@ pci_viona_poll_thread(void *param) break; } } - if (pollset.revents & POLLIN) { - pci_generate_msix(sc->vsc_pi, - sc->vsc_msix_table_idx[VIONA_RXQ]); - error = ioctl(sc->vsc_vnafd, VNA_IOC_RX_INTR_CLR); - if (error != 0) { - WPRINTF(("ioctl viona rx intr clear failed" - " %d\n", error)); + if (pollset.revents & POLLRDBAND) { + vioc_intr_poll_t vip; + uint_t i; + int res; + boolean_t assert_lintr = B_FALSE; + const boolean_t do_msix = pci_msix_enabled(sc->vsc_pi); + + res = ioctl(fd, VNA_IOC_INTR_POLL, &vip); + for (i = 0; res > 0 && i < VIONA_VQ_MAX; i++) { + if (vip.vip_status[i] == 0) { + continue; + } + if (do_msix) { + pci_generate_msix(sc->vsc_pi, + sc->vsc_msix_table_idx[i]); + } else { + assert_lintr = B_TRUE; + } + res = ioctl(fd, VNA_IOC_RING_INTR_CLR, i); + if (res != 0) { + WPRINTF(("ioctl viona vq %d intr " + "clear failed %d\n", i, errno)); + } } - } - - if (pollset.revents & POLLOUT) { - pci_generate_msix(sc->vsc_pi, - sc->vsc_msix_table_idx[VIONA_TXQ]); - error = ioctl(sc->vsc_vnafd, VNA_IOC_TX_INTR_CLR); - if (error != 0) { - WPRINTF(("ioctl viona tx intr clear failed" - " %d\n", error)); + if (assert_lintr) { + pthread_mutex_lock(&sc->vsc_mtx); + sc->vsc_isr |= VTCFG_ISR_QUEUES; + pci_lintr_assert(sc->vsc_pi); + pthread_mutex_unlock(&sc->vsc_mtx); } } } @@ -260,57 +251,6 @@ pci_viona_poll_thread(void *param) pthread_exit(NULL); } -static void -pci_viona_ping_rxq(struct pci_viona_softc *sc) -{ - int error; - - error = ioctl(sc->vsc_vnafd, VNA_IOC_RX_RING_KICK); - if (error != 0) { - WPRINTF(("ioctl viona rx ring kick failed %d\n", error)); - } -} - -static void * -pci_viona_tx_thread(void *param) -{ - struct pci_viona_softc *sc = (struct 
pci_viona_softc *)param;
- int error;
-
- pthread_mutex_lock(&sc->tx_mtx);
- for (;;) {
- error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx);
- assert(error == 0);
- sc->vsc_tx_kick_lock_held = B_TRUE;
- error = ioctl(sc->vsc_vnafd, VNA_IOC_TX_RING_KICK);
- if (error != 0) {
- WPRINTF(("ioctl viona tx ring kick failed %d\n",
- error));
- }
- sc->vsc_tx_kick_lock_held = B_FALSE;
- }
- pthread_mutex_unlock(&sc->tx_mtx);
-
- return (NULL);
-}
-
-static void
-pci_viona_ping_txq(struct pci_viona_softc *sc)
-{
- /* Signal the tx thread for processing */
- if (sc->vsc_tx_kick_lock_held)
- return;
- pthread_mutex_lock(&sc->tx_mtx);
- pthread_cond_signal(&sc->tx_cond);
- pthread_mutex_unlock(&sc->tx_mtx);
-}
-
-static void
-pci_viona_ping_ctlq(struct pci_viona_softc *sc)
-{
- DPRINTF(("viona: control qnotify!\n\r"));
-}
-
 static void
 pci_viona_ring_init(struct pci_viona_softc *sc, uint64_t pfn)
 {
@@ -320,29 +260,19 @@ pci_viona_ring_init(struct pci_viona_softc *sc, uint64_t pfn)
 
 assert(qnum < VIONA_MAXQ);
 
+ if (qnum == VIONA_CTLQ) {
+ return;
+ }
+
 sc->vsc_pfn[qnum] = (pfn << VRING_PFN);
 
- vna_ri.ri_qsize = pci_viona_qsize(qnum);
+ vna_ri.ri_index = qnum;
+ vna_ri.ri_qsize = pci_viona_qsize(sc, qnum);
 vna_ri.ri_qaddr = (pfn << VRING_PFN);
+ error = ioctl(sc->vsc_vnafd, VNA_IOC_RING_INIT, &vna_ri);
 
- switch (qnum) {
- case VIONA_RXQ:
- error = ioctl(sc->vsc_vnafd, VNA_IOC_RX_RING_INIT, &vna_ri);
- if (error != 0) {
- WPRINTF(("ioctl viona rx ring init failed %d\n",
- error));
- }
- break;
- case VIONA_TXQ:
- error = ioctl(sc->vsc_vnafd, VNA_IOC_TX_RING_INIT, &vna_ri);
- if (error != 0) {
- WPRINTF(("ioctl viona tx ring init failed %d\n",
- error));
- }
- break;
- case VIONA_CTLQ:
- default:
- break;
+ if (error != 0) {
+ WPRINTF(("ioctl viona ring %u init failed %d\n", qnum, errno));
 }
 }
 
@@ -350,36 +280,110 @@ static int
 pci_viona_viona_init(struct vmctx *ctx, struct pci_viona_softc *sc)
 {
 vioc_create_t vna_create;
-#if notyet
- char devname[MAXNAMELEN];
- int ctlfd;
-#endif
 int error;
 
- sc->vsc_vnafd = open("/devices/pseudo/viona@0:ctl", O_RDWR | O_EXCL);
+ sc->vsc_vnafd = open("/dev/viona", O_RDWR | O_EXCL);
 if (sc->vsc_vnafd == -1) {
- WPRINTF(("open viona ctl failed\n"));
+ WPRINTF(("open viona ctl failed: %d\n", errno));
 return (-1);
 }
 
 vna_create.c_linkid = sc->vsc_linkid;
- strlcpy(vna_create.c_vmname, vmname,
- sizeof (vna_create.c_vmname));
-#if notyet
- vm_get_memory_seg(ctx, 1 * (1024 * 1024UL), &vna_create.c_lomem_size,
- NULL);
- vm_get_memory_seg(ctx, 4 * (1024 * 1024 * 1024UL),
- &vna_create.c_himem_size, NULL);
-#endif
+ vna_create.c_vmfd = vm_get_device_fd(ctx);
 error = ioctl(sc->vsc_vnafd, VNA_IOC_CREATE, &vna_create);
 if (error != 0) {
- WPRINTF(("ioctl viona create failed %d\n", error));
+ (void) close(sc->vsc_vnafd);
+ WPRINTF(("ioctl viona create failed %d\n", errno));
 return (-1);
 }
 
 return (0);
 }
 
+static int
+pci_viona_parse_opts(struct pci_viona_softc *sc, char *opts)
+{
+ char *next, *cp, *vnic = NULL;
+ int err = 0;
+
+ sc->vsc_vq_size = VIONA_RINGSZ;
+ sc->vsc_feature_mask = 0;
+
+ for (; opts != NULL && *opts != '\0'; opts = next) {
+ char *val;
+
+ if ((cp = strchr(opts, ',')) != NULL) {
+ *cp = '\0';
+ next = cp + 1;
+ } else {
+ next = NULL;
+ }
+
+ if ((cp = strchr(opts, '=')) == NULL) {
+ /* vnic chosen with bare name */
+ if (vnic != NULL) {
+ fprintf(stderr,
+ "viona: unexpected vnic name '%s'", opts);
+ err = -1;
+ } else {
+ vnic = opts;
+ }
+ continue;
+ }
+
+ /* <name>=<value> handling */
+ val = cp + 1;
+ *cp = '\0';
+ if (strcmp(opts, "feature_mask") == 0) {
+ long num;
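+ /*
+ * strtol(3C) is called with a base of 0 so the mask may be
+ * given in decimal, octal, or hex; errno (cleared first)
+ * distinguishes a parse failure from a legitimate value of 0.
+ */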
+ + errno = 0; + num = strtol(val, NULL, 0); + if (errno != 0 || num < 0) { + fprintf(stderr, + "viona: invalid mask '%s'", val); + } else { + sc->vsc_feature_mask = num; + } + } else if (strcmp(opts, "vqsize") == 0) { + long num; + + errno = 0; + num = strtol(val, NULL, 0); + if (errno != 0) { + fprintf(stderr, + "viona: invalid vqsize '%s'", val); + err = -1; + } else if (num <= 2 || num > 32768) { + fprintf(stderr, + "viona: vqsize (%ld) out of range", num); + err = -1; + } else if ((1 << (ffs(num) - 1)) != num) { + fprintf(stderr, + "viona: vqsize (%ld) must be a power of 2", num); + err = -1; + } else { + sc->vsc_vq_size = num; + } + } else { + fprintf(stderr, + "viona: unrecognized option '%s'", opts); + err = -1; + } + } + if (vnic == NULL) { + fprintf(stderr, "viona: vnic name required"); + sc->vsc_linkname[0] = '\0'; + err = -1; + } else { + (void) strlcpy(sc->vsc_linkname, vnic, MAXLINKNAMELEN); + } + + DPRINTF(("viona=%p dev=%s vqsize=%x feature_mask=%x\n", sc, + sc->vsc_linkname, sc->vsc_vq_size, sc->vsc_feature_mask)); + return (err); +} + static int pci_viona_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { @@ -387,9 +391,9 @@ pci_viona_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) dladm_status_t status; dladm_vnic_attr_t attr; char errmsg[DLADM_STRSIZE]; - int error; + int error, i; struct pci_viona_softc *sc; - int i; + uint64_t ioport; if (opts == NULL) { printf("virtio-viona: vnic required\n"); @@ -404,7 +408,10 @@ pci_viona_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) pthread_mutex_init(&sc->vsc_mtx, NULL); - strlcpy(sc->vsc_linkname, opts, MAXLINKNAMELEN); + if (pci_viona_parse_opts(sc, opts) != 0) { + free(sc); + return (1); + } if ((status = dladm_open(&handle)) != DLADM_STATUS_OK) { WPRINTF(("could not open /dev/dld")); @@ -430,7 +437,6 @@ pci_viona_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) return (1); } - sc->vsc_tx_kick_lock_held = B_FALSE; memcpy(sc->vsc_macaddr, attr.va_mac_addr, ETHERADDRL); dladm_close(handle); @@ -449,42 +455,44 @@ pci_viona_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET); + pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); /* MSI-X support */ for (i = 0; i < VIONA_MAXQ; i++) sc->vsc_msix_table_idx[i] = VIRTIO_MSI_NO_VECTOR; - /* - * BAR 1 used to map MSI-X table and PBA - */ + /* BAR 1 used to map MSI-X table and PBA */ if (pci_emul_add_msixcap(pi, VIONA_MAXQ, 1)) { free(sc); return (1); } - pci_emul_alloc_bar(pi, 0, PCIBAR_IO, VIONA_REGSZ); + /* BAR 0 for legacy-style virtio register access. */ + error = pci_emul_alloc_bar(pi, 0, PCIBAR_IO, VIONA_REGSZ); + if (error != 0) { + WPRINTF(("could not allocate virtio BAR\n")); + free(sc); + return (1); + } + + /* Install ioport hook for virtqueue notification */ + ioport = pi->pi_bar[0].addr + VTCFG_R_QNOTIFY; + error = ioctl(sc->vsc_vnafd, VNA_IOC_SET_NOTIFY_IOP, ioport); + if (error != 0) { + WPRINTF(("could not install ioport hook at %lx\n", ioport)); + free(sc); + return (1); + } /* - * Initialize tx semaphore & spawn TX processing thread - * As of now, only one thread for TX desc processing is - * spawned. + * Need a legacy interrupt for virtio compliance, even though MSI-X + * operation is _strongly_ suggested for adequate performance.
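+ * The lintr is deasserted once the guest acknowledges it by reading + * VTCFG_R_ISR (see pci_viona_read below).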
*/ - pthread_mutex_init(&sc->tx_mtx, NULL); - pthread_cond_init(&sc->tx_cond, NULL); - pthread_create(&sc->tx_tid, NULL, pci_viona_tx_thread, (void *)sc); + pci_lintr_request(pi); return (0); } -/* - * Function pointer array to handle queue notifications - */ -static void (*pci_viona_qnotify[VIONA_MAXQ])(struct pci_viona_softc *) = { - pci_viona_ping_rxq, - pci_viona_ping_txq, - pci_viona_ping_ctlq -}; - static uint64_t viona_adjust_offset(struct pci_devinst *pi, uint64_t offset) { @@ -500,6 +508,109 @@ viona_adjust_offset(struct pci_devinst *pi, uint64_t offset) return (offset); } +static void +pci_viona_ring_set_msix(struct pci_devinst *pi, uint_t ring) +{ + struct pci_viona_softc *sc = pi->pi_arg; + struct msix_table_entry mte; + uint16_t tab_index; + vioc_ring_msi_t vrm; + int res; + + assert(ring <= VIONA_VQ_TX); + + vrm.rm_index = ring; + vrm.rm_addr = 0; + vrm.rm_msg = 0; + tab_index = sc->vsc_msix_table_idx[ring]; + + if (tab_index != VIRTIO_MSI_NO_VECTOR && sc->vsc_msix_active) { + mte = pi->pi_msix.table[tab_index]; + if ((mte.vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { + vrm.rm_addr = mte.addr; + vrm.rm_msg = mte.msg_data; + } + } + + res = ioctl(sc->vsc_vnafd, VNA_IOC_RING_SET_MSI, &vrm); + if (res != 0) { + WPRINTF(("ioctl viona set_msi %d failed %d\n", ring, errno)); + } +} + +static void +pci_viona_lintrupdate(struct pci_devinst *pi) +{ + struct pci_viona_softc *sc = pi->pi_arg; + boolean_t msix_on = B_FALSE; + + pthread_mutex_lock(&sc->vsc_mtx); + msix_on = pci_msix_enabled(pi) && (pi->pi_msix.function_mask == 0); + if ((sc->vsc_msix_active && !msix_on) || + (msix_on && !sc->vsc_msix_active)) { + uint_t i; + + sc->vsc_msix_active = msix_on; + /* Update in-kernel ring configs */ + for (i = 0; i <= VIONA_VQ_TX; i++) { + pci_viona_ring_set_msix(pi, i); + } + } + pthread_mutex_unlock(&sc->vsc_mtx); +} + +static void +pci_viona_msix_update(struct pci_devinst *pi, uint64_t offset) +{ + struct pci_viona_softc *sc = pi->pi_arg; + uint_t tab_index, i; + + pthread_mutex_lock(&sc->vsc_mtx); + if (!sc->vsc_msix_active) { + pthread_mutex_unlock(&sc->vsc_mtx); + return; + } + + /* + * Rather than update every possible MSI-X vector, cheat and use the + * offset to calculate the entry within the table. Since this should + * only be called when a write to the table succeeds, the index should + * be valid. 
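+ * Each MSI-X table entry is MSIX_TABLE_ENTRY_SIZE (16) bytes: message + * address, message data, and vector control.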
+ */ + tab_index = offset / MSIX_TABLE_ENTRY_SIZE; + + for (i = 0; i <= VIONA_VQ_TX; i++) { + if (sc->vsc_msix_table_idx[i] != tab_index) { + continue; + } + pci_viona_ring_set_msix(pi, i); + } + + pthread_mutex_unlock(&sc->vsc_mtx); +} + +static void +pci_viona_qnotify(struct pci_viona_softc *sc, int ring) +{ + int error; + + switch (ring) { + case VIONA_TXQ: + case VIONA_RXQ: + error = ioctl(sc->vsc_vnafd, VNA_IOC_RING_KICK, ring); + if (error != 0) { + WPRINTF(("ioctl viona ring %d kick failed %d\n", + ring, errno)); + } + break; + case VIONA_CTLQ: + DPRINTF(("viona: control qnotify!\n")); + break; + default: + break; + } +} + static void pci_viona_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) @@ -510,7 +621,9 @@ pci_viona_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, if (baridx == pci_msix_table_bar(pi) || baridx == pci_msix_pba_bar(pi)) { - pci_emul_msix_twrite(pi, offset, size, value); + if (pci_emul_msix_twrite(pi, offset, size, value) == 0) { + pci_viona_msix_update(pi, offset); + } return; } @@ -529,10 +642,14 @@ pci_viona_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, switch (offset) { case VTCFG_R_GUESTCAP: assert(size == 4); + value &= ~(sc->vsc_feature_mask); err = ioctl(sc->vsc_vnafd, VNA_IOC_SET_FEATURES, &value); - if (err != 0) + if (err != 0) { WPRINTF(("ioctl feature negotiation returned" - " err = %d\n", err)); + " err = %d\n", errno)); + } else { + sc->vsc_features = value; + } break; case VTCFG_R_PFN: assert(size == 4); @@ -546,7 +663,7 @@ pci_viona_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, case VTCFG_R_QNOTIFY: assert(size == 2); assert(value < VIONA_MAXQ); - (*pci_viona_qnotify[value])(sc); + pci_viona_qnotify(sc, value); break; case VTCFG_R_STATUS: assert(size == 1); @@ -560,6 +677,7 @@ pci_viona_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, assert(size == 2); assert(sc->vsc_curq != VIONA_CTLQ); sc->vsc_msix_table_idx[sc->vsc_curq] = value; + pci_viona_ring_set_msix(pi, sc->vsc_curq); break; case VIONA_R_CFG0: case VIONA_R_CFG1: @@ -597,7 +715,7 @@ pci_viona_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, pthread_mutex_unlock(&sc->vsc_mtx); } -uint64_t +static uint64_t pci_viona_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size) { @@ -627,9 +745,11 @@ pci_viona_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, case VTCFG_R_HOSTCAP: assert(size == 4); err = ioctl(sc->vsc_vnafd, VNA_IOC_GET_FEATURES, &value); - if (err != 0) + if (err != 0) { WPRINTF(("ioctl get host features returned" - " err = %d\n", err)); + " err = %d\n", errno)); + } + value &= ~sc->vsc_feature_mask; break; case VTCFG_R_GUESTCAP: assert(size == 4); @@ -641,7 +761,7 @@ pci_viona_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, break; case VTCFG_R_QNUM: assert(size == 2); - value = pci_viona_qsize(sc->vsc_curq); + value = pci_viona_qsize(sc, sc->vsc_curq); break; case VTCFG_R_QSEL: assert(size == 2); @@ -659,6 +779,9 @@ pci_viona_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, assert(size == 1); value = sc->vsc_isr; sc->vsc_isr = 0; /* a read clears this flag */ + if (value != 0) { + pci_lintr_deassert(pi); + } break; case VTCFG_R_CFGVEC: assert(size == 2); @@ -705,9 +828,10 @@ pci_viona_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, } struct pci_devemu pci_de_viona = { - .pe_emu = "virtio-net-viona", + .pe_emu = "virtio-net-viona", .pe_init = pci_viona_init, .pe_barwrite = 
pci_viona_write, - .pe_barread = pci_viona_read + .pe_barread = pci_viona_read, + .pe_lintrupdate = pci_viona_lintrupdate }; PCI_EMUL_SET(pci_de_viona); diff --git a/usr/src/cmd/devfsadm/i386/misc_link_i386.c b/usr/src/cmd/devfsadm/i386/misc_link_i386.c index 4aeea7d294..0f8e64551d 100644 --- a/usr/src/cmd/devfsadm/i386/misc_link_i386.c +++ b/usr/src/cmd/devfsadm/i386/misc_link_i386.c @@ -85,6 +85,9 @@ static devfsadm_create_t misc_cbt[] = { { "pseudo", "ddi_pseudo", "ucode", TYPE_EXACT | DRV_EXACT, ILEVEL_0, ln_minor_name, }, + { "pseudo", "ddi_pseudo", "viona", + TYPE_EXACT | DRV_EXACT, ILEVEL_0, ln_minor_name, + }, { "pseudo", "ddi_pseudo", "vmm", TYPE_EXACT | DRV_EXACT, ILEVEL_0, vmmctl, } @@ -114,6 +117,9 @@ static devfsadm_remove_t misc_remove_cbt[] = { { "serial", "^tty[a-z]$", RM_ALWAYS | RM_PRE, ILEVEL_1, devfsadm_rm_all }, + { "pseudo", "^viona$", RM_ALWAYS | RM_PRE | RM_HOT, + ILEVEL_0, devfsadm_rm_all + }, { "pseudo", "^vmmctl$", RM_ALWAYS | RM_PRE | RM_HOT, ILEVEL_0, devfsadm_rm_all } diff --git a/usr/src/man/man9e/mac.9e b/usr/src/man/man9e/mac.9e index 3a3f2ae90a..d3d066a564 100644 --- a/usr/src/man/man9e/mac.9e +++ b/usr/src/man/man9e/mac.9e @@ -570,24 +570,28 @@ The following set of flags may be combined through a bitwise inclusive OR: .Bl -tag -width Ds .It Sy HCKSUM_INET_PARTIAL This indicates that the hardware can calculate a partial checksum for -both IPv4 and IPv6; however, it requires the pseudo-header checksum be -calculated for it. +both IPv4 and IPv6 UDP and TCP packets; however, it requires the pseudo-header +checksum be calculated for it. The pseudo-header checksum will be available for the mblk_t when calling .Xr mac_hcksum_get 9F . -Note this does not imply that the hardware is capable of calculating the -IPv4 header checksum. +Note this does not imply that the hardware is capable of calculating +the partial checksum for other L4 protocols or the IPv4 header checksum. That should be indicated with the .Sy HCKSUM_IPHDRCKSUM flag. .It Sy HCKSUM_INET_FULL_V4 -This indicates that the hardware will fully calculate the L4 checksum -for outgoing IPv4 packets and does not require a pseudo-header checksum. +This indicates that the hardware will fully calculate the L4 checksum for +outgoing IPv4 UDP or TCP packets only, and does not require a pseudo-header +checksum. Note this does not imply that the hardware is capable of calculating the -IPv4 header checksum. +checksum for other L4 protocols or the IPv4 header checksum. That should be indicated with the .Sy HCKSUM_IPHDRCKSUM . .It Sy HCKSUM_INET_FULL_V6 -This indicates that the hardware will fully calculate the L4 checksum -for outgoing IPv6 packets and does not require a pseudo-header checksum. +This indicates that the hardware will fully calculate the L4 checksum for +outgoing IPv6 UDP or TCP packets only, and does not require a pseudo-header +checksum. +Note this does not imply that the hardware is capable of calculating the +checksum for any other L4 protocols. .It Sy HCKSUM_IPHDRCKSUM This indicates that the hardware supports calculating the checksum for the IPv4 header itself. 
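The checksum flags documented above are advertised to MAC from a driver's mc_getcapab(9E) entry point when it is queried for MAC_CAPAB_HCKSUM. The following is a minimal sketch of such a handler, assuming a hypothetical driver "xx" whose hardware offloads full TCP/UDP checksums for both IPv4 and IPv6 plus the IPv4 header checksum; the driver name and flag mix are illustrative only and are not part of this change.

#include <sys/types.h>
#include <sys/dlpi.h>
#include <sys/mac_provider.h>

/*
 * Hypothetical mc_getcapab(9E) handler. Note that, per the mac.9e text
 * above, HCKSUM_INET_FULL_V4/_V6 cover only TCP and UDP and say nothing
 * about the IPv4 header checksum, which HCKSUM_IPHDRCKSUM indicates.
 */
static boolean_t
xx_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
{
	switch (cap) {
	case MAC_CAPAB_HCKSUM: {
		uint32_t *txflags = cap_data;

		*txflags = HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6 |
		    HCKSUM_IPHDRCKSUM;
		return (B_TRUE);
	}
	default:
		/* Any capability we do not handle is not supported. */
		return (B_FALSE);
	}
}

Hardware limited to partial offload would advertise HCKSUM_INET_PARTIAL instead and retrieve the stack-supplied pseudo-header checksum via mac_hcksum_get(9F), as described above.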
diff --git a/usr/src/pkg/manifests/system-bhyve.mf b/usr/src/pkg/manifests/system-bhyve.mf index 2a51d4fc22..7fdeb81254 100644 --- a/usr/src/pkg/manifests/system-bhyve.mf +++ b/usr/src/pkg/manifests/system-bhyve.mf @@ -35,8 +35,11 @@ dir path=usr group=sys dir path=usr/kernel/drv group=sys dir path=usr/kernel/drv/$(ARCH64) group=sys dir path=usr/sbin +driver name=viona driver name=vmm +file path=usr/kernel/drv/$(ARCH64)/viona file path=usr/kernel/drv/$(ARCH64)/vmm +file path=usr/kernel/drv/viona.conf file path=usr/kernel/drv/vmm.conf file path=usr/sbin/bhyve mode=0555 file path=usr/sbin/bhyvectl mode=0555 diff --git a/usr/src/uts/common/inet/ip/ip6_output.c b/usr/src/uts/common/inet/ip/ip6_output.c index 6c5868ddde..143077ed32 100644 --- a/usr/src/uts/common/inet/ip/ip6_output.c +++ b/usr/src/uts/common/inet/ip/ip6_output.c @@ -23,6 +23,7 @@ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright 2017 OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2018 Joyent, Inc. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -866,8 +867,16 @@ ip_output_cksum_v6(iaflags_t ixaflags, mblk_t *mp, ip6_t *ip6h, ixa->ixa_raw_cksum_offset); cksum = htons(protocol); } else if (protocol == IPPROTO_ICMPV6) { - cksump = IPH_ICMPV6_CHECKSUMP(ip6h, ip_hdr_length); - cksum = IP_ICMPV6_CSUM_COMP; /* Pseudo-header cksum */ + /* + * Currently we assume no HW support for ICMP checksum calc. + * + * When HW support is advertised for ICMP, we'll want the + * following to be set: + * cksump = IPH_ICMPV6_CHECKSUMP(ip6h, ip_hdr_length); + * cksum = IP_ICMPV6_CSUM_COMP; Pseudo-header cksum + */ + + return (ip_output_sw_cksum_v6(mp, ip6h, ixa)); } else { ip_hdr_cksum: /* No IP header checksum for IPv6 */ diff --git a/usr/src/uts/common/inet/ip/ip_output.c b/usr/src/uts/common/inet/ip/ip_output.c index 1017240521..a0157d3c48 100644 --- a/usr/src/uts/common/inet/ip/ip_output.c +++ b/usr/src/uts/common/inet/ip/ip_output.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2018 Joyent, Inc. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -1738,6 +1739,13 @@ ip_output_cksum_v4(iaflags_t ixaflags, mblk_t *mp, ipha_t *ipha, #endif sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length); goto ip_hdr_cksum; + } else if (protocol == IPPROTO_ICMP) { + /* + * Note that we always calculate a SW checksum for ICMP. In the + * future, if HW support for ICMP is advertised, we can change + * this. 
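+ * Unlike TCP and UDP, the ICMPv4 checksum has no pseudo-header + * component; ip_output_sw_cksum_v4() checksums the ICMP message itself.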
+ */ + return (ip_output_sw_cksum_v4(mp, ipha, ixa)); } else { ip_hdr_cksum: /* Calculate IPv4 header checksum */ diff --git a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c index b80cf53882..2e55e6fab8 100644 --- a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c +++ b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c @@ -22,6 +22,7 @@ static const char rcsid[] = "@(#)$Id: ip_fil_solaris.c,v 2.62.2.19 2005/07/13 21 #include #include #include +#include #include #include #include @@ -84,9 +85,19 @@ static int ipf_hook6_loop_out __P((hook_event_token_t, hook_data_t, static int ipf_hook6_loop_in __P((hook_event_token_t, hook_data_t, void *)); static int ipf_hook6 __P((hook_data_t, int, int, void *)); + +static int ipf_hookviona_in __P((hook_event_token_t, hook_data_t, void *)); +static int ipf_hookviona_out __P((hook_event_token_t, hook_data_t, + void *)); + extern int ipf_geniter __P((ipftoken_t *, ipfgeniter_t *, ipf_stack_t *)); extern int ipf_frruleiter __P((void *, int, void *, ipf_stack_t *)); +static int ipf_hook_protocol_notify __P((hook_notify_cmd_t, void *, + const char *, const char *, const char *)); +static int ipf_hook_instance_notify __P((hook_notify_cmd_t, void *, + const char *, const char *, const char *)); + #if SOLARIS2 < 10 #if SOLARIS2 >= 7 u_int *ip_ttl_ptr = NULL; @@ -153,6 +164,12 @@ char *hook6_loop_in_gz = "ipfilter_hook6_loop_in_gz"; char *hook6_loop_out = "ipfilter_hook6_loop_out"; char *hook6_loop_out_gz = "ipfilter_hook6_loop_out_gz"; +/* viona hook names */ +char *hook_viona_in = "ipfilter_hookviona_in"; +char *hook_viona_in_gz = "ipfilter_hookviona_in_gz"; +char *hook_viona_out = "ipfilter_hookviona_out"; +char *hook_viona_out_gz = "ipfilter_hookviona_out_gz"; + /* ------------------------------------------------------------------------ */ /* Function: ipldetach */ /* Returns: int - 0 == success, else error. */ @@ -249,8 +266,40 @@ ipf_stack_t *ifs; ifs->ifs_ipf_ipv4 = NULL; } + /* + * Remove notification of viona hooks + */ + net_instance_notify_unregister(ifs->ifs_netid, + ipf_hook_instance_notify); + #undef UNDO_HOOK + /* + * Normally, viona will unregister itself before ipldetach() is called, + * so these will be no-ops, but out of caution, we try to make sure + * we've removed any of our references. + */ + (void) ipf_hook_protocol_notify(HN_UNREGISTER, ifs, Hn_VIONA, NULL, + NH_PHYSICAL_IN); + (void) ipf_hook_protocol_notify(HN_UNREGISTER, ifs, Hn_VIONA, NULL, + NH_PHYSICAL_OUT); + + { + char netidstr[12]; /* Large enough for INT_MAX + NUL */ + (void) snprintf(netidstr, sizeof (netidstr), "%d", + ifs->ifs_netid); + + /* + * The notify callbacks expect the netid value passed as a + * string in the third argument. To prevent confusion if + * traced, we pass the same value the nethook framework would + * pass, even though the callback does not currently use the + * value. + */ + (void) ipf_hook_instance_notify(HN_UNREGISTER, ifs, netidstr, + NULL, Hn_VIONA); + } + #ifdef IPFDEBUG cmn_err(CE_CONT, "ipldetach()\n"); #endif @@ -445,6 +494,21 @@ ipf_stack_t *ifs; goto hookup_failed; } + /* + * VIONA INET hooks. While the nethook framework allows us to register + * hooks for events that haven't been registered yet, we instead + * register and unregister our hooks in response to notifications + * about the viona hooks from the nethook framework. This prevents + * problems when the viona module gets unloaded while the ipf module + * does not. 
If we do not unregister our hooks after the viona module + * is unloaded, the viona module cannot later re-register them if it + * gets reloaded. As the ip, vnd, and ipf modules are rarely unloaded + * even on DEBUG kernels, they do not experience this issue. + */ + if (net_instance_notify_register(id, ipf_hook_instance_notify, + ifs) != 0) + goto hookup_failed; + /* * Reacquire ipf_global, now it is safe. */ @@ -508,6 +572,155 @@ hookup_failed: return -1; } +/* ------------------------------------------------------------------------ */ +/* + * Called whenever a nethook protocol is registered or unregistered. Currently + * only used to add or remove the hooks for viona. + * + * While the function signature requires returning int, nothing + * in usr/src/uts/common/io/hook.c that invokes the callbacks + * captures the return value (nor is there currently any documentation + * on what return values should be). For now at least, we'll return 0 + * on success (or 'not applicable') or an error value. Even if the + * nethook framework doesn't use the return value, it can be observed via + * dtrace if needed. + */ +static int +ipf_hook_protocol_notify(hook_notify_cmd_t command, void *arg, + const char *name, const char *dummy __unused, const char *he_name) +{ + ipf_stack_t *ifs = arg; + hook_t **hookpp; + char *hook_name, *hint_name; + hook_func_t hookfn; + boolean_t *hookedp; + hook_hint_t hint; + boolean_t out; + int ret = 0; + + const boolean_t gz = ifs->ifs_gz_controlled; + + /* We currently only care about viona hook notifications */ + if (strcmp(name, Hn_VIONA) != 0) + return (0); + + if (strcmp(he_name, NH_PHYSICAL_IN) == 0) { + out = B_FALSE; + } else if (strcmp(he_name, NH_PHYSICAL_OUT) == 0) { + out = B_TRUE; + } else { + /* + * If we've added more hook events to viona, we must add + * the corresponding handling here (even if it's just to + * ignore it) to prevent the firewall from not working as + * intended. + */ + cmn_err(CE_PANIC, "%s: unhandled hook event %s", __func__, + he_name); + + return (0); + } + + if (out) { + hookpp = &ifs->ifs_ipfhookviona_out; + hookfn = ipf_hookviona_out; + hookedp = &ifs->ifs_hookviona_physical_out; + name = gz ? hook_viona_out_gz : hook_viona_out; + hint = gz ? HH_AFTER : HH_BEFORE; + hint_name = gz ? hook_viona_out : hook_viona_out_gz; + } else { + hookpp = &ifs->ifs_ipfhookviona_in; + hookfn = ipf_hookviona_in; + hookedp = &ifs->ifs_hookviona_physical_in; + name = gz ? hook_viona_in_gz : hook_viona_in; + hint = gz ? HH_BEFORE : HH_AFTER; + hint_name = gz ? hook_viona_in : hook_viona_in_gz; + } + + switch (command) { + default: + case HN_NONE: + break; + case HN_REGISTER: + HOOK_INIT(*hookpp, hookfn, (char *)name, ifs); + (*hookpp)->h_hint = hint; + (*hookpp)->h_hintvalue = (uintptr_t)hint_name; + ret = net_hook_register(ifs->ifs_ipf_viona, + (char *)he_name, *hookpp); + if (ret != 0) { + cmn_err(CE_NOTE, "%s: could not register hook " + "(hook family=%s hook=%s) err=%d", __func__, + name, he_name, ret); + *hookedp = B_FALSE; + return (ret); + } + *hookedp = B_TRUE; + break; + case HN_UNREGISTER: + if (ifs->ifs_ipf_viona == NULL) + break; + + ret = *hookedp ? net_hook_unregister(ifs->ifs_ipf_viona, + (char *)he_name, *hookpp) : 0; + if ((ret == 0 || ret == ENXIO)) { + if (*hookpp != NULL) { + hook_free(*hookpp); + *hookpp = NULL; + } + *hookedp = B_FALSE; + } + break; + } + + return (ret); +} + +/* + * Called whenever a new nethook instance is created. Currently only used + * with the Hn_VIONA nethooks.
Similar to ipf_hook_protocol_notify, the + * function signature must return an int, though the result is never used. + * We elect to return 0 on success (or not applicable) or a non-zero value + * on error. + */ +static int +ipf_hook_instance_notify(hook_notify_cmd_t command, void *arg, + const char *netid, const char *dummy __unused, const char *instance) +{ + ipf_stack_t *ifs = arg; + int ret = 0; + + /* We currently only care about viona hooks */ + if (strcmp(instance, Hn_VIONA) != 0) + return (0); + + switch (command) { + case HN_NONE: + default: + return (0); + case HN_REGISTER: + ifs->ifs_ipf_viona = net_protocol_lookup(ifs->ifs_netid, + NHF_VIONA); + + if (ifs->ifs_ipf_viona == NULL) + return (EPROTONOSUPPORT); + + ret = net_protocol_notify_register(ifs->ifs_ipf_viona, + ipf_hook_protocol_notify, ifs); + VERIFY(ret == 0 || ret == ESHUTDOWN); + break; + case HN_UNREGISTER: + if (ifs->ifs_ipf_viona == NULL) + break; + VERIFY0(net_protocol_notify_unregister(ifs->ifs_ipf_viona, + ipf_hook_protocol_notify)); + VERIFY0(net_protocol_release(ifs->ifs_ipf_viona)); + ifs->ifs_ipf_viona = NULL; + break; + } + + return (ret); +} + static int fr_setipfloopback(set, ifs) int set; ipf_stack_t *ifs; @@ -2043,6 +2256,124 @@ int ipf_hook6_loop_out(hook_event_token_t token, hook_data_t info, void *arg) return ipf_hook6(info, 1, FI_NOCKSUM, arg); } +/* Static constants used by ipf_hook_ether */ +static uint8_t ipf_eth_bcast_addr[ETHERADDRL] = { + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF +}; +static uint8_t ipf_eth_ipv4_mcast[3] = { 0x01, 0x00, 0x5E }; +static uint8_t ipf_eth_ipv6_mcast[2] = { 0x33, 0x33 }; + +/* ------------------------------------------------------------------------ */ +/* Function: ipf_hook_ether */ +/* Returns: int - 0 == packet ok, else problem, free packet if not done */ +/* Parameters: token(I) - pointer to event */ +/* info(I) - pointer to hook information for firewalling */ +/* */ +/* The ipf_hook_ether hook is currently private to illumos. It represents */ +/* a layer 2 datapath generally used by virtual machines. Currently the */ +/* hook is only used by the viona driver to pass along L2 frames for */ +/* inspection. It requires that the L2 ethernet header is contained within */ +/* a single dblk_t (however layers above the L2 header have no restrictions */ +/* in ipf). ipf does not currently support filtering on L2 fields (e.g. */ +/* filtering on a MAC address or ethertype), however virtual machines do */ +/* not have native IP stack instances where ipf traditionally hooks in. */ +/* Instead this entry point is used to determine if the packet is unicast, */ +/* broadcast, or multicast. The IPv4 or IPv6 packet is then passed to the */ +/* traditional ip hooks for filtering. Non-IPv4 or non-IPv6 packets are */ +/* not subject to examination. */ +/* ------------------------------------------------------------------------ */ +int ipf_hook_ether(hook_event_token_t token, hook_data_t info, void *arg, + boolean_t out) +{ + struct ether_header *ethp; + hook_pkt_event_t *hpe = (hook_pkt_event_t *)info; + mblk_t *mp; + size_t offset, len; + uint16_t etype; + boolean_t v6; + + /* + * viona will only pass us mblks with the L2 header contained in a + * single data block.
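+ * The VERIFY3S() checks below rely on that invariant.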
+ */ + mp = *hpe->hpe_mp; + len = MBLKL(mp); + + VERIFY3S(len, >=, sizeof (struct ether_header)); + + ethp = (struct ether_header *)mp->b_rptr; + if ((etype = ntohs(ethp->ether_type)) == ETHERTYPE_VLAN) { + struct ether_vlan_header *evh = + (struct ether_vlan_header *)ethp; + + VERIFY3S(len, >=, sizeof (struct ether_vlan_header)); + + etype = ntohs(evh->ether_type); + offset = sizeof (*evh); + } else { + offset = sizeof (*ethp); + } + + /* + * ipf only supports filtering IPv4 and IPv6. Ignore other types. + */ + if (etype == ETHERTYPE_IP) + v6 = B_FALSE; + else if (etype == ETHERTYPE_IPV6) + v6 = B_TRUE; + else + return (0); + + if (bcmp(ipf_eth_bcast_addr, ethp, ETHERADDRL) == 0) + hpe->hpe_flags |= HPE_BROADCAST; + else if (bcmp(ipf_eth_ipv4_mcast, ethp, + sizeof (ipf_eth_ipv4_mcast)) == 0) + hpe->hpe_flags |= HPE_MULTICAST; + else if (bcmp(ipf_eth_ipv6_mcast, ethp, + sizeof (ipf_eth_ipv6_mcast)) == 0) + hpe->hpe_flags |= HPE_MULTICAST; + + /* Find the start of the IPv4 or IPv6 header */ + for (; offset >= len; len = MBLKL(mp)) { + offset -= len; + mp = mp->b_cont; + if (mp == NULL) { + freemsg(*hpe->hpe_mp); + *hpe->hpe_mp = NULL; + return (-1); + } + } + hpe->hpe_mb = mp; + hpe->hpe_hdr = mp->b_rptr + offset; + + return (v6 ? ipf_hook6(info, out, 0, arg) : + ipf_hook(info, out, 0, arg)); +} + +/* ------------------------------------------------------------------------ */ +/* Function: ipf_hookviona_{in,out} */ +/* Returns: int - 0 == packet ok, else problem, free packet if not done */ +/* Parameters: event(I) - pointer to event */ +/* info(I) - pointer to hook information for firewalling */ +/* */ +/* The viona hooks are private hooks to illumos. They represent a layer 2 */ +/* datapath generally used to implement virtual machines, passing */ +/* along L2 packets. */ +/* */ +/* They end up calling the appropriate traditional ip hooks. */ +/* ------------------------------------------------------------------------ */ +int +ipf_hookviona_in(hook_event_token_t token, hook_data_t info, void *arg) +{ + return (ipf_hook_ether(token, info, arg, B_FALSE)); +} + +int +ipf_hookviona_out(hook_event_token_t token, hook_data_t info, void *arg) +{ + return (ipf_hook_ether(token, info, arg, B_TRUE)); +} + /* ------------------------------------------------------------------------ */ /* Function: ipf_hook4_loop_in */ /* Returns: int - 0 == packet ok, else problem, free packet if not done */ /* Parameters: event(I) - pointer to event */ /* info(I) - pointer to hook information for firewalling */ @@ -2386,7 +2717,7 @@ fr_info_t *fin; #ifdef USE_INET6 struct in6_addr tmp_src6; #endif - + ASSERT(fin->fin_p == IPPROTO_TCP); /* @@ -2428,7 +2759,7 @@ fr_info_t *fin; #endif if (tcp != NULL) { - /* + /* * Adjust TCP header: * swap ports, * set flags, diff --git a/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h b/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h index a239f1c1ca..0ceea1e921 100644 --- a/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h +++ b/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h @@ -6,7 +6,7 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * - * Copyright 2014 Joyent, Inc. All rights reserved. + * Copyright 2018 Joyent, Inc. All rights reserved.
*/ #ifndef __IPF_STACK_H__ @@ -87,8 +87,8 @@ struct ipf_stack { #endif int ifs_ipf_locks_done; - ipftoken_t *ifs_ipftokenhead; - ipftoken_t **ifs_ipftokentail; + ipftoken_t *ifs_ipftokenhead; + ipftoken_t **ifs_ipftokentail; ipfmutex_t ifs_ipl_mutex; ipfmutex_t ifs_ipf_authmx; @@ -126,6 +126,9 @@ struct ipf_stack { hook_t *ifs_ipfhook6_loop_out; hook_t *ifs_ipfhook6_nicevents; + hook_t *ifs_ipfhookviona_in; + hook_t *ifs_ipfhookviona_out; + /* flags to indicate whether hooks are registered. */ boolean_t ifs_hook4_physical_in; boolean_t ifs_hook4_physical_out; @@ -137,10 +140,13 @@ struct ipf_stack { boolean_t ifs_hook6_nic_events; boolean_t ifs_hook6_loopback_in; boolean_t ifs_hook6_loopback_out; + boolean_t ifs_hookviona_physical_in; + boolean_t ifs_hookviona_physical_out; int ifs_ipf_loopback; net_handle_t ifs_ipf_ipv4; net_handle_t ifs_ipf_ipv6; + net_handle_t ifs_ipf_viona; /* ip_auth.c */ int ifs_fr_authsize; @@ -167,8 +173,8 @@ struct ipf_stack { ipfr_t **ifs_ipfr_nattail; ipfr_t **ifs_ipfr_nattab; - ipfr_t *ifs_ipfr_ipidlist; - ipfr_t **ifs_ipfr_ipidtail; + ipfr_t *ifs_ipfr_ipidlist; + ipfr_t **ifs_ipfr_ipidtail; ipfr_t **ifs_ipfr_ipidtab; ipfrstat_t ifs_ipfr_stats; diff --git a/usr/src/uts/common/io/hook.c b/usr/src/uts/common/io/hook.c index eb139a37e2..44af26e7c4 100644 --- a/usr/src/uts/common/io/hook.c +++ b/usr/src/uts/common/io/hook.c @@ -1050,7 +1050,7 @@ hook_family_free(hook_family_int_t *hfi, hook_stack_t *hks) /* Free container */ kmem_free(hfi, sizeof (*hfi)); - if (hks->hks_shutdown == 2) + if (hks != NULL && hks->hks_shutdown == 2) hook_stack_remove(hks); mutex_exit(&hook_stack_lock); diff --git a/usr/src/uts/common/sys/dlpi.h b/usr/src/uts/common/sys/dlpi.h index 5bc2bd41c5..54aad9307a 100644 --- a/usr/src/uts/common/sys/dlpi.h +++ b/usr/src/uts/common/sys/dlpi.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2018 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -673,11 +674,11 @@ typedef struct { #define HCKSUM_ENABLE 0x01 /* Set to enable hardware checksum */ /* capability */ #define HCKSUM_INET_PARTIAL 0x02 /* Partial 1's complement checksum */ - /* ability */ + /* ability for TCP/UDP packets. */ #define HCKSUM_INET_FULL_V4 0x04 /* Full 1's complement checksum */ - /* ability for IPv4 packets. */ + /* ability for IPv4 TCP/UDP packets. */ #define HCKSUM_INET_FULL_V6 0x08 /* Full 1's complement checksum */ - /* ability for IPv6 packets. */ + /* ability for IPv6 TCP/UDP packets. */ #define HCKSUM_IPHDRCKSUM 0x10 /* IPv4 Header checksum offload */ /* capability */ #ifdef _KERNEL diff --git a/usr/src/uts/common/sys/hook_impl.h b/usr/src/uts/common/sys/hook_impl.h index d8a15f0fe5..f3337bbacf 100644 --- a/usr/src/uts/common/sys/hook_impl.h +++ b/usr/src/uts/common/sys/hook_impl.h @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018, Joyent, Inc. 
*/ /* @@ -171,7 +172,7 @@ typedef struct hook_family_int { cvwaitlock_t hfi_lock; SLIST_ENTRY(hook_family_int) hfi_entry; hook_event_int_head_t hfi_head; - hook_family_t hfi_family; + hook_family_t hfi_family; kstat_t *hfi_kstat; struct hook_stack *hfi_stack; hook_notify_head_t hfi_nhead; @@ -209,6 +210,7 @@ typedef struct hook_stack_head hook_stack_head_t; #define Hn_ARP "arp" #define Hn_IPV4 "inet" #define Hn_IPV6 "inet6" +#define Hn_VIONA "viona_inet" extern int hook_run(hook_family_int_t *, hook_event_token_t, hook_data_t); extern int hook_register(hook_family_int_t *, char *, hook_t *); diff --git a/usr/src/uts/common/sys/neti.h b/usr/src/uts/common/sys/neti.h index b21504109c..e7027f8ece 100644 --- a/usr/src/uts/common/sys/neti.h +++ b/usr/src/uts/common/sys/neti.h @@ -21,6 +21,8 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2018, Joyent, Inc. */ #ifndef _SYS_NETI_H @@ -46,6 +48,7 @@ struct msgb; /* avoiding sys/stream.h here */ #define NHF_INET "NHF_INET" #define NHF_INET6 "NHF_INET6" #define NHF_ARP "NHF_ARP" +#define NHF_VIONA "NHF_VIONA" /* * Event identification @@ -61,7 +64,7 @@ struct msgb; /* avoiding sys/stream.h here */ /* * Network NIC hardware checksum capability */ -#define NET_HCK_NONE 0x00 +#define NET_HCK_NONE 0x00 #define NET_HCK_L3_FULL 0x01 #define NET_HCK_L3_PART 0x02 #define NET_HCK_L4_FULL 0x10 diff --git a/usr/src/uts/i86pc/Makefile.files b/usr/src/uts/i86pc/Makefile.files index ca4ae0cd65..312c0f233d 100644 --- a/usr/src/uts/i86pc/Makefile.files +++ b/usr/src/uts/i86pc/Makefile.files @@ -276,7 +276,11 @@ VMM_OBJS += vmm.o \ vmm_support.o \ vmm_zsd.o -VIONA_OBJS += viona.o +VIONA_OBJS += viona_main.o \ + viona_ring.o \ + viona_rx.o \ + viona_tx.o \ + viona_hook.o \ # # Build up defines and paths. diff --git a/usr/src/uts/i86pc/Makefile.i86pc b/usr/src/uts/i86pc/Makefile.i86pc index b66b0ca2da..b60d24d82c 100644 --- a/usr/src/uts/i86pc/Makefile.i86pc +++ b/usr/src/uts/i86pc/Makefile.i86pc @@ -247,6 +247,7 @@ DRV_KMODS += ioat DRV_KMODS += fipe DRV_KMODS += imc imcstub DRV_KMODS += vmm +DRV_KMODS += viona DRV_KMODS += cpudrv diff --git a/usr/src/uts/i86pc/io/viona/viona.c b/usr/src/uts/i86pc/io/viona/viona.c deleted file mode 100644 index 2371a2f3ae..0000000000 --- a/usr/src/uts/i86pc/io/viona/viona.c +++ /dev/null @@ -1,1409 +0,0 @@ -/* - * Copyright (c) 2013 Chris Torek - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * Copyright 2015 Pluribus Networks Inc. - * Copyright 2017 Joyent, Inc. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -#define MB (1024UL * 1024) -#define GB (1024UL * MB) - -/* - * Min. octets in an ethernet frame minus FCS - */ -#define MIN_BUF_SIZE 60 - -#define VIONA_NAME "Virtio Network Accelerator" - -#define VIONA_CTL_MINOR 0 -#define VIONA_CTL_NODE_NAME "ctl" - -#define VIONA_CLI_NAME "viona" - -#define VTNET_MAXSEGS 32 - -#define VRING_ALIGN 4096 - -#define VRING_DESC_F_NEXT (1 << 0) -#define VRING_DESC_F_WRITE (1 << 1) -#define VRING_DESC_F_INDIRECT (1 << 2) - -#define VRING_AVAIL_F_NO_INTERRUPT 1 - -#define VRING_USED_F_NO_NOTIFY 1 - -#define BCM_NIC_DRIVER "bnxe" -/* - * Host capabilities - */ -#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */ -#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */ -#define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */ - -#define VIONA_S_HOSTCAPS \ - (VIRTIO_NET_F_MAC | VIRTIO_NET_F_MRG_RXBUF | \ - VIRTIO_NET_F_STATUS) - -#pragma pack(1) -struct virtio_desc { - uint64_t vd_addr; - uint32_t vd_len; - uint16_t vd_flags; - uint16_t vd_next; -}; -#pragma pack() - -#pragma pack(1) -struct virtio_used { - uint32_t vu_idx; - uint32_t vu_tlen; -}; -#pragma pack() - -#pragma pack(1) -struct virtio_net_mrgrxhdr { - uint8_t vrh_flags; - uint8_t vrh_gso_type; - uint16_t vrh_hdr_len; - uint16_t vrh_gso_size; - uint16_t vrh_csum_start; - uint16_t vrh_csum_offset; - uint16_t vrh_bufs; -}; -struct virtio_net_hdr { - uint8_t vrh_flags; - uint8_t vrh_gso_type; - uint16_t vrh_hdr_len; - uint16_t vrh_gso_size; - uint16_t vrh_csum_start; - uint16_t vrh_csum_offset; -}; -#pragma pack() - -typedef struct viona_vring_hqueue { - /* Internal state */ - uint16_t hq_size; - kmutex_t hq_a_mutex; - kmutex_t hq_u_mutex; - uint16_t hq_cur_aidx; /* trails behind 'avail_idx' */ - - /* Host-context pointers to the queue */ - caddr_t hq_baseaddr; - uint16_t *hq_avail_flags; - uint16_t *hq_avail_idx; /* monotonically increasing */ - uint16_t *hq_avail_ring; - - uint16_t *hq_used_flags; - uint16_t *hq_used_idx; /* monotonically increasing */ - struct virtio_used *hq_used_ring; -} viona_vring_hqueue_t; - - -typedef struct viona_link { - datalink_id_t l_linkid; - - struct vm *l_vm; - size_t l_vm_lomemsize; - caddr_t l_vm_lomemaddr; - size_t l_vm_himemsize; - caddr_t l_vm_himemaddr; - - mac_handle_t l_mh; - mac_client_handle_t l_mch; - - kmem_cache_t *l_desb_kmc; - - pollhead_t l_pollhead; - - 
viona_vring_hqueue_t l_rx_vring; - uint_t l_rx_intr; - - viona_vring_hqueue_t l_tx_vring; - kcondvar_t l_tx_cv; - uint_t l_tx_intr; - kmutex_t l_tx_mutex; - int l_tx_outstanding; - uint32_t l_features; -} viona_link_t; - -typedef struct { - frtn_t d_frtn; - viona_link_t *d_link; - uint_t d_ref; - uint16_t d_cookie; - int d_len; -} viona_desb_t; - -typedef struct viona_soft_state { - viona_link_t *ss_link; -} viona_soft_state_t; - -typedef struct used_elem { - uint16_t id; - uint32_t len; -} used_elem_t; - -static void *viona_state; -static dev_info_t *viona_dip; -static id_space_t *viona_minor_ids; -/* - * copy tx mbufs from virtio ring to avoid necessitating a wait for packet - * transmission to free resources. - */ -static boolean_t copy_tx_mblks = B_TRUE; - -extern struct vm *vm_lookup_by_name(char *name); -extern uint64_t vm_gpa2hpa(struct vm *vm, uint64_t gpa, size_t len); - -static int viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); -static int viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); -static int viona_open(dev_t *devp, int flag, int otype, cred_t *credp); -static int viona_close(dev_t dev, int flag, int otype, cred_t *credp); -static int viona_ioctl(dev_t dev, int cmd, intptr_t data, int mode, - cred_t *credp, int *rval); -static int viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp, - struct pollhead **phpp); - -static int viona_ioc_create(viona_soft_state_t *ss, vioc_create_t *u_create); -static int viona_ioc_delete(viona_soft_state_t *ss); - -static int viona_vm_map(viona_link_t *link); -static caddr_t viona_gpa2kva(viona_link_t *link, uint64_t gpa); -static void viona_vm_unmap(viona_link_t *link); - -static int viona_ioc_rx_ring_init(viona_link_t *link, - vioc_ring_init_t *u_ri); -static int viona_ioc_tx_ring_init(viona_link_t *link, - vioc_ring_init_t *u_ri); -static int viona_ioc_rx_ring_reset(viona_link_t *link); -static int viona_ioc_tx_ring_reset(viona_link_t *link); -static void viona_ioc_rx_ring_kick(viona_link_t *link); -static void viona_ioc_tx_ring_kick(viona_link_t *link); -static int viona_ioc_rx_intr_clear(viona_link_t *link); -static int viona_ioc_tx_intr_clear(viona_link_t *link); - -static void viona_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp, - boolean_t loopback); -static void viona_tx(viona_link_t *link, viona_vring_hqueue_t *hq); - -static struct cb_ops viona_cb_ops = { - viona_open, - viona_close, - nodev, - nodev, - nodev, - nodev, - nodev, - viona_ioctl, - nodev, - nodev, - nodev, - viona_chpoll, - ddi_prop_op, - 0, - D_MP | D_NEW | D_HOTPLUG, - CB_REV, - nodev, - nodev -}; - -static struct dev_ops viona_ops = { - DEVO_REV, - 0, - nodev, - nulldev, - nulldev, - viona_attach, - viona_detach, - nodev, - &viona_cb_ops, - NULL, - ddi_power, - ddi_quiesce_not_needed -}; - -static struct modldrv modldrv = { - &mod_driverops, - VIONA_NAME, - &viona_ops, -}; - -static struct modlinkage modlinkage = { - MODREV_1, &modldrv, NULL -}; - -int -_init(void) -{ - int ret; - - ret = ddi_soft_state_init(&viona_state, - sizeof (viona_soft_state_t), 0); - if (ret == 0) { - ret = mod_install(&modlinkage); - if (ret != 0) { - ddi_soft_state_fini(&viona_state); - return (ret); - } - } - - return (ret); -} - -int -_fini(void) -{ - int ret; - - ret = mod_remove(&modlinkage); - if (ret == 0) { - ddi_soft_state_fini(&viona_state); - } - - return (ret); -} - -int -_info(struct modinfo *modinfop) -{ - return (mod_info(&modlinkage, modinfop)); -} - -static void -set_viona_tx_mode() -{ - major_t bcm_nic_major; - if ((bcm_nic_major = 
ddi_name_to_major(BCM_NIC_DRIVER)) - != DDI_MAJOR_T_NONE) { - if (ddi_hold_installed_driver(bcm_nic_major) != NULL) { - copy_tx_mblks = B_FALSE; - ddi_rele_driver(bcm_nic_major); - } - } -} - -static int -viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) -{ - if (cmd != DDI_ATTACH) { - return (DDI_FAILURE); - } - - viona_minor_ids = id_space_create("viona_minor_id", - VIONA_CTL_MINOR + 1, UINT16_MAX); - - if (ddi_create_minor_node(dip, VIONA_CTL_NODE_NAME, - S_IFCHR, VIONA_CTL_MINOR, DDI_PSEUDO, 0) != DDI_SUCCESS) { - return (DDI_FAILURE); - } - - viona_dip = dip; - - set_viona_tx_mode(); - ddi_report_dev(viona_dip); - - return (DDI_SUCCESS); -} - -static int -viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) -{ - if (cmd != DDI_DETACH) { - return (DDI_FAILURE); - } - - id_space_destroy(viona_minor_ids); - - ddi_remove_minor_node(viona_dip, NULL); - - viona_dip = NULL; - - return (DDI_SUCCESS); -} - -static int -viona_open(dev_t *devp, int flag, int otype, cred_t *credp) -{ - int minor; - - if (otype != OTYP_CHR) { - return (EINVAL); - } - - if (drv_priv(credp) != 0) { - return (EPERM); - } - - if (getminor(*devp) != VIONA_CTL_MINOR) { - return (ENXIO); - } - - minor = id_alloc(viona_minor_ids); - if (minor == 0) { - /* All minors are busy */ - return (EBUSY); - } - - if (ddi_soft_state_zalloc(viona_state, minor) != DDI_SUCCESS) { - id_free(viona_minor_ids, minor); - } - - *devp = makedevice(getmajor(*devp), minor); - - return (0); -} - -static int -viona_close(dev_t dev, int flag, int otype, cred_t *credp) -{ - int minor; - viona_soft_state_t *ss; - - if (otype != OTYP_CHR) { - return (EINVAL); - } - - if (drv_priv(credp) != 0) { - return (EPERM); - } - - minor = getminor(dev); - - ss = ddi_get_soft_state(viona_state, minor); - if (ss == NULL) { - return (ENXIO); - } - - viona_ioc_delete(ss); - - ddi_soft_state_free(viona_state, minor); - - id_free(viona_minor_ids, minor); - - return (0); -} - -static int -viona_ioctl(dev_t dev, int cmd, intptr_t data, int mode, - cred_t *credp, int *rval) -{ - viona_soft_state_t *ss; - int err = 0; - - ss = ddi_get_soft_state(viona_state, getminor(dev)); - if (ss == NULL) { - return (ENXIO); - } - - switch (cmd) { - case VNA_IOC_CREATE: - err = viona_ioc_create(ss, (vioc_create_t *)data); - break; - case VNA_IOC_DELETE: - err = viona_ioc_delete(ss); - break; - case VNA_IOC_SET_FEATURES: - if (ss->ss_link == NULL) { - return (ENOSYS); - } - ss->ss_link->l_features = *(int *)data & VIONA_S_HOSTCAPS; - break; - case VNA_IOC_GET_FEATURES: - if (ss->ss_link == NULL) { - return (ENOSYS); - } - *(int *)data = VIONA_S_HOSTCAPS; - break; - case VNA_IOC_RX_RING_INIT: - if (ss->ss_link == NULL) { - return (ENOSYS); - } - err = viona_ioc_rx_ring_init(ss->ss_link, - (vioc_ring_init_t *)data); - break; - case VNA_IOC_RX_RING_RESET: - if (ss->ss_link == NULL) { - return (ENOSYS); - } - err = viona_ioc_rx_ring_reset(ss->ss_link); - break; - case VNA_IOC_RX_RING_KICK: - if (ss->ss_link == NULL) { - return (ENOSYS); - } - viona_ioc_rx_ring_kick(ss->ss_link); - err = 0; - break; - case VNA_IOC_TX_RING_INIT: - if (ss->ss_link == NULL) { - return (ENOSYS); - } - err = viona_ioc_tx_ring_init(ss->ss_link, - (vioc_ring_init_t *)data); - break; - case VNA_IOC_TX_RING_RESET: - if (ss->ss_link == NULL) { - return (ENOSYS); - } - err = viona_ioc_tx_ring_reset(ss->ss_link); - break; - case VNA_IOC_TX_RING_KICK: - if (ss->ss_link == NULL) { - return (ENOSYS); - } - viona_ioc_tx_ring_kick(ss->ss_link); - err = 0; - break; - case VNA_IOC_RX_INTR_CLR: - if (ss->ss_link == NULL) { - 
return (ENOSYS); - } - err = viona_ioc_rx_intr_clear(ss->ss_link); - break; - case VNA_IOC_TX_INTR_CLR: - if (ss->ss_link == NULL) { - return (ENOSYS); - } - err = viona_ioc_tx_intr_clear(ss->ss_link); - break; - default: - err = ENOTTY; - break; - } - - return (err); -} - -static int -viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp, - struct pollhead **phpp) -{ - viona_soft_state_t *ss; - - ss = ddi_get_soft_state(viona_state, getminor(dev)); - if (ss == NULL || ss->ss_link == NULL) { - return (ENXIO); - } - - *reventsp = 0; - - if (ss->ss_link->l_rx_intr && (events & POLLIN)) { - *reventsp |= POLLIN; - } - - if (ss->ss_link->l_tx_intr && (events & POLLOUT)) { - *reventsp |= POLLOUT; - } - - if (*reventsp == 0 && !anyyet) { - *phpp = &ss->ss_link->l_pollhead; - } - - return (0); -} - -static int -viona_ioc_create(viona_soft_state_t *ss, vioc_create_t *u_create) -{ - vioc_create_t k_create; - viona_link_t *link; - char cli_name[MAXNAMELEN]; - int err; - - if (ss->ss_link != NULL) { - return (ENOSYS); - } - if (copyin(u_create, &k_create, sizeof (k_create)) != 0) { - return (EFAULT); - } - - link = kmem_zalloc(sizeof (viona_link_t), KM_SLEEP); - - link->l_linkid = k_create.c_linkid; - link->l_vm = vm_lookup_by_name(k_create.c_vmname); - if (link->l_vm == NULL) { - err = ENXIO; - goto bail; - } - - link->l_vm_lomemsize = k_create.c_lomem_size; - link->l_vm_himemsize = k_create.c_himem_size; - err = viona_vm_map(link); - if (err != 0) { - goto bail; - } - - err = mac_open_by_linkid(link->l_linkid, &link->l_mh); - if (err != 0) { - cmn_err(CE_WARN, "viona create mac_open_by_linkid" - " returned %d\n", err); - goto bail; - } - - snprintf(cli_name, sizeof (cli_name), "%s-%d", - VIONA_CLI_NAME, link->l_linkid); - err = mac_client_open(link->l_mh, &link->l_mch, cli_name, 0); - if (err != 0) { - cmn_err(CE_WARN, "viona create mac_client_open" - " returned %d\n", err); - goto bail; - } - - link->l_features = VIONA_S_HOSTCAPS; - link->l_desb_kmc = kmem_cache_create(cli_name, - sizeof (viona_desb_t), 0, NULL, NULL, NULL, NULL, NULL, 0); - - mutex_init(&link->l_rx_vring.hq_a_mutex, NULL, MUTEX_DRIVER, NULL); - mutex_init(&link->l_rx_vring.hq_u_mutex, NULL, MUTEX_DRIVER, NULL); - mutex_init(&link->l_rx_vring.hq_a_mutex, NULL, MUTEX_DRIVER, NULL); - mutex_init(&link->l_tx_vring.hq_u_mutex, NULL, MUTEX_DRIVER, NULL); - if (copy_tx_mblks) { - mutex_init(&link->l_tx_mutex, NULL, MUTEX_DRIVER, NULL); - cv_init(&link->l_tx_cv, NULL, CV_DRIVER, NULL); - } - ss->ss_link = link; - - return (0); - -bail: - if (link->l_mch != NULL) { - mac_client_close(link->l_mch, 0); - } - if (link->l_mh != NULL) { - mac_close(link->l_mh); - } - - kmem_free(link, sizeof (viona_link_t)); - - return (err); -} - -static int -viona_ioc_delete(viona_soft_state_t *ss) -{ - viona_link_t *link; - - link = ss->ss_link; - if (link == NULL) { - return (ENOSYS); - } - if (copy_tx_mblks) { - mutex_enter(&link->l_tx_mutex); - while (link->l_tx_outstanding != 0) { - cv_wait(&link->l_tx_cv, &link->l_tx_mutex); - } - mutex_exit(&link->l_tx_mutex); - } - if (link->l_mch != NULL) { - mac_rx_clear(link->l_mch); - mac_client_close(link->l_mch, 0); - } - if (link->l_mh != NULL) { - mac_close(link->l_mh); - } - - viona_vm_unmap(link); - mutex_destroy(&link->l_tx_vring.hq_a_mutex); - mutex_destroy(&link->l_tx_vring.hq_u_mutex); - mutex_destroy(&link->l_rx_vring.hq_a_mutex); - mutex_destroy(&link->l_rx_vring.hq_u_mutex); - if (copy_tx_mblks) { - mutex_destroy(&link->l_tx_mutex); - cv_destroy(&link->l_tx_cv); - } - - 
kmem_cache_destroy(link->l_desb_kmc); - - kmem_free(link, sizeof (viona_link_t)); - - ss->ss_link = NULL; - - return (0); -} - -static caddr_t -viona_mapin_vm_chunk(viona_link_t *link, uint64_t gpa, size_t len) -{ - caddr_t addr; - size_t offset; - pfn_t pfnum; - - if (len == 0) - return (NULL); - - addr = vmem_alloc(heap_arena, len, VM_SLEEP); - if (addr == NULL) - return (NULL); - - for (offset = 0; offset < len; offset += PAGESIZE) { - pfnum = btop(vm_gpa2hpa(link->l_vm, gpa + offset, PAGESIZE)); - ASSERT(pfnum); - hat_devload(kas.a_hat, addr + offset, PAGESIZE, pfnum, - PROT_READ | PROT_WRITE, HAT_LOAD_LOCK); - } - - return (addr); -} - -/* - * Map the guest physical address space into the kernel virtual address space. - */ -static int -viona_vm_map(viona_link_t *link) -{ - link->l_vm_lomemaddr = viona_mapin_vm_chunk(link, - 0, link->l_vm_lomemsize); - if (link->l_vm_lomemaddr == NULL) - return (-1); - link->l_vm_himemaddr = viona_mapin_vm_chunk(link, - 4 * (1024 * 1024 * 1024UL), link->l_vm_himemsize); - if (link->l_vm_himemsize && link->l_vm_himemaddr == NULL) - return (-1); - - return (0); -} - -/* - * Translate a guest physical address into a kernel virtual address. - */ -static caddr_t -viona_gpa2kva(viona_link_t *link, uint64_t gpa) -{ - if (gpa < link->l_vm_lomemsize) - return (link->l_vm_lomemaddr + gpa); - - gpa -= (4 * GB); - if (gpa < link->l_vm_himemsize) - return (link->l_vm_himemaddr + gpa); - - return (NULL); -} - -static void -viona_vm_unmap(viona_link_t *link) -{ - if (link->l_vm_lomemaddr) { - hat_unload(kas.a_hat, link->l_vm_lomemaddr, - link->l_vm_lomemsize, HAT_UNLOAD_UNLOCK); - vmem_free(heap_arena, link->l_vm_lomemaddr, - link->l_vm_lomemsize); - } - if (link->l_vm_himemaddr) { - hat_unload(kas.a_hat, link->l_vm_himemaddr, - link->l_vm_himemsize, HAT_UNLOAD_UNLOCK); - vmem_free(heap_arena, link->l_vm_himemaddr, - link->l_vm_himemsize); - } -} - -static int -viona_ioc_ring_init_common(viona_link_t *link, viona_vring_hqueue_t *hq, - vioc_ring_init_t *u_ri) -{ - vioc_ring_init_t k_ri; - - if (copyin(u_ri, &k_ri, sizeof (k_ri)) != 0) { - return (EFAULT); - } - - hq->hq_size = k_ri.ri_qsize; - hq->hq_baseaddr = viona_gpa2kva(link, k_ri.ri_qaddr); - if (hq->hq_baseaddr == NULL) - return (EINVAL); - - hq->hq_avail_flags = (uint16_t *)(viona_gpa2kva(link, - k_ri.ri_qaddr + hq->hq_size * sizeof (struct virtio_desc))); - if (hq->hq_avail_flags == NULL) - return (EINVAL); - hq->hq_avail_idx = hq->hq_avail_flags + 1; - hq->hq_avail_ring = hq->hq_avail_flags + 2; - - hq->hq_used_flags = (uint16_t *)(viona_gpa2kva(link, - P2ROUNDUP(k_ri.ri_qaddr + - hq->hq_size * sizeof (struct virtio_desc) + 2, VRING_ALIGN))); - if (hq->hq_used_flags == NULL) - return (EINVAL); - hq->hq_used_idx = hq->hq_used_flags + 1; - hq->hq_used_ring = (struct virtio_used *)(hq->hq_used_flags + 2); - - /* - * Initialize queue indexes - */ - hq->hq_cur_aidx = 0; - - return (0); -} - -static int -viona_ioc_rx_ring_init(viona_link_t *link, vioc_ring_init_t *u_ri) -{ - viona_vring_hqueue_t *hq; - int rval; - - hq = &link->l_rx_vring; - - rval = viona_ioc_ring_init_common(link, hq, u_ri); - if (rval != 0) { - return (rval); - } - - return (0); -} - -static int -viona_ioc_tx_ring_init(viona_link_t *link, vioc_ring_init_t *u_ri) -{ - viona_vring_hqueue_t *hq; - - hq = &link->l_tx_vring; - - return (viona_ioc_ring_init_common(link, hq, u_ri)); -} - -static int -viona_ioc_ring_reset_common(viona_vring_hqueue_t *hq) -{ - /* - * Reset all soft state - */ - hq->hq_cur_aidx = 0; - - return (0); -} - -static int 
-viona_ioc_rx_ring_reset(viona_link_t *link) -{ - viona_vring_hqueue_t *hq; - - mac_rx_clear(link->l_mch); - - hq = &link->l_rx_vring; - - return (viona_ioc_ring_reset_common(hq)); -} - -static int -viona_ioc_tx_ring_reset(viona_link_t *link) -{ - viona_vring_hqueue_t *hq; - - hq = &link->l_tx_vring; - - return (viona_ioc_ring_reset_common(hq)); -} - -static void -viona_ioc_rx_ring_kick(viona_link_t *link) -{ - viona_vring_hqueue_t *hq = &link->l_rx_vring; - - atomic_or_16(hq->hq_used_flags, VRING_USED_F_NO_NOTIFY); - - mac_rx_set(link->l_mch, viona_rx, link); -} - -/* - * Return the number of available descriptors in the vring taking care - * of the 16-bit index wraparound. - */ -static inline int -viona_hq_num_avail(viona_vring_hqueue_t *hq) -{ - uint16_t ndesc; - - /* - * We're just computing (a-b) in GF(216). - * - * The only glitch here is that in standard C, - * uint16_t promotes to (signed) int when int has - * more than 16 bits (pretty much always now), so - * we have to force it back to unsigned. - */ - ndesc = (unsigned)*hq->hq_avail_idx - (unsigned)hq->hq_cur_aidx; - - ASSERT(ndesc <= hq->hq_size); - - return (ndesc); -} - -static void -viona_ioc_tx_ring_kick(viona_link_t *link) -{ - viona_vring_hqueue_t *hq = &link->l_tx_vring; - - do { - atomic_or_16(hq->hq_used_flags, VRING_USED_F_NO_NOTIFY); - while (viona_hq_num_avail(hq)) { - viona_tx(link, hq); - } - if (copy_tx_mblks) { - mutex_enter(&link->l_tx_mutex); - if (link->l_tx_outstanding != 0) { - cv_wait_sig(&link->l_tx_cv, &link->l_tx_mutex); - } - mutex_exit(&link->l_tx_mutex); - } - atomic_and_16(hq->hq_used_flags, ~VRING_USED_F_NO_NOTIFY); - } while (viona_hq_num_avail(hq)); -} - -static int -viona_ioc_rx_intr_clear(viona_link_t *link) -{ - link->l_rx_intr = 0; - - return (0); -} - -static int -viona_ioc_tx_intr_clear(viona_link_t *link) -{ - link->l_tx_intr = 0; - - return (0); -} -#define VQ_MAX_DESCRIPTORS 512 - -static int -vq_popchain(viona_link_t *link, viona_vring_hqueue_t *hq, struct iovec *iov, - int n_iov, uint16_t *cookie) -{ - int i; - int ndesc, nindir; - int idx, head, next; - struct virtio_desc *vdir, *vindir, *vp; - - idx = hq->hq_cur_aidx; - ndesc = (uint16_t)((unsigned)*hq->hq_avail_idx - (unsigned)idx); - - if (ndesc == 0) - return (0); - if (ndesc > hq->hq_size) { - cmn_err(CE_NOTE, "ndesc (%d) out of range\n", ndesc); - return (-1); - } - - head = hq->hq_avail_ring[idx & (hq->hq_size - 1)]; - next = head; - - for (i = 0; i < VQ_MAX_DESCRIPTORS; next = vdir->vd_next) { - if (next >= hq->hq_size) { - cmn_err(CE_NOTE, "descriptor index (%d)" - "out of range\n", next); - return (-1); - } - - vdir = (struct virtio_desc *)(hq->hq_baseaddr + - next * sizeof (struct virtio_desc)); - if ((vdir->vd_flags & VRING_DESC_F_INDIRECT) == 0) { - if (i > n_iov) - return (-1); - iov[i].iov_base = viona_gpa2kva(link, vdir->vd_addr); - if (iov[i].iov_base == NULL) { - cmn_err(CE_NOTE, "invalid guest physical" - " address 0x%"PRIx64"\n", vdir->vd_addr); - return (-1); - } - iov[i++].iov_len = vdir->vd_len; - } else { - nindir = vdir->vd_len / 16; - if ((vdir->vd_len & 0xf) || nindir == 0) { - cmn_err(CE_NOTE, "invalid indir len 0x%x\n", - vdir->vd_len); - return (-1); - } - vindir = (struct virtio_desc *) - viona_gpa2kva(link, vdir->vd_addr); - if (vindir == NULL) { - cmn_err(CE_NOTE, "invalid guest physical" - " address 0x%"PRIx64"\n", vdir->vd_addr); - return (-1); - } - next = 0; - for (;;) { - vp = &vindir[next]; - if (vp->vd_flags & VRING_DESC_F_INDIRECT) { - cmn_err(CE_NOTE, "indirect desc" - " has INDIR flag\n"); - 
return (-1); - } - if (i > n_iov) - return (-1); - iov[i].iov_base = - viona_gpa2kva(link, vp->vd_addr); - if (iov[i].iov_base == NULL) { - cmn_err(CE_NOTE, "invalid guest" - " physical address 0x%"PRIx64"\n", - vp->vd_addr); - return (-1); - } - iov[i++].iov_len = vp->vd_len; - - if (i > VQ_MAX_DESCRIPTORS) - goto loopy; - if ((vp->vd_flags & VRING_DESC_F_NEXT) == 0) - break; - - next = vp->vd_next; - if (next >= nindir) { - cmn_err(CE_NOTE, "invalid next" - " %d > %d\n", next, nindir); - return (-1); - } - } - } - if ((vdir->vd_flags & VRING_DESC_F_NEXT) == 0) { - *cookie = head; - hq->hq_cur_aidx++; - return (i); - } - } - -loopy: - cmn_err(CE_NOTE, "%d > descriptor loop count\n", i); - - return (-1); -} - -static void -vq_pushchain(viona_vring_hqueue_t *hq, uint32_t len, uint16_t cookie) -{ - struct virtio_used *vu; - int uidx; - - uidx = *hq->hq_used_idx; - vu = &hq->hq_used_ring[uidx++ & (hq->hq_size - 1)]; - vu->vu_idx = cookie; - vu->vu_tlen = len; - membar_producer(); - *hq->hq_used_idx = uidx; -} - -static void -vq_pushchain_mrgrx(viona_vring_hqueue_t *hq, int num_bufs, used_elem_t *elem) -{ - struct virtio_used *vu; - int uidx; - int i; - - uidx = *hq->hq_used_idx; - if (num_bufs == 1) { - vu = &hq->hq_used_ring[uidx++ & (hq->hq_size - 1)]; - vu->vu_idx = elem[0].id; - vu->vu_tlen = elem[0].len; - } else { - for (i = 0; i < num_bufs; i++) { - vu = &hq->hq_used_ring[(uidx + i) & (hq->hq_size - 1)]; - vu->vu_idx = elem[i].id; - vu->vu_tlen = elem[i].len; - } - uidx = uidx + num_bufs; - } - membar_producer(); - *hq->hq_used_idx = uidx; -} - -/* - * Copy bytes from mp to iov. - * copied_buf: Total num_bytes copied from mblk to iov array. - * buf: pointer to iov_base. - * i: index of iov array. Mainly used to identify if we are - * dealing with first iov array element. - * rxhdr_size: Virtio header size. Two possibilities in case - * of MRGRX buf, header has 2 additional bytes. - * In case of mrgrx, virtio header should be part of iov[0]. - * In case of non-mrgrx, virtio header may or may not be part - * of iov[0]. - */ -static int -copy_in_mblk(mblk_t *mp, int copied_buf, caddr_t buf, struct iovec *iov, - int i, int rxhdr_size) -{ - int copied_chunk = 0; - mblk_t *ml; - int total_buf_len = iov->iov_len; - /* - * iov[0] might have header, adjust - * total_buf_len accordingly - */ - if (i == 0) { - total_buf_len = iov->iov_len - rxhdr_size; - } - for (ml = mp; ml != NULL; ml = ml->b_cont) { - size_t chunk = MBLKL(ml); - /* - * If chunk is less than - * copied_buf we should move - * to correct msgblk - */ - if (copied_buf != 0) { - if (copied_buf < chunk) { - chunk -= copied_buf; - } else { - copied_buf -= chunk; - continue; - } - } - /* - * iov[0] already has virtio header. - * and if copied chunk is length of iov_len break - */ - if (copied_chunk == total_buf_len) { - break; - } - /* - * Sometimes chunk is total mblk len, sometimes mblk is - * divided into multiple chunks. 
- */ - if (chunk > copied_buf) { - if (chunk > copied_chunk) { - if ((chunk + copied_chunk) > total_buf_len) - chunk = (size_t)total_buf_len - - copied_chunk; - } else { - if (chunk > (total_buf_len - copied_chunk)) - chunk = (size_t)((total_buf_len - - copied_chunk) - chunk); - } - bcopy(ml->b_rptr + copied_buf, buf, chunk); - } else { - if (chunk > (total_buf_len - copied_chunk)) { - chunk = (size_t)(total_buf_len - copied_chunk); - } - bcopy(ml->b_rptr + copied_buf, buf, chunk); - } - buf += chunk; - copied_chunk += chunk; - } - return (copied_chunk); -} - -static void -viona_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp, - boolean_t loopback) -{ - viona_link_t *link = arg; - viona_vring_hqueue_t *hq = &link->l_rx_vring; - mblk_t *mp0 = mp; - - while (viona_hq_num_avail(hq)) { - struct iovec iov[VTNET_MAXSEGS]; - size_t mblklen; - int n, i = 0; - uint16_t cookie; - struct virtio_net_hdr *vrx = NULL; - struct virtio_net_mrgrxhdr *vmrgrx = NULL; -#if notyet - mblk_t *ml; -#endif - caddr_t buf = NULL; - int total_len = 0; - int copied_buf = 0; - int num_bufs = 0; - int num_pops = 0; - used_elem_t uelem[VTNET_MAXSEGS]; - - if (mp == NULL) { - break; - } - mblklen = msgsize(mp); - if (mblklen == 0) { - break; - } - - mutex_enter(&hq->hq_a_mutex); - n = vq_popchain(link, hq, iov, VTNET_MAXSEGS, &cookie); - mutex_exit(&hq->hq_a_mutex); - if (n <= 0) { - break; - } - num_pops++; - if (link->l_features & VIRTIO_NET_F_MRG_RXBUF) { - int total_n = n; - int mrgrxhdr_size = sizeof (struct virtio_net_mrgrxhdr); - /* - * Get a pointer to the rx header, and use the - * data immediately following it for the packet buffer. - */ - vmrgrx = (struct virtio_net_mrgrxhdr *)iov[0].iov_base; - if (n == 1) { - buf = iov[0].iov_base + mrgrxhdr_size; - } - while (mblklen > copied_buf) { - if (total_n == i) { - mutex_enter(&hq->hq_a_mutex); - n = vq_popchain(link, hq, &iov[i], - VTNET_MAXSEGS, &cookie); - mutex_exit(&hq->hq_a_mutex); - if (n <= 0) { - freemsgchain(mp0); - return; - } - num_pops++; - total_n += n; - } - if (total_n > i) { - int copied_chunk = 0; - if (i != 0) { - buf = iov[i].iov_base; - } - copied_chunk = copy_in_mblk(mp, - copied_buf, buf, &iov[i], i, - mrgrxhdr_size); - copied_buf += copied_chunk; - uelem[i].id = cookie; - uelem[i].len = copied_chunk; - if (i == 0) { - uelem[i].len += mrgrxhdr_size; - } - } - num_bufs++; - i++; - } - } else { - boolean_t virt_hdr_incl_iov = B_FALSE; - int rxhdr_size = sizeof (struct virtio_net_hdr); - /* First element is header */ - vrx = (struct virtio_net_hdr *)iov[0].iov_base; - if (n == 1 || iov[0].iov_len > rxhdr_size) { - buf = iov[0].iov_base + rxhdr_size; - virt_hdr_incl_iov = B_TRUE; - total_len += rxhdr_size; - if (iov[0].iov_len < rxhdr_size) { - // Buff too small to fit pkt. Drop it. - freemsgchain(mp0); - return; - } - } else { - total_len = iov[0].iov_len; - } - if (iov[0].iov_len == rxhdr_size) - i++; - while (mblklen > copied_buf) { - if (n > i) { - int copied_chunk = 0; - if (i != 0) { - buf = iov[i].iov_base; - } - /* - * In case of non-mrgrx buf, first - * descriptor always has header and - * rest of the descriptors have data. - * But it is not guaranteed that first - * descriptor will only have virtio - * header. It might also have data. 
- */ - if (virt_hdr_incl_iov) { - copied_chunk = copy_in_mblk(mp, - copied_buf, buf, &iov[i], - i, rxhdr_size); - } else { - copied_chunk = copy_in_mblk(mp, - copied_buf, buf, &iov[i], - i, 0); - } - copied_buf += copied_chunk; - total_len += copied_chunk; - } else { - /* - * Drop packet as it cant fit - * in buf provided by guest. - */ - freemsgchain(mp0); - return; - } - i++; - } - } - /* - * The only valid field in the rx packet header is the - * number of buffers, which is always 1 without TSO - * support. - */ - if (link->l_features & VIRTIO_NET_F_MRG_RXBUF) { - memset(vmrgrx, 0, sizeof (struct virtio_net_mrgrxhdr)); - vmrgrx->vrh_bufs = num_bufs; - /* - * Make sure iov[0].iov_len >= MIN_BUF_SIZE - * otherwise guest will consider it as invalid frame. - */ - if (num_bufs == 1 && uelem[0].len < MIN_BUF_SIZE) { - uelem[0].len = MIN_BUF_SIZE; - } - /* - * Release this chain and handle more chains. - */ - mutex_enter(&hq->hq_u_mutex); - vq_pushchain_mrgrx(hq, num_pops, uelem); - mutex_exit(&hq->hq_u_mutex); - } else { - memset(vrx, 0, sizeof (struct virtio_net_hdr)); - if (total_len < MIN_BUF_SIZE) { - total_len = MIN_BUF_SIZE; - } - /* - * Release this chain and handle more chains. - */ - mutex_enter(&hq->hq_u_mutex); - vq_pushchain(hq, total_len, cookie); - mutex_exit(&hq->hq_u_mutex); - } - - mp = mp->b_next; - } - - if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) { - if (atomic_cas_uint(&link->l_rx_intr, 0, 1) == 0) { - pollwakeup(&link->l_pollhead, POLLIN); - } - } - - freemsgchain(mp0); -} - -static void -viona_desb_free(viona_desb_t *dp) -{ - viona_link_t *link; - viona_vring_hqueue_t *hq; -#if notyet - struct virtio_used *vu; - int uidx; -#endif - uint_t ref; - - ref = atomic_dec_uint_nv(&dp->d_ref); - if (ref != 0) - return; - - link = dp->d_link; - hq = &link->l_tx_vring; - - mutex_enter(&hq->hq_u_mutex); - vq_pushchain(hq, dp->d_len, dp->d_cookie); - mutex_exit(&hq->hq_u_mutex); - - kmem_cache_free(link->l_desb_kmc, dp); - - if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) { - if (atomic_cas_uint(&link->l_tx_intr, 0, 1) == 0) { - pollwakeup(&link->l_pollhead, POLLOUT); - } - } - if (copy_tx_mblks) { - mutex_enter(&link->l_tx_mutex); - if (--link->l_tx_outstanding == 0) { - cv_broadcast(&link->l_tx_cv); - } - mutex_exit(&link->l_tx_mutex); - } -} - -static void -viona_tx(viona_link_t *link, viona_vring_hqueue_t *hq) -{ - struct iovec iov[VTNET_MAXSEGS]; - uint16_t cookie; - int i, n; - mblk_t *mp_head, *mp_tail, *mp; - viona_desb_t *dp; - mac_client_handle_t link_mch = link->l_mch; - - mp_head = mp_tail = NULL; - - mutex_enter(&hq->hq_a_mutex); - n = vq_popchain(link, hq, iov, VTNET_MAXSEGS, &cookie); - mutex_exit(&hq->hq_a_mutex); - ASSERT(n != 0); - - dp = kmem_cache_alloc(link->l_desb_kmc, KM_SLEEP); - dp->d_frtn.free_func = viona_desb_free; - dp->d_frtn.free_arg = (void *)dp; - dp->d_link = link; - dp->d_cookie = cookie; - - dp->d_ref = 0; - dp->d_len = iov[0].iov_len; - - for (i = 1; i < n; i++) { - dp->d_ref++; - dp->d_len += iov[i].iov_len; - if (copy_tx_mblks) { - mp = desballoc((uchar_t *)iov[i].iov_base, - iov[i].iov_len, BPRI_MED, &dp->d_frtn); - ASSERT(mp); - } else { - mp = allocb(iov[i].iov_len, BPRI_MED); - ASSERT(mp); - bcopy((uchar_t *)iov[i].iov_base, mp->b_wptr, - iov[i].iov_len); - } - mp->b_wptr += iov[i].iov_len; - if (mp_head == NULL) { - ASSERT(mp_tail == NULL); - mp_head = mp; - } else { - ASSERT(mp_tail != NULL); - mp_tail->b_cont = mp; - } - mp_tail = mp; - } - if (copy_tx_mblks == B_FALSE) { - viona_desb_free(dp); - } - if 
(copy_tx_mblks) { - mutex_enter(&link->l_tx_mutex); - link->l_tx_outstanding++; - mutex_exit(&link->l_tx_mutex); - } - mac_tx(link_mch, mp_head, 0, MAC_DROP_ON_NO_DESC, NULL); -} diff --git a/usr/src/uts/i86pc/io/viona/viona.mapfile b/usr/src/uts/i86pc/io/viona/viona.mapfile new file mode 100644 index 0000000000..cece86348c --- /dev/null +++ b/usr/src/uts/i86pc/io/viona/viona.mapfile @@ -0,0 +1,41 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. +# + +# +# MAPFILE HEADER START +# +# WARNING: STOP NOW. DO NOT MODIFY THIS FILE. +# Object versioning must comply with the rules detailed in +# +# usr/src/lib/README.mapfiles +# +# You should not be making modifications here until you've read the most current +# copy of that file. If you need help, contact a gatekeeper for guidance. +# +# MAPFILE HEADER END +# + +$mapfile_version 2 + +SYMBOL_VERSION ILLUMOSprivate { + global: + # DDI Interfaces + _fini; + _init; + _info; + + local: + *; +}; diff --git a/usr/src/uts/i86pc/io/viona/viona_hook.c b/usr/src/uts/i86pc/io/viona/viona_hook.c new file mode 100644 index 0000000000..4520be04b0 --- /dev/null +++ b/usr/src/uts/i86pc/io/viona/viona_hook.c @@ -0,0 +1,438 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2019 Joyent, Inc. + */ + +#include +#include + +#include "viona_impl.h" + + +/* + * Global linked list of viona_neti_ts. Access is protected by viona_neti_lock + */ +static list_t viona_neti_list; +static kmutex_t viona_neti_lock; + +/* + * viona_neti is allocated and initialized during attach, and read-only + * until detach (where it's also freed) + */ +static net_instance_t *viona_neti; + + +/* + * Generate a hook event for the packet in *mpp headed in the direction + * indicated by 'out'. If the packet is accepted, 0 is returned. If the + * packet is rejected, an error is returned. The hook function may or may not + * alter or even free *mpp. The caller is expected to deal with either + * situation. + */ +int +viona_hook(viona_link_t *link, viona_vring_t *ring, mblk_t **mpp, boolean_t out) +{ + viona_neti_t *nip = link->l_neti; + viona_nethook_t *vnh = &nip->vni_nethook; + hook_pkt_event_t info; + hook_event_t he; + hook_event_token_t het; + int ret; + + he = out ? vnh->vnh_event_out : vnh->vnh_event_in; + het = out ? 
vnh->vnh_token_out : vnh->vnh_token_in; + + if (!he.he_interested) + return (0); + + info.hpe_protocol = vnh->vnh_neti; + info.hpe_ifp = (phy_if_t)link; + info.hpe_ofp = (phy_if_t)link; + info.hpe_mp = mpp; + info.hpe_flags = 0; + + ret = hook_run(vnh->vnh_neti->netd_hooks, het, (hook_data_t)&info); + if (ret == 0) + return (0); + + if (out) { + VIONA_PROBE3(tx_hook_drop, viona_vring_t *, ring, + mblk_t *, *mpp, int, ret); + VIONA_RING_STAT_INCR(ring, tx_hookdrop); + } else { + VIONA_PROBE3(rx_hook_drop, viona_vring_t *, ring, + mblk_t *, *mpp, int, ret); + VIONA_RING_STAT_INCR(ring, rx_hookdrop); + } + return (ret); +} + +/* + * netinfo stubs - required by the nethook framework, but otherwise unused + * + * Currently, all ipf rules are applied against all interfaces in a given + * netstack (e.g. all interfaces in a zone). In the future if we want to + * support being able to apply different rules to different interfaces, I + * believe we would need to implement some of these stubs to map an interface + * name in a rule (e.g. 'net0', back to an index or viona_link_t); + */ +static int +viona_neti_getifname(net_handle_t neti __unused, phy_if_t phy __unused, + char *buf __unused, const size_t len __unused) +{ + return (-1); +} + +static int +viona_neti_getmtu(net_handle_t neti __unused, phy_if_t phy __unused, + lif_if_t ifdata __unused) +{ + return (-1); +} + +static int +viona_neti_getptmue(net_handle_t neti __unused) +{ + return (-1); +} + +static int +viona_neti_getlifaddr(net_handle_t neti __unused, phy_if_t phy __unused, + lif_if_t ifdata __unused, size_t nelem __unused, + net_ifaddr_t type[] __unused, void *storage __unused) +{ + return (-1); +} + +static int +viona_neti_getlifzone(net_handle_t neti __unused, phy_if_t phy __unused, + lif_if_t ifdata __unused, zoneid_t *zid __unused) +{ + return (-1); +} + +static int +viona_neti_getlifflags(net_handle_t neti __unused, phy_if_t phy __unused, + lif_if_t ifdata __unused, uint64_t *flags __unused) +{ + return (-1); +} + +static phy_if_t +viona_neti_phygetnext(net_handle_t neti __unused, phy_if_t phy __unused) +{ + return ((phy_if_t)-1); +} + +static phy_if_t +viona_neti_phylookup(net_handle_t neti __unused, const char *name __unused) +{ + return ((phy_if_t)-1); +} + +static lif_if_t +viona_neti_lifgetnext(net_handle_t neti __unused, phy_if_t phy __unused, + lif_if_t ifdata __unused) +{ + return (-1); +} + +static int +viona_neti_inject(net_handle_t neti __unused, inject_t style __unused, + net_inject_t *packet __unused) +{ + return (-1); +} + +static phy_if_t +viona_neti_route(net_handle_t neti __unused, struct sockaddr *address __unused, + struct sockaddr *next __unused) +{ + return ((phy_if_t)-1); +} + +static int +viona_neti_ispchksum(net_handle_t neti __unused, mblk_t *mp __unused) +{ + return (-1); +} + +static int +viona_neti_isvchksum(net_handle_t neti __unused, mblk_t *mp __unused) +{ + return (-1); +} + +static net_protocol_t viona_netinfo = { + NETINFO_VERSION, + NHF_VIONA, + viona_neti_getifname, + viona_neti_getmtu, + viona_neti_getptmue, + viona_neti_getlifaddr, + viona_neti_getlifzone, + viona_neti_getlifflags, + viona_neti_phygetnext, + viona_neti_phylookup, + viona_neti_lifgetnext, + viona_neti_inject, + viona_neti_route, + viona_neti_ispchksum, + viona_neti_isvchksum +}; + +/* + * Create/register our nethooks + */ +static int +viona_nethook_init(netid_t nid, viona_nethook_t *vnh, char *nh_name, + net_protocol_t *netip) +{ + int ret; + + if ((vnh->vnh_neti = net_protocol_register(nid, netip)) == NULL) { + cmn_err(CE_NOTE, 
"%s: net_protocol_register failed " + "(netid=%d name=%s)", __func__, nid, nh_name); + goto fail_init_proto; + } + + HOOK_FAMILY_INIT(&vnh->vnh_family, nh_name); + if ((ret = net_family_register(vnh->vnh_neti, &vnh->vnh_family)) != 0) { + cmn_err(CE_NOTE, "%s: net_family_register failed " + "(netid=%d name=%s err=%d)", __func__, + nid, nh_name, ret); + goto fail_init_family; + } + + HOOK_EVENT_INIT(&vnh->vnh_event_in, NH_PHYSICAL_IN); + if ((vnh->vnh_token_in = net_event_register(vnh->vnh_neti, + &vnh->vnh_event_in)) == NULL) { + cmn_err(CE_NOTE, "%s: net_event_register %s failed " + "(netid=%d name=%s)", __func__, NH_PHYSICAL_IN, nid, + nh_name); + goto fail_init_event_in; + } + + HOOK_EVENT_INIT(&vnh->vnh_event_out, NH_PHYSICAL_OUT); + if ((vnh->vnh_token_out = net_event_register(vnh->vnh_neti, + &vnh->vnh_event_out)) == NULL) { + cmn_err(CE_NOTE, "%s: net_event_register %s failed " + "(netid=%d name=%s)", __func__, NH_PHYSICAL_OUT, nid, + nh_name); + goto fail_init_event_out; + } + return (0); + + /* + * On failure, we undo all the steps that succeeded in the + * reverse order of initialization, starting at the last + * successful step (the labels denoting the failing step). + */ +fail_init_event_out: + VERIFY0(net_event_shutdown(vnh->vnh_neti, &vnh->vnh_event_in)); + VERIFY0(net_event_unregister(vnh->vnh_neti, &vnh->vnh_event_in)); + vnh->vnh_token_in = NULL; + +fail_init_event_in: + VERIFY0(net_family_shutdown(vnh->vnh_neti, &vnh->vnh_family)); + VERIFY0(net_family_unregister(vnh->vnh_neti, &vnh->vnh_family)); + +fail_init_family: + VERIFY0(net_protocol_unregister(vnh->vnh_neti)); + vnh->vnh_neti = NULL; + +fail_init_proto: + return (1); +} + +/* + * Shutdown the nethooks for a protocol family. This triggers notification + * callbacks to anything that has registered interest to allow hook consumers + * to unhook prior to the removal of the hooks as well as makes them unavailable + * to any future consumers as the first step of removal. + */ +static void +viona_nethook_shutdown(viona_nethook_t *vnh) +{ + VERIFY0(net_event_shutdown(vnh->vnh_neti, &vnh->vnh_event_out)); + VERIFY0(net_event_shutdown(vnh->vnh_neti, &vnh->vnh_event_in)); + VERIFY0(net_family_shutdown(vnh->vnh_neti, &vnh->vnh_family)); +} + +/* + * Remove the nethooks for a protocol family. + */ +static void +viona_nethook_fini(viona_nethook_t *vnh) +{ + VERIFY0(net_event_unregister(vnh->vnh_neti, &vnh->vnh_event_out)); + VERIFY0(net_event_unregister(vnh->vnh_neti, &vnh->vnh_event_in)); + VERIFY0(net_family_unregister(vnh->vnh_neti, &vnh->vnh_family)); + VERIFY0(net_protocol_unregister(vnh->vnh_neti)); + vnh->vnh_neti = NULL; +} + +/* + * Callback invoked by the neti module. This creates/registers our hooks + * {IPv4,IPv6}{in,out} with the nethook framework so they are available to + * interested consumers (e.g. ipf). + * + * During attach, viona_neti_create is called once for every netstack + * present on the system at the time of attach. Thereafter, it is called + * during the creation of additional netstack instances (i.e. zone boot). As a + * result, the viona_neti_t that is created during this call always occurs + * prior to any viona instances that will use it to send hook events. + * + * It should never return NULL. If we cannot register our hooks, we do not + * set vnh_hooked of the respective protocol family, which will prevent the + * creation of any viona instances on this netstack (see viona_ioc_create). 
+ * This can only occur if after a shutdown event (which means destruction is + * imminent) we are trying to create a new instance. + */ +static void * +viona_neti_create(const netid_t netid) +{ + viona_neti_t *nip; + + VERIFY(netid != -1); + + nip = kmem_zalloc(sizeof (*nip), KM_SLEEP); + nip->vni_netid = netid; + nip->vni_zid = net_getzoneidbynetid(netid); + mutex_init(&nip->vni_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&nip->vni_dev_list, sizeof (viona_soft_state_t), + offsetof(viona_soft_state_t, ss_node)); + + if (viona_nethook_init(netid, &nip->vni_nethook, Hn_VIONA, + &viona_netinfo) == 0) + nip->vni_nethook.vnh_hooked = B_TRUE; + + mutex_enter(&viona_neti_lock); + list_insert_tail(&viona_neti_list, nip); + mutex_exit(&viona_neti_lock); + + return (nip); +} + +/* + * Called during netstack teardown by the neti module. During teardown, all + * the shutdown callbacks are invoked, allowing consumers to release any holds + * and otherwise quiesce themselves prior to destruction, followed by the + * actual destruction callbacks. + */ +static void +viona_neti_shutdown(netid_t nid, void *arg) +{ + viona_neti_t *nip = arg; + + ASSERT(nip != NULL); + VERIFY(nid == nip->vni_netid); + + mutex_enter(&viona_neti_lock); + list_remove(&viona_neti_list, nip); + mutex_exit(&viona_neti_lock); + + if (nip->vni_nethook.vnh_hooked) + viona_nethook_shutdown(&nip->vni_nethook); +} + +/* + * Called during netstack teardown by the neti module. Destroys the viona + * netinst data. This is invoked after all the netstack and neti shutdown + * callbacks have been invoked. + */ +static void +viona_neti_destroy(netid_t nid, void *arg) +{ + viona_neti_t *nip = arg; + + ASSERT(nip != NULL); + VERIFY(nid == nip->vni_netid); + + mutex_enter(&nip->vni_lock); + while (nip->vni_ref != 0) + cv_wait(&nip->vni_ref_change, &nip->vni_lock); + mutex_exit(&nip->vni_lock); + + VERIFY(!list_link_active(&nip->vni_node)); + + if (nip->vni_nethook.vnh_hooked) + viona_nethook_fini(&nip->vni_nethook); + + mutex_destroy(&nip->vni_lock); + list_destroy(&nip->vni_dev_list); + kmem_free(nip, sizeof (*nip)); +} + +/* + * Find the viona netinst data by zone id. This is only used during + * viona instance creation (and thus is only called by a zone that is running). 
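
The vni_ref/vni_ref_change pairing used by viona_neti_destroy() and viona_neti_rele() is a reference count joined to a condition variable: teardown blocks until the count drains to zero, and every release wakes the waiter. A user-level analogue of the same pattern, with pthreads standing in for kmutex/kcondvar (illustrative only):

	#include <pthread.h>

	typedef struct refobj {
		pthread_mutex_t ro_lock;
		pthread_cond_t ro_ref_change;
		unsigned int ro_ref;
	} refobj_t;

	/* Drop one reference and wake any thread blocked in refobj_destroy(). */
	static void
	refobj_rele(refobj_t *ro)
	{
		(void) pthread_mutex_lock(&ro->ro_lock);
		ro->ro_ref--;
		(void) pthread_mutex_unlock(&ro->ro_lock);
		(void) pthread_cond_broadcast(&ro->ro_ref_change);
	}

	/* Wait for all references to drain before tearing the object down. */
	static void
	refobj_destroy(refobj_t *ro)
	{
		(void) pthread_mutex_lock(&ro->ro_lock);
		while (ro->ro_ref != 0)
			(void) pthread_cond_wait(&ro->ro_ref_change,
			    &ro->ro_lock);
		(void) pthread_mutex_unlock(&ro->ro_lock);
		(void) pthread_mutex_destroy(&ro->ro_lock);
		(void) pthread_cond_destroy(&ro->ro_ref_change);
	}
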
+ */ +viona_neti_t * +viona_neti_lookup_by_zid(zoneid_t zid) +{ + viona_neti_t *nip; + + mutex_enter(&viona_neti_lock); + for (nip = list_head(&viona_neti_list); nip != NULL; + nip = list_next(&viona_neti_list, nip)) { + if (nip->vni_zid == zid) { + mutex_enter(&nip->vni_lock); + nip->vni_ref++; + mutex_exit(&nip->vni_lock); + mutex_exit(&viona_neti_lock); + return (nip); + } + } + mutex_exit(&viona_neti_lock); + return (NULL); +} + +void +viona_neti_rele(viona_neti_t *nip) +{ + mutex_enter(&nip->vni_lock); + VERIFY3S(nip->vni_ref, >, 0); + nip->vni_ref--; + mutex_exit(&nip->vni_lock); + cv_broadcast(&nip->vni_ref_change); +} + +void +viona_neti_attach(void) +{ + mutex_init(&viona_neti_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&viona_neti_list, sizeof (viona_neti_t), + offsetof(viona_neti_t, vni_node)); + + /* This can only fail if NETINFO_VERSION is wrong */ + viona_neti = net_instance_alloc(NETINFO_VERSION); + VERIFY(viona_neti != NULL); + + viona_neti->nin_name = "viona"; + viona_neti->nin_create = viona_neti_create; + viona_neti->nin_shutdown = viona_neti_shutdown; + viona_neti->nin_destroy = viona_neti_destroy; + /* This can only fail if we've registered ourselves multiple times */ + VERIFY3S(net_instance_register(viona_neti), ==, DDI_SUCCESS); +} + +void +viona_neti_detach(void) +{ + /* This can only fail if we've not registered previously */ + VERIFY3S(net_instance_unregister(viona_neti), ==, DDI_SUCCESS); + net_instance_free(viona_neti); + viona_neti = NULL; + + list_destroy(&viona_neti_list); + mutex_destroy(&viona_neti_lock); +} diff --git a/usr/src/uts/i86pc/io/viona/viona_impl.h b/usr/src/uts/i86pc/io/viona/viona_impl.h new file mode 100644 index 0000000000..5471b611a4 --- /dev/null +++ b/usr/src/uts/i86pc/io/viona/viona_impl.h @@ -0,0 +1,326 @@ +/* + * Copyright (c) 2013 Chris Torek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. + */ + +#ifndef _VIONA_IMPL_H +#define _VIONA_IMPL_H + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +struct viona_link; +typedef struct viona_link viona_link_t; +struct viona_desb; +typedef struct viona_desb viona_desb_t; +struct viona_net; +typedef struct viona_neti viona_neti_t; + +enum viona_ring_state { + VRS_RESET = 0x0, /* just allocated or reset */ + VRS_SETUP = 0x1, /* addrs setup and starting worker thread */ + VRS_INIT = 0x2, /* worker thread started & waiting to run */ + VRS_RUN = 0x3, /* running work routine */ + VRS_STOP = 0x4, /* worker is exiting */ +}; +enum viona_ring_state_flags { + VRSF_REQ_START = 0x1, /* start running from INIT state */ + VRSF_REQ_STOP = 0x2, /* stop running, clean up, goto RESET state */ + VRSF_RENEW = 0x4, /* ring renewing lease */ +}; + +typedef struct viona_vring { + viona_link_t *vr_link; + + kmutex_t vr_lock; + kcondvar_t vr_cv; + uint16_t vr_state; + uint16_t vr_state_flags; + uint_t vr_xfer_outstanding; + kthread_t *vr_worker_thread; + vmm_lease_t *vr_lease; + + /* ring-sized resources for TX activity */ + viona_desb_t *vr_txdesb; + struct iovec *vr_txiov; + + uint_t vr_intr_enabled; + uint64_t vr_msi_addr; + uint64_t vr_msi_msg; + + /* Internal ring-related state */ + kmutex_t vr_a_mutex; /* sync consumers of 'avail' */ + kmutex_t vr_u_mutex; /* sync consumers of 'used' */ + uint64_t vr_pa; + uint16_t vr_size; + uint16_t vr_mask; /* cached from vr_size */ + uint16_t vr_cur_aidx; /* trails behind 'avail_idx' */ + + /* Host-context pointers to the queue */ + volatile struct virtio_desc *vr_descr; + + volatile uint16_t *vr_avail_flags; + volatile uint16_t *vr_avail_idx; + volatile uint16_t *vr_avail_ring; + volatile uint16_t *vr_avail_used_event; + + volatile uint16_t *vr_used_flags; + volatile uint16_t *vr_used_idx; + volatile struct virtio_used *vr_used_ring; + volatile uint16_t *vr_used_avail_event; + + /* Per-ring error condition statistics */ + struct viona_ring_stats { + uint64_t rs_ndesc_too_high; + uint64_t rs_bad_idx; + uint64_t rs_indir_bad_len; + uint64_t rs_indir_bad_nest; + uint64_t rs_indir_bad_next; + uint64_t rs_no_space; + uint64_t rs_too_many_desc; + uint64_t rs_desc_bad_len; + + uint64_t rs_bad_ring_addr; + + uint64_t rs_fail_hcksum; + uint64_t rs_fail_hcksum6; + uint64_t rs_fail_hcksum_proto; + + uint64_t rs_bad_rx_frame; + uint64_t rs_rx_merge_overrun; + uint64_t rs_rx_merge_underrun; + uint64_t rs_rx_pad_short; + uint64_t rs_rx_mcast_check; + uint64_t rs_too_short; + uint64_t rs_tx_absent; + + uint64_t rs_rx_hookdrop; + uint64_t rs_tx_hookdrop; + } vr_stats; +} viona_vring_t; + +struct viona_link { + vmm_hold_t *l_vm_hold; + boolean_t l_destroyed; + + viona_vring_t l_vrings[VIONA_VQ_MAX]; + + uint32_t l_features; + uint32_t l_features_hw; + uint32_t l_cap_csum; + + uintptr_t l_notify_ioport; + void *l_notify_cookie; + + datalink_id_t l_linkid; + mac_handle_t l_mh; + mac_client_handle_t l_mch; + mac_promisc_handle_t l_mph; + + pollhead_t l_pollhead; + + viona_neti_t *l_neti; +}; + +typedef struct viona_nethook { + net_handle_t vnh_neti; + hook_family_t vnh_family; + hook_event_t vnh_event_in; + hook_event_t vnh_event_out; + hook_event_token_t vnh_token_in; + hook_event_token_t vnh_token_out; + boolean_t vnh_hooked; +} viona_nethook_t; + +struct 
viona_neti { + list_node_t vni_node; + + netid_t vni_netid; + zoneid_t vni_zid; + + viona_nethook_t vni_nethook; + + kmutex_t vni_lock; /* Protects remaining members */ + kcondvar_t vni_ref_change; /* Protected by vni_lock */ + uint_t vni_ref; /* Protected by vni_lock */ + list_t vni_dev_list; /* Protected by vni_lock */ +}; + +typedef struct used_elem { + uint16_t id; + uint32_t len; +} used_elem_t; + +typedef struct viona_soft_state { + kmutex_t ss_lock; + viona_link_t *ss_link; + list_node_t ss_node; +} viona_soft_state_t; + +#pragma pack(1) +struct virtio_desc { + uint64_t vd_addr; + uint32_t vd_len; + uint16_t vd_flags; + uint16_t vd_next; +}; + +struct virtio_used { + uint32_t vu_idx; + uint32_t vu_tlen; +}; + +struct virtio_net_mrgrxhdr { + uint8_t vrh_flags; + uint8_t vrh_gso_type; + uint16_t vrh_hdr_len; + uint16_t vrh_gso_size; + uint16_t vrh_csum_start; + uint16_t vrh_csum_offset; + uint16_t vrh_bufs; +}; + +struct virtio_net_hdr { + uint8_t vrh_flags; + uint8_t vrh_gso_type; + uint16_t vrh_hdr_len; + uint16_t vrh_gso_size; + uint16_t vrh_csum_start; + uint16_t vrh_csum_offset; +}; +#pragma pack() + +#define VRING_NEED_BAIL(ring, proc) \ + (((ring)->vr_state_flags & VRSF_REQ_STOP) != 0 || \ + ((proc)->p_flag & SEXITING) != 0) + + +#define VNETHOOK_INTERESTED_IN(neti) \ + (neti)->vni_nethook.vnh_event_in.he_interested +#define VNETHOOK_INTERESTED_OUT(neti) \ + (neti)->vni_nethook.vnh_event_out.he_interested + + +#define VIONA_PROBE(name) DTRACE_PROBE(viona__##name) +#define VIONA_PROBE1(name, arg1, arg2) \ + DTRACE_PROBE1(viona__##name, arg1, arg2) +#define VIONA_PROBE2(name, arg1, arg2, arg3, arg4) \ + DTRACE_PROBE2(viona__##name, arg1, arg2, arg3, arg4) +#define VIONA_PROBE3(name, arg1, arg2, arg3, arg4, arg5, arg6) \ + DTRACE_PROBE3(viona__##name, arg1, arg2, arg3, arg4, arg5, arg6) +#define VIONA_PROBE5(name, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, \ + arg9, arg10) \ + DTRACE_PROBE5(viona__##name, arg1, arg2, arg3, arg4, arg5, arg6, arg7, \ + arg8, arg9, arg10) +#define VIONA_PROBE_BAD_RING_ADDR(r, a) \ + VIONA_PROBE2(bad_ring_addr, viona_vring_t *, r, void *, (void *)(a)) + +#define VIONA_RING_STAT_INCR(r, name) \ + (((r)->vr_stats.rs_ ## name)++) + + +#define VIONA_MAX_HDRS_LEN (sizeof (struct ether_vlan_header) + \ + IP_MAX_HDR_LENGTH + TCP_MAX_HDR_LENGTH) + +#define VRING_AVAIL_F_NO_INTERRUPT 1 +#define VRING_USED_F_NO_NOTIFY 1 + +#define VRING_DESC_F_NEXT (1 << 0) +#define VRING_DESC_F_WRITE (1 << 1) +#define VRING_DESC_F_INDIRECT (1 << 2) + +#define VIRTIO_NET_HDR_F_NEEDS_CSUM (1 << 0) +#define VIRTIO_NET_HDR_F_DATA_VALID (1 << 1) + +#define VIRTIO_NET_HDR_GSO_NONE 0 +#define VIRTIO_NET_HDR_GSO_TCPV4 1 + +#define VIRTIO_NET_F_CSUM (1 << 0) +#define VIRTIO_NET_F_GUEST_CSUM (1 << 1) +#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */ +#define VIRTIO_NET_F_GUEST_TSO4 (1 << 7) /* guest can accept TSO */ +#define VIRTIO_NET_F_HOST_TSO4 (1 << 11) /* host can accept TSO */ +#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX bufs */ +#define VIRTIO_NET_F_STATUS (1 << 16) /* cfg status field present */ +#define VIRTIO_F_RING_NOTIFY_ON_EMPTY (1 << 24) +#define VIRTIO_F_RING_INDIRECT_DESC (1 << 28) +#define VIRTIO_F_RING_EVENT_IDX (1 << 29) + + +void viona_ring_alloc(viona_link_t *, viona_vring_t *); +void viona_ring_free(viona_vring_t *); +int viona_ring_reset(viona_vring_t *, boolean_t); +int viona_ring_init(viona_link_t *, uint16_t, uint16_t, uint64_t); +boolean_t viona_ring_lease_renew(viona_vring_t *); +int vq_popchain(viona_vring_t *, struct iovec 
*, uint_t, uint16_t *); +void vq_pushchain(viona_vring_t *, uint32_t, uint16_t); +void vq_pushchain_many(viona_vring_t *, uint_t, used_elem_t *); +void viona_intr_ring(viona_vring_t *ring); + +void viona_rx_init(void); +void viona_rx_fini(void); +int viona_rx_set(viona_link_t *); +void viona_rx_clear(viona_link_t *); +void viona_worker_rx(viona_vring_t *, viona_link_t *); + +extern kmutex_t viona_force_copy_lock; +void viona_worker_tx(viona_vring_t *, viona_link_t *); +void viona_tx_ring_alloc(viona_vring_t *, const uint16_t); +void viona_tx_ring_free(viona_vring_t *, const uint16_t); + +void viona_neti_attach(void); +void viona_neti_detach(void); +viona_neti_t *viona_neti_lookup_by_zid(zoneid_t); +void viona_neti_rele(viona_neti_t *); +int viona_hook(viona_link_t *, viona_vring_t *, mblk_t **, boolean_t); + +#endif /* _VIONA_IMPL_H */ diff --git a/usr/src/uts/i86pc/io/viona/viona_main.c b/usr/src/uts/i86pc/io/viona/viona_main.c new file mode 100644 index 0000000000..f51a1f9b12 --- /dev/null +++ b/usr/src/uts/i86pc/io/viona/viona_main.c @@ -0,0 +1,991 @@ +/* + * Copyright (c) 2013 Chris Torek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. + */ + +/* + * viona - VirtIO-Net, Accelerated + * + * The purpose of viona is to provide high performance virtio-net devices to + * bhyve guests. It does so by sitting directly atop MAC, skipping all of the + * DLS/DLD stack. + * + * -------------------- + * General Architecture + * -------------------- + * + * A single viona instance is comprised of a "link" handle and two "rings". + * After opening the viona device, it must be associated with a MAC network + * interface and a bhyve (vmm) instance to form its link resource. 
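
A hypothetical userspace sketch of that association, assuming the control node is linked at /dev/viona and that the caller already holds a datalink ID and an open vmm fd (the ioctl itself is described in the text that follows):

	#include <sys/types.h>
	#include <sys/ioctl.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/viona_io.h>	/* assumed home of VNA_IOC_* and vioc_create_t */

	static int
	viona_link_create(uint32_t linkid, int vmfd)
	{
		vioc_create_t vc;
		int fd;

		if ((fd = open("/dev/viona", O_RDWR)) < 0)
			return (-1);

		vc.c_linkid = linkid;	/* datalink_id_t of the MAC interface */
		vc.c_vmfd = vmfd;	/* open fd to the vmm instance */
		if (ioctl(fd, VNA_IOC_CREATE, &vc) != 0) {
			(void) close(fd);
			return (-1);
		}
		return (fd);	/* this fd now represents the link */
	}
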
This is
+ * done with the VNA_IOC_CREATE ioctl, where the datalink ID and vmm fd are
+ * passed in to perform the initialization. With the MAC client opened, and a
+ * driver handle to the vmm instance established, the device is ready to be
+ * configured by the guest.
+ *
+ * The userspace portion of bhyve, which interfaces with the PCI device
+ * emulation framework, is meant to stay out of the datapath if at all
+ * possible. Configuration changes made via PCI are mapped to actions which
+ * will steer the operation of the in-kernel logic.
+ *
+ *
+ * -----------
+ * Ring Basics
+ * -----------
+ *
+ * Each viona link has two viona_vring_t entities, RX and TX, for handling data
+ * transfers to and from the guest. They represent an interface to the
+ * standard virtio ring structures. When initialized and active, each ring is
+ * backed by a kernel worker thread (parented to the bhyve process for the
+ * instance) which handles ring events. The RX worker has the simple task of
+ * watching for ring shutdown conditions. The TX worker does that in addition
+ * to processing all requests to transmit data. Data destined for the guest is
+ * delivered directly by MAC to viona_rx() when the ring is active.
+ *
+ *
+ * -----------
+ * Ring States
+ * -----------
+ *
+ * The viona_vring_t instances follow a simple path through the possible state
+ * values represented in viona_vring_t`vr_state:
+ *
+ * +<--------------------------------------------+
+ * | |
+ * V ^
+ * +-----------+ This is the initial state when a link is created or
+ * | VRS_RESET | when the ring has been explicitly reset.
+ * +-----------+
+ * | ^
+ * |---* ioctl(VNA_IOC_RING_INIT) issued |
+ * | |
+ * | ^
+ * V
+ * +-----------+ The ring parameters (size, guest physical addresses)
+ * | VRS_SETUP | have been set and start-up of the ring worker thread
+ * +-----------+ has begun.
+ * | ^
+ * | |
+ * |---* ring worker thread begins execution |
+ * | |
+ * +-------------------------------------------->+
+ * | | ^
+ * | |
+ * | * If ring shutdown is requested (by ioctl or impending
+ * | bhyve process death) while the worker thread is
+ * | starting, the worker will transition the ring to
+ * | VRS_RESET and exit.
+ * | ^
+ * | |
+ * | ^
+ * V
+ * +-----------+ The worker thread associated with the ring has started
+ * | VRS_INIT | executing. It has allocated any extra resources needed
+ * +-----------+ for the ring to operate.
+ * | ^
+ * | |
+ * +-------------------------------------------->+
+ * | | ^
+ * | |
+ * | * If ring shutdown is requested while the worker is
+ * | waiting in VRS_INIT, it will free any extra resources
+ * | and transition to VRS_RESET.
+ * | ^
+ * | |
+ * |--* ioctl(VNA_IOC_RING_KICK) issued |
+ * | ^
+ * V
+ * +-----------+ The worker thread associated with the ring is executing
+ * | VRS_RUN | workload specific to that ring.
+ * +-----------+
+ * | ^
+ * |---* ioctl(VNA_IOC_RING_RESET) issued |
+ * | (or bhyve process begins exit) ^
+ * |
+ * +-----------+ The worker thread associated with the ring is in the
+ * | VRS_STOP | process of exiting. All outstanding TX and RX
+ * +-----------+ requests are allowed to complete, but new requests
+ * | must be ignored.
+ * | ^
+ * | |
+ * +-------------------------------------------->+
+ *
+ *
+ * While the worker thread is not running, changes to vr_state are only made by
+ * viona_ioc_ring_init() under vr_lock. There, it initializes the ring, starts
+ * the worker, and sets the ring state to VRS_SETUP.
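
Driving those transitions from a consumer might look like the following (hypothetical sketch; VNA_IOC_RING_INIT moves a ring out of VRS_RESET, and VNA_IOC_RING_KICK requests the move toward VRS_RUN):

	#include <sys/ioctl.h>
	#include <stdint.h>
	#include <sys/viona_io.h>	/* assumed: VNA_IOC_RING_* and vioc_ring_init_t */

	static int
	ring_start(int vionafd, uint16_t idx, uint16_t qsize, uint64_t qaddr)
	{
		vioc_ring_init_t ri;

		ri.ri_index = idx;	/* VIONA_VQ_RX or VIONA_VQ_TX */
		ri.ri_qsize = qsize;	/* power-of-two ring size from the guest */
		ri.ri_qaddr = qaddr;	/* guest-physical base of the ring */

		/* VRS_RESET -> VRS_SETUP: worker thread is created and started */
		if (ioctl(vionafd, VNA_IOC_RING_INIT, &ri) != 0)
			return (-1);

		/* Request VRS_RUN; an early kick during VRS_SETUP is also fine */
		return (ioctl(vionafd, VNA_IOC_RING_KICK, (int)idx));
	}
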
Once the worker thread
+ * has been started, only it may perform ring state transitions (still under
+ * the protection of vr_lock), when requested by outside consumers via
+ * vr_state_flags or when the containing bhyve process initiates an exit.
+ *
+ *
+ * ----------------------------
+ * Transmission mblk_t Handling
+ * ----------------------------
+ *
+ * For incoming frames destined for a bhyve guest, the data must first land in
+ * a host OS buffer from the physical NIC before it is copied into the awaiting
+ * guest buffer(s). Outbound frames transmitted by the guest are not bound by
+ * this limitation and can avoid extra copying before the buffers are accessed
+ * directly by the NIC. When a guest designates buffers to be transmitted,
+ * viona translates the guest-physical addresses contained in the ring
+ * descriptors to host-virtual addresses via vmm_drv_gpa2kva(). That pointer is
+ * wrapped in an mblk_t using a preallocated viona_desb_t for the desballoc().
+ * Doing so increments vr_xfer_outstanding, preventing the ring from being
+ * reset (allowing the link to drop its vmm handle to the guest) until all
+ * transmit mblks referencing guest memory have been processed. Allocation of
+ * the viona_desb_t entries is done during the VRS_INIT stage of the ring
+ * worker thread. The ring size informs that allocation as the number of
+ * concurrent transmissions is limited by the number of descriptors in the
+ * ring. This minimizes allocation in the transmit hot-path by acquiring those
+ * fixed-size resources during initialization.
+ *
+ * This optimization depends on the underlying NIC driver freeing the mblks in
+ * a timely manner after they have been transmitted by the hardware. Some
+ * drivers have been found to flush TX descriptors only when new transmissions
+ * are initiated. This means that there is no upper bound to the time needed
+ * for an mblk to be flushed, which can stall bhyve guests from shutting down
+ * since their memory must be free of viona TX references prior to clean-up.
+ *
+ * This expectation of deterministic mblk_t processing is likely the reason
+ * behind the notable exception to the zero-copy TX path: systems with 'bnxe'
+ * loaded will copy transmit data into fresh buffers rather than passing up
+ * zero-copy mblks. It is a hold-over from the original viona sources provided
+ * by Pluribus and its continued necessity has not been confirmed.
+ *
+ *
+ * ----------------------------
+ * Ring Notification Fast-paths
+ * ----------------------------
+ *
+ * Device operation for viona requires that notifications flow to and from the
+ * guest to indicate certain ring conditions. In order to minimize latency and
+ * processing overhead, the notification procedures are kept in-kernel whenever
+ * possible.
+ *
+ * Guest-to-host notifications, when new available descriptors have been placed
+ * in the ring, are posted via the 'queue notify' address in the virtio BAR.
+ * The vmm_drv_ioport_hook() interface was added to bhyve, allowing viona to
+ * install a callback hook on an ioport address. Guest exits for accesses to
+ * viona-hooked ioport addresses will result in direct calls to notify the
+ * appropriate ring worker without a trip to userland.
+ *
+ * Host-to-guest notifications in the form of interrupts enjoy similar
+ * acceleration. Each viona ring can be configured to send MSI notifications
+ * to the guest as virtio conditions dictate.
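
Configuring that per-ring MSI information from the emulator side might look like this (hypothetical sketch built on the vioc_ring_msi_t ioctl appearing later in this patch):

	#include <sys/ioctl.h>
	#include <stdint.h>
	#include <sys/viona_io.h>	/* assumed: VNA_IOC_RING_SET_MSI, vioc_ring_msi_t */

	static int
	ring_set_msi(int vionafd, uint16_t idx, uint64_t addr, uint64_t msg)
	{
		vioc_ring_msi_t rm;

		rm.rm_index = idx;	/* which ring to configure */
		rm.rm_addr = addr;	/* MSI address from the guest MSI-X table */
		rm.rm_msg = msg;	/* MSI payload to deliver */

		return (ioctl(vionafd, VNA_IOC_RING_SET_MSI, &rm));
	}
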
This in-kernel interrupt + * configuration is kept synchronized through viona ioctls which are utilized + * during writes to the associated PCI config registers or MSI-X BAR. + * + * Guests which do not utilize MSI-X will result in viona falling back to the + * slow path for interrupts. It will poll(2) the viona handle, receiving + * notification when ring events necessitate the assertion of an interrupt. + * + * + * --------------- + * Nethook Support + * --------------- + * + * Viona provides four nethook events that consumers (e.g. ipf) can hook into + * to intercept packets as they go up or down the stack. Unfortunately, + * the nethook framework does not understand raw packets, so we can only + * generate events (in, out) for IPv4 and IPv6 packets. At driver attach, + * we register callbacks with the neti (netinfo) module that will be invoked + * for each netstack already present, as well as for any additional netstack + * instances created as the system operates. These callbacks will + * register/unregister the hooks with the nethook framework for each + * netstack instance. This registration occurs prior to creating any + * viona instances for a given netstack, and the unregistration for a netstack + * instance occurs after all viona instances of the netstack instance have + * been deleted. + */ + +#include +#include +#include + +#include + +#include "viona_impl.h" + + +#define VIONA_NAME "Virtio Network Accelerator" +#define VIONA_CTL_MINOR 0 +#define VIONA_CLI_NAME "viona" /* MAC client name */ + + +/* + * Host capabilities. + */ +#define VIONA_S_HOSTCAPS ( \ + VIRTIO_NET_F_GUEST_CSUM | \ + VIRTIO_NET_F_MAC | \ + VIRTIO_NET_F_GUEST_TSO4 | \ + VIRTIO_NET_F_MRG_RXBUF | \ + VIRTIO_NET_F_STATUS | \ + VIRTIO_F_RING_NOTIFY_ON_EMPTY | \ + VIRTIO_F_RING_INDIRECT_DESC) + +/* MAC_CAPAB_HCKSUM specifics of interest */ +#define VIONA_CAP_HCKSUM_INTEREST \ + (HCKSUM_INET_PARTIAL | \ + HCKSUM_INET_FULL_V4 | \ + HCKSUM_INET_FULL_V6) + +static void *viona_state; +static dev_info_t *viona_dip; +static id_space_t *viona_minors; + + +static int viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, + void **result); +static int viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); +static int viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); +static int viona_open(dev_t *devp, int flag, int otype, cred_t *credp); +static int viona_close(dev_t dev, int flag, int otype, cred_t *credp); +static int viona_ioctl(dev_t dev, int cmd, intptr_t data, int mode, + cred_t *credp, int *rval); +static int viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp); + +static int viona_ioc_create(viona_soft_state_t *, void *, int, cred_t *); +static int viona_ioc_delete(viona_soft_state_t *, boolean_t); + +static int viona_ioc_set_notify_ioport(viona_link_t *, uint_t); +static int viona_ioc_ring_init(viona_link_t *, void *, int); +static int viona_ioc_ring_reset(viona_link_t *, uint_t); +static int viona_ioc_ring_kick(viona_link_t *, uint_t); +static int viona_ioc_ring_set_msi(viona_link_t *, void *, int); +static int viona_ioc_ring_intr_clear(viona_link_t *, uint_t); +static int viona_ioc_intr_poll(viona_link_t *, void *, int, int *); + +static struct cb_ops viona_cb_ops = { + viona_open, + viona_close, + nodev, + nodev, + nodev, + nodev, + nodev, + viona_ioctl, + nodev, + nodev, + nodev, + viona_chpoll, + ddi_prop_op, + 0, + D_MP | D_NEW | D_HOTPLUG, + CB_REV, + nodev, + nodev +}; + +static struct dev_ops viona_ops = { + DEVO_REV, + 0, + viona_info, + nulldev, + nulldev, + 
viona_attach, + viona_detach, + nodev, + &viona_cb_ops, + NULL, + ddi_power, + ddi_quiesce_not_needed +}; + +static struct modldrv modldrv = { + &mod_driverops, + VIONA_NAME, + &viona_ops, +}; + +static struct modlinkage modlinkage = { + MODREV_1, &modldrv, NULL +}; + +int +_init(void) +{ + int ret; + + ret = ddi_soft_state_init(&viona_state, sizeof (viona_soft_state_t), 0); + if (ret != 0) { + return (ret); + } + + viona_minors = id_space_create("viona_minors", + VIONA_CTL_MINOR + 1, UINT16_MAX); + viona_rx_init(); + mutex_init(&viona_force_copy_lock, NULL, MUTEX_DRIVER, NULL); + + ret = mod_install(&modlinkage); + if (ret != 0) { + ddi_soft_state_fini(&viona_state); + id_space_destroy(viona_minors); + viona_rx_fini(); + mutex_destroy(&viona_force_copy_lock); + } + + return (ret); +} + +int +_fini(void) +{ + int ret; + + ret = mod_remove(&modlinkage); + if (ret != 0) { + return (ret); + } + + ddi_soft_state_fini(&viona_state); + id_space_destroy(viona_minors); + viona_rx_fini(); + mutex_destroy(&viona_force_copy_lock); + + return (ret); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +/* ARGSUSED */ +static int +viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) +{ + int error; + + switch (cmd) { + case DDI_INFO_DEVT2DEVINFO: + *result = (void *)viona_dip; + error = DDI_SUCCESS; + break; + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)0; + error = DDI_SUCCESS; + break; + default: + error = DDI_FAILURE; + break; + } + return (error); +} + +static int +viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + if (cmd != DDI_ATTACH) { + return (DDI_FAILURE); + } + + if (ddi_create_minor_node(dip, "viona", S_IFCHR, VIONA_CTL_MINOR, + DDI_PSEUDO, 0) != DDI_SUCCESS) { + return (DDI_FAILURE); + } + + viona_neti_attach(); + + viona_dip = dip; + ddi_report_dev(viona_dip); + + return (DDI_SUCCESS); +} + +static int +viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + dev_info_t *old_dip = viona_dip; + + if (cmd != DDI_DETACH) { + return (DDI_FAILURE); + } + + VERIFY(old_dip != NULL); + + viona_neti_detach(); + viona_dip = NULL; + ddi_remove_minor_node(old_dip, NULL); + + return (DDI_SUCCESS); +} + +static int +viona_open(dev_t *devp, int flag, int otype, cred_t *credp) +{ + int minor; + viona_soft_state_t *ss; + + if (otype != OTYP_CHR) { + return (EINVAL); + } +#if 0 + /* + * XXX-mg: drv_priv() is wrong, but I'm not sure what is right. + * Should the check be at open() or ioctl()? 
+ */ + if (drv_priv(credp) != 0) { + return (EPERM); + } +#endif + if (getminor(*devp) != VIONA_CTL_MINOR) { + return (ENXIO); + } + + minor = id_alloc_nosleep(viona_minors); + if (minor == -1) { + /* All minors are busy */ + return (EBUSY); + } + if (ddi_soft_state_zalloc(viona_state, minor) != DDI_SUCCESS) { + id_free(viona_minors, minor); + return (ENOMEM); + } + + ss = ddi_get_soft_state(viona_state, minor); + mutex_init(&ss->ss_lock, NULL, MUTEX_DEFAULT, NULL); + *devp = makedevice(getmajor(*devp), minor); + + return (0); +} + +static int +viona_close(dev_t dev, int flag, int otype, cred_t *credp) +{ + int minor; + viona_soft_state_t *ss; + + if (otype != OTYP_CHR) { + return (EINVAL); + } + + minor = getminor(dev); + + ss = ddi_get_soft_state(viona_state, minor); + if (ss == NULL) { + return (ENXIO); + } + + VERIFY0(viona_ioc_delete(ss, B_TRUE)); + VERIFY(!list_link_active(&ss->ss_node)); + ddi_soft_state_free(viona_state, minor); + id_free(viona_minors, minor); + + return (0); +} + +static int +viona_ioctl(dev_t dev, int cmd, intptr_t data, int md, cred_t *cr, int *rv) +{ + viona_soft_state_t *ss; + void *dptr = (void *)data; + int err = 0, val; + viona_link_t *link; + + ss = ddi_get_soft_state(viona_state, getminor(dev)); + if (ss == NULL) { + return (ENXIO); + } + + switch (cmd) { + case VNA_IOC_CREATE: + return (viona_ioc_create(ss, dptr, md, cr)); + case VNA_IOC_DELETE: + return (viona_ioc_delete(ss, B_FALSE)); + default: + break; + } + + mutex_enter(&ss->ss_lock); + if ((link = ss->ss_link) == NULL || link->l_destroyed || + vmm_drv_release_reqd(link->l_vm_hold)) { + mutex_exit(&ss->ss_lock); + return (ENXIO); + } + + switch (cmd) { + case VNA_IOC_GET_FEATURES: + val = VIONA_S_HOSTCAPS | link->l_features_hw; + if (ddi_copyout(&val, dptr, sizeof (val), md) != 0) { + err = EFAULT; + } + break; + case VNA_IOC_SET_FEATURES: + if (ddi_copyin(dptr, &val, sizeof (val), md) != 0) { + err = EFAULT; + break; + } + val &= (VIONA_S_HOSTCAPS | link->l_features_hw); + + if ((val & VIRTIO_NET_F_CSUM) == 0) + val &= ~VIRTIO_NET_F_HOST_TSO4; + + if ((val & VIRTIO_NET_F_GUEST_CSUM) == 0) + val &= ~VIRTIO_NET_F_GUEST_TSO4; + + link->l_features = val; + break; + case VNA_IOC_RING_INIT: + err = viona_ioc_ring_init(link, dptr, md); + break; + case VNA_IOC_RING_RESET: + err = viona_ioc_ring_reset(link, (uint_t)data); + break; + case VNA_IOC_RING_KICK: + err = viona_ioc_ring_kick(link, (uint_t)data); + break; + case VNA_IOC_RING_SET_MSI: + err = viona_ioc_ring_set_msi(link, dptr, md); + break; + case VNA_IOC_RING_INTR_CLR: + err = viona_ioc_ring_intr_clear(link, (uint_t)data); + break; + case VNA_IOC_INTR_POLL: + err = viona_ioc_intr_poll(link, dptr, md, rv); + break; + case VNA_IOC_SET_NOTIFY_IOP: + err = viona_ioc_set_notify_ioport(link, (uint_t)data); + break; + default: + err = ENOTTY; + break; + } + + mutex_exit(&ss->ss_lock); + return (err); +} + +static int +viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp) +{ + viona_soft_state_t *ss; + viona_link_t *link; + + ss = ddi_get_soft_state(viona_state, getminor(dev)); + if (ss == NULL) { + return (ENXIO); + } + + mutex_enter(&ss->ss_lock); + if ((link = ss->ss_link) == NULL || link->l_destroyed) { + mutex_exit(&ss->ss_lock); + return (ENXIO); + } + + *reventsp = 0; + if ((events & POLLRDBAND) != 0) { + for (uint_t i = 0; i < VIONA_VQ_MAX; i++) { + if (link->l_vrings[i].vr_intr_enabled != 0) { + *reventsp |= POLLRDBAND; + break; + } + } + } + if ((*reventsp == 0 && !anyyet) || (events & POLLET)) { + *phpp = 
&link->l_pollhead; + } + mutex_exit(&ss->ss_lock); + + return (0); +} + +static void +viona_get_mac_capab(viona_link_t *link) +{ + mac_handle_t mh = link->l_mh; + uint32_t cap = 0; + mac_capab_lso_t lso_cap; + + link->l_features_hw = 0; + if (mac_capab_get(mh, MAC_CAPAB_HCKSUM, &cap)) { + /* + * Only report HW checksum ability if the underlying MAC + * resource is capable of populating the L4 header. + */ + if ((cap & VIONA_CAP_HCKSUM_INTEREST) != 0) { + link->l_features_hw |= VIRTIO_NET_F_CSUM; + } + link->l_cap_csum = cap; + } + + if ((link->l_features_hw & VIRTIO_NET_F_CSUM) && + mac_capab_get(mh, MAC_CAPAB_LSO, &lso_cap)) { + /* + * Virtio doesn't allow for negotiating a maximum LSO + * packet size. We have to assume that the guest may + * send a maximum length IP packet. Make sure the + * underlying MAC can handle an LSO of this size. + */ + if ((lso_cap.lso_flags & LSO_TX_BASIC_TCP_IPV4) && + lso_cap.lso_basic_tcp_ipv4.lso_max >= IP_MAXPACKET) + link->l_features_hw |= VIRTIO_NET_F_HOST_TSO4; + } +} + +static int +viona_ioc_create(viona_soft_state_t *ss, void *dptr, int md, cred_t *cr) +{ + vioc_create_t kvc; + viona_link_t *link = NULL; + char cli_name[MAXNAMELEN]; + int err = 0; + file_t *fp; + vmm_hold_t *hold = NULL; + viona_neti_t *nip = NULL; + zoneid_t zid; + + ASSERT(MUTEX_NOT_HELD(&ss->ss_lock)); + + if (ddi_copyin(dptr, &kvc, sizeof (kvc), md) != 0) { + return (EFAULT); + } + + zid = crgetzoneid(cr); + nip = viona_neti_lookup_by_zid(zid); + if (nip == NULL) { + return (EIO); + } + + if (!nip->vni_nethook.vnh_hooked) { + viona_neti_rele(nip); + return (EIO); + } + + mutex_enter(&ss->ss_lock); + if (ss->ss_link != NULL) { + mutex_exit(&ss->ss_lock); + viona_neti_rele(nip); + return (EEXIST); + } + + if ((fp = getf(kvc.c_vmfd)) == NULL) { + err = EBADF; + goto bail; + } + err = vmm_drv_hold(fp, cr, &hold); + releasef(kvc.c_vmfd); + if (err != 0) { + goto bail; + } + + link = kmem_zalloc(sizeof (viona_link_t), KM_SLEEP); + link->l_linkid = kvc.c_linkid; + link->l_vm_hold = hold; + + err = mac_open_by_linkid(link->l_linkid, &link->l_mh); + if (err != 0) { + goto bail; + } + + viona_get_mac_capab(link); + + (void) snprintf(cli_name, sizeof (cli_name), "%s-%d", VIONA_CLI_NAME, + link->l_linkid); + err = mac_client_open(link->l_mh, &link->l_mch, cli_name, 0); + if (err != 0) { + goto bail; + } + + viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_RX]); + viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_TX]); + + if ((err = viona_rx_set(link)) != 0) { + viona_ring_free(&link->l_vrings[VIONA_VQ_RX]); + viona_ring_free(&link->l_vrings[VIONA_VQ_TX]); + goto bail; + } + + link->l_neti = nip; + ss->ss_link = link; + mutex_exit(&ss->ss_lock); + + mutex_enter(&nip->vni_lock); + list_insert_tail(&nip->vni_dev_list, ss); + mutex_exit(&nip->vni_lock); + + return (0); + +bail: + if (link != NULL) { + if (link->l_mch != NULL) { + mac_client_close(link->l_mch, 0); + } + if (link->l_mh != NULL) { + mac_close(link->l_mh); + } + kmem_free(link, sizeof (viona_link_t)); + } + if (hold != NULL) { + vmm_drv_rele(hold); + } + viona_neti_rele(nip); + + mutex_exit(&ss->ss_lock); + return (err); +} + +static int +viona_ioc_delete(viona_soft_state_t *ss, boolean_t on_close) +{ + viona_link_t *link; + viona_neti_t *nip = NULL; + + mutex_enter(&ss->ss_lock); + if ((link = ss->ss_link) == NULL) { + /* Link destruction already complete */ + mutex_exit(&ss->ss_lock); + return (0); + } + + if (link->l_destroyed) { + /* + * Link destruction has been started by another thread, but has + * not completed. 
This condition should be impossible to + * encounter when performing the on-close destroy of the link, + * since racing ioctl accessors must necessarily be absent. + */ + VERIFY(!on_close); + mutex_exit(&ss->ss_lock); + return (EAGAIN); + } + /* + * The link deletion cannot fail after this point, continuing until its + * successful completion is reached. + */ + link->l_destroyed = B_TRUE; + + /* + * Tear down the IO port hook so it cannot be used to kick any of the + * rings which are about to be reset and stopped. + */ + VERIFY0(viona_ioc_set_notify_ioport(link, 0)); + mutex_exit(&ss->ss_lock); + + /* + * Return the rings to their reset state, ignoring any possible + * interruptions from signals. + */ + VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_RX], B_FALSE)); + VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_TX], B_FALSE)); + + mutex_enter(&ss->ss_lock); + if (link->l_mch != NULL) { + /* Unhook the receive callbacks and close out the client */ + viona_rx_clear(link); + mac_client_close(link->l_mch, 0); + } + if (link->l_mh != NULL) { + mac_close(link->l_mh); + } + if (link->l_vm_hold != NULL) { + vmm_drv_rele(link->l_vm_hold); + link->l_vm_hold = NULL; + } + + nip = link->l_neti; + link->l_neti = NULL; + + viona_ring_free(&link->l_vrings[VIONA_VQ_RX]); + viona_ring_free(&link->l_vrings[VIONA_VQ_TX]); + pollhead_clean(&link->l_pollhead); + ss->ss_link = NULL; + mutex_exit(&ss->ss_lock); + + mutex_enter(&nip->vni_lock); + list_remove(&nip->vni_dev_list, ss); + mutex_exit(&nip->vni_lock); + + viona_neti_rele(nip); + + kmem_free(link, sizeof (viona_link_t)); + return (0); +} + +static int +viona_ioc_ring_init(viona_link_t *link, void *udata, int md) +{ + vioc_ring_init_t kri; + int err; + + if (ddi_copyin(udata, &kri, sizeof (kri), md) != 0) { + return (EFAULT); + } + + err = viona_ring_init(link, kri.ri_index, kri.ri_qsize, kri.ri_qaddr); + + return (err); +} + +static int +viona_ioc_ring_reset(viona_link_t *link, uint_t idx) +{ + viona_vring_t *ring; + + if (idx >= VIONA_VQ_MAX) { + return (EINVAL); + } + ring = &link->l_vrings[idx]; + + return (viona_ring_reset(ring, B_TRUE)); +} + +static int +viona_ioc_ring_kick(viona_link_t *link, uint_t idx) +{ + viona_vring_t *ring; + int err; + + if (idx >= VIONA_VQ_MAX) { + return (EINVAL); + } + ring = &link->l_vrings[idx]; + + mutex_enter(&ring->vr_lock); + switch (ring->vr_state) { + case VRS_SETUP: + /* + * An early kick to a ring which is starting its worker thread + * is fine. Once that thread is active, it will process the + * start-up request immediately. 
+ */ + /* FALLTHROUGH */ + case VRS_INIT: + ring->vr_state_flags |= VRSF_REQ_START; + /* FALLTHROUGH */ + case VRS_RUN: + cv_broadcast(&ring->vr_cv); + err = 0; + break; + default: + err = EBUSY; + break; + } + mutex_exit(&ring->vr_lock); + + return (err); +} + +static int +viona_ioc_ring_set_msi(viona_link_t *link, void *data, int md) +{ + vioc_ring_msi_t vrm; + viona_vring_t *ring; + + if (ddi_copyin(data, &vrm, sizeof (vrm), md) != 0) { + return (EFAULT); + } + if (vrm.rm_index >= VIONA_VQ_MAX) { + return (EINVAL); + } + + ring = &link->l_vrings[vrm.rm_index]; + mutex_enter(&ring->vr_lock); + ring->vr_msi_addr = vrm.rm_addr; + ring->vr_msi_msg = vrm.rm_msg; + mutex_exit(&ring->vr_lock); + + return (0); +} + +static int +viona_notify_wcb(void *arg, uintptr_t ioport, uint_t sz, uint64_t val) +{ + viona_link_t *link = (viona_link_t *)arg; + uint16_t vq = (uint16_t)val; + + if (ioport != link->l_notify_ioport || sz != sizeof (uint16_t)) { + return (EINVAL); + } + return (viona_ioc_ring_kick(link, vq)); +} + +static int +viona_ioc_set_notify_ioport(viona_link_t *link, uint_t ioport) +{ + int err = 0; + + if (link->l_notify_ioport != 0) { + vmm_drv_ioport_unhook(link->l_vm_hold, &link->l_notify_cookie); + link->l_notify_ioport = 0; + } + + if (ioport != 0) { + err = vmm_drv_ioport_hook(link->l_vm_hold, ioport, NULL, + viona_notify_wcb, (void *)link, &link->l_notify_cookie); + if (err == 0) { + link->l_notify_ioport = ioport; + } + } + return (err); +} + +static int +viona_ioc_ring_intr_clear(viona_link_t *link, uint_t idx) +{ + if (idx >= VIONA_VQ_MAX) { + return (EINVAL); + } + + link->l_vrings[idx].vr_intr_enabled = 0; + return (0); +} + +static int +viona_ioc_intr_poll(viona_link_t *link, void *udata, int md, int *rv) +{ + uint_t cnt = 0; + vioc_intr_poll_t vip; + + for (uint_t i = 0; i < VIONA_VQ_MAX; i++) { + uint_t val = link->l_vrings[i].vr_intr_enabled; + + vip.vip_status[i] = val; + if (val != 0) { + cnt++; + } + } + + if (ddi_copyout(&vip, udata, sizeof (vip), md) != 0) { + return (EFAULT); + } + *rv = (int)cnt; + return (0); +} diff --git a/usr/src/uts/i86pc/io/viona/viona_ring.c b/usr/src/uts/i86pc/io/viona/viona_ring.c new file mode 100644 index 0000000000..5ba6fad963 --- /dev/null +++ b/usr/src/uts/i86pc/io/viona/viona_ring.c @@ -0,0 +1,638 @@ +/* + * Copyright (c) 2013 Chris Torek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. + */ + + +#include + +#include "viona_impl.h" + +#define VRING_ALIGN 4096 +#define VRING_MAX_LEN 32768 + +static boolean_t viona_ring_map(viona_vring_t *); +static void viona_ring_unmap(viona_vring_t *); +static kthread_t *viona_create_worker(viona_vring_t *); + +static void * +viona_gpa2kva(viona_vring_t *ring, uint64_t gpa, size_t len) +{ + ASSERT3P(ring->vr_lease, !=, NULL); + + return (vmm_drv_gpa2kva(ring->vr_lease, gpa, len)); +} + +static boolean_t +viona_ring_lease_expire_cb(void *arg) +{ + viona_vring_t *ring = arg; + + cv_broadcast(&ring->vr_cv); + + /* The lease will be broken asynchronously. */ + return (B_FALSE); +} + +static void +viona_ring_lease_drop(viona_vring_t *ring) +{ + ASSERT(MUTEX_HELD(&ring->vr_lock)); + + if (ring->vr_lease != NULL) { + vmm_hold_t *hold = ring->vr_link->l_vm_hold; + + ASSERT(hold != NULL); + + /* + * Without an active lease, the ring mappings cannot be + * considered valid. + */ + viona_ring_unmap(ring); + + vmm_drv_lease_break(hold, ring->vr_lease); + ring->vr_lease = NULL; + } +} + +boolean_t +viona_ring_lease_renew(viona_vring_t *ring) +{ + vmm_hold_t *hold = ring->vr_link->l_vm_hold; + + ASSERT(hold != NULL); + ASSERT(MUTEX_HELD(&ring->vr_lock)); + + viona_ring_lease_drop(ring); + + /* + * Lease renewal will fail if the VM has requested that all holds be + * cleaned up. + */ + ring->vr_lease = vmm_drv_lease_sign(hold, viona_ring_lease_expire_cb, + ring); + if (ring->vr_lease != NULL) { + /* A ring undergoing renewal will need valid guest mappings */ + if (ring->vr_pa != 0 && ring->vr_size != 0) { + /* + * If new mappings cannot be established, consider the + * lease renewal a failure. 
+ */ + if (!viona_ring_map(ring)) { + viona_ring_lease_drop(ring); + return (B_FALSE); + } + } + } + return (ring->vr_lease != NULL); +} + +void +viona_ring_alloc(viona_link_t *link, viona_vring_t *ring) +{ + ring->vr_link = link; + mutex_init(&ring->vr_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&ring->vr_cv, NULL, CV_DRIVER, NULL); + mutex_init(&ring->vr_a_mutex, NULL, MUTEX_DRIVER, NULL); + mutex_init(&ring->vr_u_mutex, NULL, MUTEX_DRIVER, NULL); +} + +static void +viona_ring_misc_free(viona_vring_t *ring) +{ + const uint_t qsz = ring->vr_size; + + viona_tx_ring_free(ring, qsz); +} + +void +viona_ring_free(viona_vring_t *ring) +{ + mutex_destroy(&ring->vr_lock); + cv_destroy(&ring->vr_cv); + mutex_destroy(&ring->vr_a_mutex); + mutex_destroy(&ring->vr_u_mutex); + ring->vr_link = NULL; +} + +int +viona_ring_init(viona_link_t *link, uint16_t idx, uint16_t qsz, uint64_t pa) +{ + viona_vring_t *ring; + kthread_t *t; + int err = 0; + + if (idx >= VIONA_VQ_MAX) { + return (EINVAL); + } + if (qsz == 0 || qsz > VRING_MAX_LEN || (1 << (ffs(qsz) - 1)) != qsz) { + return (EINVAL); + } + + ring = &link->l_vrings[idx]; + mutex_enter(&ring->vr_lock); + if (ring->vr_state != VRS_RESET) { + mutex_exit(&ring->vr_lock); + return (EBUSY); + } + VERIFY(ring->vr_state_flags == 0); + + ring->vr_lease = NULL; + if (!viona_ring_lease_renew(ring)) { + err = EBUSY; + goto fail; + } + + ring->vr_size = qsz; + ring->vr_mask = (ring->vr_size - 1); + ring->vr_pa = pa; + if (!viona_ring_map(ring)) { + err = EINVAL; + goto fail; + } + + /* Initialize queue indexes */ + ring->vr_cur_aidx = 0; + + if (idx == VIONA_VQ_TX) { + viona_tx_ring_alloc(ring, qsz); + } + + /* Zero out MSI-X configuration */ + ring->vr_msi_addr = 0; + ring->vr_msi_msg = 0; + + /* Clear the stats */ + bzero(&ring->vr_stats, sizeof (ring->vr_stats)); + + t = viona_create_worker(ring); + if (t == NULL) { + err = ENOMEM; + goto fail; + } + ring->vr_worker_thread = t; + ring->vr_state = VRS_SETUP; + cv_broadcast(&ring->vr_cv); + mutex_exit(&ring->vr_lock); + return (0); + +fail: + viona_ring_lease_drop(ring); + viona_ring_misc_free(ring); + ring->vr_size = 0; + ring->vr_mask = 0; + mutex_exit(&ring->vr_lock); + return (err); +} + +int +viona_ring_reset(viona_vring_t *ring, boolean_t heed_signals) +{ + mutex_enter(&ring->vr_lock); + if (ring->vr_state == VRS_RESET) { + mutex_exit(&ring->vr_lock); + return (0); + } + + if ((ring->vr_state_flags & VRSF_REQ_STOP) == 0) { + ring->vr_state_flags |= VRSF_REQ_STOP; + cv_broadcast(&ring->vr_cv); + } + while (ring->vr_state != VRS_RESET) { + if (!heed_signals) { + cv_wait(&ring->vr_cv, &ring->vr_lock); + } else { + int rs; + + rs = cv_wait_sig(&ring->vr_cv, &ring->vr_lock); + if (rs <= 0 && ring->vr_state != VRS_RESET) { + mutex_exit(&ring->vr_lock); + return (EINTR); + } + } + } + viona_ring_lease_drop(ring); + mutex_exit(&ring->vr_lock); + return (0); +} + +static boolean_t +viona_ring_map(viona_vring_t *ring) +{ + uint64_t pos = ring->vr_pa; + const uint16_t qsz = ring->vr_size; + + ASSERT3U(qsz, !=, 0); + ASSERT3U(pos, !=, 0); + ASSERT(MUTEX_HELD(&ring->vr_lock)); + + const size_t desc_sz = qsz * sizeof (struct virtio_desc); + ring->vr_descr = viona_gpa2kva(ring, pos, desc_sz); + if (ring->vr_descr == NULL) { + goto fail; + } + pos += desc_sz; + + const size_t avail_sz = (qsz + 3) * sizeof (uint16_t); + ring->vr_avail_flags = viona_gpa2kva(ring, pos, avail_sz); + if (ring->vr_avail_flags == NULL) { + goto fail; + } + ring->vr_avail_idx = ring->vr_avail_flags + 1; + ring->vr_avail_ring = ring->vr_avail_flags + 
2; + ring->vr_avail_used_event = ring->vr_avail_ring + qsz; + pos += avail_sz; + + const size_t used_sz = (qsz * sizeof (struct virtio_used)) + + (sizeof (uint16_t) * 3); + pos = P2ROUNDUP(pos, VRING_ALIGN); + ring->vr_used_flags = viona_gpa2kva(ring, pos, used_sz); + if (ring->vr_used_flags == NULL) { + goto fail; + } + ring->vr_used_idx = ring->vr_used_flags + 1; + ring->vr_used_ring = (struct virtio_used *)(ring->vr_used_flags + 2); + ring->vr_used_avail_event = (uint16_t *)(ring->vr_used_ring + qsz); + + return (B_TRUE); + +fail: + viona_ring_unmap(ring); + return (B_FALSE); +} + +static void +viona_ring_unmap(viona_vring_t *ring) +{ + ASSERT(MUTEX_HELD(&ring->vr_lock)); + + ring->vr_descr = NULL; + ring->vr_avail_flags = NULL; + ring->vr_avail_idx = NULL; + ring->vr_avail_ring = NULL; + ring->vr_avail_used_event = NULL; + ring->vr_used_flags = NULL; + ring->vr_used_idx = NULL; + ring->vr_used_ring = NULL; + ring->vr_used_avail_event = NULL; +} + +void +viona_intr_ring(viona_vring_t *ring) +{ + uint64_t addr; + + mutex_enter(&ring->vr_lock); + /* Deliver the interrupt directly, if so configured. */ + if ((addr = ring->vr_msi_addr) != 0) { + uint64_t msg = ring->vr_msi_msg; + + mutex_exit(&ring->vr_lock); + (void) vmm_drv_msi(ring->vr_lease, addr, msg); + return; + } + mutex_exit(&ring->vr_lock); + + if (atomic_cas_uint(&ring->vr_intr_enabled, 0, 1) == 0) { + pollwakeup(&ring->vr_link->l_pollhead, POLLRDBAND); + } +} + +static void +viona_worker(void *arg) +{ + viona_vring_t *ring = (viona_vring_t *)arg; + viona_link_t *link = ring->vr_link; + proc_t *p = ttoproc(curthread); + + mutex_enter(&ring->vr_lock); + VERIFY3U(ring->vr_state, ==, VRS_SETUP); + + /* Bail immediately if ring shutdown or process exit was requested */ + if (VRING_NEED_BAIL(ring, p)) { + goto cleanup; + } + + /* Report worker thread as alive and notify creator */ + ring->vr_state = VRS_INIT; + cv_broadcast(&ring->vr_cv); + + while (ring->vr_state_flags == 0) { + /* + * Keeping lease renewals timely while waiting for the ring to + * be started is important for avoiding deadlocks. + */ + if (vmm_drv_lease_expired(ring->vr_lease)) { + if (!viona_ring_lease_renew(ring)) { + goto cleanup; + } + } + + (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock); + + if (VRING_NEED_BAIL(ring, p)) { + goto cleanup; + } + } + + ASSERT((ring->vr_state_flags & VRSF_REQ_START) != 0); + ring->vr_state = VRS_RUN; + ring->vr_state_flags &= ~VRSF_REQ_START; + + /* Ensure ring lease is valid first */ + if (vmm_drv_lease_expired(ring->vr_lease)) { + if (!viona_ring_lease_renew(ring)) { + goto cleanup; + } + } + + /* Process actual work */ + if (ring == &link->l_vrings[VIONA_VQ_RX]) { + viona_worker_rx(ring, link); + } else if (ring == &link->l_vrings[VIONA_VQ_TX]) { + viona_worker_tx(ring, link); + } else { + panic("unexpected ring: %p", (void *)ring); + } + + VERIFY3U(ring->vr_state, ==, VRS_STOP); + +cleanup: + if (ring->vr_txdesb != NULL) { + /* + * Transmit activity must be entirely concluded before the + * associated descriptors can be cleaned up. 
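+ *
+ * viona_worker_tx() provides that guarantee before control returns
+ * here, via viona_tx_wait_outstanding(), which is essentially:
+ *
+ *     while (ring->vr_xfer_outstanding != 0)
+ *             cv_wait(&ring->vr_cv, &ring->vr_lock);
+ *
+ * so the VERIFY below should hold by construction.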
+ */ + VERIFY(ring->vr_xfer_outstanding == 0); + } + viona_ring_misc_free(ring); + + viona_ring_lease_drop(ring); + ring->vr_cur_aidx = 0; + ring->vr_state = VRS_RESET; + ring->vr_state_flags = 0; + ring->vr_worker_thread = NULL; + cv_broadcast(&ring->vr_cv); + mutex_exit(&ring->vr_lock); + + mutex_enter(&ttoproc(curthread)->p_lock); + lwp_exit(); +} + +static kthread_t * +viona_create_worker(viona_vring_t *ring) +{ + k_sigset_t hold_set; + proc_t *p = curproc; + kthread_t *t; + klwp_t *lwp; + + ASSERT(MUTEX_HELD(&ring->vr_lock)); + ASSERT(ring->vr_state == VRS_RESET); + + sigfillset(&hold_set); + lwp = lwp_create(viona_worker, (void *)ring, 0, p, TS_STOPPED, + minclsyspri - 1, &hold_set, curthread->t_cid, 0); + if (lwp == NULL) { + return (NULL); + } + + t = lwptot(lwp); + mutex_enter(&p->p_lock); + t->t_proc_flag = (t->t_proc_flag & ~TP_HOLDLWP) | TP_KTHREAD; + lwp_create_done(t); + mutex_exit(&p->p_lock); + + return (t); +} + +int +vq_popchain(viona_vring_t *ring, struct iovec *iov, uint_t niov, + uint16_t *cookie) +{ + uint_t i, ndesc, idx, head, next; + struct virtio_desc vdir; + void *buf; + + ASSERT(iov != NULL); + ASSERT(niov > 0 && niov < INT_MAX); + + mutex_enter(&ring->vr_a_mutex); + idx = ring->vr_cur_aidx; + ndesc = (uint16_t)((unsigned)*ring->vr_avail_idx - (unsigned)idx); + + if (ndesc == 0) { + mutex_exit(&ring->vr_a_mutex); + return (0); + } + if (ndesc > ring->vr_size) { + /* + * Despite the fact that the guest has provided an 'avail_idx' + * which indicates that an impossible number of descriptors are + * available, continue on and attempt to process the next one. + * + * The transgression will not escape the probe or stats though. + */ + VIONA_PROBE2(ndesc_too_high, viona_vring_t *, ring, + uint16_t, ndesc); + VIONA_RING_STAT_INCR(ring, ndesc_too_high); + } + + head = ring->vr_avail_ring[idx & ring->vr_mask]; + next = head; + + for (i = 0; i < niov; next = vdir.vd_next) { + if (next >= ring->vr_size) { + VIONA_PROBE2(bad_idx, viona_vring_t *, ring, + uint16_t, next); + VIONA_RING_STAT_INCR(ring, bad_idx); + goto bail; + } + + vdir = ring->vr_descr[next]; + if ((vdir.vd_flags & VRING_DESC_F_INDIRECT) == 0) { + if (vdir.vd_len == 0) { + VIONA_PROBE2(desc_bad_len, + viona_vring_t *, ring, + uint32_t, vdir.vd_len); + VIONA_RING_STAT_INCR(ring, desc_bad_len); + goto bail; + } + buf = viona_gpa2kva(ring, vdir.vd_addr, vdir.vd_len); + if (buf == NULL) { + VIONA_PROBE_BAD_RING_ADDR(ring, vdir.vd_addr); + VIONA_RING_STAT_INCR(ring, bad_ring_addr); + goto bail; + } + iov[i].iov_base = buf; + iov[i].iov_len = vdir.vd_len; + i++; + } else { + const uint_t nindir = vdir.vd_len / 16; + volatile struct virtio_desc *vindir; + + if ((vdir.vd_len & 0xf) || nindir == 0) { + VIONA_PROBE2(indir_bad_len, + viona_vring_t *, ring, + uint32_t, vdir.vd_len); + VIONA_RING_STAT_INCR(ring, indir_bad_len); + goto bail; + } + vindir = viona_gpa2kva(ring, vdir.vd_addr, vdir.vd_len); + if (vindir == NULL) { + VIONA_PROBE_BAD_RING_ADDR(ring, vdir.vd_addr); + VIONA_RING_STAT_INCR(ring, bad_ring_addr); + goto bail; + } + next = 0; + for (;;) { + struct virtio_desc vp; + + /* + * A copy of the indirect descriptor is made + * here, rather than simply using a reference + * pointer. This prevents malicious or + * erroneous guest writes to the descriptor + * from fooling the flags/bounds verification + * through a race. 
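+ *
+ * An illustrative interleaving with a reference pointer instead:
+ *
+ *     host:  checks vindir[next].vd_len != 0    (check passes)
+ *     guest: vindir[next].vd_len = 0;           (concurrent write)
+ *     host:  copies vindir[next].vd_len bytes   (sees the new value)
+ *
+ * Snapshotting the descriptor into 'vp' ensures the verification and
+ * the subsequent use observe the same values.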
+ */ + vp = vindir[next]; + if (vp.vd_flags & VRING_DESC_F_INDIRECT) { + VIONA_PROBE1(indir_bad_nest, + viona_vring_t *, ring); + VIONA_RING_STAT_INCR(ring, + indir_bad_nest); + goto bail; + } else if (vp.vd_len == 0) { + VIONA_PROBE2(desc_bad_len, + viona_vring_t *, ring, + uint32_t, vp.vd_len); + VIONA_RING_STAT_INCR(ring, + desc_bad_len); + goto bail; + } + buf = viona_gpa2kva(ring, vp.vd_addr, + vp.vd_len); + if (buf == NULL) { + VIONA_PROBE_BAD_RING_ADDR(ring, + vp.vd_addr); + VIONA_RING_STAT_INCR(ring, + bad_ring_addr); + goto bail; + } + iov[i].iov_base = buf; + iov[i].iov_len = vp.vd_len; + i++; + + if ((vp.vd_flags & VRING_DESC_F_NEXT) == 0) + break; + if (i >= niov) { + goto loopy; + } + + next = vp.vd_next; + if (next >= nindir) { + VIONA_PROBE3(indir_bad_next, + viona_vring_t *, ring, + uint16_t, next, + uint_t, nindir); + VIONA_RING_STAT_INCR(ring, + indir_bad_next); + goto bail; + } + } + } + if ((vdir.vd_flags & VRING_DESC_F_NEXT) == 0) { + *cookie = head; + ring->vr_cur_aidx++; + mutex_exit(&ring->vr_a_mutex); + return (i); + } + } + +loopy: + VIONA_PROBE1(too_many_desc, viona_vring_t *, ring); + VIONA_RING_STAT_INCR(ring, too_many_desc); +bail: + mutex_exit(&ring->vr_a_mutex); + return (-1); +} + +void +vq_pushchain(viona_vring_t *ring, uint32_t len, uint16_t cookie) +{ + volatile struct virtio_used *vu; + uint_t uidx; + + mutex_enter(&ring->vr_u_mutex); + + uidx = *ring->vr_used_idx; + vu = &ring->vr_used_ring[uidx++ & ring->vr_mask]; + vu->vu_idx = cookie; + vu->vu_tlen = len; + membar_producer(); + *ring->vr_used_idx = uidx; + + mutex_exit(&ring->vr_u_mutex); +} + +void +vq_pushchain_many(viona_vring_t *ring, uint_t num_bufs, used_elem_t *elem) +{ + volatile struct virtio_used *vu; + uint_t uidx, i; + + mutex_enter(&ring->vr_u_mutex); + + uidx = *ring->vr_used_idx; + if (num_bufs == 1) { + vu = &ring->vr_used_ring[uidx++ & ring->vr_mask]; + vu->vu_idx = elem[0].id; + vu->vu_tlen = elem[0].len; + } else { + for (i = 0; i < num_bufs; i++) { + vu = &ring->vr_used_ring[(uidx + i) & ring->vr_mask]; + vu->vu_idx = elem[i].id; + vu->vu_tlen = elem[i].len; + } + uidx = uidx + num_bufs; + } + membar_producer(); + *ring->vr_used_idx = uidx; + + mutex_exit(&ring->vr_u_mutex); +} diff --git a/usr/src/uts/i86pc/io/viona/viona_rx.c b/usr/src/uts/i86pc/io/viona/viona_rx.c new file mode 100644 index 0000000000..1ccbaa63f1 --- /dev/null +++ b/usr/src/uts/i86pc/io/viona/viona_rx.c @@ -0,0 +1,718 @@ +/* + * Copyright (c) 2013 Chris Torek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. + */ + +#include +#include + +#include +#include +#include + +#include "viona_impl.h" + + + +#define VTNET_MAXSEGS 32 + +/* Min. octets in an ethernet frame minus FCS */ +#define MIN_BUF_SIZE 60 +#define NEED_VLAN_PAD_SIZE (MIN_BUF_SIZE - VLAN_TAGSZ) + +static mblk_t *viona_vlan_pad_mp; + +void +viona_rx_init(void) +{ + mblk_t *mp; + + ASSERT(viona_vlan_pad_mp == NULL); + + /* Create mblk for padding when VLAN tags are stripped */ + mp = allocb_wait(VLAN_TAGSZ, BPRI_HI, STR_NOSIG, NULL); + bzero(mp->b_rptr, VLAN_TAGSZ); + mp->b_wptr += VLAN_TAGSZ; + viona_vlan_pad_mp = mp; +} + +void +viona_rx_fini(void) +{ + mblk_t *mp; + + /* Clean up the VLAN padding mblk */ + mp = viona_vlan_pad_mp; + viona_vlan_pad_mp = NULL; + VERIFY(mp != NULL && mp->b_cont == NULL); + freemsg(mp); +} + +void +viona_worker_rx(viona_vring_t *ring, viona_link_t *link) +{ + proc_t *p = ttoproc(curthread); + + (void) thread_vsetname(curthread, "viona_rx_%p", ring); + + ASSERT(MUTEX_HELD(&ring->vr_lock)); + ASSERT3U(ring->vr_state, ==, VRS_RUN); + + *ring->vr_used_flags |= VRING_USED_F_NO_NOTIFY; + + do { + if (vmm_drv_lease_expired(ring->vr_lease)) { + /* + * Set the renewal flag, causing incoming traffic to be + * dropped, and issue an RX barrier to ensure any + * threads in the RX callbacks will have finished. + * The vr_lock cannot be held across the barrier as it + * poses a deadlock risk. + */ + ring->vr_state_flags |= VRSF_RENEW; + mutex_exit(&ring->vr_lock); + mac_rx_barrier(link->l_mch); + mutex_enter(&ring->vr_lock); + + if (!viona_ring_lease_renew(ring)) { + break; + } + ring->vr_state_flags &= ~VRSF_RENEW; + } + + /* + * For now, there is little to do in the RX worker as inbound + * data is delivered by MAC via the RX callbacks. If tap-like + * functionality is added later, this would be a convenient + * place to inject frames into the guest. + */ + (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock); + } while (!VRING_NEED_BAIL(ring, p)); + + ring->vr_state = VRS_STOP; + + /* + * The RX ring is stopping, before we start tearing it down it + * is imperative that we perform an RX barrier so that + * incoming packets are dropped at viona_rx_classified(). 
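+ *
+ * This is effective because viona_rx_classified() (paraphrased)
+ * drops traffic whenever the ring is not running:
+ *
+ *     if (ring->vr_state != VRS_RUN) {
+ *             freemsgchain(mp);
+ *             return;
+ *     }
+ *
+ * With VRS_STOP already set above, any callback entered after the
+ * barrier returns can only free its chain.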
+ */ + mutex_exit(&ring->vr_lock); + mac_rx_barrier(link->l_mch); + mutex_enter(&ring->vr_lock); + + *ring->vr_used_flags &= ~VRING_USED_F_NO_NOTIFY; +} + +static size_t +viona_copy_mblk(const mblk_t *mp, size_t seek, caddr_t buf, size_t len, + boolean_t *end) +{ + size_t copied = 0; + size_t off = 0; + + /* Seek past already-consumed data */ + while (seek > 0 && mp != NULL) { + const size_t chunk = MBLKL(mp); + + if (chunk > seek) { + off = seek; + break; + } + mp = mp->b_cont; + seek -= chunk; + } + + while (mp != NULL) { + const size_t chunk = MBLKL(mp) - off; + const size_t to_copy = MIN(chunk, len); + + bcopy(mp->b_rptr + off, buf, to_copy); + copied += to_copy; + buf += to_copy; + len -= to_copy; + + /* + * If all the remaining data in the mblk_t was copied, move on + * to the next one in the chain. Any seek offset applied to + * the first mblk copy is zeroed out for subsequent operations. + */ + if (chunk == to_copy) { + mp = mp->b_cont; + off = 0; + } +#ifdef DEBUG + else { + /* + * The only valid reason for the copy to consume less + * than the entire contents of the mblk_t is because + * the output buffer has been filled. + */ + ASSERT0(len); + } +#endif + + /* Go no further if the buffer has been filled */ + if (len == 0) { + break; + } + + } + *end = (mp == NULL); + return (copied); +} + +static int +viona_recv_plain(viona_vring_t *ring, const mblk_t *mp, size_t msz) +{ + struct iovec iov[VTNET_MAXSEGS]; + uint16_t cookie; + int n; + const size_t hdr_sz = sizeof (struct virtio_net_hdr); + struct virtio_net_hdr *hdr; + size_t len, copied = 0; + caddr_t buf = NULL; + boolean_t end = B_FALSE; + const uint32_t features = ring->vr_link->l_features; + + ASSERT(msz >= MIN_BUF_SIZE); + + n = vq_popchain(ring, iov, VTNET_MAXSEGS, &cookie); + if (n <= 0) { + /* Without available buffers, the frame must be dropped. */ + return (ENOSPC); + } + if (iov[0].iov_len < hdr_sz) { + /* + * There is little to do if there is not even space available + * for the sole header. Zero the buffer and bail out as a last + * act of desperation. + */ + bzero(iov[0].iov_base, iov[0].iov_len); + goto bad_frame; + } + + /* Grab the address of the header before anything else */ + hdr = (struct virtio_net_hdr *)iov[0].iov_base; + + /* + * If there is any space remaining in the first buffer after writing + * the header, fill it with frame data. + */ + if (iov[0].iov_len > hdr_sz) { + buf = (caddr_t)iov[0].iov_base + hdr_sz; + len = iov[0].iov_len - hdr_sz; + + copied += viona_copy_mblk(mp, copied, buf, len, &end); + } + + /* Copy any remaining data into subsequent buffers, if present */ + for (int i = 1; i < n && !end; i++) { + buf = (caddr_t)iov[i].iov_base; + len = iov[i].iov_len; + + copied += viona_copy_mblk(mp, copied, buf, len, &end); + } + + /* Was the expected amount of data copied? 
*/ + if (copied != msz) { + VIONA_PROBE5(too_short, viona_vring_t *, ring, + uint16_t, cookie, mblk_t *, mp, size_t, copied, + size_t, msz); + VIONA_RING_STAT_INCR(ring, too_short); + goto bad_frame; + } + + /* Populate (read: zero) the header and account for it in the size */ + bzero(hdr, hdr_sz); + copied += hdr_sz; + + /* Add chksum bits, if needed */ + if ((features & VIRTIO_NET_F_GUEST_CSUM) != 0) { + uint32_t cksum_flags; + + if (((features & VIRTIO_NET_F_GUEST_TSO4) != 0) && + ((DB_CKSUMFLAGS(mp) & HW_LSO) != 0)) { + hdr->vrh_gso_type |= VIRTIO_NET_HDR_GSO_TCPV4; + hdr->vrh_gso_size = DB_LSOMSS(mp); + } + + mac_hcksum_get((mblk_t *)mp, NULL, NULL, NULL, NULL, + &cksum_flags); + if ((cksum_flags & HCK_FULLCKSUM_OK) != 0) { + hdr->vrh_flags |= VIRTIO_NET_HDR_F_DATA_VALID; + } + } + + /* Release this chain */ + vq_pushchain(ring, copied, cookie); + return (0); + +bad_frame: + VIONA_PROBE3(bad_rx_frame, viona_vring_t *, ring, uint16_t, cookie, + mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, bad_rx_frame); + + vq_pushchain(ring, MAX(copied, MIN_BUF_SIZE + hdr_sz), cookie); + return (EINVAL); +} + +static int +viona_recv_merged(viona_vring_t *ring, const mblk_t *mp, size_t msz) +{ + struct iovec iov[VTNET_MAXSEGS]; + used_elem_t uelem[VTNET_MAXSEGS]; + int n, i = 0, buf_idx = 0, err = 0; + uint16_t cookie; + caddr_t buf; + size_t len, copied = 0, chunk = 0; + struct virtio_net_mrgrxhdr *hdr = NULL; + const size_t hdr_sz = sizeof (struct virtio_net_mrgrxhdr); + boolean_t end = B_FALSE; + const uint32_t features = ring->vr_link->l_features; + + ASSERT(msz >= MIN_BUF_SIZE); + + n = vq_popchain(ring, iov, VTNET_MAXSEGS, &cookie); + if (n <= 0) { + /* Without available buffers, the frame must be dropped. */ + VIONA_PROBE2(no_space, viona_vring_t *, ring, mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, no_space); + return (ENOSPC); + } + if (iov[0].iov_len < hdr_sz) { + /* + * There is little to do if there is not even space available + * for the sole header. Zero the buffer and bail out as a last + * act of desperation. + */ + bzero(iov[0].iov_base, iov[0].iov_len); + uelem[0].id = cookie; + uelem[0].len = iov[0].iov_len; + err = EINVAL; + goto done; + } + + /* Grab the address of the header and do initial population */ + hdr = (struct virtio_net_mrgrxhdr *)iov[0].iov_base; + bzero(hdr, hdr_sz); + hdr->vrh_bufs = 1; + + /* + * If there is any space remaining in the first buffer after writing + * the header, fill it with frame data. + */ + if (iov[0].iov_len > hdr_sz) { + buf = iov[0].iov_base + hdr_sz; + len = iov[0].iov_len - hdr_sz; + + chunk += viona_copy_mblk(mp, copied, buf, len, &end); + copied += chunk; + } + i = 1; + + do { + while (i < n && !end) { + buf = iov[i].iov_base; + len = iov[i].iov_len; + + chunk += viona_copy_mblk(mp, copied, buf, len, &end); + copied += chunk; + i++; + } + + uelem[buf_idx].id = cookie; + uelem[buf_idx].len = chunk; + + /* + * Try to grab another buffer from the ring if the mblk has not + * yet been entirely copied out. + */ + if (!end) { + if (buf_idx == (VTNET_MAXSEGS - 1)) { + /* + * Our arbitrary limit on the number of buffers + * to offer for merge has already been reached. + */ + err = EOVERFLOW; + break; + } + n = vq_popchain(ring, iov, VTNET_MAXSEGS, &cookie); + if (n <= 0) { + /* + * Without more immediate space to perform the + * copying, there is little choice left but to + * drop the packet. 
+ */
+ err = EMSGSIZE;
+ break;
+ }
+ chunk = 0;
+ i = 0;
+ buf_idx++;
+ /*
+ * Keep the header up-to-date with the number of
+ * buffers, but never reference its value since the
+ * guest could meddle with it.
+ */
+ hdr->vrh_bufs++;
+ }
+ } while (!end && copied < msz);
+
+ /* Account for the header size in the first buffer */
+ uelem[0].len += hdr_sz;
+
+ /*
+ * If no other errors were encountered during the copy, was the
+ * expected amount of data transferred?
+ */
+ if (err == 0 && copied != msz) {
+ VIONA_PROBE5(too_short, viona_vring_t *, ring,
+ uint16_t, cookie, mblk_t *, mp, size_t, copied,
+ size_t, msz);
+ VIONA_RING_STAT_INCR(ring, too_short);
+ err = EINVAL;
+ }
+
+ /* Add chksum bits, if needed */
+ if ((features & VIRTIO_NET_F_GUEST_CSUM) != 0) {
+ uint32_t cksum_flags;
+
+ if (((features & VIRTIO_NET_F_GUEST_TSO4) != 0) &&
+ ((DB_CKSUMFLAGS(mp) & HW_LSO) != 0)) {
+ hdr->vrh_gso_type |= VIRTIO_NET_HDR_GSO_TCPV4;
+ hdr->vrh_gso_size = DB_LSOMSS(mp);
+ }
+
+ mac_hcksum_get((mblk_t *)mp, NULL, NULL, NULL, NULL,
+ &cksum_flags);
+ if ((cksum_flags & HCK_FULLCKSUM_OK) != 0) {
+ hdr->vrh_flags |= VIRTIO_NET_HDR_F_DATA_VALID;
+ }
+ }
+
+done:
+ switch (err) {
+ case 0:
+ /* Success can fall right through to ring delivery */
+ break;
+
+ case EMSGSIZE:
+ VIONA_PROBE3(rx_merge_underrun, viona_vring_t *, ring,
+ uint16_t, cookie, mblk_t *, mp);
+ VIONA_RING_STAT_INCR(ring, rx_merge_underrun);
+ break;
+
+ case EOVERFLOW:
+ VIONA_PROBE3(rx_merge_overrun, viona_vring_t *, ring,
+ uint16_t, cookie, mblk_t *, mp);
+ VIONA_RING_STAT_INCR(ring, rx_merge_overrun);
+ break;
+
+ default:
+ VIONA_PROBE3(bad_rx_frame, viona_vring_t *, ring,
+ uint16_t, cookie, mblk_t *, mp);
+ VIONA_RING_STAT_INCR(ring, bad_rx_frame);
+ }
+ vq_pushchain_many(ring, buf_idx + 1, uelem);
+ return (err);
+}
+
+static void
+viona_rx_common(viona_vring_t *ring, mblk_t *mp, boolean_t is_loopback)
+{
+ viona_link_t *link = ring->vr_link;
+ mblk_t *mprx = NULL, **mprx_prevp = &mprx;
+ mblk_t *mpdrop = NULL, **mpdrop_prevp = &mpdrop;
+ const boolean_t do_merge =
+ ((link->l_features & VIRTIO_NET_F_MRG_RXBUF) != 0);
+
+ size_t nrx = 0, ndrop = 0;
+
+ while (mp != NULL) {
+ mblk_t *next = mp->b_next;
+ mblk_t *pad = NULL;
+ size_t size = msgsize(mp);
+ int err = 0;
+
+ mp->b_next = NULL;
+
+ /*
+ * We treat both a 'drop' response and errors the same here
+ * and put the packet on the drop chain. As packets may be
+ * subject to different actions in ipf (which do not all
+ * return the same set of error values), an error processing
+ * one packet doesn't mean the next packet will also generate
+ * an error.
+ */
+ if (VNETHOOK_INTERESTED_IN(link->l_neti) &&
+ viona_hook(link, ring, &mp, B_FALSE) != 0) {
+ if (mp != NULL) {
+ *mpdrop_prevp = mp;
+ mpdrop_prevp = &mp->b_next;
+ } else {
+ /*
+ * If the hook consumer (e.g. ipf) already
+ * freed the mblk_t, update the drop count now.
+ */
+ ndrop++;
+ }
+ mp = next;
+ continue;
+ }
+
+ /*
+ * Ethernet frames are expected to be padded out in order to
+ * meet the minimum size.
+ *
+ * A special case is made for frames which are short by
+ * VLAN_TAGSZ, having been stripped of their VLAN tag while
+ * traversing MAC. A preallocated (and recycled) mblk is used
+ * for that specific condition.
+ *
+ * All other frames that fall short on length will have
+ * custom-allocated zero-padding appended to them.
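+ *
+ * For example (assuming VLAN_TAGSZ is 4): a 56-octet frame has the
+ * shared 4-octet pad mblk chained on to reach the 60-octet minimum,
+ * while a 42-octet frame gets a freshly allocated, zeroed 18-octet
+ * mblk appended instead.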
+ */ + if (size == NEED_VLAN_PAD_SIZE) { + ASSERT(MBLKL(viona_vlan_pad_mp) == VLAN_TAGSZ); + ASSERT(viona_vlan_pad_mp->b_cont == NULL); + + for (pad = mp; pad->b_cont != NULL; pad = pad->b_cont) + ; + + pad->b_cont = viona_vlan_pad_mp; + size += VLAN_TAGSZ; + } else if (size < MIN_BUF_SIZE) { + const size_t pad_size = MIN_BUF_SIZE - size; + mblk_t *zero_mp; + + zero_mp = allocb(pad_size, BPRI_MED); + if (zero_mp == NULL) { + err = ENOMEM; + goto pad_drop; + } + + VIONA_PROBE3(rx_pad_short, viona_vring_t *, ring, + mblk_t *, mp, size_t, pad_size); + VIONA_RING_STAT_INCR(ring, rx_pad_short); + zero_mp->b_wptr += pad_size; + bzero(zero_mp->b_rptr, pad_size); + linkb(mp, zero_mp); + size += pad_size; + } + + if (do_merge) { + err = viona_recv_merged(ring, mp, size); + } else { + err = viona_recv_plain(ring, mp, size); + } + + /* + * The VLAN padding mblk is meant for continual reuse, so + * remove it from the chain to prevent it from being freed. + * + * Custom allocated padding does not require this treatment and + * is freed normally. + */ + if (pad != NULL) { + pad->b_cont = NULL; + } + +pad_drop: + /* + * While an error during rx processing + * (viona_recv_{merged,plain}) does not free mp on error, + * hook processing might or might not free mp. Handle either + * scenario -- if mp is not yet free, it is queued up and + * freed after the guest has been notified. If mp is + * already NULL, just proceed on. + */ + if (err != 0) { + *mpdrop_prevp = mp; + mpdrop_prevp = &mp->b_next; + + /* + * If the available ring is empty, do not bother + * attempting to deliver any more frames. Count the + * rest as dropped too. + */ + if (err == ENOSPC) { + mp->b_next = next; + break; + } + } else { + /* Chain successful mblks to be freed later */ + *mprx_prevp = mp; + mprx_prevp = &mp->b_next; + nrx++; + } + mp = next; + } + + membar_enter(); + if ((*ring->vr_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) { + viona_intr_ring(ring); + } + + /* Free successfully received frames */ + if (mprx != NULL) { + freemsgchain(mprx); + } + + /* Free dropped frames, also tallying them */ + mp = mpdrop; + while (mp != NULL) { + mblk_t *next = mp->b_next; + + mp->b_next = NULL; + freemsg(mp); + mp = next; + ndrop++; + } + VIONA_PROBE3(rx, viona_link_t *, link, size_t, nrx, size_t, ndrop); +} + +static void +viona_rx_classified(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t is_loopback) +{ + viona_vring_t *ring = (viona_vring_t *)arg; + + /* Drop traffic if ring is inactive or renewing its lease */ + if (ring->vr_state != VRS_RUN || + (ring->vr_state_flags & VRSF_RENEW) != 0) { + freemsgchain(mp); + return; + } + + viona_rx_common(ring, mp, is_loopback); +} + +static void +viona_rx_mcast(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t is_loopback) +{ + viona_vring_t *ring = (viona_vring_t *)arg; + mac_handle_t mh = ring->vr_link->l_mh; + mblk_t *mp_mcast_only = NULL; + mblk_t **mpp = &mp_mcast_only; + + /* Drop traffic if ring is inactive or renewing its lease */ + if (ring->vr_state != VRS_RUN || + (ring->vr_state_flags & VRSF_RENEW) != 0) { + freemsgchain(mp); + return; + } + + /* + * In addition to multicast traffic, broadcast packets will also arrive + * via the MAC_CLIENT_PROMISC_MULTI handler. The mac_rx_set() callback + * for fully-classified traffic has already delivered that broadcast + * traffic, so it should be suppressed here, rather than duplicating it + * to the guest. 
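+ *
+ * Concretely, the loop below keeps a packet only when
+ * mac_vlan_header_info() succeeds (possibly after a pull-up) and
+ * classifies it as mhi_dsttype == MAC_ADDRTYPE_MULTICAST; broadcast
+ * and other packets are freed rather than forwarded.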
+ */ + while (mp != NULL) { + mblk_t *mp_next; + mac_header_info_t mhi; + int err; + + mp_next = mp->b_next; + mp->b_next = NULL; + + /* Determine the packet type */ + err = mac_vlan_header_info(mh, mp, &mhi); + if (err != 0) { + mblk_t *pull; + + /* + * It is possible that gathering of the header + * information was impeded by a leading mblk_t which + * was of inadequate length to reference the needed + * fields. Try again, in case that could be solved + * with a pull-up. + */ + pull = msgpullup(mp, sizeof (struct ether_vlan_header)); + if (pull == NULL) { + err = ENOMEM; + } else { + err = mac_vlan_header_info(mh, pull, &mhi); + freemsg(pull); + } + + if (err != 0) { + VIONA_RING_STAT_INCR(ring, rx_mcast_check); + } + } + + /* Chain up matching packets while discarding others */ + if (err == 0 && mhi.mhi_dsttype == MAC_ADDRTYPE_MULTICAST) { + *mpp = mp; + mpp = &mp->b_next; + } else { + freemsg(mp); + } + + mp = mp_next; + } + + if (mp_mcast_only != NULL) { + viona_rx_common(ring, mp_mcast_only, is_loopback); + } +} + +int +viona_rx_set(viona_link_t *link) +{ + viona_vring_t *ring = &link->l_vrings[VIONA_VQ_RX]; + int err; + + mac_rx_set(link->l_mch, viona_rx_classified, ring); + err = mac_promisc_add(link->l_mch, MAC_CLIENT_PROMISC_MULTI, + viona_rx_mcast, ring, &link->l_mph, + MAC_PROMISC_FLAGS_NO_TX_LOOP | MAC_PROMISC_FLAGS_VLAN_TAG_STRIP); + if (err != 0) { + mac_rx_clear(link->l_mch); + } + + return (err); +} + +void +viona_rx_clear(viona_link_t *link) +{ + mac_promisc_remove(link->l_mph); + mac_rx_clear(link->l_mch); +} diff --git a/usr/src/uts/i86pc/io/viona/viona_tx.c b/usr/src/uts/i86pc/io/viona/viona_tx.c new file mode 100644 index 0000000000..5dc645723c --- /dev/null +++ b/usr/src/uts/i86pc/io/viona/viona_tx.c @@ -0,0 +1,756 @@ +/* + * Copyright (c) 2013 Chris Torek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. 
+ *
+ * Copyright 2015 Pluribus Networks Inc.
+ * Copyright 2019 Joyent, Inc.
+ */
+
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include "viona_impl.h"
+
+#define BNXE_NIC_DRIVER "bnxe"
+
+/*
+ * copy tx mbufs from virtio ring to avoid necessitating a wait for packet
+ * transmission to free resources.
+ */
+kmutex_t viona_force_copy_lock;
+static enum viona_force_copy {
+ VFC_UNINITALIZED = 0,
+ VFC_COPY_UNEEDED = 1,
+ VFC_COPY_REQUIRED = 2,
+} viona_force_copy_state = VFC_UNINITALIZED;
+
+struct viona_desb {
+ frtn_t d_frtn;
+ viona_vring_t *d_ring;
+ uint_t d_ref;
+ uint32_t d_len;
+ uint16_t d_cookie;
+ uchar_t *d_headers;
+};
+
+static void viona_tx(viona_link_t *, viona_vring_t *);
+static void viona_desb_release(viona_desb_t *);
+
+/*
+ * Return the number of available descriptors in the vring taking care of the
+ * 16-bit index wraparound.
+ *
+ * Note: If the number of apparently available descriptors is larger than the
+ * ring size (due to guest misbehavior), this check will still report the
+ * positive count of descriptors.
+ */
+static inline uint_t
+viona_vr_num_avail(viona_vring_t *ring)
+{
+ uint16_t ndesc;
+
+ /*
+ * We're just computing (a-b) in GF(2^16).
+ *
+ * The only glitch here is that in standard C, uint16_t promotes to
+ * (signed) int when int has more than 16 bits (almost always now).
+ * A cast back to unsigned is necessary for proper operation.
+ */
+ ndesc = (unsigned)*ring->vr_avail_idx - (unsigned)ring->vr_cur_aidx;
+
+ return (ndesc);
+}
+
+static void
+viona_tx_wait_outstanding(viona_vring_t *ring)
+{
+ ASSERT(MUTEX_HELD(&ring->vr_lock));
+
+ while (ring->vr_xfer_outstanding != 0) {
+ /*
+ * Paying heed to signals is counterproductive here. This is a
+ * very tight loop if pending transfers take an extended amount
+ * of time to be reclaimed while the host process is exiting.
+ */
+ cv_wait(&ring->vr_cv, &ring->vr_lock);
+ }
+}
+
+/*
+ * Check if full TX packet copying is needed. This should not be called from
+ * viona attach()/detach() context.
+ */
+static boolean_t
+viona_tx_copy_needed(void)
+{
+ boolean_t result;
+
+ mutex_enter(&viona_force_copy_lock);
+ if (viona_force_copy_state == VFC_UNINITALIZED) {
+ major_t bnxe_major;
+
+ /*
+ * The original code for viona featured an explicit check for
+ * the bnxe driver which, when found present, necessitated that
+ * all transmissions be copied into their own mblks instead of
+ * passing guest memory to the underlying device.
+ *
+ * The motivations for this are unclear, but until it can be
+ * proven unnecessary, the check lives on.
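+ *
+ * The observable effect is in viona_tx_ring_alloc() below: desb
+ * handles for the zero-copy TX path are only allocated when this
+ * check reports that copying is not required.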
+ */ + viona_force_copy_state = VFC_COPY_UNEEDED; + if ((bnxe_major = ddi_name_to_major(BNXE_NIC_DRIVER)) + != DDI_MAJOR_T_NONE) { + if (ddi_hold_installed_driver(bnxe_major) != NULL) { + viona_force_copy_state = VFC_COPY_REQUIRED; + ddi_rele_driver(bnxe_major); + } + } + } + result = (viona_force_copy_state == VFC_COPY_REQUIRED); + mutex_exit(&viona_force_copy_lock); + + return (result); +} + +void +viona_tx_ring_alloc(viona_vring_t *ring, const uint16_t qsz) +{ + /* Allocate desb handles for TX ring if packet copying not disabled */ + if (!viona_tx_copy_needed()) { + viona_desb_t *dp; + + dp = kmem_zalloc(sizeof (viona_desb_t) * qsz, KM_SLEEP); + ring->vr_txdesb = dp; + for (uint_t i = 0; i < qsz; i++, dp++) { + dp->d_frtn.free_func = viona_desb_release; + dp->d_frtn.free_arg = (void *)dp; + dp->d_ring = ring; + dp->d_headers = kmem_zalloc(VIONA_MAX_HDRS_LEN, + KM_SLEEP); + } + } + + /* Allocate ring-sized iovec buffers for TX */ + ring->vr_txiov = kmem_alloc(sizeof (struct iovec) * qsz, KM_SLEEP); +} + +void +viona_tx_ring_free(viona_vring_t *ring, const uint16_t qsz) +{ + if (ring->vr_txdesb != NULL) { + viona_desb_t *dp = ring->vr_txdesb; + + for (uint_t i = 0; i < qsz; i++, dp++) { + kmem_free(dp->d_headers, VIONA_MAX_HDRS_LEN); + } + kmem_free(ring->vr_txdesb, sizeof (viona_desb_t) * qsz); + ring->vr_txdesb = NULL; + } + + if (ring->vr_txiov != NULL) { + kmem_free(ring->vr_txiov, sizeof (struct iovec) * qsz); + ring->vr_txiov = NULL; + } +} + +static void +viona_tx_done(viona_vring_t *ring, uint32_t len, uint16_t cookie) +{ + vq_pushchain(ring, len, cookie); + + membar_enter(); + if ((*ring->vr_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) { + viona_intr_ring(ring); + } +} + +void +viona_worker_tx(viona_vring_t *ring, viona_link_t *link) +{ + proc_t *p = ttoproc(curthread); + + (void) thread_vsetname(curthread, "viona_tx_%p", ring); + + ASSERT(MUTEX_HELD(&ring->vr_lock)); + ASSERT3U(ring->vr_state, ==, VRS_RUN); + + mutex_exit(&ring->vr_lock); + + for (;;) { + boolean_t bail = B_FALSE; + boolean_t renew = B_FALSE; + uint_t ntx = 0; + + *ring->vr_used_flags |= VRING_USED_F_NO_NOTIFY; + while (viona_vr_num_avail(ring)) { + viona_tx(link, ring); + + /* + * It is advantageous for throughput to keep this + * transmission loop tight, but periodic breaks to + * check for other events are of value too. + */ + if (ntx++ >= ring->vr_size) + break; + } + *ring->vr_used_flags &= ~VRING_USED_F_NO_NOTIFY; + + VIONA_PROBE2(tx, viona_link_t *, link, uint_t, ntx); + + /* + * Check for available descriptors on the ring once more in + * case a late addition raced with the NO_NOTIFY flag toggle. + * + * The barrier ensures that visibility of the vr_used_flags + * store does not cross the viona_vr_num_avail() check below. + */ + membar_enter(); + bail = VRING_NEED_BAIL(ring, p); + renew = vmm_drv_lease_expired(ring->vr_lease); + if (!bail && !renew && viona_vr_num_avail(ring)) { + continue; + } + + if ((link->l_features & VIRTIO_F_RING_NOTIFY_ON_EMPTY) != 0) { + viona_intr_ring(ring); + } + + mutex_enter(&ring->vr_lock); + + while (!bail && !renew && !viona_vr_num_avail(ring)) { + (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock); + bail = VRING_NEED_BAIL(ring, p); + renew = vmm_drv_lease_expired(ring->vr_lease); + } + + if (bail) { + break; + } else if (renew) { + ring->vr_state_flags |= VRSF_RENEW; + /* + * When renewing the lease for the ring, no TX + * frames may be outstanding, as they contain + * references to guest memory. 
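+ *
+ * viona_tx_wait_outstanding(), called just below, enforces
+ * this by sleeping (without heeding signals) until every
+ * desballoc()-sourced block referencing the old guest
+ * mappings has been freed.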
+ */ + viona_tx_wait_outstanding(ring); + + if (!viona_ring_lease_renew(ring)) { + break; + } + ring->vr_state_flags &= ~VRSF_RENEW; + } + mutex_exit(&ring->vr_lock); + } + + ASSERT(MUTEX_HELD(&ring->vr_lock)); + + ring->vr_state = VRS_STOP; + viona_tx_wait_outstanding(ring); +} + +static void +viona_desb_release(viona_desb_t *dp) +{ + viona_vring_t *ring = dp->d_ring; + uint_t ref; + uint32_t len; + uint16_t cookie; + + ref = atomic_dec_uint_nv(&dp->d_ref); + if (ref > 1) { + return; + } + + /* + * The desb corresponding to this index must be ready for reuse before + * the descriptor is returned to the guest via the 'used' ring. + */ + len = dp->d_len; + cookie = dp->d_cookie; + dp->d_len = 0; + dp->d_cookie = 0; + dp->d_ref = 0; + + viona_tx_done(ring, len, cookie); + + mutex_enter(&ring->vr_lock); + if ((--ring->vr_xfer_outstanding) == 0) { + cv_broadcast(&ring->vr_cv); + } + mutex_exit(&ring->vr_lock); +} + +static boolean_t +viona_tx_csum(viona_vring_t *ring, const struct virtio_net_hdr *hdr, + mblk_t *mp, uint32_t len) +{ + viona_link_t *link = ring->vr_link; + const struct ether_header *eth; + uint_t eth_len = sizeof (struct ether_header); + ushort_t ftype; + ipha_t *ipha = NULL; + uint8_t ipproto = IPPROTO_NONE; /* NONE is not exactly right, but ok */ + uint16_t flags = 0; + const uint_t csum_start = hdr->vrh_csum_start; + const uint_t csum_stuff = hdr->vrh_csum_offset + csum_start; + + /* + * Validate that the checksum offsets provided by the guest are within + * the bounds of the packet. Additionally, ensure that the checksum + * contents field is within the headers mblk copied by viona_tx(). + */ + if (csum_start >= len || csum_start < eth_len || csum_stuff >= len || + (csum_stuff + sizeof (uint16_t)) > MBLKL(mp)) { + VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, fail_hcksum); + return (B_FALSE); + } + + /* + * This is guaranteed to be safe thanks to the header copying + * done in viona_tx(). + */ + eth = (const struct ether_header *)mp->b_rptr; + ftype = ntohs(eth->ether_type); + + if (ftype == ETHERTYPE_VLAN) { + const struct ether_vlan_header *veth; + + /* punt on QinQ for now */ + eth_len = sizeof (struct ether_vlan_header); + veth = (const struct ether_vlan_header *)eth; + ftype = ntohs(veth->ether_type); + } + + if (ftype == ETHERTYPE_IP) { + ipha = (ipha_t *)(mp->b_rptr + eth_len); + + ipproto = ipha->ipha_protocol; + } else if (ftype == ETHERTYPE_IPV6) { + ip6_t *ip6h = (ip6_t *)(mp->b_rptr + eth_len); + + ipproto = ip6h->ip6_nxt; + } + + /* + * We ignore hdr_len because the spec says it can't be + * trusted. Besides, our own stack will determine the header + * boundary. + */ + if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0 && + (hdr->vrh_gso_type & VIRTIO_NET_HDR_GSO_TCPV4) != 0 && + ftype == ETHERTYPE_IP) { + uint16_t *cksump; + uint32_t cksum; + ipaddr_t src = ipha->ipha_src; + ipaddr_t dst = ipha->ipha_dst; + + /* + * Our native IP stack doesn't set the L4 length field + * of the pseudo header when LSO is in play. Other IP + * stacks, e.g. Linux, do include the length field. + * This is a problem because the hardware expects that + * the length field is not set. When it is set it will + * cause an incorrect TCP checksum to be generated. + * The reason this works in Linux is because Linux + * corrects the pseudo-header checksum in the driver + * code. In order to get the correct HW checksum we + * need to assume the guest's IP stack gave us a bogus + * TCP partial checksum and calculate it ourselves. 
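+ *
+ * As a sketch of the computation below (one's-complement
+ * math, with the L4 length intentionally left out of the
+ * sum):
+ *
+ *     cksum = IP_TCP_CSUM_COMP
+ *         + (dst >> 16) + (dst & 0xFFFF)
+ *         + (src >> 16) + (src & 0xFFFF);
+ *     cksum = (cksum & 0xFFFF) + (cksum >> 16);  (fold, twice)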
+ */ + cksump = IPH_TCPH_CHECKSUMP(ipha, IPH_HDR_LENGTH(ipha)); + cksum = IP_TCP_CSUM_COMP; + cksum += (dst >> 16) + (dst & 0xFFFF) + + (src >> 16) + (src & 0xFFFF); + cksum = (cksum & 0xFFFF) + (cksum >> 16); + *(cksump) = (cksum & 0xFFFF) + (cksum >> 16); + + /* + * Since viona is a "legacy device", the data stored + * by the driver will be in the guest's native endian + * format (see sections 2.4.3 and 5.1.6.1 of the + * VIRTIO 1.0 spec for more info). At this time the + * only guests using viona are x86 and we can assume + * little-endian. + */ + lso_info_set(mp, LE_16(hdr->vrh_gso_size), HW_LSO); + + /* + * Hardware, like ixgbe, expects the client to request + * IP header checksum offload if it's sending LSO (see + * ixgbe_get_context()). Unfortunately, virtio makes + * no allowances for negotiating IP header checksum + * and HW offload, only TCP checksum. We add the flag + * and zero-out the checksum field. This mirrors the + * behavior of our native IP stack (which does this in + * the interest of HW that expects the field to be + * zero). + */ + flags |= HCK_IPV4_HDRCKSUM; + ipha->ipha_hdr_checksum = 0; + } + + /* + * Use DB_CKSUMFLAGS instead of mac_hcksum_get() to make sure + * HW_LSO, if present, is not lost. + */ + flags |= DB_CKSUMFLAGS(mp); + + /* + * Partial checksum support from the NIC is ideal, since it most + * closely maps to the interface defined by virtio. + */ + if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0 && + (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) { + /* + * MAC expects these offsets to be relative to the + * start of the L3 header rather than the L2 frame. + */ + flags |= HCK_PARTIALCKSUM; + mac_hcksum_set(mp, csum_start - eth_len, csum_stuff - eth_len, + len - eth_len, 0, flags); + return (B_TRUE); + } + + /* + * Without partial checksum support, look to the L3/L4 protocol + * information to see if the NIC can handle it. If not, the + * checksum will need to calculated inline. + */ + if (ftype == ETHERTYPE_IP) { + if ((link->l_cap_csum & HCKSUM_INET_FULL_V4) != 0 && + (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) { + uint16_t *csump = (uint16_t *)(mp->b_rptr + csum_stuff); + *csump = 0; + flags |= HCK_FULLCKSUM; + mac_hcksum_set(mp, 0, 0, 0, 0, flags); + return (B_TRUE); + } + + /* XXX: Implement manual fallback checksumming? */ + VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, fail_hcksum); + return (B_FALSE); + } else if (ftype == ETHERTYPE_IPV6) { + if ((link->l_cap_csum & HCKSUM_INET_FULL_V6) != 0 && + (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) { + uint16_t *csump = (uint16_t *)(mp->b_rptr + csum_stuff); + *csump = 0; + flags |= HCK_FULLCKSUM; + mac_hcksum_set(mp, 0, 0, 0, 0, flags); + return (B_TRUE); + } + + /* XXX: Implement manual fallback checksumming? 
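+ * Doing so would mean walking the mblk chain and computing
+ * the 16-bit one's-complement sum in software before storing
+ * it at *csump.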
*/ + VIONA_PROBE2(fail_hcksum6, viona_link_t *, link, mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, fail_hcksum6); + return (B_FALSE); + } + + /* Cannot even emulate hcksum for unrecognized protocols */ + VIONA_PROBE2(fail_hcksum_proto, viona_link_t *, link, mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, fail_hcksum_proto); + return (B_FALSE); +} + +static void +viona_tx(viona_link_t *link, viona_vring_t *ring) +{ + struct iovec *iov = ring->vr_txiov; + const uint_t max_segs = ring->vr_size; + uint16_t cookie; + int i, n; + uint32_t len, base_off = 0; + uint32_t min_copy = VIONA_MAX_HDRS_LEN; + mblk_t *mp_head, *mp_tail, *mp; + viona_desb_t *dp = NULL; + mac_client_handle_t link_mch = link->l_mch; + const struct virtio_net_hdr *hdr; + + mp_head = mp_tail = NULL; + + ASSERT(iov != NULL); + + n = vq_popchain(ring, iov, max_segs, &cookie); + if (n == 0) { + VIONA_PROBE1(tx_absent, viona_vring_t *, ring); + VIONA_RING_STAT_INCR(ring, tx_absent); + return; + } else if (n < 0) { + /* + * Any error encountered in vq_popchain has already resulted in + * specific probe and statistic handling. Further action here + * is unnecessary. + */ + return; + } + + /* Grab the header and ensure it is of adequate length */ + hdr = (const struct virtio_net_hdr *)iov[0].iov_base; + len = iov[0].iov_len; + if (len < sizeof (struct virtio_net_hdr)) { + goto drop_fail; + } + + /* Make sure the packet headers are always in the first mblk. */ + if (ring->vr_txdesb != NULL) { + dp = &ring->vr_txdesb[cookie]; + + /* + * If the guest driver is operating properly, each desb slot + * should be available for use when processing a TX descriptor + * from the 'avail' ring. In the case of drivers that reuse a + * descriptor before it has been posted to the 'used' ring, the + * data is simply dropped. + */ + if (atomic_cas_uint(&dp->d_ref, 0, 1) != 0) { + dp = NULL; + goto drop_fail; + } + + dp->d_cookie = cookie; + mp_head = desballoc(dp->d_headers, VIONA_MAX_HDRS_LEN, 0, + &dp->d_frtn); + + /* Account for the successful desballoc. */ + if (mp_head != NULL) + dp->d_ref++; + } else { + mp_head = allocb(VIONA_MAX_HDRS_LEN, 0); + } + + if (mp_head == NULL) + goto drop_fail; + + mp_tail = mp_head; + + /* + * We always copy enough of the guest data to cover the + * headers. This protects us from TOCTOU attacks and allows + * message block length assumptions to be made in subsequent + * code. In many cases, this means copying more data than + * strictly necessary. That's okay, as it is the larger packets + * (such as LSO) that really benefit from desballoc(). + */ + for (i = 1; i < n; i++) { + const uint32_t to_copy = MIN(min_copy, iov[i].iov_len); + + bcopy(iov[i].iov_base, mp_head->b_wptr, to_copy); + mp_head->b_wptr += to_copy; + len += to_copy; + min_copy -= to_copy; + + /* + * We've met the minimum copy requirement. The rest of + * the guest data can be referenced. + */ + if (min_copy == 0) { + /* + * If we copied all contents of this + * descriptor then move onto the next one. + * Otherwise, record how far we are into the + * current descriptor. 
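+ *
+ * E.g. if 18 bytes of min_copy remain and iov[i].iov_len
+ * is 100: to_copy is 18, base_off is set to 18, and the
+ * zero-copy loop below resumes 18 bytes into this same
+ * descriptor.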
+ */ + if (iov[i].iov_len == to_copy) + i++; + else + base_off = to_copy; + + break; + } + } + + ASSERT3P(mp_head, !=, NULL); + ASSERT3P(mp_tail, !=, NULL); + + for (; i < n; i++) { + uintptr_t base = (uintptr_t)iov[i].iov_base + base_off; + uint32_t chunk = iov[i].iov_len - base_off; + + ASSERT3U(base_off, <, iov[i].iov_len); + ASSERT3U(chunk, >, 0); + + if (dp != NULL) { + mp = desballoc((uchar_t *)base, chunk, 0, &dp->d_frtn); + if (mp == NULL) { + goto drop_fail; + } + dp->d_ref++; + } else { + mp = allocb(chunk, BPRI_MED); + if (mp == NULL) { + goto drop_fail; + } + bcopy((uchar_t *)base, mp->b_wptr, chunk); + } + + base_off = 0; + len += chunk; + mp->b_wptr += chunk; + mp_tail->b_cont = mp; + mp_tail = mp; + } + + if (VNETHOOK_INTERESTED_OUT(link->l_neti)) { + /* + * The hook consumer may elect to free the mblk_t and set + * our mblk_t ** to NULL. When using a viona_desb_t + * (dp != NULL), we do not want the corresponding cleanup to + * occur during the viona_hook() call. We instead want to + * reset and recycle dp for future use. To prevent cleanup + * during the viona_hook() call, we take a ref on dp (if being + * used), and release it on success. On failure, the + * freemsgchain() call will release all the refs taken earlier + * in viona_tx() (aside from the initial ref and the one we + * take), and drop_hook will reset dp for reuse. + */ + if (dp != NULL) + dp->d_ref++; + + /* + * Pass &mp instead of &mp_head so we don't lose track of + * mp_head if the hook consumer (i.e. ipf) elects to free mp + * and set mp to NULL. + */ + mp = mp_head; + if (viona_hook(link, ring, &mp, B_TRUE) != 0) { + if (mp != NULL) + freemsgchain(mp); + goto drop_hook; + } + + if (dp != NULL) { + dp->d_ref--; + + /* + * It is possible that the hook(s) accepted the packet, + * but as part of its processing, it issued a pull-up + * which released all references to the desb. In that + * case, go back to acting like the packet is entirely + * copied (which it is). + */ + if (dp->d_ref == 1) { + dp->d_cookie = 0; + dp->d_ref = 0; + dp = NULL; + } + } + } + + /* + * Request hardware checksumming, if necessary. If the guest + * sent an LSO packet then it must have also negotiated and + * requested partial checksum; therefore the LSO logic is + * contained within viona_tx_csum(). + */ + if ((link->l_features & VIRTIO_NET_F_CSUM) != 0 && + (hdr->vrh_flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) != 0) { + if (!viona_tx_csum(ring, hdr, mp_head, len - iov[0].iov_len)) { + goto drop_fail; + } + } + + if (dp != NULL) { + dp->d_len = len; + mutex_enter(&ring->vr_lock); + ring->vr_xfer_outstanding++; + mutex_exit(&ring->vr_lock); + } else { + /* + * If the data was cloned out of the ring, the descriptors can + * be marked as 'used' now, rather than deferring that action + * until after successful packet transmission. + */ + viona_tx_done(ring, len, cookie); + } + + /* + * We're potentially going deep into the networking layer; make sure the + * guest can't run concurrently. + */ + smt_begin_unsafe(); + mac_tx(link_mch, mp_head, 0, MAC_DROP_ON_NO_DESC, NULL); + smt_end_unsafe(); + return; + +drop_fail: + /* + * On the off chance that memory is not available via the desballoc or + * allocb calls, there are few options left besides to fail and drop + * the frame on the floor. + */ + + if (dp != NULL) { + /* + * Take an additional reference on the desb handle (if present) + * so any desballoc-sourced mblks can release their hold on it + * without the handle reaching its final state and executing + * its clean-up logic. 
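+ *
+ * After the freemsgchain() below returns, the handle should be left
+ * with exactly two references (the initial one taken in viona_tx()
+ * plus this one), matching the VERIFY under drop_hook.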
+ */ + dp->d_ref++; + } + + /* + * Free any already-allocated blocks and sum up the total length of the + * dropped data to be released to the used ring. + */ + freemsgchain(mp_head); + +drop_hook: + len = 0; + for (uint_t i = 0; i < n; i++) { + len += iov[i].iov_len; + } + + if (dp != NULL) { + VERIFY(dp->d_ref == 2); + + /* Clean up the desb handle, releasing the extra hold. */ + dp->d_len = 0; + dp->d_cookie = 0; + dp->d_ref = 0; + } + + VIONA_PROBE3(tx_drop, viona_vring_t *, ring, uint32_t, len, + uint16_t, cookie); + viona_tx_done(ring, len, cookie); +} diff --git a/usr/src/uts/i86pc/sys/viona_io.h b/usr/src/uts/i86pc/sys/viona_io.h index a26cc00a55..46cc72eb06 100644 --- a/usr/src/uts/i86pc/sys/viona_io.h +++ b/usr/src/uts/i86pc/sys/viona_io.h @@ -11,36 +11,53 @@ /* * Copyright 2013 Pluribus Networks Inc. - * Copyright 2017 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ #ifndef _VIONA_IO_H_ #define _VIONA_IO_H_ #define VNA_IOC (('V' << 16)|('C' << 8)) -#define VNA_IOC_CREATE (VNA_IOC | 1) -#define VNA_IOC_DELETE (VNA_IOC | 2) -#define VNA_IOC_RX_RING_INIT (VNA_IOC | 3) -#define VNA_IOC_TX_RING_INIT (VNA_IOC | 4) -#define VNA_IOC_RX_RING_RESET (VNA_IOC | 5) -#define VNA_IOC_TX_RING_RESET (VNA_IOC | 6) -#define VNA_IOC_RX_RING_KICK (VNA_IOC | 7) -#define VNA_IOC_TX_RING_KICK (VNA_IOC | 8) -#define VNA_IOC_RX_INTR_CLR (VNA_IOC | 9) -#define VNA_IOC_TX_INTR_CLR (VNA_IOC | 10) -#define VNA_IOC_SET_FEATURES (VNA_IOC | 11) -#define VNA_IOC_GET_FEATURES (VNA_IOC | 12) +#define VNA_IOC_CREATE (VNA_IOC | 0x01) +#define VNA_IOC_DELETE (VNA_IOC | 0x02) + +#define VNA_IOC_RING_INIT (VNA_IOC | 0x10) +#define VNA_IOC_RING_RESET (VNA_IOC | 0x11) +#define VNA_IOC_RING_KICK (VNA_IOC | 0x12) +#define VNA_IOC_RING_SET_MSI (VNA_IOC | 0x13) +#define VNA_IOC_RING_INTR_CLR (VNA_IOC | 0x14) + +#define VNA_IOC_INTR_POLL (VNA_IOC | 0x20) +#define VNA_IOC_SET_FEATURES (VNA_IOC | 0x21) +#define VNA_IOC_GET_FEATURES (VNA_IOC | 0x22) +#define VNA_IOC_SET_NOTIFY_IOP (VNA_IOC | 0x23) typedef struct vioc_create { datalink_id_t c_linkid; - char c_vmname[64]; - size_t c_lomem_size; - size_t c_himem_size; + int c_vmfd; } vioc_create_t; typedef struct vioc_ring_init { + uint16_t ri_index; uint16_t ri_qsize; uint64_t ri_qaddr; } vioc_ring_init_t; +typedef struct vioc_ring_msi { + uint16_t rm_index; + uint64_t rm_addr; + uint64_t rm_msg; +} vioc_ring_msi_t; + +enum viona_vq_id { + VIONA_VQ_RX = 0, + VIONA_VQ_TX = 1, + VIONA_VQ_MAX = 2 +}; + +typedef struct vioc_intr_poll { + uint32_t vip_status[VIONA_VQ_MAX]; +} vioc_intr_poll_t; + + #endif /* _VIONA_IO_H_ */ diff --git a/usr/src/uts/i86pc/sys/vmm_drv.h b/usr/src/uts/i86pc/sys/vmm_drv.h index 33fefc10ea..856b75e5cc 100644 --- a/usr/src/uts/i86pc/sys/vmm_drv.h +++ b/usr/src/uts/i86pc/sys/vmm_drv.h @@ -17,6 +17,9 @@ #define _VMM_DRV_H_ #ifdef _KERNEL + +#include + struct vmm_hold; typedef struct vmm_hold vmm_hold_t; diff --git a/usr/src/uts/i86pc/viona/Makefile b/usr/src/uts/i86pc/viona/Makefile index 4ede5bbd84..dac59c9a45 100644 --- a/usr/src/uts/i86pc/viona/Makefile +++ b/usr/src/uts/i86pc/viona/Makefile @@ -11,7 +11,7 @@ # # Copyright 2013 Pluribus Networks Inc. -# Copyright 2017 Joyent, Inc. +# Copyright 2019 Joyent, Inc. # # @@ -27,6 +27,7 @@ OBJECTS = $(VIONA_OBJS:%=$(OBJS_DIR)/%) LINTS = $(VIONA_OBJS:%.o=$(LINTS_DIR)/%.ln) ROOTMODULE = $(USR_DRV_DIR)/$(MODULE) CONF_SRCDIR = $(UTSBASE)/i86pc/io/viona +MAPFILE = $(UTSBASE)/i86pc/io/viona/viona.mapfile # # Include common rules. 
@@ -49,8 +50,16 @@ LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN LINTTAGS += -erroff=E_FUNC_RET_MAYBE_IGNORED2 LINTTAGS += -erroff=E_FUNC_RET_ALWAYS_IGNOR2 +# needs work +SMOFF += all_func_returns + +ALL_BUILDS = $(ALL_BUILDSONLY64) +DEF_BUILDS = $(DEF_BUILDSONLY64) + CFLAGS += $(CCVERBOSE) -LDFLAGS += -dy -Ndrv/dld -Nmisc/mac -Nmisc/dls -Ndrv/vmm +LDFLAGS += -dy -Ndrv/dld -Nmisc/mac -Nmisc/dls -Ndrv/vmm -Nmisc/neti +LDFLAGS += -Nmisc/hook +LDFLAGS += -M $(MAPFILE) # # Default build targets. diff --git a/usr/src/uts/intel/ipf/ipf.global-objs.debug64 b/usr/src/uts/intel/ipf/ipf.global-objs.debug64 index 663613cee3..846011b4c5 100644 --- a/usr/src/uts/intel/ipf/ipf.global-objs.debug64 +++ b/usr/src/uts/intel/ipf/ipf.global-objs.debug64 @@ -22,13 +22,17 @@ # Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# Copyright 2013 Joyent, Inc. All rights reserved +# Copyright 2018 Joyent, Inc. All rights reserved # fr_availfuncs fr_features fr_objbytes hdrsizes +hook_viona_in +hook_viona_in_gz +hook_viona_out +hook_viona_out_gz hook4_in hook4_in_gz hook4_loop_in @@ -58,6 +62,9 @@ ip6exthdr ipf_cb_ops ipf_dev_info ipf_devfiles +ipf_eth_bcast_addr +ipf_eth_ipv4_mcast +ipf_eth_ipv6_mcast ipf_kstat_tmp ipf_minor ipf_ops -- cgit v1.2.3 From ee8ae3fa63afd7fd57d5e63676a991af0fb8d887 Mon Sep 17 00:00:00 2001 From: Patrick Mooney Date: Mon, 11 May 2020 13:36:12 -0500 Subject: 12738 Panic in vlapic_callout_handler Reviewed by: Hans Rosenfeld Reviewed by: Jason King Reviewed by: Mike Zeller Approved by: Dan McDonald --- usr/src/compat/freebsd/sys/callout.h | 24 +++++++++++++------- usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c | 39 ++++++++++++++++----------------- 2 files changed, 35 insertions(+), 28 deletions(-) (limited to 'usr/src/uts/i86pc') diff --git a/usr/src/compat/freebsd/sys/callout.h b/usr/src/compat/freebsd/sys/callout.h index 6087a09f54..11823e6321 100644 --- a/usr/src/compat/freebsd/sys/callout.h +++ b/usr/src/compat/freebsd/sys/callout.h @@ -12,6 +12,7 @@ /* * Copyright 2014 Pluribus Networks Inc. * Copyright 2018 Joyent, Inc. + * Copyright 2020 Oxide Computer Company */ #ifndef _COMPAT_FREEBSD_SYS_CALLOUT_H_ @@ -21,20 +22,27 @@ struct callout { cyclic_id_t c_cyc_id; - int c_flags; + hrtime_t c_target; + hrtime_t c_fired; void (*c_func)(void *); void *c_arg; - }; -#define CALLOUT_ACTIVE 0x0002 /* callout is currently active */ -#define CALLOUT_PENDING 0x0004 /* callout is waiting for timeout */ - #define C_ABSOLUTE 0x0200 /* event time is absolute. */ -#define callout_active(c) ((c)->c_flags & CALLOUT_ACTIVE) -#define callout_deactivate(c) ((c)->c_flags &= ~CALLOUT_ACTIVE) -#define callout_pending(c) ((c)->c_flags & CALLOUT_PENDING) +/* Callout considered active if t_target has not been zeroed */ +#define callout_active(c) ((c)->c_target != 0) +#define callout_deactivate(c) ((c)->c_target = 0) + +/* + * If a callout is rescheduled (into the future) while its handler is running, + * it will be able to detect the pending invocation by the target time being + * greater than the time at which the handler was fired. + * + * This is only valid when checked from the callout handler, which is the only + * place where it is used by bhyve today. 
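Aside: a sketch of how a cyclic-backed handler observes the c_target/c_fired protocol described above; example_handler() and do_work() are hypothetical names, not code from this patch:

    static void
    example_handler(void *arg)
    {
            struct callout *c = arg;

            c->c_fired = gethrtime();
            do_work(c);     /* may rearm c via vmm_glue_callout_reset_sbt() */

            if (callout_pending(c)) {
                    /* c_target > c_fired: rearmed during do_work(). */
                    return;
            }
            callout_deactivate(c);  /* zero c_target; the callout is idle */
    }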
+ */ +#define callout_pending(c) ((c)->c_target > (c)->c_fired) void vmm_glue_callout_init(struct callout *c, int mpsafe); int vmm_glue_callout_reset_sbt(struct callout *c, sbintime_t sbt, diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c index a8d94ea024..e2522858dd 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c @@ -37,6 +37,7 @@ * * Copyright 2014 Pluribus Networks Inc. * Copyright 2019 Joyent, Inc. + * Copyright 2020 Oxide Computer Company */ #include @@ -305,8 +306,13 @@ vmm_glue_callout_handler(void *arg) { struct callout *c = arg; - c->c_flags &= ~CALLOUT_PENDING; - if (c->c_flags & CALLOUT_ACTIVE) { + if (callout_active(c)) { + /* + * Record the handler fire time so that callout_pending() is + * able to detect if the callout becomes rescheduled during the + * course of the handler. + */ + c->c_fired = gethrtime(); (c->c_func)(c->c_arg); } } @@ -322,17 +328,9 @@ vmm_glue_callout_init(struct callout *c, int mpsafe) hdlr.cyh_arg = c; when.cyt_when = CY_INFINITY; when.cyt_interval = CY_INFINITY; + bzero(c, sizeof (*c)); mutex_enter(&cpu_lock); -#if 0 - /* - * XXXJOY: according to the freebsd sources, callouts do not begin - * their life in the ACTIVE state. - */ - c->c_flags |= CALLOUT_ACTIVE; -#else - bzero(c, sizeof (*c)); -#endif c->c_cyc_id = cyclic_add(&hdlr, &when); mutex_exit(&cpu_lock); } @@ -352,15 +350,14 @@ vmm_glue_callout_reset_sbt(struct callout *c, sbintime_t sbt, sbintime_t pr, ASSERT(c->c_cyc_id != CYCLIC_NONE); + if ((flags & C_ABSOLUTE) == 0) { + target += gethrtime(); + } + c->c_func = func; c->c_arg = arg; - c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING); - - if (flags & C_ABSOLUTE) { - cyclic_reprogram(c->c_cyc_id, target); - } else { - cyclic_reprogram(c->c_cyc_id, target + gethrtime()); - } + c->c_target = target; + cyclic_reprogram(c->c_cyc_id, target); return (0); } @@ -369,8 +366,9 @@ int vmm_glue_callout_stop(struct callout *c) { ASSERT(c->c_cyc_id != CYCLIC_NONE); + + c->c_target = 0; cyclic_reprogram(c->c_cyc_id, CY_INFINITY); - c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING); return (0); } @@ -379,10 +377,11 @@ int vmm_glue_callout_drain(struct callout *c) { ASSERT(c->c_cyc_id != CYCLIC_NONE); + + c->c_target = 0; mutex_enter(&cpu_lock); cyclic_remove(c->c_cyc_id); c->c_cyc_id = CYCLIC_NONE; - c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING); mutex_exit(&cpu_lock); return (0); -- cgit v1.2.3 From 3c5f2a9de9c6554ce899ad4ebf7978ea7293994a Mon Sep 17 00:00:00 2001 From: Patrick Mooney Date: Tue, 5 May 2020 15:31:34 -0500 Subject: 12746 x86_emulate_cpuid() should clear upper 32 bits Reviewed by: John Levon Reviewed by: Mike Zeller Reviewed by: Toomas Soome Approved by: Dan McDonald --- usr/src/uts/i86pc/io/vmm/amd/svm.c | 8 ++-- usr/src/uts/i86pc/io/vmm/intel/vmx.c | 9 ++--- usr/src/uts/i86pc/io/vmm/x86.c | 75 ++++++++++++++++++++---------------- usr/src/uts/i86pc/io/vmm/x86.h | 4 +- 4 files changed, 51 insertions(+), 45 deletions(-) (limited to 'usr/src/uts/i86pc') diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm.c b/usr/src/uts/i86pc/io/vmm/amd/svm.c index 25dc3a63fa..80d76ab640 100644 --- a/usr/src/uts/i86pc/io/vmm/amd/svm.c +++ b/usr/src/uts/i86pc/io/vmm/amd/svm.c @@ -28,6 +28,7 @@ /* * Copyright 2018 Joyent, Inc. 
+ * Copyright 2020 Oxide Computer Company */ #include @@ -1518,11 +1519,8 @@ svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) break; case VMCB_EXIT_CPUID: vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_CPUID, 1); - handled = x86_emulate_cpuid(svm_sc->vm, vcpu, - (uint32_t *)&state->rax, - (uint32_t *)&ctx->sctx_rbx, - (uint32_t *)&ctx->sctx_rcx, - (uint32_t *)&ctx->sctx_rdx); + handled = x86_emulate_cpuid(svm_sc->vm, vcpu, &state->rax, + &ctx->sctx_rbx, &ctx->sctx_rcx, &ctx->sctx_rdx); break; case VMCB_EXIT_HLT: vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_HLT, 1); diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx.c b/usr/src/uts/i86pc/io/vmm/intel/vmx.c index ce42ff8c9c..eea036b253 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmx.c +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx.c @@ -40,6 +40,7 @@ * * Copyright 2015 Pluribus Networks Inc. * Copyright 2018 Joyent, Inc. + * Copyright 2020 Oxide Computer Company */ #include @@ -1244,11 +1245,9 @@ vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx) int handled; #endif - handled = x86_emulate_cpuid(vm, vcpu, - (uint32_t*)(&vmxctx->guest_rax), - (uint32_t*)(&vmxctx->guest_rbx), - (uint32_t*)(&vmxctx->guest_rcx), - (uint32_t*)(&vmxctx->guest_rdx)); + handled = x86_emulate_cpuid(vm, vcpu, (uint64_t *)&vmxctx->guest_rax, + (uint64_t *)&vmxctx->guest_rbx, (uint64_t *)&vmxctx->guest_rcx, + (uint64_t *)&vmxctx->guest_rdx); return (handled); } diff --git a/usr/src/uts/i86pc/io/vmm/x86.c b/usr/src/uts/i86pc/io/vmm/x86.c index d74f866013..6213173587 100644 --- a/usr/src/uts/i86pc/io/vmm/x86.c +++ b/usr/src/uts/i86pc/io/vmm/x86.c @@ -39,6 +39,7 @@ * * Copyright 2014 Pluribus Networks Inc. * Copyright 2018 Joyent, Inc. + * Copyright 2020 Oxide Computer Company */ #include @@ -90,34 +91,39 @@ log2(u_int x) } int -x86_emulate_cpuid(struct vm *vm, int vcpu_id, - uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) +x86_emulate_cpuid(struct vm *vm, int vcpu_id, uint64_t *rax, uint64_t *rbx, + uint64_t *rcx, uint64_t *rdx) { const struct xsave_limits *limits; uint64_t cr4; int error, enable_invpcid, level, width = 0, x2apic_id = 0; - unsigned int func, regs[4], logical_cpus = 0; + unsigned int func, regs[4], logical_cpus = 0, param; enum x2apic_state x2apic_state; uint16_t cores, maxcpus, sockets, threads; - VCPU_CTR2(vm, vcpu_id, "cpuid %#x,%#x", *eax, *ecx); + /* + * The function of CPUID is controlled through the provided value of + * %eax (and secondarily %ecx, for certain leaf data). + */ + func = (uint32_t)*rax; + param = (uint32_t)*rcx; + + VCPU_CTR2(vm, vcpu_id, "cpuid %#x,%#x", func, param); /* * Requests for invalid CPUID levels should map to the highest * available level instead. 
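Aside: the clamping rules can be restated as a hypothetical helper; basic_max and ext_max stand in for cpu_high and cpu_exthigh, and CPUID_VM_HIGH is the hypervisor-range ceiling from the bhyve headers. With basic_max == 0x16 and ext_max == 0x80000008, a request for leaf 0x17 is served as 0x16, and one for 0x8000000a as 0x80000008:

    static uint32_t
    cpuid_clamp(uint32_t func, uint32_t basic_max, uint32_t ext_max)
    {
            if (ext_max != 0 && func >= 0x80000000)
                    return (func > ext_max ? ext_max : func);
            if (func >= 0x40000000)
                    return (func > CPUID_VM_HIGH ? CPUID_VM_HIGH : func);
            return (func > basic_max ? basic_max : func);
    }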
*/ - if (cpu_exthigh != 0 && *eax >= 0x80000000) { - if (*eax > cpu_exthigh) - *eax = cpu_exthigh; - } else if (*eax >= 0x40000000) { - if (*eax > CPUID_VM_HIGH) - *eax = CPUID_VM_HIGH; - } else if (*eax > cpu_high) { - *eax = cpu_high; + if (cpu_exthigh != 0 && func >= 0x80000000) { + if (func > cpu_exthigh) + func = cpu_exthigh; + } else if (func >= 0x40000000) { + if (func > CPUID_VM_HIGH) + func = CPUID_VM_HIGH; + } else if (func > cpu_high) { + func = cpu_high; } - func = *eax; - /* * In general the approach used for CPU topology is to * advertise a flat topology where all CPUs are packages with @@ -135,10 +141,10 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id, case CPUID_8000_0003: case CPUID_8000_0004: case CPUID_8000_0006: - cpuid_count(*eax, *ecx, regs); + cpuid_count(func, param, regs); break; case CPUID_8000_0008: - cpuid_count(*eax, *ecx, regs); + cpuid_count(func, param, regs); if (vmm_is_amd()) { /* * As on Intel (0000_0007:0, EDX), mask out @@ -169,7 +175,7 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id, break; case CPUID_8000_0001: - cpuid_count(*eax, *ecx, regs); + cpuid_count(func, param, regs); /* * Hide SVM from guest. @@ -263,7 +269,7 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id, */ vm_get_topology(vm, &sockets, &cores, &threads, &maxcpus); - switch (*ecx) { + switch (param) { case 0: logical_cpus = threads; level = 1; @@ -408,7 +414,7 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id, break; case CPUID_0000_0004: - cpuid_count(*eax, *ecx, regs); + cpuid_count(func, param, regs); if (regs[0] || regs[1] || regs[2] || regs[3]) { vm_get_topology(vm, &sockets, &cores, &threads, @@ -437,8 +443,8 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id, regs[3] = 0; /* leaf 0 */ - if (*ecx == 0) { - cpuid_count(*eax, *ecx, regs); + if (param == 0) { + cpuid_count(func, param, regs); /* Only leaf 0 is supported */ regs[0] = 0; @@ -491,21 +497,21 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id, if (vmm_is_intel()) { vm_get_topology(vm, &sockets, &cores, &threads, &maxcpus); - if (*ecx == 0) { + if (param == 0) { logical_cpus = threads; width = log2(logical_cpus); level = CPUID_TYPE_SMT; x2apic_id = vcpu_id; } - if (*ecx == 1) { + if (param == 1) { logical_cpus = threads * cores; width = log2(logical_cpus); level = CPUID_TYPE_CORE; x2apic_id = vcpu_id; } - if (!cpuid_leaf_b || *ecx >= 2) { + if (!cpuid_leaf_b || param >= 2) { width = 0; logical_cpus = 0; level = 0; @@ -514,7 +520,7 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id, regs[0] = width & 0x1f; regs[1] = logical_cpus & 0xffff; - regs[2] = (level << 8) | (*ecx & 0xff); + regs[2] = (level << 8) | (param & 0xff); regs[3] = x2apic_id; } else { regs[0] = 0; @@ -534,8 +540,8 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id, break; } - cpuid_count(*eax, *ecx, regs); - switch (*ecx) { + cpuid_count(func, param, regs); + switch (param) { case 0: /* * Only permit the guest to use bits @@ -565,7 +571,7 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id, * pass through as-is, otherwise return * all zeroes. */ - if (!(limits->xcr0_allowed & (1ul << *ecx))) { + if (!(limits->xcr0_allowed & (1ul << param))) { regs[0] = 0; regs[1] = 0; regs[2] = 0; @@ -590,14 +596,17 @@ default_leaf: * how many unhandled leaf values have been seen. */ atomic_add_long(&bhyve_xcpuids, 1); - cpuid_count(*eax, *ecx, regs); + cpuid_count(func, param, regs); break; } - *eax = regs[0]; - *ebx = regs[1]; - *ecx = regs[2]; - *edx = regs[3]; + /* + * CPUID clears the upper 32-bits of the long-mode registers. 
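Aside: the reason the prototype moved from uint32_t * to uint64_t * can be demonstrated in a few lines of userland C. On little-endian x86, a store through a casted 32-bit pointer updates only the low half of the saved register image, while a plain 64-bit assignment zero-extends the way hardware CPUID does:

    #include <inttypes.h>
    #include <stdio.h>

    int
    main(void)
    {
            uint64_t rax = 0xdeadbeefcafed00dULL;

            /* Old style: 32-bit store through a cast (little-endian). */
            *(uint32_t *)&rax = 0x16;
            (void) printf("%" PRIx64 "\n", rax);    /* deadbeef00000016 */

            /* New style: assignment zero-extends, matching real CPUID. */
            rax = (uint32_t)0x16;
            (void) printf("%" PRIx64 "\n", rax);    /* 16 */
            return (0);
    }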
+ */ + *rax = regs[0]; + *rbx = regs[1]; + *rcx = regs[2]; + *rdx = regs[3]; return (1); } diff --git a/usr/src/uts/i86pc/io/vmm/x86.h b/usr/src/uts/i86pc/io/vmm/x86.h index 0d70c04fd8..cb8e12fcd2 100644 --- a/usr/src/uts/i86pc/io/vmm/x86.h +++ b/usr/src/uts/i86pc/io/vmm/x86.h @@ -63,8 +63,8 @@ */ #define CPUID_0000_0001_FEAT0_VMX (1<<5) -int x86_emulate_cpuid(struct vm *vm, int vcpu_id, uint32_t *eax, uint32_t *ebx, - uint32_t *ecx, uint32_t *edx); +int x86_emulate_cpuid(struct vm *vm, int vcpu_id, uint64_t *rax, uint64_t *rbx, + uint64_t *rcx, uint64_t *rdx); enum vm_cpuid_capability { VCC_NONE, -- cgit v1.2.3 From eb9a1df2aeb866bf1de4494433b6d7e5fa07b3ae Mon Sep 17 00:00:00 2001 From: Hans Rosenfeld Date: Mon, 16 Jul 2018 09:03:51 +0000 Subject: 12680 want PCI pass-thru in bhyve Portions contributed by: Patrick Mooney Portions contributed by: John Levon Portions contributed by: Andy Fiddaman Reviewed by: Patrick Mooney Approved by: Dan McDonald --- exception_lists/packaging | 3 + usr/src/cmd/Makefile | 1 + usr/src/cmd/bhyve/pci_passthru.c | 433 +++--- usr/src/cmd/bhyvectl/bhyvectl.c | 8 + usr/src/cmd/devfsadm/i386/misc_link_i386.c | 19 + usr/src/cmd/pptadm/Makefile | 43 + usr/src/cmd/pptadm/pptadm.c | 205 +++ usr/src/compat/freebsd/amd64/machine/cpu.h | 23 + .../freebsd/contrib/dev/acpica/include/acpi.h | 21 + usr/src/compat/freebsd/dev/pci/pcivar.h | 38 + usr/src/compat/freebsd/sys/bus.h | 21 + usr/src/lib/Makefile | 3 + usr/src/lib/libppt/Makefile | 44 + usr/src/lib/libppt/Makefile.com | 46 + usr/src/lib/libppt/amd64/Makefile | 19 + usr/src/lib/libppt/common/libppt.c | 506 +++++++ usr/src/lib/libppt/common/libppt.h | 36 + usr/src/lib/libppt/common/llib-lppt | 19 + usr/src/lib/libppt/common/mapfile-vers | 40 + usr/src/lib/libppt/i386/Makefile | 18 + usr/src/lib/libppt/sparc/Makefile | 18 + usr/src/lib/libppt/sparcv9/Makefile | 19 + usr/src/lib/libvmmapi/common/mapfile-vers | 1 + usr/src/lib/libvmmapi/common/vmmapi.c | 100 +- usr/src/lib/libvmmapi/common/vmmapi.h | 15 + usr/src/man/man1m/Makefile | 1 + usr/src/man/man1m/pptadm.1m | 74 + usr/src/pkg/manifests/system-bhyve.mf | 8 + usr/src/pkg/manifests/system-library-bhyve.mf | 3 + usr/src/uts/common/os/modsysfile.c | 26 +- usr/src/uts/i86pc/Makefile.files | 7 +- usr/src/uts/i86pc/Makefile.i86pc | 1 + usr/src/uts/i86pc/io/vmm/intel/vtd.c | 103 +- usr/src/uts/i86pc/io/vmm/intel/vtd_sol.c | 83 ++ usr/src/uts/i86pc/io/vmm/io/iommu.c | 383 ++++++ usr/src/uts/i86pc/io/vmm/io/ppt.c | 1436 ++++++++++++++++++++ usr/src/uts/i86pc/io/vmm/io/ppt.conf | 14 + usr/src/uts/i86pc/io/vmm/io/ppt.h | 27 +- usr/src/uts/i86pc/io/vmm/io/ppt.mapfile | 52 + usr/src/uts/i86pc/io/vmm/io/sol_iommu.c | 86 -- usr/src/uts/i86pc/io/vmm/io/sol_ppt.c | 92 -- usr/src/uts/i86pc/io/vmm/vmm.c | 27 + usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c | 57 +- usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c | 15 + usr/src/uts/i86pc/ppt/Makefile | 86 ++ usr/src/uts/i86pc/sys/ppt_dev.h | 56 + usr/src/uts/i86pc/sys/vmm.h | 5 + usr/src/uts/i86pc/sys/vmm_dev.h | 47 + usr/src/uts/i86pc/vmm/Makefile | 5 +- usr/src/uts/intel/ia32/ml/modstubs.s | 18 +- 50 files changed, 3963 insertions(+), 448 deletions(-) create mode 100644 usr/src/cmd/pptadm/Makefile create mode 100644 usr/src/cmd/pptadm/pptadm.c create mode 100644 usr/src/compat/freebsd/amd64/machine/cpu.h create mode 100644 usr/src/compat/freebsd/contrib/dev/acpica/include/acpi.h create mode 100644 usr/src/compat/freebsd/dev/pci/pcivar.h create mode 100644 usr/src/compat/freebsd/sys/bus.h create mode 100644 usr/src/lib/libppt/Makefile create mode 
100644 usr/src/lib/libppt/Makefile.com create mode 100644 usr/src/lib/libppt/amd64/Makefile create mode 100644 usr/src/lib/libppt/common/libppt.c create mode 100644 usr/src/lib/libppt/common/libppt.h create mode 100644 usr/src/lib/libppt/common/llib-lppt create mode 100644 usr/src/lib/libppt/common/mapfile-vers create mode 100644 usr/src/lib/libppt/i386/Makefile create mode 100644 usr/src/lib/libppt/sparc/Makefile create mode 100644 usr/src/lib/libppt/sparcv9/Makefile create mode 100644 usr/src/man/man1m/pptadm.1m create mode 100644 usr/src/uts/i86pc/io/vmm/intel/vtd_sol.c create mode 100644 usr/src/uts/i86pc/io/vmm/io/iommu.c create mode 100644 usr/src/uts/i86pc/io/vmm/io/ppt.c create mode 100644 usr/src/uts/i86pc/io/vmm/io/ppt.conf create mode 100644 usr/src/uts/i86pc/io/vmm/io/ppt.mapfile delete mode 100644 usr/src/uts/i86pc/io/vmm/io/sol_iommu.c delete mode 100644 usr/src/uts/i86pc/io/vmm/io/sol_ppt.c create mode 100644 usr/src/uts/i86pc/ppt/Makefile create mode 100644 usr/src/uts/i86pc/sys/ppt_dev.h (limited to 'usr/src/uts/i86pc') diff --git a/exception_lists/packaging b/exception_lists/packaging index cd1e8ed230..981eb8aa91 100644 --- a/exception_lists/packaging +++ b/exception_lists/packaging @@ -820,7 +820,10 @@ usr/lib/libsff.so # private bhyve files # lib/amd64/libvmmapi.so i386 +usr/include/libppt.h i386 usr/include/vmmapi.h i386 +usr/lib/amd64/libppt.so i386 +usr/lib/libppt.so i386 # # libcustr is private diff --git a/usr/src/cmd/Makefile b/usr/src/cmd/Makefile index f20274bd35..0f2cc306aa 100644 --- a/usr/src/cmd/Makefile +++ b/usr/src/cmd/Makefile @@ -323,6 +323,7 @@ COMMON_SUBDIRS= \ ppgsz \ pg \ plockstat \ + pptadm \ pr \ prctl \ print \ diff --git a/usr/src/cmd/bhyve/pci_passthru.c b/usr/src/cmd/bhyve/pci_passthru.c index d2c69e795c..3782914cd5 100644 --- a/usr/src/cmd/bhyve/pci_passthru.c +++ b/usr/src/cmd/bhyve/pci_passthru.c @@ -40,6 +40,8 @@ __FBSDID("$FreeBSD$"); #include #include +#include + #include #include @@ -59,30 +61,15 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include "pci_emul.h" #include "mem.h" -#ifndef _PATH_DEVPCI -#define _PATH_DEVPCI "/dev/pci" -#endif - -#ifndef _PATH_DEVIO -#define _PATH_DEVIO "/dev/io" -#endif - -#ifndef _PATH_MEM -#define _PATH_MEM "/dev/mem" -#endif - #define LEGACY_SUPPORT 1 #define MSIX_TABLE_COUNT(ctrl) (((ctrl) & PCIM_MSIXCTRL_TABLE_SIZE) + 1) #define MSIX_CAPLEN 12 -static int pcifd = -1; -static int iofd = -1; -static int memfd = -1; - struct passthru_softc { struct pci_devinst *psc_pi; struct pcibar psc_bar[PCI_BARMAX + 1]; @@ -94,14 +81,16 @@ struct passthru_softc { struct { int capoff; } psc_msix; - struct pcisel psc_sel; + int pptfd; + int msi_limit; + int msix_limit; }; static int msi_caplen(int msgctrl) { int len; - + len = 10; /* minimum length of msi capability */ if (msgctrl & PCIM_MSICTRL_64BIT) @@ -120,33 +109,76 @@ msi_caplen(int msgctrl) } static uint32_t -read_config(const struct pcisel *sel, long reg, int width) +read_config(const struct passthru_softc *sc, long reg, int width) { - struct pci_io pi; + struct ppt_cfg_io pi; - bzero(&pi, sizeof(pi)); - pi.pi_sel = *sel; - pi.pi_reg = reg; - pi.pi_width = width; + pi.pci_off = reg; + pi.pci_width = width; - if (ioctl(pcifd, PCIOCREAD, &pi) < 0) - return (0); /* XXX */ - else - return (pi.pi_data); + if (ioctl(sc->pptfd, PPT_CFG_READ, &pi) != 0) { + return (0); + } + return (pi.pci_data); } static void -write_config(const struct pcisel *sel, long reg, int width, uint32_t data) +write_config(const struct passthru_softc *sc, long reg, int width, + 
uint32_t data) { - struct pci_io pi; + struct ppt_cfg_io pi; - bzero(&pi, sizeof(pi)); - pi.pi_sel = *sel; - pi.pi_reg = reg; - pi.pi_width = width; - pi.pi_data = data; + pi.pci_off = reg; + pi.pci_width = width; + pi.pci_data = data; - (void)ioctl(pcifd, PCIOCWRITE, &pi); /* XXX */ + (void) ioctl(sc->pptfd, PPT_CFG_WRITE, &pi); +} + +static int +passthru_get_bar(struct passthru_softc *sc, int bar, enum pcibar_type *type, + uint64_t *base, uint64_t *size) +{ + struct ppt_bar_query pb; + + pb.pbq_baridx = bar; + + if (ioctl(sc->pptfd, PPT_BAR_QUERY, &pb) != 0) { + return (-1); + } + + switch (pb.pbq_type) { + case PCI_ADDR_IO: + *type = PCIBAR_IO; + break; + case PCI_ADDR_MEM32: + *type = PCIBAR_MEM32; + break; + case PCI_ADDR_MEM64: + *type = PCIBAR_MEM64; + break; + default: + err(1, "unrecognized BAR type: %u\n", pb.pbq_type); + break; + } + + *base = pb.pbq_base; + *size = pb.pbq_size; + return (0); +} + +static int +passthru_dev_open(const char *path, int *pptfdp) +{ + int pptfd; + + if ((pptfd = open(path, O_RDWR)) < 0) { + return (errno); + } + + /* XXX: verify fd with ioctl? */ + *pptfdp = pptfd; + return (0); } #ifdef LEGACY_SUPPORT @@ -174,55 +206,87 @@ passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr) } #endif /* LEGACY_SUPPORT */ +static void +passthru_intr_limit(struct passthru_softc *sc, struct msixcap *msixcap) +{ + struct pci_devinst *pi = sc->psc_pi; + int off; + + /* Reduce the number of MSI vectors if higher than OS limit */ + if ((off = sc->psc_msi.capoff) != 0 && sc->msi_limit != -1) { + int msi_limit, mmc; + + msi_limit = + sc->msi_limit > 16 ? PCIM_MSICTRL_MMC_32 : + sc->msi_limit > 8 ? PCIM_MSICTRL_MMC_16 : + sc->msi_limit > 4 ? PCIM_MSICTRL_MMC_8 : + sc->msi_limit > 2 ? PCIM_MSICTRL_MMC_4 : + sc->msi_limit > 1 ? PCIM_MSICTRL_MMC_2 : + PCIM_MSICTRL_MMC_1; + mmc = sc->psc_msi.msgctrl & PCIM_MSICTRL_MMC_MASK; + + if (mmc > msi_limit) { + sc->psc_msi.msgctrl &= ~PCIM_MSICTRL_MMC_MASK; + sc->psc_msi.msgctrl |= msi_limit; + pci_set_cfgdata16(pi, off + 2, sc->psc_msi.msgctrl); + } + } + + /* Reduce the number of MSI-X vectors if higher than OS limit */ + if ((off = sc->psc_msix.capoff) != 0 && sc->msix_limit != -1) { + if (MSIX_TABLE_COUNT(msixcap->msgctrl) > sc->msix_limit) { + msixcap->msgctrl &= ~PCIM_MSIXCTRL_TABLE_SIZE; + msixcap->msgctrl |= sc->msix_limit - 1; + pci_set_cfgdata16(pi, off + 2, msixcap->msgctrl); + } + } +} + static int cfginitmsi(struct passthru_softc *sc) { int i, ptr, capptr, cap, sts, caplen, table_size; uint32_t u32; - struct pcisel sel; - struct pci_devinst *pi; + struct pci_devinst *pi = sc->psc_pi; struct msixcap msixcap; uint32_t *msixcap_ptr; - pi = sc->psc_pi; - sel = sc->psc_sel; - /* * Parse the capabilities and cache the location of the MSI * and MSI-X capabilities. 
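Aside: the ternary chain in passthru_intr_limit() above works because the Multiple Message Capable field of the MSI control register is log2-encoded, so an encoded value of n advertises 2^n vectors. A hypothetical decoder makes that concrete; the PCIM_MSICTRL_* constants are the FreeBSD pcireg.h definitions bhyve already uses:

    /* MMC occupies bits 3:1 of msgctrl; PCIM_MSICTRL_MMC_32 is n == 5. */
    static uint_t
    msi_mmc_to_count(uint16_t msgctrl)
    {
            return (1U << ((msgctrl & PCIM_MSICTRL_MMC_MASK) >> 1));
    }

Capping the guest-visible field, rather than rejecting the device, lets the guest negotiate a vector count the host interrupt framework can actually deliver.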
*/ - sts = read_config(&sel, PCIR_STATUS, 2); + sts = read_config(sc, PCIR_STATUS, 2); if (sts & PCIM_STATUS_CAPPRESENT) { - ptr = read_config(&sel, PCIR_CAP_PTR, 1); + ptr = read_config(sc, PCIR_CAP_PTR, 1); while (ptr != 0 && ptr != 0xff) { - cap = read_config(&sel, ptr + PCICAP_ID, 1); + cap = read_config(sc, ptr + PCICAP_ID, 1); if (cap == PCIY_MSI) { /* * Copy the MSI capability into the config * space of the emulated pci device */ sc->psc_msi.capoff = ptr; - sc->psc_msi.msgctrl = read_config(&sel, - ptr + 2, 2); + sc->psc_msi.msgctrl = read_config(sc, + ptr + 2, 2); sc->psc_msi.emulated = 0; caplen = msi_caplen(sc->psc_msi.msgctrl); capptr = ptr; while (caplen > 0) { - u32 = read_config(&sel, capptr, 4); + u32 = read_config(sc, capptr, 4); pci_set_cfgdata32(pi, capptr, u32); caplen -= 4; capptr += 4; } } else if (cap == PCIY_MSIX) { /* - * Copy the MSI-X capability + * Copy the MSI-X capability */ sc->psc_msix.capoff = ptr; caplen = 12; msixcap_ptr = (uint32_t*) &msixcap; capptr = ptr; while (caplen > 0) { - u32 = read_config(&sel, capptr, 4); + u32 = read_config(sc, capptr, 4); *msixcap_ptr = u32; pci_set_cfgdata32(pi, capptr, u32); caplen -= 4; @@ -230,10 +294,12 @@ cfginitmsi(struct passthru_softc *sc) msixcap_ptr++; } } - ptr = read_config(&sel, ptr + PCICAP_NEXTPTR, 1); + ptr = read_config(sc, ptr + PCICAP_NEXTPTR, 1); } } + passthru_intr_limit(sc, &msixcap); + if (sc->psc_msix.capoff != 0) { pi->pi_msix.pba_bar = msixcap.pba_info & PCIM_MSIX_BIR_MASK; @@ -265,7 +331,7 @@ cfginitmsi(struct passthru_softc *sc) */ if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) { int origptr, msiptr; - origptr = read_config(&sel, PCIR_CAP_PTR, 1); + origptr = read_config(sc, PCIR_CAP_PTR, 1); msiptr = passthru_add_msicap(pi, 1, origptr); sc->psc_msi.capoff = msiptr; sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2); @@ -275,14 +341,15 @@ cfginitmsi(struct passthru_softc *sc) #endif /* Make sure one of the capabilities is present */ - if (sc->psc_msi.capoff == 0 && sc->psc_msix.capoff == 0) + if (sc->psc_msi.capoff == 0 && sc->psc_msix.capoff == 0) { return (-1); - else + } else { return (0); + } } static uint64_t -msix_table_read(struct passthru_softc *sc, uint64_t offset, int size) +passthru_msix_table_read(struct passthru_softc *sc, uint64_t offset, int size) { struct pci_devinst *pi; struct msix_table_entry *entry; @@ -360,8 +427,8 @@ msix_table_read(struct passthru_softc *sc, uint64_t offset, int size) } static void -msix_table_write(struct vmctx *ctx, int vcpu, struct passthru_softc *sc, - uint64_t offset, int size, uint64_t data) +passthru_msix_table_write(struct vmctx *ctx, int vcpu, + struct passthru_softc *sc, uint64_t offset, int size, uint64_t data) { struct pci_devinst *pi; struct msix_table_entry *entry; @@ -426,10 +493,9 @@ msix_table_write(struct vmctx *ctx, int vcpu, struct passthru_softc *sc, /* If the entry is masked, don't set it up */ if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0 || (vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { - (void)vm_setup_pptdev_msix(ctx, vcpu, - sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, - sc->psc_sel.pc_func, index, entry->addr, - entry->msg_data, entry->vector_control); + (void) vm_setup_pptdev_msix(ctx, vcpu, sc->pptfd, + index, entry->addr, entry->msg_data, + entry->vector_control); } } } @@ -437,7 +503,6 @@ msix_table_write(struct vmctx *ctx, int vcpu, struct passthru_softc *sc, static int init_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base) { - int b, s, f; int error, idx; size_t len, 
remaining; uint32_t table_size, table_offset; @@ -447,14 +512,10 @@ init_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base) assert(pci_msix_table_bar(pi) >= 0 && pci_msix_pba_bar(pi) >= 0); - b = sc->psc_sel.pc_bus; - s = sc->psc_sel.pc_dev; - f = sc->psc_sel.pc_func; - - /* + /* * If the MSI-X table BAR maps memory intended for - * other uses, it is at least assured that the table - * either resides in its own page within the region, + * other uses, it is at least assured that the table + * either resides in its own page within the region, * or it resides in a page shared with only the PBA. */ table_offset = rounddown2(pi->pi_msix.table_offset, 4096); @@ -490,12 +551,11 @@ init_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base) pi->pi_msix.pba_page_offset = table_offset + table_size - 4096; pi->pi_msix.pba_page = mmap(NULL, 4096, PROT_READ | - PROT_WRITE, MAP_SHARED, memfd, start + + PROT_WRITE, MAP_SHARED, sc->pptfd, pi->pi_msix.pba_page_offset); if (pi->pi_msix.pba_page == MAP_FAILED) { - warn( - "Failed to map PBA page for MSI-X on %d/%d/%d", - b, s, f); + warn("Failed to map PBA page for MSI-X on %d", + sc->pptfd); return (-1); } } @@ -504,7 +564,7 @@ init_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base) /* Map everything before the MSI-X table */ if (table_offset > 0) { len = table_offset; - error = vm_map_pptdev_mmio(ctx, b, s, f, start, len, base); + error = vm_map_pptdev_mmio(ctx, sc->pptfd, start, len, base); if (error) return (error); @@ -521,7 +581,7 @@ init_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base) /* Map everything beyond the end of the MSI-X table */ if (remaining > 0) { len = remaining; - error = vm_map_pptdev_mmio(ctx, b, s, f, start, len, base); + error = vm_map_pptdev_mmio(ctx, sc->pptfd, start, len, base); if (error) return (error); } @@ -532,47 +592,26 @@ init_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base) static int cfginitbar(struct vmctx *ctx, struct passthru_softc *sc) { - int i, error; - struct pci_devinst *pi; - struct pci_bar_io bar; - enum pcibar_type bartype; - uint64_t base, size; - - pi = sc->psc_pi; + struct pci_devinst *pi = sc->psc_pi; + uint_t i; /* * Initialize BAR registers */ for (i = 0; i <= PCI_BARMAX; i++) { - bzero(&bar, sizeof(bar)); - bar.pbi_sel = sc->psc_sel; - bar.pbi_reg = PCIR_BAR(i); + enum pcibar_type bartype; + uint64_t base, size; + int error; - if (ioctl(pcifd, PCIOCGETBAR, &bar) < 0) + if (passthru_get_bar(sc, i, &bartype, &base, &size) != 0) { continue; - - if (PCI_BAR_IO(bar.pbi_base)) { - bartype = PCIBAR_IO; - base = bar.pbi_base & PCIM_BAR_IO_BASE; - } else { - switch (bar.pbi_base & PCIM_BAR_MEM_TYPE) { - case PCIM_BAR_MEM_64: - bartype = PCIBAR_MEM64; - break; - default: - bartype = PCIBAR_MEM32; - break; - } - base = bar.pbi_base & PCIM_BAR_MEM_BASE; } - size = bar.pbi_length; if (bartype != PCIBAR_IO) { if (((base | size) & PAGE_MASK) != 0) { - warnx("passthru device %d/%d/%d BAR %d: " + warnx("passthru device %d BAR %d: " "base %#lx or size %#lx not page aligned\n", - sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, - sc->psc_sel.pc_func, i, base, size); + sc->pptfd, i, base, size); return (-1); } } @@ -590,13 +629,12 @@ cfginitbar(struct vmctx *ctx, struct passthru_softc *sc) /* The MSI-X table needs special handling */ if (i == pci_msix_table_bar(pi)) { error = init_msix_table(ctx, sc, base); - if (error) + if (error) return (-1); } else if (bartype != PCIBAR_IO) { /* Map the physical BAR in the guest MMIO space 
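Aside: a worked example of the page-split arithmetic in init_msix_table() above, using illustrative values: a 16 KiB BAR whose MSI-X table sits at offset 0x1800 and holds 8 entries of 16 bytes each.

    table_offset = rounddown2(0x1800, 4096)      = 0x1000
    table_size   = 0x1800 - 0x1000 + (8 * 16)    = 0x880
    table_size   = roundup2(0x880, 4096)         = 0x1000

The guest gets direct mappings for [0x0000, 0x1000) and [0x2000, 0x4000), while the page [0x1000, 0x2000) holding the table is trapped and emulated.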
*/ - error = vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus, - sc->psc_sel.pc_dev, sc->psc_sel.pc_func, - pi->pi_bar[i].addr, pi->pi_bar[i].size, base); + error = vm_map_pptdev_mmio(ctx, sc->pptfd, + pi->pi_bar[i].addr, pi->pi_bar[i].size, base); if (error) return (-1); } @@ -614,114 +652,43 @@ cfginitbar(struct vmctx *ctx, struct passthru_softc *sc) } static int -cfginit(struct vmctx *ctx, struct pci_devinst *pi, int bus, int slot, int func) +cfginit(struct vmctx *ctx, struct passthru_softc *sc) { - int error; - struct passthru_softc *sc; - - error = 1; - sc = pi->pi_arg; - - bzero(&sc->psc_sel, sizeof(struct pcisel)); - sc->psc_sel.pc_bus = bus; - sc->psc_sel.pc_dev = slot; - sc->psc_sel.pc_func = func; - if (cfginitmsi(sc) != 0) { - warnx("failed to initialize MSI for PCI %d/%d/%d", - bus, slot, func); - goto done; + warnx("failed to initialize MSI for PCI %d", sc->pptfd); + return (-1); } if (cfginitbar(ctx, sc) != 0) { - warnx("failed to initialize BARs for PCI %d/%d/%d", - bus, slot, func); - goto done; + warnx("failed to initialize BARs for PCI %d", sc->pptfd); + return (-1); } - error = 0; /* success */ -done: - return (error); + return (0); } static int passthru_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { - int bus, slot, func, error, memflags; + int error, memflags, pptfd; struct passthru_softc *sc; -#ifndef WITHOUT_CAPSICUM - cap_rights_t rights; - cap_ioctl_t pci_ioctls[] = { PCIOCREAD, PCIOCWRITE, PCIOCGETBAR }; - cap_ioctl_t io_ioctls[] = { IODEV_PIO }; -#endif sc = NULL; error = 1; -#ifndef WITHOUT_CAPSICUM - cap_rights_init(&rights, CAP_IOCTL, CAP_READ, CAP_WRITE); -#endif - memflags = vm_get_memflags(ctx); if (!(memflags & VM_MEM_F_WIRED)) { warnx("passthru requires guest memory to be wired"); goto done; } - if (pcifd < 0) { - pcifd = open(_PATH_DEVPCI, O_RDWR, 0); - if (pcifd < 0) { - warn("failed to open %s", _PATH_DEVPCI); - goto done; - } - } - -#ifndef WITHOUT_CAPSICUM - if (cap_rights_limit(pcifd, &rights) == -1 && errno != ENOSYS) - errx(EX_OSERR, "Unable to apply rights for sandbox"); - if (cap_ioctls_limit(pcifd, pci_ioctls, nitems(pci_ioctls)) == -1 && errno != ENOSYS) - errx(EX_OSERR, "Unable to apply rights for sandbox"); -#endif - - if (iofd < 0) { - iofd = open(_PATH_DEVIO, O_RDWR, 0); - if (iofd < 0) { - warn("failed to open %s", _PATH_DEVIO); - goto done; - } - } - -#ifndef WITHOUT_CAPSICUM - if (cap_rights_limit(iofd, &rights) == -1 && errno != ENOSYS) - errx(EX_OSERR, "Unable to apply rights for sandbox"); - if (cap_ioctls_limit(iofd, io_ioctls, nitems(io_ioctls)) == -1 && errno != ENOSYS) - errx(EX_OSERR, "Unable to apply rights for sandbox"); -#endif - - if (memfd < 0) { - memfd = open(_PATH_MEM, O_RDWR, 0); - if (memfd < 0) { - warn("failed to open %s", _PATH_MEM); - goto done; - } - } - -#ifndef WITHOUT_CAPSICUM - cap_rights_clear(&rights, CAP_IOCTL); - cap_rights_set(&rights, CAP_MMAP_RW); - if (cap_rights_limit(memfd, &rights) == -1 && errno != ENOSYS) - errx(EX_OSERR, "Unable to apply rights for sandbox"); -#endif - - if (opts == NULL || - sscanf(opts, "%d/%d/%d", &bus, &slot, &func) != 3) { + if (opts == NULL || passthru_dev_open(opts, &pptfd) != 0) { warnx("invalid passthru options"); goto done; } - if (vm_assign_pptdev(ctx, bus, slot, func) != 0) { - warnx("PCI device at %d/%d/%d is not using the ppt(4) driver", - bus, slot, func); + if (vm_assign_pptdev(ctx, pptfd) != 0) { + warnx("PCI device at %d is not using the ppt driver", pptfd); goto done; } @@ -729,16 +696,21 @@ passthru_init(struct vmctx *ctx, struct pci_devinst *pi, char 
*opts) pi->pi_arg = sc; sc->psc_pi = pi; + sc->pptfd = pptfd; + + if ((error = vm_get_pptdev_limits(ctx, pptfd, &sc->msi_limit, + &sc->msix_limit)) != 0) + goto done; /* initialize config space */ - if ((error = cfginit(ctx, pi, bus, slot, func)) != 0) + if ((error = cfginit(ctx, sc)) != 0) goto done; - + error = 0; /* success */ done: if (error) { free(sc); - vm_unassign_pptdev(ctx, bus, slot, func); + vm_unassign_pptdev(ctx, pptfd); } return (error); } @@ -768,7 +740,7 @@ msicap_access(struct passthru_softc *sc, int coff) return (0); } -static int +static int msixcap_access(struct passthru_softc *sc, int coff) { if (sc->psc_msix.capoff == 0) @@ -780,7 +752,7 @@ msixcap_access(struct passthru_softc *sc, int coff) static int passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, - int coff, int bytes, uint32_t *rv) + int coff, int bytes, uint32_t *rv) { struct passthru_softc *sc; @@ -792,6 +764,13 @@ passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, if (bar_access(coff) || msicap_access(sc, coff)) return (-1); + /* + * MSI-X is also emulated since a limit on interrupts may be imposed by + * the OS, altering the perceived register state. + */ + if (msixcap_access(sc, coff)) + return (-1); + #ifdef LEGACY_SUPPORT /* * Emulate PCIR_CAP_PTR if this device does not support MSI capability @@ -804,14 +783,14 @@ passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, #endif /* Everything else just read from the device's config space */ - *rv = read_config(&sc->psc_sel, coff, bytes); + *rv = read_config(sc, coff, bytes); return (0); } static int passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, - int coff, int bytes, uint32_t val) + int coff, int bytes, uint32_t val) { int error, msix_table_entries, i; struct passthru_softc *sc; @@ -830,10 +809,8 @@ passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, if (msicap_access(sc, coff)) { msicap_cfgwrite(pi, sc->psc_msi.capoff, coff, bytes, val); - error = vm_setup_pptdev_msi(ctx, vcpu, sc->psc_sel.pc_bus, - sc->psc_sel.pc_dev, sc->psc_sel.pc_func, - pi->pi_msi.addr, pi->pi_msi.msg_data, - pi->pi_msi.maxmsgnum); + error = vm_setup_pptdev_msi(ctx, vcpu, sc->pptfd, + pi->pi_msi.addr, pi->pi_msi.msg_data, pi->pi_msi.maxmsgnum); if (error != 0) err(1, "vm_setup_pptdev_msi"); return (0); @@ -845,12 +822,11 @@ passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, msix_table_entries = pi->pi_msix.table_count; for (i = 0; i < msix_table_entries; i++) { error = vm_setup_pptdev_msix(ctx, vcpu, - sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, - sc->psc_sel.pc_func, i, + sc->pptfd, i, pi->pi_msix.table[i].addr, pi->pi_msix.table[i].msg_data, pi->pi_msix.table[i].vector_control); - + if (error) err(1, "vm_setup_pptdev_msix"); } @@ -870,57 +846,54 @@ passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, } #endif - write_config(&sc->psc_sel, coff, bytes, val); + write_config(sc, coff, bytes, val); return (0); } static void passthru_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, - uint64_t offset, int size, uint64_t value) + uint64_t offset, int size, uint64_t value) { - struct passthru_softc *sc; - struct iodev_pio_req pio; - - sc = pi->pi_arg; + struct passthru_softc *sc = pi->pi_arg; if (baridx == pci_msix_table_bar(pi)) { - msix_table_write(ctx, vcpu, sc, offset, size, value); + passthru_msix_table_write(ctx, vcpu, sc, offset, size, value); } else { + struct ppt_bar_io pbi; + assert(pi->pi_bar[baridx].type == PCIBAR_IO); - 
bzero(&pio, sizeof(struct iodev_pio_req)); - pio.access = IODEV_PIO_WRITE; - pio.port = sc->psc_bar[baridx].addr + offset; - pio.width = size; - pio.val = value; - - (void)ioctl(iofd, IODEV_PIO, &pio); + + pbi.pbi_bar = baridx; + pbi.pbi_width = size; + pbi.pbi_off = offset; + pbi.pbi_data = value; + (void) ioctl(sc->pptfd, PPT_BAR_WRITE, &pbi); } } static uint64_t passthru_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, - uint64_t offset, int size) + uint64_t offset, int size) { - struct passthru_softc *sc; - struct iodev_pio_req pio; + struct passthru_softc *sc = pi->pi_arg; uint64_t val; - sc = pi->pi_arg; - if (baridx == pci_msix_table_bar(pi)) { - val = msix_table_read(sc, offset, size); + val = passthru_msix_table_read(sc, offset, size); } else { - assert(pi->pi_bar[baridx].type == PCIBAR_IO); - bzero(&pio, sizeof(struct iodev_pio_req)); - pio.access = IODEV_PIO_READ; - pio.port = sc->psc_bar[baridx].addr + offset; - pio.width = size; - pio.val = 0; + struct ppt_bar_io pbi; - (void)ioctl(iofd, IODEV_PIO, &pio); + assert(pi->pi_bar[baridx].type == PCIBAR_IO); - val = pio.val; + pbi.pbi_bar = baridx; + pbi.pbi_width = size; + pbi.pbi_off = offset; + if (ioctl(sc->pptfd, PPT_BAR_READ, &pbi) == 0) { + val = pbi.pbi_data; + } else { + val = 0; + } } return (val); diff --git a/usr/src/cmd/bhyvectl/bhyvectl.c b/usr/src/cmd/bhyvectl/bhyvectl.c index b8bdf524a9..bbe36917fd 100644 --- a/usr/src/cmd/bhyvectl/bhyvectl.c +++ b/usr/src/cmd/bhyvectl/bhyvectl.c @@ -183,7 +183,9 @@ usage(bool cpu_intel) " [--get-ldtr]\n" " [--set-x2apic-state=]\n" " [--get-x2apic-state]\n" +#ifdef __FreeBSD__ " [--unassign-pptdev=]\n" +#endif " [--set-mem=]\n" " [--get-lowmem]\n" " [--get-highmem]\n" @@ -302,7 +304,9 @@ static int set_cs, set_ds, set_es, set_fs, set_gs, set_ss, set_tr, set_ldtr; static int get_cs, get_ds, get_es, get_fs, get_gs, get_ss, get_tr, get_ldtr; static int set_x2apic_state, get_x2apic_state; enum x2apic_state x2apic_state; +#ifdef __FreeBSD__ static int unassign_pptdev, bus, slot, func; +#endif static int run; static int get_cpu_topology; #ifndef __FreeBSD__ @@ -1875,11 +1879,13 @@ main(int argc, char *argv[]) case CAPNAME: capname = optarg; break; +#ifdef __FreeBSD__ case UNASSIGN_PPTDEV: unassign_pptdev = 1; if (sscanf(optarg, "%d/%d/%d", &bus, &slot, &func) != 3) usage(cpu_intel); break; +#endif case ASSERT_LAPIC_LVT: assert_lapic_lvt = atoi(optarg); break; @@ -2040,8 +2046,10 @@ main(int argc, char *argv[]) if (!error && set_x2apic_state) error = vm_set_x2apic_state(ctx, vcpu, x2apic_state); +#ifdef __FreeBSD__ if (!error && unassign_pptdev) error = vm_unassign_pptdev(ctx, bus, slot, func); +#endif /* __FreeBSD__ */ if (!error && set_exception_bitmap) { if (cpu_intel) diff --git a/usr/src/cmd/devfsadm/i386/misc_link_i386.c b/usr/src/cmd/devfsadm/i386/misc_link_i386.c index 0f8e64551d..eb5f789c37 100644 --- a/usr/src/cmd/devfsadm/i386/misc_link_i386.c +++ b/usr/src/cmd/devfsadm/i386/misc_link_i386.c @@ -46,6 +46,7 @@ static int kdmouse(di_minor_t minor, di_node_t node); static int ipmi(di_minor_t minor, di_node_t node); static int mc_node(di_minor_t minor, di_node_t node); static int vmmctl(di_minor_t minor, di_node_t node); +static int ppt(di_minor_t minor, di_node_t node); static devfsadm_create_t misc_cbt[] = { { "vt00", "ddi_display", NULL, @@ -90,6 +91,9 @@ static devfsadm_create_t misc_cbt[] = { }, { "pseudo", "ddi_pseudo", "vmm", TYPE_EXACT | DRV_EXACT, ILEVEL_0, vmmctl, + }, + { "pseudo", "ddi_pseudo", "ppt", + TYPE_EXACT | DRV_EXACT, ILEVEL_0, ppt, } }; @@ 
-122,6 +126,9 @@ static devfsadm_remove_t misc_remove_cbt[] = { }, { "pseudo", "^vmmctl$", RM_ALWAYS | RM_PRE | RM_HOT, ILEVEL_0, devfsadm_rm_all + }, + { "pseudo", "^ppt$", RM_ALWAYS | RM_PRE | RM_HOT, + ILEVEL_0, devfsadm_rm_all } }; @@ -369,3 +376,15 @@ vmmctl(di_minor_t minor, di_node_t node) (void) devfsadm_mklink("vmmctl", node, minor, 0); return (DEVFSADM_CONTINUE); } + +static int +ppt(di_minor_t minor, di_node_t node) +{ + char linkpath[PATH_MAX]; + + (void) snprintf(linkpath, sizeof (linkpath), "ppt%d", + di_instance(node)); + + (void) devfsadm_mklink(linkpath, node, minor, 0); + return (DEVFSADM_CONTINUE); +} diff --git a/usr/src/cmd/pptadm/Makefile b/usr/src/cmd/pptadm/Makefile new file mode 100644 index 0000000000..3be558a7a0 --- /dev/null +++ b/usr/src/cmd/pptadm/Makefile @@ -0,0 +1,43 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# Copyright 2018 Joyent, Inc. +# + +PROG = pptadm +OBJS = pptadm.o +SRCS = $(OBJS:%.o=%.c) + +include ../Makefile.cmd +include ../Makefile.ctf + +LDLIBS += -lofmt -lppt -lnvpair + +CSTD = $(CSTD_GNU99) +C99LMODE = -Xc99=%all + +CLEANFILES += $(OBJS) + +.KEEP_STATE: + +all: $(OBJS) $(PROG) + +install: all $(ROOTUSRSBINPROG) + +clean: + -$(RM) $(CLEANFILES) + +lint: lint_SRCS + +%.o: ../%.c + $(COMPILE.c) $< + $(POST_PROCESS_O) + +include ../Makefile.targ diff --git a/usr/src/cmd/pptadm/pptadm.c b/usr/src/cmd/pptadm/pptadm.c new file mode 100644 index 0000000000..c6b9094408 --- /dev/null +++ b/usr/src/cmd/pptadm/pptadm.c @@ -0,0 +1,205 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2018 Joyent, Inc. + */ + +#include +#include +#include +#include +#include +#include + +#include + +typedef enum field { + PPT_DEV, + PPT_VENDOR, + PPT_DEVICE, + PPT_SUBVENDOR, + PPT_SUBDEVICE, + PPT_REV, + PPT_PATH, + PPT_LABEL +} field_t; + +const char *valname[] = { + "dev", + "vendor-id", + "device-id", + "subsystem-vendor-id", + "subsystem-id", + "revision-id", + "path", + "label" +}; + +static ofmt_cb_t print_field; + +static ofmt_field_t fields[] = { +/* name, field width, index, callback */ +{ "DEV", sizeof ("/dev/pptXX"), PPT_DEV, print_field }, +{ "VENDOR", sizeof ("VENDOR"), PPT_VENDOR, print_field }, +{ "DEVICE", sizeof ("DEVICE"), PPT_DEVICE, print_field }, +{ "SUBVENDOR", sizeof ("SUBVENDOR"), PPT_SUBVENDOR, print_field }, +{ "SUBDEVICE", sizeof ("SUBDEVICE"), PPT_SUBDEVICE, print_field }, +{ "REV", sizeof ("REV"), PPT_REV, print_field }, +{ "PATH", 50, PPT_PATH, print_field }, +{ "LABEL", 60, PPT_LABEL, print_field }, +{ NULL, 0, 0, NULL }, +}; + +static void +usage(const char *errmsg) +{ + if (errmsg != NULL) + (void) fprintf(stderr, "pptadm: %s\n", errmsg); + (void) fprintf(errmsg != NULL ? stderr : stdout, + "Usage:\n" + "pptadm list [ -j ]\n" + "pptadm list [-ap] [-o fields]\n"); + exit(errmsg != NULL ? 
EXIT_FAILURE : EXIT_SUCCESS); +} + +/* PRINTFLIKE1 */ +static void +die(const char *fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + verrx(EXIT_FAILURE, fmt, ap); + va_end(ap); +} + +static boolean_t +print_field(ofmt_arg_t *arg, char *buf, uint_t bufsize) +{ + nvlist_t *nvl = arg->ofmt_cbarg; + nvpair_t *nvp = NULL; + + while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { + const char *name = nvpair_name(nvp); + char *val = NULL; + + (void) nvpair_value_string(nvp, &val); + + if (strcmp(name, valname[arg->ofmt_id]) != 0) + continue; + + (void) snprintf(buf, bufsize, "%s", val); + return (B_TRUE); + } + + (void) snprintf(buf, bufsize, "--"); + return (B_TRUE); +} + +static int +list(int argc, char *argv[]) +{ + const char *fields_str = NULL; + boolean_t parsable = B_FALSE; + boolean_t json = B_FALSE; + boolean_t all = B_FALSE; + uint_t ofmtflags = 0; + ofmt_status_t oferr; + ofmt_handle_t ofmt; + int opt; + + while ((opt = getopt(argc, argv, "ahjo:p")) != -1) { + switch (opt) { + case 'a': + all = B_TRUE; + break; + case 'h': + usage(NULL); + break; + case 'j': + json = B_TRUE; + break; + case 'o': + fields_str = optarg; + break; + case 'p': + ofmtflags |= OFMT_PARSABLE; + parsable = B_TRUE; + break; + default: + usage("unrecognized option"); + break; + } + } + + if (optind == (argc - 1)) + usage("unused arguments"); + + if (json && (parsable || fields_str != NULL)) + usage("-j option cannot be used with -p or -o options"); + + if (fields_str == NULL) { + if (parsable) + usage("-o must be provided when using -p option"); + fields_str = "dev,vendor,device,path"; + } + + oferr = ofmt_open(fields_str, fields, ofmtflags, 0, &ofmt); + + ofmt_check(oferr, parsable, ofmt, die, warn); + + nvlist_t *nvl = all ? ppt_list() : ppt_list_assigned(); + nvpair_t *nvp = NULL; + + if (json) { + if (printf("{\n\t\"devices\": [\n") < 0) + err(EXIT_FAILURE, "failed to write JSON"); + } + + while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { + nvlist_t *props; + + (void) nvpair_value_nvlist(nvp, &props); + + if (json) { + if (printf("\t\t") < 0) + err(EXIT_FAILURE, "failed to write JSON"); + if (nvlist_print_json(stdout, props) < 0) + err(EXIT_FAILURE, "failed to write JSON"); + if (nvlist_next_nvpair(nvl, nvp) != NULL) + (void) printf(",\n"); + } else { + ofmt_print(ofmt, props); + } + } + + if (json) { + if (printf("\n\t]\n}\n") < 0) + err(EXIT_FAILURE, "failed to write JSON"); + } + + nvlist_free(nvl); + ofmt_close(ofmt); + return (EXIT_SUCCESS); +} + +int +main(int argc, char *argv[]) +{ + if (argc == 1) + return (list(argc - 1, argv)); + + if (strcmp(argv[1], "list") == 0) { + return (list(argc - 1, &argv[1])); + } else { + usage("unknown sub-command"); + } + + return (EXIT_SUCCESS); +} diff --git a/usr/src/compat/freebsd/amd64/machine/cpu.h b/usr/src/compat/freebsd/amd64/machine/cpu.h new file mode 100644 index 0000000000..40253af108 --- /dev/null +++ b/usr/src/compat/freebsd/amd64/machine/cpu.h @@ -0,0 +1,23 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. 
+ */ + +#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_CPU_H +#define _COMPAT_FREEBSD_AMD64_MACHINE_CPU_H + +#include + +#define cpu_spinwait() SMT_PAUSE() + +#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_CPU_H */ diff --git a/usr/src/compat/freebsd/contrib/dev/acpica/include/acpi.h b/usr/src/compat/freebsd/contrib/dev/acpica/include/acpi.h new file mode 100644 index 0000000000..2668f98ab3 --- /dev/null +++ b/usr/src/compat/freebsd/contrib/dev/acpica/include/acpi.h @@ -0,0 +1,21 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _COMPAT_FREEBSD_CONTRIB_DEV_ACPICA_INCLUDE_ACPI_H +#define _COMPAT_FREEBSD_CONTRIB_DEV_ACPICA_INCLUDE_ACPI_H + +#include + +#endif /* _COMPAT_FREEBSD_CONTRIB_DEV_ACPICA_INCLUDE_ACPI_H */ diff --git a/usr/src/compat/freebsd/dev/pci/pcivar.h b/usr/src/compat/freebsd/dev/pci/pcivar.h new file mode 100644 index 0000000000..064d983117 --- /dev/null +++ b/usr/src/compat/freebsd/dev/pci/pcivar.h @@ -0,0 +1,38 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#ifndef _COMPAT_FREEBSD_DEV_PCI_PCIVAR_H +#define _COMPAT_FREEBSD_DEV_PCI_PCIVAR_H + +#include +#include +#include +#include +#include +#include + +static inline pcie_req_id_t +pci_get_bdf(device_t dev) +{ + pcie_req_id_t bdf; + + VERIFY(pcie_get_bdf_from_dip(dev, &bdf) == DDI_SUCCESS); + + return (bdf); +} + +#define pci_get_rid(dev) (pci_get_bdf(dev)) + +#endif /* _COMPAT_FREEBSD_DEV_PCI_PCIVAR_H */ diff --git a/usr/src/compat/freebsd/sys/bus.h b/usr/src/compat/freebsd/sys/bus.h new file mode 100644 index 0000000000..e3b5e0e69d --- /dev/null +++ b/usr/src/compat/freebsd/sys/bus.h @@ -0,0 +1,21 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. 
+ */ + +#ifndef _COMPAT_FREEBSD_SYS_BUS_H +#define _COMPAT_FREEBSD_SYS_BUS_H + +#define device_get_softc(dev) ddi_get_driver_private(dev) + +#endif /* _COMPAT_FREEBSD_SYS_BUS_H */ diff --git a/usr/src/lib/Makefile b/usr/src/lib/Makefile index b64d4c2bc1..c40721fd55 100644 --- a/usr/src/lib/Makefile +++ b/usr/src/lib/Makefile @@ -186,6 +186,7 @@ SUBDIRS += \ libpkg \ libpool \ libpp \ + libppt \ libproc \ libproject \ libpthread \ @@ -446,6 +447,7 @@ HDRSUBDIRS= \ libpicltree \ libpool \ libpp \ + libppt \ libproc \ libraidcfg \ librcm \ @@ -657,6 +659,7 @@ libpctx: libproc libpkg: libscf libadm libpool: libscf libexacct libpp: libast +libppt: libpcidb libdevinfo libcmdutils libproc: ../cmd/sgs/librtld_db ../cmd/sgs/libelf libctf $(INTEL_BLD)libproc: libsaveargs libproject: libpool libproc libsecdb diff --git a/usr/src/lib/libppt/Makefile b/usr/src/lib/libppt/Makefile new file mode 100644 index 0000000000..21c26d447e --- /dev/null +++ b/usr/src/lib/libppt/Makefile @@ -0,0 +1,44 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2018 Joyent, Inc. +# + +include $(SRC)/lib/Makefile.lib + +SUBDIRS = $(MACH) $(BUILD64) $(MACH64) + +HDRS = libppt.h +HDRDIR = common + +all := TARGET= all +clean := TARGET= clean +clobber := TARGET= clobber +install := TARGET= install +lint := TARGET= lint + +.KEEP_STATE: + +all clean clobber install lint: $(SUBDIRS) + +install_h: $(ROOTHDRS) + +all install: install_h + +check: $(CHECKHDRS) + +$(SUBDIRS): FRC + @cd $@; pwd; $(MAKE) $(TARGET) + +FRC: + +include $(SRC)/lib/Makefile.targ diff --git a/usr/src/lib/libppt/Makefile.com b/usr/src/lib/libppt/Makefile.com new file mode 100644 index 0000000000..7b2ff4885f --- /dev/null +++ b/usr/src/lib/libppt/Makefile.com @@ -0,0 +1,46 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2018 Joyent, Inc. +# + +LIBRARY = libppt.a +VERS = .1 + +OBJECTS = libppt.o + +include $(SRC)/lib/Makefile.lib + +SRCDIR = ../common + +LIBS = $(DYNLIB) $(LINTLIB) +SRCS = $(SRCDIR)/libppt.c + +CSTD= $(CSTD_GNU99) +C99LMODE= -Xc99=%all + +# +# lint doesn't like %4s in sscanf(). +# +LINTFLAGS += -erroff=E_BAD_FORMAT_ARG_TYPE2 +LINTFLAGS64 += -erroff=E_BAD_FORMAT_ARG_TYPE2 + +$(LINTLIB) := SRCS = $(SRCDIR)/$(LINTSRC) +LDLIBS += -lpcidb -ldevinfo -lcmdutils -lnvpair -lc + +.KEEP_STATE: + +all: $(LIBS) + +lint: lintcheck + +include $(SRC)/lib/Makefile.targ diff --git a/usr/src/lib/libppt/amd64/Makefile b/usr/src/lib/libppt/amd64/Makefile new file mode 100644 index 0000000000..5a304d7fe7 --- /dev/null +++ b/usr/src/lib/libppt/amd64/Makefile @@ -0,0 +1,19 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. 
+# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2018 Joyent, Inc. +# + +include ../Makefile.com +include $(SRC)/lib/Makefile.lib.64 + +install: all $(ROOTLIBS64) $(ROOTLINKS64) diff --git a/usr/src/lib/libppt/common/libppt.c b/usr/src/lib/libppt/common/libppt.c new file mode 100644 index 0000000000..7e8385da06 --- /dev/null +++ b/usr/src/lib/libppt/common/libppt.c @@ -0,0 +1,506 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + * + * Convenience routines for identifying current or available devices that are + * suitable for PCI passthrough to a bhyve guest. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +typedef struct node_data { + pcidb_hdl_t *nd_db; + list_t nd_matches; + nvlist_t *nd_nvl; + int nd_err; +} node_data_t; + +typedef struct ppt_match { + list_node_t pm_list; + char pm_path[MAXPATHLEN]; + char pm_vendor[5]; + char pm_device[5]; +} ppt_match_t; + +static boolean_t +is_pci(di_node_t di_node) +{ + char *svals; + + if (di_prop_lookup_strings(DDI_DEV_T_ANY, di_parent_node(di_node), + "device_type", &svals) != 1) + return (B_FALSE); + + return (strcmp(svals, "pci") == 0 || strcmp(svals, "pciex") == 0); +} + +static int +populate_int_prop(di_node_t di_node, nvlist_t *nvl, const char *name, int *ival) +{ + char val[20]; + int *ivals; + int err; + + if (di_prop_lookup_ints(DDI_DEV_T_ANY, di_node, name, &ivals) != 1) + return (errno); + + (void) snprintf(val, sizeof (val), "%x", ivals[0]); + + err = nvlist_add_string(nvl, name, val); + + if (err == 0 && ival != NULL) + *ival = ivals[0]; + + return (err); +} + +static int +dev_getlabel(pcidb_hdl_t *db, int vid, int did, char *buf, size_t buflen) +{ + pcidb_vendor_t *vend = NULL; + pcidb_device_t *dev = NULL; + + if ((vend = pcidb_lookup_vendor(db, vid)) == NULL) + return (ENOENT); + + if ((dev = pcidb_lookup_device_by_vendor(vend, did)) == NULL) + return (ENOENT); + + (void) snprintf(buf, buflen, "%s %s", pcidb_vendor_name(vend), + pcidb_device_name(dev)); + + return (0); +} + +static nvlist_t * +dev_getinfo(di_node_t di_node, pcidb_hdl_t *db, + const char *dev, const char *path) +{ + char label[MAXPATHLEN]; + nvlist_t *nvl = NULL; + int vid, did; + int err; + + if ((err = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0)) != 0) + goto out; + + if (dev != NULL && (err = nvlist_add_string(nvl, "dev", dev)) != 0) + goto out; + if ((err = nvlist_add_string(nvl, "path", path)) != 0) + goto out; + if ((err = populate_int_prop(di_node, nvl, "vendor-id", &vid)) != 0) + goto out; + if ((err = populate_int_prop(di_node, nvl, "device-id", &did)) != 0) + goto out; + if ((err = populate_int_prop(di_node, nvl, + "subsystem-vendor-id", NULL)) != 0) + goto out; + if ((err = populate_int_prop(di_node, nvl, "subsystem-id", NULL)) != 0) + goto out; + if ((err = populate_int_prop(di_node, nvl, "revision-id", NULL)) != 0) + goto out; + + err = dev_getlabel(db, vid, did, label, sizeof (label)); + + if (err == 0) { + err = 
nvlist_add_string(nvl, "label", label); + } else if (err == ENOENT) { + err = 0; + } + +out: + if (err) { + nvlist_free(nvl); + errno = err; + return (NULL); + } + + return (nvl); +} + +/* + * /devices/pci0@0/....@0,1:ppt -> /pci0@0/...@0,1 + */ +static const char * +fs_to_phys_path(char *fspath) +{ + const char prefix[] = "/devices"; + char *c; + + if ((c = strrchr(fspath, ':')) != NULL && strcmp(c, ":ppt") == 0) + *c = '\0'; + + c = fspath; + + if (strncmp(c, prefix, sizeof (prefix) - 1) == 0) + c += sizeof (prefix) - 1; + + return (c); +} + +/* + * Return an nvlist representing the mappings of /dev/ppt* devices to physical + * devices. Of the form: + * + * /pci@0,0/... { + * dev: "/dev/ppt0" + * path: "/pci@0,0/..." + * vendor-id: "8086" + * device-id: "1528" + * subsystem-vendor-id: "8086" + * subsystem-id: "1528" + * revision-id: "1" + * label: "Intel Corporation ..." + * }, + * /pci@0,0/... + * + * The nvlist should be freed by the caller. + */ +nvlist_t * +ppt_list_assigned(void) +{ + di_node_t di_root = DI_NODE_NIL; + pcidb_hdl_t *db = NULL; + nvlist_t *nvl = NULL; + glob_t gl; + int err; + + bzero(&gl, sizeof (gl)); + + if ((di_root = di_init("/", DINFOCACHE)) == DI_NODE_NIL) + return (NULL); + + if ((db = pcidb_open(PCIDB_VERSION)) == NULL) { + err = errno; + goto out; + } + + if ((err = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0)) != 0) + goto out; + + if ((err = glob("/dev/ppt*", GLOB_KEEPSTAT | GLOB_ERR, + NULL, &gl)) != 0) { + err = (err == GLOB_NOMATCH) ? 0 : errno; + goto out; + } + + for (size_t i = 0; i < gl.gl_pathc; i++) { + char fspath[MAXPATHLEN]; + nvlist_t *info_nvl; + di_node_t di_node; + const char *path; + + if (!S_ISLNK(gl.gl_statv[i]->st_mode)) + continue; + + if (realpath(gl.gl_pathv[i], fspath) == NULL) { + err = errno; + goto out; + } + + path = fs_to_phys_path(fspath); + + /* + * path argument is treated as const. + */ + if ((di_node = di_lookup_node(di_root, (char *)path)) == NULL) { + err = errno; + goto out; + } + + if (!is_pci(di_node)) + continue; + + info_nvl = dev_getinfo(di_node, db, gl.gl_pathv[i], path); + + if (info_nvl == NULL) { + err = errno; + goto out; + } + + err = nvlist_add_nvlist(nvl, path, info_nvl); + nvlist_free(info_nvl); + + if (err) + goto out; + } + +out: + if (di_root != DI_NODE_NIL) + di_fini(di_root); + + pcidb_close(db); + globfree(&gl); + + if (err) { + nvlist_free(nvl); + errno = err; + return (NULL); + } + + return (nvl); +} + +/* + * Read in our list of potential PPT devices. A boot-module provided file + * explicitly over-rides anything delivered. 
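+ *
+ * As a sketch of the accepted syntax (the IDs below are hypothetical),
+ * a line may name devices either by PCI ID or by physical path:
+ *
+ *	pciex8086,1528
+ *	pci8086,1528
+ *	pciex8086
+ *	/pci@0,0/pci8086,244e@1c
+ *
+ * Lines in any other form are silently ignored, as the parser below shows.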
+ */ +static int +get_matches(list_t *listp) +{ + int err = 0; + FILE *fp; + + list_create(listp, sizeof (ppt_match_t), + offsetof(ppt_match_t, pm_list)); + + if ((fp = fopen("/system/boot/etc/ppt_matches", "r")) == NULL) { + if (errno != ENOENT) + return (errno); + + if ((fp = fopen("/etc/ppt_matches", "r")) == NULL) { + if (errno == ENOENT) + return (0); + return (errno); + } + } + + for (;;) { + char *line = NULL; + ppt_match_t *pm; + size_t cap = 0; + ssize_t read; + + if ((read = getline(&line, &cap, fp)) <= 0) { + free(line); + break; + } + + if (line[read - 1] == '\n') + line[read - 1] = '\0'; + + if ((pm = malloc(sizeof (*pm))) == NULL) { + err = errno; + free(line); + goto out; + } + + bzero(pm, sizeof (*pm)); + + if (sscanf(line, "pciex%4s,%4s", &pm->pm_vendor, + &pm->pm_device) == 2 || + sscanf(line, "pci%4s,%4s", &pm->pm_vendor, + &pm->pm_device) == 2 || + sscanf(line, "pciex%4s", &pm->pm_vendor) == 1 || + sscanf(line, "pci%4s", &pm->pm_vendor) == 1) { + list_insert_tail(listp, pm); + } else if (line[0] == '/') { + (void) strlcpy(pm->pm_path, line, sizeof (pm->pm_path)); + list_insert_tail(listp, pm); + } else { + /* + * Ignore any line we don't understand. + */ + free(pm); + } + + free(line); + } + +out: + (void) fclose(fp); + return (err); +} + +static boolean_t +match_ppt(list_t *matches, nvlist_t *nvl) +{ + char *vendor; + char *device; + char *path; + + if (nvlist_lookup_string(nvl, "path", &path) != 0 || + nvlist_lookup_string(nvl, "vendor-id", &vendor) != 0 || + nvlist_lookup_string(nvl, "device-id", &device) != 0) + return (B_FALSE); + + for (ppt_match_t *pm = list_head(matches); pm != NULL; + pm = list_next(matches, pm)) { + if (pm->pm_path[0] != '\0' && strcmp(pm->pm_path, path) == 0) + return (B_TRUE); + + if (pm->pm_vendor[0] != '\0' && + strcmp(pm->pm_vendor, vendor) == 0) { + if (pm->pm_device[0] == '\0') + return (B_TRUE); + if (strcmp(pm->pm_device, device) == 0) + return (B_TRUE); + } + } + + return (B_FALSE); +} + +static int +inspect_node(di_node_t di_node, void *arg) +{ + node_data_t *data = arg; + nvlist_t *info_nvl = NULL; + char *devname = NULL; + const char *driver; + char *path = NULL; + + if (!is_pci(di_node)) + return (DI_WALK_CONTINUE); + + driver = di_driver_name(di_node); + + if (driver != NULL && strcmp(driver, "ppt") == 0) { + if (asprintf(&devname, "/dev/ppt%d", + di_instance(di_node)) < 0) { + data->nd_err = errno; + goto out; + } + } + + if ((path = di_devfs_path(di_node)) == NULL) { + data->nd_err = ENOENT; + goto out; + } + + info_nvl = dev_getinfo(di_node, data->nd_db, devname, path); + + if (info_nvl == NULL) + goto out; + + if (devname == NULL && !match_ppt(&data->nd_matches, info_nvl)) + goto out; + + data->nd_err = nvlist_add_nvlist(data->nd_nvl, path, info_nvl); + +out: + free(path); + free(devname); + nvlist_free(info_nvl); + return (data->nd_err ? DI_WALK_TERMINATE : DI_WALK_CONTINUE); +} + +/* + * Like ppt_list_assigned() output, but includes all devices that could be used + * for passthrough, whether assigned or not. 
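+ *
+ * A device qualifies if it is already bound to the ppt driver, or if it
+ * matches an entry loaded by get_matches() above; see inspect_node().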
+ */ +nvlist_t * +ppt_list(void) +{ + node_data_t nd = { NULL, }; + di_node_t di_root; + int err; + + if ((di_root = di_init("/", DINFOCACHE)) == DI_NODE_NIL) + return (NULL); + + if ((err = get_matches(&nd.nd_matches)) != 0) + goto out; + + if ((nd.nd_db = pcidb_open(PCIDB_VERSION)) == NULL) { + err = errno; + goto out; + } + + if ((err = nvlist_alloc(&nd.nd_nvl, NV_UNIQUE_NAME, 0)) != 0) + goto out; + + if ((err = di_walk_node(di_root, DI_WALK_CLDFIRST, + &nd, inspect_node)) != 0) + goto out; + + err = nd.nd_err; + +out: + pcidb_close(nd.nd_db); + + for (ppt_match_t *pm = list_head(&nd.nd_matches); pm != NULL; ) { + ppt_match_t *next = list_next(&nd.nd_matches, pm); + free(pm); + pm = next; + } + + if (di_root != DI_NODE_NIL) + di_fini(di_root); + + if (err) { + nvlist_free(nd.nd_nvl); + errno = err; + return (NULL); + } + + return (nd.nd_nvl); +} + +/* + * Given a physical path such as "/devices/pci0@0...", return the "/dev/pptX" + * that is bound to it, if any. The "/devices/" prefix is optional. The + * physical path may have the ":ppt" minor name suffix. + * + * Returns ENOENT if no such PPT device exists. + */ +int +ppt_devpath_to_dev(const char *inpath, char *buf, size_t buflen) +{ + char fspath[MAXPATHLEN] = ""; + nvpair_t *nvp = NULL; + const char *devpath; + int err = ENOENT; + nvlist_t *nvl; + + if (strlcat(fspath, inpath, sizeof (fspath)) >= sizeof (fspath)) + return (ENAMETOOLONG); + + devpath = fs_to_phys_path(fspath); + + if ((nvl = ppt_list_assigned()) == NULL) + return (errno); + + while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { + const char *name = nvpair_name(nvp); + char *ppt = NULL; + nvlist_t *props; + + (void) nvpair_value_nvlist(nvp, &props); + + if (strcmp(name, devpath) == 0) { + (void) nvlist_lookup_string(props, "dev", &ppt); + + err = 0; + + if (strlcpy(buf, ppt, buflen) >= buflen) + err = ENAMETOOLONG; + break; + } + } + + nvlist_free(nvl); + return (err); +} diff --git a/usr/src/lib/libppt/common/libppt.h b/usr/src/lib/libppt/common/libppt.h new file mode 100644 index 0000000000..efbf2c7b8b --- /dev/null +++ b/usr/src/lib/libppt/common/libppt.h @@ -0,0 +1,36 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * + * Copyright 2018 Joyent, Inc. + */ + +#ifndef _LIBPPT_H +#define _LIBPPT_H + +#include + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +extern int ppt_devpath_to_dev(const char *, char *, size_t); + +extern nvlist_t *ppt_list_assigned(void); + +extern nvlist_t *ppt_list(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _LIBPPT_H */ diff --git a/usr/src/lib/libppt/common/llib-lppt b/usr/src/lib/libppt/common/llib-lppt new file mode 100644 index 0000000000..dadd992a31 --- /dev/null +++ b/usr/src/lib/libppt/common/llib-lppt @@ -0,0 +1,19 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. 
+ */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +/* LINTLIBRARY */ +/* PROTOLIB1 */ + +#include diff --git a/usr/src/lib/libppt/common/mapfile-vers b/usr/src/lib/libppt/common/mapfile-vers new file mode 100644 index 0000000000..d9d882874b --- /dev/null +++ b/usr/src/lib/libppt/common/mapfile-vers @@ -0,0 +1,40 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2018 Joyent, Inc. +# + +# +# MAPFILE HEADER START +# +# WARNING: STOP NOW. DO NOT MODIFY THIS FILE. +# Object versioning must comply with the rules detailed in +# +# usr/src/lib/README.mapfiles +# +# You should not be making modifications here until you've read the most current +# copy of that file. If you need help, contact a gatekeeper for guidance. +# +# MAPFILE HEADER END +# + +$mapfile_version 2 + +SYMBOL_VERSION ILLUMOSprivate { + global: + ppt_devpath_to_dev; + ppt_list_assigned; + ppt_list; + + local: + *; +}; diff --git a/usr/src/lib/libppt/i386/Makefile b/usr/src/lib/libppt/i386/Makefile new file mode 100644 index 0000000000..3f11e556d4 --- /dev/null +++ b/usr/src/lib/libppt/i386/Makefile @@ -0,0 +1,18 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2018 Joyent, Inc. +# + +include ../Makefile.com + +install: all $(ROOTLIBS) $(ROOTLINKS) $(ROOTLINT) diff --git a/usr/src/lib/libppt/sparc/Makefile b/usr/src/lib/libppt/sparc/Makefile new file mode 100644 index 0000000000..3f11e556d4 --- /dev/null +++ b/usr/src/lib/libppt/sparc/Makefile @@ -0,0 +1,18 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2018 Joyent, Inc. +# + +include ../Makefile.com + +install: all $(ROOTLIBS) $(ROOTLINKS) $(ROOTLINT) diff --git a/usr/src/lib/libppt/sparcv9/Makefile b/usr/src/lib/libppt/sparcv9/Makefile new file mode 100644 index 0000000000..5a304d7fe7 --- /dev/null +++ b/usr/src/lib/libppt/sparcv9/Makefile @@ -0,0 +1,19 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2018 Joyent, Inc. 
+# + +include ../Makefile.com +include $(SRC)/lib/Makefile.lib.64 + +install: all $(ROOTLIBS64) $(ROOTLINKS64) diff --git a/usr/src/lib/libvmmapi/common/mapfile-vers b/usr/src/lib/libvmmapi/common/mapfile-vers index a64231ad1c..397ebd7d59 100644 --- a/usr/src/lib/libvmmapi/common/mapfile-vers +++ b/usr/src/lib/libvmmapi/common/mapfile-vers @@ -61,6 +61,7 @@ SYMBOL_VERSION ILLUMOSprivate { vm_get_lowmem_size; vm_get_memflags; vm_get_memseg; + vm_get_pptdev_limits; vm_get_register; vm_get_register_set; vm_get_seg_desc; diff --git a/usr/src/lib/libvmmapi/common/vmmapi.c b/usr/src/lib/libvmmapi/common/vmmapi.c index 0b9b871081..ceac495746 100644 --- a/usr/src/lib/libvmmapi/common/vmmapi.c +++ b/usr/src/lib/libvmmapi/common/vmmapi.c @@ -995,6 +995,7 @@ vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int val) return (ioctl(ctx->fd, VM_SET_CAPABILITY, &vmcap)); } +#ifdef __FreeBSD__ int vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func) { @@ -1056,7 +1057,7 @@ vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func, return (ioctl(ctx->fd, VM_PPTDEV_MSI, &pptmsi)); } -int +int vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func, int idx, uint64_t addr, uint64_t msg, uint32_t vector_control) { @@ -1075,6 +1076,103 @@ vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func, return ioctl(ctx->fd, VM_PPTDEV_MSIX, &pptmsix); } +int +vm_get_pptdev_limits(struct vmctx *ctx, int bus, int slot, int func, + int *msi_limit, int *msix_limit) +{ + struct vm_pptdev_limits pptlimits; + int error; + + bzero(&pptlimits, sizeof (pptlimits)); + pptlimits.bus = bus; + pptlimits.slot = slot; + pptlimits.func = func; + + error = ioctl(ctx->fd, VM_GET_PPTDEV_LIMITS, &pptlimits); + + *msi_limit = pptlimits.msi_limit; + *msix_limit = pptlimits.msix_limit; + + return (error); +} +#else /* __FreeBSD__ */ +int +vm_assign_pptdev(struct vmctx *ctx, int pptfd) +{ + struct vm_pptdev pptdev; + + pptdev.pptfd = pptfd; + return (ioctl(ctx->fd, VM_BIND_PPTDEV, &pptdev)); +} + +int +vm_unassign_pptdev(struct vmctx *ctx, int pptfd) +{ + struct vm_pptdev pptdev; + + pptdev.pptfd = pptfd; + return (ioctl(ctx->fd, VM_UNBIND_PPTDEV, &pptdev)); +} + +int +vm_map_pptdev_mmio(struct vmctx *ctx, int pptfd, vm_paddr_t gpa, size_t len, + vm_paddr_t hpa) +{ + struct vm_pptdev_mmio pptmmio; + + pptmmio.pptfd = pptfd; + pptmmio.gpa = gpa; + pptmmio.len = len; + pptmmio.hpa = hpa; + return (ioctl(ctx->fd, VM_MAP_PPTDEV_MMIO, &pptmmio)); +} + +int +vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int pptfd, uint64_t addr, + uint64_t msg, int numvec) +{ + struct vm_pptdev_msi pptmsi; + + pptmsi.vcpu = vcpu; + pptmsi.pptfd = pptfd; + pptmsi.msg = msg; + pptmsi.addr = addr; + pptmsi.numvec = numvec; + return (ioctl(ctx->fd, VM_PPTDEV_MSI, &pptmsi)); +} + +int +vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int pptfd, int idx, + uint64_t addr, uint64_t msg, uint32_t vector_control) +{ + struct vm_pptdev_msix pptmsix; + + pptmsix.vcpu = vcpu; + pptmsix.pptfd = pptfd; + pptmsix.idx = idx; + pptmsix.msg = msg; + pptmsix.addr = addr; + pptmsix.vector_control = vector_control; + return ioctl(ctx->fd, VM_PPTDEV_MSIX, &pptmsix); +} + +int +vm_get_pptdev_limits(struct vmctx *ctx, int pptfd, int *msi_limit, + int *msix_limit) +{ + struct vm_pptdev_limits pptlimits; + int error; + + bzero(&pptlimits, sizeof (pptlimits)); + pptlimits.pptfd = pptfd; + error = ioctl(ctx->fd, VM_GET_PPTDEV_LIMITS, &pptlimits); + + *msi_limit = pptlimits.msi_limit; + *msix_limit 
= pptlimits.msix_limit; + return (error); +} +#endif /* __FreeBSD__ */ + uint64_t * vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv, int *ret_entries) diff --git a/usr/src/lib/libvmmapi/common/vmmapi.h b/usr/src/lib/libvmmapi/common/vmmapi.h index a1507255cb..1b08a9cae5 100644 --- a/usr/src/lib/libvmmapi/common/vmmapi.h +++ b/usr/src/lib/libvmmapi/common/vmmapi.h @@ -177,6 +177,7 @@ int vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int *retval); int vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int val); +#ifdef __FreeBSD__ int vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func); int vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func); int vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, @@ -186,6 +187,20 @@ int vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func, int idx, uint64_t addr, uint64_t msg, uint32_t vector_control); +int vm_get_pptdev_limits(struct vmctx *ctx, int bus, int slot, int func, + int *msi_limit, int *msix_limit); +#else /* __FreeBSD__ */ +int vm_assign_pptdev(struct vmctx *ctx, int pptfd); +int vm_unassign_pptdev(struct vmctx *ctx, int pptfd); +int vm_map_pptdev_mmio(struct vmctx *ctx, int pptfd, vm_paddr_t gpa, + size_t len, vm_paddr_t hpa); +int vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int pptfd, + uint64_t addr, uint64_t msg, int numvec); +int vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int pptfd, + int idx, uint64_t addr, uint64_t msg, uint32_t vector_control); +int vm_get_pptdev_limits(struct vmctx *ctx, int pptfd, int *msi_limit, + int *msix_limit); +#endif /* __FreeBSD__ */ int vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *i1, uint64_t *i2); int vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t exit_intinfo); diff --git a/usr/src/man/man1m/Makefile b/usr/src/man/man1m/Makefile index 9f01ad7606..fc43842db2 100644 --- a/usr/src/man/man1m/Makefile +++ b/usr/src/man/man1m/Makefile @@ -547,6 +547,7 @@ i386_MANFILES= \ acpidump.1m \ acpixtract.1m \ nvmeadm.1m \ + pptadm.1m \ rdmsr.1m sparc_MANFILES= dcs.1m \ diff --git a/usr/src/man/man1m/pptadm.1m b/usr/src/man/man1m/pptadm.1m new file mode 100644 index 0000000000..f13a5e32a4 --- /dev/null +++ b/usr/src/man/man1m/pptadm.1m @@ -0,0 +1,74 @@ +.\" +.\" This file and its contents are supplied under the terms of the +.\" Common Development and Distribution License ("CDDL"), version 1.0. +.\" You may only use this file in accordance with the terms of version +.\" 1.0 of the CDDL. +.\" +.\" A full copy of the text of the CDDL should have accompanied this +.\" source. A copy of the CDDL is also available via the Internet at +.\" http://www.illumos.org/license/CDDL. +.\" +.\" Copyright 2018 Joyent, Inc. +.\" +.Dd April 10, 2018 +.Dt PPTADM 1M +.Os +.Sh NAME +.Nm pptadm +.Nd PPT administration utility +.Sh SYNOPSIS +.Nm +.Cm list -j +.Op Fl a +.Nm +.Cm list +.Op Fl ap Op Fl o Ar fields +.Sh DESCRIPTION +The +.Nm +utility can enumerate passthrough devices for use by a virtualized guest. +.Sh OPTIONS +The following options to the +.Cm list +command are supported: +.Bl -tag -width Ds +.It Fl a +Show all PPT devices, both available and assigned. +.It Fl j +Output JSON. +.It Fl o +Specify fields to output, or "all". Available fields are +dev,path,vendor,device,subvendor,subdevice,rev,label +.It Fl p +Output in a parsable format; this requires the -o option to be specified. 
+.El
+.Sh JSON OUTPUT
+The JSON output consists of an array under the key "devices" with the fields:
+.Bl -tag -width Ds
+.It dev
+The PPT /dev path, if assigned and bound.
+.It path
+The physical /devices path.
+.It vendor-id
+The PCI vendor ID.
+.It device-id
+The PCI device ID.
+.It subsystem-vendor-id
+The PCI subsystem vendor ID.
+.It subsystem-id
+The PCI subsystem ID.
+.It revision-id
+The PCI device revision.
+.It label
+Human-readable description from the PCI database.
+.El
+.Sh FILES
+.Bl -tag -width Ds
+.It /etc/ppt_aliases
+Contains the bindings of PPT devices in the same format as /etc/driver_aliases.
+.It /etc/ppt_matches
+Identifies devices that PPT could be bound to, either by physical path or by
+PCI ID.
+.El
+.Sh EXIT STATUS
+.Ex -std
diff --git a/usr/src/pkg/manifests/system-bhyve.mf b/usr/src/pkg/manifests/system-bhyve.mf
index 7fdeb81254..002bef64cc 100644
--- a/usr/src/pkg/manifests/system-bhyve.mf
+++ b/usr/src/pkg/manifests/system-bhyve.mf
@@ -35,14 +35,22 @@ dir path=usr group=sys
 dir path=usr/kernel/drv group=sys
 dir path=usr/kernel/drv/$(ARCH64) group=sys
 dir path=usr/sbin
+dir path=usr/share
+dir path=usr/share/man
+dir path=usr/share/man/man1m
+driver name=ppt
 driver name=viona
 driver name=vmm
+file path=usr/kernel/drv/$(ARCH64)/ppt
 file path=usr/kernel/drv/$(ARCH64)/viona
 file path=usr/kernel/drv/$(ARCH64)/vmm
+file path=usr/kernel/drv/ppt.conf
 file path=usr/kernel/drv/viona.conf
 file path=usr/kernel/drv/vmm.conf
 file path=usr/sbin/bhyve mode=0555
 file path=usr/sbin/bhyvectl mode=0555
+file path=usr/sbin/pptadm mode=0555
+file path=usr/share/man/man1m/pptadm.1m
 license lic_CDDL license=lic_CDDL
 depend fmri=developer/acpi type=require
 depend fmri=system/bhyve/firmware type=require
diff --git a/usr/src/pkg/manifests/system-library-bhyve.mf b/usr/src/pkg/manifests/system-library-bhyve.mf
index d9a15e1b37..3c7e52c938 100644
--- a/usr/src/pkg/manifests/system-library-bhyve.mf
+++ b/usr/src/pkg/manifests/system-library-bhyve.mf
@@ -27,5 +27,8 @@ dir path=lib group=bin
 dir path=lib/$(ARCH64) group=bin
 dir path=usr group=sys
 dir path=usr/lib group=bin
+dir path=usr/lib/$(ARCH64) group=bin
 file path=lib/$(ARCH64)/libvmmapi.so.1
+file path=usr/lib/$(ARCH64)/libppt.so.1
+file path=usr/lib/libppt.so.1
 license lic_CDDL license=lic_CDDL
diff --git a/usr/src/uts/common/os/modsysfile.c b/usr/src/uts/common/os/modsysfile.c
index 7875824a86..2015cfefae 100644
--- a/usr/src/uts/common/os/modsysfile.c
+++ b/usr/src/uts/common/os/modsysfile.c
@@ -58,10 +58,12 @@ struct hwc_class *hcl_head;	/* head of list of classes */
 static kmutex_t hcl_lock;	/* for accessing list of classes */
 
 #define	DAFILE		"/etc/driver_aliases"
+#define	PPTFILE		"/etc/ppt_aliases"
 #define	CLASSFILE	"/etc/driver_classes"
 #define	DACFFILE	"/etc/dacf.conf"
 
 static char class_file[] = CLASSFILE;
+static char pptfile[] = PPTFILE;
 static char dafile[] = DAFILE;
 static char dacffile[] = DACFFILE;
 
@@ -2170,14 +2172,13 @@ hwc_parse_now(char *fname, struct par_list **pl, ddi_prop_t **props)
 	return (0);	/* always return success */
 }
 
-void
-make_aliases(struct bind **bhash)
+static void
+parse_aliases(struct bind **bhash, struct _buf *file)
 {
 	enum {
 		AL_NEW, AL_DRVNAME, AL_DRVNAME_COMMA, AL_ALIAS, AL_ALIAS_COMMA
 	} state;
 
-	struct _buf *file;
 	char tokbuf[MAXPATHLEN];
 	char drvbuf[MAXPATHLEN];
 	token_t token;
@@ -2186,9 +2187,6 @@ make_aliases(struct bind **bhash)
 	static char dupwarn[] = "!Driver alias \"%s\" conflicts with "
 	    "an existing driver name or alias.";
 
-	if ((file = kobj_open_file(dafile)) == (struct _buf *)-1)
-
return; - state = AL_NEW; major = DDI_MAJOR_T_NONE; while (!done) { @@ -2273,8 +2271,22 @@ make_aliases(struct bind **bhash) kobj_file_err(CE_WARN, file, tok_err, tokbuf); } } +} - kobj_close_file(file); +void +make_aliases(struct bind **bhash) +{ + struct _buf *file; + + if ((file = kobj_open_file(pptfile)) != (struct _buf *)-1) { + parse_aliases(bhash, file); + kobj_close_file(file); + } + + if ((file = kobj_open_file(dafile)) != (struct _buf *)-1) { + parse_aliases(bhash, file); + kobj_close_file(file); + } } diff --git a/usr/src/uts/i86pc/Makefile.files b/usr/src/uts/i86pc/Makefile.files index 312c0f233d..ed404d3d6d 100644 --- a/usr/src/uts/i86pc/Makefile.files +++ b/usr/src/uts/i86pc/Makefile.files @@ -247,6 +247,7 @@ VMM_OBJS += vmm.o \ vmm_stat.o \ vmm_util.o \ x86.o \ + iommu.o \ vdev.o \ vatpic.o \ vatpit.o \ @@ -260,14 +261,14 @@ VMM_OBJS += vmm.o \ vmx_msr.o \ vmx.o \ vmx_support.o \ + vtd.o \ + vtd_sol.o \ svm.o \ svm_msr.o \ npt.o \ vmcb.o \ svm_support.o \ amdv.o \ - sol_iommu.o \ - sol_ppt.o \ gipt.o \ vmm_sol_vm.o \ vmm_sol_glue.o \ @@ -282,6 +283,8 @@ VIONA_OBJS += viona_main.o \ viona_tx.o \ viona_hook.o \ +PPT_OBJS += ppt.o + # # Build up defines and paths. # diff --git a/usr/src/uts/i86pc/Makefile.i86pc b/usr/src/uts/i86pc/Makefile.i86pc index b60d24d82c..0c921b4028 100644 --- a/usr/src/uts/i86pc/Makefile.i86pc +++ b/usr/src/uts/i86pc/Makefile.i86pc @@ -248,6 +248,7 @@ DRV_KMODS += fipe DRV_KMODS += imc imcstub DRV_KMODS += vmm DRV_KMODS += viona +DRV_KMODS += ppt DRV_KMODS += cpudrv diff --git a/usr/src/uts/i86pc/io/vmm/intel/vtd.c b/usr/src/uts/i86pc/io/vmm/intel/vtd.c index 9474b30fc6..902080e34c 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vtd.c +++ b/usr/src/uts/i86pc/io/vmm/intel/vtd.c @@ -44,6 +44,8 @@ __FBSDID("$FreeBSD$"); #include #include +#include + #include "io/iommu.h" /* @@ -120,6 +122,9 @@ static int drhd_num; static struct vtdmap *vtdmaps[DRHD_MAX_UNITS]; static int max_domains; typedef int (*drhd_ident_func_t)(void); +#ifndef __FreeBSD__ +static dev_info_t *vtddips[DRHD_MAX_UNITS]; +#endif static uint64_t root_table[PAGE_SIZE / sizeof(uint64_t)] __aligned(4096); static uint64_t ctx_tables[256][PAGE_SIZE / sizeof(uint64_t)] __aligned(4096); @@ -237,19 +242,63 @@ vtd_translation_disable(struct vtdmap *vtdmap) ; } +static void * +vtd_map(dev_info_t *dip) +{ + caddr_t regs; + ddi_acc_handle_t hdl; + int error; + + static ddi_device_acc_attr_t regs_attr = { + DDI_DEVICE_ATTR_V0, + DDI_NEVERSWAP_ACC, + DDI_STRICTORDER_ACC, + }; + + error = ddi_regs_map_setup(dip, 0, ®s, 0, PAGE_SIZE, ®s_attr, + &hdl); + + if (error != DDI_SUCCESS) + return (NULL); + + ddi_set_driver_private(dip, hdl); + + return (regs); +} + +static void +vtd_unmap(dev_info_t *dip) +{ + ddi_acc_handle_t hdl = ddi_get_driver_private(dip); + + if (hdl != NULL) + ddi_regs_map_free(&hdl); +} + +#ifndef __FreeBSD__ +/* + * This lives in vtd_sol.c for license reasons. + */ +extern dev_info_t *vtd_get_dip(ACPI_DMAR_HARDWARE_UNIT *, int); +#endif + static int vtd_init(void) { int i, units, remaining; struct vtdmap *vtdmap; vm_paddr_t ctx_paddr; - char *end, envname[32]; + char *end; +#ifdef __FreeBSD__ + char envname[32]; unsigned long mapaddr; +#endif ACPI_STATUS status; ACPI_TABLE_DMAR *dmar; ACPI_DMAR_HEADER *hdr; ACPI_DMAR_HARDWARE_UNIT *drhd; +#ifdef __FreeBSD__ /* * Allow the user to override the ACPI DMAR table by specifying the * physical address of each remapping unit. @@ -268,7 +317,9 @@ vtd_init(void) if (units > 0) goto skip_dmar; - +#else + units = 0; +#endif /* Search for DMAR table. 
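+	 * The DMAR ACPI table is what enumerates the platform's DMA
+	 * remapping hardware units (DRHDs); each unit found is mapped
+	 * into a vtdmap below.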
*/ status = AcpiGetTable(ACPI_SIG_DMAR, 0, (ACPI_TABLE_HEADER **)&dmar); if (ACPI_FAILURE(status)) @@ -291,7 +342,15 @@ vtd_init(void) break; drhd = (ACPI_DMAR_HARDWARE_UNIT *)hdr; +#ifdef __FreeBSD__ vtdmaps[units++] = (struct vtdmap *)PHYS_TO_DMAP(drhd->Address); +#else + vtddips[units] = vtd_get_dip(drhd, units); + vtdmaps[units] = (struct vtdmap *)vtd_map(vtddips[units]); + if (vtdmaps[units] == NULL) + goto fail; + units++; +#endif if (units >= DRHD_MAX_UNITS) break; remaining -= hdr->Length; @@ -300,7 +359,9 @@ vtd_init(void) if (units <= 0) return (ENXIO); +#ifdef __FreeBSD__ skip_dmar: +#endif drhd_num = units; vtdmap = vtdmaps[0]; @@ -321,11 +382,36 @@ skip_dmar: } return (0); + +#ifndef __FreeBSD__ +fail: + for (i = 0; i <= units; i++) + vtd_unmap(vtddips[i]); + return (ENXIO); +#endif } static void vtd_cleanup(void) { +#ifndef __FreeBSD__ + int i; + + KASSERT(SLIST_EMPTY(&domhead), ("domain list not empty")); + + bzero(root_table, sizeof (root_table)); + + for (i = 0; i <= drhd_num; i++) { + vtdmaps[i] = NULL; + /* + * Unmap the vtd registers. Note that the devinfo nodes + * themselves aren't removed, they are considered system state + * and can be reused when the module is reloaded. + */ + if (vtddips[i] != NULL) + vtd_unmap(vtddips[i]); + } +#endif } static void @@ -619,6 +705,7 @@ vtd_create_domain(vm_paddr_t maxaddr) if ((uintptr_t)dom->ptp & PAGE_MASK) panic("vtd_create_domain: ptp (%p) not page aligned", dom->ptp); +#ifdef __FreeBSD__ #ifdef notyet /* * XXX superpage mappings for the iommu do not work correctly. @@ -636,6 +723,18 @@ vtd_create_domain(vm_paddr_t maxaddr) */ dom->spsmask = VTD_CAP_SPS(vtdmap->cap); #endif +#else + /* + * On illumos we decidedly do not remove memory mapped to a VM's domain + * from the host_domain, so we don't have to deal with page demotion and + * can just use large pages. + * + * Since VM memory is currently allocated as 4k pages and mapped into + * the VM domain page by page, the use of large pages is essentially + * limited to the host_domain. + */ + dom->spsmask = VTD_CAP_SPS(vtdmap->cap); +#endif SLIST_INSERT_HEAD(&domhead, dom, next); diff --git a/usr/src/uts/i86pc/io/vmm/intel/vtd_sol.c b/usr/src/uts/i86pc/io/vmm/intel/vtd_sol.c new file mode 100644 index 0000000000..1dbe8ffa48 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/vtd_sol.c @@ -0,0 +1,83 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#include +#include + +dev_info_t * +vtd_get_dip(ACPI_DMAR_HARDWARE_UNIT *drhd, int unit) +{ + dev_info_t *dip; + struct ddi_parent_private_data *pdptr; + struct regspec reg; + int circ; + + /* + * Try to find an existing devinfo node for this vtd unit. + */ + ndi_devi_enter(ddi_root_node(), &circ); + dip = ddi_find_devinfo("vtd", unit, 0); + ndi_devi_exit(ddi_root_node(), circ); + + if (dip != NULL) + return (dip); + + /* + * None found, construct a devinfo node for this vtd unit. 
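+	 * The node hangs off the root nexus and carries a "reg" property
+	 * for the unit's register page, which vtd.c later maps with
+	 * ddi_regs_map_setup() via vtd_map().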
+	 */
+	dip = ddi_add_child(ddi_root_node(), "vtd",
+	    DEVI_SID_NODEID, unit);
+
+	reg.regspec_bustype = 0;
+	reg.regspec_addr = drhd->Address;
+	reg.regspec_size = PAGE_SIZE;
+
+	/*
+	 * update the reg properties
+	 *
+	 *	reg property will be used for register
+	 *	set access
+	 *
+	 * refer to the bus_map of root nexus driver
+	 * I/O or memory mapping:
+	 *
+	 * <bustype=0, addr=x, len=x>: memory
+	 * <bustype=1, addr=x, len=x>: i/o
+	 * <bustype>1, addr=0, len=x>: x86-compatibility i/o
+	 */
+	(void) ndi_prop_update_int_array(DDI_DEV_T_NONE,
+	    dip, "reg", (int *)&reg,
+	    sizeof (struct regspec) / sizeof (int));
+
+	/*
+	 * This is an artificially constructed dev_info, and we
+	 * need to set a few more things to be able to use it
+	 * for ddi_dma_alloc_handle/free_handle.
+	 */
+	ddi_set_driver(dip, ddi_get_driver(ddi_root_node()));
+	DEVI(dip)->devi_bus_dma_allochdl =
+	    DEVI(ddi_get_driver((ddi_root_node())));
+
+	pdptr = kmem_zalloc(sizeof (struct ddi_parent_private_data) +
+	    sizeof (struct regspec), KM_SLEEP);
+	pdptr->par_nreg = 1;
+	pdptr->par_reg = (struct regspec *)(pdptr + 1);
+	pdptr->par_reg->regspec_bustype = 0;
+	pdptr->par_reg->regspec_addr = drhd->Address;
+	pdptr->par_reg->regspec_size = PAGE_SIZE;
+	ddi_set_parent_data(dip, pdptr);
+
+	return (dip);
+}
diff --git a/usr/src/uts/i86pc/io/vmm/io/iommu.c b/usr/src/uts/i86pc/io/vmm/io/iommu.c
new file mode 100644
index 0000000000..b949573fe2
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/io/iommu.c
@@ -0,0 +1,383 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include +#include + +#include "vmm_util.h" +#include "vmm_mem.h" +#include "iommu.h" + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, iommu, CTLFLAG_RW, 0, "bhyve iommu parameters"); + +static int iommu_avail; +SYSCTL_INT(_hw_vmm_iommu, OID_AUTO, initialized, CTLFLAG_RD, &iommu_avail, + 0, "bhyve iommu initialized?"); + +static int iommu_enable = 1; +SYSCTL_INT(_hw_vmm_iommu, OID_AUTO, enable, CTLFLAG_RDTUN, &iommu_enable, 0, + "Enable use of I/O MMU (required for PCI passthrough)."); + +static struct iommu_ops *ops; +static void *host_domain; +#ifdef __FreeBSD__ +static eventhandler_tag add_tag, delete_tag; +#endif + +#ifndef __FreeBSD__ +static volatile u_int iommu_initted; +#endif + +static __inline int +IOMMU_INIT(void) +{ + if (ops != NULL) + return ((*ops->init)()); + else + return (ENXIO); +} + +static __inline void +IOMMU_CLEANUP(void) +{ + if (ops != NULL && iommu_avail) + (*ops->cleanup)(); +} + +static __inline void * +IOMMU_CREATE_DOMAIN(vm_paddr_t maxaddr) +{ + + if (ops != NULL && iommu_avail) + return ((*ops->create_domain)(maxaddr)); + else + return (NULL); +} + +static __inline void +IOMMU_DESTROY_DOMAIN(void *dom) +{ + + if (ops != NULL && iommu_avail) + (*ops->destroy_domain)(dom); +} + +static __inline uint64_t +IOMMU_CREATE_MAPPING(void *domain, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len) +{ + + if (ops != NULL && iommu_avail) + return ((*ops->create_mapping)(domain, gpa, hpa, len)); + else + return (len); /* XXX */ +} + +static __inline uint64_t +IOMMU_REMOVE_MAPPING(void *domain, vm_paddr_t gpa, uint64_t len) +{ + + if (ops != NULL && iommu_avail) + return ((*ops->remove_mapping)(domain, gpa, len)); + else + return (len); /* XXX */ +} + +static __inline void +IOMMU_ADD_DEVICE(void *domain, uint16_t rid) +{ + + if (ops != NULL && iommu_avail) + (*ops->add_device)(domain, rid); +} + +static __inline void +IOMMU_REMOVE_DEVICE(void *domain, uint16_t rid) +{ + + if (ops != NULL && iommu_avail) + (*ops->remove_device)(domain, rid); +} + +static __inline void +IOMMU_INVALIDATE_TLB(void *domain) +{ + + if (ops != NULL && iommu_avail) + (*ops->invalidate_tlb)(domain); +} + +static __inline void +IOMMU_ENABLE(void) +{ + + if (ops != NULL && iommu_avail) + (*ops->enable)(); +} + +static __inline void +IOMMU_DISABLE(void) +{ + + if (ops != NULL && iommu_avail) + (*ops->disable)(); +} + +#ifdef __FreeBSD__ +static void +iommu_pci_add(void *arg, device_t dev) +{ + + /* Add new devices to the host domain. 
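+	 * This runs from the pci_add_device eventhandler registered in
+	 * iommu_init(), so hot-added devices retain working host DMA.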
*/ + iommu_add_device(host_domain, pci_get_rid(dev)); +} + +static void +iommu_pci_delete(void *arg, device_t dev) +{ + + iommu_remove_device(host_domain, pci_get_rid(dev)); +} +#endif + +#ifndef __FreeBSD__ +static int +iommu_find_device(dev_info_t *dip, void *arg) +{ + boolean_t add = (boolean_t)arg; + + if (pcie_is_pci_device(dip)) { + if (add) + iommu_add_device(host_domain, pci_get_rid(dip)); + else + iommu_remove_device(host_domain, pci_get_rid(dip)); + } + + return (DDI_WALK_CONTINUE); +} +#endif + +static void +iommu_init(void) +{ + int error, bus, slot, func; + vm_paddr_t maxaddr; +#ifdef __FreeBSD__ + devclass_t dc; +#endif + device_t dev; + + if (!iommu_enable) + return; + + if (vmm_is_intel()) + ops = &iommu_ops_intel; + else if (vmm_is_amd()) + ops = &iommu_ops_amd; + else + ops = NULL; + + error = IOMMU_INIT(); + if (error) + return; + + iommu_avail = 1; + + /* + * Create a domain for the devices owned by the host + */ + maxaddr = vmm_mem_maxaddr(); + host_domain = IOMMU_CREATE_DOMAIN(maxaddr); + if (host_domain == NULL) { + printf("iommu_init: unable to create a host domain"); + IOMMU_CLEANUP(); + ops = NULL; + iommu_avail = 0; + return; + } + + /* + * Create 1:1 mappings from '0' to 'maxaddr' for devices assigned to + * the host + */ + iommu_create_mapping(host_domain, 0, 0, maxaddr); + +#ifdef __FreeBSD__ + add_tag = EVENTHANDLER_REGISTER(pci_add_device, iommu_pci_add, NULL, 0); + delete_tag = EVENTHANDLER_REGISTER(pci_delete_device, iommu_pci_delete, + NULL, 0); + dc = devclass_find("ppt"); + for (bus = 0; bus <= PCI_BUSMAX; bus++) { + for (slot = 0; slot <= PCI_SLOTMAX; slot++) { + for (func = 0; func <= PCI_FUNCMAX; func++) { + dev = pci_find_dbsf(0, bus, slot, func); + if (dev == NULL) + continue; + + /* Skip passthrough devices. */ + if (dc != NULL && + device_get_devclass(dev) == dc) + continue; + + /* + * Everything else belongs to the host + * domain. 
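+				 * Such devices see the 1:1 mappings
+				 * created for host_domain above.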
+ */ + iommu_add_device(host_domain, + pci_get_rid(dev)); + } + } + } +#else + ddi_walk_devs(ddi_root_node(), iommu_find_device, (void *)B_TRUE); +#endif + IOMMU_ENABLE(); + +} + +void +iommu_cleanup(void) +{ +#ifdef __FreeBSD__ + if (add_tag != NULL) { + EVENTHANDLER_DEREGISTER(pci_add_device, add_tag); + add_tag = NULL; + } + if (delete_tag != NULL) { + EVENTHANDLER_DEREGISTER(pci_delete_device, delete_tag); + delete_tag = NULL; + } +#else + atomic_store_rel_int(&iommu_initted, 0); +#endif + IOMMU_DISABLE(); +#ifndef __FreeBSD__ + ddi_walk_devs(ddi_root_node(), iommu_find_device, (void *)B_FALSE); +#endif + IOMMU_DESTROY_DOMAIN(host_domain); + IOMMU_CLEANUP(); +#ifndef __FreeBSD__ + ops = NULL; +#endif +} + +void * +iommu_create_domain(vm_paddr_t maxaddr) +{ + if (iommu_initted < 2) { + if (atomic_cmpset_int(&iommu_initted, 0, 1)) { + iommu_init(); + atomic_store_rel_int(&iommu_initted, 2); + } else + while (iommu_initted == 1) + cpu_spinwait(); + } + return (IOMMU_CREATE_DOMAIN(maxaddr)); +} + +void +iommu_destroy_domain(void *dom) +{ + + IOMMU_DESTROY_DOMAIN(dom); +} + +void +iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, size_t len) +{ + uint64_t mapped, remaining; + + remaining = len; + + while (remaining > 0) { + mapped = IOMMU_CREATE_MAPPING(dom, gpa, hpa, remaining); + gpa += mapped; + hpa += mapped; + remaining -= mapped; + } +} + +void +iommu_remove_mapping(void *dom, vm_paddr_t gpa, size_t len) +{ + uint64_t unmapped, remaining; + + remaining = len; + + while (remaining > 0) { + unmapped = IOMMU_REMOVE_MAPPING(dom, gpa, remaining); + gpa += unmapped; + remaining -= unmapped; + } +} + +void * +iommu_host_domain(void) +{ + + return (host_domain); +} + +void +iommu_add_device(void *dom, uint16_t rid) +{ + + IOMMU_ADD_DEVICE(dom, rid); +} + +void +iommu_remove_device(void *dom, uint16_t rid) +{ + + IOMMU_REMOVE_DEVICE(dom, rid); +} + +void +iommu_invalidate_tlb(void *domain) +{ + + IOMMU_INVALIDATE_TLB(domain); +} diff --git a/usr/src/uts/i86pc/io/vmm/io/ppt.c b/usr/src/uts/i86pc/io/vmm/io/ppt.c new file mode 100644 index 0000000000..a71ce86c2d --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/ppt.c @@ -0,0 +1,1436 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vmm_lapic.h" +#include "vmm_ktr.h" + +#include "iommu.h" +#include "ppt.h" + +#define MAX_MSIMSGS 32 + +/* + * If the MSI-X table is located in the middle of a BAR then that MMIO + * region gets split into two segments - one segment above the MSI-X table + * and the other segment below the MSI-X table - with a hole in place of + * the MSI-X table so accesses to it can be trapped and emulated. + * + * So, allocate a MMIO segment for each BAR register + 1 additional segment. + */ +#define MAX_MMIOSEGS ((PCIR_MAX_BAR_0 + 1) + 1) + +struct pptintr_arg { + struct pptdev *pptdev; + uint64_t addr; + uint64_t msg_data; +}; + +struct pptseg { + vm_paddr_t gpa; + size_t len; + int wired; +}; + +struct pptbar { + uint64_t base; + uint64_t size; + uint_t type; + ddi_acc_handle_t io_handle; + caddr_t io_ptr; +}; + +struct pptdev { + dev_info_t *pptd_dip; + list_node_t pptd_node; + ddi_acc_handle_t pptd_cfg; + struct pptbar pptd_bars[PCI_BASE_NUM]; + struct vm *vm; + struct pptseg mmio[MAX_MMIOSEGS]; + struct { + int num_msgs; /* guest state */ + boolean_t is_fixed; + size_t inth_sz; + ddi_intr_handle_t *inth; + struct pptintr_arg arg[MAX_MSIMSGS]; + } msi; + + struct { + int num_msgs; + size_t inth_sz; + size_t arg_sz; + ddi_intr_handle_t *inth; + struct pptintr_arg *arg; + } msix; +}; + + +static major_t ppt_major; +static void *ppt_state; +static kmutex_t pptdev_mtx; +static list_t pptdev_list; + +#define PPT_MINOR_NAME "ppt" + +static ddi_device_acc_attr_t ppt_attr = { + DDI_DEVICE_ATTR_V0, + DDI_NEVERSWAP_ACC, + DDI_STORECACHING_OK_ACC, + DDI_DEFAULT_ACC +}; + +static int +ppt_open(dev_t *devp, int flag, int otyp, cred_t *cr) +{ + /* XXX: require extra privs? 
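+	 * As written, any caller able to open the minor node gets full
+	 * config-space and BAR access through the ioctls below.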
*/ + return (0); +} + +#define BAR_TO_IDX(bar) (((bar) - PCI_CONF_BASE0) / PCI_BAR_SZ_32) +#define BAR_VALID(b) ( \ + (b) >= PCI_CONF_BASE0 && \ + (b) <= PCI_CONF_BASE5 && \ + ((b) & (PCI_BAR_SZ_32-1)) == 0) + +static int +ppt_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv) +{ + minor_t minor = getminor(dev); + struct pptdev *ppt; + void *data = (void *)arg; + + if ((ppt = ddi_get_soft_state(ppt_state, minor)) == NULL) { + return (ENOENT); + } + + switch (cmd) { + case PPT_CFG_READ: { + struct ppt_cfg_io cio; + ddi_acc_handle_t cfg = ppt->pptd_cfg; + + if (ddi_copyin(data, &cio, sizeof (cio), md) != 0) { + return (EFAULT); + } + switch (cio.pci_width) { + case 4: + cio.pci_data = pci_config_get32(cfg, cio.pci_off); + break; + case 2: + cio.pci_data = pci_config_get16(cfg, cio.pci_off); + break; + case 1: + cio.pci_data = pci_config_get8(cfg, cio.pci_off); + break; + default: + return (EINVAL); + } + + if (ddi_copyout(&cio, data, sizeof (cio), md) != 0) { + return (EFAULT); + } + return (0); + } + case PPT_CFG_WRITE: { + struct ppt_cfg_io cio; + ddi_acc_handle_t cfg = ppt->pptd_cfg; + + if (ddi_copyin(data, &cio, sizeof (cio), md) != 0) { + return (EFAULT); + } + switch (cio.pci_width) { + case 4: + pci_config_put32(cfg, cio.pci_off, cio.pci_data); + break; + case 2: + pci_config_put16(cfg, cio.pci_off, cio.pci_data); + break; + case 1: + pci_config_put8(cfg, cio.pci_off, cio.pci_data); + break; + default: + return (EINVAL); + } + + return (0); + } + case PPT_BAR_QUERY: { + struct ppt_bar_query barg; + struct pptbar *pbar; + + if (ddi_copyin(data, &barg, sizeof (barg), md) != 0) { + return (EFAULT); + } + if (barg.pbq_baridx >= PCI_BASE_NUM) { + return (EINVAL); + } + pbar = &ppt->pptd_bars[barg.pbq_baridx]; + + if (pbar->base == 0 || pbar->size == 0) { + return (ENOENT); + } + barg.pbq_type = pbar->type; + barg.pbq_base = pbar->base; + barg.pbq_size = pbar->size; + + if (ddi_copyout(&barg, data, sizeof (barg), md) != 0) { + return (EFAULT); + } + return (0); + } + case PPT_BAR_READ: { + struct ppt_bar_io bio; + struct pptbar *pbar; + void *addr; + uint_t rnum; + ddi_acc_handle_t cfg; + + if (ddi_copyin(data, &bio, sizeof (bio), md) != 0) { + return (EFAULT); + } + rnum = bio.pbi_bar; + if (rnum >= PCI_BASE_NUM) { + return (EINVAL); + } + pbar = &ppt->pptd_bars[rnum]; + if (pbar->type != PCI_ADDR_IO || pbar->io_handle == NULL) { + return (EINVAL); + } + addr = pbar->io_ptr + bio.pbi_off; + + switch (bio.pbi_width) { + case 4: + bio.pbi_data = ddi_get32(pbar->io_handle, addr); + break; + case 2: + bio.pbi_data = ddi_get16(pbar->io_handle, addr); + break; + case 1: + bio.pbi_data = ddi_get8(pbar->io_handle, addr); + break; + default: + return (EINVAL); + } + + if (ddi_copyout(&bio, data, sizeof (bio), md) != 0) { + return (EFAULT); + } + return (0); + } + case PPT_BAR_WRITE: { + struct ppt_bar_io bio; + struct pptbar *pbar; + void *addr; + uint_t rnum; + ddi_acc_handle_t cfg; + + if (ddi_copyin(data, &bio, sizeof (bio), md) != 0) { + return (EFAULT); + } + rnum = bio.pbi_bar; + if (rnum >= PCI_BASE_NUM) { + return (EINVAL); + } + pbar = &ppt->pptd_bars[rnum]; + if (pbar->type != PCI_ADDR_IO || pbar->io_handle == NULL) { + return (EINVAL); + } + addr = pbar->io_ptr + bio.pbi_off; + + switch (bio.pbi_width) { + case 4: + ddi_put32(pbar->io_handle, addr, bio.pbi_data); + break; + case 2: + ddi_put16(pbar->io_handle, addr, bio.pbi_data); + break; + case 1: + ddi_put8(pbar->io_handle, addr, bio.pbi_data); + break; + default: + return (EINVAL); + } + + return (0); + } + + 
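+	/*
+	 * Note that PPT_BAR_READ and PPT_BAR_WRITE above only service
+	 * I/O-space BARs; memory BARs are handled through the MMIO
+	 * segment mappings elsewhere in this file.
+	 */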
default: + return (ENOTTY); + } + + return (0); +} + +static int +ppt_find_pba_bar(struct pptdev *ppt) +{ + uint16_t base; + uint32_t pba_off; + + if (PCI_CAP_LOCATE(ppt->pptd_cfg, PCI_CAP_ID_MSI_X, &base) != + DDI_SUCCESS) + return (-1); + + pba_off = pci_config_get32(ppt->pptd_cfg, base + PCI_MSIX_PBA_OFFSET); + + if (pba_off == PCI_EINVAL32) + return (-1); + + return (pba_off & PCI_MSIX_PBA_BIR_MASK); +} + +static int +ppt_devmap(dev_t dev, devmap_cookie_t dhp, offset_t off, size_t len, + size_t *maplen, uint_t model) +{ + minor_t minor; + struct pptdev *ppt; + int err; + int bar; + + minor = getminor(dev); + + if ((ppt = ddi_get_soft_state(ppt_state, minor)) == NULL) + return (ENXIO); + +#ifdef _MULTI_DATAMODEL + if (ddi_model_convert_from(model) != DDI_MODEL_NONE) + return (ENXIO); +#endif + + if (off < 0 || off != P2ALIGN(off, PAGESIZE)) + return (EINVAL); + + if ((bar = ppt_find_pba_bar(ppt)) == -1) + return (EINVAL); + + /* + * Add 1 to the BAR number to get the register number used by DDI. + * Register 0 corresponds to PCI config space, the PCI BARs start at 1. + */ + bar += 1; + + err = devmap_devmem_setup(dhp, ppt->pptd_dip, NULL, bar, off, len, + PROT_USER | PROT_READ | PROT_WRITE, IOMEM_DATA_CACHED, &ppt_attr); + + if (err == DDI_SUCCESS) + *maplen = len; + + return (err); +} + + +static void +ppt_bar_wipe(struct pptdev *ppt) +{ + uint_t i; + + for (i = 0; i < PCI_BASE_NUM; i++) { + struct pptbar *pbar = &ppt->pptd_bars[i]; + if (pbar->type == PCI_ADDR_IO && pbar->io_handle != NULL) { + ddi_regs_map_free(&pbar->io_handle); + } + } + bzero(&ppt->pptd_bars, sizeof (ppt->pptd_bars)); +} + +static int +ppt_bar_crawl(struct pptdev *ppt) +{ + pci_regspec_t *regs; + uint_t rcount, i; + int err = 0, rlen; + + if (ddi_getlongprop(DDI_DEV_T_ANY, ppt->pptd_dip, DDI_PROP_DONTPASS, + "assigned-addresses", (caddr_t)®s, &rlen) != DDI_PROP_SUCCESS) { + return (EIO); + } + + VERIFY3S(rlen, >, 0); + rcount = rlen / sizeof (pci_regspec_t); + for (i = 0; i < rcount; i++) { + pci_regspec_t *reg = ®s[i]; + struct pptbar *pbar; + uint_t bar, rnum; + + DTRACE_PROBE1(ppt__crawl__reg, pci_regspec_t *, reg); + bar = PCI_REG_REG_G(reg->pci_phys_hi); + if (!BAR_VALID(bar)) { + continue; + } + + rnum = BAR_TO_IDX(bar); + pbar = &ppt->pptd_bars[rnum]; + /* is this somehow already populated? 
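+		 * A repeated BAR index in "assigned-addresses" is
+		 * unexpected, so fail with EEXIST rather than clobber
+		 * state populated by an earlier entry.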
*/ + if (pbar->base != 0 || pbar->size != 0) { + err = EEXIST; + break; + } + + pbar->type = reg->pci_phys_hi & PCI_ADDR_MASK; + pbar->base = ((uint64_t)reg->pci_phys_mid << 32) | + (uint64_t)reg->pci_phys_low; + pbar->size = ((uint64_t)reg->pci_size_hi << 32) | + (uint64_t)reg->pci_size_low; + if (pbar->type == PCI_ADDR_IO) { + err = ddi_regs_map_setup(ppt->pptd_dip, rnum, + &pbar->io_ptr, 0, 0, &ppt_attr, &pbar->io_handle); + if (err != 0) { + break; + } + } + } + kmem_free(regs, rlen); + + if (err != 0) { + ppt_bar_wipe(ppt); + } + return (err); +} + +static boolean_t +ppt_bar_verify_mmio(struct pptdev *ppt, uint64_t base, uint64_t size) +{ + const uint64_t map_end = base + size; + + /* Zero-length or overflow mappings are not valid */ + if (map_end <= base) { + return (B_FALSE); + } + /* MMIO bounds should be page-aligned */ + if ((base & PAGEOFFSET) != 0 || (size & PAGEOFFSET) != 0) { + return (B_FALSE); + } + + for (uint_t i = 0; i < PCI_BASE_NUM; i++) { + const struct pptbar *bar = &ppt->pptd_bars[i]; + const uint64_t bar_end = bar->base + bar->size; + + /* Only memory BARs can be mapped */ + if (bar->type != PCI_ADDR_MEM32 && + bar->type != PCI_ADDR_MEM64) { + continue; + } + + /* Does the mapping fit within this BAR? */ + if (base < bar->base || base >= bar_end || + map_end < bar->base || map_end > bar_end) { + continue; + } + + /* This BAR satisfies the provided map */ + return (B_TRUE); + } + return (B_FALSE); +} + +static int +ppt_ddi_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + struct pptdev *ppt = NULL; + char name[PPT_MAXNAMELEN]; + int inst; + + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + inst = ddi_get_instance(dip); + + if (ddi_soft_state_zalloc(ppt_state, inst) != DDI_SUCCESS) { + goto fail; + } + VERIFY(ppt = ddi_get_soft_state(ppt_state, inst)); + ppt->pptd_dip = dip; + ddi_set_driver_private(dip, ppt); + + if (pci_config_setup(dip, &ppt->pptd_cfg) != DDI_SUCCESS) { + goto fail; + } + if (ppt_bar_crawl(ppt) != 0) { + goto fail; + } + if (ddi_create_minor_node(dip, PPT_MINOR_NAME, S_IFCHR, inst, + DDI_PSEUDO, 0) != DDI_SUCCESS) { + goto fail; + } + + mutex_enter(&pptdev_mtx); + list_insert_tail(&pptdev_list, ppt); + mutex_exit(&pptdev_mtx); + + return (DDI_SUCCESS); + +fail: + if (ppt != NULL) { + ddi_remove_minor_node(dip, NULL); + if (ppt->pptd_cfg != NULL) { + pci_config_teardown(&ppt->pptd_cfg); + } + ppt_bar_wipe(ppt); + ddi_soft_state_free(ppt_state, inst); + } + return (DDI_FAILURE); +} + +static int +ppt_ddi_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + struct pptdev *ppt; + int inst; + + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + ppt = ddi_get_driver_private(dip); + inst = ddi_get_instance(dip); + + ASSERT3P(ddi_get_soft_state(ppt_state, inst), ==, ppt); + + mutex_enter(&pptdev_mtx); + if (ppt->vm != NULL) { + mutex_exit(&pptdev_mtx); + return (DDI_FAILURE); + } + list_remove(&pptdev_list, ppt); + mutex_exit(&pptdev_mtx); + + ddi_remove_minor_node(dip, PPT_MINOR_NAME); + ppt_bar_wipe(ppt); + pci_config_teardown(&ppt->pptd_cfg); + ddi_set_driver_private(dip, NULL); + ddi_soft_state_free(ppt_state, inst); + + return (DDI_SUCCESS); +} + +static int +ppt_ddi_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) +{ + int error = DDI_FAILURE; + int inst = getminor((dev_t)arg); + + switch (cmd) { + case DDI_INFO_DEVT2DEVINFO: { + struct pptdev *ppt = ddi_get_soft_state(ppt_state, inst); + + if (ppt != NULL) { + *result = (void *)ppt->pptd_dip; + error = DDI_SUCCESS; + } + break; + } + case DDI_INFO_DEVT2INSTANCE: { + *result = 
(void *)(uintptr_t)inst; + error = DDI_SUCCESS; + break; + } + default: + break; + } + return (error); +} + +static struct cb_ops ppt_cb_ops = { + ppt_open, + nulldev, /* close */ + nodev, /* strategy */ + nodev, /* print */ + nodev, /* dump */ + nodev, /* read */ + nodev, /* write */ + ppt_ioctl, + ppt_devmap, /* devmap */ + NULL, /* mmap */ + NULL, /* segmap */ + nochpoll, /* poll */ + ddi_prop_op, + NULL, + D_NEW | D_MP | D_64BIT | D_DEVMAP, + CB_REV +}; + +static struct dev_ops ppt_ops = { + DEVO_REV, + 0, + ppt_ddi_info, + nulldev, /* identify */ + nulldev, /* probe */ + ppt_ddi_attach, + ppt_ddi_detach, + nodev, /* reset */ + &ppt_cb_ops, + (struct bus_ops *)NULL +}; + +static struct modldrv modldrv = { + &mod_driverops, + "bhyve pci pass-thru", + &ppt_ops +}; + +static struct modlinkage modlinkage = { + MODREV_1, + &modldrv, + NULL +}; + +int +_init(void) +{ + int error; + + mutex_init(&pptdev_mtx, NULL, MUTEX_DRIVER, NULL); + list_create(&pptdev_list, sizeof (struct pptdev), + offsetof(struct pptdev, pptd_node)); + + error = ddi_soft_state_init(&ppt_state, sizeof (struct pptdev), 0); + if (error) { + goto fail; + } + + error = mod_install(&modlinkage); + + ppt_major = ddi_name_to_major("ppt"); +fail: + if (error) { + ddi_soft_state_fini(&ppt_state); + } + return (error); +} + +int +_fini(void) +{ + int error; + + error = mod_remove(&modlinkage); + if (error) + return (error); + ddi_soft_state_fini(&ppt_state); + + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +static boolean_t +ppt_wait_for_pending_txn(dev_info_t *dip, uint_t max_delay_us) +{ + uint16_t cap_ptr, devsts; + ddi_acc_handle_t hdl; + + if (pci_config_setup(dip, &hdl) != DDI_SUCCESS) + return (B_FALSE); + + if (PCI_CAP_LOCATE(hdl, PCI_CAP_ID_PCI_E, &cap_ptr) != DDI_SUCCESS) { + pci_config_teardown(&hdl); + return (B_FALSE); + } + + devsts = PCI_CAP_GET16(hdl, 0, cap_ptr, PCIE_DEVSTS); + while ((devsts & PCIE_DEVSTS_TRANS_PENDING) != 0) { + if (max_delay_us == 0) { + pci_config_teardown(&hdl); + return (B_FALSE); + } + + /* Poll once every 100 milliseconds up to the timeout. 
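+		 * delay() takes clock ticks, so the remaining microsecond
+		 * budget is converted with drv_usectohz().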
*/ + if (max_delay_us > 100000) { + delay(drv_usectohz(100000)); + max_delay_us -= 100000; + } else { + delay(drv_usectohz(max_delay_us)); + max_delay_us = 0; + } + devsts = PCI_CAP_GET16(hdl, 0, cap_ptr, PCIE_DEVSTS); + } + + pci_config_teardown(&hdl); + return (B_TRUE); +} + +static uint_t +ppt_max_completion_tmo_us(dev_info_t *dip) +{ + uint_t timo = 0; + uint16_t cap_ptr; + ddi_acc_handle_t hdl; + uint_t timo_ranges[] = { /* timeout ranges */ + 50000, /* 50ms */ + 100, /* 100us */ + 10000, /* 10ms */ + 0, + 0, + 55000, /* 55ms */ + 210000, /* 210ms */ + 0, + 0, + 900000, /* 900ms */ + 3500000, /* 3.5s */ + 0, + 0, + 13000000, /* 13s */ + 64000000, /* 64s */ + 0 + }; + + if (pci_config_setup(dip, &hdl) != DDI_SUCCESS) + return (50000); /* default 50ms */ + + if (PCI_CAP_LOCATE(hdl, PCI_CAP_ID_PCI_E, &cap_ptr) != DDI_SUCCESS) + goto out; + + if ((PCI_CAP_GET16(hdl, 0, cap_ptr, PCIE_PCIECAP) & + PCIE_PCIECAP_VER_MASK) < PCIE_PCIECAP_VER_2_0) + goto out; + + if ((PCI_CAP_GET16(hdl, 0, cap_ptr, PCIE_DEVCAP2) & + PCIE_DEVCTL2_COM_TO_RANGE_MASK) == 0) + goto out; + + timo = timo_ranges[PCI_CAP_GET16(hdl, 0, cap_ptr, PCIE_DEVCTL2) & + PCIE_DEVCAP2_COM_TO_RANGE_MASK]; + +out: + if (timo == 0) + timo = 50000; /* default 50ms */ + + pci_config_teardown(&hdl); + return (timo); +} + +static boolean_t +ppt_flr(dev_info_t *dip, boolean_t force) +{ + uint16_t cap_ptr, ctl, cmd; + ddi_acc_handle_t hdl; + uint_t compl_delay = 0, max_delay_us; + + if (pci_config_setup(dip, &hdl) != DDI_SUCCESS) + return (B_FALSE); + + if (PCI_CAP_LOCATE(hdl, PCI_CAP_ID_PCI_E, &cap_ptr) != DDI_SUCCESS) + goto fail; + + if ((PCI_CAP_GET16(hdl, 0, cap_ptr, PCIE_DEVCAP) & PCIE_DEVCAP_FLR) + == 0) + goto fail; + + max_delay_us = MAX(ppt_max_completion_tmo_us(dip), 10000); + + /* + * Disable busmastering to prevent generation of new transactions while + * waiting for the device to go idle. If the idle timeout fails, the + * command register is restored which will re-enable busmastering. + */ + cmd = pci_config_get16(hdl, PCI_CONF_COMM); + pci_config_put16(hdl, PCI_CONF_COMM, cmd & ~PCI_COMM_ME); + if (!ppt_wait_for_pending_txn(dip, max_delay_us)) { + if (!force) { + pci_config_put16(hdl, PCI_CONF_COMM, cmd); + goto fail; + } + dev_err(dip, CE_WARN, + "?Resetting with transactions pending after %u us\n", + max_delay_us); + + /* + * Extend the post-FLR delay to cover the maximum Completion + * Timeout delay of anything in flight during the FLR delay. + * Enforce a minimum delay of at least 10ms. + */ + compl_delay = MAX(10, (ppt_max_completion_tmo_us(dip) / 1000)); + } + + /* Initiate the reset. */ + ctl = PCI_CAP_GET16(hdl, 0, cap_ptr, PCIE_DEVCTL); + (void) PCI_CAP_PUT16(hdl, 0, cap_ptr, PCIE_DEVCTL, + ctl | PCIE_DEVCTL_INITIATE_FLR); + + /* Wait for at least 100ms */ + delay(drv_usectohz((100 + compl_delay) * 1000)); + + pci_config_teardown(&hdl); + return (B_TRUE); + +fail: + /* + * TODO: If the FLR fails for some reason, we should attempt a reset + * using the PCI power management facilities (if possible). 
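+	 * (One possibility would be cycling the device through D3hot and
+	 * back to D0 via the PM capability, where supported.)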
+ */ + pci_config_teardown(&hdl); + return (B_FALSE); +} + + +static struct pptdev * +ppt_findf(int fd) +{ + struct pptdev *ppt = NULL; + file_t *fp; + vattr_t va; + + if ((fp = getf(fd)) == NULL) { + return (NULL); + } + + va.va_mask = AT_RDEV; + if (VOP_GETATTR(fp->f_vnode, &va, NO_FOLLOW, fp->f_cred, NULL) != 0 || + getmajor(va.va_rdev) != ppt_major) + goto fail; + + ppt = ddi_get_soft_state(ppt_state, getminor(va.va_rdev)); + + if (ppt != NULL) + return (ppt); + +fail: + releasef(fd); + return (NULL); +} + +static void +ppt_unmap_mmio(struct vm *vm, struct pptdev *ppt) +{ + int i; + struct pptseg *seg; + + for (i = 0; i < MAX_MMIOSEGS; i++) { + seg = &ppt->mmio[i]; + if (seg->len == 0) + continue; + (void) vm_unmap_mmio(vm, seg->gpa, seg->len); + bzero(seg, sizeof (struct pptseg)); + } +} + +static void +ppt_teardown_msi(struct pptdev *ppt) +{ + int i; + + if (ppt->msi.num_msgs == 0) + return; + + for (i = 0; i < ppt->msi.num_msgs; i++) { + int intr_cap; + + (void) ddi_intr_get_cap(ppt->msi.inth[i], &intr_cap); + if (intr_cap & DDI_INTR_FLAG_BLOCK) + ddi_intr_block_disable(&ppt->msi.inth[i], 1); + else + ddi_intr_disable(ppt->msi.inth[i]); + + ddi_intr_remove_handler(ppt->msi.inth[i]); + ddi_intr_free(ppt->msi.inth[i]); + + ppt->msi.inth[i] = NULL; + } + + kmem_free(ppt->msi.inth, ppt->msi.inth_sz); + ppt->msi.inth = NULL; + ppt->msi.inth_sz = 0; + ppt->msi.is_fixed = B_FALSE; + + ppt->msi.num_msgs = 0; +} + +static void +ppt_teardown_msix_intr(struct pptdev *ppt, int idx) +{ + if (ppt->msix.inth != NULL && ppt->msix.inth[idx] != NULL) { + int intr_cap; + + (void) ddi_intr_get_cap(ppt->msix.inth[idx], &intr_cap); + if (intr_cap & DDI_INTR_FLAG_BLOCK) + ddi_intr_block_disable(&ppt->msix.inth[idx], 1); + else + ddi_intr_disable(ppt->msix.inth[idx]); + + ddi_intr_remove_handler(ppt->msix.inth[idx]); + } +} + +static void +ppt_teardown_msix(struct pptdev *ppt) +{ + uint_t i; + + if (ppt->msix.num_msgs == 0) + return; + + for (i = 0; i < ppt->msix.num_msgs; i++) + ppt_teardown_msix_intr(ppt, i); + + if (ppt->msix.inth) { + for (i = 0; i < ppt->msix.num_msgs; i++) + ddi_intr_free(ppt->msix.inth[i]); + kmem_free(ppt->msix.inth, ppt->msix.inth_sz); + ppt->msix.inth = NULL; + ppt->msix.inth_sz = 0; + kmem_free(ppt->msix.arg, ppt->msix.arg_sz); + ppt->msix.arg = NULL; + ppt->msix.arg_sz = 0; + } + + ppt->msix.num_msgs = 0; +} + +int +ppt_assigned_devices(struct vm *vm) +{ + struct pptdev *ppt; + uint_t num = 0; + + mutex_enter(&pptdev_mtx); + for (ppt = list_head(&pptdev_list); ppt != NULL; + ppt = list_next(&pptdev_list, ppt)) { + if (ppt->vm == vm) { + num++; + } + } + mutex_exit(&pptdev_mtx); + return (num); +} + +boolean_t +ppt_is_mmio(struct vm *vm, vm_paddr_t gpa) +{ + struct pptdev *ppt = list_head(&pptdev_list); + + /* XXX: this should probably be restructured to avoid the lock */ + mutex_enter(&pptdev_mtx); + for (ppt = list_head(&pptdev_list); ppt != NULL; + ppt = list_next(&pptdev_list, ppt)) { + if (ppt->vm != vm) { + continue; + } + + for (uint_t i = 0; i < MAX_MMIOSEGS; i++) { + struct pptseg *seg = &ppt->mmio[i]; + + if (seg->len == 0) + continue; + if (gpa >= seg->gpa && gpa < seg->gpa + seg->len) { + mutex_exit(&pptdev_mtx); + return (B_TRUE); + } + } + } + + mutex_exit(&pptdev_mtx); + return (B_FALSE); +} + +int +ppt_assign_device(struct vm *vm, int pptfd) +{ + struct pptdev *ppt; + int err = 0; + + mutex_enter(&pptdev_mtx); + ppt = ppt_findf(pptfd); + if (ppt == NULL) { + mutex_exit(&pptdev_mtx); + return (EBADF); + } + + /* Only one VM may own a device at any given time */ + 
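	/*
	 * For context: pptfd is an open descriptor for the device's ppt
	 * instance, delivered via the VM_BIND_PPTDEV ioctl added later in
	 * this change. A minimal sketch of the userlevel side, with a
	 * hypothetical device path and error handling elided, might be:
	 *
	 *	int pptfd = open("/dev/ppt0", O_RDWR);
	 *	struct vm_pptdev pd = { .pptfd = pptfd };
	 *	(void) ioctl(vmfd, VM_BIND_PPTDEV, &pd);
	 */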
if (ppt->vm != NULL && ppt->vm != vm) { + err = EBUSY; + goto done; + } + + if (pci_save_config_regs(ppt->pptd_dip) != DDI_SUCCESS) { + err = EIO; + goto done; + } + ppt_flr(ppt->pptd_dip, B_TRUE); + + /* + * Restore the device state after reset and then perform another save + * so the "pristine" state can be restored when the device is removed + * from the guest. + */ + if (pci_restore_config_regs(ppt->pptd_dip) != DDI_SUCCESS || + pci_save_config_regs(ppt->pptd_dip) != DDI_SUCCESS) { + err = EIO; + goto done; + } + + ppt->vm = vm; + iommu_remove_device(iommu_host_domain(), pci_get_bdf(ppt->pptd_dip)); + iommu_add_device(vm_iommu_domain(vm), pci_get_bdf(ppt->pptd_dip)); + pf_set_passthru(ppt->pptd_dip, B_TRUE); + +done: + releasef(pptfd); + mutex_exit(&pptdev_mtx); + return (err); +} + +static void +ppt_reset_pci_power_state(dev_info_t *dip) +{ + ddi_acc_handle_t cfg; + uint16_t cap_ptr; + + if (pci_config_setup(dip, &cfg) != DDI_SUCCESS) + return; + + if (PCI_CAP_LOCATE(cfg, PCI_CAP_ID_PM, &cap_ptr) == DDI_SUCCESS) { + uint16_t val; + + val = PCI_CAP_GET16(cfg, 0, cap_ptr, PCI_PMCSR); + if ((val & PCI_PMCSR_STATE_MASK) != PCI_PMCSR_D0) { + val = (val & ~PCI_PMCSR_STATE_MASK) | PCI_PMCSR_D0; + (void) PCI_CAP_PUT16(cfg, 0, cap_ptr, PCI_PMCSR, + val); + } + } + + pci_config_teardown(&cfg); +} + +static void +ppt_do_unassign(struct pptdev *ppt) +{ + struct vm *vm = ppt->vm; + + ASSERT3P(vm, !=, NULL); + ASSERT(MUTEX_HELD(&pptdev_mtx)); + + + ppt_flr(ppt->pptd_dip, B_TRUE); + + /* + * Restore from the state saved during device assignment. + * If the device power state has been altered, that must be remedied + * first, as it will reset register state during the transition. + */ + ppt_reset_pci_power_state(ppt->pptd_dip); + (void) pci_restore_config_regs(ppt->pptd_dip); + + pf_set_passthru(ppt->pptd_dip, B_FALSE); + + ppt_unmap_mmio(vm, ppt); + ppt_teardown_msi(ppt); + ppt_teardown_msix(ppt); + iommu_remove_device(vm_iommu_domain(vm), pci_get_bdf(ppt->pptd_dip)); + iommu_add_device(iommu_host_domain(), pci_get_bdf(ppt->pptd_dip)); + ppt->vm = NULL; +} + +int +ppt_unassign_device(struct vm *vm, int pptfd) +{ + struct pptdev *ppt; + int err = 0; + + mutex_enter(&pptdev_mtx); + ppt = ppt_findf(pptfd); + if (ppt == NULL) { + mutex_exit(&pptdev_mtx); + return (EBADF); + } + + /* If this device is not owned by this 'vm' then bail out. */ + if (ppt->vm != vm) { + err = EBUSY; + goto done; + } + ppt_do_unassign(ppt); + +done: + releasef(pptfd); + mutex_exit(&pptdev_mtx); + return (err); +} + +int +ppt_unassign_all(struct vm *vm) +{ + struct pptdev *ppt; + + mutex_enter(&pptdev_mtx); + for (ppt = list_head(&pptdev_list); ppt != NULL; + ppt = list_next(&pptdev_list, ppt)) { + if (ppt->vm == vm) { + ppt_do_unassign(ppt); + } + } + mutex_exit(&pptdev_mtx); + + return (0); +} + +int +ppt_map_mmio(struct vm *vm, int pptfd, vm_paddr_t gpa, size_t len, + vm_paddr_t hpa) +{ + struct pptdev *ppt; + int err = 0; + + mutex_enter(&pptdev_mtx); + ppt = ppt_findf(pptfd); + if (ppt == NULL) { + mutex_exit(&pptdev_mtx); + return (EBADF); + } + if (ppt->vm != vm) { + err = EBUSY; + goto done; + } + + /* + * Ensure that the host-physical range of the requested mapping fits + * within one of the MMIO BARs of the device. 
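 * That is, [hpa, hpa + len) must fall entirely inside the physical span
 * of a single memory BAR; ppt_bar_verify_mmio() (presumably defined
 * earlier in this file) is expected to reject ranges that straddle BARs
 * or lie outside them, so a guest cannot be handed MMIO access beyond
 * its device.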
+ */ + if (!ppt_bar_verify_mmio(ppt, hpa, len)) { + err = EINVAL; + goto done; + } + + for (uint_t i = 0; i < MAX_MMIOSEGS; i++) { + struct pptseg *seg = &ppt->mmio[i]; + + if (seg->len == 0) { + err = vm_map_mmio(vm, gpa, len, hpa); + if (err == 0) { + seg->gpa = gpa; + seg->len = len; + } + goto done; + } + } + err = ENOSPC; + +done: + releasef(pptfd); + mutex_exit(&pptdev_mtx); + return (err); +} + +static uint_t +pptintr(caddr_t arg, caddr_t unused) +{ + struct pptintr_arg *pptarg = (struct pptintr_arg *)arg; + struct pptdev *ppt = pptarg->pptdev; + + if (ppt->vm != NULL) { + lapic_intr_msi(ppt->vm, pptarg->addr, pptarg->msg_data); + } else { + /* + * XXX + * This is not expected to happen - panic? + */ + } + + /* + * For legacy interrupts give other filters a chance in case + * the interrupt was not generated by the passthrough device. + */ + return (ppt->msi.is_fixed ? DDI_INTR_UNCLAIMED : DDI_INTR_CLAIMED); +} + +int +ppt_setup_msi(struct vm *vm, int vcpu, int pptfd, uint64_t addr, uint64_t msg, + int numvec) +{ + int i, msi_count, intr_type; + struct pptdev *ppt; + int err = 0; + + if (numvec < 0 || numvec > MAX_MSIMSGS) + return (EINVAL); + + mutex_enter(&pptdev_mtx); + ppt = ppt_findf(pptfd); + if (ppt == NULL) { + mutex_exit(&pptdev_mtx); + return (EBADF); + } + if (ppt->vm != vm) { + /* Make sure we own this device */ + err = EBUSY; + goto done; + } + + /* Free any allocated resources */ + ppt_teardown_msi(ppt); + + if (numvec == 0) { + /* nothing more to do */ + goto done; + } + + if (ddi_intr_get_navail(ppt->pptd_dip, DDI_INTR_TYPE_MSI, + &msi_count) != DDI_SUCCESS) { + if (ddi_intr_get_navail(ppt->pptd_dip, DDI_INTR_TYPE_FIXED, + &msi_count) != DDI_SUCCESS) { + err = EINVAL; + goto done; + } + + intr_type = DDI_INTR_TYPE_FIXED; + ppt->msi.is_fixed = B_TRUE; + } else { + intr_type = DDI_INTR_TYPE_MSI; + } + + /* + * The device must be capable of supporting the number of vectors + * the guest wants to allocate. + */ + if (numvec > msi_count) { + err = EINVAL; + goto done; + } + + ppt->msi.inth_sz = numvec * sizeof (ddi_intr_handle_t); + ppt->msi.inth = kmem_zalloc(ppt->msi.inth_sz, KM_SLEEP); + if (ddi_intr_alloc(ppt->pptd_dip, ppt->msi.inth, intr_type, 0, + numvec, &msi_count, 0) != DDI_SUCCESS) { + kmem_free(ppt->msi.inth, ppt->msi.inth_sz); + err = EINVAL; + goto done; + } + + /* Verify that we got as many vectors as the guest requested */ + if (numvec != msi_count) { + ppt_teardown_msi(ppt); + err = EINVAL; + goto done; + } + + /* Set up & enable interrupt handler for each vector. 
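 * msi.num_msgs is advanced before each attempt so that a failure
 * part-way through leaves an accurate count for ppt_teardown_msi() to
 * unwind whichever handlers were successfully installed.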
*/ + for (i = 0; i < numvec; i++) { + int res, intr_cap = 0; + + ppt->msi.num_msgs = i + 1; + ppt->msi.arg[i].pptdev = ppt; + ppt->msi.arg[i].addr = addr; + ppt->msi.arg[i].msg_data = msg + i; + + if (ddi_intr_add_handler(ppt->msi.inth[i], pptintr, + &ppt->msi.arg[i], NULL) != DDI_SUCCESS) + break; + + (void) ddi_intr_get_cap(ppt->msi.inth[i], &intr_cap); + if (intr_cap & DDI_INTR_FLAG_BLOCK) + res = ddi_intr_block_enable(&ppt->msi.inth[i], 1); + else + res = ddi_intr_enable(ppt->msi.inth[i]); + + if (res != DDI_SUCCESS) + break; + } + if (i < numvec) { + ppt_teardown_msi(ppt); + err = ENXIO; + } + +done: + releasef(pptfd); + mutex_exit(&pptdev_mtx); + return (err); +} + +int +ppt_setup_msix(struct vm *vm, int vcpu, int pptfd, int idx, uint64_t addr, + uint64_t msg, uint32_t vector_control) +{ + struct pptdev *ppt; + int numvec, alloced; + int err = 0; + + mutex_enter(&pptdev_mtx); + ppt = ppt_findf(pptfd); + if (ppt == NULL) { + mutex_exit(&pptdev_mtx); + return (EBADF); + } + /* Make sure we own this device */ + if (ppt->vm != vm) { + err = EBUSY; + goto done; + } + + /* + * First-time configuration: + * Allocate the MSI-X table + * Allocate the IRQ resources + * Set up some variables in ppt->msix + */ + if (ppt->msix.num_msgs == 0) { + dev_info_t *dip = ppt->pptd_dip; + + if (ddi_intr_get_navail(dip, DDI_INTR_TYPE_MSIX, + &numvec) != DDI_SUCCESS) { + err = EINVAL; + goto done; + } + + ppt->msix.num_msgs = numvec; + + ppt->msix.arg_sz = numvec * sizeof (ppt->msix.arg[0]); + ppt->msix.arg = kmem_zalloc(ppt->msix.arg_sz, KM_SLEEP); + ppt->msix.inth_sz = numvec * sizeof (ddi_intr_handle_t); + ppt->msix.inth = kmem_zalloc(ppt->msix.inth_sz, KM_SLEEP); + + if (ddi_intr_alloc(dip, ppt->msix.inth, DDI_INTR_TYPE_MSIX, 0, + numvec, &alloced, 0) != DDI_SUCCESS) { + kmem_free(ppt->msix.arg, ppt->msix.arg_sz); + kmem_free(ppt->msix.inth, ppt->msix.inth_sz); + ppt->msix.arg = NULL; + ppt->msix.inth = NULL; + ppt->msix.arg_sz = ppt->msix.inth_sz = 0; + err = EINVAL; + goto done; + } + + if (numvec != alloced) { + ppt_teardown_msix(ppt); + err = EINVAL; + goto done; + } + } + + if (idx >= ppt->msix.num_msgs) { + err = EINVAL; + goto done; + } + + if ((vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { + int intr_cap, res; + + /* Tear down the IRQ if it's already set up */ + ppt_teardown_msix_intr(ppt, idx); + + ppt->msix.arg[idx].pptdev = ppt; + ppt->msix.arg[idx].addr = addr; + ppt->msix.arg[idx].msg_data = msg; + + /* Setup the MSI-X interrupt */ + if (ddi_intr_add_handler(ppt->msix.inth[idx], pptintr, + &ppt->msix.arg[idx], NULL) != DDI_SUCCESS) { + err = ENXIO; + goto done; + } + + (void) ddi_intr_get_cap(ppt->msix.inth[idx], &intr_cap); + if (intr_cap & DDI_INTR_FLAG_BLOCK) + res = ddi_intr_block_enable(&ppt->msix.inth[idx], 1); + else + res = ddi_intr_enable(ppt->msix.inth[idx]); + + if (res != DDI_SUCCESS) { + ddi_intr_remove_handler(ppt->msix.inth[idx]); + err = ENXIO; + goto done; + } + } else { + /* Masked, tear it down if it's already been set up */ + ppt_teardown_msix_intr(ppt, idx); + } + +done: + releasef(pptfd); + mutex_exit(&pptdev_mtx); + return (err); +} + +int +ppt_get_limits(struct vm *vm, int pptfd, int *msilimit, int *msixlimit) +{ + struct pptdev *ppt; + int err = 0; + + mutex_enter(&pptdev_mtx); + ppt = ppt_findf(pptfd); + if (ppt == NULL) { + mutex_exit(&pptdev_mtx); + return (EBADF); + } + if (ppt->vm != vm) { + err = EBUSY; + goto done; + } + + if (ddi_intr_get_navail(ppt->pptd_dip, DDI_INTR_TYPE_MSI, + msilimit) != DDI_SUCCESS) { + *msilimit = -1; + } + if 
(ddi_intr_get_navail(ppt->pptd_dip, DDI_INTR_TYPE_MSIX, + msixlimit) != DDI_SUCCESS) { + *msixlimit = -1; + } + +done: + releasef(pptfd); + mutex_exit(&pptdev_mtx); + return (err); +} diff --git a/usr/src/uts/i86pc/io/vmm/io/ppt.conf b/usr/src/uts/i86pc/io/vmm/io/ppt.conf new file mode 100644 index 0000000000..698cecb6f8 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/ppt.conf @@ -0,0 +1,14 @@ +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2017 Joyent, Inc. +# + diff --git a/usr/src/uts/i86pc/io/vmm/io/ppt.h b/usr/src/uts/i86pc/io/vmm/io/ppt.h index 686b15db49..979c0e18ac 100644 --- a/usr/src/uts/i86pc/io/vmm/io/ppt.h +++ b/usr/src/uts/i86pc/io/vmm/io/ppt.h @@ -31,26 +31,21 @@ #ifndef _IO_PPT_H_ #define _IO_PPT_H_ -int ppt_unassign_all(struct vm *vm); -int ppt_map_mmio(struct vm *vm, int bus, int slot, int func, - vm_paddr_t gpa, size_t len, vm_paddr_t hpa); -int ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func, - uint64_t addr, uint64_t msg, int numvec); -int ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func, - int idx, uint64_t addr, uint64_t msg, uint32_t vector_control); -int ppt_assigned_devices(struct vm *vm); +int ppt_unassign_all(struct vm *vm); +int ppt_map_mmio(struct vm *vm, int pptfd, vm_paddr_t gpa, size_t len, + vm_paddr_t hpa); +int ppt_setup_msi(struct vm *vm, int vcpu, int pptfd, uint64_t addr, + uint64_t msg, int numvec); +int ppt_setup_msix(struct vm *vm, int vcpu, int pptfd, int idx, uint64_t addr, + uint64_t msg, uint32_t vector_control); +int ppt_assigned_devices(struct vm *vm); boolean_t ppt_is_mmio(struct vm *vm, vm_paddr_t gpa); - -/* - * Returns the number of devices sequestered by the ppt driver for assignment - * to virtual machines. - */ -int ppt_avail_devices(void); +int ppt_get_limits(struct vm *vm, int pptfd, int *msilimit, int *msixlimit); /* * The following functions should never be called directly. * Use 'vm_assign_pptdev()' and 'vm_unassign_pptdev()' instead. */ -int ppt_assign_device(struct vm *vm, int bus, int slot, int func); -int ppt_unassign_device(struct vm *vm, int bus, int slot, int func); +int ppt_assign_device(struct vm *vm, int pptfd); +int ppt_unassign_device(struct vm *vm, int pptfd); #endif diff --git a/usr/src/uts/i86pc/io/vmm/io/ppt.mapfile b/usr/src/uts/i86pc/io/vmm/io/ppt.mapfile new file mode 100644 index 0000000000..aac896e89e --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/ppt.mapfile @@ -0,0 +1,52 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. +# + +# +# MAPFILE HEADER START +# +# WARNING: STOP NOW. DO NOT MODIFY THIS FILE. +# Object versioning must comply with the rules detailed in +# +# usr/src/lib/README.mapfiles +# +# You should not be making modifications here until you've read the most current +# copy of that file. 
If you need help, contact a gatekeeper for guidance. +# +# MAPFILE HEADER END +# + +$mapfile_version 2 + +SYMBOL_VERSION ILLUMOSprivate { + global: + # DDI Interfaces + _fini; + _init; + _info; + + # PCI pass-thru API for bhyve + ppt_assigned_devices; + ppt_is_mmio; + ppt_assign_device; + ppt_unassign_device; + ppt_unassign_all; + ppt_map_mmio; + ppt_setup_msi; + ppt_setup_msix; + ppt_get_limits; + + local: + *; +}; diff --git a/usr/src/uts/i86pc/io/vmm/io/sol_iommu.c b/usr/src/uts/i86pc/io/vmm/io/sol_iommu.c deleted file mode 100644 index 989e88e17b..0000000000 --- a/usr/src/uts/i86pc/io/vmm/io/sol_iommu.c +++ /dev/null @@ -1,86 +0,0 @@ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - */ - -/* - * Copyright 2017 Joyent, Inc. - */ - -#include -#include -#include - -/* - * IOMMU Stub - * - * Until proper iommu support can be wired into bhyve, stub out all the - * functions to either fail, if reasonable, or panic. - */ - -void -iommu_cleanup(void) -{ -} - -void * -iommu_host_domain(void) -{ - return (NULL); -} - -/*ARGSUSED*/ -void * -iommu_create_domain(vm_paddr_t maxaddr) -{ - return (NULL); -} - -/*ARGSUSED*/ -void -iommu_destroy_domain(void *dom) -{ - panic("unimplemented"); -} - -/*ARGSUSED*/ -void -iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, size_t len) -{ - panic("unimplemented"); -} - -/*ARGSUSED*/ -void -iommu_remove_mapping(void *dom, vm_paddr_t gpa, size_t len) -{ - panic("unimplemented"); -} - -/*ARGSUSED*/ -void -iommu_add_device(void *dom, uint16_t rid) -{ - panic("unimplemented"); -} - -/*ARGSUSED*/ -void -iommu_remove_device(void *dom, uint16_t rid) -{ - panic("unimplemented"); -} - -/*ARGSUSED*/ -void -iommu_invalidate_tlb(void *domain) -{ - panic("unimplemented"); -} - diff --git a/usr/src/uts/i86pc/io/vmm/io/sol_ppt.c b/usr/src/uts/i86pc/io/vmm/io/sol_ppt.c deleted file mode 100644 index 9d5b1f5cdc..0000000000 --- a/usr/src/uts/i86pc/io/vmm/io/sol_ppt.c +++ /dev/null @@ -1,92 +0,0 @@ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - */ - -/* - * Copyright 2017 Joyent, Inc. - */ - -#include -#include -#include - -#include - -/* - * PCI Pass-Through Stub - * - * Until proper passthrough support can be wired into bhyve, stub out all the - * functions to either fail or no-op. 
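 * (This change deletes the stub file outright; the WSTUB entries added
 * to usr/src/uts/intel/ia32/ml/modstubs.s further below take over as
 * the soft fallbacks used when the real ppt module is not loaded.)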
- */ - -int -ppt_unassign_all(struct vm *vm) -{ - return (0); -} - -/*ARGSUSED*/ -int -ppt_map_mmio(struct vm *vm, int bus, int slot, int func, vm_paddr_t gpa, - size_t len, vm_paddr_t hpa) -{ - return (ENXIO); -} - -/*ARGSUSED*/ -int -ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func, - uint64_t addr, uint64_t msg, int numvec) -{ - return (ENXIO); -} - -/*ARGSUSED*/ -int -ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func, int idx, - uint64_t addr, uint64_t msg, uint32_t vector_control) -{ - return (ENXIO); -} - -/*ARGSUSED*/ -int -ppt_assigned_devices(struct vm *vm) -{ - return (0); -} - -/*ARGSUSED*/ -boolean_t -ppt_is_mmio(struct vm *vm, vm_paddr_t gpa) -{ - return (B_FALSE); -} - -/*ARGSUSED*/ -int -ppt_avail_devices(void) -{ - return (0); -} - -/*ARGSUSED*/ -int -ppt_assign_device(struct vm *vm, int bus, int slot, int func) -{ - return (ENOENT); -} - -/*ARGSUSED*/ -int -ppt_unassign_device(struct vm *vm, int bus, int slot, int func) -{ - return (ENXIO); -} diff --git a/usr/src/uts/i86pc/io/vmm/vmm.c b/usr/src/uts/i86pc/io/vmm/vmm.c index 6df094b50e..dd24a18f6a 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm.c +++ b/usr/src/uts/i86pc/io/vmm/vmm.c @@ -495,6 +495,7 @@ vmm_mod_unload() { int error; + iommu_cleanup(); error = VMM_CLEANUP(); if (error) return (error); @@ -1054,10 +1055,14 @@ vm_iommu_modify(struct vm *vm, boolean_t map) hpa = DMAP_TO_PHYS((uintptr_t)vp); if (map) { iommu_create_mapping(vm->iommu, gpa, hpa, sz); +#ifdef __FreeBSD__ iommu_remove_mapping(host_domain, hpa, sz); +#endif } else { iommu_remove_mapping(vm->iommu, gpa, sz); +#ifdef __FreeBSD__ iommu_create_mapping(host_domain, hpa, hpa, sz); +#endif } gpa += PAGE_SIZE; @@ -1068,21 +1073,34 @@ vm_iommu_modify(struct vm *vm, boolean_t map) * Invalidate the cached translations associated with the domain * from which pages were removed. 
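	 * In the illumos build the host-domain mappings are left
	 * untouched above, so only the VM's own domain needs its cached
	 * translations invalidated.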
*/ +#ifdef __FreeBSD__ if (map) iommu_invalidate_tlb(host_domain); else iommu_invalidate_tlb(vm->iommu); +#else + iommu_invalidate_tlb(vm->iommu); +#endif } #define vm_iommu_unmap(vm) vm_iommu_modify((vm), FALSE) #define vm_iommu_map(vm) vm_iommu_modify((vm), TRUE) +#ifdef __FreeBSD__ int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func) +#else +int +vm_unassign_pptdev(struct vm *vm, int pptfd) +#endif /* __FreeBSD__ */ { int error; +#ifdef __FreeBSD__ error = ppt_unassign_device(vm, bus, slot, func); +#else + error = ppt_unassign_device(vm, pptfd); +#endif /* __FreeBSD__ */ if (error) return (error); @@ -1092,8 +1110,13 @@ vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func) return (0); } +#ifdef __FreeBSD__ int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func) +#else +int +vm_assign_pptdev(struct vm *vm, int pptfd) +#endif /* __FreeBSD__ */ { int error; vm_paddr_t maxaddr; @@ -1109,7 +1132,11 @@ vm_assign_pptdev(struct vm *vm, int bus, int slot, int func) vm_iommu_map(vm); } +#ifdef __FreeBSD__ error = ppt_assign_device(vm, bus, slot, func); +#else + error = ppt_assign_device(vm, pptfd); +#endif /* __FreeBSD__ */ return (error); } diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c index 2b612b20e9..d84580d04c 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c @@ -42,6 +42,7 @@ #include #include +#include "io/ppt.h" #include "io/vatpic.h" #include "io/vioapic.h" #include "io/vrtc.h" @@ -564,7 +565,6 @@ vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, break; } - /* XXXJOY: punt on these for now */ case VM_PPTDEV_MSI: { struct vm_pptdev_msi pptmsi; @@ -572,7 +572,9 @@ vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, error = EFAULT; break; } - return (ENOTTY); + error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd, + pptmsi.addr, pptmsi.msg, pptmsi.numvec); + break; } case VM_PPTDEV_MSIX: { struct vm_pptdev_msix pptmsix; @@ -581,7 +583,10 @@ vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, error = EFAULT; break; } - return (ENOTTY); + error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd, + pptmsix.idx, pptmsix.addr, pptmsix.msg, + pptmsix.vector_control); + break; } case VM_MAP_PPTDEV_MMIO: { struct vm_pptdev_mmio pptmmio; @@ -590,9 +595,20 @@ vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, error = EFAULT; break; } - return (ENOTTY); + error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa, + pptmmio.len, pptmmio.hpa); + break; + } + case VM_BIND_PPTDEV: { + struct vm_pptdev pptdev; + + if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) { + error = EFAULT; + break; + } + error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd); + break; } - case VM_BIND_PPTDEV: case VM_UNBIND_PPTDEV: { struct vm_pptdev pptdev; @@ -600,12 +616,27 @@ vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, error = EFAULT; break; } - return (ENOTTY); + error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd); + break; } + case VM_GET_PPTDEV_LIMITS: { + struct vm_pptdev_limits pptlimits; + if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) { + error = EFAULT; + break; + } + error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd, + &pptlimits.msi_limit, &pptlimits.msix_limit); + if (error == 0 && + ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) { + error = EFAULT; + break; + } + break; + } case VM_INJECT_EXCEPTION: { struct vm_exception vmexc; - if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) { 
error = EFAULT; break; @@ -2091,8 +2122,16 @@ vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) return (DDI_FAILURE); } - /* Ensure that all resources have been cleaned up */ - mutex_enter(&vmmdev_mtx); + /* + * Ensure that all resources have been cleaned up. + * + * To prevent a deadlock with iommu_cleanup() we'll fail the detach if + * vmmdev_mtx is already held. We can't wait for vmmdev_mtx with our + * devinfo locked as iommu_cleanup() tries to recursively lock each + * devinfo, including our own, while holding vmmdev_mtx. + */ + if (mutex_tryenter(&vmmdev_mtx) == 0) + return (DDI_FAILURE); mutex_enter(&vmm_mtx); if (!list_is_empty(&vmm_list) || !list_is_empty(&vmm_destroy_list)) { diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c index e2522858dd..2401774ab7 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c @@ -55,6 +55,8 @@ #include #include +#include + #include #include #include @@ -91,6 +93,19 @@ u_char const bin2bcd_data[] = { 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99 }; +void +pmap_invalidate_cache(void) +{ + cpuset_t cpuset; + + kpreempt_disable(); + cpuset_all_but(&cpuset, CPU->cpu_id); + xc_call((xc_arg_t)NULL, (xc_arg_t)NULL, (xc_arg_t)NULL, + CPUSET2BV(cpuset), (xc_func_t)invalidate_cache); + invalidate_cache(); + kpreempt_enable(); +} + vm_paddr_t pmap_kextract(vm_offset_t va) { diff --git a/usr/src/uts/i86pc/ppt/Makefile b/usr/src/uts/i86pc/ppt/Makefile new file mode 100644 index 0000000000..f231dfddf6 --- /dev/null +++ b/usr/src/uts/i86pc/ppt/Makefile @@ -0,0 +1,86 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2013 Pluribus Networks Inc. +# Copyright 2019 Joyent, Inc. +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = ppt +OBJECTS = $(PPT_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(PPT_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(USR_DRV_DIR)/$(MODULE) +CONF_SRCDIR = $(UTSBASE)/i86pc/io/vmm/io +MAPFILE = $(UTSBASE)/i86pc/io/vmm/io/ppt.mapfile + +# +# Include common rules. +# +include $(UTSBASE)/i86pc/Makefile.i86pc + +# +# Define targets +# +ALL_TARGET = $(BINARY) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) + +# +# Overrides and additions +# +ALL_BUILDS = $(ALL_BUILDSONLY64) +DEF_BUILDS = $(DEF_BUILDSONLY64) +PRE_INC_PATH = -I$(COMPAT)/freebsd -I$(COMPAT)/freebsd/amd64 \ + -I$(CONTRIB)/freebsd -I$(CONTRIB)/freebsd/amd64 +INC_PATH += -I$(UTSBASE)/i86pc/io/vmm -I$(UTSBASE)/i86pc/io/vmm/io +AS_INC_PATH += -I$(UTSBASE)/i86pc/io/vmm -I$(OBJS_DIR) + +LDFLAGS += -dy -N drv/vmm -N misc/pcie +LDFLAGS += -M $(MAPFILE) + +$(OBJS_DIR)/ppt.o := CERRWARN += -_gcc=-Wno-unused-variable + +# needs work +SMOFF += all_func_returns + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. 
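# ($(UTSBASE)/i86pc/Makefile.targ supplies the implementations of the
# def/all/clean/install targets declared above. Note also that the
# "-N drv/vmm -N misc/pcie" LDFLAGS record inter-module dependencies,
# so loading ppt pulls in the vmm driver and the PCIe framework first.)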
+# +include $(UTSBASE)/i86pc/Makefile.targ diff --git a/usr/src/uts/i86pc/sys/ppt_dev.h b/usr/src/uts/i86pc/sys/ppt_dev.h new file mode 100644 index 0000000000..e25f941f14 --- /dev/null +++ b/usr/src/uts/i86pc/sys/ppt_dev.h @@ -0,0 +1,56 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc + */ + +#ifndef _PPT_DEV_H +#define _PPT_DEV_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define PPT_IOC (('P' << 16)|('T' << 8)) + +#define PPT_CFG_READ (PPT_IOC | 0x01) +#define PPT_CFG_WRITE (PPT_IOC | 0x02) +#define PPT_BAR_QUERY (PPT_IOC | 0x03) +#define PPT_BAR_READ (PPT_IOC | 0x04) +#define PPT_BAR_WRITE (PPT_IOC | 0x05) + +#define PPT_MAXNAMELEN 32 + +struct ppt_cfg_io { + uint64_t pci_off; + uint32_t pci_width; + uint32_t pci_data; +}; +struct ppt_bar_io { + uint32_t pbi_bar; + uint32_t pbi_off; + uint32_t pbi_width; + uint32_t pbi_data; +}; + +struct ppt_bar_query { + uint32_t pbq_baridx; + uint32_t pbq_type; + uint64_t pbq_base; + uint64_t pbq_size; +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _PPT_DEV_H */ diff --git a/usr/src/uts/i86pc/sys/vmm.h b/usr/src/uts/i86pc/sys/vmm.h index 8a35d123c7..ac8f14b042 100644 --- a/usr/src/uts/i86pc/sys/vmm.h +++ b/usr/src/uts/i86pc/sys/vmm.h @@ -224,8 +224,13 @@ int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem); void vm_free_memseg(struct vm *vm, int ident); int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len); +#ifdef __FreeBSD__ int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func); int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func); +#else +int vm_assign_pptdev(struct vm *vm, int pptfd); +int vm_unassign_pptdev(struct vm *vm, int pptfd); +#endif /* __FreeBSD__ */ /* * APIs that inspect the guest memory map require only a *single* vcpu to diff --git a/usr/src/uts/i86pc/sys/vmm_dev.h b/usr/src/uts/i86pc/sys/vmm_dev.h index 58e581a60d..dd87dcb0a6 100644 --- a/usr/src/uts/i86pc/sys/vmm_dev.h +++ b/usr/src/uts/i86pc/sys/vmm_dev.h @@ -127,6 +127,7 @@ struct vm_capability { int allcpus; }; +#ifdef __FreeBSD__ struct vm_pptdev { int bus; int slot; @@ -163,6 +164,49 @@ struct vm_pptdev_msix { uint64_t addr; }; +struct vm_pptdev_limits { + int bus; + int slot; + int func; + int msi_limit; + int msix_limit; +}; +#else /* __FreeBSD__ */ +struct vm_pptdev { + int pptfd; +}; + +struct vm_pptdev_mmio { + int pptfd; + vm_paddr_t gpa; + vm_paddr_t hpa; + size_t len; +}; + +struct vm_pptdev_msi { + int vcpu; + int pptfd; + int numvec; /* 0 means disabled */ + uint64_t msg; + uint64_t addr; +}; + +struct vm_pptdev_msix { + int vcpu; + int pptfd; + int idx; + uint64_t msg; + uint32_t vector_control; + uint64_t addr; +}; + +struct vm_pptdev_limits { + int pptfd; + int msi_limit; + int msix_limit; +}; +#endif /* __FreeBSD__ */ + struct vm_nmi { int cpuid; }; @@ -307,6 +351,7 @@ enum { IOCNUM_MAP_PPTDEV_MMIO = 42, IOCNUM_PPTDEV_MSI = 43, IOCNUM_PPTDEV_MSIX = 44, + IOCNUM_GET_PPTDEV_LIMITS = 45, /* statistics */ IOCNUM_VM_STATS = 50, @@ -410,6 +455,8 @@ enum { _IOW('v', IOCNUM_PPTDEV_MSI, struct vm_pptdev_msi) #define VM_PPTDEV_MSIX \ 
_IOW('v', IOCNUM_PPTDEV_MSIX, struct vm_pptdev_msix) +#define VM_GET_PPTDEV_LIMITS \ + _IOR('v', IOCNUM_GET_PPTDEV_LIMITS, struct vm_pptdev_limits) #define VM_INJECT_NMI \ _IOW('v', IOCNUM_INJECT_NMI, struct vm_nmi) #define VM_STATS_IOC \ diff --git a/usr/src/uts/i86pc/vmm/Makefile b/usr/src/uts/i86pc/vmm/Makefile index 5b93db987b..d5dc8d7124 100644 --- a/usr/src/uts/i86pc/vmm/Makefile +++ b/usr/src/uts/i86pc/vmm/Makefile @@ -104,11 +104,12 @@ CFLAGS += -_gcc=-Wno-format # enable collection of VMM statistics CFLAGS += -DVMM_KEEP_STATS -LDFLAGS += -Nfs/dev - $(OBJS_DIR)/vmm.o := CERRWARN += -_gcc=-Wno-pointer-sign -_gcc=-Wno-type-limits $(OBJS_DIR)/svm.o := CERRWARN += -_gcc=-Wno-pointer-sign -_gcc=-Wno-type-limits +$(OBJS_DIR)/vmx.o := CERRWARN += -_gcc=-Wno-unused-variable +$(OBJS_DIR)/iommu.o := CERRWARN += -_gcc=-Wno-unused-variable +LDFLAGS += -N misc/acpica -N misc/pcie -N fs/dev LDFLAGS += -z type=kmod -M $(MAPFILE) OFFSETS_VMX = $(CONF_SRCDIR)/intel/offsets.in diff --git a/usr/src/uts/intel/ia32/ml/modstubs.s b/usr/src/uts/intel/ia32/ml/modstubs.s index 2562f9ec4b..49c0cce31c 100644 --- a/usr/src/uts/intel/ia32/ml/modstubs.s +++ b/usr/src/uts/intel/ia32/ml/modstubs.s @@ -1293,9 +1293,25 @@ fcnname/**/_info: \ #endif /* - * this is just a marker for the area of text that contains stubs + * Stubs for ppt module (bhyve PCI passthrough driver) */ +#ifndef PPT_MODULE + MODULE(ppt,drv); + WSTUB(ppt, ppt_unassign_all, nomod_zero); + WSTUB(ppt, ppt_map_mmio, nomod_einval); + WSTUB(ppt, ppt_setup_msi, nomod_einval); + WSTUB(ppt, ppt_setup_msix, nomod_einval); + WSTUB(ppt, ppt_assigned_devices, nomod_zero); + WSTUB(ppt, ppt_is_mmio, nomod_zero); + WSTUB(ppt, ppt_assign_device, nomod_einval); + WSTUB(ppt, ppt_unassign_device, nomod_einval); + WSTUB(ppt, ppt_get_limits, nomod_einval); + END_MODULE(ppt); +#endif +/* + * this is just a marker for the area of text that contains stubs + */ ENTRY_NP(stubs_end) nop -- cgit v1.2.3 From 9c3024a3457d2d1269be18124a1ac69e33000da7 Mon Sep 17 00:00:00 2001 From: Hans Rosenfeld Date: Tue, 15 Jan 2019 16:29:40 +0100 Subject: 12682 want mdb-bhyve module Portions contributed by: Andy Fiddaman Portions contributed by: John Levon Portions contributed by: Patrick Mooney Reviewed by: John Levon Approved by: Dan McDonald --- exception_lists/packaging | 2 + usr/src/cmd/bhyve/bhyverun.c | 30 +- usr/src/cmd/bhyve/bhyverun.h | 5 + usr/src/cmd/bhyve/gdb.c | 4 + usr/src/cmd/bhyve/spinup_ap.c | 4 + usr/src/cmd/mdb/common/kmdb/kmdb_kvm.c | 2 + usr/src/cmd/mdb/common/mdb/mdb_cmds.c | 7 +- usr/src/cmd/mdb/common/mdb/mdb_create.c | 9 +- usr/src/cmd/mdb/common/mdb/mdb_fmt.c | 6 + usr/src/cmd/mdb/common/mdb/mdb_kproc.c | 3 +- usr/src/cmd/mdb/common/mdb/mdb_kvm.c | 2 + usr/src/cmd/mdb/common/mdb/mdb_main.c | 32 +- usr/src/cmd/mdb/common/mdb/mdb_rawfile.c | 4 + usr/src/cmd/mdb/common/mdb/mdb_target.c | 4 + usr/src/cmd/mdb/common/mdb/mdb_target.h | 13 +- usr/src/cmd/mdb/i86pc/modules/unix/amd64/Makefile | 3 +- usr/src/cmd/mdb/i86pc/modules/unix/unix.c | 79 +- usr/src/cmd/mdb/i86xpv/modules/unix/amd64/Makefile | 3 +- usr/src/cmd/mdb/intel/Makefile.kmdb | 3 +- usr/src/cmd/mdb/intel/amd64/Makefile.kmdb | 5 +- usr/src/cmd/mdb/intel/amd64/mdb/Makefile | 17 +- usr/src/cmd/mdb/intel/kmdb/kaif.c | 34 +- usr/src/cmd/mdb/intel/mdb/kvm_amd64dep.c | 1 + usr/src/cmd/mdb/intel/mdb/kvm_ia32dep.c | 1 + usr/src/cmd/mdb/intel/mdb/mdb_amd64util.c | 35 +- usr/src/cmd/mdb/intel/mdb/mdb_amd64util.h | 7 +- usr/src/cmd/mdb/intel/mdb/mdb_bhyve.c | 1461 ++++++++++++++++++++ 
usr/src/cmd/mdb/intel/mdb/mdb_ia32util.c | 83 +- usr/src/cmd/mdb/intel/mdb/mdb_ia32util.h | 7 +- usr/src/cmd/mdb/intel/mdb/mdb_isautil.h | 7 +- usr/src/cmd/mdb/intel/mdb/mdb_kreg.h | 12 + usr/src/cmd/mdb/intel/mdb/mdb_x86util.c | 215 +++ usr/src/cmd/mdb/intel/mdb/mdb_x86util.h | 68 + usr/src/cmd/mdb/intel/mdb/proc_amd64dep.c | 4 +- usr/src/cmd/mdb/intel/mdb/proc_ia32dep.c | 6 +- usr/src/compat/freebsd/amd64/machine/specialreg.h | 1 + usr/src/lib/Makefile | 3 + usr/src/lib/libvmm/Makefile | 43 + usr/src/lib/libvmm/Makefile.com | 51 + usr/src/lib/libvmm/amd64/Makefile | 19 + usr/src/lib/libvmm/libvmm.c | 860 ++++++++++++ usr/src/lib/libvmm/libvmm.h | 122 ++ usr/src/lib/libvmm/mapfile-vers | 60 + usr/src/lib/libvmmapi/common/mapfile-vers | 2 + usr/src/lib/libvmmapi/common/vmmapi.c | 51 +- usr/src/lib/libvmmapi/common/vmmapi.h | 10 + usr/src/man/man1/mdb.1 | 26 +- usr/src/pkg/manifests/system-library-bhyve.mf | 1 + usr/src/uts/i86pc/io/vmm/vmm.c | 4 + usr/src/uts/intel/sys/controlregs.h | 22 +- usr/src/uts/intel/sys/debugreg.h | 7 + 51 files changed, 3273 insertions(+), 187 deletions(-) create mode 100644 usr/src/cmd/mdb/intel/mdb/mdb_bhyve.c create mode 100644 usr/src/cmd/mdb/intel/mdb/mdb_x86util.c create mode 100644 usr/src/cmd/mdb/intel/mdb/mdb_x86util.h create mode 100644 usr/src/lib/libvmm/Makefile create mode 100644 usr/src/lib/libvmm/Makefile.com create mode 100644 usr/src/lib/libvmm/amd64/Makefile create mode 100644 usr/src/lib/libvmm/libvmm.c create mode 100644 usr/src/lib/libvmm/libvmm.h create mode 100644 usr/src/lib/libvmm/mapfile-vers (limited to 'usr/src/uts/i86pc') diff --git a/exception_lists/packaging b/exception_lists/packaging index 981eb8aa91..556a6012f6 100644 --- a/exception_lists/packaging +++ b/exception_lists/packaging @@ -819,8 +819,10 @@ usr/lib/libsff.so # # private bhyve files # +lib/amd64/libvmm.so i386 lib/amd64/libvmmapi.so i386 usr/include/libppt.h i386 +usr/include/libvmm.h i386 usr/include/vmmapi.h i386 usr/lib/amd64/libppt.so i386 usr/lib/libppt.so i386 diff --git a/usr/src/cmd/bhyve/bhyverun.c b/usr/src/cmd/bhyve/bhyverun.c index 928d2dc811..07489ad8d5 100644 --- a/usr/src/cmd/bhyve/bhyverun.c +++ b/usr/src/cmd/bhyve/bhyverun.c @@ -251,6 +251,9 @@ usage(int code) " -A: create ACPI tables\n" " -c: number of cpus and/or topology specification\n" " -C: include guest memory in core file\n" +#ifndef __FreeBSD__ + " -d: suspend cpu at boot\n" +#endif " -e: exit on unhandled I/O access\n" " -g: gdb port\n" " -h: help\n" @@ -500,8 +503,14 @@ fbsdrun_start_thread(void *param) return (NULL); } +#ifdef __FreeBSD__ void fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip) +#else +void +fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip, + bool suspend) +#endif { int error; @@ -519,6 +528,11 @@ fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip) CPU_SET_ATOMIC(newcpu, &cpumask); +#ifndef __FreeBSD__ + if (suspend) + (void) vm_suspend_cpu(ctx, newcpu); +#endif + /* * Set up the vmexit struct to allow execution to start * at the given RIP @@ -1057,6 +1071,9 @@ main(int argc, char *argv[]) int max_vcpus, mptgen, memflags; int rtc_localtime; bool gdb_stop; +#ifndef __FreeBSD__ + bool suspend = false; +#endif struct vmctx *ctx; uint64_t rip; size_t memsize; @@ -1078,7 +1095,7 @@ main(int argc, char *argv[]) #ifdef __FreeBSD__ optstr = "abehuwxACHIPSWYp:g:G:c:s:m:l:B:U:"; #else - optstr = "abehuwxACHIPSWYg:G:c:s:m:l:B:U:"; + optstr = "abdehuwxACHIPSWYg:G:c:s:m:l:B:U:"; #endif while ((c = getopt(argc, argv, 
optstr)) != -1) { switch (c) { @@ -1097,7 +1114,11 @@ main(int argc, char *argv[]) "configuration '%s'", optarg); } break; -#ifdef __FreeBSD__ +#ifndef __FreeBSD__ + case 'd': + suspend = true; + break; +#else case 'p': if (pincpu_parse(optarg) != 0) { errx(EX_USAGE, "invalid vcpu pinning " @@ -1331,8 +1352,11 @@ main(int argc, char *argv[]) /* * Add CPU 0 */ +#ifdef __FreeBSD__ fbsdrun_addcpu(ctx, BSP, BSP, rip); - +#else + fbsdrun_addcpu(ctx, BSP, BSP, rip, suspend); +#endif /* * Head off to the main event dispatch loop */ diff --git a/usr/src/cmd/bhyve/bhyverun.h b/usr/src/cmd/bhyve/bhyverun.h index 78b3f1111f..8df8e01a73 100644 --- a/usr/src/cmd/bhyve/bhyverun.h +++ b/usr/src/cmd/bhyve/bhyverun.h @@ -61,7 +61,12 @@ extern pthread_cond_t bcons_wait_done; void *paddr_guest2host(struct vmctx *ctx, uintptr_t addr, size_t len); void fbsdrun_set_capabilities(struct vmctx *ctx, int cpu); +#ifdef __FreeBSD__ void fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip); +#else +void fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip, + bool suspend); +#endif int fbsdrun_muxed(void); int fbsdrun_vmexit_on_hlt(void); int fbsdrun_vmexit_on_pause(void); diff --git a/usr/src/cmd/bhyve/gdb.c b/usr/src/cmd/bhyve/gdb.c index 20c2de1dec..71cb780544 100644 --- a/usr/src/cmd/bhyve/gdb.c +++ b/usr/src/cmd/bhyve/gdb.c @@ -75,7 +75,11 @@ static cpuset_t vcpus_active, vcpus_suspended, vcpus_waiting; static pthread_mutex_t gdb_lock; static pthread_cond_t idle_vcpus; static bool stop_pending, first_stop; +#ifdef __FreeBSD__ static int stepping_vcpu, stopped_vcpu; +#else +static int stepping_vcpu = -1, stopped_vcpu = -1; +#endif /* * An I/O buffer contains 'capacity' bytes of room at 'data'. For a diff --git a/usr/src/cmd/bhyve/spinup_ap.c b/usr/src/cmd/bhyve/spinup_ap.c index 7c4186f5ed..ecdd05694c 100644 --- a/usr/src/cmd/bhyve/spinup_ap.c +++ b/usr/src/cmd/bhyve/spinup_ap.c @@ -100,7 +100,11 @@ spinup_ap(struct vmctx *ctx, int vcpu, int newcpu, uint64_t rip) spinup_ap_realmode(ctx, newcpu, &rip); +#ifdef __FreeBSD__ fbsdrun_addcpu(ctx, vcpu, newcpu, rip); +#else + fbsdrun_addcpu(ctx, vcpu, newcpu, rip, false); +#endif return (newcpu); } diff --git a/usr/src/cmd/mdb/common/kmdb/kmdb_kvm.c b/usr/src/cmd/mdb/common/kmdb/kmdb_kvm.c index 90fd098d1c..628503d179 100644 --- a/usr/src/cmd/mdb/common/kmdb/kmdb_kvm.c +++ b/usr/src/cmd/mdb/common/kmdb/kmdb_kvm.c @@ -286,6 +286,8 @@ kmt_vtop(mdb_tgt_t *t, mdb_tgt_as_t as, uintptr_t va, physaddr_t *pap) case (uintptr_t)MDB_TGT_AS_IO: return (set_errno(EINVAL)); case (uintptr_t)MDB_TGT_AS_VIRT: + case (uintptr_t)MDB_TGT_AS_VIRT_I: + case (uintptr_t)MDB_TGT_AS_VIRT_S: if ((asp = (struct as *)kmt_read_kas(t)) == NULL) return (-1); /* errno is set for us */ break; diff --git a/usr/src/cmd/mdb/common/mdb/mdb_cmds.c b/usr/src/cmd/mdb/common/mdb/mdb_cmds.c index b7b15aa507..10c622443a 100644 --- a/usr/src/cmd/mdb/common/mdb/mdb_cmds.c +++ b/usr/src/cmd/mdb/common/mdb/mdb_cmds.c @@ -2090,7 +2090,7 @@ cmd_dis(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) if (opt_f) as = MDB_TGT_AS_FILE; else - as = MDB_TGT_AS_VIRT; + as = MDB_TGT_AS_VIRT_I; if (opt_w == FALSE) { n++; @@ -2635,8 +2635,9 @@ tgt_status(const mdb_tgt_status_t *tsp) return (DCMD_OK); if (tsp->st_pc != 0) { - if (mdb_dis_ins2str(mdb.m_disasm, mdb.m_target, MDB_TGT_AS_VIRT, - buf, sizeof (buf), tsp->st_pc) != tsp->st_pc) + if (mdb_dis_ins2str(mdb.m_disasm, mdb.m_target, + MDB_TGT_AS_VIRT_I, buf, sizeof (buf), tsp->st_pc) != + tsp->st_pc) format = "target stopped 
at:\n%-#16a%8T%s\n"; else format = "target stopped at %a:\n"; diff --git a/usr/src/cmd/mdb/common/mdb/mdb_create.c b/usr/src/cmd/mdb/common/mdb/mdb_create.c index ea30457ef0..86490bf825 100644 --- a/usr/src/cmd/mdb/common/mdb/mdb_create.c +++ b/usr/src/cmd/mdb/common/mdb/mdb_create.c @@ -24,7 +24,9 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright 2018 Joyent, Inc. + */ #include #include @@ -51,6 +53,11 @@ mdb_create_builtin_tgts(void) if ((mp = mdb_module_load_builtin("mdb_raw")) != NULL) mp->mod_tgt_ctor = mdb_rawfile_tgt_create; + +#ifdef __amd64 + if ((mp = mdb_module_load_builtin("mdb_bhyve")) != NULL) + mp->mod_tgt_ctor = mdb_bhyve_tgt_create; +#endif } void diff --git a/usr/src/cmd/mdb/common/mdb/mdb_fmt.c b/usr/src/cmd/mdb/common/mdb/mdb_fmt.c index 662cf7bea5..6a745b2ac4 100644 --- a/usr/src/cmd/mdb/common/mdb/mdb_fmt.c +++ b/usr/src/cmd/mdb/common/mdb/mdb_fmt.c @@ -411,6 +411,9 @@ fmt_instr(mdb_tgt_t *t, mdb_tgt_as_t as, mdb_tgt_addr_t addr, size_t cnt) char buf[BUFSIZ]; uintptr_t naddr; + if (as == MDB_TGT_AS_VIRT) + as = MDB_TGT_AS_VIRT_I; + while (cnt-- != 0) { naddr = mdb_dis_ins2str(mdb.m_disasm, t, as, buf, sizeof (buf), addr); @@ -431,6 +434,9 @@ fmt_dotinstr(mdb_tgt_t *t, mdb_tgt_as_t as, mdb_tgt_addr_t addr, size_t cnt) uintptr_t naddr; uint32_t i; + if (as == MDB_TGT_AS_VIRT) + as = MDB_TGT_AS_VIRT_I; + for (mdb_iob_clrflags(mdb.m_out, oflags); cnt-- != 0; addr = naddr) { if (mdb_tgt_aread(t, as, &i, sizeof (i), addr) != sizeof (i)) { warn("failed to read data from target"); diff --git a/usr/src/cmd/mdb/common/mdb/mdb_kproc.c b/usr/src/cmd/mdb/common/mdb/mdb_kproc.c index e13dcea53f..4eeb8ac708 100644 --- a/usr/src/cmd/mdb/common/mdb/mdb_kproc.c +++ b/usr/src/cmd/mdb/common/mdb/mdb_kproc.c @@ -553,7 +553,8 @@ kp_vtop(mdb_tgt_t *t, mdb_tgt_as_t as, uintptr_t va, physaddr_t *pap) kp_data_t *kp = t->t_data; physaddr_t pa; - if (as != MDB_TGT_AS_VIRT) + if (as != MDB_TGT_AS_VIRT && as != MDB_TGT_AS_VIRT_I && + as != MDB_TGT_AS_VIRT_S) return (set_errno(EINVAL)); if ((pa = kvm_physaddr(kp->kp_cookie, kp->kp_as, va)) != -1ULL) { diff --git a/usr/src/cmd/mdb/common/mdb/mdb_kvm.c b/usr/src/cmd/mdb/common/mdb/mdb_kvm.c index fe280c05c7..27ca238ca9 100644 --- a/usr/src/cmd/mdb/common/mdb/mdb_kvm.c +++ b/usr/src/cmd/mdb/common/mdb/mdb_kvm.c @@ -814,6 +814,8 @@ kt_vtop(mdb_tgt_t *t, mdb_tgt_as_t as, uintptr_t va, physaddr_t *pap) case (uintptr_t)MDB_TGT_AS_IO: return (set_errno(EINVAL)); case (uintptr_t)MDB_TGT_AS_VIRT: + case (uintptr_t)MDB_TGT_AS_VIRT_I: + case (uintptr_t)MDB_TGT_AS_VIRT_S: asp = kt->k_as; break; default: diff --git a/usr/src/cmd/mdb/common/mdb/mdb_main.c b/usr/src/cmd/mdb/common/mdb/mdb_main.c index eea4b5b60e..8747464328 100644 --- a/usr/src/cmd/mdb/common/mdb/mdb_main.c +++ b/usr/src/cmd/mdb/common/mdb/mdb_main.c @@ -322,12 +322,13 @@ static void usage(int status) { mdb_iob_printf(mdb.m_err, "Usage: %s [-fkmuwyAFKMSUW] [+/-o option] " - "[-p pid] [-s dist] [-I path] [-L path]\n\t[-P prompt] " + "[-b VM] [-p pid] [-s dist] [-I path] [-L path]\n\t[-P prompt] " "[-R root] [-V dis-version] [-e expr] " "[object [core] | core | suffix]\n\n", mdb.m_pname); mdb_iob_puts(mdb.m_err, + "\t-b attach to specified bhyve VM\n" "\t-e evaluate expr and return status\n" "\t-f force raw file debugging mode\n" "\t-k force kernel debugging mode\n" @@ -405,6 +406,19 @@ identify_xvm_file(const char *file, int *longmode) } #endif /* __x86 */ +#ifndef __amd64 +/* + * There is no bhyve target in a 32bit x86 or any SPARC mdb. 
This dummy helps + * keep the code simpler. + */ +/*ARGSUSED*/ +static int +mdb_bhyve_tgt_create(mdb_tgt_t *t, int argc, const char *argv[]) +{ + return (set_errno(EINVAL)); +} +#endif + int main(int argc, char *argv[], char *envp[]) { @@ -424,6 +438,7 @@ main(int argc, char *argv[], char *envp[]) const char *Iflag = NULL, *Lflag = NULL, *Vflag = NULL, *pidarg = NULL; const char *eflag = NULL; int fflag = 0, Kflag = 0, Rflag = 0, Sflag = 0, Oflag = 0, Uflag = 0; + int bflag = 0; int ttylike; int longmode = 0; @@ -513,8 +528,12 @@ main(int argc, char *argv[], char *envp[]) while (optind < argc) { while ((c = getopt(argc, argv, - "e:fkmo:p:s:uwyACD:FI:KL:MOP:R:SUV:W")) != (int)EOF) { + "be:fkmo:p:s:uwyACD:FI:KL:MOP:R:SUV:W")) != (int)EOF) { switch (c) { + case 'b': + bflag++; + tgt_ctor = mdb_bhyve_tgt_create; + break; case 'e': if (eflag != NULL) { warn("-e already specified\n"); @@ -830,6 +849,15 @@ main(int argc, char *argv[], char *envp[]) if (fflag) goto tcreate; /* skip re-exec and just create target */ + /* bhyve: directly create target, or re-exec in case of 32bit */ + if (bflag) { +#ifndef __amd64 + goto reexec; +#else + goto tcreate; +#endif + } + /* * If we just have an object file name, and that file doesn't * exist, and it's a string of digits, infer it to be a diff --git a/usr/src/cmd/mdb/common/mdb/mdb_rawfile.c b/usr/src/cmd/mdb/common/mdb/mdb_rawfile.c index 086639de74..d21ad0f38a 100644 --- a/usr/src/cmd/mdb/common/mdb/mdb_rawfile.c +++ b/usr/src/cmd/mdb/common/mdb/mdb_rawfile.c @@ -143,6 +143,8 @@ rf_aread(mdb_tgt_t *t, mdb_tgt_as_t as, void *buf, { switch ((uintptr_t)as) { case (uintptr_t)MDB_TGT_AS_VIRT: + case (uintptr_t)MDB_TGT_AS_VIRT_I: + case (uintptr_t)MDB_TGT_AS_VIRT_S: case (uintptr_t)MDB_TGT_AS_PHYS: if (RF_CORE(t->t_data) != NULL) return (rf_read(RF_CORE(t->t_data), buf, len, addr)); @@ -160,6 +162,8 @@ rf_awrite(mdb_tgt_t *t, mdb_tgt_as_t as, const void *buf, { switch ((uintptr_t)as) { case (uintptr_t)MDB_TGT_AS_VIRT: + case (uintptr_t)MDB_TGT_AS_VIRT_I: + case (uintptr_t)MDB_TGT_AS_VIRT_S: case (uintptr_t)MDB_TGT_AS_PHYS: if (RF_CORE(t->t_data) != NULL) return (rf_write(RF_CORE(t->t_data), buf, len, addr)); diff --git a/usr/src/cmd/mdb/common/mdb/mdb_target.c b/usr/src/cmd/mdb/common/mdb/mdb_target.c index 17aef0aac1..e0ae29bd99 100644 --- a/usr/src/cmd/mdb/common/mdb/mdb_target.c +++ b/usr/src/cmd/mdb/common/mdb/mdb_target.c @@ -398,6 +398,8 @@ mdb_tgt_aread(mdb_tgt_t *t, mdb_tgt_as_t as, switch ((uintptr_t)as) { case (uintptr_t)MDB_TGT_AS_VIRT: + case (uintptr_t)MDB_TGT_AS_VIRT_I: + case (uintptr_t)MDB_TGT_AS_VIRT_S: return (t->t_ops->t_vread(t, buf, n, addr)); case (uintptr_t)MDB_TGT_AS_PHYS: return (t->t_ops->t_pread(t, buf, n, addr)); @@ -421,6 +423,8 @@ mdb_tgt_awrite(mdb_tgt_t *t, mdb_tgt_as_t as, switch ((uintptr_t)as) { case (uintptr_t)MDB_TGT_AS_VIRT: + case (uintptr_t)MDB_TGT_AS_VIRT_I: + case (uintptr_t)MDB_TGT_AS_VIRT_S: return (t->t_ops->t_vwrite(t, buf, n, addr)); case (uintptr_t)MDB_TGT_AS_PHYS: return (t->t_ops->t_pwrite(t, buf, n, addr)); diff --git a/usr/src/cmd/mdb/common/mdb/mdb_target.h b/usr/src/cmd/mdb/common/mdb/mdb_target.h index c36b85e2f3..e385caa38e 100644 --- a/usr/src/cmd/mdb/common/mdb/mdb_target.h +++ b/usr/src/cmd/mdb/common/mdb/mdb_target.h @@ -64,6 +64,9 @@ extern int mdb_kvm_tgt_create(mdb_tgt_t *, int, const char *[]); extern int mdb_proc_tgt_create(mdb_tgt_t *, int, const char *[]); extern int mdb_kproc_tgt_create(mdb_tgt_t *, int, const char *[]); extern int mdb_rawfile_tgt_create(mdb_tgt_t *, int, const char *[]); 
+#ifdef __amd64 +extern int mdb_bhyve_tgt_create(mdb_tgt_t *, int, const char *[]); +#endif #else extern int kmdb_kvm_create(mdb_tgt_t *, int, const char *[]); #endif @@ -139,10 +142,12 @@ typedef void * mdb_tgt_as_t; /* Opaque address space id */ typedef uint64_t mdb_tgt_addr_t; /* Generic unsigned address */ typedef uint64_t physaddr_t; /* Physical memory address */ -#define MDB_TGT_AS_VIRT ((mdb_tgt_as_t)-1L) /* Virtual address space */ -#define MDB_TGT_AS_PHYS ((mdb_tgt_as_t)-2L) /* Physical address space */ -#define MDB_TGT_AS_FILE ((mdb_tgt_as_t)-3L) /* Object file address space */ -#define MDB_TGT_AS_IO ((mdb_tgt_as_t)-4L) /* I/o address space */ +#define MDB_TGT_AS_VIRT ((mdb_tgt_as_t)-1L) /* Virtual address space: */ +#define MDB_TGT_AS_VIRT_I ((mdb_tgt_as_t)-2L) /* special case for code */ +#define MDB_TGT_AS_VIRT_S ((mdb_tgt_as_t)-3L) /* special case for stack */ +#define MDB_TGT_AS_PHYS ((mdb_tgt_as_t)-4L) /* Physical address space */ +#define MDB_TGT_AS_FILE ((mdb_tgt_as_t)-5L) /* Object file address space */ +#define MDB_TGT_AS_IO ((mdb_tgt_as_t)-6L) /* I/o address space */ extern ssize_t mdb_tgt_aread(mdb_tgt_t *, mdb_tgt_as_t, void *, size_t, mdb_tgt_addr_t); diff --git a/usr/src/cmd/mdb/i86pc/modules/unix/amd64/Makefile b/usr/src/cmd/mdb/i86pc/modules/unix/amd64/Makefile index efcfad5375..54bc62a086 100644 --- a/usr/src/cmd/mdb/i86pc/modules/unix/amd64/Makefile +++ b/usr/src/cmd/mdb/i86pc/modules/unix/amd64/Makefile @@ -22,7 +22,7 @@ # Copyright 2007 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# Copyright 2018 Joyent, Inc. +# Copyright 2019 Joyent, Inc. MODULE = unix.so MDBTGT = kvm @@ -38,6 +38,7 @@ include ../../../../Makefile.module CPPFLAGS += -DMP -D_MACHDEP CPPFLAGS += -I../../../../common +CPPFLAGS += -I../../../../intel CPPFLAGS += -I$(SRC)/uts/i86pc CPPFLAGS += -I$(SRC)/uts/intel diff --git a/usr/src/cmd/mdb/i86pc/modules/unix/unix.c b/usr/src/cmd/mdb/i86pc/modules/unix/unix.c index e91321b235..c3b7e809e4 100644 --- a/usr/src/cmd/mdb/i86pc/modules/unix/unix.c +++ b/usr/src/cmd/mdb/i86pc/modules/unix/unix.c @@ -20,11 +20,13 @@ */ /* * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2018 Joyent, Inc. + * Copyright 2018 OmniOS Community Edition (OmniOSce) Association. + * Copyright 2019 Joyent, Inc. 
*/ #include #include +#include #include #include #include @@ -963,73 +965,24 @@ x86_featureset_dcmd(uintptr_t addr, uint_t flags, int argc, static int sysregs_dcmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) { - ulong_t cr0, cr2, cr3, cr4; + struct sysregs sregs = { 0 }; desctbr_t gdtr; + boolean_t longmode = B_FALSE; - static const mdb_bitmask_t cr0_flag_bits[] = { - { "PE", CR0_PE, CR0_PE }, - { "MP", CR0_MP, CR0_MP }, - { "EM", CR0_EM, CR0_EM }, - { "TS", CR0_TS, CR0_TS }, - { "ET", CR0_ET, CR0_ET }, - { "NE", CR0_NE, CR0_NE }, - { "WP", CR0_WP, CR0_WP }, - { "AM", CR0_AM, CR0_AM }, - { "NW", CR0_NW, CR0_NW }, - { "CD", CR0_CD, CR0_CD }, - { "PG", CR0_PG, CR0_PG }, - { NULL, 0, 0 } - }; - - static const mdb_bitmask_t cr3_flag_bits[] = { - { "PCD", CR3_PCD, CR3_PCD }, - { "PWT", CR3_PWT, CR3_PWT }, - { NULL, 0, 0, } - }; - - static const mdb_bitmask_t cr4_flag_bits[] = { - { "VME", CR4_VME, CR4_VME }, - { "PVI", CR4_PVI, CR4_PVI }, - { "TSD", CR4_TSD, CR4_TSD }, - { "DE", CR4_DE, CR4_DE }, - { "PSE", CR4_PSE, CR4_PSE }, - { "PAE", CR4_PAE, CR4_PAE }, - { "MCE", CR4_MCE, CR4_MCE }, - { "PGE", CR4_PGE, CR4_PGE }, - { "PCE", CR4_PCE, CR4_PCE }, - { "OSFXSR", CR4_OSFXSR, CR4_OSFXSR }, - { "OSXMMEXCPT", CR4_OSXMMEXCPT, CR4_OSXMMEXCPT }, - { "VMXE", CR4_VMXE, CR4_VMXE }, - { "SMXE", CR4_SMXE, CR4_SMXE }, - { "PCIDE", CR4_PCIDE, CR4_PCIDE }, - { "OSXSAVE", CR4_OSXSAVE, CR4_OSXSAVE }, - { "SMEP", CR4_SMEP, CR4_SMEP }, - { "SMAP", CR4_SMAP, CR4_SMAP }, - { NULL, 0, 0 } - }; - - cr0 = kmdb_unix_getcr0(); - cr2 = kmdb_unix_getcr2(); - cr3 = kmdb_unix_getcr3(); - cr4 = kmdb_unix_getcr4(); - - kmdb_unix_getgdtr(&gdtr); +#ifdef __amd64 + longmode = B_TRUE; +#endif - mdb_printf("%%cr0 = 0x%lx <%b>\n", cr0, cr0, cr0_flag_bits); - mdb_printf("%%cr2 = 0x%lx <%a>\n", cr2, cr2); + sregs.sr_cr0 = kmdb_unix_getcr0(); + sregs.sr_cr2 = kmdb_unix_getcr2(); + sregs.sr_cr3 = kmdb_unix_getcr3(); + sregs.sr_cr4 = kmdb_unix_getcr4(); - if ((cr4 & CR4_PCIDE)) { - mdb_printf("%%cr3 = 0x%lx \n", cr3, - cr3 >> MMU_PAGESHIFT, cr3 & MMU_PAGEOFFSET); - } else { - mdb_printf("%%cr3 = 0x%lx \n", cr3, - cr3 >> MMU_PAGESHIFT, cr3, cr3_flag_bits); - } - - mdb_printf("%%cr4 = 0x%lx <%b>\n", cr4, cr4, cr4_flag_bits); + kmdb_unix_getgdtr(&gdtr); + sregs.sr_gdtr.d_base = gdtr.dtr_base; + sregs.sr_gdtr.d_lim = gdtr.dtr_limit; - mdb_printf("%%gdtr.base = 0x%lx, %%gdtr.limit = 0x%hx\n", - gdtr.dtr_base, gdtr.dtr_limit); + mdb_x86_print_sysregs(&sregs, longmode); return (DCMD_OK); } diff --git a/usr/src/cmd/mdb/i86xpv/modules/unix/amd64/Makefile b/usr/src/cmd/mdb/i86xpv/modules/unix/amd64/Makefile index 32c1fe33fd..5854a1610e 100644 --- a/usr/src/cmd/mdb/i86xpv/modules/unix/amd64/Makefile +++ b/usr/src/cmd/mdb/i86xpv/modules/unix/amd64/Makefile @@ -22,7 +22,7 @@ # Copyright 2007 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# Copyright 2018 Joyent, Inc. +# Copyright 2019 Joyent, Inc. MODULE = unix.so MDBTGT = kvm @@ -40,6 +40,7 @@ MODSRCS_DIR = ../../../../i86pc/modules/unix/ CPPFLAGS += -DMP -D_MACHDEP -D__xpv CPPFLAGS += -I../../../../common +CPPFLAGS += -I../../../../intel CPPFLAGS += -I$(SRC)/uts/common CPPFLAGS += -I$(SRC)/uts/i86xpv CPPFLAGS += -I$(SRC)/uts/i86pc diff --git a/usr/src/cmd/mdb/intel/Makefile.kmdb b/usr/src/cmd/mdb/intel/Makefile.kmdb index 1e9efcbc83..6151a2e46a 100644 --- a/usr/src/cmd/mdb/intel/Makefile.kmdb +++ b/usr/src/cmd/mdb/intel/Makefile.kmdb @@ -22,7 +22,7 @@ # Copyright 2009 Sun Microsystems, Inc. All rights reserved. 
# Use is subject to license terms. # -# Copyright 2019, Joyent, Inc. +# Copyright 2019 Joyent, Inc. # PROMSRCS += \ @@ -68,6 +68,7 @@ KMDBLIBS = $(STANDLIBS) ../mdb_ks/kmod/mdb_ks MAPFILE_SOURCES = \ $(MAPFILE_SOURCES_COMMON) \ ../../kmdb/kmdb_dpi_isadep.h \ + ../../mdb/mdb_x86util.h \ $(MAPFILE_SOURCES_$(MACH)) %.o: ../../../../../uts/intel/promif/%.c diff --git a/usr/src/cmd/mdb/intel/amd64/Makefile.kmdb b/usr/src/cmd/mdb/intel/amd64/Makefile.kmdb index 61cf1541a2..8cee9ff049 100644 --- a/usr/src/cmd/mdb/intel/amd64/Makefile.kmdb +++ b/usr/src/cmd/mdb/intel/amd64/Makefile.kmdb @@ -22,6 +22,8 @@ # Copyright 2007 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # +# Copyright 2019 Joyent, Inc. +# KMDBML += \ kaif_invoke.s \ @@ -29,6 +31,7 @@ KMDBML += \ KMDBSRCS += \ kmdb_makecontext.c \ - mdb_amd64util.c + mdb_amd64util.c \ + mdb_x86util.c SACPPFLAGS = -D__$(MACH64) -U__$(MACH) diff --git a/usr/src/cmd/mdb/intel/amd64/mdb/Makefile b/usr/src/cmd/mdb/intel/amd64/mdb/Makefile index 3dfa7a34d5..918aa71ea1 100644 --- a/usr/src/cmd/mdb/intel/amd64/mdb/Makefile +++ b/usr/src/cmd/mdb/intel/amd64/mdb/Makefile @@ -22,10 +22,15 @@ # Copyright 2007 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # +# Copyright 2019 Joyent, Inc. +# SRCS = kvm_amd64dep.c \ kvm_isadep.c \ mdb_amd64util.c \ + mdb_ia32util.c \ + mdb_x86util.c \ + mdb_bhyve.c \ proc_amd64dep.c %.o: %.c @@ -42,12 +47,14 @@ SRCS = kvm_amd64dep.c \ %.ln: ../../mdb/%.c $(LINT.c) -c $< -include ../../../../Makefile.cmd -include ../../../../Makefile.cmd.64 -include ../../Makefile.amd64 -include ../../../Makefile.mdb +include $(SRC)/cmd/Makefile.cmd +include $(SRC)/cmd/Makefile.cmd.64 +include $(SRC)/cmd/mdb/intel/Makefile.amd64 +include $(SRC)/cmd/mdb/Makefile.mdb +include $(SRC)/Makefile.psm CPPFLAGS += -I../../mdb -LDLIBS += -lsaveargs + +LDLIBS += -lsaveargs -lvmm install: all $(ISAEXEC) $(ROOTPROG64) $(ROOTLINK64) diff --git a/usr/src/cmd/mdb/intel/kmdb/kaif.c b/usr/src/cmd/mdb/intel/kmdb/kaif.c index 39cc9b620a..e4f80ad228 100644 --- a/usr/src/cmd/mdb/intel/kmdb/kaif.c +++ b/usr/src/cmd/mdb/intel/kmdb/kaif.c @@ -296,12 +296,12 @@ kaif_brkpt_arm(uintptr_t addr, mdb_instr_t *instrp) return (set_errno(EMDB_TGTNOTSUP)); } - if (mdb_tgt_vread(mdb.m_target, instrp, sizeof (mdb_instr_t), addr) != - sizeof (mdb_instr_t)) + if (mdb_tgt_aread(mdb.m_target, MDB_TGT_AS_VIRT_I, instrp, + sizeof (mdb_instr_t), addr) != sizeof (mdb_instr_t)) return (-1); /* errno is set for us */ - if (mdb_tgt_vwrite(mdb.m_target, &bkpt, sizeof (mdb_instr_t), addr) != - sizeof (mdb_instr_t)) + if (mdb_tgt_awrite(mdb.m_target, MDB_TGT_AS_VIRT_I, &bkpt, + sizeof (mdb_instr_t), addr) != sizeof (mdb_instr_t)) return (-1); /* errno is set for us */ return (0); @@ -310,8 +310,8 @@ kaif_brkpt_arm(uintptr_t addr, mdb_instr_t *instrp) static int kaif_brkpt_disarm(uintptr_t addr, mdb_instr_t instrp) { - if (mdb_tgt_vwrite(mdb.m_target, &instrp, sizeof (mdb_instr_t), addr) != - sizeof (mdb_instr_t)) + if (mdb_tgt_awrite(mdb.m_target, MDB_TGT_AS_VIRT_I, &instrp, + sizeof (mdb_instr_t), addr) != sizeof (mdb_instr_t)) return (-1); /* errno is set for us */ return (0); @@ -486,7 +486,7 @@ kaif_step(void) } if ((npc = mdb_dis_nextins(mdb.m_disasm, mdb.m_target, - MDB_TGT_AS_VIRT, pc)) == pc) { + MDB_TGT_AS_VIRT_I, pc)) == pc) { warn("failed to decode instruction at %a for step\n", pc); return (set_errno(EINVAL)); } @@ -498,8 +498,8 @@ kaif_step(void) * versus their 64-bit counterparts. 
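 * The instruction bytes are now fetched through the MDB_TGT_AS_VIRT_I
 * address space rather than via plain mdb_tgt_vread() calls, so targets
 * that translate code fetches differently from data accesses (notably
 * the new bhyve target) return the correct bytes here.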
*/ do { - if (mdb_tgt_vread(mdb.m_target, &instr, sizeof (mdb_instr_t), - pc + pcoff) != sizeof (mdb_instr_t)) { + if (mdb_tgt_aread(mdb.m_target, MDB_TGT_AS_VIRT_I, &instr, + sizeof (mdb_instr_t), pc + pcoff) != sizeof (mdb_instr_t)) { warn("failed to read at %p for step", (void *)(pc + pcoff)); return (-1); @@ -518,8 +518,8 @@ kaif_step(void) return (set_errno(EMDB_TGTNOTSUP)); case M_ESC: - if (mdb_tgt_vread(mdb.m_target, &instr, sizeof (mdb_instr_t), - pc + pcoff) != sizeof (mdb_instr_t)) { + if (mdb_tgt_aread(mdb.m_target, MDB_TGT_AS_VIRT_I, &instr, + sizeof (mdb_instr_t), pc + pcoff) != sizeof (mdb_instr_t)) { warn("failed to read at %p for step", (void *)(pc + pcoff)); return (-1); @@ -568,8 +568,8 @@ kaif_step(void) (void) kmdb_dpi_get_register("sp", &sp); (void) kmdb_dpi_get_register(FLAGS_REG_NAME, &fl); - if (mdb_tgt_vread(mdb.m_target, &newfl, sizeof (kreg_t), - sp) != sizeof (kreg_t)) { + if (mdb_tgt_aread(mdb.m_target, MDB_TGT_AS_VIRT_S, &newfl, + sizeof (kreg_t), sp) != sizeof (kreg_t)) { warn("failed to read " FLAGS_REG_NAME " at %p for popfl step\n", (void *)sp); return (set_errno(EMDB_TGTNOTSUP)); /* XXX ? */ @@ -577,8 +577,8 @@ kaif_step(void) fl = (fl & ~KREG_EFLAGS_IF_MASK) | KREG_EFLAGS_TF_MASK; - if (mdb_tgt_vwrite(mdb.m_target, &fl, sizeof (kreg_t), - sp) != sizeof (kreg_t)) { + if (mdb_tgt_awrite(mdb.m_target, MDB_TGT_AS_VIRT_S, &fl, + sizeof (kreg_t), sp) != sizeof (kreg_t)) { warn("failed to update " FLAGS_REG_NAME " at %p for popfl step\n", (void *)sp); return (set_errno(EMDB_TGTNOTSUP)); /* XXX ? */ @@ -617,8 +617,8 @@ kaif_step(void) */ (void) kmdb_dpi_get_register("sp", &sp); - if (mdb_tgt_vwrite(mdb.m_target, &oldfl, sizeof (kreg_t), - sp) != sizeof (kreg_t)) { + if (mdb_tgt_awrite(mdb.m_target, MDB_TGT_AS_VIRT_S, &oldfl, + sizeof (kreg_t), sp) != sizeof (kreg_t)) { warn("failed to update pushed " FLAGS_REG_NAME " at %p after pushfl step\n", (void *)sp); return (set_errno(EMDB_TGTNOTSUP)); /* XXX ? 
*/ diff --git a/usr/src/cmd/mdb/intel/mdb/kvm_amd64dep.c b/usr/src/cmd/mdb/intel/mdb/kvm_amd64dep.c index f3352b6e21..0382c7e0fb 100644 --- a/usr/src/cmd/mdb/intel/mdb/kvm_amd64dep.c +++ b/usr/src/cmd/mdb/intel/mdb/kvm_amd64dep.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include diff --git a/usr/src/cmd/mdb/intel/mdb/kvm_ia32dep.c b/usr/src/cmd/mdb/intel/mdb/kvm_ia32dep.c index 2dfced82d3..7a6ecaeb6b 100644 --- a/usr/src/cmd/mdb/intel/mdb/kvm_ia32dep.c +++ b/usr/src/cmd/mdb/intel/mdb/kvm_ia32dep.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include diff --git a/usr/src/cmd/mdb/intel/mdb/mdb_amd64util.c b/usr/src/cmd/mdb/intel/mdb/mdb_amd64util.c index 14c81f47fd..7740a82d8f 100644 --- a/usr/src/cmd/mdb/intel/mdb/mdb_amd64util.c +++ b/usr/src/cmd/mdb/intel/mdb/mdb_amd64util.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -244,7 +245,8 @@ mdb_amd64_kvm_stack_iter(mdb_tgt_t *t, const mdb_tgt_gregset_t *gsp, while (fp != 0) { int args_style = 0; - if (mdb_tgt_vread(t, &fr, sizeof (fr), fp) != sizeof (fr)) { + if (mdb_tgt_aread(t, MDB_TGT_AS_VIRT_S, &fr, sizeof (fr), fp) != + sizeof (fr)) { err = EMDB_NOMAP; goto badfp; } @@ -259,8 +261,9 @@ mdb_amd64_kvm_stack_iter(mdb_tgt_t *t, const mdb_tgt_gregset_t *gsp, if (advance_tortoise != 0) { struct fr tfr; - if (mdb_tgt_vread(t, &tfr, sizeof (tfr), - tortoise_fp) != sizeof (tfr)) { + if (mdb_tgt_aread(t, MDB_TGT_AS_VIRT_S, &tfr, + sizeof (tfr), tortoise_fp) != + sizeof (tfr)) { err = EMDB_NOMAP; goto badfp; } @@ -330,7 +333,8 @@ mdb_amd64_kvm_stack_iter(mdb_tgt_t *t, const mdb_tgt_gregset_t *gsp, insnsize = MIN(MIN(s.st_size, SAVEARGS_INSN_SEQ_LEN), pc - s.st_value); - if (mdb_tgt_vread(t, ins, insnsize, s.st_value) != insnsize) + if (mdb_tgt_aread(t, MDB_TGT_AS_VIRT_I, ins, insnsize, + s.st_value) != insnsize) argc = 0; if ((argc != 0) && @@ -349,8 +353,8 @@ mdb_amd64_kvm_stack_iter(mdb_tgt_t *t, const mdb_tgt_gregset_t *gsp, if (args_style == SAVEARGS_STRUCT_ARGS) size += sizeof (long); - if (mdb_tgt_vread(t, fr_argv, size, (fp - size)) - != size) + if (mdb_tgt_aread(t, MDB_TGT_AS_VIRT_S, fr_argv, size, + (fp - size)) != size) return (-1); /* errno has been set for us */ /* @@ -369,7 +373,8 @@ mdb_amd64_kvm_stack_iter(mdb_tgt_t *t, const mdb_tgt_gregset_t *gsp, sizeof (fr_argv) - (reg_argc * sizeof (long))); - if (mdb_tgt_vread(t, &fr_argv[reg_argc], size, + if (mdb_tgt_aread(t, MDB_TGT_AS_VIRT_S, + &fr_argv[reg_argc], size, fp + sizeof (fr)) != size) return (-1); /* errno has been set */ } @@ -434,14 +439,15 @@ mdb_amd64_step_out(mdb_tgt_t *t, uintptr_t *p, kreg_t pc, kreg_t fp, kreg_t sp, if (pc == s.st_value && curinstr == M_PUSHQ_RBP) fp = sp - 8; else if (pc == s.st_value + 1 && curinstr == M_REX_W) { - if (mdb_tgt_vread(t, &curinstr, sizeof (curinstr), - pc + 1) == sizeof (curinstr) && curinstr == - M_MOVL_RBP) + if (mdb_tgt_aread(t, MDB_TGT_AS_VIRT_I, &curinstr, + sizeof (curinstr), pc + 1) == sizeof (curinstr) && + curinstr == M_MOVL_RBP) fp = sp; } } - if (mdb_tgt_vread(t, &fr, sizeof (fr), fp) == sizeof (fr)) { + if (mdb_tgt_aread(t, MDB_TGT_AS_VIRT_S, &fr, sizeof (fr), fp) == + sizeof (fr)) { *p = fr.fr_savpc; return (0); } @@ -476,8 +482,8 @@ mdb_amd64_next(mdb_tgt_t *t, uintptr_t *p, kreg_t pc, mdb_instr_t curinstr) /* Skip the rex prefix, if any */ callpc = pc; while (curinstr >= M_REX_LO && curinstr <= M_REX_HI) { - if (mdb_tgt_vread(t, &curinstr, sizeof (curinstr), ++callpc) != - sizeof (curinstr)) + if (mdb_tgt_aread(t, 
MDB_TGT_AS_VIRT_I, &curinstr, + sizeof (curinstr), ++callpc) != sizeof (curinstr)) return (-1); /* errno is set for us */ } @@ -486,7 +492,8 @@ mdb_amd64_next(mdb_tgt_t *t, uintptr_t *p, kreg_t pc, mdb_instr_t curinstr) return (set_errno(EAGAIN)); } - if ((npc = mdb_dis_nextins(mdb.m_disasm, t, MDB_TGT_AS_VIRT, pc)) == pc) + npc = mdb_dis_nextins(mdb.m_disasm, t, MDB_TGT_AS_VIRT_I, pc); + if (npc == pc) return (-1); /* errno is set for us */ *p = npc; diff --git a/usr/src/cmd/mdb/intel/mdb/mdb_amd64util.h b/usr/src/cmd/mdb/intel/mdb/mdb_amd64util.h index f8c6097cef..b3f060bf05 100644 --- a/usr/src/cmd/mdb/intel/mdb/mdb_amd64util.h +++ b/usr/src/cmd/mdb/intel/mdb/mdb_amd64util.h @@ -22,12 +22,13 @@ * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2018, Joyent, Inc. All rights reserved. + */ #ifndef _MDB_AMD64UTIL_H #define _MDB_AMD64UTIL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include @@ -35,8 +36,6 @@ extern "C" { #endif -typedef uchar_t mdb_instr_t; - extern const mdb_tgt_regdesc_t mdb_amd64_kregs[]; extern void mdb_amd64_printregs(const mdb_tgt_gregset_t *); diff --git a/usr/src/cmd/mdb/intel/mdb/mdb_bhyve.c b/usr/src/cmd/mdb/intel/mdb/mdb_bhyve.c new file mode 100644 index 0000000000..9477bf5056 --- /dev/null +++ b/usr/src/cmd/mdb/intel/mdb/mdb_bhyve.c @@ -0,0 +1,1461 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +/* + * bhyve target + * + * The bhyve target is used to examine and manipulate a bhyve VM. Access to + * a bhyve VM is provided by libvmm, which itself uses libvmmapi, which uses + * the vmm driver's ioctl interface to carry out requests. + * + * The bhyve target does not know about threads or processes, but it handles + * multiple vCPUs and can switch between them. Execution control is currently + * limited to completely stopping or resuming all vCPUs of a VM, or single- + * stepping a particular vCPU while all other vCPUs remain stopped. Breakpoints + * are not implemented yet, and as such step-out and step-over don't work yet. + * All known x86 instruction sets are supported: legacy IA-16, IA-32 and AMD64. + * The current CPU instruction set is automatically determined by parsing the + * code segment (CS) attributes in the current vCPU. + * + * All of the VM's physical memory and device memory segments are mapped R/W + * into mdb's address space by libvmm. All accesses to that memory are + * facilitated through libvmm calls, which may include virtual address + * translation according to the current vCPU mode. Both real-mode and protected- + * mode segmentation are understood and used for translating virtual addresses + * into linear addresses, which may further be translated using 2-level, 3-level + * or 4-level paging.
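+ * + * A sketch of that translation pipeline, using the libvmm calls this target + * relies on below (argument names here are illustrative only): + * + * vmm_vtol(vmm, cpu, seg, va, &la) segmentation: virtual -> linear + * vmm_vtop(vmm, cpu, seg, va, &pa) segmentation + paging: virtual -> physical + * vmm_pread(vmm, buf, nbytes, pa) access to guest physical memory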
+ * + * To handle disassembly and stack tracing properly when segmentation is used by + * a vCPU (always in real mode, sometimes in protected mode) the bhyve target + * has a notion of three virtual address spaces used for reading/writing memory: + * - MDB_TGT_AS_VIRT, the default virtual address space uses the DS segment + * by default, but this default can be changed with the ::defseg dcmd. + * - MDB_TGT_AS_VIRT_I, the virtual address space for instructions always + * uses the code segment (CS) for translation + * - MDB_TGT_AS_VIRT_S, the virtual address space for the stack always uses + * the stack segment (SS) for translation + * + * Register printing and stack tracing use the common x86 ISA-specific code + * in IA-32 and AMD64 modes. There is no stack tracing for IA-16 mode yet. + * + * Todo: + * - support for breakpoints, step-out, and step-over + * - support for IA-16 stack tracing + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#define MDB_DEF_PROMPT "[%<_cpuid>]> " + +typedef struct bhyve_data { + vmm_t *bd_vmm; + uint_t bd_curcpu; + int bd_defseg; + + /* must be last */ + char bd_name[]; +} bhyve_data_t; + + +const mdb_tgt_regdesc_t bhyve_kregs[] = { + { "rdi", KREG_RDI, MDB_TGT_R_EXPORT }, + { "edi", KREG_RDI, MDB_TGT_R_EXPORT | MDB_TGT_R_32 }, + { "di", KREG_RDI, MDB_TGT_R_EXPORT | MDB_TGT_R_16 }, + { "dil", KREG_RDI, MDB_TGT_R_EXPORT | MDB_TGT_R_8L }, + { "rsi", KREG_RSI, MDB_TGT_R_EXPORT }, + { "esi", KREG_RSI, MDB_TGT_R_EXPORT | MDB_TGT_R_32 }, + { "si", KREG_RSI, MDB_TGT_R_EXPORT | MDB_TGT_R_16 }, + { "sil", KREG_RSI, MDB_TGT_R_EXPORT | MDB_TGT_R_8L }, + { "rdx", KREG_RDX, MDB_TGT_R_EXPORT }, + { "edx", KREG_RDX, MDB_TGT_R_EXPORT | MDB_TGT_R_32 }, + { "dx", KREG_RDX, MDB_TGT_R_EXPORT | MDB_TGT_R_16 }, + { "dh", KREG_RDX, MDB_TGT_R_EXPORT | MDB_TGT_R_8H }, + { "dl", KREG_RDX, MDB_TGT_R_EXPORT | MDB_TGT_R_8L }, + { "rcx", KREG_RCX, MDB_TGT_R_EXPORT }, + { "ecx", KREG_RCX, MDB_TGT_R_EXPORT | MDB_TGT_R_32 }, + { "cx", KREG_RCX, MDB_TGT_R_EXPORT | MDB_TGT_R_16 }, + { "ch", KREG_RCX, MDB_TGT_R_EXPORT | MDB_TGT_R_8H }, + { "cl", KREG_RCX, MDB_TGT_R_EXPORT | MDB_TGT_R_8L }, + { "r8", KREG_R8, MDB_TGT_R_EXPORT }, + { "r8d", KREG_R8, MDB_TGT_R_EXPORT | MDB_TGT_R_32 }, + { "r8w", KREG_R8, MDB_TGT_R_EXPORT | MDB_TGT_R_16 }, + { "r8l", KREG_R8, MDB_TGT_R_EXPORT | MDB_TGT_R_8L }, + { "r9", KREG_R9, MDB_TGT_R_EXPORT }, + { "r9d", KREG_R9, MDB_TGT_R_EXPORT | MDB_TGT_R_32 }, + { "r9w", KREG_R9, MDB_TGT_R_EXPORT | MDB_TGT_R_16 }, + { "r9l", KREG_R9, MDB_TGT_R_EXPORT | MDB_TGT_R_8L }, + { "rax", KREG_RAX, MDB_TGT_R_EXPORT }, + { "eax", KREG_RAX, MDB_TGT_R_EXPORT | MDB_TGT_R_32 }, + { "ax", KREG_RAX, MDB_TGT_R_EXPORT | MDB_TGT_R_16 }, + { "ah", KREG_RAX, MDB_TGT_R_EXPORT | MDB_TGT_R_8H }, + { "al", KREG_RAX, MDB_TGT_R_EXPORT | MDB_TGT_R_8L }, + { "rbx", KREG_RBX, MDB_TGT_R_EXPORT }, + { "ebx", KREG_RBX, MDB_TGT_R_EXPORT | MDB_TGT_R_32 }, + { "bx", KREG_RBX, MDB_TGT_R_EXPORT | MDB_TGT_R_16 }, + { "bh", KREG_RBX, MDB_TGT_R_EXPORT | MDB_TGT_R_8H }, + { "bl", KREG_RBX, MDB_TGT_R_EXPORT | MDB_TGT_R_8L }, + { "rbp", KREG_RBP, MDB_TGT_R_EXPORT }, + { "ebp", KREG_RBP, MDB_TGT_R_EXPORT | MDB_TGT_R_32 }, + { "bp", KREG_RBP, MDB_TGT_R_EXPORT | MDB_TGT_R_16 }, + { "bpl", KREG_RBP, MDB_TGT_R_EXPORT | MDB_TGT_R_8L }, + { "r10", KREG_R10, MDB_TGT_R_EXPORT }, + { "r10d", KREG_R10, MDB_TGT_R_EXPORT | MDB_TGT_R_32 }, + { "r10w", KREG_R10, MDB_TGT_R_EXPORT |
MDB_TGT_R_16 }, + { "r10l", KREG_R10, MDB_TGT_R_EXPORT | MDB_TGT_R_8L }, + { "r11", KREG_R11, MDB_TGT_R_EXPORT }, + { "r11d", KREG_R11, MDB_TGT_R_EXPORT | MDB_TGT_R_32 }, + { "r11w", KREG_R11, MDB_TGT_R_EXPORT | MDB_TGT_R_16 }, + { "r11l", KREG_R11, MDB_TGT_R_EXPORT | MDB_TGT_R_8L }, + { "r12", KREG_R12, MDB_TGT_R_EXPORT }, + { "r12d", KREG_R12, MDB_TGT_R_EXPORT | MDB_TGT_R_32 }, + { "r12w", KREG_R12, MDB_TGT_R_EXPORT | MDB_TGT_R_16 }, + { "r12l", KREG_R12, MDB_TGT_R_EXPORT | MDB_TGT_R_8L }, + { "r13", KREG_R13, MDB_TGT_R_EXPORT }, + { "r13d", KREG_R13, MDB_TGT_R_EXPORT | MDB_TGT_R_32 }, + { "r13w", KREG_R13, MDB_TGT_R_EXPORT | MDB_TGT_R_16 }, + { "r13l", KREG_R13, MDB_TGT_R_EXPORT | MDB_TGT_R_8L }, + { "r14", KREG_R14, MDB_TGT_R_EXPORT }, + { "r14d", KREG_R14, MDB_TGT_R_EXPORT | MDB_TGT_R_32 }, + { "r14w", KREG_R14, MDB_TGT_R_EXPORT | MDB_TGT_R_16 }, + { "r14l", KREG_R14, MDB_TGT_R_EXPORT | MDB_TGT_R_8L }, + { "r15", KREG_R15, MDB_TGT_R_EXPORT }, + { "r15d", KREG_R15, MDB_TGT_R_EXPORT | MDB_TGT_R_32 }, + { "r15w", KREG_R15, MDB_TGT_R_EXPORT | MDB_TGT_R_16 }, + { "r15l", KREG_R15, MDB_TGT_R_EXPORT | MDB_TGT_R_8L }, + { "ds", KREG_DS, MDB_TGT_R_EXPORT }, + { "es", KREG_ES, MDB_TGT_R_EXPORT }, + { "fs", KREG_FS, MDB_TGT_R_EXPORT }, + { "gs", KREG_GS, MDB_TGT_R_EXPORT }, + { "rip", KREG_RIP, MDB_TGT_R_EXPORT }, + { "cs", KREG_CS, MDB_TGT_R_EXPORT }, + { "rflags", KREG_RFLAGS, MDB_TGT_R_EXPORT }, + { "eflags", KREG_RFLAGS, MDB_TGT_R_EXPORT | MDB_TGT_R_32 }, + { "rsp", KREG_RSP, MDB_TGT_R_EXPORT }, + { "esp", KREG_RSP, MDB_TGT_R_EXPORT | MDB_TGT_R_32 }, + { "sp", KREG_RSP, MDB_TGT_R_EXPORT | MDB_TGT_R_16 }, + { "spl", KREG_RSP, MDB_TGT_R_EXPORT | MDB_TGT_R_8L }, + { "ss", KREG_SS, MDB_TGT_R_EXPORT }, + { "cr2", KREG_CR2, MDB_TGT_R_EXPORT }, + { "cr3", KREG_CR3, MDB_TGT_R_EXPORT }, + { NULL, 0, 0 } +}; + +static const char *segments[] = { "CS", "DS", "ES", "FS", "GS", "SS" }; + + +/*ARGSUSED*/ +static uintmax_t +bhyve_cpuid_get(const mdb_var_t *v) +{ + bhyve_data_t *bd = mdb.m_target->t_data; + + return (bd->bd_curcpu); +} + +static const mdb_nv_disc_t bhyve_cpuid_disc = { + .disc_get = bhyve_cpuid_get +}; + + +static uintmax_t +bhyve_reg_get(const mdb_var_t *v) +{ + mdb_tgt_reg_t r = 0; + + if (mdb_tgt_getareg(MDB_NV_COOKIE(v), 0, mdb_nv_get_name(v), &r) == -1) + mdb_warn("failed to get %%%s register", mdb_nv_get_name(v)); + + return (r); +} + +static void +bhyve_reg_set(mdb_var_t *v, uintmax_t r) +{ + if (mdb_tgt_putareg(MDB_NV_COOKIE(v), 0, mdb_nv_get_name(v), r) == -1) + mdb_warn("failed to modify %%%s register", mdb_nv_get_name(v)); +} + +static const mdb_nv_disc_t bhyve_reg_disc = { + .disc_set = bhyve_reg_set, + .disc_get = bhyve_reg_get +}; + +static int +bhyve_get_gregset(bhyve_data_t *bd, int cpu, mdb_tgt_gregset_t *gregs) +{ + vmm_desc_t fs, gs; + + /* + * Register numbers to get, the order must match the definitions of + * KREG_* in mdb_kreg.h so that we get a proper mdb_tgt_gregset_t + * that the register printing functions will understand. + * + * There are a few fields in mdb_tgt_gregset_t that can't be accessed + * with vmm_get_regset(), either because they don't exist in bhyve or + * because they need to be accessed with vmm_get_desc(). For these + * cases we ask for RAX instead and fill it with 0 or the real value, + * respectively.
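+ * + * For example, KREG_FSBASE below gets the KREG_RAX placeholder in + * regnums[] and is then overwritten with fs.vd_base obtained from + * vmm_get_desc(VMM_DESC_FS), while KREG_TRAPNO has no bhyve equivalent + * and is simply zeroed.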
+ */ + static const int regnums[] = { + KREG_RAX, /* dummy for SAVFP */ + KREG_RAX, /* dummy for SAVPC */ + KREG_RDI, + KREG_RSI, + KREG_RDX, + KREG_RCX, + KREG_R8, + KREG_R9, + KREG_RAX, + KREG_RBX, + KREG_RBP, + KREG_R10, + KREG_R11, + KREG_R12, + KREG_R13, + KREG_R14, + KREG_R15, + KREG_RAX, /* dummy for FSBASE */ + KREG_RAX, /* dummy for GSBASE */ + KREG_RAX, /* dummy for KGSBASE */ + KREG_CR2, + KREG_CR3, + KREG_DS, + KREG_ES, + KREG_FS, + KREG_GS, + KREG_RAX, /* dummy for TRAPNO */ + KREG_RAX, /* dummy for ERR */ + KREG_RIP, + KREG_CS, + KREG_RFLAGS, + KREG_RSP, + KREG_SS + }; + + if (vmm_get_regset(bd->bd_vmm, cpu, KREG_NGREG, regnums, + &gregs->kregs[0]) != 0) { + mdb_warn("failed to get general-purpose registers for CPU %d", + cpu); + return (-1); + } + + if (vmm_get_desc(bd->bd_vmm, cpu, VMM_DESC_FS, &fs) != 0 || + vmm_get_desc(bd->bd_vmm, cpu, VMM_DESC_GS, &gs) != 0) { + mdb_warn("failed to get FS/GS descriptors for CPU %d", cpu); + return (-1); + } + + gregs->kregs[KREG_SAVFP] = 0; + gregs->kregs[KREG_SAVPC] = 0; + gregs->kregs[KREG_KGSBASE] = 0; + gregs->kregs[KREG_TRAPNO] = 0; + gregs->kregs[KREG_ERR] = 0; + + gregs->kregs[KREG_FSBASE] = fs.vd_base; + gregs->kregs[KREG_GSBASE] = gs.vd_base; + + return (0); +} + +static int +bhyve_cpuregs_dcmd(uintptr_t addr, uint_t flags, int argc, + const mdb_arg_t *argv) +{ + bhyve_data_t *bd = mdb.m_target->t_data; + uint64_t cpu = bd->bd_curcpu; + mdb_tgt_gregset_t gregs; + int i; + + + if (flags & DCMD_ADDRSPEC) { + if (argc != 0) + return (DCMD_USAGE); + + cpu = (uint64_t)addr; + } + + i = mdb_getopts(argc, argv, 'c', MDB_OPT_UINT64, &cpu, NULL); + + argc -= i; + argv += i; + + if (argc != 0) + return (DCMD_USAGE); + + if (cpu >= vmm_ncpu(bd->bd_vmm)) { + mdb_warn("no such CPU\n"); + return (DCMD_ERR); + } + + if (bhyve_get_gregset(bd, cpu, &gregs) != 0) + return (DCMD_ERR); + + + switch (vmm_vcpu_isa(bd->bd_vmm, cpu)) { + case VMM_ISA_64: + mdb_amd64_printregs(&gregs); + break; + case VMM_ISA_32: + case VMM_ISA_16: + mdb_ia32_printregs(&gregs); + break; + default: + mdb_warn("CPU %d mode unknown", cpu); + return (DCMD_ERR); + } + + return (0); +} + +static int +bhyve_regs_dcmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) +{ + if ((flags & DCMD_ADDRSPEC) || argc != 0) + return (DCMD_USAGE); + + return (bhyve_cpuregs_dcmd(addr, flags, argc, argv)); +} + +static int +bhyve_stack_common(uintptr_t addr, uint_t flags, int argc, + const mdb_arg_t *argv, int vcpu, boolean_t verbose) +{ + bhyve_data_t *bd = mdb.m_target->t_data; + void *arg = (void *)(uintptr_t)mdb.m_nargs; + + mdb_tgt_gregset_t gregs; + mdb_tgt_stack_f *func; + + if (vcpu == -1) + vcpu = bd->bd_curcpu; + + if (flags & DCMD_ADDRSPEC) { + bzero(&gregs, sizeof (gregs)); + gregs.kregs[KREG_RBP] = addr; + } else if (bhyve_get_gregset(bd, vcpu, &gregs) != 0) + return (DCMD_ERR); + + switch (vmm_vcpu_isa(bd->bd_vmm, vcpu)) { + case VMM_ISA_64: + func = verbose ? mdb_amd64_kvm_framev : mdb_amd64_kvm_frame; + (void) mdb_amd64_kvm_stack_iter(mdb.m_target, &gregs, func, + arg); + break; + case VMM_ISA_32: + func = verbose ?
mdb_ia32_kvm_framev : mdb_ia32_kvm_frame; + (void) mdb_ia32_kvm_stack_iter(mdb.m_target, &gregs, func, arg); + break; + case VMM_ISA_16: + mdb_warn("IA16 stack tracing not implemented\n"); + return (DCMD_ERR); + default: + mdb_warn("CPU %d mode unknown", vcpu); + return (DCMD_ERR); + } + + return (DCMD_OK); +} + +static int +bhyve_cpustack_dcmd(uintptr_t addr, uint_t flags, int argc, + const mdb_arg_t *argv) +{ + bhyve_data_t *bd = mdb.m_target->t_data; + uint64_t cpu = bd->bd_curcpu; + boolean_t verbose = B_FALSE; + int i; + + if (flags & DCMD_ADDRSPEC) { + if (argc != 0) + return (DCMD_USAGE); + + if (addr < vmm_ncpu(bd->bd_vmm)) { + cpu = (uint64_t)addr; + flags &= ~DCMD_ADDRSPEC; + } + } + + i = mdb_getopts(argc, argv, + 'c', MDB_OPT_UINT64, &cpu, + 'v', MDB_OPT_SETBITS, 1, &verbose, + NULL); + + argc -= i; + argv += i; + + if (argc != 0) + return (DCMD_USAGE); + + return (bhyve_stack_common(addr, flags, argc, argv, cpu, verbose)); +} + +static int +bhyve_stack_dcmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) +{ + return (bhyve_stack_common(addr, flags, argc, argv, -1, B_FALSE)); +} + +static int +bhyve_stackv_dcmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) +{ + return (bhyve_stack_common(addr, flags, argc, argv, -1, B_TRUE)); +} + +static int +bhyve_stackr_dcmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) +{ + return (bhyve_stack_common(addr, flags, argc, argv, -1, B_TRUE)); +} + +static int +bhyve_status_dcmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) +{ + bhyve_data_t *bd = mdb.m_target->t_data; + vmm_mode_t mode; + vmm_isa_t isa; + + static const char *modes[] = { + "unknown mode", + "real mode", + "protected mode, no PAE", + "protected mode, PAE", + "long mode" + }; + static const char *isas[] = { + "unknown ISA", + "IA16", + "IA32", + "AMD64" + }; + + if ((flags & DCMD_ADDRSPEC) || argc != 0) + return (DCMD_USAGE); + + mode = vmm_vcpu_mode(bd->bd_vmm, bd->bd_curcpu); + isa = vmm_vcpu_isa(bd->bd_vmm, bd->bd_curcpu); + + mdb_printf("debugging live VM '%s'\n", bd->bd_name); + mdb_printf("VM memory size: %d MB\n", + vmm_memsize(bd->bd_vmm) / 1024 / 1024); + mdb_printf("vCPUs: %d\n", vmm_ncpu(bd->bd_vmm)); + mdb_printf("current CPU: %d (%s, %s)\n", bd->bd_curcpu, modes[mode], + isas[isa]); + mdb_printf("default segment: %s", + segments[bd->bd_defseg - VMM_DESC_CS]); + + return (DCMD_OK); +} + + +static int +bhyve_sysregs_dcmd(uintptr_t addr, uint_t flags, int argc, + const mdb_arg_t *argv) +{ + bhyve_data_t *bd = mdb.m_target->t_data; + uint64_t cpu = bd->bd_curcpu; + int ret = DCMD_ERR; + struct sysregs sregs; + int i; + + /* + * This array must use the order of the elements of struct sysregs.
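+ * + * (vmm_get_regset() stores the fetched values in array order, so with + * this regnums[] table regnums[0], VMM_REG_CR0, lands in sr_cr0, + * regnums[1] in sr_cr2, and so on; the (uint64_t *)&sregs cast below + * depends on that layout.)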
+ */ + static const int regnums[] = { + VMM_REG_CR0, + VMM_REG_CR2, + VMM_REG_CR3, + VMM_REG_CR4, + VMM_REG_DR0, + VMM_REG_DR1, + VMM_REG_DR2, + VMM_REG_DR3, + VMM_REG_DR6, + VMM_REG_DR7, + VMM_REG_EFER, + VMM_REG_PDPTE0, + VMM_REG_PDPTE1, + VMM_REG_PDPTE2, + VMM_REG_PDPTE3, + VMM_REG_INTR_SHADOW + }; + + if (flags & DCMD_ADDRSPEC) { + if (argc != 0) + return (DCMD_USAGE); + + cpu = (uint64_t)addr; + } + + i = mdb_getopts(argc, argv, 'c', MDB_OPT_UINT64, &cpu, NULL); + + argc -= i; + argv += i; + + if (argc != 0) + return (DCMD_USAGE); + + if (cpu >= vmm_ncpu(bd->bd_vmm)) { + mdb_warn("no such CPU\n"); + return (DCMD_ERR); + } + + if (vmm_get_regset(bd->bd_vmm, cpu, ARRAY_SIZE(regnums), regnums, + (uint64_t *)&sregs) != 0) + goto fail; + + if (vmm_get_desc(bd->bd_vmm, cpu, VMM_DESC_GDTR, + (vmm_desc_t *)&sregs.sr_gdtr) != 0 || + vmm_get_desc(bd->bd_vmm, cpu, VMM_DESC_IDTR, + (vmm_desc_t *)&sregs.sr_idtr) != 0 || + vmm_get_desc(bd->bd_vmm, cpu, VMM_DESC_LDTR, + (vmm_desc_t *)&sregs.sr_ldtr) != 0 || + vmm_get_desc(bd->bd_vmm, cpu, VMM_DESC_TR, + (vmm_desc_t *)&sregs.sr_tr) != 0 || + vmm_get_desc(bd->bd_vmm, cpu, VMM_DESC_CS, + (vmm_desc_t *)&sregs.sr_cs) != 0 || + vmm_get_desc(bd->bd_vmm, cpu, VMM_DESC_DS, + (vmm_desc_t *)&sregs.sr_ds) != 0 || + vmm_get_desc(bd->bd_vmm, cpu, VMM_DESC_ES, + (vmm_desc_t *)&sregs.sr_es) != 0 || + vmm_get_desc(bd->bd_vmm, cpu, VMM_DESC_FS, + (vmm_desc_t *)&sregs.sr_fs) != 0 || + vmm_get_desc(bd->bd_vmm, cpu, VMM_DESC_GS, + (vmm_desc_t *)&sregs.sr_gs) != 0 || + vmm_get_desc(bd->bd_vmm, cpu, VMM_DESC_SS, + (vmm_desc_t *)&sregs.sr_ss) != 0) + goto fail; + + mdb_x86_print_sysregs(&sregs, vmm_vcpu_mode(bd->bd_vmm, cpu) == + VMM_MODE_LONG); + + ret = DCMD_OK; + +fail: + if (ret != DCMD_OK) + mdb_warn("failed to get system registers for CPU %d\n", cpu); + return (ret); +} + +static int +bhyve_dbgregs_dcmd(uintptr_t addr, uint_t flags, int argc, + const mdb_arg_t *argv) +{ + bhyve_data_t *bd = mdb.m_target->t_data; + uint64_t cpu = bd->bd_curcpu; + int ret = DCMD_ERR; + vmm_desc_t gdtr, ldtr, idtr, tr, cs, ds, es, fs, gs, ss; + uint64_t *regvals; + int i; + + /* + * This array must use the order of definitions set in libvmm.h + * to make GETREG() work. 
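+ * + * (GETREG(r) indexes regvals[] by r - VMM_REG_DR0, which is only + * correct while regnums[] lists the registers consecutively in the + * libvmm.h enum order; GETREG(VMM_REG_DR6), for instance, resolves to + * regvals[4].)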
+ */ +#define GETREG(r) (regvals[r - VMM_REG_DR0]) + static const int regnums[] = { + VMM_REG_DR0, + VMM_REG_DR1, + VMM_REG_DR2, + VMM_REG_DR3, + VMM_REG_DR6, + VMM_REG_DR7, + }; + + static const mdb_bitmask_t dr6_flag_bits[] = { + { "DR0", DR_TRAP0, DR_TRAP0 }, + { "DR1", DR_TRAP1, DR_TRAP1 }, + { "DR2", DR_TRAP2, DR_TRAP2 }, + { "DR3", DR_TRAP3, DR_TRAP3 }, + { "debug reg", DR_ICEALSO, DR_ICEALSO }, + { "single step", DR_SINGLESTEP, DR_SINGLESTEP }, + { "task switch", DR_TASKSWITCH, DR_TASKSWITCH }, + { NULL, 0, 0 } + }; + +#define DR_RW(x, m) \ + ((DR_RW_MASK & (m)) << (DR_CONTROL_SHIFT + (x) * DR_CONTROL_SIZE)) +#define DR_LEN(x, m) \ + ((DR_LEN_MASK & (m)) << (DR_CONTROL_SHIFT + (x) * DR_CONTROL_SIZE)) + + static const mdb_bitmask_t dr7_flag_bits[] = { + { "L0", DR_ENABLE0, DR_LOCAL_ENABLE_MASK & DR_ENABLE0 }, + { "G0", DR_ENABLE0, DR_GLOBAL_ENABLE_MASK & DR_ENABLE0 }, + { "L1", DR_ENABLE1, DR_LOCAL_ENABLE_MASK & DR_ENABLE1 }, + { "G1", DR_ENABLE1, DR_GLOBAL_ENABLE_MASK & DR_ENABLE1 }, + { "L2", DR_ENABLE2, DR_LOCAL_ENABLE_MASK & DR_ENABLE2 }, + { "G2", DR_ENABLE2, DR_GLOBAL_ENABLE_MASK & DR_ENABLE2 }, + { "L3", DR_ENABLE3, DR_LOCAL_ENABLE_MASK & DR_ENABLE3 }, + { "G3", DR_ENABLE3, DR_GLOBAL_ENABLE_MASK & DR_ENABLE3 }, + { "LE", DR_LOCAL_SLOWDOWN, DR_LOCAL_SLOWDOWN }, + { "GE", DR_GLOBAL_SLOWDOWN, DR_GLOBAL_SLOWDOWN }, + { "RTM", DR_RTM, DR_RTM }, + { "GD", DR_GENERAL_DETECT, DR_GENERAL_DETECT }, + { "0:X", DR_RW(0, DR_RW_MASK), DR_RW(0, DR_RW_EXECUTE) }, + { "0:W", DR_RW(0, DR_RW_MASK), DR_RW(0, DR_RW_WRITE) }, + { "0:IO", DR_RW(0, DR_RW_MASK), DR_RW(0, DR_RW_IO_RW) }, + { "0:RW", DR_RW(0, DR_RW_MASK), DR_RW(0, DR_RW_READ) }, + { "1:X", DR_RW(1, DR_RW_MASK), DR_RW(1, DR_RW_EXECUTE) }, + { "1:W", DR_RW(1, DR_RW_MASK), DR_RW(1, DR_RW_WRITE) }, + { "1:IO", DR_RW(1, DR_RW_MASK), DR_RW(1, DR_RW_IO_RW) }, + { "1:RW", DR_RW(1, DR_RW_MASK), DR_RW(1, DR_RW_READ) }, + { "2:X", DR_RW(2, DR_RW_MASK), DR_RW(2, DR_RW_EXECUTE) }, + { "2:W", DR_RW(2, DR_RW_MASK), DR_RW(2, DR_RW_WRITE) }, + { "2:IO", DR_RW(2, DR_RW_MASK), DR_RW(2, DR_RW_IO_RW) }, + { "2:RW", DR_RW(2, DR_RW_MASK), DR_RW(2, DR_RW_READ) }, + { "3:X", DR_RW(3, DR_RW_MASK), DR_RW(3, DR_RW_EXECUTE) }, + { "3:W", DR_RW(3, DR_RW_MASK), DR_RW(3, DR_RW_WRITE) }, + { "3:IO", DR_RW(3, DR_RW_MASK), DR_RW(3, DR_RW_IO_RW) }, + { "3:RW", DR_RW(3, DR_RW_MASK), DR_RW(3, DR_RW_READ) }, + { "0:1", DR_LEN(0, DR_LEN_MASK), DR_LEN(0, DR_LEN_1) }, + { "0:2", DR_LEN(0, DR_LEN_MASK), DR_LEN(0, DR_LEN_2) }, + { "0:4", DR_LEN(0, DR_LEN_MASK), DR_LEN(0, DR_LEN_4) }, + { "0:8", DR_LEN(0, DR_LEN_MASK), DR_LEN(0, DR_LEN_8) }, + { "1:1", DR_LEN(1, DR_LEN_MASK), DR_LEN(1, DR_LEN_1) }, + { "1:2", DR_LEN(1, DR_LEN_MASK), DR_LEN(1, DR_LEN_2) }, + { "1:4", DR_LEN(1, DR_LEN_MASK), DR_LEN(1, DR_LEN_4) }, + { "1:8", DR_LEN(1, DR_LEN_MASK), DR_LEN(1, DR_LEN_8) }, + { "2:1", DR_LEN(2, DR_LEN_MASK), DR_LEN(2, DR_LEN_1) }, + { "2:2", DR_LEN(2, DR_LEN_MASK), DR_LEN(2, DR_LEN_2) }, + { "2:4", DR_LEN(2, DR_LEN_MASK), DR_LEN(2, DR_LEN_4) }, + { "2:8", DR_LEN(2, DR_LEN_MASK), DR_LEN(2, DR_LEN_8) }, + { "3:1", DR_LEN(3, DR_LEN_MASK), DR_LEN(3, DR_LEN_1) }, + { "3:2", DR_LEN(3, DR_LEN_MASK), DR_LEN(3, DR_LEN_2) }, + { "3:4", DR_LEN(3, DR_LEN_MASK), DR_LEN(3, DR_LEN_4) }, + { "3:8", DR_LEN(3, DR_LEN_MASK), DR_LEN(3, DR_LEN_8) }, + { NULL, 0, 0 }, + }; + + + if (flags & DCMD_ADDRSPEC) { + if (argc != 0) + return (DCMD_USAGE); + + cpu = (uint64_t)addr; + } + + i = mdb_getopts(argc, argv, 'c', MDB_OPT_UINT64, &cpu, NULL); + + argc -= i; + argv += i; + + if (argc != 0) + return 
(DCMD_USAGE); + + if (cpu >= vmm_ncpu(bd->bd_vmm)) { + mdb_warn("no such CPU\n"); + return (DCMD_ERR); + } + + regvals = mdb_zalloc(ARRAY_SIZE(regnums) * sizeof (uint64_t), UM_SLEEP); + + if (vmm_get_regset(bd->bd_vmm, cpu, ARRAY_SIZE(regnums), regnums, + regvals) != 0) + goto fail; + + mdb_printf("%%dr0 = 0x%0?p %A\n", + GETREG(VMM_REG_DR0), GETREG(VMM_REG_DR0)); + mdb_printf("%%dr1 = 0x%0?p %A\n", + GETREG(VMM_REG_DR1), GETREG(VMM_REG_DR1)); + mdb_printf("%%dr2 = 0x%0?p %A\n", + GETREG(VMM_REG_DR2), GETREG(VMM_REG_DR2)); + mdb_printf("%%dr3 = 0x%0?p %A\n", + GETREG(VMM_REG_DR3), GETREG(VMM_REG_DR3)); + mdb_printf("%%dr6 = 0x%0lx <%b>\n", + GETREG(VMM_REG_DR6), GETREG(VMM_REG_DR6), dr6_flag_bits); + mdb_printf("%%dr7 = 0x%0lx <%b>\n", + GETREG(VMM_REG_DR7), GETREG(VMM_REG_DR7), dr7_flag_bits); +#undef GETREG + + ret = DCMD_OK; + +fail: + if (ret != DCMD_OK) + mdb_warn("failed to get debug registers for CPU %d\n", cpu); + mdb_free(regvals, ARRAY_SIZE(regnums) * sizeof (uint64_t)); + return (ret); +} + +static int +bhyve_switch_dcmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) +{ + bhyve_data_t *bd = mdb.m_target->t_data; + size_t cpu = (int)addr; + + if (!(flags & DCMD_ADDRSPEC) || argc != 0) + return (DCMD_USAGE); + + if (cpu >= vmm_ncpu(bd->bd_vmm)) { + mdb_warn("no such CPU\n"); + return (DCMD_ERR); + } + + bd->bd_curcpu = cpu; + return (DCMD_OK); + +} + +static int +bhyve_seg2reg(const char *seg) +{ + if (strcasecmp(seg, "cs") == 0) + return (VMM_DESC_CS); + else if (strcasecmp(seg, "ds") == 0) + return (VMM_DESC_DS); + else if (strcasecmp(seg, "es") == 0) + return (VMM_DESC_ES); + else if (strcasecmp(seg, "fs") == 0) + return (VMM_DESC_FS); + else if (strcasecmp(seg, "gs") == 0) + return (VMM_DESC_GS); + else if (strcasecmp(seg, "ss") == 0) + return (VMM_DESC_SS); + else + return (-1); +} + +static int +bhyve_vtol_dcmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) +{ + bhyve_data_t *bd = mdb.m_target->t_data; + int segreg = bd->bd_defseg; + char *seg = ""; + uint64_t laddr; + int i; + + if (!(flags & DCMD_ADDRSPEC)) + return (DCMD_USAGE); + + i = mdb_getopts(argc, argv, 's', MDB_OPT_STR, &seg, NULL); + + argc -= i; + argv += i; + + if (i != 0) { + if (argc != 0) + return (DCMD_USAGE); + + segreg = bhyve_seg2reg(seg); + if (segreg == -1) + return (DCMD_USAGE); + } + + if (vmm_vtol(bd->bd_vmm, bd->bd_curcpu, segreg, addr, &laddr) != 0) { + if (errno == EFAULT) + (void) set_errno(EMDB_NOMAP); + return (DCMD_ERR); + } + + if (flags & DCMD_PIPE_OUT) + mdb_printf("%llr\n", laddr); + else + mdb_printf("virtual %lr mapped to linear %llr\n", addr, laddr); + + return (DCMD_OK); +} + +static int +bhyve_vtop_dcmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) +{ + bhyve_data_t *bd = mdb.m_target->t_data; + int segreg = bd->bd_defseg; + char *seg = ""; + physaddr_t pa; + int i; + + if (!(flags & DCMD_ADDRSPEC)) + return (DCMD_USAGE); + + i = mdb_getopts(argc, argv, 's', MDB_OPT_STR, &seg, NULL); + + argc -= i; + argv += i; + + if (i != 0) { + segreg = bhyve_seg2reg(seg); + if (segreg == -1) + return (DCMD_USAGE); + } + + if (vmm_vtop(bd->bd_vmm, bd->bd_curcpu, segreg, addr, &pa) == -1) { + mdb_warn("failed to get physical mapping"); + return (DCMD_ERR); + } + + if (flags & DCMD_PIPE_OUT) + mdb_printf("%llr\n", pa); + else + mdb_printf("virtual %lr mapped to physical %llr\n", addr, pa); + return (DCMD_OK); +} + +static int +bhyve_defseg_dcmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) +{ + bhyve_data_t *bd = mdb.m_target->t_data; 
+ int segreg = bd->bd_defseg; + char *seg = ""; + int i; + + if (flags & DCMD_ADDRSPEC) + return (DCMD_USAGE); + + i = mdb_getopts(argc, argv, 's', MDB_OPT_STR, &seg, NULL); + + argc -= i; + argv += i; + + if (i != 0) { + if (argc != 0) + return (DCMD_USAGE); + + segreg = bhyve_seg2reg(seg); + if (segreg == -1) + return (DCMD_USAGE); + + bd->bd_defseg = segreg; + } + + mdb_printf("using segment %s for virtual to linear address translation", + segments[bd->bd_defseg - VMM_DESC_CS]); + + return (DCMD_OK); +} + +static const mdb_dcmd_t bhyve_dcmds[] = { + { "$c", NULL, "print stack backtrace", bhyve_stack_dcmd }, + { "$C", NULL, "print stack backtrace", bhyve_stackv_dcmd }, + { "$r", NULL, "print general-purpose registers", bhyve_regs_dcmd }, + { "$?", NULL, "print status and registers", bhyve_regs_dcmd }, + { ":x", ":", "change the active CPU", bhyve_switch_dcmd }, + { "cpustack", "?[-v] [-c cpuid] [cnt]", "print stack backtrace for a " + "specific CPU", bhyve_cpustack_dcmd }, + { "cpuregs", "?[-c cpuid]", "print general-purpose registers for a " + "specific CPU", bhyve_cpuregs_dcmd }, + { "dbgregs", "?[-c cpuid]", "print debug registers for a specific CPU", + bhyve_dbgregs_dcmd }, + { "defseg", "?[-s segment]", "change the default segment used to " + "translate addresses", bhyve_defseg_dcmd }, + { "regs", NULL, "print general-purpose registers", bhyve_regs_dcmd }, + { "stack", NULL, "print stack backtrace", bhyve_stack_dcmd }, + { "stackregs", NULL, "print stack backtrace and registers", + bhyve_stackr_dcmd }, + { "status", NULL, "print summary of current target", + bhyve_status_dcmd }, + { "sysregs", "?[-c cpuid]", "print system registers for a specific CPU", + bhyve_sysregs_dcmd }, + { "switch", ":", "change the active CPU", bhyve_switch_dcmd }, + { "vtol", ":[-s segment]", "print linear mapping of virtual address", + bhyve_vtol_dcmd }, + { "vtop", ":[-s segment]", "print physical mapping of virtual " + "address", bhyve_vtop_dcmd }, + { NULL } +}; + + +/* + * t_setflags: change target flags + */ +static int +bhyve_setflags(mdb_tgt_t *tgt, int flags) +{ + bhyve_data_t *bd = tgt->t_data; + + if (((tgt->t_flags ^ flags) & MDB_TGT_F_RDWR) != 0) { + boolean_t writable = (flags & MDB_TGT_F_RDWR) != 0; + + vmm_unmap(bd->bd_vmm); + if (vmm_map(bd->bd_vmm, writable) != 0) { + mdb_warn("failed to map guest memory"); + return (set_errno(EMDB_TGT)); + } + } + + tgt->t_flags = flags; + + return (0); +} + +/* + * t_activate: activate target + */ +static void +bhyve_activate(mdb_tgt_t *tgt) +{ + mdb_tgt_status_t *tsp = &tgt->t_status; + bhyve_data_t *bd = tgt->t_data; + const char *format; + char buf[BUFSIZ]; + + (void) mdb_set_prompt(MDB_DEF_PROMPT); + + (void) mdb_tgt_register_dcmds(tgt, bhyve_dcmds, MDB_MOD_FORCE); + mdb_tgt_register_regvars(tgt, bhyve_kregs, &bhyve_reg_disc, 0); + + (void) vmm_stop(bd->bd_vmm); + + if (mdb_tgt_status(tgt, tsp) != 0) + return; + + if (tsp->st_pc != 0) { + if (mdb_dis_ins2str(mdb.m_disasm, mdb.m_target, + MDB_TGT_AS_VIRT_I, buf, sizeof (buf), tsp->st_pc) != + tsp->st_pc) + format = "target stopped at:\n%-#16a%8T%s\n"; + else + format = "target stopped at %a:\n"; + mdb_warn(format, tsp->st_pc, buf); + } +} + +/* + * t_deactivate: deactivate target + */ +static void +bhyve_deactivate(mdb_tgt_t *tgt) +{ + bhyve_data_t *bd = tgt->t_data; + const mdb_tgt_regdesc_t *rd; + const mdb_dcmd_t *dc; + + for (rd = bhyve_kregs; rd->rd_name != NULL; rd++) { + mdb_var_t *var; + + if (!(rd->rd_flags & MDB_TGT_R_EXPORT)) + continue; /* didn't export register as variable */ + + if ((var 
= mdb_nv_lookup(&mdb.m_nv, rd->rd_name)) != NULL) { + var->v_flags &= ~MDB_NV_PERSIST; + mdb_nv_remove(&mdb.m_nv, var); + } + } + + for (dc = bhyve_dcmds; dc->dc_name != NULL; dc++) + if (mdb_module_remove_dcmd(tgt->t_module, dc->dc_name) == -1) + mdb_warn("failed to remove dcmd %s", dc->dc_name); + + (void) vmm_cont(bd->bd_vmm); +} + +/* + * t_name: return name of target + */ +static const char * +bhyve_name(mdb_tgt_t *tgt) +{ + _NOTE(ARGUNUSED(tgt)); + + return ("bhyve"); +} + +/* + * t_destroy: cleanup target private resources + */ +static void +bhyve_destroy(mdb_tgt_t *tgt) +{ + bhyve_data_t *bd = tgt->t_data; + + (void) vmm_cont(bd->bd_vmm); + vmm_unmap(bd->bd_vmm); + vmm_close_vm(bd->bd_vmm); + mdb_free(bd, sizeof (bhyve_data_t)); + tgt->t_data = NULL; +} + +/* + * t_isa: return name of target ISA + */ +const char * +bhyve_isa(mdb_tgt_t *tgt) +{ + _NOTE(ARGUNUSED(tgt)); + + return ("amd64"); +} + +/* + * t_dmodel: return target data model + */ +static int +bhyve_dmodel(mdb_tgt_t *tgt) +{ + _NOTE(ARGUNUSED(tgt)); + + return (MDB_TGT_MODEL_LP64); +} + +/*ARGSUSED*/ +static ssize_t +bhyve_aread(mdb_tgt_t *tgt, mdb_tgt_as_t as, void *buf, size_t nbytes, + mdb_tgt_addr_t addr) +{ + bhyve_data_t *bd = tgt->t_data; + ssize_t cnt; + + switch ((uintptr_t)as) { + case (uintptr_t)MDB_TGT_AS_VIRT: + cnt = vmm_vread(bd->bd_vmm, bd->bd_curcpu, bd->bd_defseg, buf, + nbytes, addr); + break; + + case (uintptr_t)MDB_TGT_AS_VIRT_I: + cnt = vmm_vread(bd->bd_vmm, bd->bd_curcpu, VMM_DESC_CS, buf, + nbytes, addr); + break; + + case (uintptr_t)MDB_TGT_AS_VIRT_S: + cnt = vmm_vread(bd->bd_vmm, bd->bd_curcpu, VMM_DESC_SS, buf, + nbytes, addr); + break; + + case (uintptr_t)MDB_TGT_AS_PHYS: + cnt = vmm_pread(bd->bd_vmm, buf, nbytes, addr); + break; + + case (uintptr_t)MDB_TGT_AS_FILE: + case (uintptr_t)MDB_TGT_AS_IO: + return (set_errno(EMDB_TGTNOTSUP)); + } + + if (errno == EFAULT) + return (set_errno(EMDB_NOMAP)); + + return (cnt); +} + +/*ARGSUSED*/ +static ssize_t +bhyve_awrite(mdb_tgt_t *tgt, mdb_tgt_as_t as, const void *buf, size_t nbytes, + mdb_tgt_addr_t addr) +{ + bhyve_data_t *bd = tgt->t_data; + ssize_t cnt; + + switch ((uintptr_t)as) { + case (uintptr_t)MDB_TGT_AS_VIRT: + cnt = vmm_vwrite(bd->bd_vmm, bd->bd_curcpu, bd->bd_defseg, buf, + nbytes, addr); + break; + + case (uintptr_t)MDB_TGT_AS_VIRT_I: + cnt = vmm_vwrite(bd->bd_vmm, bd->bd_curcpu, VMM_DESC_CS, buf, + nbytes, addr); + break; + + case (uintptr_t)MDB_TGT_AS_VIRT_S: + cnt = vmm_vwrite(bd->bd_vmm, bd->bd_curcpu, VMM_DESC_SS, buf, + nbytes, addr); + break; + + case (uintptr_t)MDB_TGT_AS_PHYS: + cnt = vmm_pwrite(bd->bd_vmm, buf, nbytes, addr); + break; + + case (uintptr_t)MDB_TGT_AS_FILE: + case (uintptr_t)MDB_TGT_AS_IO: + return (set_errno(EMDB_TGTNOTSUP)); + } + + if (errno == EFAULT) + return (set_errno(EMDB_NOMAP)); + + return (cnt); +} + +/* + * t_vread: read from virtual memory + */ +/*ARGSUSED*/ +static ssize_t +bhyve_vread(mdb_tgt_t *tgt, void *buf, size_t nbytes, uintptr_t addr) +{ + return (bhyve_aread(tgt, MDB_TGT_AS_VIRT, buf, nbytes, addr)); +} + +/* + * t_vwrite: write to virtual memory + */ +/*ARGSUSED*/ +static ssize_t +bhyve_vwrite(mdb_tgt_t *tgt, const void *buf, size_t nbytes, uintptr_t addr) +{ + return (bhyve_awrite(tgt, MDB_TGT_AS_VIRT, buf, nbytes, addr)); +} + +/* + * t_pread: read from physical memory + */ +/*ARGSUSED*/ +static ssize_t +bhyve_pread(mdb_tgt_t *tgt, void *buf, size_t nbytes, physaddr_t addr) +{ + return (bhyve_aread(tgt, MDB_TGT_AS_PHYS, buf, nbytes, addr)); +} + +/* + * t_pwrite: write to physical memory 
+ */ +/*ARGSUSED*/ +static ssize_t +bhyve_pwrite(mdb_tgt_t *tgt, const void *buf, size_t nbytes, physaddr_t addr) +{ + return (bhyve_awrite(tgt, MDB_TGT_AS_PHYS, buf, nbytes, addr)); +} + +/* + * t_fread: read from core/object file + */ +/*ARGSUSED*/ +static ssize_t +bhyve_fread(mdb_tgt_t *tgt, void *buf, size_t nbytes, uintptr_t addr) +{ + return (bhyve_aread(tgt, MDB_TGT_AS_FILE, buf, nbytes, addr)); +} + +/* + * t_fwrite: write to core/object file + */ +/*ARGSUSED*/ +static ssize_t +bhyve_fwrite(mdb_tgt_t *tgt, const void *buf, size_t nbytes, uintptr_t addr) +{ + return (bhyve_awrite(tgt, MDB_TGT_AS_FILE, buf, nbytes, addr)); +} + +/* + * t_ioread: read from I/O space + */ +/*ARGSUSED*/ +static ssize_t +bhyve_ioread(mdb_tgt_t *tgt, void *buf, size_t nbytes, uintptr_t addr) +{ + return (bhyve_aread(tgt, MDB_TGT_AS_IO, buf, nbytes, addr)); +} + +/* + * t_iowrite: write to I/O space + */ +/*ARGSUSED*/ +static ssize_t +bhyve_iowrite(mdb_tgt_t *tgt, const void *buf, size_t nbytes, uintptr_t addr) +{ + return (bhyve_awrite(tgt, MDB_TGT_AS_IO, buf, nbytes, addr)); +} + +/* + * t_vtop: translate virtual to physical address + */ +static int +bhyve_vtop(mdb_tgt_t *tgt, mdb_tgt_as_t as, uintptr_t va, physaddr_t *pa) +{ + bhyve_data_t *bd = tgt->t_data; + int seg; + + switch ((uintptr_t)as) { + case (uintptr_t)MDB_TGT_AS_VIRT: + seg = bd->bd_defseg; + break; + + case (uintptr_t)MDB_TGT_AS_VIRT_I: + seg = VMM_DESC_CS; + break; + + case (uintptr_t)MDB_TGT_AS_VIRT_S: + seg = VMM_DESC_SS; + break; + + default: + return (set_errno(EINVAL)); + } + + if (vmm_vtop(bd->bd_vmm, bd->bd_curcpu, seg, va, pa) != 0) { + if (errno == EFAULT) + return (set_errno(EMDB_NOMAP)); + else + return (-1); + } + + return (0); +} + +/* + * t_status: get target status + */ +static int +bhyve_status(mdb_tgt_t *tgt, mdb_tgt_status_t *tsp) +{ + bhyve_data_t *bd = tgt->t_data; + mdb_tgt_reg_t rip; + vmm_desc_t cs; + int ret; + + bzero(tsp, sizeof (mdb_tgt_status_t)); + + ret = vmm_getreg(bd->bd_vmm, bd->bd_curcpu, KREG_RIP, &rip); + if (ret != 0) { + tsp->st_state = MDB_TGT_UNDEAD; + } else { + tsp->st_state = MDB_TGT_STOPPED; + tsp->st_pc = rip; + } + + switch (vmm_vcpu_isa(bd->bd_vmm, bd->bd_curcpu)) { + case VMM_ISA_16: + (void) mdb_dis_select("ia16"); + break; + case VMM_ISA_32: + (void) mdb_dis_select("ia32"); + break; + case VMM_ISA_64: + (void) mdb_dis_select("amd64"); + break; + default: + break; + } + + return (0); +} + +static void +bhyve_sighdl(int sig, siginfo_t *sip, ucontext_t *ucp, mdb_tgt_t *tgt) +{ + mdb_tgt_status_t *tsp = &tgt->t_status; + bhyve_data_t *bd = tgt->t_data; + + switch (sig) { + case SIGINT: + /* + * vmm_stop() may fail if the VM was destroyed while we were + * waiting. This will be handled by mdb_tgt_status(). + */ + (void) vmm_stop(bd->bd_vmm); + (void) mdb_tgt_status(tgt, tsp); + break; + } +} + +/* + * t_step: single-step target + */ +static int +bhyve_step(mdb_tgt_t *tgt, mdb_tgt_status_t *tsp) +{ + bhyve_data_t *bd = tgt->t_data; + int ret; + + ret = vmm_step(bd->bd_vmm, bd->bd_curcpu); + (void) mdb_tgt_status(tgt, tsp); + + return (ret); +} + +/* + * t_cont: continue target execution + * + * Catch SIGINT so that the target can be stopped with Ctrl-C. 
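+ * + * The sequence is: save the current SIGINT handler, install + * bhyve_sighdl(), resume the VM with vmm_cont(), pause() until a + * signal arrives, then restore the original handler and refresh the + * target status.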
+ */ +static int +bhyve_cont(mdb_tgt_t *tgt, mdb_tgt_status_t *tsp) +{ + bhyve_data_t *bd = tgt->t_data; + mdb_signal_f *intf; + void *intd; + int ret; + + intf = mdb_signal_gethandler(SIGINT, &intd); + (void) mdb_signal_sethandler(SIGINT, (mdb_signal_f *)bhyve_sighdl, tgt); + + if ((ret = vmm_cont(bd->bd_vmm)) != 0) { + mdb_warn("failed to continue target execution: %d", ret); + return (set_errno(EMDB_TGT)); + } + + tsp->st_state = MDB_TGT_RUNNING; + (void) pause(); + + (void) mdb_signal_sethandler(SIGINT, intf, intd); + (void) mdb_tgt_status(tgt, tsp); + + return (ret); +} + +static int +bhyve_lookup_reg(mdb_tgt_t *tgt, const char *rname) +{ + bhyve_data_t *bd = tgt->t_data; + const mdb_tgt_regdesc_t *rd; + + for (rd = bhyve_kregs; rd->rd_name != NULL; rd++) + if (strcmp(rd->rd_name, rname) == 0) + return (rd->rd_num); + + return (-1); +} + +/* + * t_getareg: get the value of a single register + */ +static int +bhyve_getareg(mdb_tgt_t *tgt, mdb_tgt_tid_t tid, const char *rname, + mdb_tgt_reg_t *rp) +{ + bhyve_data_t *bd = tgt->t_data; + int reg = bhyve_lookup_reg(tgt, rname); + int ret; + + if (reg == -1) + return (set_errno(EMDB_BADREG)); + + ret = vmm_getreg(bd->bd_vmm, bd->bd_curcpu, reg, rp); + if (ret == -1) + return (set_errno(EMDB_BADREG)); + + return (0); +} + +/* + * t_putareg: set the value of a single register + */ +static int +bhyve_putareg(mdb_tgt_t *tgt, mdb_tgt_tid_t tid, const char *rname, + mdb_tgt_reg_t r) +{ + bhyve_data_t *bd = tgt->t_data; + int reg = bhyve_lookup_reg(tgt, rname); + int ret; + + if ((tgt->t_flags & MDB_TGT_F_RDWR) == 0) + return (set_errno(EMDB_TGTRDONLY)); + + if (reg == -1) + return (set_errno(EMDB_BADREG)); + + ret = vmm_setreg(bd->bd_vmm, bd->bd_curcpu, reg, r); + if (ret == -1) + return (set_errno(EMDB_BADREG)); + + return (0); +} + +static const mdb_tgt_ops_t bhyve_ops = { + .t_setflags = bhyve_setflags, + .t_setcontext = (int (*)()) mdb_tgt_notsup, + .t_activate = bhyve_activate, + .t_deactivate = bhyve_deactivate, + .t_periodic = (void (*)()) mdb_tgt_nop, + .t_destroy = bhyve_destroy, + .t_name = bhyve_name, + .t_isa = bhyve_isa, + .t_platform = (const char *(*)()) mdb_conf_platform, + .t_uname = (int (*)()) mdb_tgt_notsup, + .t_dmodel = bhyve_dmodel, + .t_aread = bhyve_aread, + .t_awrite = bhyve_awrite, + .t_vread = bhyve_vread, + .t_vwrite = bhyve_vwrite, + .t_pread = bhyve_pread, + .t_pwrite = bhyve_pwrite, + .t_fread = bhyve_fread, + .t_fwrite = bhyve_fwrite, + .t_ioread = bhyve_ioread, + .t_iowrite = bhyve_iowrite, + .t_vtop = bhyve_vtop, + .t_lookup_by_name = (int (*)()) mdb_tgt_notsup, + .t_lookup_by_addr = (int (*)()) mdb_tgt_notsup, + .t_symbol_iter = (int (*)()) mdb_tgt_notsup, + .t_mapping_iter = (int (*)()) mdb_tgt_notsup, + .t_object_iter = (int (*)()) mdb_tgt_notsup, + .t_addr_to_map = (const mdb_map_t *(*)()) mdb_tgt_null, + .t_name_to_map = (const mdb_map_t *(*)()) mdb_tgt_null, + .t_addr_to_ctf = (struct ctf_file *(*)()) mdb_tgt_null, + .t_name_to_ctf = (struct ctf_file *(*)()) mdb_tgt_null, + .t_status = bhyve_status, + .t_run = (int (*)()) mdb_tgt_notsup, + .t_step = bhyve_step, + .t_step_out = (int (*)()) mdb_tgt_notsup, + .t_next = (int (*)()) mdb_tgt_notsup, + .t_cont = bhyve_cont, + .t_signal = (int (*)()) mdb_tgt_notsup, + .t_add_vbrkpt = (int (*)()) mdb_tgt_null, + .t_add_sbrkpt = (int (*)()) mdb_tgt_null, + .t_add_pwapt = (int (*)()) mdb_tgt_null, + .t_add_vwapt = (int (*)()) mdb_tgt_null, + .t_add_iowapt = (int (*)()) mdb_tgt_null, + .t_add_sysenter = (int (*)()) mdb_tgt_null, + .t_add_sysexit = (int (*)())
mdb_tgt_null, + .t_add_signal = (int (*)()) mdb_tgt_null, + .t_add_fault = (int (*)()) mdb_tgt_null, + .t_getareg = bhyve_getareg, + .t_putareg = bhyve_putareg, + .t_stack_iter = (int (*)()) mdb_tgt_notsup, + .t_auxv = (int (*)()) mdb_tgt_notsup +}; + +int +mdb_bhyve_tgt_create(mdb_tgt_t *tgt, int argc, const char *argv[]) +{ + bhyve_data_t *bd; + vmm_t *vmm = NULL; + boolean_t writable = (tgt->t_flags & MDB_TGT_F_RDWR) != 0; + + if (argc != 1) + return (set_errno(EINVAL)); + + vmm = vmm_open_vm(argv[0]); + if (vmm == NULL) { + mdb_warn("failed to open %s", argv[0]); + return (set_errno(EMDB_TGT)); + } + + if (vmm_map(vmm, writable) != 0) { + mdb_warn("failed to map %s", argv[0]); + vmm_close_vm(vmm); + return (set_errno(EMDB_TGT)); + } + + bd = mdb_zalloc(sizeof (bhyve_data_t) + strlen(argv[0]) + 1, UM_SLEEP); + (void) strcpy(bd->bd_name, argv[0]); + bd->bd_vmm = vmm; + bd->bd_curcpu = 0; + bd->bd_defseg = VMM_DESC_DS; + + tgt->t_ops = &bhyve_ops; + tgt->t_data = bd; + tgt->t_flags |= MDB_TGT_F_ASIO; + + (void) mdb_nv_insert(&mdb.m_nv, "cpuid", &bhyve_cpuid_disc, 0, + MDB_NV_PERSIST | MDB_NV_RDONLY); + + return (0); +} diff --git a/usr/src/cmd/mdb/intel/mdb/mdb_ia32util.c b/usr/src/cmd/mdb/intel/mdb/mdb_ia32util.c index d6db4811b2..22e56a0eda 100644 --- a/usr/src/cmd/mdb/intel/mdb/mdb_ia32util.c +++ b/usr/src/cmd/mdb/intel/mdb/mdb_ia32util.c @@ -23,16 +23,18 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2018, Joyent, Inc. All rights reserved. * Copyright 2014 Nexenta Systems, Inc. All rights reserved. */ #include +#include #include #include #include #include +#include #include #include #include @@ -41,10 +43,14 @@ #include #include +#ifndef __amd64 /* * We also define an array of register names and their corresponding * array indices. This is used by the getareg and putareg entry points, * and also by our register variable discipline. + * + * When built into an amd64 mdb this won't be used as it's only a subset of + * mdb_amd64_kregs, hence the #ifdef. 
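+ * + * (An amd64 mdb covers IA-32 contexts through mdb_amd64_kregs instead, + * relying on the KREG_E* to KREG_R* aliases that mdb_kreg.h provides + * in that build.)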
*/ const mdb_tgt_regdesc_t mdb_ia32_kregs[] = { { "savfp", KREG_SAVFP, MDB_TGT_R_EXPORT }, @@ -87,6 +93,7 @@ const mdb_tgt_regdesc_t mdb_ia32_kregs[] = { { "err", KREG_ERR, MDB_TGT_R_EXPORT | MDB_TGT_R_PRIV }, { NULL, 0, 0 } }; +#endif void mdb_ia32_printregs(const mdb_tgt_gregset_t *gregs) @@ -94,27 +101,27 @@ mdb_ia32_printregs(const mdb_tgt_gregset_t *gregs) const kreg_t *kregs = &gregs->kregs[0]; kreg_t eflags = kregs[KREG_EFLAGS]; - mdb_printf("%%cs = 0x%04x\t\t%%eax = 0x%0?p %A\n", + mdb_printf("%%cs = 0x%04x\t\t%%eax = 0x%08p %A\n", kregs[KREG_CS], kregs[KREG_EAX], kregs[KREG_EAX]); - mdb_printf("%%ds = 0x%04x\t\t%%ebx = 0x%0?p %A\n", + mdb_printf("%%ds = 0x%04x\t\t%%ebx = 0x%08p %A\n", kregs[KREG_DS], kregs[KREG_EBX], kregs[KREG_EBX]); - mdb_printf("%%ss = 0x%04x\t\t%%ecx = 0x%0?p %A\n", + mdb_printf("%%ss = 0x%04x\t\t%%ecx = 0x%08p %A\n", kregs[KREG_SS], kregs[KREG_ECX], kregs[KREG_ECX]); - mdb_printf("%%es = 0x%04x\t\t%%edx = 0x%0?p %A\n", + mdb_printf("%%es = 0x%04x\t\t%%edx = 0x%08p %A\n", kregs[KREG_ES], kregs[KREG_EDX], kregs[KREG_EDX]); - mdb_printf("%%fs = 0x%04x\t\t%%esi = 0x%0?p %A\n", + mdb_printf("%%fs = 0x%04x\t\t%%esi = 0x%08p %A\n", kregs[KREG_FS], kregs[KREG_ESI], kregs[KREG_ESI]); - mdb_printf("%%gs = 0x%04x\t\t%%edi = 0x%0?p %A\n\n", + mdb_printf("%%gs = 0x%04x\t\t%%edi = 0x%08p %A\n\n", kregs[KREG_GS], kregs[KREG_EDI], kregs[KREG_EDI]); - mdb_printf("%%eip = 0x%0?p %A\n", kregs[KREG_EIP], kregs[KREG_EIP]); - mdb_printf("%%ebp = 0x%0?p\n", kregs[KREG_EBP]); - mdb_printf("%%esp = 0x%0?p\n\n", kregs[KREG_ESP]); + mdb_printf("%%eip = 0x%08p %A\n", kregs[KREG_EIP], kregs[KREG_EIP]); + mdb_printf("%%ebp = 0x%08p\n", kregs[KREG_EBP]); + mdb_printf("%%esp = 0x%08p\n\n", kregs[KREG_ESP]); mdb_printf("%%eflags = 0x%08x\n", eflags); mdb_printf(" id=%u vip=%u vif=%u ac=%u vm=%u rf=%u nt=%u iopl=0x%x\n", @@ -138,8 +145,8 @@ mdb_ia32_printregs(const mdb_tgt_gregset_t *gregs) (eflags & KREG_EFLAGS_PF_MASK) ? "PF" : "pf", (eflags & KREG_EFLAGS_CF_MASK) ? 
"CF" : "cf"); -#ifndef _KMDB - mdb_printf(" %%uesp = 0x%0?x\n", kregs[KREG_UESP]); +#if !defined(__amd64) && !defined(_KMDB) + mdb_printf(" %%uesp = 0x%08x\n", kregs[KREG_UESP]); #endif mdb_printf("%%trapno = 0x%x\n", kregs[KREG_TRAPNO]); mdb_printf(" %%err = 0x%x\n", kregs[KREG_ERR]); @@ -166,7 +173,8 @@ kvm_argcount(mdb_tgt_t *t, uintptr_t eip, ssize_t size) M_ADD_IMM8 = 0x83 /* ADD imm8 to r/m32 */ }; - if (mdb_tgt_vread(t, ins, sizeof (ins), eip) != sizeof (ins)) + if (mdb_tgt_aread(t, MDB_TGT_AS_VIRT_I, ins, sizeof (ins), eip) != + sizeof (ins)) return (0); if (ins[1] != M_MODRM_ESP) @@ -185,7 +193,7 @@ kvm_argcount(mdb_tgt_t *t, uintptr_t eip, ssize_t size) n = 0; } - return (MIN((ssize_t)n, size) / sizeof (long)); + return (MIN((ssize_t)n, size) / sizeof (uint32_t)); } int @@ -198,9 +206,9 @@ mdb_ia32_kvm_stack_iter(mdb_tgt_t *t, const mdb_tgt_gregset_t *gsp, int err; struct fr { - uintptr_t fr_savfp; - uintptr_t fr_savpc; - long fr_argv[32]; + uintptr32_t fr_savfp; + uintptr32_t fr_savpc; + uint32_t fr_argv[32]; } fr; uintptr_t fp = gsp->kregs[KREG_EBP]; @@ -226,9 +234,9 @@ mdb_ia32_kvm_stack_iter(mdb_tgt_t *t, const mdb_tgt_gregset_t *gsp, err = EMDB_STKALIGN; goto badfp; } - if ((size = mdb_tgt_vread(t, &fr, sizeof (fr), fp)) >= - (ssize_t)(2 * sizeof (uintptr_t))) { - size -= (ssize_t)(2 * sizeof (uintptr_t)); + if ((size = mdb_tgt_aread(t, MDB_TGT_AS_VIRT_S, &fr, + sizeof (fr), fp)) >= (ssize_t)(2 * sizeof (uintptr32_t))) { + size -= (ssize_t)(2 * sizeof (uintptr32_t)); argc = kvm_argcount(t, fr.fr_savpc, size); } else { err = EMDB_NOMAP; @@ -245,8 +253,9 @@ mdb_ia32_kvm_stack_iter(mdb_tgt_t *t, const mdb_tgt_gregset_t *gsp, if (advance_tortoise != 0) { struct fr tfr; - if (mdb_tgt_vread(t, &tfr, sizeof (tfr), - tortoise_fp) != sizeof (tfr)) { + if (mdb_tgt_aread(t, MDB_TGT_AS_VIRT_S, &tfr, + sizeof (tfr), tortoise_fp) != + sizeof (tfr)) { err = EMDB_NOMAP; goto badfp; } @@ -262,7 +271,8 @@ mdb_ia32_kvm_stack_iter(mdb_tgt_t *t, const mdb_tgt_gregset_t *gsp, advance_tortoise = !advance_tortoise; - if (got_pc && func(arg, pc, argc, fr.fr_argv, &gregs) != 0) + if (got_pc && + func(arg, pc, argc, (const long *)fr.fr_argv, &gregs) != 0) break; kregs[KREG_ESP] = kregs[KREG_EBP]; @@ -294,6 +304,12 @@ badfp: return (set_errno(err)); } +#ifndef __amd64 +/* + * The functions mdb_ia32_step_out and mdb_ia32_next haven't yet been adapted + * to work when built for an amd64 mdb. They are unused by the amd64-only bhyve + * target, hence the #ifdef. + */ /* * Determine the return address for the current frame. Typically this is the * fr_savpc value from the current frame, but we also perform some special @@ -321,7 +337,8 @@ mdb_ia32_step_out(mdb_tgt_t *t, uintptr_t *p, kreg_t pc, kreg_t fp, kreg_t sp, fp = sp; } - if (mdb_tgt_vread(t, &fr, sizeof (fr), fp) == sizeof (fr)) { + if (mdb_tgt_aread(t, MDB_TGT_AS_VIRT_S, &fr, sizeof (fr), fp) == + sizeof (fr)) { *p = fr.fr_savpc; return (0); } @@ -372,7 +389,8 @@ mdb_ia32_next(mdb_tgt_t *t, uintptr_t *p, kreg_t pc, mdb_instr_t curinstr) * read the subsequent Mod/RM byte to perform additional decoding. 
*/ if (curinstr == M_CALL_REG) { - if (mdb_tgt_vread(t, &m, sizeof (m), pc + 1) != sizeof (m)) + if (mdb_tgt_aread(t, MDB_TGT_AS_VIRT_I, &m, sizeof (m), pc + 1) + != sizeof (m)) return (-1); /* errno is set for us */ /* @@ -404,13 +422,16 @@ mdb_ia32_next(mdb_tgt_t *t, uintptr_t *p, kreg_t pc, mdb_instr_t curinstr) return (set_errno(EAGAIN)); } +#endif /*ARGSUSED*/ int -mdb_ia32_kvm_frame(void *arglim, uintptr_t pc, uint_t argc, const long *argv, +mdb_ia32_kvm_frame(void *arglim, uintptr_t pc, uint_t argc, const long *largv, const mdb_tgt_gregset_t *gregs) { - argc = MIN(argc, (uint_t)arglim); + const uint32_t *argv = (const uint32_t *)largv; + + argc = MIN(argc, (uintptr_t)arglim); mdb_printf("%a(", pc); if (argc != 0) { @@ -424,11 +445,13 @@ mdb_ia32_kvm_frame(void *arglim, uintptr_t pc, uint_t argc, const long *argv, } int -mdb_ia32_kvm_framev(void *arglim, uintptr_t pc, uint_t argc, const long *argv, +mdb_ia32_kvm_framev(void *arglim, uintptr_t pc, uint_t argc, const long *largv, const mdb_tgt_gregset_t *gregs) { - argc = MIN(argc, (uint_t)arglim); - mdb_printf("%0?lr %a(", gregs->kregs[KREG_EBP], pc); + const uint32_t *argv = (const uint32_t *)largv; + + argc = MIN(argc, (uintptr_t)arglim); + mdb_printf("%08lr %a(", gregs->kregs[KREG_EBP], pc); if (argc != 0) { mdb_printf("%lr", *argv++); diff --git a/usr/src/cmd/mdb/intel/mdb/mdb_ia32util.h b/usr/src/cmd/mdb/intel/mdb/mdb_ia32util.h index 597b234b3e..1645b86d8d 100644 --- a/usr/src/cmd/mdb/intel/mdb/mdb_ia32util.h +++ b/usr/src/cmd/mdb/intel/mdb/mdb_ia32util.h @@ -22,12 +22,13 @@ * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2018, Joyent, Inc. All rights reserved. + */ #ifndef _MDB_IA32UTIL_H #define _MDB_IA32UTIL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include @@ -35,8 +36,6 @@ extern "C" { #endif -typedef uchar_t mdb_instr_t; - extern const mdb_tgt_regdesc_t mdb_ia32_kregs[]; extern void mdb_ia32_printregs(const mdb_tgt_gregset_t *); diff --git a/usr/src/cmd/mdb/intel/mdb/mdb_isautil.h b/usr/src/cmd/mdb/intel/mdb/mdb_isautil.h index b6d17e1045..237e6663ae 100644 --- a/usr/src/cmd/mdb/intel/mdb/mdb_isautil.h +++ b/usr/src/cmd/mdb/intel/mdb/mdb_isautil.h @@ -23,16 +23,19 @@ * Copyright 2004 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2018, Joyent, Inc. All rights reserved. 
+ */ #ifndef _MDB_ISAUTIL_H #define _MDB_ISAUTIL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif +typedef uchar_t mdb_instr_t; + #ifdef __amd64 #include diff --git a/usr/src/cmd/mdb/intel/mdb/mdb_kreg.h b/usr/src/cmd/mdb/intel/mdb/mdb_kreg.h index a3edf864d7..3db7d6a1d6 100644 --- a/usr/src/cmd/mdb/intel/mdb/mdb_kreg.h +++ b/usr/src/cmd/mdb/intel/mdb/mdb_kreg.h @@ -75,6 +75,7 @@ typedef uint32_t kreg_t; #define KREG_ES KDIREG_ES #define KREG_FS KDIREG_FS #define KREG_GS KDIREG_GS +#define KREG_FSBASE KDIREG_FSBASE #define KREG_GSBASE KDIREG_GSBASE #define KREG_KGSBASE KDIREG_KGSBASE #define KREG_TRAPNO KDIREG_TRAPNO @@ -91,6 +92,17 @@ typedef uint32_t kreg_t; #define KREG_SP KREG_RSP #define KREG_FP KREG_RBP +#define KREG_EAX KREG_RAX +#define KREG_EBX KREG_RBX +#define KREG_ECX KREG_RCX +#define KREG_EDX KREG_RDX +#define KREG_ESI KREG_RSI +#define KREG_EDI KREG_RDI +#define KREG_EBP KREG_RBP +#define KREG_ESP KREG_RSP +#define KREG_EFLAGS KREG_RFLAGS +#define KREG_EIP KREG_RIP + #else /* __amd64 */ #define KREG_SAVFP KDIREG_SAVFP diff --git a/usr/src/cmd/mdb/intel/mdb/mdb_x86util.c b/usr/src/cmd/mdb/intel/mdb/mdb_x86util.c new file mode 100644 index 0000000000..a01ee6cffb --- /dev/null +++ b/usr/src/cmd/mdb/intel/mdb/mdb_x86util.c @@ -0,0 +1,215 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +/* + * ISA-independent utility functions for the x86 architecture + */ + +#include +#include + +#include +#include + +#define MMU_PAGESHIFT 12 +#define MMU_PAGESIZE (1 << MMU_PAGESHIFT) +#define MMU_PAGEOFFSET (MMU_PAGESIZE - 1) +#define MMU_PAGEMASK (~MMU_PAGEOFFSET) + +#ifndef _KMDB +static void +mdb_x86_print_desc(const char *name, const mdb_x86_desc_t *desc, uint_t width) +{ + const char *type; + const mdb_bitmask_t *bits; + + static const mdb_bitmask_t mem_desc_flag_bits[] = { + { "P", 0x80, 0x80 }, + { "16b", 0x6000, 0x0 }, + { "32b", 0x6000, 0x4000 }, + { "64b", 0x6000, 0x2000 }, + { "G", 0x8000, 0x8000 }, + { "A", 0x1, 0x1 }, + { NULL, 0, 0 }, + }; + + static const char *mem_desc_types[] = { + "data, up, read-only", + "data, up, read-write", + "data, down, read-only", + "data, down, read-write", + "code, non-conforming, execute-only", + "code, non-conforming, execute-read", + "code, conforming, execute-only", + "code, conforming, execute-read" + }; + + static const mdb_bitmask_t sys_desc_flag_bits[] = { + { "P", 0x80, 0x80 }, + { "16b", 0x6000, 0x0 }, + { "32b", 0x6000, 0x4000 }, + { "64b", 0x6000, 0x2000 }, + { "G", 0x8000, 0x8000 }, + { NULL, 0, 0 }, + }; + + static const char *sys_desc_types[] = { + "reserved", + "16b TSS, available", + "LDT", + "16b TSS, busy", + "16b call gate", + "task gate", + "16b interrupt gate", + "16b trap gate", + "reserved", + "32b/64b TSS, available", + "reserved", + "32b/64b TSS, busy", + "32b/64b call gate", + "reserved", + "32b/64b interrupt gate", + "32b/64b trap gate", + }; + + if (desc->d_acc & 0x10) { + type = mem_desc_types[(desc->d_acc >> 1) & 7]; + bits = mem_desc_flag_bits; + } else { + type = sys_desc_types[desc->d_acc & 0xf]; + bits = sys_desc_flag_bits; + } + + mdb_printf("%%%s = 0x%0*lx/0x%0*x 0x%05x "
+	    "<%susable, %s, dpl %d, flags: %b>\n",
+	    name, width, desc->d_base, width / 2, desc->d_lim, desc->d_acc,
+	    (desc->d_acc >> 16) & 1 ? "un" : "", type,
+	    (desc->d_acc >> 5) & 3, desc->d_acc, bits);
+}
+#endif
+
+void
+mdb_x86_print_sysregs(struct sysregs *sregs, boolean_t long_mode)
+{
+	const uint_t width =
+	    2 * (long_mode ? sizeof (uint64_t) : sizeof (uint32_t));
+
+
+#ifndef _KMDB
+	static const mdb_bitmask_t efer_flag_bits[] = {
+		{ "SCE", AMD_EFER_SCE, AMD_EFER_SCE },
+		{ "LME", AMD_EFER_LME, AMD_EFER_LME },
+		{ "LMA", AMD_EFER_LMA, AMD_EFER_LMA },
+		{ "NXE", AMD_EFER_NXE, AMD_EFER_NXE },
+		{ "SVME", AMD_EFER_SVME, AMD_EFER_SVME },
+		{ "LMSLE", AMD_EFER_LMSLE, AMD_EFER_LMSLE },
+		{ "FFXSR", AMD_EFER_FFXSR, AMD_EFER_FFXSR },
+		{ "TCE", AMD_EFER_TCE, AMD_EFER_TCE },
+		{ NULL, 0, 0 }
+	};
+#endif
+
+	static const mdb_bitmask_t cr0_flag_bits[] = {
+		{ "PE", CR0_PE, CR0_PE },
+		{ "MP", CR0_MP, CR0_MP },
+		{ "EM", CR0_EM, CR0_EM },
+		{ "TS", CR0_TS, CR0_TS },
+		{ "ET", CR0_ET, CR0_ET },
+		{ "NE", CR0_NE, CR0_NE },
+		{ "WP", CR0_WP, CR0_WP },
+		{ "AM", CR0_AM, CR0_AM },
+		{ "NW", CR0_NW, CR0_NW },
+		{ "CD", CR0_CD, CR0_CD },
+		{ "PG", CR0_PG, CR0_PG },
+		{ NULL, 0, 0 }
+	};
+
+	static const mdb_bitmask_t cr3_flag_bits[] = {
+		{ "PCD", CR3_PCD, CR3_PCD },
+		{ "PWT", CR3_PWT, CR3_PWT },
+		{ NULL, 0, 0, }
+	};
+
+	static const mdb_bitmask_t cr4_flag_bits[] = {
+		{ "VME", CR4_VME, CR4_VME },
+		{ "PVI", CR4_PVI, CR4_PVI },
+		{ "TSD", CR4_TSD, CR4_TSD },
+		{ "DE", CR4_DE, CR4_DE },
+		{ "PSE", CR4_PSE, CR4_PSE },
+		{ "PAE", CR4_PAE, CR4_PAE },
+		{ "MCE", CR4_MCE, CR4_MCE },
+		{ "PGE", CR4_PGE, CR4_PGE },
+		{ "PCE", CR4_PCE, CR4_PCE },
+		{ "OSFXSR", CR4_OSFXSR, CR4_OSFXSR },
+		{ "OSXMMEXCPT", CR4_OSXMMEXCPT, CR4_OSXMMEXCPT },
+		{ "UMIP", CR4_UMIP, CR4_UMIP },
+		{ "VMXE", CR4_VMXE, CR4_VMXE },
+		{ "SMXE", CR4_SMXE, CR4_SMXE },
+		{ "FSGSBASE", CR4_FSGSBASE, CR4_FSGSBASE },
+		{ "PCIDE", CR4_PCIDE, CR4_PCIDE },
+		{ "OSXSAVE", CR4_OSXSAVE, CR4_OSXSAVE },
+		{ "SMEP", CR4_SMEP, CR4_SMEP },
+		{ "SMAP", CR4_SMAP, CR4_SMAP },
+		{ "PKE", CR4_PKE, CR4_PKE },
+		{ NULL, 0, 0 }
+	};
+
+#ifndef _KMDB
+	mdb_printf("%%efer = 0x%0lx <%b>\n",
+	    sregs->sr_efer, sregs->sr_efer, efer_flag_bits);
+#endif
+	mdb_printf("%%cr0 = 0x%0lx <%b>\n",
+	    sregs->sr_cr0, sregs->sr_cr0, cr0_flag_bits);
+	mdb_printf("%%cr2 = 0x%0*x <%a>\n", width,
+	    sregs->sr_cr2, sregs->sr_cr2);
+	mdb_printf("%%cr3 = 0x%0lx <pfn:0x%lx ",
+	    sregs->sr_cr3, sregs->sr_cr3 >> MMU_PAGESHIFT);
+	if (sregs->sr_cr4 & CR4_PCIDE)
+		mdb_printf("pcid:%lu>\n", sregs->sr_cr3 & MMU_PAGEOFFSET);
+	else
+		mdb_printf("flags:%b>\n", sregs->sr_cr3, cr3_flag_bits);
+	mdb_printf("%%cr4 = 0x%0lx <%b>\n",
+	    sregs->sr_cr4, sregs->sr_cr4, cr4_flag_bits);
+
+#ifndef _KMDB
+	mdb_printf("\n");
+	mdb_printf("%%pdpte0 = 0x%0?lx\t%%pdpte2 = 0x%0?lx\n",
+	    sregs->sr_pdpte0, sregs->sr_pdpte2);
+	mdb_printf("%%pdpte1 = 0x%0?lx\t%%pdpte3 = 0x%0?lx\n",
+	    sregs->sr_pdpte1, sregs->sr_pdpte3);
+	mdb_printf("\n");
+
+	mdb_printf("%%gdtr = 0x%0*lx/0x%hx\n",
+	    width, sregs->sr_gdtr.d_base, sregs->sr_gdtr.d_lim);
+#else
+	mdb_printf("%%gdtr.base = 0x%0*lx, %%gdtr.limit = 0x%hx\n",
+	    width, sregs->sr_gdtr.d_base, sregs->sr_gdtr.d_lim);
+#endif
+#ifndef _KMDB
+	mdb_printf("%%idtr = 0x%0*lx/0x%hx\n",
+	    width, sregs->sr_idtr.d_base, sregs->sr_idtr.d_lim);
+	mdb_x86_print_desc("ldtr", &sregs->sr_ldtr, width);
+	mdb_x86_print_desc("tr ", &sregs->sr_tr, width);
+	mdb_x86_print_desc("cs ", &sregs->sr_cs, width);
+	mdb_x86_print_desc("ss ", &sregs->sr_ss, width);
+	mdb_x86_print_desc("ds ", &sregs->sr_ds, width);
+	mdb_x86_print_desc("es ", &sregs->sr_es, width);
+	mdb_x86_print_desc("fs ", &sregs->sr_fs, width);
+	mdb_x86_print_desc("gs ", &sregs->sr_gs, width);
+
+	mdb_printf("%%intr_shadow = 0x%lx\n",
+	    sregs->sr_intr_shadow);
+#endif
+}
diff --git a/usr/src/cmd/mdb/intel/mdb/mdb_x86util.h b/usr/src/cmd/mdb/intel/mdb/mdb_x86util.h
new file mode 100644
index 0000000000..7641595d2a
--- /dev/null
+++ b/usr/src/cmd/mdb/intel/mdb/mdb_x86util.h
@@ -0,0 +1,68 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2019 Joyent, Inc.
+ */
+
+#ifndef _MDB_X86UTIL_H
+#define _MDB_X86UTIL_H
+
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct mdb_x86_desc {
+	uint64_t d_base;
+	uint32_t d_lim;
+	uint32_t d_acc;
+} mdb_x86_desc_t;
+
+struct sysregs {
+	uint64_t sr_cr0;
+	uint64_t sr_cr2;
+	uint64_t sr_cr3;
+	uint64_t sr_cr4;
+	uint64_t sr_dr0;
+	uint64_t sr_dr1;
+	uint64_t sr_dr2;
+	uint64_t sr_dr3;
+	uint64_t sr_dr6;
+	uint64_t sr_dr7;
+	uint64_t sr_efer;
+	uint64_t sr_pdpte0;
+	uint64_t sr_pdpte1;
+	uint64_t sr_pdpte2;
+	uint64_t sr_pdpte3;
+	uint64_t sr_intr_shadow;
+	mdb_x86_desc_t sr_gdtr;
+	mdb_x86_desc_t sr_idtr;
+	mdb_x86_desc_t sr_ldtr;
+	mdb_x86_desc_t sr_tr;
+	mdb_x86_desc_t sr_cs;
+	mdb_x86_desc_t sr_ss;
+	mdb_x86_desc_t sr_ds;
+	mdb_x86_desc_t sr_es;
+	mdb_x86_desc_t sr_fs;
+	mdb_x86_desc_t sr_gs;
+};
+
+extern void mdb_x86_print_sysregs(struct sysregs *, boolean_t);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _MDB_X86UTIL_H */
diff --git a/usr/src/cmd/mdb/intel/mdb/proc_amd64dep.c b/usr/src/cmd/mdb/intel/mdb/proc_amd64dep.c
index cb613e2f0c..b19498b96e 100644
--- a/usr/src/cmd/mdb/intel/mdb/proc_amd64dep.c
+++ b/usr/src/cmd/mdb/intel/mdb/proc_amd64dep.c
@@ -38,6 +38,7 @@
 #include
 #include
 #include
+#include
 #include
 #include

@@ -144,7 +145,8 @@ pt_read_instr(mdb_tgt_t *t)
 	const lwpstatus_t *psp = &Pstatus(t->t_pshandle)->pr_lwp;
 	uint8_t ret = 0;

-	(void) mdb_tgt_vread(t, &ret, sizeof (ret), psp->pr_reg[REG_RIP]);
+	(void) mdb_tgt_aread(t, MDB_TGT_AS_VIRT_I, &ret, sizeof (ret),
+	    psp->pr_reg[REG_RIP]);

 	return (ret);
 }
diff --git a/usr/src/cmd/mdb/intel/mdb/proc_ia32dep.c b/usr/src/cmd/mdb/intel/mdb/proc_ia32dep.c
index 7f505b950f..c03a73f31e 100644
--- a/usr/src/cmd/mdb/intel/mdb/proc_ia32dep.c
+++ b/usr/src/cmd/mdb/intel/mdb/proc_ia32dep.c
@@ -24,8 +24,8 @@
  * Use is subject to license terms.
  */
 /*
- * Copyright (c) 2018, Joyent, Inc.
  * Copyright 2019 Doma Gergő Mihály
+ * Copyright 2018 Joyent, Inc.
*/ /* @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -99,7 +100,8 @@ pt_read_instr(mdb_tgt_t *t) const lwpstatus_t *psp = &Pstatus(t->t_pshandle)->pr_lwp; uint8_t ret = 0; - (void) mdb_tgt_vread(t, &ret, sizeof (ret), psp->pr_reg[EIP]); + (void) mdb_tgt_aread(t, MDB_TGT_AS_VIRT_I, &ret, sizeof (ret), + psp->pr_reg[EIP]); return (ret); } diff --git a/usr/src/compat/freebsd/amd64/machine/specialreg.h b/usr/src/compat/freebsd/amd64/machine/specialreg.h index 871573ea6b..ead63aaaab 100644 --- a/usr/src/compat/freebsd/amd64/machine/specialreg.h +++ b/usr/src/compat/freebsd/amd64/machine/specialreg.h @@ -38,6 +38,7 @@ #undef CR4_SMEP #undef CR4_SMAP #undef CR4_PKE +#undef CR4_FSGSBASE #undef CR4_PCIDE #endif /* _SYS_CONTROLREGS_H */ diff --git a/usr/src/lib/Makefile b/usr/src/lib/Makefile index c40721fd55..374c46d532 100644 --- a/usr/src/lib/Makefile +++ b/usr/src/lib/Makefile @@ -282,6 +282,7 @@ SUBDIRS += \ i386_SUBDIRS= \ libfdisk \ libsaveargs \ + libvmm \ libvmmapi sparc_SUBDIRS= \ @@ -508,6 +509,7 @@ HDRSUBDIRS= \ i386_HDRSUBDIRS= \ libfdisk \ libsaveargs \ + libvmm \ libvmmapi sparc_HDRSUBDIRS= \ @@ -694,6 +696,7 @@ libtsnet: libtsol libsecdb libtsol: libsecdb libuuid: libdlpi libv12n: libds libuuid +libvmm: libvmmapi libvolmgt: libadm libvrrpadm: libdladm libscf libvscan: libscf libsecdb diff --git a/usr/src/lib/libvmm/Makefile b/usr/src/lib/libvmm/Makefile new file mode 100644 index 0000000000..66bd60eb46 --- /dev/null +++ b/usr/src/lib/libvmm/Makefile @@ -0,0 +1,43 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2018 Joyent, Inc. +# + +include ../Makefile.lib +include ../Makefile.rootfs + +$(BUILD64)SUBDIRS += $(MACH64) + +HDRS = libvmm.h +HDRDIR = . +CHECKHDRS = $(HDRS:%.h=%.check) + +all:= TARGET= all +install:= TARGET= install +clean:= TARGET= clean +clobber:= TARGET= clobber +lint:= TARGET= lint + +.KEEP_STATE: + +all install clean clobber lint: $(SUBDIRS) + +install_h: $(ROOTHDRS) +check: $(CHECKHDRS) + +$(SUBDIRS): FRC + cd $@; pwd; $(MAKE) $(TARGET) + +FRC: + +include ../Makefile.targ diff --git a/usr/src/lib/libvmm/Makefile.com b/usr/src/lib/libvmm/Makefile.com new file mode 100644 index 0000000000..d85abae8ce --- /dev/null +++ b/usr/src/lib/libvmm/Makefile.com @@ -0,0 +1,51 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2018 Joyent, Inc. +# + +LIBRARY = libvmm.a +VERS = .1 +OBJECTS = libvmm.o list.o + +SRCDIR = . + +include ../../Makefile.lib +include ../../Makefile.rootfs + +LIBS = $(DYNLIB) + +# The FreeBSD compat and contrib headers need to be first in the search +# path, hence we can't just append them to CPPFLAGS. So we assign CPPFLAGS +# directly and pull in CPPFLAGS.master at the appropriate place. 
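+# For example, with the compat directories first in the search path, an
+# '#include <sys/param.h>' in libvmm resolves to the FreeBSD-compat header
+# under $(COMPAT)/freebsd rather than to the native illumos header.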
+CPPFLAGS = -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd \ + -I$(COMPAT)/freebsd/amd64 -I$(CONTRIB)/freebsd/amd64 \ + $(CPPFLAGS.master) -I$(SRC)/uts/i86pc + +LDLIBS += -lc -lvmmapi + +.KEEP_STATE: + +all: $(LIBS) + +lint: lintcheck + +pics/%.o: $(SRC)/common/list/%.c + $(COMPILE.c) -o $@ $< + $(POST_PROCESS_O) + +pics/%.o: ../%.c + $(COMPILE.c) -o $@ $< + $(POST_PROCESS_O) + +# include library targets +include ../../Makefile.targ diff --git a/usr/src/lib/libvmm/amd64/Makefile b/usr/src/lib/libvmm/amd64/Makefile new file mode 100644 index 0000000000..5ba4f14479 --- /dev/null +++ b/usr/src/lib/libvmm/amd64/Makefile @@ -0,0 +1,19 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2018 Joyent, Inc. +# + +include ../Makefile.com +include ../../Makefile.lib.64 + +install: all $(ROOTLIBS64) $(ROOTLINKS64) diff --git a/usr/src/lib/libvmm/libvmm.c b/usr/src/lib/libvmm/libvmm.c new file mode 100644 index 0000000000..dc552a8de0 --- /dev/null +++ b/usr/src/lib/libvmm/libvmm.c @@ -0,0 +1,860 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +/* + * Library for native code to access bhyve VMs, without the need to use + * FreeBSD compat headers + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +typedef struct vmm_memseg vmm_memseg_t; + +#define VMM_MEMSEG_DEVMEM 0x1 + +struct vmm_memseg { + list_node_t vms_list; + int vms_segid; + int vms_prot; + int vms_flags; + uintptr_t vms_gpa; + off_t vms_segoff; + size_t vms_seglen; + size_t vms_maplen; + char vms_name[64]; +}; + +struct vmm { + struct vmctx *vmm_ctx; + list_t vmm_memlist; + char *vmm_mem; + size_t vmm_memsize; + size_t vmm_ncpu; +}; + + +/* + * This code relies on two assumptions: + * - CPUs are never removed from the "active set", not even when suspended. + * A CPU being active just means that it has been used by the guest OS. + * - The CPU numbering is consecutive. + */ +static void +vmm_update_ncpu(vmm_t *vmm) +{ + cpuset_t cpuset; + + assert(vm_active_cpus(vmm->vmm_ctx, &cpuset) == 0); + + for (vmm->vmm_ncpu = 0; + CPU_ISSET(vmm->vmm_ncpu, &cpuset) == 1; + vmm->vmm_ncpu++) + ; +} + +vmm_t * +vmm_open_vm(const char *name) +{ + vmm_t *vmm = NULL; + + vmm = malloc(sizeof (vmm_t)); + if (vmm == NULL) + return (NULL); + + bzero(vmm, sizeof (vmm_t)); + vmm->vmm_mem = MAP_FAILED; + + list_create(&vmm->vmm_memlist, sizeof (vmm_memseg_t), + offsetof(vmm_memseg_t, vms_list)); + + vmm->vmm_ctx = vm_open(name); + if (vmm->vmm_ctx == NULL) { + free(vmm); + return (NULL); + } + + vmm_update_ncpu(vmm); + + /* + * If we open a VM that has just been created we may see a state + * where it has no CPUs configured yet. 
We'll just wait for 10ms + * and retry until we get a non-zero CPU count. + */ + if (vmm->vmm_ncpu == 0) { + do { + (void) usleep(10000); + vmm_update_ncpu(vmm); + } while (vmm->vmm_ncpu == 0); + } + + return (vmm); +} + +void +vmm_close_vm(vmm_t *vmm) +{ + vmm_unmap(vmm); + + list_destroy(&vmm->vmm_memlist); + + if (vmm->vmm_ctx != NULL) + vm_close(vmm->vmm_ctx); + + free(vmm); +} + +static vmm_memseg_t * +vmm_get_memseg(vmm_t *vmm, uintptr_t gpa) +{ + vmm_memseg_t ms, *ret; + int error, flags; + + bzero(&ms, sizeof (vmm_memseg_t)); + ms.vms_gpa = gpa; + error = vm_mmap_getnext(vmm->vmm_ctx, &ms.vms_gpa, &ms.vms_segid, + &ms.vms_segoff, &ms.vms_maplen, &ms.vms_prot, &flags); + if (error) + return (NULL); + + error = vm_get_memseg(vmm->vmm_ctx, ms.vms_segid, &ms.vms_seglen, + ms.vms_name, sizeof (ms.vms_name)); + if (error) + return (NULL); + + /* + * Regular memory segments don't have a name, but devmem segments do. + * We can use that information to set the DEVMEM flag if necessary. + */ + ms.vms_flags = ms.vms_name[0] != '\0' ? VMM_MEMSEG_DEVMEM : 0; + + ret = malloc(sizeof (vmm_memseg_t)); + if (ret == NULL) + return (NULL); + + *ret = ms; + + return (ret); +} + +int +vmm_map(vmm_t *vmm, boolean_t writable) +{ + uintptr_t last_gpa = 0; + vmm_memseg_t *ms; + int prot_write = writable ? PROT_WRITE : 0; + + if (vmm->vmm_mem != MAP_FAILED) { + errno = EINVAL; + return (-1); + } + + assert(list_is_empty(&vmm->vmm_memlist)); + + for (;;) { + ms = vmm_get_memseg(vmm, last_gpa); + + if (ms == NULL) + break; + + last_gpa = ms->vms_gpa + ms->vms_maplen; + list_insert_tail(&vmm->vmm_memlist, ms); + } + + vmm->vmm_mem = mmap(NULL, last_gpa, PROT_NONE, + MAP_PRIVATE | MAP_ANON | MAP_NORESERVE, -1, 0); + + if (vmm->vmm_mem == MAP_FAILED) + goto fail; + + for (ms = list_head(&vmm->vmm_memlist); + ms != NULL; + ms = list_next(&vmm->vmm_memlist, ms)) { + off_t mapoff = ms->vms_gpa; + + if ((ms->vms_flags & VMM_MEMSEG_DEVMEM) && + vm_get_devmem_offset(vmm->vmm_ctx, ms->vms_segid, &mapoff) + != 0) + goto fail; + + vmm->vmm_memsize += ms->vms_maplen; + + if (mmap(vmm->vmm_mem + ms->vms_gpa, ms->vms_maplen, + PROT_READ | prot_write, MAP_SHARED | MAP_FIXED, + vm_get_device_fd(vmm->vmm_ctx), mapoff) == MAP_FAILED) + goto fail; + } + + return (0); + +fail: + vmm_unmap(vmm); + + return (-1); +} + +void +vmm_unmap(vmm_t *vmm) +{ + while (!list_is_empty(&vmm->vmm_memlist)) { + vmm_memseg_t *ms = list_remove_head(&vmm->vmm_memlist); + + if (vmm->vmm_mem != MAP_FAILED) { + (void) munmap(vmm->vmm_mem + ms->vms_gpa, + ms->vms_maplen); + } + + free(ms); + } + + if (vmm->vmm_mem != MAP_FAILED) + (void) munmap(vmm->vmm_mem, vmm->vmm_memsize); + + vmm->vmm_mem = MAP_FAILED; + vmm->vmm_memsize = 0; +} + +ssize_t +vmm_pread(vmm_t *vmm, void *buf, size_t len, uintptr_t addr) +{ + ssize_t count = 0; + vmm_memseg_t *ms; + ssize_t res = len; + + for (ms = list_head(&vmm->vmm_memlist); + ms != NULL && len != 0; + ms = list_next(&vmm->vmm_memlist, ms)) { + + if (addr >= ms->vms_gpa && + addr < ms->vms_gpa + ms->vms_maplen) { + res = (addr + len) - (ms->vms_gpa + ms->vms_maplen); + + if (res < 0) + res = 0; + + bcopy(vmm->vmm_mem + addr, buf, len - res); + count += len - res; + addr += len - res; + len = res; + } + } + + if (res) + errno = EFAULT; + else + errno = 0; + + return (count); +} + +ssize_t +vmm_pwrite(vmm_t *vmm, const void *buf, size_t len, uintptr_t addr) +{ + ssize_t count = 0; + vmm_memseg_t *ms; + ssize_t res = len; + + for (ms = list_head(&vmm->vmm_memlist); + ms != NULL; + ms = list_next(&vmm->vmm_memlist, ms)) 
{ + if (addr >= ms->vms_gpa && + addr < ms->vms_gpa + ms->vms_maplen) { + res = (addr + len) - (ms->vms_gpa + ms->vms_maplen); + + if (res < 0) + res = 0; + + bcopy(buf, vmm->vmm_mem + addr, len - res); + count += len - res; + addr += len - res; + len = res; + } + } + + if (res) + errno = EFAULT; + else + errno = 0; + + return (count); +} + +size_t +vmm_ncpu(vmm_t *vmm) +{ + return (vmm->vmm_ncpu); +} + +size_t +vmm_memsize(vmm_t *vmm) +{ + return (vmm->vmm_memsize); +} + +int +vmm_cont(vmm_t *vmm) +{ + return (vm_resume_cpu(vmm->vmm_ctx, -1)); +} + +int +vmm_step(vmm_t *vmm, int vcpu) +{ + cpuset_t cpuset; + int ret; + + if (vcpu >= vmm->vmm_ncpu) { + errno = EINVAL; + return (-1); + } + + ret = vm_set_capability(vmm->vmm_ctx, vcpu, VM_CAP_MTRAP_EXIT, 1); + if (ret != 0) + return (-1); + + assert(vm_resume_cpu(vmm->vmm_ctx, vcpu) == 0); + + do { + (void) vm_debug_cpus(vmm->vmm_ctx, &cpuset); + } while (!CPU_ISSET(vcpu, &cpuset)); + + (void) vm_set_capability(vmm->vmm_ctx, vcpu, VM_CAP_MTRAP_EXIT, 0); + + return (ret); +} + +int +vmm_stop(vmm_t *vmm) +{ + int ret = vm_suspend_cpu(vmm->vmm_ctx, -1); + + if (ret == 0) + vmm_update_ncpu(vmm); + + return (ret); +} + +/* + * Mapping of KDI-defined registers to vmmapi-defined registers. + * Registers not known to vmmapi use VM_REG_LAST, which is invalid and + * causes an error in vm_{get,set}_register_set(). + * + * This array must be kept in sync with the definitions in kdi_regs.h. + */ +static int vmm_kdi_regmap[] = { + VM_REG_LAST, /* KDIREG_SAVFP */ + VM_REG_LAST, /* KDIREG_SAVPC */ + VM_REG_GUEST_RDI, /* KDIREG_RDI */ + VM_REG_GUEST_RSI, /* KDIREG_RSI */ + VM_REG_GUEST_RDX, /* KDIREG_RDX */ + VM_REG_GUEST_RCX, /* KDIREG_RCX */ + VM_REG_GUEST_R8, /* KDIREG_R8 */ + VM_REG_GUEST_R9, /* KDIREG_R9 */ + VM_REG_GUEST_RAX, /* KDIREG_RAX */ + VM_REG_GUEST_RBX, /* KDIREG_RBX */ + VM_REG_GUEST_RBP, /* KDIREG_RBP */ + VM_REG_GUEST_R10, /* KDIREG_R10 */ + VM_REG_GUEST_R11, /* KDIREG_R11 */ + VM_REG_GUEST_R12, /* KDIREG_R12 */ + VM_REG_GUEST_R13, /* KDIREG_R13 */ + VM_REG_GUEST_R14, /* KDIREG_R14 */ + VM_REG_GUEST_R15, /* KDIREG_R15 */ + VM_REG_LAST, /* KDIREG_FSBASE */ + VM_REG_LAST, /* KDIREG_GSBASE */ + VM_REG_LAST, /* KDIREG_KGSBASE */ + VM_REG_GUEST_CR2, /* KDIREG_CR2 */ + VM_REG_GUEST_CR3, /* KDIREG_CR3 */ + VM_REG_GUEST_DS, /* KDIREG_DS */ + VM_REG_GUEST_ES, /* KDIREG_ES */ + VM_REG_GUEST_FS, /* KDIREG_FS */ + VM_REG_GUEST_GS, /* KDIREG_GS */ + VM_REG_LAST, /* KDIREG_TRAPNO */ + VM_REG_LAST, /* KDIREG_ERR */ + VM_REG_GUEST_RIP, /* KDIREG_RIP */ + VM_REG_GUEST_CS, /* KDIREG_CS */ + VM_REG_GUEST_RFLAGS, /* KDIREG_RFLAGS */ + VM_REG_GUEST_RSP, /* KDIREG_RSP */ + VM_REG_GUEST_SS /* KDIREG_SS */ +}; +CTASSERT(ARRAY_SIZE(vmm_kdi_regmap) == KDIREG_NGREG); + +/* + * Mapping of libvmm-defined registers to vmmapi-defined registers. 
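+ * Entries are indexed by (reg - VMM_REG_OFFSET); see vmm_mapreg() below.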
+ * + * This array must be kept in sync with the definitions in libvmm.h + */ +static int vmm_sys_regmap[] = { + VM_REG_GUEST_CR0, /* VMM_REG_CR0 */ + VM_REG_GUEST_CR2, /* VMM_REG_CR2 */ + VM_REG_GUEST_CR3, /* VMM_REG_CR3 */ + VM_REG_GUEST_CR4, /* VMM_REG_CR4 */ + VM_REG_GUEST_DR0, /* VMM_REG_DR0 */ + VM_REG_GUEST_DR1, /* VMM_REG_DR1 */ + VM_REG_GUEST_DR2, /* VMM_REG_DR2 */ + VM_REG_GUEST_DR3, /* VMM_REG_DR3 */ + VM_REG_GUEST_DR6, /* VMM_REG_DR6 */ + VM_REG_GUEST_DR7, /* VMM_REG_DR7 */ + VM_REG_GUEST_EFER, /* VMM_REG_EFER */ + VM_REG_GUEST_PDPTE0, /* VMM_REG_PDPTE0 */ + VM_REG_GUEST_PDPTE1, /* VMM_REG_PDPTE1 */ + VM_REG_GUEST_PDPTE2, /* VMM_REG_PDPTE2 */ + VM_REG_GUEST_PDPTE3, /* VMM_REG_PDPTE3 */ + VM_REG_GUEST_INTR_SHADOW, /* VMM_REG_INTR_SHADOW */ +}; + +/* + * Mapping of libvmm-defined descriptors to vmmapi-defined descriptors. + * + * This array must be kept in sync with the definitions in libvmm.h + */ +static int vmm_descmap[] = { + VM_REG_GUEST_GDTR, + VM_REG_GUEST_LDTR, + VM_REG_GUEST_IDTR, + VM_REG_GUEST_TR, + VM_REG_GUEST_CS, + VM_REG_GUEST_DS, + VM_REG_GUEST_ES, + VM_REG_GUEST_FS, + VM_REG_GUEST_GS, + VM_REG_GUEST_SS +}; + +static int +vmm_mapreg(int reg) +{ + errno = 0; + + if (reg < 0) + goto fail; + + if (reg < KDIREG_NGREG) + return (vmm_kdi_regmap[reg]); + + if (reg >= VMM_REG_OFFSET && + reg < VMM_REG_OFFSET + ARRAY_SIZE(vmm_sys_regmap)) + return (vmm_sys_regmap[reg - VMM_REG_OFFSET]); + +fail: + errno = EINVAL; + return (VM_REG_LAST); +} + +static int +vmm_mapdesc(int desc) +{ + errno = 0; + + if (desc >= VMM_DESC_OFFSET && + desc < VMM_DESC_OFFSET + ARRAY_SIZE(vmm_descmap)) + return (vmm_descmap[desc - VMM_DESC_OFFSET]); + + errno = EINVAL; + return (VM_REG_LAST); +} + +int +vmm_getreg(vmm_t *vmm, int vcpu, int reg, uint64_t *val) +{ + reg = vmm_mapreg(reg); + + if (reg == VM_REG_LAST) + return (-1); + + return (vm_get_register(vmm->vmm_ctx, vcpu, reg, val)); +} + +int +vmm_setreg(vmm_t *vmm, int vcpu, int reg, uint64_t val) +{ + reg = vmm_mapreg(reg); + + if (reg == VM_REG_LAST) + return (-1); + + return (vm_set_register(vmm->vmm_ctx, vcpu, reg, val)); +} + +int +vmm_get_regset(vmm_t *vmm, int vcpu, size_t nregs, const int *regnums, + uint64_t *regvals) +{ + int *vm_regnums; + int i; + int ret = -1; + + vm_regnums = malloc(sizeof (int) * nregs); + if (vm_regnums == NULL) + return (ret); + + for (i = 0; i != nregs; i++) { + vm_regnums[i] = vmm_mapreg(regnums[i]); + if (vm_regnums[i] == VM_REG_LAST) + goto fail; + } + + ret = vm_get_register_set(vmm->vmm_ctx, vcpu, nregs, vm_regnums, + regvals); + +fail: + free(vm_regnums); + return (ret); +} + +int +vmm_set_regset(vmm_t *vmm, int vcpu, size_t nregs, const int *regnums, + uint64_t *regvals) +{ + int *vm_regnums; + int i; + int ret = -1; + + vm_regnums = malloc(sizeof (int) * nregs); + if (vm_regnums == NULL) + return (ret); + + for (i = 0; i != nregs; i++) { + vm_regnums[i] = vmm_mapreg(regnums[i]); + if (vm_regnums[i] == VM_REG_LAST) + goto fail; + } + + ret = vm_set_register_set(vmm->vmm_ctx, vcpu, nregs, vm_regnums, + regvals); + +fail: + free(vm_regnums); + return (ret); +} + +int +vmm_get_desc(vmm_t *vmm, int vcpu, int desc, vmm_desc_t *vd) +{ + desc = vmm_mapdesc(desc); + if (desc == VM_REG_LAST) + return (-1); + + return (vm_get_desc(vmm->vmm_ctx, vcpu, desc, &vd->vd_base, &vd->vd_lim, + &vd->vd_acc)); +} + +int +vmm_set_desc(vmm_t *vmm, int vcpu, int desc, vmm_desc_t *vd) +{ + desc = vmm_mapdesc(desc); + if (desc == VM_REG_LAST) + return (-1); + + return (vm_set_desc(vmm->vmm_ctx, vcpu, desc, vd->vd_base, 
vd->vd_lim, + vd->vd_acc)); +} + +/* + * Structure to hold MMU state during address translation. + * The contents of vmm_mmu_regnum[] must be kept in sync with this. + */ +typedef struct vmm_mmu { + uint64_t vm_cr0; + uint64_t vm_cr3; + uint64_t vm_cr4; + uint64_t vm_efer; +} vmm_mmu_t; + +static const int vmm_mmu_regnum[] = { + VMM_REG_CR0, + VMM_REG_CR3, + VMM_REG_CR4, + VMM_REG_EFER +}; + +#define X86_PTE_P 0x001ULL +#define X86_PTE_PS 0x080ULL + +#define X86_PTE_PHYSMASK 0x000ffffffffff000ULL +#define X86_PAGE_SHIFT 12 +#define X86_PAGE_SIZE (1ULL << X86_PAGE_SHIFT) + +#define X86_SEG_CODE_DATA (1ULL << 4) +#define X86_SEG_PRESENT (1ULL << 7) +#define X86_SEG_LONG (1ULL << 13) +#define X86_SEG_BIG (1ULL << 14) +#define X86_SEG_GRANULARITY (1ULL << 15) +#define X86_SEG_UNUSABLE (1ULL << 16) + +#define X86_SEG_USABLE (X86_SEG_PRESENT | X86_SEG_CODE_DATA) +#define X86_SEG_USABLE_MASK (X86_SEG_UNUSABLE | X86_SEG_USABLE) + +/* + * vmm_pte2paddr: + * + * Recursively calculate the physical address from a virtual address, + * starting at the given PTE level using the given PTE. + */ +static int +vmm_pte2paddr(vmm_t *vmm, uint64_t pte, boolean_t ia32, int level, + uint64_t vaddr, uint64_t *paddr) +{ + int pte_size = ia32 ? sizeof (uint32_t) : sizeof (uint64_t); + int off_bits = ia32 ? 10 : 9; + boolean_t hugepage = B_FALSE; + uint64_t offset; + uint64_t off_mask, off_shift; + + if (level < 4 && (pte & X86_PTE_P) == 0) { + errno = EFAULT; + return (-1); + } + + off_shift = X86_PAGE_SHIFT + off_bits * level; + off_mask = (1ULL << off_shift) - 1; + + offset = vaddr & off_mask; + + if ((level == 1 || level == 2) && (pte & X86_PTE_PS) != 0) { + hugepage = B_TRUE; + } else { + if (level > 0) { + offset >>= off_shift - off_bits; + offset <<= X86_PAGE_SHIFT - off_bits; + } + off_mask = 0xfff; + } + + *paddr = (pte & X86_PTE_PHYSMASK & ~off_mask) + offset; + + if (level == 0 || hugepage) + return (0); + + pte = 0; + if (vmm_pread(vmm, &pte, pte_size, *paddr) != pte_size) + return (-1); + return (vmm_pte2paddr(vmm, pte, ia32, level - 1, vaddr, paddr)); +} + +static vmm_mode_t +vmm_vcpu_mmu_mode(vmm_t *vmm, int vcpu, vmm_mmu_t *mmu) +{ + if ((mmu->vm_cr0 & CR0_PE) == 0) + return (VMM_MODE_REAL); + else if ((mmu->vm_cr4 & CR4_PAE) == 0) + return (VMM_MODE_PROT); + else if ((mmu->vm_efer & AMD_EFER_LME) == 0) + return (VMM_MODE_PAE); + else + return (VMM_MODE_LONG); +} + +vmm_mode_t +vmm_vcpu_mode(vmm_t *vmm, int vcpu) +{ + vmm_mmu_t mmu = { 0 }; + + if (vmm_get_regset(vmm, vcpu, ARRAY_SIZE(vmm_mmu_regnum), + vmm_mmu_regnum, (uint64_t *)&mmu) != 0) + return (VMM_MODE_UNKNOWN); + + return (vmm_vcpu_mmu_mode(vmm, vcpu, &mmu)); +} + +vmm_isa_t +vmm_vcpu_isa(vmm_t *vmm, int vcpu) +{ + vmm_desc_t cs; + + if (vmm_get_desc(vmm, vcpu, VMM_DESC_CS, &cs) != 0) + return (VMM_ISA_UNKNOWN); + + switch (cs.vd_acc & (X86_SEG_BIG | X86_SEG_LONG)) { + case 0x0: /* 16b code segment */ + return (VMM_ISA_16); + case X86_SEG_LONG: /* 64b code segment */ + return (VMM_ISA_64); + case X86_SEG_BIG: /* 32b code segment */ + return (VMM_ISA_32); + } + + return (VMM_ISA_UNKNOWN); +} + +/* + * vmm_vtol: + * + * Translate a virtual address to a physical address on a certain vCPU, + * using the specified segment register or descriptor according to the mode. 
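+ * In real, protected and PAE modes the descriptor must be usable and the
+ * address is checked against the segment limit (scaled by the page size
+ * when the granularity bit is set); in long mode segmentation is flat and
+ * only the segment base is added.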
+ * + */ +int +vmm_vtol(vmm_t *vmm, int vcpu, int seg, uint64_t vaddr, uint64_t *laddr) +{ + vmm_desc_t desc; + uint64_t limit; + + if (vmm_get_desc(vmm, vcpu, seg, &desc) != 0) + return (-1); + + switch (vmm_vcpu_mode(vmm, vcpu)) { + case VMM_MODE_REAL: + if (seg == VMM_DESC_FS || seg == VMM_DESC_GS) + goto fault; + /* FALLTHRU */ + case VMM_MODE_PROT: + case VMM_MODE_PAE: + if ((desc.vd_acc & X86_SEG_USABLE_MASK) != X86_SEG_USABLE) + /* unusable, system segment, or not present */ + goto fault; + + limit = desc.vd_lim; + if (desc.vd_acc & X86_SEG_GRANULARITY) + limit *= 4096; + + if (vaddr > limit) + goto fault; + /* FALLTHRU */ + case VMM_MODE_LONG: + *laddr = desc.vd_base + vaddr; + return (0); + + default: + fault: + errno = EFAULT; + return (-1); + } + +} + +/* + * vmm_vtop: + * + * Translate a virtual address to a guest physical address on a certain vCPU, + * according to the mode the vCPU is in. + */ +int +vmm_vtop(vmm_t *vmm, int vcpu, int seg, uint64_t vaddr, uint64_t *paddr) +{ + vmm_mmu_t mmu = { 0 }; + int ret = 0; + + if (vmm_vtol(vmm, vcpu, seg, vaddr, &vaddr) != 0) + return (-1); + + if (vmm_get_regset(vmm, vcpu, ARRAY_SIZE(vmm_mmu_regnum), + vmm_mmu_regnum, (uint64_t *)&mmu) != 0) + return (-1); + + if ((mmu.vm_cr0 & CR0_PG) == 0) { + /* no paging, physical equals virtual */ + *paddr = vaddr; + return (0); + } + + switch (vmm_vcpu_mmu_mode(vmm, vcpu, &mmu)) { + case VMM_MODE_PROT: + /* protected mode, no PAE: 2-level paging, 32bit PTEs */ + ret = vmm_pte2paddr(vmm, mmu.vm_cr3, B_TRUE, 2, vaddr, paddr); + break; + case VMM_MODE_PAE: + /* protected mode with PAE: 3-level paging, 64bit PTEs */ + ret = vmm_pte2paddr(vmm, mmu.vm_cr3, B_FALSE, 3, vaddr, paddr); + break; + case VMM_MODE_LONG: + /* long mode: 4-level paging, 64bit PTEs */ + ret = vmm_pte2paddr(vmm, mmu.vm_cr3, B_FALSE, 4, vaddr, paddr); + break; + default: + ret = -1; + } + + return (ret); +} + +ssize_t +vmm_vread(vmm_t *vmm, int vcpu, int seg, void *buf, size_t len, uintptr_t addr) +{ + ssize_t res = 0; + uint64_t paddr; + size_t plen; + uint64_t boundary; + + while (len != 0) { + if (vmm_vtop(vmm, vcpu, seg, addr, &paddr) != 0) { + errno = EFAULT; + return (0); + } + + boundary = (addr + X86_PAGE_SIZE) & ~(X86_PAGE_SIZE - 1); + if (addr + len > boundary) + plen = boundary - addr; + else + plen = len; + + if (vmm_pread(vmm, buf, plen, paddr) != plen) + return (0); + len -= plen; + addr += plen; + buf += plen; + res += plen; + } + + return (res); +} + +ssize_t +vmm_vwrite(vmm_t *vmm, int vcpu, int seg, const void *buf, size_t len, + uintptr_t addr) +{ + ssize_t res = 0; + uint64_t paddr; + size_t plen; + uint64_t boundary; + + while (len != 0) { + if (vmm_vtop(vmm, vcpu, seg, addr, &paddr) != 0) { + errno = EFAULT; + return (0); + } + + boundary = (addr + X86_PAGE_SIZE) & ~(X86_PAGE_SIZE - 1); + if (addr + len > boundary) + plen = boundary - addr; + else + plen = len; + + if (vmm_pwrite(vmm, buf, plen, paddr) != plen) + return (0); + len -= plen; + addr += plen; + buf += plen; + res += plen; + } + + return (res); +} diff --git a/usr/src/lib/libvmm/libvmm.h b/usr/src/lib/libvmm/libvmm.h new file mode 100644 index 0000000000..352b09e970 --- /dev/null +++ b/usr/src/lib/libvmm/libvmm.h @@ -0,0 +1,122 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#ifndef _LIBVMM_H +#define _LIBVMM_H + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct vmm vmm_t; + +typedef struct vmm_desc { + uint64_t vd_base; + uint32_t vd_lim; + uint32_t vd_acc; +} vmm_desc_t; + + +/* + * This enum must be kept in sync with vmm_sys_regmap[] in libvmm.c. + */ +#define VMM_REG_OFFSET 0x100 +enum vmm_regs { + VMM_REG_CR0 = VMM_REG_OFFSET, + VMM_REG_CR2, + VMM_REG_CR3, + VMM_REG_CR4, + VMM_REG_DR0, + VMM_REG_DR1, + VMM_REG_DR2, + VMM_REG_DR3, + VMM_REG_DR6, + VMM_REG_DR7, + VMM_REG_EFER, + VMM_REG_PDPTE0, + VMM_REG_PDPTE1, + VMM_REG_PDPTE2, + VMM_REG_PDPTE3, + VMM_REG_INTR_SHADOW +}; + +/* + * This enum must be kept in sync with vmm_descmap[] in libvmm.c. + */ +#define VMM_DESC_OFFSET 0x200 +enum vmm_descs { + VMM_DESC_GDTR = VMM_DESC_OFFSET, + VMM_DESC_LDTR, + VMM_DESC_IDTR, + VMM_DESC_TR, + VMM_DESC_CS, + VMM_DESC_DS, + VMM_DESC_ES, + VMM_DESC_FS, + VMM_DESC_GS, + VMM_DESC_SS +}; + +typedef enum { + VMM_MODE_UNKNOWN = 0, + VMM_MODE_REAL, + VMM_MODE_PROT, + VMM_MODE_PAE, + VMM_MODE_LONG +} vmm_mode_t; + +typedef enum { + VMM_ISA_UNKNOWN = 0, + VMM_ISA_16, + VMM_ISA_32, + VMM_ISA_64 +} vmm_isa_t; + +vmm_t *vmm_open_vm(const char *); +void vmm_close_vm(vmm_t *); + +int vmm_map(vmm_t *, boolean_t); +void vmm_unmap(vmm_t *); + +ssize_t vmm_pread(vmm_t *, void *, size_t, uintptr_t); +ssize_t vmm_pwrite(vmm_t *, const void *, size_t, uintptr_t); +ssize_t vmm_vread(vmm_t *, int, int, void *, size_t, uintptr_t); +ssize_t vmm_vwrite(vmm_t *, int, int, const void *, size_t, uintptr_t); + +size_t vmm_ncpu(vmm_t *); +size_t vmm_memsize(vmm_t *); + +int vmm_cont(vmm_t *); +int vmm_step(vmm_t *, int); +int vmm_stop(vmm_t *); + +int vmm_getreg(vmm_t *, int, int, uint64_t *); +int vmm_setreg(vmm_t *, int, int, uint64_t); +int vmm_get_regset(vmm_t *, int, size_t, const int *, uint64_t *); +int vmm_set_regset(vmm_t *, int, size_t, const int *, uint64_t *); + +int vmm_get_desc(vmm_t *, int, int, vmm_desc_t *); +int vmm_set_desc(vmm_t *, int, int, vmm_desc_t *); + +vmm_mode_t vmm_vcpu_mode(vmm_t *, int); +vmm_isa_t vmm_vcpu_isa(vmm_t *, int); +int vmm_vtol(vmm_t *, int, int, uint64_t, uint64_t *); +int vmm_vtop(vmm_t *, int, int, uint64_t, uint64_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _LIBVMM_H */ diff --git a/usr/src/lib/libvmm/mapfile-vers b/usr/src/lib/libvmm/mapfile-vers new file mode 100644 index 0000000000..19a15802ac --- /dev/null +++ b/usr/src/lib/libvmm/mapfile-vers @@ -0,0 +1,60 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2018 Joyent, Inc. +# + +# +# MAPFILE HEADER START +# +# WARNING: STOP NOW. DO NOT MODIFY THIS FILE. +# Object versioning must comply with the rules detailed in +# +# usr/src/lib/README.mapfiles +# +# You should not be making modifications here until you've read the most current +# copy of that file. If you need help, contact a gatekeeper for guidance. 
+# +# MAPFILE HEADER END +# + +$mapfile_version 2 + +SYMBOL_VERSION ILLUMOSprivate { + global: + vmm_open_vm; + vmm_close_vm; + vmm_map; + vmm_unmap; + vmm_pread; + vmm_pwrite; + vmm_ncpu; + vmm_memsize; + vmm_cont; + vmm_step; + vmm_stop; + vmm_setreg; + vmm_getreg; + vmm_set_regset; + vmm_get_regset; + vmm_set_desc; + vmm_get_desc; + vmm_vcpu_isa; + vmm_vcpu_mode; + vmm_vtol; + vmm_vtop; + vmm_vread; + vmm_vwrite; + + local: + *; +}; diff --git a/usr/src/lib/libvmmapi/common/mapfile-vers b/usr/src/lib/libvmmapi/common/mapfile-vers index 397ebd7d59..f8fe636386 100644 --- a/usr/src/lib/libvmmapi/common/mapfile-vers +++ b/usr/src/lib/libvmmapi/common/mapfile-vers @@ -40,6 +40,7 @@ SYMBOL_VERSION ILLUMOSprivate { vm_assign_pptdev; vm_capability_name2type; vm_capability_type2name; + vm_close; vm_copy_setup; vm_copy_teardown; vm_copyin; @@ -53,6 +54,7 @@ SYMBOL_VERSION ILLUMOSprivate { vm_get_capability; vm_get_desc; vm_get_device_fd; + vm_get_devmem_offset; vm_get_gpa_pmap; vm_get_hpet_capabilities; vm_get_highmem_size; diff --git a/usr/src/lib/libvmmapi/common/vmmapi.c b/usr/src/lib/libvmmapi/common/vmmapi.c index ceac495746..bae214aba0 100644 --- a/usr/src/lib/libvmmapi/common/vmmapi.c +++ b/usr/src/lib/libvmmapi/common/vmmapi.c @@ -179,10 +179,32 @@ vm_open(const char *name) return (vm); err: +#ifdef __FreeBSD__ vm_destroy(vm); +#else + /* + * As libvmmapi is used by other programs to query and control bhyve + * VMs, destroying a VM just because the open failed isn't useful. We + * have to free what we have allocated, though. + */ + free(vm); +#endif return (NULL); } +#ifndef __FreeBSD__ +void +vm_close(struct vmctx *vm) +{ + assert(vm != NULL); + assert(vm->fd >= 0); + + (void) close(vm->fd); + + free(vm); +} +#endif + void vm_destroy(struct vmctx *vm) { @@ -551,6 +573,22 @@ vm_get_highmem_size(struct vmctx *ctx) return (ctx->highmem); } +#ifndef __FreeBSD__ +int +vm_get_devmem_offset(struct vmctx *ctx, int segid, off_t *mapoff) +{ + struct vm_devmem_offset vdo; + int error; + + vdo.segid = segid; + error = ioctl(ctx->fd, VM_DEVMEM_GETOFFSET, &vdo); + if (error == 0) + *mapoff = vdo.offset; + + return (error); +} +#endif + void * vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len) { @@ -583,17 +621,8 @@ vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len) if (fd < 0) goto done; #else - { - struct vm_devmem_offset vdo; - - vdo.segid = segid; - error = ioctl(ctx->fd, VM_DEVMEM_GETOFFSET, &vdo); - if (error == 0) { - mapoff = vdo.offset; - } else { - goto done; - } - } + if (vm_get_devmem_offset(ctx, segid, &mapoff) != 0) + goto done; #endif /* diff --git a/usr/src/lib/libvmmapi/common/vmmapi.h b/usr/src/lib/libvmmapi/common/vmmapi.h index 1b08a9cae5..6cb7a1186d 100644 --- a/usr/src/lib/libvmmapi/common/vmmapi.h +++ b/usr/src/lib/libvmmapi/common/vmmapi.h @@ -114,6 +114,13 @@ int vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid, void *vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len); +#ifndef __FreeBSD__ +/* + * Return the map offset for the device memory segment 'segid'. + */ +int vm_get_devmem_offset(struct vmctx *ctx, int segid, off_t *mapoff); +#endif + /* * Map the memory segment identified by 'segid' into the guest address space * at [gpa,gpa+len) with protection 'prot'. 
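Taken together, these non-FreeBSD additions let a management tool attach to an
existing VM and detach again without destroying it. A minimal sketch of the
intended call pattern follows; the VM name "testvm", segment id 0, and the
error handling are illustrative assumptions, not part of this change:

	#include <stdio.h>
	#include <sys/types.h>

	#include <vmmapi.h>

	int
	inspect_vm(void)
	{
		struct vmctx *ctx;
		off_t mapoff;

		/* Attach to an existing VM by name; this does not create it. */
		if ((ctx = vm_open("testvm")) == NULL)
			return (-1);

		/* Query the devmem map offset for segment 0, if one is defined. */
		if (vm_get_devmem_offset(ctx, 0, &mapoff) == 0)
			(void) printf("segid 0 maps at offset 0x%llx\n",
			    (long long)mapoff);

		/* Close the device fd; the VM itself keeps running. */
		vm_close(ctx);
		return (0);
	}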
@@ -124,6 +131,9 @@ int vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, int vm_create(const char *name); int vm_get_device_fd(struct vmctx *ctx); struct vmctx *vm_open(const char *name); +#ifndef __FreeBSD__ +void vm_close(struct vmctx *ctx); +#endif void vm_destroy(struct vmctx *ctx); int vm_parse_memsize(const char *optarg, size_t *memsize); int vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s); diff --git a/usr/src/man/man1/mdb.1 b/usr/src/man/man1/mdb.1 index 8fdd91e661..54f00f4b61 100644 --- a/usr/src/man/man1/mdb.1 +++ b/usr/src/man/man1/mdb.1 @@ -5,13 +5,13 @@ .\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. See the License for the specific language governing permissions and limitations under the License. .\" When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] -.TH MDB 1 "Feb 21, 2019" +.TH MDB 1 "May 20, 2020" .SH NAME mdb \- modular debugger .SH SYNOPSIS .LP .nf -\fBmdb\fR [\fB-fkmuwyAFKMSUW\fR] [\(+-o \fIoption\fR] [\fB-p\fR \fIpid\fR] [\fB-s\fR \fIdistance\fR] +\fBmdb\fR [\fB-fkmuwyAFKMSUW\fR] [\(+-o \fIoption\fR] [\fB-b\fR \fIVM\fR] [\fB-p\fR \fIpid\fR] [\fB-s\fR \fIdistance\fR] [\fB-I\fR \fIpath\fR] [\fB-L\fR \fIpath\fR] [\fB-P\fR \fIprompt\fR] [\fB-R\fR \fIroot\fR] [\fB-V\fR \fIdis-version\fR] [\fB-e\fR \fIexpr\fR] [object [core] | core | suffix] .fi @@ -50,12 +50,13 @@ different contexts, including live and post-mortem. .LP The \fItarget\fR is the program being inspected by the debugger. \fBmdb\fR currently provides support for the following types of targets: user processes, -user process core files, the live operating system (via \fB/dev/kmem\fR and -\fB/dev/ksyms\fR), operating system crash dumps, user process images recorded -inside an operating system crash dump, \fBELF\fR object files, and raw binary -files. Each target exports a standard set of properties, including one or more -address spaces, one or more symbol tables, a set of load objects, and a set of -threads that can be examined using the debugger commands described below. +user process core files, live bhyve VMs, the live operating system (via +\fB/dev/kmem\fR and \fB/dev/ksyms\fR), operating system crash dumps, user +process images recorded inside an operating system crash dump, \fBELF\fR object +files, and raw binary files. Each target exports a standard set of properties, +including one or more address spaces, one or more symbol tables, a set of load +objects, and a set of threads that can be examined using the debugger commands +described below. .sp .LP A debugger command, or \fIdcmd\fR (pronounced dee-command) in \fBmdb\fR @@ -3695,6 +3696,15 @@ process or core file, or to the loaded kernel modules in the live operating system or an operating system crash dump. .RE +.sp +.ne 2 +.na +\fB\fB-b\fR \fIVM\fR\fR +.ad +.RS 15n +Attaches to and stops the specified bhyve VM. 
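+For example, \fBmdb \-b testvm\fR attaches to the running bhyve instance
+named \fItestvm\fR and stops it.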
+.RE + .sp .ne 2 .na diff --git a/usr/src/pkg/manifests/system-library-bhyve.mf b/usr/src/pkg/manifests/system-library-bhyve.mf index 3c7e52c938..5408778c90 100644 --- a/usr/src/pkg/manifests/system-library-bhyve.mf +++ b/usr/src/pkg/manifests/system-library-bhyve.mf @@ -28,6 +28,7 @@ dir path=lib/$(ARCH64) group=bin dir path=usr group=sys dir path=usr/lib group=bin dir path=usr/lib/$(ARCH64) group=bin +file path=lib/$(ARCH64)/libvmm.so.1 file path=lib/$(ARCH64)/libvmmapi.so.1 file path=usr/lib/$(ARCH64)/libppt.so.1 file path=usr/lib/libppt.so.1 diff --git a/usr/src/uts/i86pc/io/vmm/vmm.c b/usr/src/uts/i86pc/io/vmm/vmm.c index dd24a18f6a..47a5f26cb7 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm.c +++ b/usr/src/uts/i86pc/io/vmm/vmm.c @@ -2116,6 +2116,10 @@ restart: break; } + case VM_EXITCODE_MTRAP: + vm_suspend_cpu(vm, vcpuid); + retu = true; + break; #endif default: retu = true; /* handled in userland */ diff --git a/usr/src/uts/intel/sys/controlregs.h b/usr/src/uts/intel/sys/controlregs.h index 43b5247274..0be7b3b650 100644 --- a/usr/src/uts/intel/sys/controlregs.h +++ b/usr/src/uts/intel/sys/controlregs.h @@ -86,8 +86,8 @@ extern "C" { /* CR3 Register */ -#define CR3_PCD 0x00000010 /* cache disable */ -#define CR3_PWT 0x00000008 /* write through */ +#define CR3_PCD 0x00000010 /* cache disable */ +#define CR3_PWT 0x00000008 /* write through */ #if defined(_ASM) #define CR3_NOINVL_BIT 0x8000000000000000 #else @@ -110,18 +110,22 @@ extern "C" { #define CR4_PCE 0x0100 /* perf-monitoring counter enable */ #define CR4_OSFXSR 0x0200 /* OS fxsave/fxrstor support */ #define CR4_OSXMMEXCPT 0x0400 /* OS unmasked exception support */ - /* 0x0800 reserved */ +#define CR4_UMIP 0x0800 /* user-mode instruction prevention */ /* 0x1000 reserved */ -#define CR4_VMXE 0x2000 -#define CR4_SMXE 0x4000 +#define CR4_VMXE 0x2000 /* VMX enable */ +#define CR4_SMXE 0x4000 /* SMX enable */ + /* 0x8000 reserved */ +#define CR4_FSGSBASE 0x10000 /* FSGSBASE enable */ #define CR4_PCIDE 0x20000 /* PCID enable */ #define CR4_OSXSAVE 0x40000 /* OS xsave/xrestore support */ #define CR4_SMEP 0x100000 /* NX for user pages in kernel */ #define CR4_SMAP 0x200000 /* kernel can't access user pages */ +#define CR4_PKE 0x400000 /* protection key enable */ #define FMT_CR4 \ - "\20\26smap\25smep\23osxsav\22pcide" \ - "\17smxe\16vmxe\13xmme\12fxsr\11pce\10pge" \ + "\20\27pke\26smap\25smep\23osxsav" \ + "\22pcide\20fsgsbase\17smxe\16vmxe" \ + "\14umip\13xmme\12fxsr\11pce\10pge" \ "\7mce\6pae\5pse\4de\3tsd\2pvi\1vme" /* @@ -158,7 +162,9 @@ extern "C" { #define MSR_AMD_EFER 0xc0000080 /* extended feature enable MSR */ +#define AMD_EFER_TCE 0x8000 /* translation cache extension */ #define AMD_EFER_FFXSR 0x4000 /* fast fxsave/fxrstor */ +#define AMD_EFER_LMSLE 0x2000 /* long mode segment limit enable */ #define AMD_EFER_SVME 0x1000 /* svm enable */ #define AMD_EFER_NXE 0x0800 /* no-execute enable */ #define AMD_EFER_LMA 0x0400 /* long mode active (read-only) */ @@ -166,7 +172,7 @@ extern "C" { #define AMD_EFER_SCE 0x0001 /* system call extensions */ #define FMT_AMD_EFER \ - "\20\17ffxsr\15svme\14nxe\13lma\11lme\1sce" + "\20\20tce\17ffxsr\16lmsle\15svme\14nxe\13lma\11lme\1sce" /* AMD's SYSCFG register */ diff --git a/usr/src/uts/intel/sys/debugreg.h b/usr/src/uts/intel/sys/debugreg.h index b537076d26..8528a293ab 100644 --- a/usr/src/uts/intel/sys/debugreg.h +++ b/usr/src/uts/intel/sys/debugreg.h @@ -26,6 +26,9 @@ /* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. 
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */ /* All Rights Reserved */ +/* + * Copyright (c) 2018, Joyent, Inc. All rights reserved. + */ #ifndef _SYS_DEBUGREG_H #define _SYS_DEBUGREG_H @@ -57,6 +60,7 @@ extern "C" { #define DR_ICEALSO 0x2000 /* Flag bit reserved for in-circuit-emulator */ #define DR_SINGLESTEP 0x4000 /* Trap resulting from the single-step flag */ #define DR_TASKSWITCH 0x8000 /* Trap resulting from a task-switch */ +#define DR_IN_RTM 0x10000 /* Trap inside an RTM region */ /* * dr7 controls the rest of the debug registers. @@ -73,6 +77,8 @@ extern "C" { #define DR_CONTROL_RESERVED 0xFC00 /* Bits reserved by Intel */ #define DR_LOCAL_SLOWDOWN 0x100 /* Slow the pipeline for ldt addrs */ #define DR_GLOBAL_SLOWDOWN 0x200 /* Slow the pipeline for gdt addrs */ +#define DR_RTM 0x800 /* Restricted Transactional Memory */ +#define DR_GENERAL_DETECT 0x2000 /* General Detect Enable */ #define DR_LOCAL_ENABLE_SHIFT 0 /* Additional shift: local enable */ #define DR_GLOBAL_ENABLE_SHIFT 1 /* Additional shift: global enable */ @@ -95,6 +101,7 @@ extern "C" { #define DR_LEN_1 0x0 /* Settings for data length */ #define DR_LEN_2 0x4 #define DR_LEN_4 0xC +#define DR_LEN_8 0x8 #ifdef __cplusplus } -- cgit v1.2.3 From 84659b24a533984de271059abf9a1092835d15a9 Mon Sep 17 00:00:00 2001 From: Michael Zeller Date: Wed, 11 Mar 2020 16:55:43 -0400 Subject: 12735 bhyve upstream sync 2019 Sept Reviewed by: Dan McDonald Reviewed by: John Levon Reviewed by: Patrick Mooney Approved by: Robert Mustacchi --- exception_lists/cstyle | 7 + exception_lists/hdrchk | 2 + exception_lists/wscheck | 7 + usr/src/cmd/bhyve/Makefile | 10 + usr/src/cmd/bhyve/audio.c | 285 +++++ usr/src/cmd/bhyve/audio.h | 88 ++ usr/src/cmd/bhyve/gdb.c | 13 +- usr/src/cmd/bhyve/hda_codec.c | 952 ++++++++++++++++ usr/src/cmd/bhyve/hda_reg.h | 1369 +++++++++++++++++++++++ usr/src/cmd/bhyve/hdac_reg.h | 271 +++++ usr/src/cmd/bhyve/mevent.c | 2 +- usr/src/cmd/bhyve/net_backends.c | 807 +++++++++++++ usr/src/cmd/bhyve/net_backends.h | 89 ++ usr/src/cmd/bhyve/net_utils.c | 89 ++ usr/src/cmd/bhyve/net_utils.h | 39 + usr/src/cmd/bhyve/pci_e82545.c | 47 +- usr/src/cmd/bhyve/pci_emul.c | 96 +- usr/src/cmd/bhyve/pci_emul.h | 1 + usr/src/cmd/bhyve/pci_fbuf.c | 17 +- usr/src/cmd/bhyve/pci_hda.c | 1331 ++++++++++++++++++++++ usr/src/cmd/bhyve/pci_hda.h | 92 ++ usr/src/cmd/bhyve/pci_nvme.c | 108 +- usr/src/cmd/bhyve/pci_virtio_console.c | 9 +- usr/src/cmd/bhyve/pci_virtio_net.c | 132 +-- usr/src/cmd/bhyve/pci_virtio_scsi.c | 10 +- usr/src/cmd/bhyve/pci_xhci.c | 28 +- usr/src/cmd/bhyve/rfb.c | 30 +- usr/src/cmd/bhyve/uart_emul.c | 7 +- usr/src/cmd/bhyve/virtio.c | 3 +- usr/src/cmd/bhyve/virtio.h | 21 + usr/src/compat/freebsd/net/ieee_oui.h | 85 ++ usr/src/compat/freebsd/sys/param.h | 2 +- usr/src/compat/freebsd/sys/pcpu.h | 4 +- usr/src/uts/i86pc/io/vmm/README.sync | 32 +- usr/src/uts/i86pc/io/vmm/amd/svm.c | 48 +- usr/src/uts/i86pc/io/vmm/intel/vmx.c | 46 +- usr/src/uts/i86pc/io/vmm/intel/vmx_msr.c | 21 +- usr/src/uts/i86pc/io/vmm/intel/vtd.c | 123 +- usr/src/uts/i86pc/io/vmm/io/vatpit.c | 11 +- usr/src/uts/i86pc/io/vmm/io/vlapic.c | 5 +- usr/src/uts/i86pc/io/vmm/vm/vm_page.h | 6 +- usr/src/uts/i86pc/io/vmm/vmm.c | 32 +- usr/src/uts/i86pc/io/vmm/vmm_host.h | 7 +- usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c | 95 ++ usr/src/uts/i86pc/io/vmm/vmm_lapic.c | 14 +- usr/src/uts/i86pc/io/vmm/vmm_lapic.h | 2 +- usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c | 28 +- usr/src/uts/i86pc/io/vmm/vmm_util.c | 20 +- usr/src/uts/i86pc/io/vmm/vmm_util.h | 6 +- 
usr/src/uts/i86pc/sys/vmm.h | 39 +- 50 files changed, 6176 insertions(+), 412 deletions(-) create mode 100644 usr/src/cmd/bhyve/audio.c create mode 100644 usr/src/cmd/bhyve/audio.h create mode 100644 usr/src/cmd/bhyve/hda_codec.c create mode 100644 usr/src/cmd/bhyve/hda_reg.h create mode 100644 usr/src/cmd/bhyve/hdac_reg.h create mode 100644 usr/src/cmd/bhyve/net_backends.c create mode 100644 usr/src/cmd/bhyve/net_backends.h create mode 100644 usr/src/cmd/bhyve/net_utils.c create mode 100644 usr/src/cmd/bhyve/net_utils.h create mode 100644 usr/src/cmd/bhyve/pci_hda.c create mode 100644 usr/src/cmd/bhyve/pci_hda.h create mode 100644 usr/src/compat/freebsd/net/ieee_oui.h (limited to 'usr/src/uts/i86pc') diff --git a/exception_lists/cstyle b/exception_lists/cstyle index 73edc10e88..74db906b33 100644 --- a/exception_lists/cstyle +++ b/exception_lists/cstyle @@ -1332,6 +1332,7 @@ syntax: glob usr/src/cmd/bhyve/acpi.[ch] usr/src/cmd/bhyve/ahci.h usr/src/cmd/bhyve/atkbdc.[ch] +usr/src/cmd/bhyve/audio.[ch] usr/src/cmd/bhyve/bhyvegc.[ch] usr/src/cmd/bhyve/bhyverun.[ch] usr/src/cmd/bhyve/block_if.[ch] @@ -1341,6 +1342,9 @@ usr/src/cmd/bhyve/consport.c usr/src/cmd/bhyve/dbgport.[ch] usr/src/cmd/bhyve/fwctl.[ch] usr/src/cmd/bhyve/gdb.[ch] +usr/src/cmd/bhyve/hda_codec.c +usr/src/cmd/bhyve/hda_reg.h +usr/src/cmd/bhyve/hdac_reg.h usr/src/cmd/bhyve/inout.[ch] usr/src/cmd/bhyve/ioapic.[ch] usr/src/cmd/bhyve/iov.[ch] @@ -1348,10 +1352,13 @@ usr/src/cmd/bhyve/mem.[ch] usr/src/cmd/bhyve/mevent.[ch] usr/src/cmd/bhyve/mevent_test.c usr/src/cmd/bhyve/mptbl.[ch] +usr/src/cmd/bhyve/net_utils.[ch] +usr/src/cmd/bhyve/net_backends.[ch] usr/src/cmd/bhyve/pci_ahci.c usr/src/cmd/bhyve/pci_e82545.c usr/src/cmd/bhyve/pci_emul.[ch] usr/src/cmd/bhyve/pci_fbuf.c +usr/src/cmd/bhyve/pci_hda.[ch] usr/src/cmd/bhyve/pci_hostbridge.c usr/src/cmd/bhyve/pci_irq.[ch] usr/src/cmd/bhyve/pci_lpc.[ch] diff --git a/exception_lists/hdrchk b/exception_lists/hdrchk index 7fa467f735..8eda3a7249 100644 --- a/exception_lists/hdrchk +++ b/exception_lists/hdrchk @@ -380,6 +380,7 @@ syntax: glob usr/src/cmd/bhyve/acpi.h usr/src/cmd/bhyve/ahci.h usr/src/cmd/bhyve/atkbdc.h +usr/src/cmd/bhyve/audio.h usr/src/cmd/bhyve/bhyvegc.h usr/src/cmd/bhyve/bhyverun.h usr/src/cmd/bhyve/block_if.h @@ -391,6 +392,7 @@ usr/src/cmd/bhyve/ioapic.h usr/src/cmd/bhyve/mem.h usr/src/cmd/bhyve/mptbl.h usr/src/cmd/bhyve/pci_emul.h +usr/src/cmd/bhyve/pci_hda.h usr/src/cmd/bhyve/pci_irq.h usr/src/cmd/bhyve/pci_lpc.h usr/src/cmd/bhyve/ps2kbd.h diff --git a/exception_lists/wscheck b/exception_lists/wscheck index ac16cc54b2..6dc4169c91 100644 --- a/exception_lists/wscheck +++ b/exception_lists/wscheck @@ -31,6 +31,7 @@ usr/src/uts/common/io/ixgbe/core/* usr/src/cmd/bhyve/acpi.[ch] usr/src/cmd/bhyve/ahci.h usr/src/cmd/bhyve/atkbdc.[ch] +usr/src/cmd/bhyve/audio.[ch] usr/src/cmd/bhyve/bhyvegc.[ch] usr/src/cmd/bhyve/bhyverun.[ch] usr/src/cmd/bhyve/block_if.[ch] @@ -40,16 +41,22 @@ usr/src/cmd/bhyve/consport.c usr/src/cmd/bhyve/dbgport.[ch] usr/src/cmd/bhyve/fwctl.[ch] usr/src/cmd/bhyve/gdb.[ch] +usr/src/cmd/bhyve/hda_codec.c +usr/src/cmd/bhyve/hda_reg.h +usr/src/cmd/bhyve/hdac_reg.h usr/src/cmd/bhyve/inout.[ch] usr/src/cmd/bhyve/ioapic.[ch] usr/src/cmd/bhyve/mem.[ch] usr/src/cmd/bhyve/mevent.[ch] usr/src/cmd/bhyve/mevent_test.c usr/src/cmd/bhyve/mptbl.[ch] +usr/src/cmd/bhyve/net_utils.[ch] +usr/src/cmd/bhyve/net_backends.[ch] usr/src/cmd/bhyve/pci_ahci.c usr/src/cmd/bhyve/pci_e82545.c usr/src/cmd/bhyve/pci_emul.[ch] usr/src/cmd/bhyve/pci_fbuf.c +usr/src/cmd/bhyve/pci_hda.[ch] 
usr/src/cmd/bhyve/pci_hostbridge.c usr/src/cmd/bhyve/pci_irq.[ch] usr/src/cmd/bhyve/pci_lpc.[ch] diff --git a/usr/src/cmd/bhyve/Makefile b/usr/src/cmd/bhyve/Makefile index 2301e6c8a6..eb7c4def30 100644 --- a/usr/src/cmd/bhyve/Makefile +++ b/usr/src/cmd/bhyve/Makefile @@ -44,6 +44,7 @@ SRCS = acpi.c \ mem.c \ mevent.c \ mptbl.c \ + net_utils.c \ pci_ahci.c \ pci_e82545.c \ pci_emul.c \ @@ -86,6 +87,15 @@ SRCS = acpi.c \ #ctl_scsi_all.c \ #pci_virtio_scsi.c \ +# The audio backend in FreeBSD is different than the one found in audio_oss.h + #audio.c \ + #hda_codec.c \ + #pci_hda.c \ + +# The bhyve generic net-backend stuff has been ignored by us at the moment +# because SmartOS users prefer to use viona for its superior network perf. + #net_backends.c \ + OBJS = $(SRCS:.c=.o) diff --git a/usr/src/cmd/bhyve/audio.c b/usr/src/cmd/bhyve/audio.c new file mode 100644 index 0000000000..15e370284e --- /dev/null +++ b/usr/src/cmd/bhyve/audio.c @@ -0,0 +1,285 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 Alex Teaca + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + */ + +#include +__FBSDID("$FreeBSD$"); + +#ifndef WITHOUT_CAPSICUM +#include +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "audio.h" +#include "pci_hda.h" + +/* + * Audio Player internal data structures + */ + +struct audio { + int fd; + uint8_t dir; + uint8_t inited; + char dev_name[64]; +}; + +/* + * Audio Player module function definitions + */ + +/* + * audio_init - initialize an instance of audio player + * @dev_name - the backend sound device used to play / capture + * @dir - dir = 1 for write mode, dir = 0 for read mode + */ +struct audio * +audio_init(const char *dev_name, uint8_t dir) +{ + struct audio *aud = NULL; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; + cap_ioctl_t cmds[] = { + SNDCTL_DSP_RESET, SNDCTL_DSP_SETFMT, SNDCTL_DSP_CHANNELS, + SNDCTL_DSP_SPEED, +#ifdef DEBUG_HDA + SNDCTL_DSP_GETOSPACE, SNDCTL_DSP_GETISPACE, +#endif + }; +#endif + + assert(dev_name); + + aud = calloc(1, sizeof(*aud)); + if (!aud) + return NULL; + + if (strlen(dev_name) < sizeof(aud->dev_name)) + memcpy(aud->dev_name, dev_name, strlen(dev_name) + 1); + else { + DPRINTF("dev_name too big\n"); + free(aud); + return NULL; + } + + aud->dir = dir; + + aud->fd = open(aud->dev_name, aud->dir ? O_WRONLY : O_RDONLY, 0); + if (aud->fd == -1) { + DPRINTF("Failed to open dev: %s, errno: %d\n", + aud->dev_name, errno); + free(aud); + return (NULL); + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_IOCTL, CAP_READ, CAP_WRITE); + if (caph_rights_limit(aud->fd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + if (caph_ioctls_limit(aud->fd, cmds, nitems(cmds)) == -1) + errx(EX_OSERR, "Unable to limit ioctl rights for sandbox"); +#endif + + return aud; +} + +/* + * audio_set_params - reset the sound device and set the audio params + * @aud - the audio player to be configured + * @params - the audio parameters to be set + */ +int +audio_set_params(struct audio *aud, struct audio_params *params) +{ + int audio_fd; + int format, channels, rate; + int err; +#if DEBUG_HDA == 1 + audio_buf_info info; +#endif + + assert(aud); + assert(params); + + if ((audio_fd = aud->fd) < 0) { + DPRINTF("Incorrect audio device descriptor for %s\n", + aud->dev_name); + return (-1); + } + + /* Reset the device if it was previously opened */ + if (aud->inited) { + err = ioctl(audio_fd, SNDCTL_DSP_RESET, NULL); + if (err == -1) { + DPRINTF("Failed to reset fd: %d, errno: %d\n", + aud->fd, errno); + return (-1); + } + } else + aud->inited = 1; + + /* Set the Format (Bits per Sample) */ + format = params->format; + err = ioctl(audio_fd, SNDCTL_DSP_SETFMT, &format); + if (err == -1) { + DPRINTF("Fail to set fmt: 0x%x errno: %d\n", + params->format, errno); + return -1; + } + + /* The device does not support the requested audio format */ + if (format != params->format) { + DPRINTF("Mismatch format: 0x%x params->format: 0x%x\n", + format, params->format); + return -1; + } + + /* Set the Number of Channels */ + channels = params->channels; + err = ioctl(audio_fd, SNDCTL_DSP_CHANNELS, &channels); + if (err == -1) { + DPRINTF("Fail to set channels: %d errno: %d\n", + params->channels, errno); + return -1; + } + + /* The device does not support the requested no. 
of channels */ + if (channels != params->channels) { + DPRINTF("Mismatch channels: %d params->channels: %d\n", + channels, params->channels); + return -1; + } + + /* Set the Sample Rate / Speed */ + rate = params->rate; + err = ioctl(audio_fd, SNDCTL_DSP_SPEED, &rate); + if (err == -1) { + DPRINTF("Fail to set speed: %d errno: %d\n", + params->rate, errno); + return -1; + } + + /* The device does not support the requested rate / speed */ + if (rate != params->rate) { + DPRINTF("Mismatch rate: %d params->rate: %d\n", + rate, params->rate); + return -1; + } + +#if DEBUG_HDA == 1 + err = ioctl(audio_fd, aud->dir ? SNDCTL_DSP_GETOSPACE : + SNDCTL_DSP_GETISPACE, &info); + if (err == -1) { + DPRINTF("Fail to get audio buf info errno: %d\n", errno); + return -1; + } + DPRINTF("fragstotal: 0x%x fragsize: 0x%x\n", + info.fragstotal, info.fragsize); +#endif + return 0; +} + +/* + * audio_playback - plays samples to the sound device using blocking operations + * @aud - the audio player used to play the samples + * @buf - the buffer containing the samples + * @count - the number of bytes in buffer + */ +int +audio_playback(struct audio *aud, const void *buf, size_t count) +{ + int audio_fd = -1; + ssize_t len = 0, total = 0; + + assert(aud); + assert(aud->dir); + assert(buf); + + audio_fd = aud->fd; + assert(audio_fd != -1); + + total = 0; + while (total < count) { + len = write(audio_fd, buf + total, count - total); + if (len == -1) { + DPRINTF("Fail to write to fd: %d, errno: %d\n", + audio_fd, errno); + return -1; + } + + total += len; + } + + return 0; +} + +/* + * audio_record - records samples from the sound device using + * blocking operations. + * @aud - the audio player used to capture the samples + * @buf - the buffer to receive the samples + * @count - the number of bytes to capture in buffer + * Returns -1 on error and 0 on success + */ +int +audio_record(struct audio *aud, void *buf, size_t count) +{ + int audio_fd = -1; + ssize_t len = 0, total = 0; + + assert(aud); + assert(!aud->dir); + assert(buf); + + audio_fd = aud->fd; + assert(audio_fd != -1); + + total = 0; + while (total < count) { + len = read(audio_fd, buf + total, count - total); + if (len == -1) { + DPRINTF("Fail to write to fd: %d, errno: %d\n", + audio_fd, errno); + return -1; + } + + total += len; + } + + return 0; +} diff --git a/usr/src/cmd/bhyve/audio.h b/usr/src/cmd/bhyve/audio.h new file mode 100644 index 0000000000..2b559a43e5 --- /dev/null +++ b/usr/src/cmd/bhyve/audio.h @@ -0,0 +1,88 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 Alex Teaca + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
diff --git a/usr/src/cmd/bhyve/audio.h b/usr/src/cmd/bhyve/audio.h
new file mode 100644
index 0000000000..2b559a43e5
--- /dev/null
+++ b/usr/src/cmd/bhyve/audio.h
@@ -0,0 +1,88 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2016 Alex Teaca
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _AUDIO_EMUL_H_
+#define _AUDIO_EMUL_H_
+
+#include
+#include
+
+/*
+ * Audio Player data structures
+ */
+
+struct audio;
+
+struct audio_params {
+	int channels;
+	int format;
+	int rate;
+};
+
+/*
+ * Audio Player API
+ */
+
+/*
+ * audio_init - initialize an instance of audio player
+ * @dev_name - the backend sound device used to play / capture
+ * @dir - dir = 1 for write mode, dir = 0 for read mode
+ * Returns NULL on error and the address of the audio player instance
+ */
+struct audio *audio_init(const char *dev_name, uint8_t dir);
+
+/*
+ * audio_set_params - reset the sound device and set the audio params
+ * @aud - the audio player to be configured
+ * @params - the audio parameters to be set
+ * Returns -1 on error and 0 on success
+ */
+int audio_set_params(struct audio *aud, struct audio_params *params);
+
+/*
+ * audio_playback - plays samples to the sound device using blocking operations
+ * @aud - the audio player used to play the samples
+ * @buf - the buffer containing the samples
+ * @count - the number of bytes in buffer
+ * Returns -1 on error and 0 on success
+ */
+int audio_playback(struct audio *aud, const void *buf, size_t count);
+
+/*
+ * audio_record - records samples from the sound device using blocking
+ * operations.
+ * @aud - the audio player used to capture the samples
+ * @buf - the buffer to receive the samples
+ * @count - the number of bytes to capture in buffer
+ * Returns -1 on error and 0 on success
+ */
+int audio_record(struct audio *aud, void *buf, size_t count);
+
+#endif /* _AUDIO_EMUL_H_ */
diff --git a/usr/src/cmd/bhyve/gdb.c b/usr/src/cmd/bhyve/gdb.c
index 71cb780544..06809860c6 100644
--- a/usr/src/cmd/bhyve/gdb.c
+++ b/usr/src/cmd/bhyve/gdb.c
@@ -32,6 +32,11 @@ __FBSDID("$FreeBSD$");
 #ifndef WITHOUT_CAPSICUM
 #include
 #endif
+#ifdef __FreeBSD__
+#include
+#else
+#include
+#endif
 #include
 #include
 #include
@@ -969,14 +974,10 @@ gdb_write_mem(const uint8_t *data, size_t len)
 			val = parse_byte(data);
 		} else if (gpa & 2 || todo == 2) {
 			bytes = 2;
-			val = parse_byte(data) |
-			    (parse_byte(data + 2) << 8);
+			val = be16toh(parse_integer(data, 4));
 		} else {
 			bytes = 4;
-			val = parse_byte(data) |
-			    (parse_byte(data + 2) << 8) |
-			    (parse_byte(data + 4) << 16) |
-			    (parse_byte(data + 6) << 24);
+			val = be32toh(parse_integer(data, 8));
 		}
 
 		error = write_mem(ctx, cur_vcpu, gpa, val, bytes);
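[Editorial aside: the gdb.c hunk above replaces per-byte assembly of 'M' (write memory) packet data with parse_integer() plus a byte swap. The packet encodes guest memory as hex digit pairs in guest (little-endian) order, so folding the digits most-significant-first yields a big-endian value that be16toh()/be32toh() put right. A standalone sketch of the same arithmetic; fold_hex() is a hypothetical stand-in for gdb.c's parse_integer(), which is defined elsewhere in that file.]

#include <endian.h>	/* be16toh(); <sys/endian.h> on FreeBSD */
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for parse_integer(): fold hex digits MSB-first. */
static uint64_t
fold_hex(const char *s, int n)
{
	uint64_t v = 0;
	int i;

	for (i = 0; i < n; i++)
		v = (v << 4) | (uint64_t)(s[i] <= '9' ? s[i] - '0' :
		    s[i] - 'a' + 10);
	return (v);
}

int
main(void)
{
	/* The guest value 0x1234, stored little-endian, arrives as "3412". */
	uint16_t val = be16toh((uint16_t)fold_hex("3412", 4));

	printf("0x%04x\n", val);	/* prints 0x1234 on a little-endian host */
	return (0);
}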
diff --git a/usr/src/cmd/bhyve/hda_codec.c b/usr/src/cmd/bhyve/hda_codec.c
new file mode 100644
index 0000000000..82f5fb1eed
--- /dev/null
+++ b/usr/src/cmd/bhyve/hda_codec.c
@@ -0,0 +1,952 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2016 Alex Teaca
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+
+#include "pci_hda.h"
+#include "audio.h"
+
+/*
+ * HDA Codec defines
+ */
+#define INTEL_VENDORID 0x8086
+
+#define HDA_CODEC_SUBSYSTEM_ID ((INTEL_VENDORID << 16) | 0x01)
+#define HDA_CODEC_ROOT_NID 0x00
+#define HDA_CODEC_FG_NID 0x01
+#define HDA_CODEC_AUDIO_OUTPUT_NID 0x02
+#define HDA_CODEC_PIN_OUTPUT_NID 0x03
+#define HDA_CODEC_AUDIO_INPUT_NID 0x04
+#define HDA_CODEC_PIN_INPUT_NID 0x05
+
+#define HDA_CODEC_STREAMS_COUNT 0x02
+#define HDA_CODEC_STREAM_OUTPUT 0x00
+#define HDA_CODEC_STREAM_INPUT 0x01
+
+#define HDA_CODEC_PARAMS_COUNT 0x14
+#define HDA_CODEC_CONN_LIST_COUNT 0x01
+#define HDA_CODEC_RESPONSE_EX_UNSOL 0x10
+#define HDA_CODEC_RESPONSE_EX_SOL 0x00
+#define HDA_CODEC_AMP_NUMSTEPS 0x4a
+
+#define HDA_CODEC_SUPP_STREAM_FORMATS_PCM \
+	(1 << HDA_PARAM_SUPP_STREAM_FORMATS_PCM_SHIFT)
+
+#define HDA_CODEC_FMT_BASE_MASK (0x01 << 14)
+
+#define HDA_CODEC_FMT_MULT_MASK (0x07 << 11)
+#define HDA_CODEC_FMT_MULT_2 (0x01 << 11)
+#define HDA_CODEC_FMT_MULT_3 (0x02 << 11)
+#define HDA_CODEC_FMT_MULT_4 (0x03 << 11)
+
+#define HDA_CODEC_FMT_DIV_MASK 0x07
+#define HDA_CODEC_FMT_DIV_SHIFT 8
+
+#define HDA_CODEC_FMT_BITS_MASK (0x07 << 4)
+#define HDA_CODEC_FMT_BITS_8 (0x00 << 4)
+#define HDA_CODEC_FMT_BITS_16 (0x01 << 4)
+#define HDA_CODEC_FMT_BITS_24 (0x03 << 4)
+#define HDA_CODEC_FMT_BITS_32 (0x04 << 4)
+
+#define HDA_CODEC_FMT_CHAN_MASK (0x0f << 0)
+
+#define HDA_CODEC_AUDIO_WCAP_OUTPUT \
+	(0x00 << HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_SHIFT)
+#define HDA_CODEC_AUDIO_WCAP_INPUT \
+	(0x01 << HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_SHIFT)
+#define HDA_CODEC_AUDIO_WCAP_PIN \
+	(0x04 << HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_SHIFT)
+#define HDA_CODEC_AUDIO_WCAP_CONN_LIST \
+	(1 << HDA_PARAM_AUDIO_WIDGET_CAP_CONN_LIST_SHIFT)
+#define HDA_CODEC_AUDIO_WCAP_FORMAT_OVR \
+	(1 << HDA_PARAM_AUDIO_WIDGET_CAP_FORMAT_OVR_SHIFT)
+#define HDA_CODEC_AUDIO_WCAP_AMP_OVR \
+	(1 << HDA_PARAM_AUDIO_WIDGET_CAP_AMP_OVR_SHIFT)
+#define HDA_CODEC_AUDIO_WCAP_OUT_AMP \
+	(1 << HDA_PARAM_AUDIO_WIDGET_CAP_OUT_AMP_SHIFT)
+#define HDA_CODEC_AUDIO_WCAP_IN_AMP \
+	(1 << HDA_PARAM_AUDIO_WIDGET_CAP_IN_AMP_SHIFT)
+#define HDA_CODEC_AUDIO_WCAP_STEREO \
+	(1 << HDA_PARAM_AUDIO_WIDGET_CAP_STEREO_SHIFT)
+
+#define HDA_CODEC_PIN_CAP_OUTPUT \
+	(1 << HDA_PARAM_PIN_CAP_OUTPUT_CAP_SHIFT)
+#define HDA_CODEC_PIN_CAP_INPUT \
+	(1 << HDA_PARAM_PIN_CAP_INPUT_CAP_SHIFT)
+#define HDA_CODEC_PIN_CAP_PRESENCE_DETECT \
+	(1 << HDA_PARAM_PIN_CAP_PRESENCE_DETECT_CAP_SHIFT)
+
+#define HDA_CODEC_OUTPUT_AMP_CAP_MUTE_CAP \
+	(1 << HDA_PARAM_OUTPUT_AMP_CAP_MUTE_CAP_SHIFT)
+#define HDA_CODEC_OUTPUT_AMP_CAP_STEPSIZE \
+	(0x03 << HDA_PARAM_OUTPUT_AMP_CAP_STEPSIZE_SHIFT)
+#define HDA_CODEC_OUTPUT_AMP_CAP_NUMSTEPS \
+	(HDA_CODEC_AMP_NUMSTEPS << HDA_PARAM_OUTPUT_AMP_CAP_NUMSTEPS_SHIFT)
+#define HDA_CODEC_OUTPUT_AMP_CAP_OFFSET \
+	(HDA_CODEC_AMP_NUMSTEPS << HDA_PARAM_OUTPUT_AMP_CAP_OFFSET_SHIFT)
+
+#define HDA_CODEC_SET_AMP_GAIN_MUTE_MUTE 0x80
+#define HDA_CODEC_SET_AMP_GAIN_MUTE_GAIN_MASK 0x7f
+
+#define HDA_CODEC_PIN_SENSE_PRESENCE_PLUGGED (1U << 31)
+#define HDA_CODEC_PIN_WIDGET_CTRL_OUT_ENABLE \
+	(1 << HDA_CMD_GET_PIN_WIDGET_CTRL_OUT_ENABLE_SHIFT)
+#define HDA_CODEC_PIN_WIDGET_CTRL_IN_ENABLE \
+	(1 << HDA_CMD_GET_PIN_WIDGET_CTRL_IN_ENABLE_SHIFT)
+
+#define HDA_CONFIG_DEFAULTCONF_COLOR_BLACK \
+	(0x01 << HDA_CONFIG_DEFAULTCONF_COLOR_SHIFT)
+#define HDA_CONFIG_DEFAULTCONF_COLOR_RED \
+	(0x05 << HDA_CONFIG_DEFAULTCONF_COLOR_SHIFT)
+
+#define HDA_CODEC_BUF_SIZE HDA_FIFO_SIZE
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+
+/*
+ * HDA Audio Context data structures
+ */
+
+typedef void (*transfer_func_t)(void *arg);
+typedef int (*setup_func_t)(void *arg);
+
+struct hda_audio_ctxt {
+	char name[64];
+	uint8_t run;
+	uint8_t started;
+	void *priv;
+	pthread_t tid;
+	pthread_mutex_t mtx;
+	pthread_cond_t cond;
+	setup_func_t do_setup;
+	transfer_func_t do_transfer;
+};
+
+/*
+ * HDA Audio Context module function declarations
+ */
+
+static void *hda_audio_ctxt_thr(void *arg);
+static int hda_audio_ctxt_init(struct hda_audio_ctxt *actx, const char *tname,
+    transfer_func_t do_transfer, setup_func_t do_setup, void *priv);
+static int hda_audio_ctxt_start(struct hda_audio_ctxt *actx);
+static int hda_audio_ctxt_stop(struct hda_audio_ctxt *actx);
+
+/*
+ * HDA Codec data structures
+ */
+
+struct hda_codec_softc;
+
+typedef uint32_t (*verb_func_t)(struct hda_codec_softc *sc, uint16_t verb,
+    uint16_t payload);
+
+struct hda_codec_stream {
+	uint8_t buf[HDA_CODEC_BUF_SIZE];
+	uint8_t channel;
+	uint16_t fmt;
+	uint8_t stream;
+
+	uint8_t left_gain;
+	uint8_t right_gain;
+	uint8_t left_mute;
+	uint8_t right_mute;
+
+	struct audio *aud;
+	struct hda_audio_ctxt actx;
+};
+
+struct hda_codec_softc {
+	uint32_t no_nodes;
+	uint32_t subsystem_id;
+	const uint32_t (*get_parameters)[HDA_CODEC_PARAMS_COUNT];
+	const uint8_t (*conn_list)[HDA_CODEC_CONN_LIST_COUNT];
+	const uint32_t *conf_default;
+	const uint8_t *pin_ctrl_default;
+	const verb_func_t *verb_handlers;
+
+	struct hda_codec_inst *hci;
+	struct hda_codec_stream streams[HDA_CODEC_STREAMS_COUNT];
+};
+
+/*
+ * HDA Codec module function declarations
+ */
+static int hda_codec_init(struct hda_codec_inst *hci, const char *play,
+    const char *rec, const char *opts);
+static int hda_codec_reset(struct hda_codec_inst *hci);
+static int hda_codec_command(struct hda_codec_inst *hci, uint32_t cmd_data);
+static int hda_codec_notify(struct hda_codec_inst *hci, uint8_t run,
+    uint8_t stream, uint8_t dir);
+
+static int hda_codec_parse_format(uint16_t fmt, struct audio_params *params);
+
+static uint32_t hda_codec_audio_output_nid(struct hda_codec_softc *sc,
+    uint16_t verb, uint16_t payload);
+static void hda_codec_audio_output_do_transfer(void *arg);
+static int hda_codec_audio_output_do_setup(void *arg);
+static uint32_t hda_codec_audio_input_nid(struct hda_codec_softc *sc,
+    uint16_t verb, uint16_t payload);
+static void hda_codec_audio_input_do_transfer(void *arg);
+static int hda_codec_audio_input_do_setup(void *arg);
+
+static uint32_t hda_codec_audio_inout_nid(struct hda_codec_stream *st,
+    uint16_t verb, uint16_t payload);
+
+/*
+ * HDA Codec global data
+ */
+
+#define HDA_CODEC_ROOT_DESC \
+	[HDA_CODEC_ROOT_NID] = { \
+		[HDA_PARAM_VENDOR_ID] = INTEL_VENDORID, \
+		[HDA_PARAM_REVISION_ID] = 0xffff, \
+		/* 1 Subnode, StartNid = 1 */ \
+		[HDA_PARAM_SUB_NODE_COUNT] = 0x00010001, \
+	}, \
+
+#define HDA_CODEC_FG_COMMON_DESC \
+	[HDA_PARAM_FCT_GRP_TYPE] = HDA_PARAM_FCT_GRP_TYPE_NODE_TYPE_AUDIO,\
+	/* B8 - B32, 8.0 - 192.0kHz */ \
+	[HDA_PARAM_SUPP_PCM_SIZE_RATE] = (0x1f << 16) | 0x7ff, \
+	[HDA_PARAM_SUPP_STREAM_FORMATS] = HDA_CODEC_SUPP_STREAM_FORMATS_PCM,\
+	[HDA_PARAM_INPUT_AMP_CAP] = 0x00, /* None */ \
+	[HDA_PARAM_OUTPUT_AMP_CAP] = 0x00, /* None */ \
+	[HDA_PARAM_GPIO_COUNT] = 0x00, \
+
+#define HDA_CODEC_FG_OUTPUT_DESC \
+	[HDA_CODEC_FG_NID] = { \
+		/* 2 Subnodes, StartNid = 2 */ \
+		[HDA_PARAM_SUB_NODE_COUNT] = 0x00020002, \
+		HDA_CODEC_FG_COMMON_DESC \
+	}, \
+
+#define HDA_CODEC_FG_INPUT_DESC \
+	[HDA_CODEC_FG_NID] = { \
+		/* 2 Subnodes, StartNid = 4 */ \
+		[HDA_PARAM_SUB_NODE_COUNT] = 0x00040002, \
+		HDA_CODEC_FG_COMMON_DESC \
+	}, \
+
+#define HDA_CODEC_FG_DUPLEX_DESC \
+	[HDA_CODEC_FG_NID] = { \
+		/* 4 Subnodes, StartNid = 2 */ \
+		[HDA_PARAM_SUB_NODE_COUNT] = 0x00020004, \
+		HDA_CODEC_FG_COMMON_DESC \
+	}, \
+
+#define HDA_CODEC_OUTPUT_DESC \
+	[HDA_CODEC_AUDIO_OUTPUT_NID] = { \
+		[HDA_PARAM_AUDIO_WIDGET_CAP] = \
+		    HDA_CODEC_AUDIO_WCAP_OUTPUT | \
+		    HDA_CODEC_AUDIO_WCAP_FORMAT_OVR | \
+		    HDA_CODEC_AUDIO_WCAP_AMP_OVR | \
+		    HDA_CODEC_AUDIO_WCAP_OUT_AMP | \
+		    HDA_CODEC_AUDIO_WCAP_STEREO, \
+		/* B16, 16.0 - 192.0kHz */ \
+		[HDA_PARAM_SUPP_PCM_SIZE_RATE] = (0x02 << 16) | 0x7fc, \
+		[HDA_PARAM_SUPP_STREAM_FORMATS] = \
+		    HDA_CODEC_SUPP_STREAM_FORMATS_PCM, \
+		[HDA_PARAM_INPUT_AMP_CAP] = 0x00, /* None */ \
+		[HDA_PARAM_CONN_LIST_LENGTH] = 0x00, \
+		[HDA_PARAM_OUTPUT_AMP_CAP] = \
+		    HDA_CODEC_OUTPUT_AMP_CAP_MUTE_CAP | \
+		    HDA_CODEC_OUTPUT_AMP_CAP_STEPSIZE | \
+		    HDA_CODEC_OUTPUT_AMP_CAP_NUMSTEPS | \
+		    HDA_CODEC_OUTPUT_AMP_CAP_OFFSET, \
+	}, \
+	[HDA_CODEC_PIN_OUTPUT_NID] = { \
+		[HDA_PARAM_AUDIO_WIDGET_CAP] = \
+		    HDA_CODEC_AUDIO_WCAP_PIN | \
+		    HDA_CODEC_AUDIO_WCAP_CONN_LIST | \
+		    HDA_CODEC_AUDIO_WCAP_STEREO, \
+		[HDA_PARAM_PIN_CAP] = HDA_CODEC_PIN_CAP_OUTPUT | \
+		    HDA_CODEC_PIN_CAP_PRESENCE_DETECT,\
+		[HDA_PARAM_INPUT_AMP_CAP] = 0x00, /* None */ \
+		[HDA_PARAM_CONN_LIST_LENGTH] = 0x01, \
+		[HDA_PARAM_OUTPUT_AMP_CAP] = 0x00, /* None */ \
+	}, \
+
+#define HDA_CODEC_INPUT_DESC \
+	[HDA_CODEC_AUDIO_INPUT_NID] = { \
+		[HDA_PARAM_AUDIO_WIDGET_CAP] = \
+		    HDA_CODEC_AUDIO_WCAP_INPUT | \
+		    HDA_CODEC_AUDIO_WCAP_CONN_LIST | \
+		    HDA_CODEC_AUDIO_WCAP_FORMAT_OVR | \
+		    HDA_CODEC_AUDIO_WCAP_AMP_OVR | \
+		    HDA_CODEC_AUDIO_WCAP_IN_AMP | \
+		    HDA_CODEC_AUDIO_WCAP_STEREO, \
+		/* B16, 16.0 - 192.0kHz */ \
+		[HDA_PARAM_SUPP_PCM_SIZE_RATE] = (0x02 << 16) | 0x7fc, \
+		[HDA_PARAM_SUPP_STREAM_FORMATS] = \
+		    HDA_CODEC_SUPP_STREAM_FORMATS_PCM, \
+		[HDA_PARAM_OUTPUT_AMP_CAP] = 0x00, /* None */ \
+		[HDA_PARAM_CONN_LIST_LENGTH] = 0x01, \
+		[HDA_PARAM_INPUT_AMP_CAP] = \
+		    HDA_CODEC_OUTPUT_AMP_CAP_MUTE_CAP | \
+		    HDA_CODEC_OUTPUT_AMP_CAP_STEPSIZE | \
+		    HDA_CODEC_OUTPUT_AMP_CAP_NUMSTEPS | \
+		    HDA_CODEC_OUTPUT_AMP_CAP_OFFSET, \
+	}, \
+	[HDA_CODEC_PIN_INPUT_NID] = { \
+		[HDA_PARAM_AUDIO_WIDGET_CAP] = \
+		    HDA_CODEC_AUDIO_WCAP_PIN | \
+		    HDA_CODEC_AUDIO_WCAP_STEREO, \
+		[HDA_PARAM_PIN_CAP] = HDA_CODEC_PIN_CAP_INPUT | \
+		    HDA_CODEC_PIN_CAP_PRESENCE_DETECT, \
+		[HDA_PARAM_INPUT_AMP_CAP] = 0x00, /* None */ \
+		[HDA_PARAM_OUTPUT_AMP_CAP] = 0x00, /* None */ \
+	}, \
+
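+/*
+ * The emulated codec graph is fixed: the audio output converter (nid 2)
+ * feeds the output pin (nid 3), and the input pin (nid 5) feeds the audio
+ * input converter (nid 4).  hda_codec_conn_list below encodes exactly
+ * these two edges.
+ */
+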
+static const uint32_t
+hda_codec_output_parameters[][HDA_CODEC_PARAMS_COUNT] = {
+	HDA_CODEC_ROOT_DESC
+	HDA_CODEC_FG_OUTPUT_DESC
+	HDA_CODEC_OUTPUT_DESC
+};
+
+static const uint32_t
+hda_codec_input_parameters[][HDA_CODEC_PARAMS_COUNT] = {
+	HDA_CODEC_ROOT_DESC
+	HDA_CODEC_FG_INPUT_DESC
+	HDA_CODEC_INPUT_DESC
+};
+
+static const uint32_t
+hda_codec_duplex_parameters[][HDA_CODEC_PARAMS_COUNT] = {
+	HDA_CODEC_ROOT_DESC
+	HDA_CODEC_FG_DUPLEX_DESC
+	HDA_CODEC_OUTPUT_DESC
+	HDA_CODEC_INPUT_DESC
+};
+
+#define HDA_CODEC_NODES_COUNT (ARRAY_SIZE(hda_codec_duplex_parameters))
+
+static const uint8_t
+hda_codec_conn_list[HDA_CODEC_NODES_COUNT][HDA_CODEC_CONN_LIST_COUNT] = {
+	[HDA_CODEC_PIN_OUTPUT_NID] = {HDA_CODEC_AUDIO_OUTPUT_NID},
+	[HDA_CODEC_AUDIO_INPUT_NID] = {HDA_CODEC_PIN_INPUT_NID},
+};
+
+static const uint32_t
+hda_codec_conf_default[HDA_CODEC_NODES_COUNT] = {
+	[HDA_CODEC_PIN_OUTPUT_NID] = \
+	    HDA_CONFIG_DEFAULTCONF_CONNECTIVITY_JACK |
+	    HDA_CONFIG_DEFAULTCONF_DEVICE_LINE_OUT |
+	    HDA_CONFIG_DEFAULTCONF_COLOR_BLACK |
+	    (0x01 << HDA_CONFIG_DEFAULTCONF_ASSOCIATION_SHIFT),
+	[HDA_CODEC_PIN_INPUT_NID] = HDA_CONFIG_DEFAULTCONF_CONNECTIVITY_JACK |
+	    HDA_CONFIG_DEFAULTCONF_DEVICE_LINE_IN |
+	    HDA_CONFIG_DEFAULTCONF_COLOR_RED |
+	    (0x02 << HDA_CONFIG_DEFAULTCONF_ASSOCIATION_SHIFT),
+};
+
+static const uint8_t
+hda_codec_pin_ctrl_default[HDA_CODEC_NODES_COUNT] = {
+	[HDA_CODEC_PIN_OUTPUT_NID] = HDA_CODEC_PIN_WIDGET_CTRL_OUT_ENABLE,
+	[HDA_CODEC_PIN_INPUT_NID] = HDA_CODEC_PIN_WIDGET_CTRL_IN_ENABLE,
+};
+
+static const
+verb_func_t hda_codec_verb_handlers[HDA_CODEC_NODES_COUNT] = {
+	[HDA_CODEC_AUDIO_OUTPUT_NID] = hda_codec_audio_output_nid,
+	[HDA_CODEC_AUDIO_INPUT_NID] = hda_codec_audio_input_nid,
+};
+
+/*
+ * HDA Codec module function definitions
+ */
+
+static int
+hda_codec_init(struct hda_codec_inst *hci, const char *play,
+    const char *rec, const char *opts)
+{
+	struct hda_codec_softc *sc = NULL;
+	struct hda_codec_stream *st = NULL;
+	int err;
+
+	if (!(play || rec))
+		return (-1);
+
+	DPRINTF("cad: 0x%x opts: %s\n", hci->cad, opts);
+
+	sc = calloc(1, sizeof(*sc));
+	if (!sc)
+		return (-1);
+
+	if (play && rec)
+		sc->get_parameters = hda_codec_duplex_parameters;
+	else {
+		if (play)
+			sc->get_parameters = hda_codec_output_parameters;
+		else
+			sc->get_parameters = hda_codec_input_parameters;
+	}
+	sc->subsystem_id = HDA_CODEC_SUBSYSTEM_ID;
+	sc->no_nodes = HDA_CODEC_NODES_COUNT;
+	sc->conn_list = hda_codec_conn_list;
+	sc->conf_default = hda_codec_conf_default;
+	sc->pin_ctrl_default = hda_codec_pin_ctrl_default;
+	sc->verb_handlers = hda_codec_verb_handlers;
+	DPRINTF("HDA Codec nodes: %d\n", sc->no_nodes);
+
+	/*
+	 * Initialize the Audio Output stream
+	 */
+	if (play) {
+		st = &sc->streams[HDA_CODEC_STREAM_OUTPUT];
+
+		err = hda_audio_ctxt_init(&st->actx, "hda-audio-output",
+		    hda_codec_audio_output_do_transfer,
+		    hda_codec_audio_output_do_setup, sc);
+		assert(!err);
+
+		st->aud = audio_init(play, 1);
+		if (!st->aud) {
+			DPRINTF("Fail to init the output audio player\n");
+			return (-1);
+		}
+	}
+
+	/*
+	 * Initialize the Audio Input stream
+	 */
+	if (rec) {
+		st = &sc->streams[HDA_CODEC_STREAM_INPUT];
+
+		err = hda_audio_ctxt_init(&st->actx, "hda-audio-input",
+		    hda_codec_audio_input_do_transfer,
+		    hda_codec_audio_input_do_setup, sc);
+		assert(!err);
+
+		st->aud = audio_init(rec, 0);
+		if (!st->aud) {
+			DPRINTF("Fail to init the input audio player\n");
+			return (-1);
+		}
+	}
+
+	sc->hci = hci;
+	hci->priv = sc;
+
+	return (0);
+}
+
+static int
+hda_codec_reset(struct hda_codec_inst *hci)
+{
+	struct hda_ops *hops = NULL;
+	struct hda_codec_softc *sc = NULL;
+	struct hda_codec_stream *st = NULL;
+	int i;
+
+	assert(hci);
+
+	hops = hci->hops;
+	assert(hops);
+
+	sc = (struct hda_codec_softc *)hci->priv;
+	assert(sc);
+
+	for (i = 0; i < HDA_CODEC_STREAMS_COUNT; i++) {
+		st = &sc->streams[i];
+		st->left_gain = HDA_CODEC_AMP_NUMSTEPS;
+		st->right_gain = HDA_CODEC_AMP_NUMSTEPS;
+		st->left_mute = HDA_CODEC_SET_AMP_GAIN_MUTE_MUTE;
+		st->right_mute = HDA_CODEC_SET_AMP_GAIN_MUTE_MUTE;
+	}
+
+	DPRINTF("cad: 0x%x\n", hci->cad);
+
+	if (!hops->signal) {
+		DPRINTF("The controller ops does not implement "
+		    "the signal function\n");
+		return (-1);
+	}
+
+	return (hops->signal(hci));
+}
+
+static int
+hda_codec_command(struct hda_codec_inst *hci, uint32_t cmd_data)
+{
+	struct hda_codec_softc *sc = NULL;
+	struct hda_ops *hops = NULL;
+	uint8_t cad = 0, nid = 0;
+	uint16_t verb = 0, payload = 0;
+	uint32_t res = 0;
+
+	/* 4 bits */
+	cad = (cmd_data >> HDA_CMD_CAD_SHIFT) & 0x0f;
+	/* 8 bits */
+	nid = (cmd_data >> HDA_CMD_NID_SHIFT) & 0xff;
+
+	if ((cmd_data & 0x70000) == 0x70000) {
+		/* 12 bits */
+		verb = (cmd_data >> HDA_CMD_VERB_12BIT_SHIFT) & 0x0fff;
+		/* 8 bits */
+		payload = cmd_data & 0xff;
+	} else {
+		/* 4 bits */
+		verb = (cmd_data >> HDA_CMD_VERB_4BIT_SHIFT) & 0x0f;
+		/* 16 bits */
+		payload = cmd_data & 0xffff;
+	}
+
+	assert(hci);
+	assert(cad == hci->cad);
+
+	hops = hci->hops;
+	assert(hops);
+
+	sc = (struct hda_codec_softc *)hci->priv;
+	assert(sc);
+
+	assert(nid < sc->no_nodes);
+
+	if (!hops->response) {
+		DPRINTF("The controller ops does not implement "
+		    "the response function\n");
+		return (-1);
+	}
+
+	switch (verb) {
+	case HDA_CMD_VERB_GET_PARAMETER:
+		res = sc->get_parameters[nid][payload];
+		break;
+	case HDA_CMD_VERB_GET_CONN_LIST_ENTRY:
+		res = sc->conn_list[nid][0];
+		break;
+	case HDA_CMD_VERB_GET_PIN_WIDGET_CTRL:
+		res = sc->pin_ctrl_default[nid];
+		break;
+	case HDA_CMD_VERB_GET_PIN_SENSE:
+		res = HDA_CODEC_PIN_SENSE_PRESENCE_PLUGGED;
+		break;
+	case HDA_CMD_VERB_GET_CONFIGURATION_DEFAULT:
+		res = sc->conf_default[nid];
+		break;
+	case HDA_CMD_VERB_GET_SUBSYSTEM_ID:
+		res = sc->subsystem_id;
+		break;
+	default:
+		assert(sc->verb_handlers);
+		if (sc->verb_handlers[nid])
+			res = sc->verb_handlers[nid](sc, verb, payload);
+		else
+			DPRINTF("Unknown VERB: 0x%x\n", verb);
+		break;
+	}
+
+	DPRINTF("cad: 0x%x nid: 0x%x verb: 0x%x payload: 0x%x response: 0x%x\n",
+	    cad, nid, verb, payload, res);
+
+	return (hops->response(hci, res, HDA_CODEC_RESPONSE_EX_SOL));
+}
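+
+/*
+ * Worked example: HDA_CMD_12BIT(0, 2, 0x706, 0x10) from hda_reg.h yields
+ * cmd_data 0x00270610.  (cmd_data & 0x70000) == 0x70000 selects the 12-bit
+ * verb path above, giving verb 0x706 (set converter stream/channel),
+ * nid 0x02 (the audio output converter) and payload 0x10 (stream 1,
+ * channel 0); the verb is not one of the cases handled inline, so it is
+ * dispatched to hda_codec_audio_output_nid() via verb_handlers[].
+ */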
+
+static int
+hda_codec_notify(struct hda_codec_inst *hci, uint8_t run,
+    uint8_t stream, uint8_t dir)
+{
+	struct hda_codec_softc *sc = NULL;
+	struct hda_codec_stream *st = NULL;
+	struct hda_audio_ctxt *actx = NULL;
+	int i;
+	int err;
+
+	assert(hci);
+	assert(stream);
+
+	sc = (struct hda_codec_softc *)hci->priv;
+	assert(sc);
+
+	i = dir ? HDA_CODEC_STREAM_OUTPUT : HDA_CODEC_STREAM_INPUT;
+	st = &sc->streams[i];
+
+	DPRINTF("run: %d, stream: 0x%x, st->stream: 0x%x dir: %d\n",
+	    run, stream, st->stream, dir);
+
+	if (stream != st->stream) {
+		DPRINTF("Stream not found\n");
+		return (0);
+	}
+
+	actx = &st->actx;
+
+	if (run)
+		err = hda_audio_ctxt_start(actx);
+	else
+		err = hda_audio_ctxt_stop(actx);
+
+	return (err);
+}
+
+static int
+hda_codec_parse_format(uint16_t fmt, struct audio_params *params)
+{
+	uint8_t div = 0;
+
+	assert(params);
+
+	/* Compute the Sample Rate */
+	params->rate = (fmt & HDA_CODEC_FMT_BASE_MASK) ? 44100 : 48000;
+
+	switch (fmt & HDA_CODEC_FMT_MULT_MASK) {
+	case HDA_CODEC_FMT_MULT_2:
+		params->rate *= 2;
+		break;
+	case HDA_CODEC_FMT_MULT_3:
+		params->rate *= 3;
+		break;
+	case HDA_CODEC_FMT_MULT_4:
+		params->rate *= 4;
+		break;
+	}
+
+	div = (fmt >> HDA_CODEC_FMT_DIV_SHIFT) & HDA_CODEC_FMT_DIV_MASK;
+	params->rate /= (div + 1);
+
+	/* Compute the Bits per Sample */
+	switch (fmt & HDA_CODEC_FMT_BITS_MASK) {
+	case HDA_CODEC_FMT_BITS_8:
+		params->format = AFMT_U8;
+		break;
+	case HDA_CODEC_FMT_BITS_16:
+		params->format = AFMT_S16_LE;
+		break;
+	case HDA_CODEC_FMT_BITS_24:
+		params->format = AFMT_S24_LE;
+		break;
+	case HDA_CODEC_FMT_BITS_32:
+		params->format = AFMT_S32_LE;
+		break;
+	default:
+		DPRINTF("Unknown format bits: 0x%x\n",
+		    fmt & HDA_CODEC_FMT_BITS_MASK);
+		return (-1);
+	}
+
+	/* Compute the Number of Channels */
+	params->channels = (fmt & HDA_CODEC_FMT_CHAN_MASK) + 1;
+
+	return (0);
+}
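+
+/*
+ * Worked example: fmt 0x0011 has BASE 0 (48.0 kHz), MULT x1, DIV 1,
+ * BITS 001 (16-bit) and CHAN 1 (2 channels), so it decodes to 48 kHz
+ * stereo AFMT_S16_LE; fmt 0x4011 only flips the base-rate bit and
+ * decodes to 44.1 kHz stereo AFMT_S16_LE.
+ */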
+
+static uint32_t
+hda_codec_audio_output_nid(struct hda_codec_softc *sc, uint16_t verb,
+    uint16_t payload)
+{
+	struct hda_codec_stream *st = &sc->streams[HDA_CODEC_STREAM_OUTPUT];
+	int res;
+
+	res = hda_codec_audio_inout_nid(st, verb, payload);
+
+	return (res);
+}
+
+static void
+hda_codec_audio_output_do_transfer(void *arg)
+{
+	struct hda_codec_softc *sc = (struct hda_codec_softc *)arg;
+	struct hda_codec_inst *hci = NULL;
+	struct hda_ops *hops = NULL;
+	struct hda_codec_stream *st = NULL;
+	struct audio *aud = NULL;
+	int err;
+
+	hci = sc->hci;
+	assert(hci);
+
+	hops = hci->hops;
+	assert(hops);
+
+	st = &sc->streams[HDA_CODEC_STREAM_OUTPUT];
+	aud = st->aud;
+
+	err = hops->transfer(hci, st->stream, 1, st->buf, sizeof(st->buf));
+	if (err)
+		return;
+
+	err = audio_playback(aud, st->buf, sizeof(st->buf));
+	assert(!err);
+}
+
+static int
+hda_codec_audio_output_do_setup(void *arg)
+{
+	struct hda_codec_softc *sc = (struct hda_codec_softc *)arg;
+	struct hda_codec_stream *st = NULL;
+	struct audio *aud = NULL;
+	struct audio_params params;
+	int err;
+
+	st = &sc->streams[HDA_CODEC_STREAM_OUTPUT];
+	aud = st->aud;
+
+	err = hda_codec_parse_format(st->fmt, &params);
+	if (err)
+		return (-1);
+
+	DPRINTF("rate: %d, channels: %d, format: 0x%x\n",
+	    params.rate, params.channels, params.format);
+
+	return (audio_set_params(aud, &params));
+}
+
+static uint32_t
+hda_codec_audio_input_nid(struct hda_codec_softc *sc, uint16_t verb,
+    uint16_t payload)
+{
+	struct hda_codec_stream *st = &sc->streams[HDA_CODEC_STREAM_INPUT];
+	int res;
+
+	res = hda_codec_audio_inout_nid(st, verb, payload);
+
+	return (res);
+}
+
+static void
+hda_codec_audio_input_do_transfer(void *arg)
+{
+	struct hda_codec_softc *sc = (struct hda_codec_softc *)arg;
+	struct hda_codec_inst *hci = NULL;
+	struct hda_ops *hops = NULL;
+	struct hda_codec_stream *st = NULL;
+	struct audio *aud = NULL;
+	int err;
+
+	hci = sc->hci;
+	assert(hci);
+
+	hops = hci->hops;
+	assert(hops);
+
+	st = &sc->streams[HDA_CODEC_STREAM_INPUT];
+	aud = st->aud;
+
+	err = audio_record(aud, st->buf, sizeof(st->buf));
+	assert(!err);
+
+	hops->transfer(hci, st->stream, 0, st->buf, sizeof(st->buf));
+}
+
+static int
+hda_codec_audio_input_do_setup(void *arg)
+{
+	struct hda_codec_softc *sc = (struct hda_codec_softc *)arg;
+	struct hda_codec_stream *st = NULL;
+	struct audio *aud = NULL;
+	struct audio_params params;
+	int err;
+
+	st = &sc->streams[HDA_CODEC_STREAM_INPUT];
+	aud = st->aud;
+
+	err = hda_codec_parse_format(st->fmt, &params);
+	if (err)
+		return (-1);
+
+	DPRINTF("rate: %d, channels: %d, format: 0x%x\n",
+	    params.rate, params.channels, params.format);
+
+	return (audio_set_params(aud, &params));
+}
+
+static uint32_t
+hda_codec_audio_inout_nid(struct hda_codec_stream *st, uint16_t verb,
+    uint16_t payload)
+{
+	uint32_t res = 0;
+	uint8_t mute = 0;
+	uint8_t gain = 0;
+
+	DPRINTF("%s verb: 0x%x, payload: 0x%x\n", st->actx.name, verb, payload);
+
+	switch (verb) {
+	case HDA_CMD_VERB_GET_CONV_FMT:
+		res = st->fmt;
+		break;
+	case HDA_CMD_VERB_SET_CONV_FMT:
+		st->fmt = payload;
+		break;
+	case HDA_CMD_VERB_GET_AMP_GAIN_MUTE:
+		if (payload & HDA_CMD_GET_AMP_GAIN_MUTE_LEFT) {
+			res = st->left_gain | st->left_mute;
+			DPRINTF("GET_AMP_GAIN_MUTE_LEFT: 0x%x\n", res);
+		} else {
+			res = st->right_gain | st->right_mute;
+			DPRINTF("GET_AMP_GAIN_MUTE_RIGHT: 0x%x\n", res);
+		}
+		break;
+	case HDA_CMD_VERB_SET_AMP_GAIN_MUTE:
+		mute = payload & HDA_CODEC_SET_AMP_GAIN_MUTE_MUTE;
+		gain = payload & HDA_CODEC_SET_AMP_GAIN_MUTE_GAIN_MASK;
+
+		if (payload & HDA_CMD_SET_AMP_GAIN_MUTE_LEFT) {
+			st->left_mute = mute;
+			st->left_gain = gain;
+			DPRINTF("SET_AMP_GAIN_MUTE_LEFT: "
+			    "mute: 0x%x gain: 0x%x\n", mute, gain);
+		}
+
+		if (payload & HDA_CMD_SET_AMP_GAIN_MUTE_RIGHT) {
+			st->right_mute = mute;
+			st->right_gain = gain;
+			DPRINTF("SET_AMP_GAIN_MUTE_RIGHT: "
+			    "mute: 0x%x gain: 0x%x\n", mute, gain);
+		}
+		break;
+	case HDA_CMD_VERB_GET_CONV_STREAM_CHAN:
+		res = (st->stream << 4) | st->channel;
+		break;
+	case HDA_CMD_VERB_SET_CONV_STREAM_CHAN:
+		st->channel = payload & 0x0f;
+		st->stream = (payload >> 4) & 0x0f;
+		DPRINTF("st->channel: 0x%x st->stream: 0x%x\n",
+		    st->channel, st->stream);
+		if (!st->stream)
+			hda_audio_ctxt_stop(&st->actx);
+		break;
+	default:
+		DPRINTF("Unknown VERB: 0x%x\n", verb);
+		break;
+	}
+
+	return (res);
+}
+
+struct hda_codec_class hda_codec = {
+	.name = "hda_codec",
+	.init = hda_codec_init,
+	.reset = hda_codec_reset,
+	.command = hda_codec_command,
+	.notify = hda_codec_notify,
+};
+
+HDA_EMUL_SET(hda_codec);
+
+
+/*
+ * HDA Audio Context module function definitions
+ */
+
+static void *
+hda_audio_ctxt_thr(void *arg)
+{
+	struct hda_audio_ctxt *actx = arg;
+
+	DPRINTF("Start Thread: %s\n", actx->name);
+
+	pthread_mutex_lock(&actx->mtx);
+	while (1) {
+		while (!actx->run)
+			pthread_cond_wait(&actx->cond, &actx->mtx);
+
+		actx->do_transfer(actx->priv);
+	}
+	pthread_mutex_unlock(&actx->mtx);
+
+	pthread_exit(NULL);
+	return (NULL);
+}
+
+static int
+hda_audio_ctxt_init(struct hda_audio_ctxt *actx, const char *tname,
+    transfer_func_t do_transfer, setup_func_t do_setup, void *priv)
+{
+	int err;
+
+	assert(actx);
+	assert(tname);
+	assert(do_transfer);
+	assert(do_setup);
+	assert(priv);
+
+	memset(actx, 0, sizeof(*actx));
+
+	actx->run = 0;
+	actx->do_transfer = do_transfer;
+	actx->do_setup = do_setup;
+	actx->priv = priv;
+	if (strlen(tname) < sizeof(actx->name))
+		memcpy(actx->name, tname, strlen(tname) + 1);
+	else
+		strcpy(actx->name, "unknown");
+
+	err = pthread_mutex_init(&actx->mtx, NULL);
+	assert(!err);
+
+	err = pthread_cond_init(&actx->cond, NULL);
+	assert(!err);
+
+	err = pthread_create(&actx->tid, NULL, hda_audio_ctxt_thr, actx);
+	assert(!err);
+
+	pthread_set_name_np(actx->tid, tname);
+
+	actx->started = 1;
+
+	return (0);
+}
+
+static int
+hda_audio_ctxt_start(struct hda_audio_ctxt *actx)
+{
+	int err = 0;
+
+	assert(actx);
+	assert(actx->started);
+
+	/* The stream is supposed to be stopped */
+	if (actx->run)
+		return (-1);
+
+	pthread_mutex_lock(&actx->mtx);
+	err = (* actx->do_setup)(actx->priv);
+	if (!err) {
+		actx->run = 1;
+		pthread_cond_signal(&actx->cond);
+	}
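+	/*
+	 * The context thread waits on actx->cond with actx->mtx held (see
+	 * hda_audio_ctxt_thr() above).  Because do_setup() and the run flag
+	 * update happen under the same mutex, the signal cannot be missed
+	 * and the thread only observes run == 1 once the stream is fully
+	 * configured.
+	 */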
+	pthread_mutex_unlock(&actx->mtx);
+
+	return (err);
+}
+
+static int
+hda_audio_ctxt_stop(struct hda_audio_ctxt *actx)
+{
+	actx->run = 0;
+	return (0);
+}
diff --git a/usr/src/cmd/bhyve/hda_reg.h b/usr/src/cmd/bhyve/hda_reg.h
new file mode 100644
index 0000000000..b3034bf9f4
--- /dev/null
+++ b/usr/src/cmd/bhyve/hda_reg.h
@@ -0,0 +1,1369 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2006 Stephane E. Potvin
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _HDA_REG_H_
+#define _HDA_REG_H_
+
+/****************************************************************************
+ * HDA Device Verbs
+ ****************************************************************************/
+
+/* HDA Command */
+#define HDA_CMD_VERB_MASK 0x000fffff
+#define HDA_CMD_VERB_SHIFT 0
+#define HDA_CMD_NID_MASK 0x0ff00000
+#define HDA_CMD_NID_SHIFT 20
+#define HDA_CMD_CAD_MASK 0xf0000000
+#define HDA_CMD_CAD_SHIFT 28
+
+#define HDA_CMD_VERB_4BIT_SHIFT 16
+#define HDA_CMD_VERB_12BIT_SHIFT 8
+
+#define HDA_CMD_VERB_4BIT(verb, payload) \
+	(((verb) << HDA_CMD_VERB_4BIT_SHIFT) | (payload))
+#define HDA_CMD_4BIT(cad, nid, verb, payload) \
+	(((cad) << HDA_CMD_CAD_SHIFT) | \
+	((nid) << HDA_CMD_NID_SHIFT) | \
+	(HDA_CMD_VERB_4BIT((verb), (payload))))
+
+#define HDA_CMD_VERB_12BIT(verb, payload) \
+	(((verb) << HDA_CMD_VERB_12BIT_SHIFT) | (payload))
+#define HDA_CMD_12BIT(cad, nid, verb, payload) \
+	(((cad) << HDA_CMD_CAD_SHIFT) | \
+	((nid) << HDA_CMD_NID_SHIFT) | \
+	(HDA_CMD_VERB_12BIT((verb), (payload))))
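+
+/*
+ * Worked example: HDA_CMD_4BIT(0, 2, 0x2, 0x0011) encodes 0x00220011, a
+ * "set converter format" for nid 2 carrying the 16-bit payload 0x0011;
+ * HDA_CMD_12BIT(0, 2, 0xf00, 0x04) encodes 0x002f0004, a "get parameter"
+ * request for the subordinate node count of nid 2.
+ */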
+
+/* Get Parameter */
+#define HDA_CMD_VERB_GET_PARAMETER 0xf00
+
+#define HDA_CMD_GET_PARAMETER(cad, nid, payload) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_GET_PARAMETER, (payload)))
+
+/* Connection Select Control */
+#define HDA_CMD_VERB_GET_CONN_SELECT_CONTROL 0xf01
+#define HDA_CMD_VERB_SET_CONN_SELECT_CONTROL 0x701
+
+#define HDA_CMD_GET_CONN_SELECT_CONTROL(cad, nid) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_GET_CONN_SELECT_CONTROL, 0x0))
+#define HDA_CMD_SET_CONNECTION_SELECT_CONTROL(cad, nid, payload) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_CONN_SELECT_CONTROL, (payload)))
+
+/* Connection List Entry */
+#define HDA_CMD_VERB_GET_CONN_LIST_ENTRY 0xf02
+
+#define HDA_CMD_GET_CONN_LIST_ENTRY(cad, nid, payload) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_GET_CONN_LIST_ENTRY, (payload)))
+
+#define HDA_CMD_GET_CONN_LIST_ENTRY_SIZE_SHORT 1
+#define HDA_CMD_GET_CONN_LIST_ENTRY_SIZE_LONG 2
+
+/* Processing State */
+#define HDA_CMD_VERB_GET_PROCESSING_STATE 0xf03
+#define HDA_CMD_VERB_SET_PROCESSING_STATE 0x703
+
+#define HDA_CMD_GET_PROCESSING_STATE(cad, nid) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_GET_PROCESSING_STATE, 0x0))
+#define HDA_CMD_SET_PROCESSING_STATE(cad, nid, payload) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_PROCESSING_STATE, (payload)))
+
+#define HDA_CMD_GET_PROCESSING_STATE_STATE_OFF 0x00
+#define HDA_CMD_GET_PROCESSING_STATE_STATE_ON 0x01
+#define HDA_CMD_GET_PROCESSING_STATE_STATE_BENIGN 0x02
+
+/* Coefficient Index */
+#define HDA_CMD_VERB_GET_COEFF_INDEX 0xd
+#define HDA_CMD_VERB_SET_COEFF_INDEX 0x5
+
+#define HDA_CMD_GET_COEFF_INDEX(cad, nid) \
+	(HDA_CMD_4BIT((cad), (nid), \
+	HDA_CMD_VERB_GET_COEFF_INDEX, 0x0))
+#define HDA_CMD_SET_COEFF_INDEX(cad, nid, payload) \
+	(HDA_CMD_4BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_COEFF_INDEX, (payload)))
+
+/* Processing Coefficient */
+#define HDA_CMD_VERB_GET_PROCESSING_COEFF 0xc
+#define HDA_CMD_VERB_SET_PROCESSING_COEFF 0x4
+
+#define HDA_CMD_GET_PROCESSING_COEFF(cad, nid) \
+	(HDA_CMD_4BIT((cad), (nid), \
+	HDA_CMD_VERB_GET_PROCESSING_COEFF, 0x0))
+#define HDA_CMD_SET_PROCESSING_COEFF(cad, nid, payload) \
+	(HDA_CMD_4BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_PROCESSING_COEFF, (payload)))
+
+/* Amplifier Gain/Mute */
+#define HDA_CMD_VERB_GET_AMP_GAIN_MUTE 0xb
+#define HDA_CMD_VERB_SET_AMP_GAIN_MUTE 0x3
+
+#define HDA_CMD_GET_AMP_GAIN_MUTE(cad, nid, payload) \
+	(HDA_CMD_4BIT((cad), (nid), \
+	HDA_CMD_VERB_GET_AMP_GAIN_MUTE, (payload)))
+#define HDA_CMD_SET_AMP_GAIN_MUTE(cad, nid, payload) \
+	(HDA_CMD_4BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_AMP_GAIN_MUTE, (payload)))
+
+#define HDA_CMD_GET_AMP_GAIN_MUTE_INPUT 0x0000
+#define HDA_CMD_GET_AMP_GAIN_MUTE_OUTPUT 0x8000
+#define HDA_CMD_GET_AMP_GAIN_MUTE_RIGHT 0x0000
+#define HDA_CMD_GET_AMP_GAIN_MUTE_LEFT 0x2000
+
+#define HDA_CMD_GET_AMP_GAIN_MUTE_MUTE_MASK 0x00000080
+#define HDA_CMD_GET_AMP_GAIN_MUTE_MUTE_SHIFT 7
+#define HDA_CMD_GET_AMP_GAIN_MUTE_GAIN_MASK 0x0000007f
+#define HDA_CMD_GET_AMP_GAIN_MUTE_GAIN_SHIFT 0
+
+#define HDA_CMD_GET_AMP_GAIN_MUTE_MUTE(rsp) \
+	(((rsp) & HDA_CMD_GET_AMP_GAIN_MUTE_MUTE_MASK) >> \
+	HDA_CMD_GET_AMP_GAIN_MUTE_MUTE_SHIFT)
+#define HDA_CMD_GET_AMP_GAIN_MUTE_GAIN(rsp) \
+	(((rsp) & HDA_CMD_GET_AMP_GAIN_MUTE_GAIN_MASK) >> \
+	HDA_CMD_GET_AMP_GAIN_MUTE_GAIN_SHIFT)
+
+#define HDA_CMD_SET_AMP_GAIN_MUTE_OUTPUT 0x8000
+#define HDA_CMD_SET_AMP_GAIN_MUTE_INPUT 0x4000
+#define HDA_CMD_SET_AMP_GAIN_MUTE_LEFT 0x2000
+#define HDA_CMD_SET_AMP_GAIN_MUTE_RIGHT 0x1000
+#define HDA_CMD_SET_AMP_GAIN_MUTE_INDEX_MASK 0x0f00
+#define HDA_CMD_SET_AMP_GAIN_MUTE_INDEX_SHIFT 8
+#define HDA_CMD_SET_AMP_GAIN_MUTE_MUTE 0x0080
+#define HDA_CMD_SET_AMP_GAIN_MUTE_GAIN_MASK 0x007f
+#define HDA_CMD_SET_AMP_GAIN_MUTE_GAIN_SHIFT 0
+
+#define HDA_CMD_SET_AMP_GAIN_MUTE_INDEX(index) \
+	(((index) << HDA_CMD_SET_AMP_GAIN_MUTE_INDEX_SHIFT) & \
+	HDA_CMD_SET_AMP_GAIN_MUTE_INDEX_MASK)
+#define HDA_CMD_SET_AMP_GAIN_MUTE_GAIN(index) \
+	(((index) << HDA_CMD_SET_AMP_GAIN_MUTE_GAIN_SHIFT) & \
+	HDA_CMD_SET_AMP_GAIN_MUTE_GAIN_MASK)
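+
+/*
+ * Worked example: payload 0xb04a (OUTPUT | LEFT | RIGHT | gain 0x4a)
+ * sets both output channels to gain step 0x4a, unmuted; payload 0xb080
+ * (OUTPUT | LEFT | RIGHT | MUTE) mutes both channels at gain 0.
+ */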
+
+/* Converter format */
+#define HDA_CMD_VERB_GET_CONV_FMT 0xa
+#define HDA_CMD_VERB_SET_CONV_FMT 0x2
+
+#define HDA_CMD_GET_CONV_FMT(cad, nid) \
+	(HDA_CMD_4BIT((cad), (nid), \
+	HDA_CMD_VERB_GET_CONV_FMT, 0x0))
+#define HDA_CMD_SET_CONV_FMT(cad, nid, payload) \
+	(HDA_CMD_4BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_CONV_FMT, (payload)))
+
+/* Digital Converter Control */
+#define HDA_CMD_VERB_GET_DIGITAL_CONV_FMT1 0xf0d
+#define HDA_CMD_VERB_GET_DIGITAL_CONV_FMT2 0xf0e
+#define HDA_CMD_VERB_SET_DIGITAL_CONV_FMT1 0x70d
+#define HDA_CMD_VERB_SET_DIGITAL_CONV_FMT2 0x70e
+
+#define HDA_CMD_GET_DIGITAL_CONV_FMT(cad, nid) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_GET_DIGITAL_CONV_FMT1, 0x0))
+#define HDA_CMD_SET_DIGITAL_CONV_FMT1(cad, nid, payload) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_DIGITAL_CONV_FMT1, (payload)))
+#define HDA_CMD_SET_DIGITAL_CONV_FMT2(cad, nid, payload) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_DIGITAL_CONV_FMT2, (payload)))
+
+#define HDA_CMD_GET_DIGITAL_CONV_FMT_CC_MASK 0x7f00
+#define HDA_CMD_GET_DIGITAL_CONV_FMT_CC_SHIFT 8
+#define HDA_CMD_GET_DIGITAL_CONV_FMT_L_MASK 0x0080
+#define HDA_CMD_GET_DIGITAL_CONV_FMT_L_SHIFT 7
+#define HDA_CMD_GET_DIGITAL_CONV_FMT_PRO_MASK 0x0040
+#define HDA_CMD_GET_DIGITAL_CONV_FMT_PRO_SHIFT 6
+#define HDA_CMD_GET_DIGITAL_CONV_FMT_NAUDIO_MASK 0x0020
+#define HDA_CMD_GET_DIGITAL_CONV_FMT_NAUDIO_SHIFT 5
+#define HDA_CMD_GET_DIGITAL_CONV_FMT_COPY_MASK 0x0010
+#define HDA_CMD_GET_DIGITAL_CONV_FMT_COPY_SHIFT 4
+#define HDA_CMD_GET_DIGITAL_CONV_FMT_PRE_MASK 0x0008
+#define HDA_CMD_GET_DIGITAL_CONV_FMT_PRE_SHIFT 3
+#define HDA_CMD_GET_DIGITAL_CONV_FMT_VCFG_MASK 0x0004
+#define HDA_CMD_GET_DIGITAL_CONV_FMT_VCFG_SHIFT 2
+#define HDA_CMD_GET_DIGITAL_CONV_FMT_V_MASK 0x0002
+#define HDA_CMD_GET_DIGITAL_CONV_FMT_V_SHIFT 1
+#define HDA_CMD_GET_DIGITAL_CONV_FMT_DIGEN_MASK 0x0001
+#define HDA_CMD_GET_DIGITAL_CONV_FMT_DIGEN_SHIFT 0
+
+#define HDA_CMD_GET_DIGITAL_CONV_FMT_CC(rsp) \
+	(((rsp) & HDA_CMD_GET_DIGITAL_CONV_FMT_CC_MASK) >> \
+	HDA_CMD_GET_DIGITAL_CONV_FMT_CC_SHIFT)
+#define HDA_CMD_GET_DIGITAL_CONV_FMT_L(rsp) \
+	(((rsp) & HDA_CMD_GET_DIGITAL_CONV_FMT_L_MASK) >> \
+	HDA_CMD_GET_DIGITAL_CONV_FMT_L_SHIFT)
+#define HDA_CMD_GET_DIGITAL_CONV_FMT_PRO(rsp) \
+	(((rsp) & HDA_CMD_GET_DIGITAL_CONV_FMT_PRO_MASK) >> \
+	HDA_CMD_GET_DIGITAL_CONV_FMT_PRO_SHIFT)
+#define HDA_CMD_GET_DIGITAL_CONV_FMT_NAUDIO(rsp) \
+	(((rsp) & HDA_CMD_GET_DIGITAL_CONV_FMT_NAUDIO_MASK) >> \
+	HDA_CMD_GET_DIGITAL_CONV_FMT_NAUDIO_SHIFT)
+#define HDA_CMD_GET_DIGITAL_CONV_FMT_COPY(rsp) \
+	(((rsp) & HDA_CMD_GET_DIGITAL_CONV_FMT_COPY_MASK) >> \
+	HDA_CMD_GET_DIGITAL_CONV_FMT_COPY_SHIFT)
+#define HDA_CMD_GET_DIGITAL_CONV_FMT_PRE(rsp) \
+	(((rsp) & HDA_CMD_GET_DIGITAL_CONV_FMT_PRE_MASK) >> \
+	HDA_CMD_GET_DIGITAL_CONV_FMT_PRE_SHIFT)
+#define HDA_CMD_GET_DIGITAL_CONV_FMT_VCFG(rsp) \
+	(((rsp) & HDA_CMD_GET_DIGITAL_CONV_FMT_VCFG_MASK) >> \
+	HDA_CMD_GET_DIGITAL_CONV_FMT_VCFG_SHIFT)
+#define HDA_CMD_GET_DIGITAL_CONV_FMT_V(rsp) \
+	(((rsp) & HDA_CMD_GET_DIGITAL_CONV_FMT_V_MASK) >> \
+	HDA_CMD_GET_DIGITAL_CONV_FMT_V_SHIFT)
+#define HDA_CMD_GET_DIGITAL_CONV_FMT_DIGEN(rsp) \
+	(((rsp) & HDA_CMD_GET_DIGITAL_CONV_FMT_DIGEN_MASK) >> \
+	HDA_CMD_GET_DIGITAL_CONV_FMT_DIGEN_SHIFT)
+
+#define HDA_CMD_SET_DIGITAL_CONV_FMT1_L 0x80
+#define HDA_CMD_SET_DIGITAL_CONV_FMT1_PRO 0x40
+#define HDA_CMD_SET_DIGITAL_CONV_FMT1_NAUDIO 0x20
+#define HDA_CMD_SET_DIGITAL_CONV_FMT1_COPY 0x10
+#define HDA_CMD_SET_DIGITAL_CONV_FMT1_PRE 0x08
+#define HDA_CMD_SET_DIGITAL_CONV_FMT1_VCFG 0x04
+#define HDA_CMD_SET_DIGITAL_CONV_FMT1_V 0x02
+#define HDA_CMD_SET_DIGITAL_CONV_FMT1_DIGEN 0x01
+
+/* Power State */
+#define HDA_CMD_VERB_GET_POWER_STATE 0xf05
+#define HDA_CMD_VERB_SET_POWER_STATE 0x705
+
+#define HDA_CMD_GET_POWER_STATE(cad, nid) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_GET_POWER_STATE, 0x0))
+#define HDA_CMD_SET_POWER_STATE(cad, nid, payload) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_POWER_STATE, (payload)))
+
+#define HDA_CMD_POWER_STATE_D0 0x00
+#define HDA_CMD_POWER_STATE_D1 0x01
+#define HDA_CMD_POWER_STATE_D2 0x02
+#define HDA_CMD_POWER_STATE_D3 0x03
+
+#define HDA_CMD_POWER_STATE_ACT_MASK 0x000000f0
+#define HDA_CMD_POWER_STATE_ACT_SHIFT 4
+#define HDA_CMD_POWER_STATE_SET_MASK 0x0000000f
+#define HDA_CMD_POWER_STATE_SET_SHIFT 0
+
+#define HDA_CMD_GET_POWER_STATE_ACT(rsp) \
+	(((rsp) & HDA_CMD_POWER_STATE_ACT_MASK) >> \
+	HDA_CMD_POWER_STATE_ACT_SHIFT)
+#define HDA_CMD_GET_POWER_STATE_SET(rsp) \
+	(((rsp) & HDA_CMD_POWER_STATE_SET_MASK) >> \
+	HDA_CMD_POWER_STATE_SET_SHIFT)
+
+#define HDA_CMD_SET_POWER_STATE_ACT(ps) \
+	(((ps) << HDA_CMD_POWER_STATE_ACT_SHIFT) & \
+	HDA_CMD_POWER_STATE_ACT_MASK)
+#define HDA_CMD_SET_POWER_STATE_SET(ps) \
+	(((ps) << HDA_CMD_POWER_STATE_SET_SHIFT) & \
+	HDA_CMD_POWER_STATE_SET_MASK)
+
+/* Converter Stream, Channel */
+#define HDA_CMD_VERB_GET_CONV_STREAM_CHAN 0xf06
+#define HDA_CMD_VERB_SET_CONV_STREAM_CHAN 0x706
+
+#define HDA_CMD_GET_CONV_STREAM_CHAN(cad, nid) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_GET_CONV_STREAM_CHAN, 0x0))
+#define HDA_CMD_SET_CONV_STREAM_CHAN(cad, nid, payload) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_CONV_STREAM_CHAN, (payload)))
+
+#define HDA_CMD_CONV_STREAM_CHAN_STREAM_MASK 0x000000f0
+#define HDA_CMD_CONV_STREAM_CHAN_STREAM_SHIFT 4
+#define HDA_CMD_CONV_STREAM_CHAN_CHAN_MASK 0x0000000f
+#define HDA_CMD_CONV_STREAM_CHAN_CHAN_SHIFT 0
+
+#define HDA_CMD_GET_CONV_STREAM_CHAN_STREAM(rsp) \
+	(((rsp) & HDA_CMD_CONV_STREAM_CHAN_STREAM_MASK) >> \
+	HDA_CMD_CONV_STREAM_CHAN_STREAM_SHIFT)
+#define HDA_CMD_GET_CONV_STREAM_CHAN_CHAN(rsp) \
+	(((rsp) & HDA_CMD_CONV_STREAM_CHAN_CHAN_MASK) >> \
+	HDA_CMD_CONV_STREAM_CHAN_CHAN_SHIFT)
+
+#define HDA_CMD_SET_CONV_STREAM_CHAN_STREAM(param) \
+	(((param) << HDA_CMD_CONV_STREAM_CHAN_STREAM_SHIFT) & \
+	HDA_CMD_CONV_STREAM_CHAN_STREAM_MASK)
+#define HDA_CMD_SET_CONV_STREAM_CHAN_CHAN(param) \
+	(((param) << HDA_CMD_CONV_STREAM_CHAN_CHAN_SHIFT) & \
+	HDA_CMD_CONV_STREAM_CHAN_CHAN_MASK)
+
+/* Input Converter SDI Select */
+#define HDA_CMD_VERB_GET_INPUT_CONVERTER_SDI_SELECT 0xf04
+#define HDA_CMD_VERB_SET_INPUT_CONVERTER_SDI_SELECT 0x704
+
+#define HDA_CMD_GET_INPUT_CONVERTER_SDI_SELECT(cad, nid) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_GET_INPUT_CONVERTER_SDI_SELECT, 0x0))
+#define HDA_CMD_SET_INPUT_CONVERTER_SDI_SELECT(cad, nid, payload) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_INPUT_CONVERTER_SDI_SELECT, (payload)))
+
+/* Pin Widget Control */
+#define HDA_CMD_VERB_GET_PIN_WIDGET_CTRL 0xf07
+#define HDA_CMD_VERB_SET_PIN_WIDGET_CTRL 0x707
+
+#define HDA_CMD_GET_PIN_WIDGET_CTRL(cad, nid) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_GET_PIN_WIDGET_CTRL, 0x0))
+#define HDA_CMD_SET_PIN_WIDGET_CTRL(cad, nid, payload) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_PIN_WIDGET_CTRL, (payload)))
+
+#define HDA_CMD_GET_PIN_WIDGET_CTRL_HPHN_ENABLE_MASK 0x00000080
+#define HDA_CMD_GET_PIN_WIDGET_CTRL_HPHN_ENABLE_SHIFT 7
+#define HDA_CMD_GET_PIN_WIDGET_CTRL_OUT_ENABLE_MASK 0x00000040
+#define HDA_CMD_GET_PIN_WIDGET_CTRL_OUT_ENABLE_SHIFT 6
+#define HDA_CMD_GET_PIN_WIDGET_CTRL_IN_ENABLE_MASK 0x00000020
+#define HDA_CMD_GET_PIN_WIDGET_CTRL_IN_ENABLE_SHIFT 5
+#define HDA_CMD_GET_PIN_WIDGET_CTRL_VREF_ENABLE_MASK 0x00000007
+#define HDA_CMD_GET_PIN_WIDGET_CTRL_VREF_ENABLE_SHIFT 0
+
+#define HDA_CMD_GET_PIN_WIDGET_CTRL_HPHN_ENABLE(rsp) \
+	(((rsp) & HDA_CMD_GET_PIN_WIDGET_CTRL_HPHN_ENABLE_MASK) >> \
+	HDA_CMD_GET_PIN_WIDGET_CTRL_HPHN_ENABLE_SHIFT)
+#define HDA_CMD_GET_PIN_WIDGET_CTRL_OUT_ENABLE(rsp) \
+	(((rsp) & HDA_CMD_GET_PIN_WIDGET_CTRL_OUT_ENABLE_MASK) >> \
+	HDA_CMD_GET_PIN_WIDGET_CTRL_OUT_ENABLE_SHIFT)
+#define HDA_CMD_GET_PIN_WIDGET_CTRL_IN_ENABLE(rsp) \
+	(((rsp) & HDA_CMD_GET_PIN_WIDGET_CTRL_IN_ENABLE_MASK) >> \
+	HDA_CMD_GET_PIN_WIDGET_CTRL_IN_ENABLE_SHIFT)
+#define HDA_CMD_GET_PIN_WIDGET_CTRL_VREF_ENABLE(rsp) \
+	(((rsp) & HDA_CMD_GET_PIN_WIDGET_CTRL_VREF_ENABLE_MASK) >> \
+	HDA_CMD_GET_PIN_WIDGET_CTRL_VREF_ENABLE_SHIFT)
+
+#define HDA_CMD_SET_PIN_WIDGET_CTRL_HPHN_ENABLE 0x80
+#define HDA_CMD_SET_PIN_WIDGET_CTRL_OUT_ENABLE 0x40
+#define HDA_CMD_SET_PIN_WIDGET_CTRL_IN_ENABLE 0x20
+#define HDA_CMD_SET_PIN_WIDGET_CTRL_VREF_ENABLE_MASK 0x07
+#define HDA_CMD_SET_PIN_WIDGET_CTRL_VREF_ENABLE_SHIFT 0
+
+#define HDA_CMD_SET_PIN_WIDGET_CTRL_VREF_ENABLE(param) \
+	(((param) << HDA_CMD_SET_PIN_WIDGET_CTRL_VREF_ENABLE_SHIFT) & \
+	HDA_CMD_SET_PIN_WIDGET_CTRL_VREF_ENABLE_MASK)
+
+#define HDA_CMD_PIN_WIDGET_CTRL_VREF_ENABLE_HIZ 0
+#define HDA_CMD_PIN_WIDGET_CTRL_VREF_ENABLE_50 1
+#define HDA_CMD_PIN_WIDGET_CTRL_VREF_ENABLE_GROUND 2
+#define HDA_CMD_PIN_WIDGET_CTRL_VREF_ENABLE_80 4
+#define HDA_CMD_PIN_WIDGET_CTRL_VREF_ENABLE_100 5
+
+/* Unsolicited Response */
+#define HDA_CMD_VERB_GET_UNSOLICITED_RESPONSE 0xf08
+#define HDA_CMD_VERB_SET_UNSOLICITED_RESPONSE 0x708
+
+#define HDA_CMD_GET_UNSOLICITED_RESPONSE(cad, nid) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_GET_UNSOLICITED_RESPONSE, 0x0))
+#define HDA_CMD_SET_UNSOLICITED_RESPONSE(cad, nid, payload) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_UNSOLICITED_RESPONSE, (payload)))
+
+#define HDA_CMD_GET_UNSOLICITED_RESPONSE_ENABLE_MASK 0x00000080
+#define HDA_CMD_GET_UNSOLICITED_RESPONSE_ENABLE_SHIFT 7
+#define HDA_CMD_GET_UNSOLICITED_RESPONSE_TAG_MASK 0x0000001f
+#define HDA_CMD_GET_UNSOLICITED_RESPONSE_TAG_SHIFT 0
+
+#define HDA_CMD_GET_UNSOLICITED_RESPONSE_ENABLE(rsp) \
+	(((rsp) & HDA_CMD_GET_UNSOLICITED_RESPONSE_ENABLE_MASK) >> \
+	HDA_CMD_GET_UNSOLICITED_RESPONSE_ENABLE_SHIFT)
+#define HDA_CMD_GET_UNSOLICITED_RESPONSE_TAG(rsp) \
+	(((rsp) & HDA_CMD_GET_UNSOLICITED_RESPONSE_TAG_MASK) >> \
+	HDA_CMD_GET_UNSOLICITED_RESPONSE_TAG_SHIFT)
+
+#define HDA_CMD_SET_UNSOLICITED_RESPONSE_ENABLE 0x80
+#define HDA_CMD_SET_UNSOLICITED_RESPONSE_TAG_MASK 0x3f
+#define HDA_CMD_SET_UNSOLICITED_RESPONSE_TAG_SHIFT 0
+
+#define HDA_CMD_SET_UNSOLICITED_RESPONSE_TAG(param) \
+	(((param) << HDA_CMD_SET_UNSOLICITED_RESPONSE_TAG_SHIFT) & \
+	HDA_CMD_SET_UNSOLICITED_RESPONSE_TAG_MASK)
+
+/* Pin Sense */
+#define HDA_CMD_VERB_GET_PIN_SENSE 0xf09
+#define HDA_CMD_VERB_SET_PIN_SENSE 0x709
+
+#define HDA_CMD_GET_PIN_SENSE(cad, nid) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_GET_PIN_SENSE, 0x0))
+#define HDA_CMD_SET_PIN_SENSE(cad, nid, payload) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_PIN_SENSE, (payload)))
+
+#define HDA_CMD_GET_PIN_SENSE_PRESENCE_DETECT 0x80000000
+#define HDA_CMD_GET_PIN_SENSE_ELD_VALID 0x40000000
+#define HDA_CMD_GET_PIN_SENSE_IMP_SENSE_MASK 0x7fffffff
+#define HDA_CMD_GET_PIN_SENSE_IMP_SENSE_SHIFT 0
+
+#define HDA_CMD_GET_PIN_SENSE_IMP_SENSE(rsp) \
+	(((rsp) & HDA_CMD_GET_PIN_SENSE_IMP_SENSE_MASK) >> \
+	HDA_CMD_GET_PIN_SENSE_IMP_SENSE_SHIFT)
+
+#define HDA_CMD_GET_PIN_SENSE_IMP_SENSE_INVALID 0x7fffffff
+
+#define HDA_CMD_SET_PIN_SENSE_LEFT_CHANNEL 0x00
+#define HDA_CMD_SET_PIN_SENSE_RIGHT_CHANNEL 0x01
+
+/* EAPD/BTL Enable */
+#define HDA_CMD_VERB_GET_EAPD_BTL_ENABLE 0xf0c
+#define HDA_CMD_VERB_SET_EAPD_BTL_ENABLE 0x70c
+
+#define HDA_CMD_GET_EAPD_BTL_ENABLE(cad, nid) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_GET_EAPD_BTL_ENABLE, 0x0))
+#define HDA_CMD_SET_EAPD_BTL_ENABLE(cad, nid, payload) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_EAPD_BTL_ENABLE, (payload)))
+
+#define HDA_CMD_GET_EAPD_BTL_ENABLE_LR_SWAP_MASK 0x00000004
+#define HDA_CMD_GET_EAPD_BTL_ENABLE_LR_SWAP_SHIFT 2
+#define HDA_CMD_GET_EAPD_BTL_ENABLE_EAPD_MASK 0x00000002
+#define HDA_CMD_GET_EAPD_BTL_ENABLE_EAPD_SHIFT 1
+#define HDA_CMD_GET_EAPD_BTL_ENABLE_BTL_MASK 0x00000001
+#define HDA_CMD_GET_EAPD_BTL_ENABLE_BTL_SHIFT 0
+
+#define HDA_CMD_GET_EAPD_BTL_ENABLE_LR_SWAP(rsp) \
+	(((rsp) & HDA_CMD_GET_EAPD_BTL_ENABLE_LR_SWAP_MASK) >> \
+	HDA_CMD_GET_EAPD_BTL_ENABLE_LR_SWAP_SHIFT)
+#define HDA_CMD_GET_EAPD_BTL_ENABLE_EAPD(rsp) \
+	(((rsp) & HDA_CMD_GET_EAPD_BTL_ENABLE_EAPD_MASK) >> \
+	HDA_CMD_GET_EAPD_BTL_ENABLE_EAPD_SHIFT)
+#define HDA_CMD_GET_EAPD_BTL_ENABLE_BTL(rsp) \
+	(((rsp) & HDA_CMD_GET_EAPD_BTL_ENABLE_BTL_MASK) >> \
+	HDA_CMD_GET_EAPD_BTL_ENABLE_BTL_SHIFT)
+
+#define HDA_CMD_SET_EAPD_BTL_ENABLE_LR_SWAP 0x04
+#define HDA_CMD_SET_EAPD_BTL_ENABLE_EAPD 0x02
+#define HDA_CMD_SET_EAPD_BTL_ENABLE_BTL 0x01
+
+/* GPI Data */
+#define HDA_CMD_VERB_GET_GPI_DATA 0xf10
+#define HDA_CMD_VERB_SET_GPI_DATA 0x710
+
+#define HDA_CMD_GET_GPI_DATA(cad, nid) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_GET_GPI_DATA, 0x0))
+#define HDA_CMD_SET_GPI_DATA(cad, nid, payload) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_GPI_DATA, (payload)))
+
+/* GPI Wake Enable Mask */
+#define HDA_CMD_VERB_GET_GPI_WAKE_ENABLE_MASK 0xf11
+#define HDA_CMD_VERB_SET_GPI_WAKE_ENABLE_MASK 0x711
+
+#define HDA_CMD_GET_GPI_WAKE_ENABLE_MASK(cad, nid) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_GET_GPI_WAKE_ENABLE_MASK, 0x0))
+#define HDA_CMD_SET_GPI_WAKE_ENABLE_MASK(cad, nid, payload) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_GPI_WAKE_ENABLE_MASK, (payload)))
+
+/* GPI Unsolicited Enable Mask */
+#define HDA_CMD_VERB_GET_GPI_UNSOLICITED_ENABLE_MASK 0xf12
+#define HDA_CMD_VERB_SET_GPI_UNSOLICITED_ENABLE_MASK 0x712
+
+#define HDA_CMD_GET_GPI_UNSOLICITED_ENABLE_MASK(cad, nid) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_GET_GPI_UNSOLICITED_ENABLE_MASK, 0x0))
+#define HDA_CMD_SET_GPI_UNSOLICITED_ENABLE_MASK(cad, nid, payload) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_GPI_UNSOLICITED_ENABLE_MASK, (payload)))
+
+/* GPI Sticky Mask */
+#define HDA_CMD_VERB_GET_GPI_STICKY_MASK 0xf13
+#define HDA_CMD_VERB_SET_GPI_STICKY_MASK 0x713
+
+#define HDA_CMD_GET_GPI_STICKY_MASK(cad, nid) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_GET_GPI_STICKY_MASK, 0x0))
+#define HDA_CMD_SET_GPI_STICKY_MASK(cad, nid, payload) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_GPI_STICKY_MASK, (payload)))
+
+/* GPO Data */
+#define HDA_CMD_VERB_GET_GPO_DATA 0xf14
+#define HDA_CMD_VERB_SET_GPO_DATA 0x714
+
+#define HDA_CMD_GET_GPO_DATA(cad, nid) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_GET_GPO_DATA, 0x0))
+#define HDA_CMD_SET_GPO_DATA(cad, nid, payload) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_GPO_DATA, (payload)))
+
+/* GPIO Data */
+#define HDA_CMD_VERB_GET_GPIO_DATA 0xf15
+#define HDA_CMD_VERB_SET_GPIO_DATA 0x715
+
+#define HDA_CMD_GET_GPIO_DATA(cad, nid) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_GET_GPIO_DATA, 0x0))
+#define HDA_CMD_SET_GPIO_DATA(cad, nid, payload) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_GPIO_DATA, (payload)))
+
+/* GPIO Enable Mask */
+#define HDA_CMD_VERB_GET_GPIO_ENABLE_MASK 0xf16
+#define HDA_CMD_VERB_SET_GPIO_ENABLE_MASK 0x716
+
+#define HDA_CMD_GET_GPIO_ENABLE_MASK(cad, nid) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_GET_GPIO_ENABLE_MASK, 0x0))
+#define HDA_CMD_SET_GPIO_ENABLE_MASK(cad, nid, payload) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_GPIO_ENABLE_MASK, (payload)))
+
+/* GPIO Direction */
+#define HDA_CMD_VERB_GET_GPIO_DIRECTION 0xf17
+#define HDA_CMD_VERB_SET_GPIO_DIRECTION 0x717
+
+#define HDA_CMD_GET_GPIO_DIRECTION(cad, nid) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_GET_GPIO_DIRECTION, 0x0))
+#define HDA_CMD_SET_GPIO_DIRECTION(cad, nid, payload) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_GPIO_DIRECTION, (payload)))
+
+/* GPIO Wake Enable Mask */
+#define HDA_CMD_VERB_GET_GPIO_WAKE_ENABLE_MASK 0xf18
+#define HDA_CMD_VERB_SET_GPIO_WAKE_ENABLE_MASK 0x718
+
+#define HDA_CMD_GET_GPIO_WAKE_ENABLE_MASK(cad, nid) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_GET_GPIO_WAKE_ENABLE_MASK, 0x0))
+#define HDA_CMD_SET_GPIO_WAKE_ENABLE_MASK(cad, nid, payload) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_GPIO_WAKE_ENABLE_MASK, (payload)))
+
+/* GPIO Unsolicited Enable Mask */
+#define HDA_CMD_VERB_GET_GPIO_UNSOLICITED_ENABLE_MASK 0xf19
+#define HDA_CMD_VERB_SET_GPIO_UNSOLICITED_ENABLE_MASK 0x719
+
+#define HDA_CMD_GET_GPIO_UNSOLICITED_ENABLE_MASK(cad, nid) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_GET_GPIO_UNSOLICITED_ENABLE_MASK, 0x0))
+#define HDA_CMD_SET_GPIO_UNSOLICITED_ENABLE_MASK(cad, nid, payload) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_GPIO_UNSOLICITED_ENABLE_MASK, (payload)))
+
+/* GPIO_STICKY_MASK */
+#define HDA_CMD_VERB_GET_GPIO_STICKY_MASK 0xf1a
+#define HDA_CMD_VERB_SET_GPIO_STICKY_MASK 0x71a
+
+#define HDA_CMD_GET_GPIO_STICKY_MASK(cad, nid) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_GET_GPIO_STICKY_MASK, 0x0))
+#define HDA_CMD_SET_GPIO_STICKY_MASK(cad, nid, payload) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_GPIO_STICKY_MASK, (payload)))
+
+/* Beep Generation */
+#define HDA_CMD_VERB_GET_BEEP_GENERATION 0xf0a
+#define HDA_CMD_VERB_SET_BEEP_GENERATION 0x70a
+
+#define HDA_CMD_GET_BEEP_GENERATION(cad, nid) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_GET_BEEP_GENERATION, 0x0))
+#define HDA_CMD_SET_BEEP_GENERATION(cad, nid, payload) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_BEEP_GENERATION, (payload)))
+
+/* Volume Knob */
+#define HDA_CMD_VERB_GET_VOLUME_KNOB 0xf0f
+#define HDA_CMD_VERB_SET_VOLUME_KNOB 0x70f
+
+#define HDA_CMD_GET_VOLUME_KNOB(cad, nid) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_GET_VOLUME_KNOB, 0x0))
+#define HDA_CMD_SET_VOLUME_KNOB(cad, nid, payload) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_VOLUME_KNOB, (payload)))
+
+/* Subsystem ID */
+#define HDA_CMD_VERB_GET_SUBSYSTEM_ID 0xf20
+#define HDA_CMD_VERB_SET_SUBSYSTEM_ID1 0x720
+#define HDA_CMD_VERB_SET_SUBSYSTEM_ID2 0x721
+#define HDA_CMD_VERB_SET_SUBSYSTEM_ID3 0x722
+#define HDA_CMD_VERB_SET_SUBSYSTEM_ID4 0x723
+
+#define HDA_CMD_GET_SUBSYSTEM_ID(cad, nid) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_GET_SUBSYSTEM_ID, 0x0))
+#define HDA_CMD_SET_SUBSYSTEM_ID1(cad, nid, payload) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_SUBSYSTEM_ID1, (payload)))
+#define HDA_CMD_SET_SUBSYSTEM_ID2(cad, nid, payload) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_SUBSYSTEM_ID2, (payload)))
+#define HDA_CMD_SET_SUBSYSTEM_ID3(cad, nid, payload) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_SUBSYSTEM_ID3, (payload)))
+#define HDA_CMD_SET_SUBSYSTEM_ID4(cad, nid, payload) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_SUBSYSTEM_ID4, (payload)))
+
+/* Configuration Default */
+#define HDA_CMD_VERB_GET_CONFIGURATION_DEFAULT 0xf1c
+#define HDA_CMD_VERB_SET_CONFIGURATION_DEFAULT1 0x71c
+#define HDA_CMD_VERB_SET_CONFIGURATION_DEFAULT2 0x71d
+#define HDA_CMD_VERB_SET_CONFIGURATION_DEFAULT3 0x71e
+#define HDA_CMD_VERB_SET_CONFIGURATION_DEFAULT4 0x71f
+
+#define HDA_CMD_GET_CONFIGURATION_DEFAULT(cad, nid) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_GET_CONFIGURATION_DEFAULT, 0x0))
+#define HDA_CMD_SET_CONFIGURATION_DEFAULT1(cad, nid, payload) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_CONFIGURATION_DEFAULT1, (payload)))
+#define HDA_CMD_SET_CONFIGURATION_DEFAULT2(cad, nid, payload) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_CONFIGURATION_DEFAULT2, (payload)))
+#define HDA_CMD_SET_CONFIGURATION_DEFAULT3(cad, nid, payload) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_CONFIGURATION_DEFAULT3, (payload)))
+#define HDA_CMD_SET_CONFIGURATION_DEFAULT4(cad, nid, payload) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_CONFIGURATION_DEFAULT4, (payload)))
+
+/* Stripe Control */
+#define HDA_CMD_VERB_GET_STRIPE_CONTROL 0xf24
+#define HDA_CMD_VERB_SET_STRIPE_CONTROL 0x724
+
+#define HDA_CMD_GET_STRIPE_CONTROL(cad, nid) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_GET_STRIPE_CONTROL, 0x0))
+#define HDA_CMD_SET_STRIPE_CONTROL(cad, nid, payload) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_STRIPE_CONTROL, (payload)))
+
+/* Channel Count Control */
+#define HDA_CMD_VERB_GET_CONV_CHAN_COUNT 0xf2d
+#define HDA_CMD_VERB_SET_CONV_CHAN_COUNT 0x72d
+
+#define HDA_CMD_GET_CONV_CHAN_COUNT(cad, nid) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_GET_CONV_CHAN_COUNT, 0x0))
+#define HDA_CMD_SET_CONV_CHAN_COUNT(cad, nid, payload) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_CONV_CHAN_COUNT, (payload)))
+
+#define HDA_CMD_VERB_GET_HDMI_DIP_SIZE 0xf2e
+
+#define HDA_CMD_GET_HDMI_DIP_SIZE(cad, nid, arg) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_GET_HDMI_DIP_SIZE, (arg)))
+
+#define HDA_CMD_VERB_GET_HDMI_ELDD 0xf2f
+
+#define HDA_CMD_GET_HDMI_ELDD(cad, nid, off) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_GET_HDMI_ELDD, (off)))
+
+#define HDA_CMD_VERB_GET_HDMI_DIP_INDEX 0xf30
+#define HDA_CMD_VERB_SET_HDMI_DIP_INDEX 0x730
+
+#define HDA_CMD_GET_HDMI_DIP_INDEX(cad, nid) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_GET_HDMI_DIP_INDEX, 0x0))
+#define HDA_CMD_SET_HDMI_DIP_INDEX(cad, nid, payload) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_HDMI_DIP_INDEX, (payload)))
+
+#define HDA_CMD_VERB_GET_HDMI_DIP_DATA 0xf31
+#define HDA_CMD_VERB_SET_HDMI_DIP_DATA 0x731
+
+#define HDA_CMD_GET_HDMI_DIP_DATA(cad, nid) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_GET_HDMI_DIP_DATA, 0x0))
+#define HDA_CMD_SET_HDMI_DIP_DATA(cad, nid, payload) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_HDMI_DIP_DATA, (payload)))
+
+#define HDA_CMD_VERB_GET_HDMI_DIP_XMIT 0xf32
+#define HDA_CMD_VERB_SET_HDMI_DIP_XMIT 0x732
+
+#define HDA_CMD_GET_HDMI_DIP_XMIT(cad, nid) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_GET_HDMI_DIP_XMIT, 0x0))
+#define HDA_CMD_SET_HDMI_DIP_XMIT(cad, nid, payload) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_HDMI_DIP_XMIT, (payload)))
+
+#define HDA_CMD_VERB_GET_HDMI_CP_CTRL 0xf33
+#define HDA_CMD_VERB_SET_HDMI_CP_CTRL 0x733
+
+#define HDA_CMD_VERB_GET_HDMI_CHAN_SLOT 0xf34
+#define HDA_CMD_VERB_SET_HDMI_CHAN_SLOT 0x734
+
+#define HDA_CMD_GET_HDMI_CHAN_SLOT(cad, nid) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_GET_HDMI_CHAN_SLOT, 0x0))
+#define HDA_CMD_SET_HDMI_CHAN_SLOT(cad, nid, payload) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_SET_HDMI_CHAN_SLOT, (payload)))
+
+#define HDA_HDMI_CODING_TYPE_REF_STREAM_HEADER 0
+#define HDA_HDMI_CODING_TYPE_LPCM 1
+#define HDA_HDMI_CODING_TYPE_AC3 2
+#define HDA_HDMI_CODING_TYPE_MPEG1 3
+#define HDA_HDMI_CODING_TYPE_MP3 4
+#define HDA_HDMI_CODING_TYPE_MPEG2 5
+#define HDA_HDMI_CODING_TYPE_AACLC 6
+#define HDA_HDMI_CODING_TYPE_DTS 7
+#define HDA_HDMI_CODING_TYPE_ATRAC 8
+#define HDA_HDMI_CODING_TYPE_SACD 9
+#define HDA_HDMI_CODING_TYPE_EAC3 10
+#define HDA_HDMI_CODING_TYPE_DTS_HD 11
+#define HDA_HDMI_CODING_TYPE_MLP 12
+#define HDA_HDMI_CODING_TYPE_DST 13
+#define HDA_HDMI_CODING_TYPE_WMAPRO 14
+#define HDA_HDMI_CODING_TYPE_REF_CTX 15
+
+/* Function Reset */
+#define HDA_CMD_VERB_FUNCTION_RESET 0x7ff
+
+#define HDA_CMD_FUNCTION_RESET(cad, nid) \
+	(HDA_CMD_12BIT((cad), (nid), \
+	HDA_CMD_VERB_FUNCTION_RESET, 0x0))
+
+
+/****************************************************************************
+ * HDA Device Parameters
+ ****************************************************************************/
+
+/* Vendor ID */
+#define HDA_PARAM_VENDOR_ID 0x00
+
+#define HDA_PARAM_VENDOR_ID_VENDOR_ID_MASK 0xffff0000
+#define HDA_PARAM_VENDOR_ID_VENDOR_ID_SHIFT 16
+#define HDA_PARAM_VENDOR_ID_DEVICE_ID_MASK 0x0000ffff
+#define HDA_PARAM_VENDOR_ID_DEVICE_ID_SHIFT 0
+
+#define HDA_PARAM_VENDOR_ID_VENDOR_ID(param) \
+	(((param) & HDA_PARAM_VENDOR_ID_VENDOR_ID_MASK) >> \
+	HDA_PARAM_VENDOR_ID_VENDOR_ID_SHIFT)
+#define HDA_PARAM_VENDOR_ID_DEVICE_ID(param) \
+	(((param) & HDA_PARAM_VENDOR_ID_DEVICE_ID_MASK) >> \
+	HDA_PARAM_VENDOR_ID_DEVICE_ID_SHIFT)
+
+/* Revision ID */
+#define HDA_PARAM_REVISION_ID 0x02
+
+#define HDA_PARAM_REVISION_ID_MAJREV_MASK 0x00f00000
+#define HDA_PARAM_REVISION_ID_MAJREV_SHIFT 20
+#define HDA_PARAM_REVISION_ID_MINREV_MASK 0x000f0000
+#define HDA_PARAM_REVISION_ID_MINREV_SHIFT 16
+#define HDA_PARAM_REVISION_ID_REVISION_ID_MASK 0x0000ff00
+#define HDA_PARAM_REVISION_ID_REVISION_ID_SHIFT 8
+#define HDA_PARAM_REVISION_ID_STEPPING_ID_MASK 0x000000ff
+#define HDA_PARAM_REVISION_ID_STEPPING_ID_SHIFT 0
+
+#define HDA_PARAM_REVISION_ID_MAJREV(param) \
+	(((param) & HDA_PARAM_REVISION_ID_MAJREV_MASK) >> \
+	HDA_PARAM_REVISION_ID_MAJREV_SHIFT)
+#define HDA_PARAM_REVISION_ID_MINREV(param) \
+	(((param) & HDA_PARAM_REVISION_ID_MINREV_MASK) >> \
+	HDA_PARAM_REVISION_ID_MINREV_SHIFT)
+#define HDA_PARAM_REVISION_ID_REVISION_ID(param) \
+	(((param) & HDA_PARAM_REVISION_ID_REVISION_ID_MASK) >> \
+	HDA_PARAM_REVISION_ID_REVISION_ID_SHIFT)
+#define HDA_PARAM_REVISION_ID_STEPPING_ID(param) \
+	(((param) & HDA_PARAM_REVISION_ID_STEPPING_ID_MASK) >> \
+	HDA_PARAM_REVISION_ID_STEPPING_ID_SHIFT)
+
+/* Subordinate Node Count */
+#define HDA_PARAM_SUB_NODE_COUNT 0x04
+
+#define HDA_PARAM_SUB_NODE_COUNT_START_MASK 0x00ff0000
+#define HDA_PARAM_SUB_NODE_COUNT_START_SHIFT 16
+#define HDA_PARAM_SUB_NODE_COUNT_TOTAL_MASK 0x000000ff
+#define HDA_PARAM_SUB_NODE_COUNT_TOTAL_SHIFT 0
+
+#define HDA_PARAM_SUB_NODE_COUNT_START(param) \
+	(((param) & HDA_PARAM_SUB_NODE_COUNT_START_MASK) >> \
+	HDA_PARAM_SUB_NODE_COUNT_START_SHIFT)
+#define HDA_PARAM_SUB_NODE_COUNT_TOTAL(param) \
+	(((param) & HDA_PARAM_SUB_NODE_COUNT_TOTAL_MASK) >> \
+	HDA_PARAM_SUB_NODE_COUNT_TOTAL_SHIFT)
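+
+/*
+ * Worked example: the duplex codec reports 0x00010001 at its root node
+ * (start nid 1, 1 subordinate node) and 0x00020004 at its function group
+ * (start nid 2, 4 subordinate nodes); see HDA_CODEC_ROOT_DESC and
+ * HDA_CODEC_FG_DUPLEX_DESC in hda_codec.c.
+ */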
HDA_PARAM_SUB_NODE_COUNT_TOTAL(param) \ + (((param) & HDA_PARAM_SUB_NODE_COUNT_TOTAL_MASK) >> \ + HDA_PARAM_SUB_NODE_COUNT_TOTAL_SHIFT) + +/* Function Group Type */ +#define HDA_PARAM_FCT_GRP_TYPE 0x05 + +#define HDA_PARAM_FCT_GRP_TYPE_UNSOL_MASK 0x00000100 +#define HDA_PARAM_FCT_GRP_TYPE_UNSOL_SHIFT 8 +#define HDA_PARAM_FCT_GRP_TYPE_NODE_TYPE_MASK 0x000000ff +#define HDA_PARAM_FCT_GRP_TYPE_NODE_TYPE_SHIFT 0 + +#define HDA_PARAM_FCT_GRP_TYPE_UNSOL(param) \ + (((param) & HDA_PARAM_FCT_GRP_TYPE_UNSOL_MASK) >> \ + HDA_PARAM_FCT_GROUP_TYPE_UNSOL_SHIFT) +#define HDA_PARAM_FCT_GRP_TYPE_NODE_TYPE(param) \ + (((param) & HDA_PARAM_FCT_GRP_TYPE_NODE_TYPE_MASK) >> \ + HDA_PARAM_FCT_GRP_TYPE_NODE_TYPE_SHIFT) + +#define HDA_PARAM_FCT_GRP_TYPE_NODE_TYPE_AUDIO 0x01 +#define HDA_PARAM_FCT_GRP_TYPE_NODE_TYPE_MODEM 0x02 + +/* Audio Function Group Capabilities */ +#define HDA_PARAM_AUDIO_FCT_GRP_CAP 0x08 + +#define HDA_PARAM_AUDIO_FCT_GRP_CAP_BEEP_GEN_MASK 0x00010000 +#define HDA_PARAM_AUDIO_FCT_GRP_CAP_BEEP_GEN_SHIFT 16 +#define HDA_PARAM_AUDIO_FCT_GRP_CAP_INPUT_DELAY_MASK 0x00000f00 +#define HDA_PARAM_AUDIO_FCT_GRP_CAP_INPUT_DELAY_SHIFT 8 +#define HDA_PARAM_AUDIO_FCT_GRP_CAP_OUTPUT_DELAY_MASK 0x0000000f +#define HDA_PARAM_AUDIO_FCT_GRP_CAP_OUTPUT_DELAY_SHIFT 0 + +#define HDA_PARAM_AUDIO_FCT_GRP_CAP_BEEP_GEN(param) \ + (((param) & HDA_PARAM_AUDIO_FCT_GRP_CAP_BEEP_GEN_MASK) >> \ + HDA_PARAM_AUDIO_FCT_GRP_CAP_BEEP_GEN_SHIFT) +#define HDA_PARAM_AUDIO_FCT_GRP_CAP_INPUT_DELAY(param) \ + (((param) & HDA_PARAM_AUDIO_FCT_GRP_CAP_INPUT_DELAY_MASK) >> \ + HDA_PARAM_AUDIO_FCT_GRP_CAP_INPUT_DELAY_SHIFT) +#define HDA_PARAM_AUDIO_FCT_GRP_CAP_OUTPUT_DELAY(param) \ + (((param) & HDA_PARAM_AUDIO_FCT_GRP_CAP_OUTPUT_DELAY_MASK) >> \ + HDA_PARAM_AUDIO_FCT_GRP_CAP_OUTPUT_DELAY_SHIFT) + +/* Audio Widget Capabilities */ +#define HDA_PARAM_AUDIO_WIDGET_CAP 0x09 + +#define HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_MASK 0x00f00000 +#define HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_SHIFT 20 +#define HDA_PARAM_AUDIO_WIDGET_CAP_DELAY_MASK 0x000f0000 +#define HDA_PARAM_AUDIO_WIDGET_CAP_DELAY_SHIFT 16 +#define HDA_PARAM_AUDIO_WIDGET_CAP_CC_EXT_MASK 0x0000e000 +#define HDA_PARAM_AUDIO_WIDGET_CAP_CC_EXT_SHIFT 13 +#define HDA_PARAM_AUDIO_WIDGET_CAP_CP_MASK 0x00001000 +#define HDA_PARAM_AUDIO_WIDGET_CAP_CP_SHIFT 12 +#define HDA_PARAM_AUDIO_WIDGET_CAP_LR_SWAP_MASK 0x00000800 +#define HDA_PARAM_AUDIO_WIDGET_CAP_LR_SWAP_SHIFT 11 +#define HDA_PARAM_AUDIO_WIDGET_CAP_POWER_CTRL_MASK 0x00000400 +#define HDA_PARAM_AUDIO_WIDGET_CAP_POWER_CTRL_SHIFT 10 +#define HDA_PARAM_AUDIO_WIDGET_CAP_DIGITAL_MASK 0x00000200 +#define HDA_PARAM_AUDIO_WIDGET_CAP_DIGITAL_SHIFT 9 +#define HDA_PARAM_AUDIO_WIDGET_CAP_CONN_LIST_MASK 0x00000100 +#define HDA_PARAM_AUDIO_WIDGET_CAP_CONN_LIST_SHIFT 8 +#define HDA_PARAM_AUDIO_WIDGET_CAP_UNSOL_CAP_MASK 0x00000080 +#define HDA_PARAM_AUDIO_WIDGET_CAP_UNSOL_CAP_SHIFT 7 +#define HDA_PARAM_AUDIO_WIDGET_CAP_PROC_WIDGET_MASK 0x00000040 +#define HDA_PARAM_AUDIO_WIDGET_CAP_PROC_WIDGET_SHIFT 6 +#define HDA_PARAM_AUDIO_WIDGET_CAP_STRIPE_MASK 0x00000020 +#define HDA_PARAM_AUDIO_WIDGET_CAP_STRIPE_SHIFT 5 +#define HDA_PARAM_AUDIO_WIDGET_CAP_FORMAT_OVR_MASK 0x00000010 +#define HDA_PARAM_AUDIO_WIDGET_CAP_FORMAT_OVR_SHIFT 4 +#define HDA_PARAM_AUDIO_WIDGET_CAP_AMP_OVR_MASK 0x00000008 +#define HDA_PARAM_AUDIO_WIDGET_CAP_AMP_OVR_SHIFT 3 +#define HDA_PARAM_AUDIO_WIDGET_CAP_OUT_AMP_MASK 0x00000004 +#define HDA_PARAM_AUDIO_WIDGET_CAP_OUT_AMP_SHIFT 2 +#define HDA_PARAM_AUDIO_WIDGET_CAP_IN_AMP_MASK 0x00000002 +#define HDA_PARAM_AUDIO_WIDGET_CAP_IN_AMP_SHIFT 1 +#define 
HDA_PARAM_AUDIO_WIDGET_CAP_STEREO_MASK 0x00000001 +#define HDA_PARAM_AUDIO_WIDGET_CAP_STEREO_SHIFT 0 + +#define HDA_PARAM_AUDIO_WIDGET_CAP_TYPE(param) \ + (((param) & HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_MASK) >> \ + HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_SHIFT) +#define HDA_PARAM_AUDIO_WIDGET_CAP_DELAY(param) \ + (((param) & HDA_PARAM_AUDIO_WIDGET_CAP_DELAY_MASK) >> \ + HDA_PARAM_AUDIO_WIDGET_CAP_DELAY_SHIFT) +#define HDA_PARAM_AUDIO_WIDGET_CAP_CC(param) \ + ((((param) & HDA_PARAM_AUDIO_WIDGET_CAP_CC_EXT_MASK) >> \ + (HDA_PARAM_AUDIO_WIDGET_CAP_CC_EXT_SHIFT - 1)) | \ + (((param) & HDA_PARAM_AUDIO_WIDGET_CAP_STEREO_MASK) >> \ + HDA_PARAM_AUDIO_WIDGET_CAP_STEREO_SHIFT)) +#define HDA_PARAM_AUDIO_WIDGET_CAP_CP(param) \ + (((param) & HDA_PARAM_AUDIO_WIDGET_CAP_CP_MASK) >> \ + HDA_PARAM_AUDIO_WIDGET_CAP_CP_SHIFT) +#define HDA_PARAM_AUDIO_WIDGET_CAP_LR_SWAP(param) \ + (((param) & HDA_PARAM_AUDIO_WIDGET_CAP_LR_SWAP_MASK) >> \ + HDA_PARAM_AUDIO_WIDGET_CAP_LR_SWAP_SHIFT) +#define HDA_PARAM_AUDIO_WIDGET_CAP_POWER_CTRL(param) \ + (((param) & HDA_PARAM_AUDIO_WIDGET_CAP_POWER_CTRL_MASK) >> \ + HDA_PARAM_AUDIO_WIDGET_CAP_POWER_CTRL_SHIFT) +#define HDA_PARAM_AUDIO_WIDGET_CAP_DIGITAL(param) \ + (((param) & HDA_PARAM_AUDIO_WIDGET_CAP_DIGITAL_MASK) >> \ + HDA_PARAM_AUDIO_WIDGET_CAP_DIGITAL_SHIFT) +#define HDA_PARAM_AUDIO_WIDGET_CAP_CONN_LIST(param) \ + (((param) & HDA_PARAM_AUDIO_WIDGET_CAP_CONN_LIST_MASK) >> \ + HDA_PARAM_AUDIO_WIDGET_CAP_CONN_LIST_SHIFT) +#define HDA_PARAM_AUDIO_WIDGET_CAP_UNSOL_CAP(param) \ + (((param) & HDA_PARAM_AUDIO_WIDGET_CAP_UNSOL_CAP_MASK) >> \ + HDA_PARAM_AUDIO_WIDGET_CAP_UNSOL_CAP_SHIFT) +#define HDA_PARAM_AUDIO_WIDGET_CAP_PROC_WIDGET(param) \ + (((param) & HDA_PARAM_AUDIO_WIDGET_CAP_PROC_WIDGET_MASK) >> \ + HDA_PARAM_AUDIO_WIDGET_CAP_PROC_WIDGET_SHIFT) +#define HDA_PARAM_AUDIO_WIDGET_CAP_STRIPE(param) \ + (((param) & HDA_PARAM_AUDIO_WIDGET_CAP_STRIPE_MASK) >> \ + HDA_PARAM_AUDIO_WIDGET_CAP_STRIPE_SHIFT) +#define HDA_PARAM_AUDIO_WIDGET_CAP_FORMAT_OVR(param) \ + (((param) & HDA_PARAM_AUDIO_WIDGET_CAP_FORMAT_OVR_MASK) >> \ + HDA_PARAM_AUDIO_WIDGET_CAP_FORMAT_OVR_SHIFT) +#define HDA_PARAM_AUDIO_WIDGET_CAP_AMP_OVR(param) \ + (((param) & HDA_PARAM_AUDIO_WIDGET_CAP_AMP_OVR_MASK) >> \ + HDA_PARAM_AUDIO_WIDGET_CAP_AMP_OVR_SHIFT) +#define HDA_PARAM_AUDIO_WIDGET_CAP_OUT_AMP(param) \ + (((param) & HDA_PARAM_AUDIO_WIDGET_CAP_OUT_AMP_MASK) >> \ + HDA_PARAM_AUDIO_WIDGET_CAP_OUT_AMP_SHIFT) +#define HDA_PARAM_AUDIO_WIDGET_CAP_IN_AMP(param) \ + (((param) & HDA_PARAM_AUDIO_WIDGET_CAP_IN_AMP_MASK) >> \ + HDA_PARAM_AUDIO_WIDGET_CAP_IN_AMP_SHIFT) +#define HDA_PARAM_AUDIO_WIDGET_CAP_STEREO(param) \ + (((param) & HDA_PARAM_AUDIO_WIDGET_CAP_STEREO_MASK) >> \ + HDA_PARAM_AUDIO_WIDGET_CAP_STEREO_SHIFT) + +#define HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_OUTPUT 0x0 +#define HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_INPUT 0x1 +#define HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_MIXER 0x2 +#define HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_SELECTOR 0x3 +#define HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX 0x4 +#define HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_POWER_WIDGET 0x5 +#define HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_VOLUME_WIDGET 0x6 +#define HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_BEEP_WIDGET 0x7 +#define HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_VENDOR_WIDGET 0xf + +/* Supported PCM Size, Rates */ + +#define HDA_PARAM_SUPP_PCM_SIZE_RATE 0x0a + +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_32BIT_MASK 0x00100000 +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_32BIT_SHIFT 20 +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_24BIT_MASK 0x00080000 +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_24BIT_SHIFT 
19 +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_20BIT_MASK 0x00040000 +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_20BIT_SHIFT 18 +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_16BIT_MASK 0x00020000 +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_16BIT_SHIFT 17 +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_8BIT_MASK 0x00010000 +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_8BIT_SHIFT 16 +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_8KHZ_MASK 0x00000001 +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_8KHZ_SHIFT 0 +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_11KHZ_MASK 0x00000002 +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_11KHZ_SHIFT 1 +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_16KHZ_MASK 0x00000004 +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_16KHZ_SHIFT 2 +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_22KHZ_MASK 0x00000008 +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_22KHZ_SHIFT 3 +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_32KHZ_MASK 0x00000010 +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_32KHZ_SHIFT 4 +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_44KHZ_MASK 0x00000020 +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_44KHZ_SHIFT 5 +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_48KHZ_MASK 0x00000040 +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_48KHZ_SHIFT 6 +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_88KHZ_MASK 0x00000080 +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_88KHZ_SHIFT 7 +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_96KHZ_MASK 0x00000100 +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_96KHZ_SHIFT 8 +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_176KHZ_MASK 0x00000200 +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_176KHZ_SHIFT 9 +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_192KHZ_MASK 0x00000400 +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_192KHZ_SHIFT 10 +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_384KHZ_MASK 0x00000800 +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_384KHZ_SHIFT 11 + +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_32BIT(param) \ + (((param) & HDA_PARAM_SUPP_PCM_SIZE_RATE_32BIT_MASK) >> \ + HDA_PARAM_SUPP_PCM_SIZE_RATE_32BIT_SHIFT) +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_24BIT(param) \ + (((param) & HDA_PARAM_SUPP_PCM_SIZE_RATE_24BIT_MASK) >> \ + HDA_PARAM_SUPP_PCM_SIZE_RATE_24BIT_SHIFT) +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_20BIT(param) \ + (((param) & HDA_PARAM_SUPP_PCM_SIZE_RATE_20BIT_MASK) >> \ + HDA_PARAM_SUPP_PCM_SIZE_RATE_20BIT_SHIFT) +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_16BIT(param) \ + (((param) & HDA_PARAM_SUPP_PCM_SIZE_RATE_16BIT_MASK) >> \ + HDA_PARAM_SUPP_PCM_SIZE_RATE_16BIT_SHIFT) +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_8BIT(param) \ + (((param) & HDA_PARAM_SUPP_PCM_SIZE_RATE_8BIT_MASK) >> \ + HDA_PARAM_SUPP_PCM_SIZE_RATE_8BIT_SHIFT) +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_8KHZ(param) \ + (((param) & HDA_PARAM_SUPP_PCM_SIZE_RATE_8KHZ_MASK) >> \ + HDA_PARAM_SUPP_PCM_SIZE_RATE_8KHZ_SHIFT) +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_11KHZ(param) \ + (((param) & HDA_PARAM_SUPP_PCM_SIZE_RATE_11KHZ_MASK) >> \ + HDA_PARAM_SUPP_PCM_SIZE_RATE_11KHZ_SHIFT) +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_16KHZ(param) \ + (((param) & HDA_PARAM_SUPP_PCM_SIZE_RATE_16KHZ_MASK) >> \ + HDA_PARAM_SUPP_PCM_SIZE_RATE_16KHZ_SHIFT) +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_22KHZ(param) \ + (((param) & HDA_PARAM_SUPP_PCM_SIZE_RATE_22KHZ_MASK) >> \ + HDA_PARAM_SUPP_PCM_SIZE_RATE_22KHZ_SHIFT) +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_32KHZ(param) \ + (((param) & HDA_PARAM_SUPP_PCM_SIZE_RATE_32KHZ_MASK) >> \ + HDA_PARAM_SUPP_PCM_SIZE_RATE_32KHZ_SHIFT) +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_44KHZ(param) \ + (((param) & HDA_PARAM_SUPP_PCM_SIZE_RATE_44KHZ_MASK) >> \ + HDA_PARAM_SUPP_PCM_SIZE_RATE_44KHZ_SHIFT) +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_48KHZ(param) \ + (((param) & HDA_PARAM_SUPP_PCM_SIZE_RATE_48KHZ_MASK) >> \ + 
HDA_PARAM_SUPP_PCM_SIZE_RATE_48KHZ_SHIFT) +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_88KHZ(param) \ + (((param) & HDA_PARAM_SUPP_PCM_SIZE_RATE_88KHZ_MASK) >> \ + HDA_PARAM_SUPP_PCM_SIZE_RATE_88KHZ_SHIFT) +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_96KHZ(param) \ + (((param) & HDA_PARAM_SUPP_PCM_SIZE_RATE_96KHZ_MASK) >> \ + HDA_PARAM_SUPP_PCM_SIZE_RATE_96KHZ_SHIFT) +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_176KHZ(param) \ + (((param) & HDA_PARAM_SUPP_PCM_SIZE_RATE_176KHZ_MASK) >> \ + HDA_PARAM_SUPP_PCM_SIZE_RATE_176KHZ_SHIFT) +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_192KHZ(param) \ + (((param) & HDA_PARAM_SUPP_PCM_SIZE_RATE_192KHZ_MASK) >> \ + HDA_PARAM_SUPP_PCM_SIZE_RATE_192KHZ_SHIFT) +#define HDA_PARAM_SUPP_PCM_SIZE_RATE_384KHZ(param) \ + (((param) & HDA_PARAM_SUPP_PCM_SIZE_RATE_384KHZ_MASK) >> \ + HDA_PARAM_SUPP_PCM_SIZE_RATE_384KHZ_SHIFT) + +/* Supported Stream Formats */ +#define HDA_PARAM_SUPP_STREAM_FORMATS 0x0b + +#define HDA_PARAM_SUPP_STREAM_FORMATS_AC3_MASK 0x00000004 +#define HDA_PARAM_SUPP_STREAM_FORMATS_AC3_SHIFT 2 +#define HDA_PARAM_SUPP_STREAM_FORMATS_FLOAT32_MASK 0x00000002 +#define HDA_PARAM_SUPP_STREAM_FORMATS_FLOAT32_SHIFT 1 +#define HDA_PARAM_SUPP_STREAM_FORMATS_PCM_MASK 0x00000001 +#define HDA_PARAM_SUPP_STREAM_FORMATS_PCM_SHIFT 0 + +#define HDA_PARAM_SUPP_STREAM_FORMATS_AC3(param) \ + (((param) & HDA_PARAM_SUPP_STREAM_FORMATS_AC3_MASK) >> \ + HDA_PARAM_SUPP_STREAM_FORMATS_AC3_SHIFT) +#define HDA_PARAM_SUPP_STREAM_FORMATS_FLOAT32(param) \ + (((param) & HDA_PARAM_SUPP_STREAM_FORMATS_FLOAT32_MASK) >> \ + HDA_PARAM_SUPP_STREAM_FORMATS_FLOAT32_SHIFT) +#define HDA_PARAM_SUPP_STREAM_FORMATS_PCM(param) \ + (((param) & HDA_PARAM_SUPP_STREAM_FORMATS_PCM_MASK) >> \ + HDA_PARAM_SUPP_STREAM_FORMATS_PCM_SHIFT) + +/* Pin Capabilities */ +#define HDA_PARAM_PIN_CAP 0x0c + +#define HDA_PARAM_PIN_CAP_HBR_MASK 0x08000000 +#define HDA_PARAM_PIN_CAP_HBR_SHIFT 27 +#define HDA_PARAM_PIN_CAP_DP_MASK 0x01000000 +#define HDA_PARAM_PIN_CAP_DP_SHIFT 24 +#define HDA_PARAM_PIN_CAP_EAPD_CAP_MASK 0x00010000 +#define HDA_PARAM_PIN_CAP_EAPD_CAP_SHIFT 16 +#define HDA_PARAM_PIN_CAP_VREF_CTRL_MASK 0x0000ff00 +#define HDA_PARAM_PIN_CAP_VREF_CTRL_SHIFT 8 +#define HDA_PARAM_PIN_CAP_VREF_CTRL_100_MASK 0x00002000 +#define HDA_PARAM_PIN_CAP_VREF_CTRL_100_SHIFT 13 +#define HDA_PARAM_PIN_CAP_VREF_CTRL_80_MASK 0x00001000 +#define HDA_PARAM_PIN_CAP_VREF_CTRL_80_SHIFT 12 +#define HDA_PARAM_PIN_CAP_VREF_CTRL_GROUND_MASK 0x00000400 +#define HDA_PARAM_PIN_CAP_VREF_CTRL_GROUND_SHIFT 10 +#define HDA_PARAM_PIN_CAP_VREF_CTRL_50_MASK 0x00000200 +#define HDA_PARAM_PIN_CAP_VREF_CTRL_50_SHIFT 9 +#define HDA_PARAM_PIN_CAP_VREF_CTRL_HIZ_MASK 0x00000100 +#define HDA_PARAM_PIN_CAP_VREF_CTRL_HIZ_SHIFT 8 +#define HDA_PARAM_PIN_CAP_HDMI_MASK 0x00000080 +#define HDA_PARAM_PIN_CAP_HDMI_SHIFT 7 +#define HDA_PARAM_PIN_CAP_BALANCED_IO_PINS_MASK 0x00000040 +#define HDA_PARAM_PIN_CAP_BALANCED_IO_PINS_SHIFT 6 +#define HDA_PARAM_PIN_CAP_INPUT_CAP_MASK 0x00000020 +#define HDA_PARAM_PIN_CAP_INPUT_CAP_SHIFT 5 +#define HDA_PARAM_PIN_CAP_OUTPUT_CAP_MASK 0x00000010 +#define HDA_PARAM_PIN_CAP_OUTPUT_CAP_SHIFT 4 +#define HDA_PARAM_PIN_CAP_HEADPHONE_CAP_MASK 0x00000008 +#define HDA_PARAM_PIN_CAP_HEADPHONE_CAP_SHIFT 3 +#define HDA_PARAM_PIN_CAP_PRESENCE_DETECT_CAP_MASK 0x00000004 +#define HDA_PARAM_PIN_CAP_PRESENCE_DETECT_CAP_SHIFT 2 +#define HDA_PARAM_PIN_CAP_TRIGGER_REQD_MASK 0x00000002 +#define HDA_PARAM_PIN_CAP_TRIGGER_REQD_SHIFT 1 +#define HDA_PARAM_PIN_CAP_IMP_SENSE_CAP_MASK 0x00000001 +#define HDA_PARAM_PIN_CAP_IMP_SENSE_CAP_SHIFT 0 + +#define 
HDA_PARAM_PIN_CAP_HBR(param) \ + (((param) & HDA_PARAM_PIN_CAP_HBR_MASK) >> \ + HDA_PARAM_PIN_CAP_HBR_SHIFT) +#define HDA_PARAM_PIN_CAP_DP(param) \ + (((param) & HDA_PARAM_PIN_CAP_DP_MASK) >> \ + HDA_PARAM_PIN_CAP_DP_SHIFT) +#define HDA_PARAM_PIN_CAP_EAPD_CAP(param) \ + (((param) & HDA_PARAM_PIN_CAP_EAPD_CAP_MASK) >> \ + HDA_PARAM_PIN_CAP_EAPD_CAP_SHIFT) +#define HDA_PARAM_PIN_CAP_VREF_CTRL(param) \ + (((param) & HDA_PARAM_PIN_CAP_VREF_CTRL_MASK) >> \ + HDA_PARAM_PIN_CAP_VREF_CTRL_SHIFT) +#define HDA_PARAM_PIN_CAP_VREF_CTRL_100(param) \ + (((param) & HDA_PARAM_PIN_CAP_VREF_CTRL_100_MASK) >> \ + HDA_PARAM_PIN_CAP_VREF_CTRL_100_SHIFT) +#define HDA_PARAM_PIN_CAP_VREF_CTRL_80(param) \ + (((param) & HDA_PARAM_PIN_CAP_VREF_CTRL_80_MASK) >> \ + HDA_PARAM_PIN_CAP_VREF_CTRL_80_SHIFT) +#define HDA_PARAM_PIN_CAP_VREF_CTRL_GROUND(param) \ + (((param) & HDA_PARAM_PIN_CAP_VREF_CTRL_GROUND_MASK) >> \ + HDA_PARAM_PIN_CAP_VREF_CTRL_GROUND_SHIFT) +#define HDA_PARAM_PIN_CAP_VREF_CTRL_50(param) \ + (((param) & HDA_PARAM_PIN_CAP_VREF_CTRL_50_MASK) >> \ + HDA_PARAM_PIN_CAP_VREF_CTRL_50_SHIFT) +#define HDA_PARAM_PIN_CAP_VREF_CTRL_HIZ(param) \ + (((param) & HDA_PARAM_PIN_CAP_VREF_CTRL_HIZ_MASK) >> \ + HDA_PARAM_PIN_CAP_VREF_CTRL_HIZ_SHIFT) +#define HDA_PARAM_PIN_CAP_HDMI(param) \ + (((param) & HDA_PARAM_PIN_CAP_HDMI_MASK) >> \ + HDA_PARAM_PIN_CAP_HDMI_SHIFT) +#define HDA_PARAM_PIN_CAP_BALANCED_IO_PINS(param) \ + (((param) & HDA_PARAM_PIN_CAP_BALANCED_IO_PINS_MASK) >> \ + HDA_PARAM_PIN_CAP_BALANCED_IO_PINS_SHIFT) +#define HDA_PARAM_PIN_CAP_INPUT_CAP(param) \ + (((param) & HDA_PARAM_PIN_CAP_INPUT_CAP_MASK) >> \ + HDA_PARAM_PIN_CAP_INPUT_CAP_SHIFT) +#define HDA_PARAM_PIN_CAP_OUTPUT_CAP(param) \ + (((param) & HDA_PARAM_PIN_CAP_OUTPUT_CAP_MASK) >> \ + HDA_PARAM_PIN_CAP_OUTPUT_CAP_SHIFT) +#define HDA_PARAM_PIN_CAP_HEADPHONE_CAP(param) \ + (((param) & HDA_PARAM_PIN_CAP_HEADPHONE_CAP_MASK) >> \ + HDA_PARAM_PIN_CAP_HEADPHONE_CAP_SHIFT) +#define HDA_PARAM_PIN_CAP_PRESENCE_DETECT_CAP(param) \ + (((param) & HDA_PARAM_PIN_CAP_PRESENCE_DETECT_CAP_MASK) >> \ + HDA_PARAM_PIN_CAP_PRESENCE_DETECT_CAP_SHIFT) +#define HDA_PARAM_PIN_CAP_TRIGGER_REQD(param) \ + (((param) & HDA_PARAM_PIN_CAP_TRIGGER_REQD_MASK) >> \ + HDA_PARAM_PIN_CAP_TRIGGER_REQD_SHIFT) +#define HDA_PARAM_PIN_CAP_IMP_SENSE_CAP(param) \ + (((param) & HDA_PARAM_PIN_CAP_IMP_SENSE_CAP_MASK) >> \ + HDA_PARAM_PIN_CAP_IMP_SENSE_CAP_SHIFT) + +/* Input Amplifier Capabilities */ +#define HDA_PARAM_INPUT_AMP_CAP 0x0d + +#define HDA_PARAM_INPUT_AMP_CAP_MUTE_CAP_MASK 0x80000000 +#define HDA_PARAM_INPUT_AMP_CAP_MUTE_CAP_SHIFT 31 +#define HDA_PARAM_INPUT_AMP_CAP_STEPSIZE_MASK 0x007f0000 +#define HDA_PARAM_INPUT_AMP_CAP_STEPSIZE_SHIFT 16 +#define HDA_PARAM_INPUT_AMP_CAP_NUMSTEPS_MASK 0x00007f00 +#define HDA_PARAM_INPUT_AMP_CAP_NUMSTEPS_SHIFT 8 +#define HDA_PARAM_INPUT_AMP_CAP_OFFSET_MASK 0x0000007f +#define HDA_PARAM_INPUT_AMP_CAP_OFFSET_SHIFT 0 + +#define HDA_PARAM_INPUT_AMP_CAP_MUTE_CAP(param) \ + (((param) & HDA_PARAM_INPUT_AMP_CAP_MUTE_CAP_MASK) >> \ + HDA_PARAM_INPUT_AMP_CAP_MUTE_CAP_SHIFT) +#define HDA_PARAM_INPUT_AMP_CAP_STEPSIZE(param) \ + (((param) & HDA_PARAM_INPUT_AMP_CAP_STEPSIZE_MASK) >> \ + HDA_PARAM_INPUT_AMP_CAP_STEPSIZE_SHIFT) +#define HDA_PARAM_INPUT_AMP_CAP_NUMSTEPS(param) \ + (((param) & HDA_PARAM_INPUT_AMP_CAP_NUMSTEPS_MASK) >> \ + HDA_PARAM_INPUT_AMP_CAP_NUMSTEPS_SHIFT) +#define HDA_PARAM_INPUT_AMP_CAP_OFFSET(param) \ + (((param) & HDA_PARAM_INPUT_AMP_CAP_OFFSET_MASK) >> \ + HDA_PARAM_INPUT_AMP_CAP_OFFSET_SHIFT) + +/* Output Amplifier Capabilities */ +#define 
HDA_PARAM_OUTPUT_AMP_CAP 0x12 + +#define HDA_PARAM_OUTPUT_AMP_CAP_MUTE_CAP_MASK 0x80000000 +#define HDA_PARAM_OUTPUT_AMP_CAP_MUTE_CAP_SHIFT 31 +#define HDA_PARAM_OUTPUT_AMP_CAP_STEPSIZE_MASK 0x007f0000 +#define HDA_PARAM_OUTPUT_AMP_CAP_STEPSIZE_SHIFT 16 +#define HDA_PARAM_OUTPUT_AMP_CAP_NUMSTEPS_MASK 0x00007f00 +#define HDA_PARAM_OUTPUT_AMP_CAP_NUMSTEPS_SHIFT 8 +#define HDA_PARAM_OUTPUT_AMP_CAP_OFFSET_MASK 0x0000007f +#define HDA_PARAM_OUTPUT_AMP_CAP_OFFSET_SHIFT 0 + +#define HDA_PARAM_OUTPUT_AMP_CAP_MUTE_CAP(param) \ + (((param) & HDA_PARAM_OUTPUT_AMP_CAP_MUTE_CAP_MASK) >> \ + HDA_PARAM_OUTPUT_AMP_CAP_MUTE_CAP_SHIFT) +#define HDA_PARAM_OUTPUT_AMP_CAP_STEPSIZE(param) \ + (((param) & HDA_PARAM_OUTPUT_AMP_CAP_STEPSIZE_MASK) >> \ + HDA_PARAM_OUTPUT_AMP_CAP_STEPSIZE_SHIFT) +#define HDA_PARAM_OUTPUT_AMP_CAP_NUMSTEPS(param) \ + (((param) & HDA_PARAM_OUTPUT_AMP_CAP_NUMSTEPS_MASK) >> \ + HDA_PARAM_OUTPUT_AMP_CAP_NUMSTEPS_SHIFT) +#define HDA_PARAM_OUTPUT_AMP_CAP_OFFSET(param) \ + (((param) & HDA_PARAM_OUTPUT_AMP_CAP_OFFSET_MASK) >> \ + HDA_PARAM_OUTPUT_AMP_CAP_OFFSET_SHIFT) + +/* Connection List Length */ +#define HDA_PARAM_CONN_LIST_LENGTH 0x0e + +#define HDA_PARAM_CONN_LIST_LENGTH_LONG_FORM_MASK 0x00000080 +#define HDA_PARAM_CONN_LIST_LENGTH_LONG_FORM_SHIFT 7 +#define HDA_PARAM_CONN_LIST_LENGTH_LIST_LENGTH_MASK 0x0000007f +#define HDA_PARAM_CONN_LIST_LENGTH_LIST_LENGTH_SHIFT 0 + +#define HDA_PARAM_CONN_LIST_LENGTH_LONG_FORM(param) \ + (((param) & HDA_PARAM_CONN_LIST_LENGTH_LONG_FORM_MASK) >> \ + HDA_PARAM_CONN_LIST_LENGTH_LONG_FORM_SHIFT) +#define HDA_PARAM_CONN_LIST_LENGTH_LIST_LENGTH(param) \ + (((param) & HDA_PARAM_CONN_LIST_LENGTH_LIST_LENGTH_MASK) >> \ + HDA_PARAM_CONN_LIST_LENGTH_LIST_LENGTH_SHIFT) + +/* Supported Power States */ +#define HDA_PARAM_SUPP_POWER_STATES 0x0f + +#define HDA_PARAM_SUPP_POWER_STATES_D3_MASK 0x00000008 +#define HDA_PARAM_SUPP_POWER_STATES_D3_SHIFT 3 +#define HDA_PARAM_SUPP_POWER_STATES_D2_MASK 0x00000004 +#define HDA_PARAM_SUPP_POWER_STATES_D2_SHIFT 2 +#define HDA_PARAM_SUPP_POWER_STATES_D1_MASK 0x00000002 +#define HDA_PARAM_SUPP_POWER_STATES_D1_SHIFT 1 +#define HDA_PARAM_SUPP_POWER_STATES_D0_MASK 0x00000001 +#define HDA_PARAM_SUPP_POWER_STATES_D0_SHIFT 0 + +#define HDA_PARAM_SUPP_POWER_STATES_D3(param) \ + (((param) & HDA_PARAM_SUPP_POWER_STATES_D3_MASK) >> \ + HDA_PARAM_SUPP_POWER_STATES_D3_SHIFT) +#define HDA_PARAM_SUPP_POWER_STATES_D2(param) \ + (((param) & HDA_PARAM_SUPP_POWER_STATES_D2_MASK) >> \ + HDA_PARAM_SUPP_POWER_STATES_D2_SHIFT) +#define HDA_PARAM_SUPP_POWER_STATES_D1(param) \ + (((param) & HDA_PARAM_SUPP_POWER_STATES_D1_MASK) >> \ + HDA_PARAM_SUPP_POWER_STATES_D1_SHIFT) +#define HDA_PARAM_SUPP_POWER_STATES_D0(param) \ + (((param) & HDA_PARAM_SUPP_POWER_STATES_D0_MASK) >> \ + HDA_PARAM_SUPP_POWER_STATES_D0_SHIFT) + +/* Processing Capabilities */ +#define HDA_PARAM_PROCESSING_CAP 0x10 + +#define HDA_PARAM_PROCESSING_CAP_NUMCOEFF_MASK 0x0000ff00 +#define HDA_PARAM_PROCESSING_CAP_NUMCOEFF_SHIFT 8 +#define HDA_PARAM_PROCESSING_CAP_BENIGN_MASK 0x00000001 +#define HDA_PARAM_PROCESSING_CAP_BENIGN_SHIFT 0 + +#define HDA_PARAM_PROCESSING_CAP_NUMCOEFF(param) \ + (((param) & HDA_PARAM_PROCESSING_CAP_NUMCOEFF_MASK) >> \ + HDA_PARAM_PROCESSING_CAP_NUMCOEFF_SHIFT) +#define HDA_PARAM_PROCESSING_CAP_BENIGN(param) \ + (((param) & HDA_PARAM_PROCESSING_CAP_BENIGN_MASK) >> \ + HDA_PARAM_PROCESSING_CAP_BENIGN_SHIFT) + +/* GPIO Count */ +#define HDA_PARAM_GPIO_COUNT 0x11 + +#define HDA_PARAM_GPIO_COUNT_GPI_WAKE_MASK 0x80000000 +#define HDA_PARAM_GPIO_COUNT_GPI_WAKE_SHIFT 31 
+#define HDA_PARAM_GPIO_COUNT_GPI_UNSOL_MASK 0x40000000 +#define HDA_PARAM_GPIO_COUNT_GPI_UNSOL_SHIFT 30 +#define HDA_PARAM_GPIO_COUNT_NUM_GPI_MASK 0x00ff0000 +#define HDA_PARAM_GPIO_COUNT_NUM_GPI_SHIFT 16 +#define HDA_PARAM_GPIO_COUNT_NUM_GPO_MASK 0x0000ff00 +#define HDA_PARAM_GPIO_COUNT_NUM_GPO_SHIFT 8 +#define HDA_PARAM_GPIO_COUNT_NUM_GPIO_MASK 0x000000ff +#define HDA_PARAM_GPIO_COUNT_NUM_GPIO_SHIFT 0 + +#define HDA_PARAM_GPIO_COUNT_GPI_WAKE(param) \ + (((param) & HDA_PARAM_GPIO_COUNT_GPI_WAKE_MASK) >> \ + HDA_PARAM_GPIO_COUNT_GPI_WAKE_SHIFT) +#define HDA_PARAM_GPIO_COUNT_GPI_UNSOL(param) \ + (((param) & HDA_PARAM_GPIO_COUNT_GPI_UNSOL_MASK) >> \ + HDA_PARAM_GPIO_COUNT_GPI_UNSOL_SHIFT) +#define HDA_PARAM_GPIO_COUNT_NUM_GPI(param) \ + (((param) & HDA_PARAM_GPIO_COUNT_NUM_GPI_MASK) >> \ + HDA_PARAM_GPIO_COUNT_NUM_GPI_SHIFT) +#define HDA_PARAM_GPIO_COUNT_NUM_GPO(param) \ + (((param) & HDA_PARAM_GPIO_COUNT_NUM_GPO_MASK) >> \ + HDA_PARAM_GPIO_COUNT_NUM_GPO_SHIFT) +#define HDA_PARAM_GPIO_COUNT_NUM_GPIO(param) \ + (((param) & HDA_PARAM_GPIO_COUNT_NUM_GPIO_MASK) >> \ + HDA_PARAM_GPIO_COUNT_NUM_GPIO_SHIFT) + +/* Volume Knob Capabilities */ +#define HDA_PARAM_VOLUME_KNOB_CAP 0x13 + +#define HDA_PARAM_VOLUME_KNOB_CAP_DELTA_MASK 0x00000080 +#define HDA_PARAM_VOLUME_KNOB_CAP_DELTA_SHIFT 7 +#define HDA_PARAM_VOLUME_KNOB_CAP_NUM_STEPS_MASK 0x0000007f +#define HDA_PARAM_VOLUME_KNOB_CAP_NUM_STEPS_SHIFT 0 + +#define HDA_PARAM_VOLUME_KNOB_CAP_DELTA(param) \ + (((param) & HDA_PARAM_VOLUME_KNOB_CAP_DELTA_MASK) >> \ + HDA_PARAM_VOLUME_KNOB_CAP_DELTA_SHIFT) +#define HDA_PARAM_VOLUME_KNOB_CAP_NUM_STEPS(param) \ + (((param) & HDA_PARAM_VOLUME_KNOB_CAP_NUM_STEPS_MASK) >> \ + HDA_PARAM_VOLUME_KNOB_CAP_NUM_STEPS_SHIFT) + + +#define HDA_CONFIG_DEFAULTCONF_SEQUENCE_MASK 0x0000000f +#define HDA_CONFIG_DEFAULTCONF_SEQUENCE_SHIFT 0 +#define HDA_CONFIG_DEFAULTCONF_ASSOCIATION_MASK 0x000000f0 +#define HDA_CONFIG_DEFAULTCONF_ASSOCIATION_SHIFT 4 +#define HDA_CONFIG_DEFAULTCONF_MISC_MASK 0x00000f00 +#define HDA_CONFIG_DEFAULTCONF_MISC_SHIFT 8 +#define HDA_CONFIG_DEFAULTCONF_COLOR_MASK 0x0000f000 +#define HDA_CONFIG_DEFAULTCONF_COLOR_SHIFT 12 +#define HDA_CONFIG_DEFAULTCONF_CONNECTION_TYPE_MASK 0x000f0000 +#define HDA_CONFIG_DEFAULTCONF_CONNECTION_TYPE_SHIFT 16 +#define HDA_CONFIG_DEFAULTCONF_DEVICE_MASK 0x00f00000 +#define HDA_CONFIG_DEFAULTCONF_DEVICE_SHIFT 20 +#define HDA_CONFIG_DEFAULTCONF_LOCATION_MASK 0x3f000000 +#define HDA_CONFIG_DEFAULTCONF_LOCATION_SHIFT 24 +#define HDA_CONFIG_DEFAULTCONF_CONNECTIVITY_MASK 0xc0000000 +#define HDA_CONFIG_DEFAULTCONF_CONNECTIVITY_SHIFT 30 + +#define HDA_CONFIG_DEFAULTCONF_SEQUENCE(conf) \ + (((conf) & HDA_CONFIG_DEFAULTCONF_SEQUENCE_MASK) >> \ + HDA_CONFIG_DEFAULTCONF_SEQUENCE_SHIFT) +#define HDA_CONFIG_DEFAULTCONF_ASSOCIATION(conf) \ + (((conf) & HDA_CONFIG_DEFAULTCONF_ASSOCIATION_MASK) >> \ + HDA_CONFIG_DEFAULTCONF_ASSOCIATION_SHIFT) +#define HDA_CONFIG_DEFAULTCONF_MISC(conf) \ + (((conf) & HDA_CONFIG_DEFAULTCONF_MISC_MASK) >> \ + HDA_CONFIG_DEFAULTCONF_MISC_SHIFT) +#define HDA_CONFIG_DEFAULTCONF_COLOR(conf) \ + (((conf) & HDA_CONFIG_DEFAULTCONF_COLOR_MASK) >> \ + HDA_CONFIG_DEFAULTCONF_COLOR_SHIFT) +#define HDA_CONFIG_DEFAULTCONF_CONNECTION_TYPE(conf) \ + (((conf) & HDA_CONFIG_DEFAULTCONF_CONNECTION_TYPE_MASK) >> \ + HDA_CONFIG_DEFAULTCONF_CONNECTION_TYPE_SHIFT) +#define HDA_CONFIG_DEFAULTCONF_DEVICE(conf) \ + (((conf) & HDA_CONFIG_DEFAULTCONF_DEVICE_MASK) >> \ + HDA_CONFIG_DEFAULTCONF_DEVICE_SHIFT) +#define HDA_CONFIG_DEFAULTCONF_LOCATION(conf) \ + (((conf) & 
HDA_CONFIG_DEFAULTCONF_LOCATION_MASK) >> \ + HDA_CONFIG_DEFAULTCONF_LOCATION_SHIFT) +#define HDA_CONFIG_DEFAULTCONF_CONNECTIVITY(conf) \ + (((conf) & HDA_CONFIG_DEFAULTCONF_CONNECTIVITY_MASK) >> \ + HDA_CONFIG_DEFAULTCONF_CONNECTIVITY_SHIFT) + +#define HDA_CONFIG_DEFAULTCONF_CONNECTIVITY_JACK (0<<30) +#define HDA_CONFIG_DEFAULTCONF_CONNECTIVITY_NONE (1<<30) +#define HDA_CONFIG_DEFAULTCONF_CONNECTIVITY_FIXED (2<<30) +#define HDA_CONFIG_DEFAULTCONF_CONNECTIVITY_BOTH (3<<30) + +#define HDA_CONFIG_DEFAULTCONF_DEVICE_LINE_OUT (0<<20) +#define HDA_CONFIG_DEFAULTCONF_DEVICE_SPEAKER (1<<20) +#define HDA_CONFIG_DEFAULTCONF_DEVICE_HP_OUT (2<<20) +#define HDA_CONFIG_DEFAULTCONF_DEVICE_CD (3<<20) +#define HDA_CONFIG_DEFAULTCONF_DEVICE_SPDIF_OUT (4<<20) +#define HDA_CONFIG_DEFAULTCONF_DEVICE_DIGITAL_OTHER_OUT (5<<20) +#define HDA_CONFIG_DEFAULTCONF_DEVICE_MODEM_LINE (6<<20) +#define HDA_CONFIG_DEFAULTCONF_DEVICE_MODEM_HANDSET (7<<20) +#define HDA_CONFIG_DEFAULTCONF_DEVICE_LINE_IN (8<<20) +#define HDA_CONFIG_DEFAULTCONF_DEVICE_AUX (9<<20) +#define HDA_CONFIG_DEFAULTCONF_DEVICE_MIC_IN (10<<20) +#define HDA_CONFIG_DEFAULTCONF_DEVICE_TELEPHONY (11<<20) +#define HDA_CONFIG_DEFAULTCONF_DEVICE_SPDIF_IN (12<<20) +#define HDA_CONFIG_DEFAULTCONF_DEVICE_DIGITAL_OTHER_IN (13<<20) +#define HDA_CONFIG_DEFAULTCONF_DEVICE_OTHER (15<<20) + +#endif /* _HDA_REG_H_ */ diff --git a/usr/src/cmd/bhyve/hdac_reg.h b/usr/src/cmd/bhyve/hdac_reg.h new file mode 100644 index 0000000000..35272e5135 --- /dev/null +++ b/usr/src/cmd/bhyve/hdac_reg.h @@ -0,0 +1,271 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2006 Stephane E. Potvin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _HDAC_REG_H_ +#define _HDAC_REG_H_ + +/**************************************************************************** + * HDA Controller Register Set + ****************************************************************************/ +#define HDAC_GCAP 0x00 /* 2 - Global Capabilities*/ +#define HDAC_VMIN 0x02 /* 1 - Minor Version */ +#define HDAC_VMAJ 0x03 /* 1 - Major Version */ +#define HDAC_OUTPAY 0x04 /* 2 - Output Payload Capability */ +#define HDAC_INPAY 0x06 /* 2 - Input Payload Capability */ +#define HDAC_GCTL 0x08 /* 4 - Global Control */ +#define HDAC_WAKEEN 0x0c /* 2 - Wake Enable */ +#define HDAC_STATESTS 0x0e /* 2 - State Change Status */ +#define HDAC_GSTS 0x10 /* 2 - Global Status */ +#define HDAC_OUTSTRMPAY 0x18 /* 2 - Output Stream Payload Capability */ +#define HDAC_INSTRMPAY 0x1a /* 2 - Input Stream Payload Capability */ +#define HDAC_INTCTL 0x20 /* 4 - Interrupt Control */ +#define HDAC_INTSTS 0x24 /* 4 - Interrupt Status */ +#define HDAC_WALCLK 0x30 /* 4 - Wall Clock Counter */ +#define HDAC_SSYNC 0x38 /* 4 - Stream Synchronization */ +#define HDAC_CORBLBASE 0x40 /* 4 - CORB Lower Base Address */ +#define HDAC_CORBUBASE 0x44 /* 4 - CORB Upper Base Address */ +#define HDAC_CORBWP 0x48 /* 2 - CORB Write Pointer */ +#define HDAC_CORBRP 0x4a /* 2 - CORB Read Pointer */ +#define HDAC_CORBCTL 0x4c /* 1 - CORB Control */ +#define HDAC_CORBSTS 0x4d /* 1 - CORB Status */ +#define HDAC_CORBSIZE 0x4e /* 1 - CORB Size */ +#define HDAC_RIRBLBASE 0x50 /* 4 - RIRB Lower Base Address */ +#define HDAC_RIRBUBASE 0x54 /* 4 - RIRB Upper Base Address */ +#define HDAC_RIRBWP 0x58 /* 2 - RIRB Write Pointer */ +#define HDAC_RINTCNT 0x5a /* 2 - Response Interrupt Count */ +#define HDAC_RIRBCTL 0x5c /* 1 - RIRB Control */ +#define HDAC_RIRBSTS 0x5d /* 1 - RIRB Status */ +#define HDAC_RIRBSIZE 0x5e /* 1 - RIRB Size */ +#define HDAC_ICOI 0x60 /* 4 - Immediate Command Output Interface */ +#define HDAC_ICII 0x64 /* 4 - Immediate Command Input Interface */ +#define HDAC_ICIS 0x68 /* 2 - Immediate Command Status */ +#define HDAC_DPIBLBASE 0x70 /* 4 - DMA Position Buffer Lower Base */ +#define HDAC_DPIBUBASE 0x74 /* 4 - DMA Position Buffer Upper Base */ +#define HDAC_SDCTL0 0x80 /* 3 - Stream Descriptor Control */ +#define HDAC_SDCTL1 0x81 /* 3 - Stream Descriptor Control */ +#define HDAC_SDCTL2 0x82 /* 3 - Stream Descriptor Control */ +#define HDAC_SDSTS 0x83 /* 1 - Stream Descriptor Status */ +#define HDAC_SDLPIB 0x84 /* 4 - Link Position in Buffer */ +#define HDAC_SDCBL 0x88 /* 4 - Cyclic Buffer Length */ +#define HDAC_SDLVI 0x8C /* 2 - Last Valid Index */ +#define HDAC_SDFIFOS 0x90 /* 2 - FIFOS */ +#define HDAC_SDFMT 0x92 /* 2 - fmt */ +#define HDAC_SDBDPL 0x98 /* 4 - Buffer Descriptor Pointer Lower Base */ +#define HDAC_SDBDPU 0x9C /* 4 - Buffer Descriptor Pointer Upper Base */ + +#define _HDAC_ISDOFFSET(n, iss, oss) (0x80 + ((n) * 0x20)) +#define _HDAC_ISDCTL(n, iss, oss) (0x00 + _HDAC_ISDOFFSET(n, iss, oss)) +#define _HDAC_ISDSTS(n, iss, oss) (0x03 + _HDAC_ISDOFFSET(n, iss, oss)) +#define _HDAC_ISDPICB(n, iss, oss) (0x04 + _HDAC_ISDOFFSET(n, iss, oss)) +#define _HDAC_ISDCBL(n, iss, oss) (0x08 + _HDAC_ISDOFFSET(n, iss, oss)) +#define _HDAC_ISDLVI(n, iss, oss) (0x0c + _HDAC_ISDOFFSET(n, iss, oss)) +#define _HDAC_ISDFIFOD(n, iss, oss) (0x10 + _HDAC_ISDOFFSET(n, iss, oss)) +#define _HDAC_ISDFMT(n, iss, oss) (0x12 + _HDAC_ISDOFFSET(n, iss, oss)) +#define _HDAC_ISDBDPL(n, iss, oss) (0x18 + _HDAC_ISDOFFSET(n, iss, oss)) +#define _HDAC_ISDBDPU(n, iss, oss) (0x1c + 
_HDAC_ISDOFFSET(n, iss, oss))
+
+#define _HDAC_OSDOFFSET(n, iss, oss) (0x80 + ((iss) * 0x20) + ((n) * 0x20))
+#define _HDAC_OSDCTL(n, iss, oss) (0x00 + _HDAC_OSDOFFSET(n, iss, oss))
+#define _HDAC_OSDSTS(n, iss, oss) (0x03 + _HDAC_OSDOFFSET(n, iss, oss))
+#define _HDAC_OSDPICB(n, iss, oss) (0x04 + _HDAC_OSDOFFSET(n, iss, oss))
+#define _HDAC_OSDCBL(n, iss, oss) (0x08 + _HDAC_OSDOFFSET(n, iss, oss))
+#define _HDAC_OSDLVI(n, iss, oss) (0x0c + _HDAC_OSDOFFSET(n, iss, oss))
+#define _HDAC_OSDFIFOD(n, iss, oss) (0x10 + _HDAC_OSDOFFSET(n, iss, oss))
+#define _HDAC_OSDFMT(n, iss, oss) (0x12 + _HDAC_OSDOFFSET(n, iss, oss))
+#define _HDAC_OSDBDPL(n, iss, oss) (0x18 + _HDAC_OSDOFFSET(n, iss, oss))
+#define _HDAC_OSDBDPU(n, iss, oss) (0x1c + _HDAC_OSDOFFSET(n, iss, oss))
+
+#define _HDAC_BSDOFFSET(n, iss, oss) \
+ (0x80 + ((iss) * 0x20) + ((oss) * 0x20) + ((n) * 0x20))
+#define _HDAC_BSDCTL(n, iss, oss) (0x00 + _HDAC_BSDOFFSET(n, iss, oss))
+#define _HDAC_BSDSTS(n, iss, oss) (0x03 + _HDAC_BSDOFFSET(n, iss, oss))
+#define _HDAC_BSDPICB(n, iss, oss) (0x04 + _HDAC_BSDOFFSET(n, iss, oss))
+#define _HDAC_BSDCBL(n, iss, oss) (0x08 + _HDAC_BSDOFFSET(n, iss, oss))
+#define _HDAC_BSDLVI(n, iss, oss) (0x0c + _HDAC_BSDOFFSET(n, iss, oss))
+#define _HDAC_BSDFIFOD(n, iss, oss) (0x10 + _HDAC_BSDOFFSET(n, iss, oss))
+#define _HDAC_BSDFMT(n, iss, oss) (0x12 + _HDAC_BSDOFFSET(n, iss, oss))
+#define _HDAC_BSDBDPL(n, iss, oss) (0x18 + _HDAC_BSDOFFSET(n, iss, oss))
+#define _HDAC_BSDBDBU(n, iss, oss) (0x1c + _HDAC_BSDOFFSET(n, iss, oss))
+
+/****************************************************************************
+ * HDA Controller Register Fields
+ ****************************************************************************/
+
+/* GCAP - Global Capabilities */
+#define HDAC_GCAP_64OK 0x0001
+#define HDAC_GCAP_NSDO_MASK 0x0006
+#define HDAC_GCAP_NSDO_SHIFT 1
+#define HDAC_GCAP_BSS_MASK 0x00f8
+#define HDAC_GCAP_BSS_SHIFT 3
+#define HDAC_GCAP_ISS_MASK 0x0f00
+#define HDAC_GCAP_ISS_SHIFT 8
+#define HDAC_GCAP_OSS_MASK 0xf000
+#define HDAC_GCAP_OSS_SHIFT 12
+
+#define HDAC_GCAP_NSDO_1SDO 0x00
+#define HDAC_GCAP_NSDO_2SDO 0x02
+#define HDAC_GCAP_NSDO_4SDO 0x04
+
+#define HDAC_GCAP_BSS(gcap) \
+ (((gcap) & HDAC_GCAP_BSS_MASK) >> HDAC_GCAP_BSS_SHIFT)
+#define HDAC_GCAP_ISS(gcap) \
+ (((gcap) & HDAC_GCAP_ISS_MASK) >> HDAC_GCAP_ISS_SHIFT)
+#define HDAC_GCAP_OSS(gcap) \
+ (((gcap) & HDAC_GCAP_OSS_MASK) >> HDAC_GCAP_OSS_SHIFT)
+#define HDAC_GCAP_NSDO(gcap) \
+ (((gcap) & HDAC_GCAP_NSDO_MASK) >> HDAC_GCAP_NSDO_SHIFT)
+
+/* GCTL - Global Control */
+#define HDAC_GCTL_CRST 0x00000001
+#define HDAC_GCTL_FCNTRL 0x00000002
+#define HDAC_GCTL_UNSOL 0x00000100
+
+/* WAKEEN - Wake Enable */
+#define HDAC_WAKEEN_SDIWEN_MASK 0x7fff
+#define HDAC_WAKEEN_SDIWEN_SHIFT 0
+
+/* STATESTS - State Change Status */
+#define HDAC_STATESTS_SDIWAKE_MASK 0x7fff
+#define HDAC_STATESTS_SDIWAKE_SHIFT 0
+
+#define HDAC_STATESTS_SDIWAKE(statests, n) \
+ (((((statests) & HDAC_STATESTS_SDIWAKE_MASK) >> \
+ HDAC_STATESTS_SDIWAKE_SHIFT) >> (n)) & 0x0001)
+
+/* GSTS - Global Status */
+#define HDAC_GSTS_FSTS 0x0002
+
+/* INTCTL - Interrupt Control */
+#define HDAC_INTCTL_SIE_MASK 0x3fffffff
+#define HDAC_INTCTL_SIE_SHIFT 0
+#define HDAC_INTCTL_CIE 0x40000000
+#define HDAC_INTCTL_GIE 0x80000000
+
+/* INTSTS - Interrupt Status */
+#define HDAC_INTSTS_SIS_MASK 0x3fffffff
+#define HDAC_INTSTS_SIS_SHIFT 0
+#define HDAC_INTSTS_CIS 0x40000000
+#define HDAC_INTSTS_GIS 0x80000000
+
+/* SSYNC - Stream Synchronization */
+#define HDAC_SSYNC_SSYNC_MASK 0x3fffffff
+#define HDAC_SSYNC_SSYNC_SHIFT 0
+
+/* CORBWP - CORB Write Pointer */
+#define HDAC_CORBWP_CORBWP_MASK 0x00ff
+#define HDAC_CORBWP_CORBWP_SHIFT 0
+
+/* CORBRP - CORB Read Pointer */
+#define HDAC_CORBRP_CORBRP_MASK 0x00ff
+#define HDAC_CORBRP_CORBRP_SHIFT 0
+#define HDAC_CORBRP_CORBRPRST 0x8000
+
+/* CORBCTL - CORB Control */
+#define HDAC_CORBCTL_CMEIE 0x01
+#define HDAC_CORBCTL_CORBRUN 0x02
+
+/* CORBSTS - CORB Status */
+#define HDAC_CORBSTS_CMEI 0x01
+
+/* CORBSIZE - CORB Size */
+#define HDAC_CORBSIZE_CORBSIZE_MASK 0x03
+#define HDAC_CORBSIZE_CORBSIZE_SHIFT 0
+#define HDAC_CORBSIZE_CORBSZCAP_MASK 0xf0
+#define HDAC_CORBSIZE_CORBSZCAP_SHIFT 4
+
+#define HDAC_CORBSIZE_CORBSIZE_2 0x00
+#define HDAC_CORBSIZE_CORBSIZE_16 0x01
+#define HDAC_CORBSIZE_CORBSIZE_256 0x02
+
+#define HDAC_CORBSIZE_CORBSZCAP_2 0x10
+#define HDAC_CORBSIZE_CORBSZCAP_16 0x20
+#define HDAC_CORBSIZE_CORBSZCAP_256 0x40
+
+#define HDAC_CORBSIZE_CORBSIZE(corbsize) \
+ (((corbsize) & HDAC_CORBSIZE_CORBSIZE_MASK) >> HDAC_CORBSIZE_CORBSIZE_SHIFT)
+
+/* RIRBWP - RIRB Write Pointer */
+#define HDAC_RIRBWP_RIRBWP_MASK 0x00ff
+#define HDAC_RIRBWP_RIRBWP_SHIFT 0
+#define HDAC_RIRBWP_RIRBWPRST 0x8000
+
+/* RINTCNT - Response Interrupt Count */
+#define HDAC_RINTCNT_MASK 0x00ff
+#define HDAC_RINTCNT_SHIFT 0
+
+/* RIRBCTL - RIRB Control */
+#define HDAC_RIRBCTL_RINTCTL 0x01
+#define HDAC_RIRBCTL_RIRBDMAEN 0x02
+#define HDAC_RIRBCTL_RIRBOIC 0x04
+
+/* RIRBSTS - RIRB Status */
+#define HDAC_RIRBSTS_RINTFL 0x01
+#define HDAC_RIRBSTS_RIRBOIS 0x04
+
+/* RIRBSIZE - RIRB Size */
+#define HDAC_RIRBSIZE_RIRBSIZE_MASK 0x03
+#define HDAC_RIRBSIZE_RIRBSIZE_SHIFT 0
+#define HDAC_RIRBSIZE_RIRBSZCAP_MASK 0xf0
+#define HDAC_RIRBSIZE_RIRBSZCAP_SHIFT 4
+
+#define HDAC_RIRBSIZE_RIRBSIZE_2 0x00
+#define HDAC_RIRBSIZE_RIRBSIZE_16 0x01
+#define HDAC_RIRBSIZE_RIRBSIZE_256 0x02
+
+#define HDAC_RIRBSIZE_RIRBSZCAP_2 0x10
+#define HDAC_RIRBSIZE_RIRBSZCAP_16 0x20
+#define HDAC_RIRBSIZE_RIRBSZCAP_256 0x40
+
+#define HDAC_RIRBSIZE_RIRBSIZE(rirbsize) \
+ (((rirbsize) & HDAC_RIRBSIZE_RIRBSIZE_MASK) >> HDAC_RIRBSIZE_RIRBSIZE_SHIFT)
+
+/* DPLBASE - DMA Position Lower Base Address */
+#define HDAC_DPLBASE_DPLBASE_MASK 0xffffff80
+#define HDAC_DPLBASE_DPLBASE_SHIFT 7
+#define HDAC_DPLBASE_DPLBASE_DMAPBE 0x00000001
+
+/* SDCTL - Stream Descriptor Control */
+#define HDAC_SDCTL_SRST 0x000001
+#define HDAC_SDCTL_RUN 0x000002
+#define HDAC_SDCTL_IOCE 0x000004
+#define HDAC_SDCTL_FEIE 0x000008
+#define HDAC_SDCTL_DEIE 0x000010
+#define HDAC_SDCTL2_STRIPE_MASK 0x03
+#define HDAC_SDCTL2_STRIPE_SHIFT 0
+#define HDAC_SDCTL2_TP 0x04
+#define HDAC_SDCTL2_DIR 0x08
+#define HDAC_SDCTL2_STRM_MASK 0xf0
+#define HDAC_SDCTL2_STRM_SHIFT 4
+
+#define HDAC_SDSTS_DESE (1 << 4)
+#define HDAC_SDSTS_FIFOE (1 << 3)
+#define HDAC_SDSTS_BCIS (1 << 2)
+
+#endif /* _HDAC_REG_H_ */
diff --git a/usr/src/cmd/bhyve/mevent.c b/usr/src/cmd/bhyve/mevent.c
index a258fd3047..d604039e1b 100644
--- a/usr/src/cmd/bhyve/mevent.c
+++ b/usr/src/cmd/bhyve/mevent.c
@@ -139,7 +139,7 @@ mevent_pipe_read(int fd, enum ev_type type, void *param)
 static void
 mevent_notify(void)
 {
-	char c;
+	char c = '\0';
 
 	/*
 	 * If calling from outside the i/o thread, write a byte on the
diff --git a/usr/src/cmd/bhyve/net_backends.c b/usr/src/cmd/bhyve/net_backends.c
new file mode 100644
index 0000000000..88afaca4b1
--- /dev/null
+++ b/usr/src/cmd/bhyve/net_backends.c
@@ -0,0 +1,807 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2019 Vincenzo Maffione
+ *
+ * Redistribution and use in source and binary forms, 
with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * This file implements multiple network backends (tap, netmap, ...), + * to be used by network frontends such as virtio-net and e1000. + * The API to access the backend (e.g. send/receive packets, negotiate + * features) is exported by net_backends.h. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include /* u_short etc */ +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include + +#include +#include +#include +#define NETMAP_WITH_LIBS +#include + +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "iov.h" +#include "mevent.h" +#include "net_backends.h" + +#include + +/* + * Each network backend registers a set of function pointers that are + * used to implement the net backends API. + * This might need to be exposed if we implement backends in separate files. + */ +struct net_backend { + const char *prefix; /* prefix matching this backend */ + + /* + * Routines used to initialize and cleanup the resources needed + * by a backend. The cleanup function is used internally, + * and should not be called by the frontend. + */ + int (*init)(struct net_backend *be, const char *devname, + net_be_rxeof_t cb, void *param); + void (*cleanup)(struct net_backend *be); + + /* + * Called to serve a guest transmit request. The scatter-gather + * vector provided by the caller has 'iovcnt' elements and contains + * the packet to send. + */ + ssize_t (*send)(struct net_backend *be, struct iovec *iov, int iovcnt); + + /* + * Called to receive a packet from the backend. When the function + * returns a positive value 'len', the scatter-gather vector + * provided by the caller contains a packet with such length. + * The function returns 0 if the backend doesn't have a new packet to + * receive. + */ + ssize_t (*recv)(struct net_backend *be, struct iovec *iov, int iovcnt); + + /* + * Ask the backend for the virtio-net features it is able to + * support. Possible features are TSO, UFO and checksum offloading + * in both rx and tx direction and for both IPv4 and IPv6. 
+ */ + uint64_t (*get_cap)(struct net_backend *be); + + /* + * Tell the backend to enable/disable the specified virtio-net + * features (capabilities). + */ + int (*set_cap)(struct net_backend *be, uint64_t features, + unsigned int vnet_hdr_len); + + struct pci_vtnet_softc *sc; + int fd; + + /* + * Length of the virtio-net header used by the backend and the + * frontend, respectively. A zero value means that the header + * is not used. + */ + unsigned int be_vnet_hdr_len; + unsigned int fe_vnet_hdr_len; + + /* Size of backend-specific private data. */ + size_t priv_size; + + /* Room for backend-specific data. */ + char opaque[0]; +}; + +SET_DECLARE(net_backend_set, struct net_backend); + +#define VNET_HDR_LEN sizeof(struct virtio_net_rxhdr) + +#define WPRINTF(params) printf params + +/* + * The tap backend + */ + +struct tap_priv { + struct mevent *mevp; +}; + +static void +tap_cleanup(struct net_backend *be) +{ + struct tap_priv *priv = (struct tap_priv *)be->opaque; + + if (priv->mevp) { + mevent_delete(priv->mevp); + } + if (be->fd != -1) { + close(be->fd); + be->fd = -1; + } +} + +static int +tap_init(struct net_backend *be, const char *devname, + net_be_rxeof_t cb, void *param) +{ + struct tap_priv *priv = (struct tap_priv *)be->opaque; + char tbuf[80]; + int opt = 1; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif + + if (cb == NULL) { + WPRINTF(("TAP backend requires non-NULL callback\n")); + return (-1); + } + + strcpy(tbuf, "/dev/"); + strlcat(tbuf, devname, sizeof(tbuf)); + + be->fd = open(tbuf, O_RDWR); + if (be->fd == -1) { + WPRINTF(("open of tap device %s failed\n", tbuf)); + goto error; + } + + /* + * Set non-blocking and register for read + * notifications with the event loop + */ + if (ioctl(be->fd, FIONBIO, &opt) < 0) { + WPRINTF(("tap device O_NONBLOCK failed\n")); + goto error; + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); + if (caph_rights_limit(be->fd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + priv->mevp = mevent_add(be->fd, EVF_READ, cb, param); + if (priv->mevp == NULL) { + WPRINTF(("Could not register event\n")); + goto error; + } + + return (0); + +error: + tap_cleanup(be); + return (-1); +} + +/* + * Called to send a buffer chain out to the tap device + */ +static ssize_t +tap_send(struct net_backend *be, struct iovec *iov, int iovcnt) +{ + return (writev(be->fd, iov, iovcnt)); +} + +static ssize_t +tap_recv(struct net_backend *be, struct iovec *iov, int iovcnt) +{ + ssize_t ret; + + /* Should never be called without a valid tap fd */ + assert(be->fd != -1); + + ret = readv(be->fd, iov, iovcnt); + + if (ret < 0 && errno == EWOULDBLOCK) { + return (0); + } + + return (ret); +} + +static uint64_t +tap_get_cap(struct net_backend *be) +{ + + return (0); /* no capabilities for now */ +} + +static int +tap_set_cap(struct net_backend *be, uint64_t features, + unsigned vnet_hdr_len) +{ + + return ((features || vnet_hdr_len) ? -1 : 0); +} + +static struct net_backend tap_backend = { + .prefix = "tap", + .priv_size = sizeof(struct tap_priv), + .init = tap_init, + .cleanup = tap_cleanup, + .send = tap_send, + .recv = tap_recv, + .get_cap = tap_get_cap, + .set_cap = tap_set_cap, +}; + +/* A clone of the tap backend, with a different prefix. 
*/
+static struct net_backend vmnet_backend = {
+ .prefix = "vmnet",
+ .priv_size = sizeof(struct tap_priv),
+ .init = tap_init,
+ .cleanup = tap_cleanup,
+ .send = tap_send,
+ .recv = tap_recv,
+ .get_cap = tap_get_cap,
+ .set_cap = tap_set_cap,
+};
+
+DATA_SET(net_backend_set, tap_backend);
+DATA_SET(net_backend_set, vmnet_backend);
+
+/*
+ * The netmap backend
+ */
+
+/* The virtio-net features supported by netmap. */
+#define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \
+ VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \
+ VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | \
+ VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO)
+
+struct netmap_priv {
+ char ifname[IFNAMSIZ];
+ struct nm_desc *nmd;
+ uint16_t memid;
+ struct netmap_ring *rx;
+ struct netmap_ring *tx;
+ struct mevent *mevp;
+ net_be_rxeof_t cb;
+ void *cb_param;
+};
+
+static void
+nmreq_init(struct nmreq *req, char *ifname)
+{
+
+ memset(req, 0, sizeof(*req));
+ strlcpy(req->nr_name, ifname, sizeof(req->nr_name));
+ req->nr_version = NETMAP_API;
+}
+
+static int
+netmap_set_vnet_hdr_len(struct net_backend *be, int vnet_hdr_len)
+{
+ int err;
+ struct nmreq req;
+ struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
+
+ nmreq_init(&req, priv->ifname);
+ req.nr_cmd = NETMAP_BDG_VNET_HDR;
+ req.nr_arg1 = vnet_hdr_len;
+ err = ioctl(be->fd, NIOCREGIF, &req);
+ if (err) {
+ WPRINTF(("Unable to set vnet header length %d\n",
+ vnet_hdr_len));
+ return (err);
+ }
+
+ be->be_vnet_hdr_len = vnet_hdr_len;
+
+ return (0);
+}
+
+static int
+netmap_has_vnet_hdr_len(struct net_backend *be, unsigned vnet_hdr_len)
+{
+ int prev_hdr_len = be->be_vnet_hdr_len;
+ int ret;
+
+ if (vnet_hdr_len == prev_hdr_len) {
+ return (1);
+ }
+
+ ret = netmap_set_vnet_hdr_len(be, vnet_hdr_len);
+ if (ret) {
+ return (0);
+ }
+
+ netmap_set_vnet_hdr_len(be, prev_hdr_len);
+
+ return (1);
+}
+
+static uint64_t
+netmap_get_cap(struct net_backend *be)
+{
+
+ return (netmap_has_vnet_hdr_len(be, VNET_HDR_LEN) ? 
+ NETMAP_FEATURES : 0); +} + +static int +netmap_set_cap(struct net_backend *be, uint64_t features, + unsigned vnet_hdr_len) +{ + + return (netmap_set_vnet_hdr_len(be, vnet_hdr_len)); +} + +static int +netmap_init(struct net_backend *be, const char *devname, + net_be_rxeof_t cb, void *param) +{ + struct netmap_priv *priv = (struct netmap_priv *)be->opaque; + + strlcpy(priv->ifname, devname, sizeof(priv->ifname)); + priv->ifname[sizeof(priv->ifname) - 1] = '\0'; + + priv->nmd = nm_open(priv->ifname, NULL, NETMAP_NO_TX_POLL, NULL); + if (priv->nmd == NULL) { + WPRINTF(("Unable to nm_open(): interface '%s', errno (%s)\n", + devname, strerror(errno))); + free(priv); + return (-1); + } + + priv->memid = priv->nmd->req.nr_arg2; + priv->tx = NETMAP_TXRING(priv->nmd->nifp, 0); + priv->rx = NETMAP_RXRING(priv->nmd->nifp, 0); + priv->cb = cb; + priv->cb_param = param; + be->fd = priv->nmd->fd; + + priv->mevp = mevent_add(be->fd, EVF_READ, cb, param); + if (priv->mevp == NULL) { + WPRINTF(("Could not register event\n")); + return (-1); + } + + return (0); +} + +static void +netmap_cleanup(struct net_backend *be) +{ + struct netmap_priv *priv = (struct netmap_priv *)be->opaque; + + if (priv->mevp) { + mevent_delete(priv->mevp); + } + if (priv->nmd) { + nm_close(priv->nmd); + } + be->fd = -1; +} + +static ssize_t +netmap_send(struct net_backend *be, struct iovec *iov, + int iovcnt) +{ + struct netmap_priv *priv = (struct netmap_priv *)be->opaque; + struct netmap_ring *ring; + ssize_t totlen = 0; + int nm_buf_size; + int nm_buf_len; + uint32_t head; + void *nm_buf; + int j; + + ring = priv->tx; + head = ring->head; + if (head == ring->tail) { + WPRINTF(("No space, drop %zu bytes\n", count_iov(iov, iovcnt))); + goto txsync; + } + nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx); + nm_buf_size = ring->nr_buf_size; + nm_buf_len = 0; + + for (j = 0; j < iovcnt; j++) { + int iov_frag_size = iov[j].iov_len; + void *iov_frag_buf = iov[j].iov_base; + + totlen += iov_frag_size; + + /* + * Split each iovec fragment over more netmap slots, if + * necessary. + */ + for (;;) { + int copylen; + + copylen = iov_frag_size < nm_buf_size ? iov_frag_size : nm_buf_size; + memcpy(nm_buf, iov_frag_buf, copylen); + + iov_frag_buf += copylen; + iov_frag_size -= copylen; + nm_buf += copylen; + nm_buf_size -= copylen; + nm_buf_len += copylen; + + if (iov_frag_size == 0) { + break; + } + + ring->slot[head].len = nm_buf_len; + ring->slot[head].flags = NS_MOREFRAG; + head = nm_ring_next(ring, head); + if (head == ring->tail) { + /* + * We ran out of netmap slots while + * splitting the iovec fragments. + */ + WPRINTF(("No space, drop %zu bytes\n", + count_iov(iov, iovcnt))); + goto txsync; + } + nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx); + nm_buf_size = ring->nr_buf_size; + nm_buf_len = 0; + } + } + + /* Complete the last slot, which must not have NS_MOREFRAG set. */ + ring->slot[head].len = nm_buf_len; + ring->slot[head].flags = 0; + head = nm_ring_next(ring, head); + + /* Now update ring->head and ring->cur. 
*/ + ring->head = ring->cur = head; +txsync: + ioctl(be->fd, NIOCTXSYNC, NULL); + + return (totlen); +} + +static ssize_t +netmap_recv(struct net_backend *be, struct iovec *iov, int iovcnt) +{ + struct netmap_priv *priv = (struct netmap_priv *)be->opaque; + struct netmap_slot *slot = NULL; + struct netmap_ring *ring; + void *iov_frag_buf; + int iov_frag_size; + ssize_t totlen = 0; + uint32_t head; + + assert(iovcnt); + + ring = priv->rx; + head = ring->head; + iov_frag_buf = iov->iov_base; + iov_frag_size = iov->iov_len; + + do { + int nm_buf_len; + void *nm_buf; + + if (head == ring->tail) { + return (0); + } + + slot = ring->slot + head; + nm_buf = NETMAP_BUF(ring, slot->buf_idx); + nm_buf_len = slot->len; + + for (;;) { + int copylen = nm_buf_len < iov_frag_size ? + nm_buf_len : iov_frag_size; + + memcpy(iov_frag_buf, nm_buf, copylen); + nm_buf += copylen; + nm_buf_len -= copylen; + iov_frag_buf += copylen; + iov_frag_size -= copylen; + totlen += copylen; + + if (nm_buf_len == 0) { + break; + } + + iov++; + iovcnt--; + if (iovcnt == 0) { + /* No space to receive. */ + WPRINTF(("Short iov, drop %zd bytes\n", + totlen)); + return (-ENOSPC); + } + iov_frag_buf = iov->iov_base; + iov_frag_size = iov->iov_len; + } + + head = nm_ring_next(ring, head); + + } while (slot->flags & NS_MOREFRAG); + + /* Release slots to netmap. */ + ring->head = ring->cur = head; + + return (totlen); +} + +static struct net_backend netmap_backend = { + .prefix = "netmap", + .priv_size = sizeof(struct netmap_priv), + .init = netmap_init, + .cleanup = netmap_cleanup, + .send = netmap_send, + .recv = netmap_recv, + .get_cap = netmap_get_cap, + .set_cap = netmap_set_cap, +}; + +/* A clone of the netmap backend, with a different prefix. */ +static struct net_backend vale_backend = { + .prefix = "vale", + .priv_size = sizeof(struct netmap_priv), + .init = netmap_init, + .cleanup = netmap_cleanup, + .send = netmap_send, + .recv = netmap_recv, + .get_cap = netmap_get_cap, + .set_cap = netmap_set_cap, +}; + +DATA_SET(net_backend_set, netmap_backend); +DATA_SET(net_backend_set, vale_backend); + +/* + * Initialize a backend and attach to the frontend. + * This is called during frontend initialization. + * @pbe is a pointer to the backend to be initialized + * @devname is the backend-name as supplied on the command line, + * e.g. -s 2:0,frontend-name,backend-name[,other-args] + * @cb is the receive callback supplied by the frontend, + * and it is invoked in the event loop when a receive + * event is generated in the hypervisor, + * @param is a pointer to the frontend, and normally used as + * the argument for the callback. + */ +int +netbe_init(struct net_backend **ret, const char *devname, net_be_rxeof_t cb, + void *param) +{ + struct net_backend **pbe, *nbe, *tbe = NULL; + int err; + + /* + * Find the network backend that matches the user-provided + * device name. net_backend_set is built using a linker set. + */ + SET_FOREACH(pbe, net_backend_set) { + if (strncmp(devname, (*pbe)->prefix, + strlen((*pbe)->prefix)) == 0) { + tbe = *pbe; + assert(tbe->init != NULL); + assert(tbe->cleanup != NULL); + assert(tbe->send != NULL); + assert(tbe->recv != NULL); + assert(tbe->get_cap != NULL); + assert(tbe->set_cap != NULL); + break; + } + } + + *ret = NULL; + if (tbe == NULL) + return (EINVAL); + nbe = calloc(1, sizeof(*nbe) + tbe->priv_size); + *nbe = *tbe; /* copy the template */ + nbe->fd = -1; + nbe->sc = param; + nbe->be_vnet_hdr_len = 0; + nbe->fe_vnet_hdr_len = 0; + + /* Initialize the backend. 
*/ + err = nbe->init(nbe, devname, cb, param); + if (err) { + free(nbe); + return (err); + } + + *ret = nbe; + + return (0); +} + +void +netbe_cleanup(struct net_backend *be) +{ + + if (be != NULL) { + be->cleanup(be); + free(be); + } +} + +uint64_t +netbe_get_cap(struct net_backend *be) +{ + + assert(be != NULL); + return (be->get_cap(be)); +} + +int +netbe_set_cap(struct net_backend *be, uint64_t features, + unsigned vnet_hdr_len) +{ + int ret; + + assert(be != NULL); + + /* There are only three valid lengths, i.e., 0, 10 and 12. */ + if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN + && vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t))) + return (-1); + + be->fe_vnet_hdr_len = vnet_hdr_len; + + ret = be->set_cap(be, features, vnet_hdr_len); + assert(be->be_vnet_hdr_len == 0 || + be->be_vnet_hdr_len == be->fe_vnet_hdr_len); + + return (ret); +} + +static __inline struct iovec * +iov_trim(struct iovec *iov, int *iovcnt, unsigned int tlen) +{ + struct iovec *riov; + + /* XXX short-cut: assume first segment is >= tlen */ + assert(iov[0].iov_len >= tlen); + + iov[0].iov_len -= tlen; + if (iov[0].iov_len == 0) { + assert(*iovcnt > 1); + *iovcnt -= 1; + riov = &iov[1]; + } else { + iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + tlen); + riov = &iov[0]; + } + + return (riov); +} + +ssize_t +netbe_send(struct net_backend *be, struct iovec *iov, int iovcnt) +{ + + assert(be != NULL); + if (be->be_vnet_hdr_len != be->fe_vnet_hdr_len) { + /* + * The frontend uses a virtio-net header, but the backend + * does not. We ignore it (as it must be all zeroes) and + * strip it. + */ + assert(be->be_vnet_hdr_len == 0); + iov = iov_trim(iov, &iovcnt, be->fe_vnet_hdr_len); + } + + return (be->send(be, iov, iovcnt)); +} + +/* + * Try to read a packet from the backend, without blocking. + * If no packets are available, return 0. In case of success, return + * the length of the packet just read. Return -1 in case of errors. + */ +ssize_t +netbe_recv(struct net_backend *be, struct iovec *iov, int iovcnt) +{ + /* Length of prepended virtio-net header. */ + unsigned int hlen = be->fe_vnet_hdr_len; + int ret; + + assert(be != NULL); + + if (hlen && hlen != be->be_vnet_hdr_len) { + /* + * The frontend uses a virtio-net header, but the backend + * does not. We need to prepend a zeroed header. + */ + struct virtio_net_rxhdr *vh; + + assert(be->be_vnet_hdr_len == 0); + + /* + * Get a pointer to the rx header, and use the + * data immediately following it for the packet buffer. + */ + vh = iov[0].iov_base; + iov = iov_trim(iov, &iovcnt, hlen); + + /* + * The only valid field in the rx packet header is the + * number of buffers if merged rx bufs were negotiated. + */ + memset(vh, 0, hlen); + if (hlen == VNET_HDR_LEN) { + vh->vrh_bufs = 1; + } + } + + ret = be->recv(be, iov, iovcnt); + if (ret > 0) { + ret += hlen; + } + + return (ret); +} + +/* + * Read a packet from the backend and discard it. + * Returns the size of the discarded packet or zero if no packet was available. + * A negative error code is returned in case of read error. + */ +ssize_t +netbe_rx_discard(struct net_backend *be) +{ + /* + * MP note: the dummybuf is only used to discard frames, + * so there is no need for it to be per-vtnet or locked. + * We only make it large enough for TSO-sized segment. 
+ */
+ static uint8_t dummybuf[65536 + 64];
+ struct iovec iov;
+
+ iov.iov_base = dummybuf;
+ iov.iov_len = sizeof(dummybuf);
+
+ return netbe_recv(be, &iov, 1);
+}
+
diff --git a/usr/src/cmd/bhyve/net_backends.h b/usr/src/cmd/bhyve/net_backends.h
new file mode 100644
index 0000000000..bba39db59b
--- /dev/null
+++ b/usr/src/cmd/bhyve/net_backends.h
@@ -0,0 +1,89 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2019 Vincenzo Maffione
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+ * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef __NET_BACKENDS_H__
+#define __NET_BACKENDS_H__
+
+#include
+
+/* Opaque type representing a network backend. */
+typedef struct net_backend net_backend_t;
+
+/* Interface between network frontends and the network backends. */
+typedef void (*net_be_rxeof_t)(int, enum ev_type, void *param);
+int netbe_init(net_backend_t **be, const char *devname, net_be_rxeof_t cb,
+ void *param);
+void netbe_cleanup(net_backend_t *be);
+uint64_t netbe_get_cap(net_backend_t *be);
+int netbe_set_cap(net_backend_t *be, uint64_t cap,
+ unsigned vnet_hdr_len);
+ssize_t netbe_send(net_backend_t *be, struct iovec *iov, int iovcnt);
+ssize_t netbe_recv(net_backend_t *be, struct iovec *iov, int iovcnt);
+ssize_t netbe_rx_discard(net_backend_t *be);
+
+
+/*
+ * Network device capabilities taken from the VirtIO standard.
+ * Despite the name, these capabilities can be used by different frontends
+ * (virtio-net, ptnet) and supported by different backends (netmap, tap, ...).
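+ * A frontend typically intersects these bits with what its guest driver
+ * negotiated before programming the backend; a minimal sketch, where
+ * guest_features and vnet_hdr_len are illustrative values supplied by
+ * the frontend:
+ *
+ *	features = netbe_get_cap(be) & guest_features;
+ *	(void) netbe_set_cap(be, features, vnet_hdr_len);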
+ */ +#define VIRTIO_NET_F_CSUM (1 << 0) /* host handles partial cksum */ +#define VIRTIO_NET_F_GUEST_CSUM (1 << 1) /* guest handles partial cksum */ +#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */ +#define VIRTIO_NET_F_GSO_DEPREC (1 << 6) /* deprecated: host handles GSO */ +#define VIRTIO_NET_F_GUEST_TSO4 (1 << 7) /* guest can rcv TSOv4 */ +#define VIRTIO_NET_F_GUEST_TSO6 (1 << 8) /* guest can rcv TSOv6 */ +#define VIRTIO_NET_F_GUEST_ECN (1 << 9) /* guest can rcv TSO with ECN */ +#define VIRTIO_NET_F_GUEST_UFO (1 << 10) /* guest can rcv UFO */ +#define VIRTIO_NET_F_HOST_TSO4 (1 << 11) /* host can rcv TSOv4 */ +#define VIRTIO_NET_F_HOST_TSO6 (1 << 12) /* host can rcv TSOv6 */ +#define VIRTIO_NET_F_HOST_ECN (1 << 13) /* host can rcv TSO with ECN */ +#define VIRTIO_NET_F_HOST_UFO (1 << 14) /* host can rcv UFO */ +#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */ +#define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */ +#define VIRTIO_NET_F_CTRL_VQ (1 << 17) /* control channel available */ +#define VIRTIO_NET_F_CTRL_RX (1 << 18) /* control channel RX mode support */ +#define VIRTIO_NET_F_CTRL_VLAN (1 << 19) /* control channel VLAN filtering */ +#define VIRTIO_NET_F_GUEST_ANNOUNCE \ + (1 << 21) /* guest can send gratuitous pkts */ + +/* + * Fixed network header size + */ +struct virtio_net_rxhdr { + uint8_t vrh_flags; + uint8_t vrh_gso_type; + uint16_t vrh_hdr_len; + uint16_t vrh_gso_size; + uint16_t vrh_csum_start; + uint16_t vrh_csum_offset; + uint16_t vrh_bufs; +} __packed; + +#endif /* __NET_BACKENDS_H__ */ diff --git a/usr/src/cmd/bhyve/net_utils.c b/usr/src/cmd/bhyve/net_utils.c new file mode 100644 index 0000000000..a7ae4d2eef --- /dev/null +++ b/usr/src/cmd/bhyve/net_utils.c @@ -0,0 +1,89 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include +#include +#include + +#include "bhyverun.h" +#include "net_utils.h" + +int +net_parsemac(char *mac_str, uint8_t *mac_addr) +{ + struct ether_addr *ea; + char *tmpstr; + char zero_addr[ETHER_ADDR_LEN] = { 0, 0, 0, 0, 0, 0 }; + + tmpstr = strsep(&mac_str,"="); + + if ((mac_str != NULL) && (!strcmp(tmpstr,"mac"))) { + ea = ether_aton(mac_str); + + if (ea == NULL || ETHER_IS_MULTICAST(ea->octet) || + memcmp(ea->octet, zero_addr, ETHER_ADDR_LEN) == 0) { + fprintf(stderr, "Invalid MAC %s\n", mac_str); + return (EINVAL); + } else + memcpy(mac_addr, ea->octet, ETHER_ADDR_LEN); + } + + return (0); +} + +void +net_genmac(struct pci_devinst *pi, uint8_t *macaddr) +{ + /* + * The default MAC address is the standard NetApp OUI of 00-a0-98, + * followed by an MD5 of the PCI slot/func number and dev name + */ + MD5_CTX mdctx; + unsigned char digest[16]; + char nstr[80]; + + snprintf(nstr, sizeof(nstr), "%d-%d-%s", pi->pi_slot, + pi->pi_func, vmname); + + MD5Init(&mdctx); + MD5Update(&mdctx, nstr, (unsigned int)strlen(nstr)); + MD5Final(digest, &mdctx); + + macaddr[0] = 0x00; + macaddr[1] = 0xa0; + macaddr[2] = 0x98; + macaddr[3] = digest[0]; + macaddr[4] = digest[1]; + macaddr[5] = digest[2]; +} diff --git a/usr/src/cmd/bhyve/net_utils.h b/usr/src/cmd/bhyve/net_utils.h new file mode 100644 index 0000000000..3c83519931 --- /dev/null +++ b/usr/src/cmd/bhyve/net_utils.h @@ -0,0 +1,39 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2019 Vincenzo Maffione + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NET_UTILS_H_ +#define _NET_UTILS_H_ + +#include +#include "pci_emul.h" + +void net_genmac(struct pci_devinst *pi, uint8_t *macaddr); +int net_parsemac(char *mac_str, uint8_t *mac_addr); + +#endif /* _NET_UTILS_H_ */ diff --git a/usr/src/cmd/bhyve/pci_e82545.c b/usr/src/cmd/bhyve/pci_e82545.c index e211b5cf9c..62a647e43e 100644 --- a/usr/src/cmd/bhyve/pci_e82545.c +++ b/usr/src/cmd/bhyve/pci_e82545.c @@ -68,6 +68,7 @@ __FBSDID("$FreeBSD$"); #include "bhyverun.h" #include "pci_emul.h" #include "mevent.h" +#include "net_utils.h" /* Hardware/register definitions XXX: move some to common code. 
*/ #define E82545_VENDOR_ID_INTEL 0x8086 @@ -2285,39 +2286,17 @@ e82545_open_tap(struct e82545_softc *sc, char *opts) #endif } -static int -e82545_parsemac(char *mac_str, uint8_t *mac_addr) -{ - struct ether_addr *ea; - char *tmpstr; - char zero_addr[ETHER_ADDR_LEN] = { 0, 0, 0, 0, 0, 0 }; - - tmpstr = strsep(&mac_str,"="); - if ((mac_str != NULL) && (!strcmp(tmpstr,"mac"))) { - ea = ether_aton(mac_str); - if (ea == NULL || ETHER_IS_MULTICAST(ea->octet) || - memcmp(ea->octet, zero_addr, ETHER_ADDR_LEN) == 0) { - fprintf(stderr, "Invalid MAC %s\n", mac_str); - return (1); - } else - memcpy(mac_addr, ea->octet, ETHER_ADDR_LEN); - } - return (0); -} - static int e82545_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { - DPRINTF("Loading with options: %s\r\n", opts); - - MD5_CTX mdctx; - unsigned char digest[16]; char nstr[80]; struct e82545_softc *sc; char *devname; char *vtopts; int mac_provided; + DPRINTF("Loading with options: %s\r\n", opts); + /* Setup our softc */ sc = calloc(1, sizeof(*sc)); @@ -2367,7 +2346,7 @@ e82545_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) (void) strsep(&vtopts, ","); if (vtopts != NULL) { - err = e82545_parsemac(vtopts, sc->esc_mac.octet); + err = net_parsemac(vtopts, sc->esc_mac.octet); if (err != 0) { free(devname); return (err); @@ -2382,24 +2361,8 @@ e82545_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) free(devname); } - /* - * The default MAC address is the standard NetApp OUI of 00-a0-98, - * followed by an MD5 of the PCI slot/func number and dev name - */ if (!mac_provided) { - snprintf(nstr, sizeof(nstr), "%d-%d-%s", pi->pi_slot, - pi->pi_func, vmname); - - MD5Init(&mdctx); - MD5Update(&mdctx, nstr, strlen(nstr)); - MD5Final(digest, &mdctx); - - sc->esc_mac.octet[0] = 0x00; - sc->esc_mac.octet[1] = 0xa0; - sc->esc_mac.octet[2] = 0x98; - sc->esc_mac.octet[3] = digest[0]; - sc->esc_mac.octet[4] = digest[1]; - sc->esc_mac.octet[5] = digest[2]; + net_genmac(pi, sc->esc_mac.octet); } /* H/w initiated reset */ diff --git a/usr/src/cmd/bhyve/pci_emul.c b/usr/src/cmd/bhyve/pci_emul.c index a71cc528aa..771cf4e77e 100644 --- a/usr/src/cmd/bhyve/pci_emul.c +++ b/usr/src/cmd/bhyve/pci_emul.c @@ -600,6 +600,7 @@ pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, uint64_t hostbase, uint64_t *baseptr = NULL; uint64_t limit = 0, lobits = 0; uint64_t addr, mask, bar; + uint16_t cmd, enbit; int error; assert(idx >= 0 && idx <= PCI_BARMAX); @@ -619,13 +620,14 @@ pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, uint64_t hostbase, switch (type) { case PCIBAR_NONE: baseptr = NULL; - addr = mask = lobits = 0; + addr = mask = lobits = enbit = 0; break; case PCIBAR_IO: baseptr = &pci_emul_iobase; limit = PCI_EMUL_IOLIMIT; mask = PCIM_BAR_IO_BASE; lobits = PCIM_BAR_IO_SPACE; + enbit = PCIM_CMD_PORTEN; break; case PCIBAR_MEM64: /* @@ -647,19 +649,20 @@ pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, uint64_t hostbase, mask = PCIM_BAR_MEM_BASE; lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 | PCIM_BAR_MEM_PREFETCH; - break; } else { baseptr = &pci_emul_membase32; limit = PCI_EMUL_MEMLIMIT32; mask = PCIM_BAR_MEM_BASE; lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64; } + enbit = PCIM_CMD_MEMEN; break; case PCIBAR_MEM32: baseptr = &pci_emul_membase32; limit = PCI_EMUL_MEMLIMIT32; mask = PCIM_BAR_MEM_BASE; lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32; + enbit = PCIM_CMD_MEMEN; break; default: printf("pci_emul_alloc_base: invalid bar type %d\n", type); @@ -690,6 +693,9 @@ pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, uint64_t 
hostbase, pci_set_cfgdata32(pdi, PCIR_BAR(idx + 1), bar >> 32); } + cmd = pci_get_cfgdata16(pdi, PCIR_COMMAND); + if ((cmd & enbit) != enbit) + pci_set_cfgdata16(pdi, PCIR_COMMAND, cmd | enbit); register_bar(pdi, idx); return (0); @@ -775,8 +781,7 @@ pci_emul_init(struct vmctx *ctx, struct pci_devemu *pde, int bus, int slot, pci_set_cfgdata8(pdi, PCIR_INTLINE, 255); pci_set_cfgdata8(pdi, PCIR_INTPIN, 0); - pci_set_cfgdata8(pdi, PCIR_COMMAND, - PCIM_CMD_PORTEN | PCIM_CMD_MEMEN | PCIM_CMD_BUSMASTEREN); + pci_set_cfgdata8(pdi, PCIR_COMMAND, PCIM_CMD_BUSMASTEREN); err = (*pde->pe_init)(ctx, pdi, fi->fi_param); if (err == 0) @@ -966,15 +971,23 @@ pci_emul_add_pciecap(struct pci_devinst *pi, int type) int err; struct pciecap pciecap; - if (type != PCIEM_TYPE_ROOT_PORT) - return (-1); - bzero(&pciecap, sizeof(pciecap)); + /* + * Use the integrated endpoint type for endpoints on a root complex bus. + * + * NB: bhyve currently only supports a single PCI bus that is the root + * complex bus, so all endpoints are integrated. + */ + if ((type == PCIEM_TYPE_ENDPOINT) && (pi->pi_bus == 0)) + type = PCIEM_TYPE_ROOT_INT_EP; + pciecap.capid = PCIY_EXPRESS; - pciecap.pcie_capabilities = PCIECAP_VERSION | PCIEM_TYPE_ROOT_PORT; - pciecap.link_capabilities = 0x411; /* gen1, x1 */ - pciecap.link_status = 0x11; /* gen1, x1 */ + pciecap.pcie_capabilities = PCIECAP_VERSION | type; + if (type != PCIEM_TYPE_ROOT_INT_EP) { + pciecap.link_capabilities = 0x411; /* gen1, x1 */ + pciecap.link_status = 0x11; /* gen1, x1 */ + } err = pci_emul_add_capability(pi, (u_char *)&pciecap, sizeof(pciecap)); return (err); @@ -1697,31 +1710,18 @@ pci_emul_hdrtype_fixup(int bus, int slot, int off, int bytes, uint32_t *rv) } } -static void -pci_emul_cmdsts_write(struct pci_devinst *pi, int coff, uint32_t new, int bytes) +/* + * Update device state in response to changes to the PCI command + * register. + */ +void +pci_emul_cmd_changed(struct pci_devinst *pi, uint16_t old) { - int i, rshift; - uint32_t cmd, cmd2, changed, old, readonly; - - cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); /* stash old value */ - - /* - * From PCI Local Bus Specification 3.0 sections 6.2.2 and 6.2.3. - * - * XXX Bits 8, 11, 12, 13, 14 and 15 in the status register are - * 'write 1 to clear'. However these bits are not set to '1' by - * any device emulation so it is simpler to treat them as readonly. - */ - rshift = (coff & 0x3) * 8; - readonly = 0xFFFFF880 >> rshift; - - old = CFGREAD(pi, coff, bytes); - new &= ~readonly; - new |= (old & readonly); - CFGWRITE(pi, coff, new, bytes); /* update config */ + int i; + uint16_t changed, new; - cmd2 = pci_get_cfgdata16(pi, PCIR_COMMAND); /* get updated value */ - changed = cmd ^ cmd2; + new = pci_get_cfgdata16(pi, PCIR_COMMAND); + changed = old ^ new; /* * If the MMIO or I/O address space decoding has changed then @@ -1735,7 +1735,7 @@ pci_emul_cmdsts_write(struct pci_devinst *pi, int coff, uint32_t new, int bytes) case PCIBAR_IO: /* I/O address space decoding changed? */ if (changed & PCIM_CMD_PORTEN) { - if (porten(pi)) + if (new & PCIM_CMD_PORTEN) register_bar(pi, i); else unregister_bar(pi, i); @@ -1745,7 +1745,7 @@ pci_emul_cmdsts_write(struct pci_devinst *pi, int coff, uint32_t new, int bytes) case PCIBAR_MEM64: /* MMIO address space decoding changed? 
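 * If so, register or unregister the BAR so the emulated decode state
 * follows the new PCIM_CMD_MEMEN setting.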
*/ if (changed & PCIM_CMD_MEMEN) { - if (memen(pi)) + if (new & PCIM_CMD_MEMEN) register_bar(pi, i); else unregister_bar(pi, i); @@ -1763,6 +1763,32 @@ pci_emul_cmdsts_write(struct pci_devinst *pi, int coff, uint32_t new, int bytes) pci_lintr_update(pi); } +static void +pci_emul_cmdsts_write(struct pci_devinst *pi, int coff, uint32_t new, int bytes) +{ + int rshift; + uint32_t cmd, old, readonly; + + cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); /* stash old value */ + + /* + * From PCI Local Bus Specification 3.0 sections 6.2.2 and 6.2.3. + * + * XXX Bits 8, 11, 12, 13, 14 and 15 in the status register are + * 'write 1 to clear'. However these bits are not set to '1' by + * any device emulation so it is simpler to treat them as readonly. + */ + rshift = (coff & 0x3) * 8; + readonly = 0xFFFFF880 >> rshift; + + old = CFGREAD(pi, coff, bytes); + new &= ~readonly; + new |= (old & readonly); + CFGWRITE(pi, coff, new, bytes); /* update config */ + + pci_emul_cmd_changed(pi, cmd); +} + static void pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func, int coff, int bytes, uint32_t *eax) diff --git a/usr/src/cmd/bhyve/pci_emul.h b/usr/src/cmd/bhyve/pci_emul.h index 0053caed99..51de897543 100644 --- a/usr/src/cmd/bhyve/pci_emul.h +++ b/usr/src/cmd/bhyve/pci_emul.h @@ -230,6 +230,7 @@ int pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, uint64_t hostbase, enum pcibar_type type, uint64_t size); int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum); int pci_emul_add_pciecap(struct pci_devinst *pi, int pcie_device_type); +void pci_emul_cmd_changed(struct pci_devinst *pi, uint16_t old); void pci_generate_msi(struct pci_devinst *pi, int msgnum); void pci_generate_msix(struct pci_devinst *pi, int msgnum); void pci_lintr_assert(struct pci_devinst *pi); diff --git a/usr/src/cmd/bhyve/pci_fbuf.c b/usr/src/cmd/bhyve/pci_fbuf.c index 8d24dde9da..1b2eb03b9b 100644 --- a/usr/src/cmd/bhyve/pci_fbuf.c +++ b/usr/src/cmd/bhyve/pci_fbuf.c @@ -229,15 +229,13 @@ pci_fbuf_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, static int pci_fbuf_parse_opts(struct pci_fbuf_softc *sc, char *opts) { - char *uopts, *xopts, *config; + char *uopts, *uoptsbak, *xopts, *config; char *tmpstr; int ret; ret = 0; - uopts = strdup(opts); - for (xopts = strtok(uopts, ","); - xopts != NULL; - xopts = strtok(NULL, ",")) { + uoptsbak = uopts = strdup(opts); + while ((xopts = strsep(&uopts, ",")) != NULL) { if (strcmp(xopts, "wait") == 0) { sc->rfb_wait = 1; continue; @@ -264,7 +262,7 @@ pci_fbuf_parse_opts(struct pci_fbuf_softc *sc, char *opts) if (config) { if (tmpstr[0] == '[') tmpstr++; - sc->rfb_host = tmpstr; + sc->rfb_host = strdup(tmpstr); if (config[0] == ':') config++; else { @@ -280,12 +278,12 @@ pci_fbuf_parse_opts(struct pci_fbuf_softc *sc, char *opts) sc->rfb_port = atoi(tmpstr); else { sc->rfb_port = atoi(config); - sc->rfb_host = tmpstr; + sc->rfb_host = strdup(tmpstr); } } #ifndef __FreeBSD__ } else if (!strcmp(xopts, "unix")) { - sc->rfb_unix = config; + sc->rfb_unix = strdup(config); #endif } else if (!strcmp(xopts, "vga")) { if (!strcmp(config, "off")) { @@ -318,7 +316,7 @@ pci_fbuf_parse_opts(struct pci_fbuf_softc *sc, char *opts) } else if (sc->memregs.height == 0) sc->memregs.height = 1080; } else if (!strcmp(xopts, "password")) { - sc->rfb_password = config; + sc->rfb_password = strdup(config); } else { pci_fbuf_usage(xopts); ret = -1; @@ -327,6 +325,7 @@ pci_fbuf_parse_opts(struct pci_fbuf_softc *sc, char *opts) } done: + free(uoptsbak); return (ret); } diff --git 
a/usr/src/cmd/bhyve/pci_hda.c b/usr/src/cmd/bhyve/pci_hda.c new file mode 100644 index 0000000000..e0324f46a9 --- /dev/null +++ b/usr/src/cmd/bhyve/pci_hda.c @@ -0,0 +1,1331 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 Alex Teaca + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#include "pci_hda.h" +#include "bhyverun.h" +#include "pci_emul.h" +#include "hdac_reg.h" + +/* + * HDA defines + */ +#define PCIR_HDCTL 0x40 +#define INTEL_VENDORID 0x8086 +#define HDA_INTEL_82801G 0x27d8 + +#define HDA_IOSS_NO 0x08 +#define HDA_OSS_NO 0x04 +#define HDA_ISS_NO 0x04 +#define HDA_CODEC_MAX 0x0f +#define HDA_LAST_OFFSET \ + (0x2084 + ((HDA_ISS_NO) * 0x20) + ((HDA_OSS_NO) * 0x20)) +#define HDA_SET_REG_TABLE_SZ \ + (0x80 + ((HDA_ISS_NO) * 0x20) + ((HDA_OSS_NO) * 0x20)) +#define HDA_CORB_ENTRY_LEN 0x04 +#define HDA_RIRB_ENTRY_LEN 0x08 +#define HDA_BDL_ENTRY_LEN 0x10 +#define HDA_DMA_PIB_ENTRY_LEN 0x08 +#define HDA_STREAM_TAGS_CNT 0x10 +#define HDA_STREAM_REGS_BASE 0x80 +#define HDA_STREAM_REGS_LEN 0x20 + +#define HDA_DMA_ACCESS_LEN (sizeof(uint32_t)) +#define HDA_BDL_MAX_LEN 0x0100 + +#define HDAC_SDSTS_FIFORDY (1 << 5) + +#define HDA_RIRBSTS_IRQ_MASK (HDAC_RIRBSTS_RINTFL | HDAC_RIRBSTS_RIRBOIS) +#define HDA_STATESTS_IRQ_MASK ((1 << HDA_CODEC_MAX) - 1) +#define HDA_SDSTS_IRQ_MASK \ + (HDAC_SDSTS_DESE | HDAC_SDSTS_FIFOE | HDAC_SDSTS_BCIS) + +/* + * HDA data structures + */ + +struct hda_softc; + +typedef void (*hda_set_reg_handler)(struct hda_softc *sc, uint32_t offset, + uint32_t old); + +struct hda_bdle { + uint32_t addrl; + uint32_t addrh; + uint32_t len; + uint32_t ioc; +} __packed; + +struct hda_bdle_desc { + void *addr; + uint8_t ioc; + uint32_t len; +}; + +struct hda_codec_cmd_ctl { + char *name; + void *dma_vaddr; + uint8_t run; + uint16_t rp; + uint16_t size; + uint16_t wp; +}; + +struct hda_stream_desc { + uint8_t dir; + uint8_t run; + uint8_t stream; + + /* bp is the no. of bytes transferred in the current bdle */ + uint32_t bp; + /* be is the no. 
of bdles transferred in the bdl */ + uint32_t be; + + uint32_t bdl_cnt; + struct hda_bdle_desc bdl[HDA_BDL_MAX_LEN]; +}; + +struct hda_softc { + struct pci_devinst *pci_dev; + uint32_t regs[HDA_LAST_OFFSET]; + + uint8_t lintr; + uint8_t rirb_cnt; + uint64_t wall_clock_start; + + struct hda_codec_cmd_ctl corb; + struct hda_codec_cmd_ctl rirb; + + uint8_t codecs_no; + struct hda_codec_inst *codecs[HDA_CODEC_MAX]; + + /* Base Address of the DMA Position Buffer */ + void *dma_pib_vaddr; + + struct hda_stream_desc streams[HDA_IOSS_NO]; + /* 2 tables for output and input */ + uint8_t stream_map[2][HDA_STREAM_TAGS_CNT]; +}; + +/* + * HDA module function declarations + */ +static inline void hda_set_reg_by_offset(struct hda_softc *sc, uint32_t offset, + uint32_t value); +static inline uint32_t hda_get_reg_by_offset(struct hda_softc *sc, + uint32_t offset); +static inline void hda_set_field_by_offset(struct hda_softc *sc, + uint32_t offset, uint32_t mask, uint32_t value); + +static uint8_t hda_parse_config(const char *opts, const char *key, char *val); +static struct hda_softc *hda_init(const char *opts); +static void hda_update_intr(struct hda_softc *sc); +static void hda_response_interrupt(struct hda_softc *sc); +static int hda_codec_constructor(struct hda_softc *sc, + struct hda_codec_class *codec, const char *play, const char *rec, + const char *opts); +static struct hda_codec_class *hda_find_codec_class(const char *name); + +static int hda_send_command(struct hda_softc *sc, uint32_t verb); +static int hda_notify_codecs(struct hda_softc *sc, uint8_t run, + uint8_t stream, uint8_t dir); +static void hda_reset(struct hda_softc *sc); +static void hda_reset_regs(struct hda_softc *sc); +static void hda_stream_reset(struct hda_softc *sc, uint8_t stream_ind); +static int hda_stream_start(struct hda_softc *sc, uint8_t stream_ind); +static int hda_stream_stop(struct hda_softc *sc, uint8_t stream_ind); +static uint32_t hda_read(struct hda_softc *sc, uint32_t offset); +static int hda_write(struct hda_softc *sc, uint32_t offset, uint8_t size, + uint32_t value); + +static inline void hda_print_cmd_ctl_data(struct hda_codec_cmd_ctl *p); +static int hda_corb_start(struct hda_softc *sc); +static int hda_corb_run(struct hda_softc *sc); +static int hda_rirb_start(struct hda_softc *sc); + +static void *hda_dma_get_vaddr(struct hda_softc *sc, uint64_t dma_paddr, + size_t len); +static void hda_dma_st_dword(void *dma_vaddr, uint32_t data); +static uint32_t hda_dma_ld_dword(void *dma_vaddr); + +static inline uint8_t hda_get_stream_by_offsets(uint32_t offset, + uint8_t reg_offset); +static inline uint32_t hda_get_offset_stream(uint8_t stream_ind); + +static void hda_set_gctl(struct hda_softc *sc, uint32_t offset, uint32_t old); +static void hda_set_statests(struct hda_softc *sc, uint32_t offset, + uint32_t old); +static void hda_set_corbwp(struct hda_softc *sc, uint32_t offset, uint32_t old); +static void hda_set_corbctl(struct hda_softc *sc, uint32_t offset, + uint32_t old); +static void hda_set_rirbctl(struct hda_softc *sc, uint32_t offset, + uint32_t old); +static void hda_set_rirbsts(struct hda_softc *sc, uint32_t offset, + uint32_t old); +static void hda_set_dpiblbase(struct hda_softc *sc, uint32_t offset, + uint32_t old); +static void hda_set_sdctl(struct hda_softc *sc, uint32_t offset, uint32_t old); +static void hda_set_sdctl2(struct hda_softc *sc, uint32_t offset, uint32_t old); +static void hda_set_sdsts(struct hda_softc *sc, uint32_t offset, uint32_t old); + +static int hda_signal_state_change(struct 
hda_codec_inst *hci); +static int hda_response(struct hda_codec_inst *hci, uint32_t response, + uint8_t unsol); +static int hda_transfer(struct hda_codec_inst *hci, uint8_t stream, + uint8_t dir, void *buf, size_t count); + +static void hda_set_pib(struct hda_softc *sc, uint8_t stream_ind, uint32_t pib); +static uint64_t hda_get_clock_ns(void); + +/* + * PCI HDA function declarations + */ +static int pci_hda_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts); +static void pci_hda_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value); +static uint64_t pci_hda_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size); +/* + * HDA global data + */ + +static const hda_set_reg_handler hda_set_reg_table[] = { + [HDAC_GCTL] = hda_set_gctl, + [HDAC_STATESTS] = hda_set_statests, + [HDAC_CORBWP] = hda_set_corbwp, + [HDAC_CORBCTL] = hda_set_corbctl, + [HDAC_RIRBCTL] = hda_set_rirbctl, + [HDAC_RIRBSTS] = hda_set_rirbsts, + [HDAC_DPIBLBASE] = hda_set_dpiblbase, + +#define HDAC_ISTREAM(n, iss, oss) \ + [_HDAC_ISDCTL(n, iss, oss)] = hda_set_sdctl, \ + [_HDAC_ISDCTL(n, iss, oss) + 2] = hda_set_sdctl2, \ + [_HDAC_ISDSTS(n, iss, oss)] = hda_set_sdsts, \ + +#define HDAC_OSTREAM(n, iss, oss) \ + [_HDAC_OSDCTL(n, iss, oss)] = hda_set_sdctl, \ + [_HDAC_OSDCTL(n, iss, oss) + 2] = hda_set_sdctl2, \ + [_HDAC_OSDSTS(n, iss, oss)] = hda_set_sdsts, \ + + HDAC_ISTREAM(0, HDA_ISS_NO, HDA_OSS_NO) + HDAC_ISTREAM(1, HDA_ISS_NO, HDA_OSS_NO) + HDAC_ISTREAM(2, HDA_ISS_NO, HDA_OSS_NO) + HDAC_ISTREAM(3, HDA_ISS_NO, HDA_OSS_NO) + + HDAC_OSTREAM(0, HDA_ISS_NO, HDA_OSS_NO) + HDAC_OSTREAM(1, HDA_ISS_NO, HDA_OSS_NO) + HDAC_OSTREAM(2, HDA_ISS_NO, HDA_OSS_NO) + HDAC_OSTREAM(3, HDA_ISS_NO, HDA_OSS_NO) + + [HDA_SET_REG_TABLE_SZ] = NULL, +}; + +static const uint16_t hda_corb_sizes[] = { + [HDAC_CORBSIZE_CORBSIZE_2] = 2, + [HDAC_CORBSIZE_CORBSIZE_16] = 16, + [HDAC_CORBSIZE_CORBSIZE_256] = 256, + [HDAC_CORBSIZE_CORBSIZE_MASK] = 0, +}; + +static const uint16_t hda_rirb_sizes[] = { + [HDAC_RIRBSIZE_RIRBSIZE_2] = 2, + [HDAC_RIRBSIZE_RIRBSIZE_16] = 16, + [HDAC_RIRBSIZE_RIRBSIZE_256] = 256, + [HDAC_RIRBSIZE_RIRBSIZE_MASK] = 0, +}; + +static struct hda_ops hops = { + .signal = hda_signal_state_change, + .response = hda_response, + .transfer = hda_transfer, +}; + +struct pci_devemu pci_de_hda = { + .pe_emu = "hda", + .pe_init = pci_hda_init, + .pe_barwrite = pci_hda_write, + .pe_barread = pci_hda_read +}; + +PCI_EMUL_SET(pci_de_hda); + +SET_DECLARE(hda_codec_class_set, struct hda_codec_class); + +#if DEBUG_HDA == 1 +FILE *dbg; +#endif + +/* + * HDA module function definitions + */ + +static inline void +hda_set_reg_by_offset(struct hda_softc *sc, uint32_t offset, uint32_t value) +{ + assert(offset < HDA_LAST_OFFSET); + sc->regs[offset] = value; +} + +static inline uint32_t +hda_get_reg_by_offset(struct hda_softc *sc, uint32_t offset) +{ + assert(offset < HDA_LAST_OFFSET); + return sc->regs[offset]; +} + +static inline void +hda_set_field_by_offset(struct hda_softc *sc, uint32_t offset, + uint32_t mask, uint32_t value) +{ + uint32_t reg_value = 0; + + reg_value = hda_get_reg_by_offset(sc, offset); + + reg_value &= ~mask; + reg_value |= (value & mask); + + hda_set_reg_by_offset(sc, offset, reg_value); +} + +static uint8_t +hda_parse_config(const char *opts, const char *key, char *val) +{ + char buf[64]; + char *s = buf; + char *tmp = NULL; + size_t len; + int i; + + if (!opts) + return (0); + + len = strlen(opts); + if (len >= 
sizeof(buf)) {
+ DPRINTF("Opts too big\n");
+ return (0);
+ }
+
+ DPRINTF("opts: %s\n", opts);
+
+ strcpy(buf, opts);
+
+ for (i = 0; i < len; i++)
+ if (buf[i] == ',') {
+ buf[i] = 0;
+ tmp = buf + i + 1;
+ break;
+ }
+
+ if (!memcmp(s, key, strlen(key))) {
+ strncpy(val, s + strlen(key), 64);
+ return (1);
+ }
+
+ if (!tmp)
+ return (0);
+
+ s = tmp;
+ if (!memcmp(s, key, strlen(key))) {
+ strncpy(val, s + strlen(key), 64);
+ return (1);
+ }
+
+ return (0);
+}
+
+static struct hda_softc *
+hda_init(const char *opts)
+{
+ struct hda_softc *sc = NULL;
+ struct hda_codec_class *codec = NULL;
+ char play[64];
+ char rec[64];
+ int err, p, r;
+
+#if DEBUG_HDA == 1
+ dbg = fopen("/tmp/bhyve_hda.log", "w+");
+#endif
+
+ DPRINTF("opts: %s\n", opts);
+
+ sc = calloc(1, sizeof(*sc));
+ if (!sc)
+ return (NULL);
+
+ hda_reset_regs(sc);
+
+ /*
+ * TODO: search for all the codecs declared in opts.
+ * For now we attach a single codec.
+ */
+ codec = hda_find_codec_class("hda_codec");
+ if (codec) {
+ p = hda_parse_config(opts, "play=", play);
+ r = hda_parse_config(opts, "rec=", rec);
+ DPRINTF("play: %s rec: %s\n", play, rec);
+ if (p | r) {
+ err = hda_codec_constructor(sc, codec, p ? \
+ play : NULL, r ? rec : NULL, NULL);
+ assert(!err);
+ }
+ }
+
+ return (sc);
+}
+
+static void
+hda_update_intr(struct hda_softc *sc)
+{
+ struct pci_devinst *pi = sc->pci_dev;
+ uint32_t intctl = hda_get_reg_by_offset(sc, HDAC_INTCTL);
+ uint32_t intsts = 0;
+ uint32_t sdsts = 0;
+ uint32_t rirbsts = 0;
+ uint32_t wakeen = 0;
+ uint32_t statests = 0;
+ uint32_t off = 0;
+ int i;
+
+ /* update the CIS bits */
+ rirbsts = hda_get_reg_by_offset(sc, HDAC_RIRBSTS);
+ if (rirbsts & (HDAC_RIRBSTS_RINTFL | HDAC_RIRBSTS_RIRBOIS))
+ intsts |= HDAC_INTSTS_CIS;
+
+ wakeen = hda_get_reg_by_offset(sc, HDAC_WAKEEN);
+ statests = hda_get_reg_by_offset(sc, HDAC_STATESTS);
+ if (statests & wakeen)
+ intsts |= HDAC_INTSTS_CIS;
+
+ /* update the SIS bits */
+ for (i = 0; i < HDA_IOSS_NO; i++) {
+ off = hda_get_offset_stream(i);
+ sdsts = hda_get_reg_by_offset(sc, off + HDAC_SDSTS);
+ if (sdsts & HDAC_SDSTS_BCIS)
+ intsts |= (1 << i);
+ }
+
+ /* update the GIS bit */
+ if (intsts)
+ intsts |= HDAC_INTSTS_GIS;
+
+ hda_set_reg_by_offset(sc, HDAC_INTSTS, intsts);
+
+ if ((intctl & HDAC_INTCTL_GIE) && ((intsts & \
+ ~HDAC_INTSTS_GIS) & intctl)) {
+ if (!sc->lintr) {
+ pci_lintr_assert(pi);
+ sc->lintr = 1;
+ }
+ } else {
+ if (sc->lintr) {
+ pci_lintr_deassert(pi);
+ sc->lintr = 0;
+ }
+ }
+}
+
+static void
+hda_response_interrupt(struct hda_softc *sc)
+{
+ uint8_t rirbctl = hda_get_reg_by_offset(sc, HDAC_RIRBCTL);
+
+ if ((rirbctl & HDAC_RIRBCTL_RINTCTL) && sc->rirb_cnt) {
+ sc->rirb_cnt = 0;
+ hda_set_field_by_offset(sc, HDAC_RIRBSTS, HDAC_RIRBSTS_RINTFL,
+ HDAC_RIRBSTS_RINTFL);
+ hda_update_intr(sc);
+ }
+}
+
+static int
+hda_codec_constructor(struct hda_softc *sc, struct hda_codec_class *codec,
+ const char *play, const char *rec, const char *opts)
+{
+ struct hda_codec_inst *hci = NULL;
+
+ if (sc->codecs_no >= HDA_CODEC_MAX)
+ return (-1);
+
+ hci = calloc(1, sizeof(struct hda_codec_inst));
+ if (!hci)
+ return (-1);
+
+ hci->hda = sc;
+ hci->hops = &hops;
+ hci->cad = sc->codecs_no;
+ hci->codec = codec;
+
+ sc->codecs[sc->codecs_no++] = hci;
+
+ if (!codec->init) {
+ DPRINTF("This codec does not implement the init function\n");
+ return (-1);
+ }
+
+ return (codec->init(hci, play, rec, opts));
+}
+
+static struct hda_codec_class *
+hda_find_codec_class(const char *name)
+{
+ struct hda_codec_class **pdpp = NULL, *pdp = NULL;
+
+ 
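+ /* Walk the hda_codec_class_set linker set for a class with this name. */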
SET_FOREACH(pdpp, hda_codec_class_set) { + pdp = *pdpp; + if (!strcmp(pdp->name, name)) { + return (pdp); + } + } + + return (NULL); +} + +static int +hda_send_command(struct hda_softc *sc, uint32_t verb) +{ + struct hda_codec_inst *hci = NULL; + struct hda_codec_class *codec = NULL; + uint8_t cad = (verb >> HDA_CMD_CAD_SHIFT) & 0x0f; + + hci = sc->codecs[cad]; + if (!hci) + return (-1); + + DPRINTF("cad: 0x%x verb: 0x%x\n", cad, verb); + + codec = hci->codec; + assert(codec); + + if (!codec->command) { + DPRINTF("This codec does not implement the command function\n"); + return (-1); + } + + return (codec->command(hci, verb)); +} + +static int +hda_notify_codecs(struct hda_softc *sc, uint8_t run, uint8_t stream, + uint8_t dir) +{ + struct hda_codec_inst *hci = NULL; + struct hda_codec_class *codec = NULL; + int err; + int i; + + /* Notify each codec */ + for (i = 0; i < sc->codecs_no; i++) { + hci = sc->codecs[i]; + assert(hci); + + codec = hci->codec; + assert(codec); + + if (codec->notify) { + err = codec->notify(hci, run, stream, dir); + if (!err) + break; + } + } + + return (i == sc->codecs_no ? (-1) : 0); +} + +static void +hda_reset(struct hda_softc *sc) +{ + int i; + struct hda_codec_inst *hci = NULL; + struct hda_codec_class *codec = NULL; + + hda_reset_regs(sc); + + /* Reset each codec */ + for (i = 0; i < sc->codecs_no; i++) { + hci = sc->codecs[i]; + assert(hci); + + codec = hci->codec; + assert(codec); + + if (codec->reset) + codec->reset(hci); + } + + sc->wall_clock_start = hda_get_clock_ns(); +} + +static void +hda_reset_regs(struct hda_softc *sc) +{ + uint32_t off = 0; + uint8_t i; + + DPRINTF("Reset the HDA controller registers ...\n"); + + memset(sc->regs, 0, sizeof(sc->regs)); + + hda_set_reg_by_offset(sc, HDAC_GCAP, + HDAC_GCAP_64OK | + (HDA_ISS_NO << HDAC_GCAP_ISS_SHIFT) | + (HDA_OSS_NO << HDAC_GCAP_OSS_SHIFT)); + hda_set_reg_by_offset(sc, HDAC_VMAJ, 0x01); + hda_set_reg_by_offset(sc, HDAC_OUTPAY, 0x3c); + hda_set_reg_by_offset(sc, HDAC_INPAY, 0x1d); + hda_set_reg_by_offset(sc, HDAC_CORBSIZE, + HDAC_CORBSIZE_CORBSZCAP_256 | HDAC_CORBSIZE_CORBSIZE_256); + hda_set_reg_by_offset(sc, HDAC_RIRBSIZE, + HDAC_RIRBSIZE_RIRBSZCAP_256 | HDAC_RIRBSIZE_RIRBSIZE_256); + + for (i = 0; i < HDA_IOSS_NO; i++) { + off = hda_get_offset_stream(i); + hda_set_reg_by_offset(sc, off + HDAC_SDFIFOS, HDA_FIFO_SIZE); + } +} + +static void +hda_stream_reset(struct hda_softc *sc, uint8_t stream_ind) +{ + struct hda_stream_desc *st = &sc->streams[stream_ind]; + uint32_t off = hda_get_offset_stream(stream_ind); + + DPRINTF("Reset the HDA stream: 0x%x\n", stream_ind); + + /* Reset the Stream Descriptor registers */ + memset(sc->regs + HDA_STREAM_REGS_BASE + off, 0, HDA_STREAM_REGS_LEN); + + /* Reset the Stream Descriptor */ + memset(st, 0, sizeof(*st)); + + hda_set_field_by_offset(sc, off + HDAC_SDSTS, + HDAC_SDSTS_FIFORDY, HDAC_SDSTS_FIFORDY); + hda_set_field_by_offset(sc, off + HDAC_SDCTL0, + HDAC_SDCTL_SRST, HDAC_SDCTL_SRST); +} + +static int +hda_stream_start(struct hda_softc *sc, uint8_t stream_ind) +{ + struct hda_stream_desc *st = &sc->streams[stream_ind]; + struct hda_bdle_desc *bdle_desc = NULL; + struct hda_bdle *bdle = NULL; + uint32_t lvi = 0; + uint32_t bdl_cnt = 0; + uint64_t bdpl = 0; + uint64_t bdpu = 0; + uint64_t bdl_paddr = 0; + void *bdl_vaddr = NULL; + uint32_t bdle_sz = 0; + uint64_t bdle_addrl = 0; + uint64_t bdle_addrh = 0; + uint64_t bdle_paddr = 0; + void *bdle_vaddr = NULL; + uint32_t off = hda_get_offset_stream(stream_ind); + uint32_t sdctl = 0; + uint8_t strm = 0; + uint8_t 
dir = 0; + int i; + + assert(!st->run); + + lvi = hda_get_reg_by_offset(sc, off + HDAC_SDLVI); + bdpl = hda_get_reg_by_offset(sc, off + HDAC_SDBDPL); + bdpu = hda_get_reg_by_offset(sc, off + HDAC_SDBDPU); + + bdl_cnt = lvi + 1; + assert(bdl_cnt <= HDA_BDL_MAX_LEN); + + bdl_paddr = bdpl | (bdpu << 32); + bdl_vaddr = hda_dma_get_vaddr(sc, bdl_paddr, + HDA_BDL_ENTRY_LEN * bdl_cnt); + if (!bdl_vaddr) { + DPRINTF("Fail to get the guest virtual address\n"); + return (-1); + } + + DPRINTF("stream: 0x%x bdl_cnt: 0x%x bdl_paddr: 0x%lx\n", + stream_ind, bdl_cnt, bdl_paddr); + + st->bdl_cnt = bdl_cnt; + + bdle = (struct hda_bdle *)bdl_vaddr; + for (i = 0; i < bdl_cnt; i++, bdle++) { + bdle_sz = bdle->len; + assert(!(bdle_sz % HDA_DMA_ACCESS_LEN)); + + bdle_addrl = bdle->addrl; + bdle_addrh = bdle->addrh; + + bdle_paddr = bdle_addrl | (bdle_addrh << 32); + bdle_vaddr = hda_dma_get_vaddr(sc, bdle_paddr, bdle_sz); + if (!bdle_vaddr) { + DPRINTF("Fail to get the guest virtual address\n"); + return (-1); + } + + bdle_desc = &st->bdl[i]; + bdle_desc->addr = bdle_vaddr; + bdle_desc->len = bdle_sz; + bdle_desc->ioc = bdle->ioc; + + DPRINTF("bdle: 0x%x bdle_sz: 0x%x\n", i, bdle_sz); + } + + sdctl = hda_get_reg_by_offset(sc, off + HDAC_SDCTL0); + strm = (sdctl >> 20) & 0x0f; + dir = stream_ind >= HDA_ISS_NO; + + DPRINTF("strm: 0x%x, dir: 0x%x\n", strm, dir); + + sc->stream_map[dir][strm] = stream_ind; + st->stream = strm; + st->dir = dir; + st->bp = 0; + st->be = 0; + + hda_set_pib(sc, stream_ind, 0); + + st->run = 1; + + hda_notify_codecs(sc, 1, strm, dir); + + return (0); +} + +static int +hda_stream_stop(struct hda_softc *sc, uint8_t stream_ind) +{ + struct hda_stream_desc *st = &sc->streams[stream_ind]; + uint8_t strm = st->stream; + uint8_t dir = st->dir; + + DPRINTF("stream: 0x%x, strm: 0x%x, dir: 0x%x\n", stream_ind, strm, dir); + + st->run = 0; + + hda_notify_codecs(sc, 0, strm, dir); + + return (0); +} + +static uint32_t +hda_read(struct hda_softc *sc, uint32_t offset) +{ + if (offset == HDAC_WALCLK) + return (24 * (hda_get_clock_ns() - \ + sc->wall_clock_start) / 1000); + + return (hda_get_reg_by_offset(sc, offset)); +} + +static int +hda_write(struct hda_softc *sc, uint32_t offset, uint8_t size, uint32_t value) +{ + uint32_t old = hda_get_reg_by_offset(sc, offset); + uint32_t masks[] = {0x00000000, 0x000000ff, 0x0000ffff, + 0x00ffffff, 0xffffffff}; + hda_set_reg_handler set_reg_handler = hda_set_reg_table[offset]; + + hda_set_field_by_offset(sc, offset, masks[size], value); + + if (set_reg_handler) + set_reg_handler(sc, offset, old); + + return (0); +} + +static inline void +hda_print_cmd_ctl_data(struct hda_codec_cmd_ctl *p) +{ +#if DEBUG_HDA == 1 + char *name = p->name; +#endif + DPRINTF("%s size: %d\n", name, p->size); + DPRINTF("%s dma_vaddr: %p\n", name, p->dma_vaddr); + DPRINTF("%s wp: 0x%x\n", name, p->wp); + DPRINTF("%s rp: 0x%x\n", name, p->rp); +} + +static int +hda_corb_start(struct hda_softc *sc) +{ + struct hda_codec_cmd_ctl *corb = &sc->corb; + uint8_t corbsize = 0; + uint64_t corblbase = 0; + uint64_t corbubase = 0; + uint64_t corbpaddr = 0; + + corb->name = "CORB"; + + corbsize = hda_get_reg_by_offset(sc, HDAC_CORBSIZE) & \ + HDAC_CORBSIZE_CORBSIZE_MASK; + corb->size = hda_corb_sizes[corbsize]; + + if (!corb->size) { + DPRINTF("Invalid corb size\n"); + return (-1); + } + + corblbase = hda_get_reg_by_offset(sc, HDAC_CORBLBASE); + corbubase = hda_get_reg_by_offset(sc, HDAC_CORBUBASE); + + corbpaddr = corblbase | (corbubase << 32); + DPRINTF("CORB dma_paddr: %p\n", (void *)corbpaddr); + 
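+ /* Map the whole CORB up front; verbs are then read directly from guest memory. */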
+ corb->dma_vaddr = hda_dma_get_vaddr(sc, corbpaddr, + HDA_CORB_ENTRY_LEN * corb->size); + if (!corb->dma_vaddr) { + DPRINTF("Fail to get the guest virtual address\n"); + return (-1); + } + + corb->wp = hda_get_reg_by_offset(sc, HDAC_CORBWP); + corb->rp = hda_get_reg_by_offset(sc, HDAC_CORBRP); + + corb->run = 1; + + hda_print_cmd_ctl_data(corb); + + return (0); +} + +static int +hda_corb_run(struct hda_softc *sc) +{ + struct hda_codec_cmd_ctl *corb = &sc->corb; + uint32_t verb = 0; + int err; + + corb->wp = hda_get_reg_by_offset(sc, HDAC_CORBWP); + + while (corb->rp != corb->wp && corb->run) { + corb->rp++; + corb->rp %= corb->size; + + verb = hda_dma_ld_dword(corb->dma_vaddr + \ + HDA_CORB_ENTRY_LEN * corb->rp); + + err = hda_send_command(sc, verb); + assert(!err); + } + + hda_set_reg_by_offset(sc, HDAC_CORBRP, corb->rp); + + if (corb->run) + hda_response_interrupt(sc); + + return (0); +} + +static int +hda_rirb_start(struct hda_softc *sc) +{ + struct hda_codec_cmd_ctl *rirb = &sc->rirb; + uint8_t rirbsize = 0; + uint64_t rirblbase = 0; + uint64_t rirbubase = 0; + uint64_t rirbpaddr = 0; + + rirb->name = "RIRB"; + + rirbsize = hda_get_reg_by_offset(sc, HDAC_RIRBSIZE) & \ + HDAC_RIRBSIZE_RIRBSIZE_MASK; + rirb->size = hda_rirb_sizes[rirbsize]; + + if (!rirb->size) { + DPRINTF("Invalid rirb size\n"); + return (-1); + } + + rirblbase = hda_get_reg_by_offset(sc, HDAC_RIRBLBASE); + rirbubase = hda_get_reg_by_offset(sc, HDAC_RIRBUBASE); + + rirbpaddr = rirblbase | (rirbubase << 32); + DPRINTF("RIRB dma_paddr: %p\n", (void *)rirbpaddr); + + rirb->dma_vaddr = hda_dma_get_vaddr(sc, rirbpaddr, + HDA_RIRB_ENTRY_LEN * rirb->size); + if (!rirb->dma_vaddr) { + DPRINTF("Fail to get the guest virtual address\n"); + return (-1); + } + + rirb->wp = hda_get_reg_by_offset(sc, HDAC_RIRBWP); + rirb->rp = 0x0000; + + rirb->run = 1; + + hda_print_cmd_ctl_data(rirb); + + return (0); +} + +static void * +hda_dma_get_vaddr(struct hda_softc *sc, uint64_t dma_paddr, size_t len) +{ + struct pci_devinst *pi = sc->pci_dev; + + assert(pi); + + return (paddr_guest2host(pi->pi_vmctx, (uintptr_t)dma_paddr, len)); +} + +static void +hda_dma_st_dword(void *dma_vaddr, uint32_t data) +{ + *(uint32_t*)dma_vaddr = data; +} + +static uint32_t +hda_dma_ld_dword(void *dma_vaddr) +{ + return (*(uint32_t*)dma_vaddr); +} + +static inline uint8_t +hda_get_stream_by_offsets(uint32_t offset, uint8_t reg_offset) +{ + uint8_t stream_ind = (offset - reg_offset) >> 5; + + assert(stream_ind < HDA_IOSS_NO); + + return (stream_ind); +} + +static inline uint32_t +hda_get_offset_stream(uint8_t stream_ind) +{ + return (stream_ind << 5); +} + +static void +hda_set_gctl(struct hda_softc *sc, uint32_t offset, uint32_t old) +{ + uint32_t value = hda_get_reg_by_offset(sc, offset); + + if (!(value & HDAC_GCTL_CRST)) { + hda_reset(sc); + } +} + +static void +hda_set_statests(struct hda_softc *sc, uint32_t offset, uint32_t old) +{ + uint32_t value = hda_get_reg_by_offset(sc, offset); + + hda_set_reg_by_offset(sc, offset, old); + + /* clear the corresponding bits written by the software (guest) */ + hda_set_field_by_offset(sc, offset, value & HDA_STATESTS_IRQ_MASK, 0); + + hda_update_intr(sc); +} + +static void +hda_set_corbwp(struct hda_softc *sc, uint32_t offset, uint32_t old) +{ + hda_corb_run(sc); +} + +static void +hda_set_corbctl(struct hda_softc *sc, uint32_t offset, uint32_t old) +{ + uint32_t value = hda_get_reg_by_offset(sc, offset); + int err; + struct hda_codec_cmd_ctl *corb = NULL; + + if (value & HDAC_CORBCTL_CORBRUN) { + if (!(old & 
HDAC_CORBCTL_CORBRUN)) { + err = hda_corb_start(sc); + assert(!err); + } + } else { + corb = &sc->corb; + memset(corb, 0, sizeof(*corb)); + } + + hda_corb_run(sc); +} + +static void +hda_set_rirbctl(struct hda_softc *sc, uint32_t offset, uint32_t old) +{ + uint32_t value = hda_get_reg_by_offset(sc, offset); + int err; + struct hda_codec_cmd_ctl *rirb = NULL; + + if (value & HDAC_RIRBCTL_RIRBDMAEN) { + err = hda_rirb_start(sc); + assert(!err); + } else { + rirb = &sc->rirb; + memset(rirb, 0, sizeof(*rirb)); + } +} + +static void +hda_set_rirbsts(struct hda_softc *sc, uint32_t offset, uint32_t old) +{ + uint32_t value = hda_get_reg_by_offset(sc, offset); + + hda_set_reg_by_offset(sc, offset, old); + + /* clear the corresponding bits written by the software (guest) */ + hda_set_field_by_offset(sc, offset, value & HDA_RIRBSTS_IRQ_MASK, 0); + + hda_update_intr(sc); +} + +static void +hda_set_dpiblbase(struct hda_softc *sc, uint32_t offset, uint32_t old) +{ + uint32_t value = hda_get_reg_by_offset(sc, offset); + uint64_t dpiblbase = 0; + uint64_t dpibubase = 0; + uint64_t dpibpaddr = 0; + + if ((value & HDAC_DPLBASE_DPLBASE_DMAPBE) != (old & \ + HDAC_DPLBASE_DPLBASE_DMAPBE)) { + if (value & HDAC_DPLBASE_DPLBASE_DMAPBE) { + dpiblbase = value & HDAC_DPLBASE_DPLBASE_MASK; + dpibubase = hda_get_reg_by_offset(sc, HDAC_DPIBUBASE); + + dpibpaddr = dpiblbase | (dpibubase << 32); + DPRINTF("DMA Position In Buffer dma_paddr: %p\n", + (void *)dpibpaddr); + + sc->dma_pib_vaddr = hda_dma_get_vaddr(sc, dpibpaddr, + HDA_DMA_PIB_ENTRY_LEN * HDA_IOSS_NO); + if (!sc->dma_pib_vaddr) { + DPRINTF("Fail to get the guest \ + virtual address\n"); + assert(0); + } + } else { + DPRINTF("DMA Position In Buffer Reset\n"); + sc->dma_pib_vaddr = NULL; + } + } +} + +static void +hda_set_sdctl(struct hda_softc *sc, uint32_t offset, uint32_t old) +{ + uint8_t stream_ind = hda_get_stream_by_offsets(offset, HDAC_SDCTL0); + uint32_t value = hda_get_reg_by_offset(sc, offset); + int err; + + DPRINTF("stream_ind: 0x%x old: 0x%x value: 0x%x\n", + stream_ind, old, value); + + if (value & HDAC_SDCTL_SRST) { + hda_stream_reset(sc, stream_ind); + } + + if ((value & HDAC_SDCTL_RUN) != (old & HDAC_SDCTL_RUN)) { + if (value & HDAC_SDCTL_RUN) { + err = hda_stream_start(sc, stream_ind); + assert(!err); + } else { + err = hda_stream_stop(sc, stream_ind); + assert(!err); + } + } +} + +static void +hda_set_sdctl2(struct hda_softc *sc, uint32_t offset, uint32_t old) +{ + uint32_t value = hda_get_reg_by_offset(sc, offset); + + hda_set_field_by_offset(sc, offset - 2, 0x00ff0000, value << 16); +} + +static void +hda_set_sdsts(struct hda_softc *sc, uint32_t offset, uint32_t old) +{ + uint32_t value = hda_get_reg_by_offset(sc, offset); + + hda_set_reg_by_offset(sc, offset, old); + + /* clear the corresponding bits written by the software (guest) */ + hda_set_field_by_offset(sc, offset, value & HDA_SDSTS_IRQ_MASK, 0); + + hda_update_intr(sc); +} + +static int +hda_signal_state_change(struct hda_codec_inst *hci) +{ + struct hda_softc *sc = NULL; + uint32_t sdiwake = 0; + + assert(hci); + assert(hci->hda); + + DPRINTF("cad: 0x%x\n", hci->cad); + + sc = hci->hda; + sdiwake = 1 << hci->cad; + + hda_set_field_by_offset(sc, HDAC_STATESTS, sdiwake, sdiwake); + hda_update_intr(sc); + + return (0); +} + +static int +hda_response(struct hda_codec_inst *hci, uint32_t response, uint8_t unsol) +{ + struct hda_softc *sc = NULL; + struct hda_codec_cmd_ctl *rirb = NULL; + uint32_t response_ex = 0; + uint8_t rintcnt = 0; + + assert(hci); + assert(hci->cad <= 
HDA_CODEC_MAX); + + response_ex = hci->cad | unsol; + + sc = hci->hda; + assert(sc); + + rirb = &sc->rirb; + + if (rirb->run) { + rirb->wp++; + rirb->wp %= rirb->size; + + hda_dma_st_dword(rirb->dma_vaddr + HDA_RIRB_ENTRY_LEN * \ + rirb->wp, response); + hda_dma_st_dword(rirb->dma_vaddr + HDA_RIRB_ENTRY_LEN * \ + rirb->wp + 0x04, response_ex); + + hda_set_reg_by_offset(sc, HDAC_RIRBWP, rirb->wp); + + sc->rirb_cnt++; + } + + rintcnt = hda_get_reg_by_offset(sc, HDAC_RINTCNT); + if (sc->rirb_cnt == rintcnt) + hda_response_interrupt(sc); + + return (0); +} + +static int +hda_transfer(struct hda_codec_inst *hci, uint8_t stream, uint8_t dir, + void *buf, size_t count) +{ + struct hda_softc *sc = NULL; + struct hda_stream_desc *st = NULL; + struct hda_bdle_desc *bdl = NULL; + struct hda_bdle_desc *bdle_desc = NULL; + uint8_t stream_ind = 0; + uint32_t lpib = 0; + uint32_t off = 0; + size_t left = 0; + uint8_t irq = 0; + + assert(hci); + assert(hci->hda); + assert(buf); + assert(!(count % HDA_DMA_ACCESS_LEN)); + + if (!stream) { + DPRINTF("Invalid stream\n"); + return (-1); + } + + sc = hci->hda; + + assert(stream < HDA_STREAM_TAGS_CNT); + stream_ind = sc->stream_map[dir][stream]; + + if (!dir) + assert(stream_ind < HDA_ISS_NO); + else + assert(stream_ind >= HDA_ISS_NO && stream_ind < HDA_IOSS_NO); + + st = &sc->streams[stream_ind]; + if (!st->run) { + DPRINTF("Stream 0x%x stopped\n", stream); + return (-1); + } + + assert(st->stream == stream); + + off = hda_get_offset_stream(stream_ind); + + lpib = hda_get_reg_by_offset(sc, off + HDAC_SDLPIB); + + bdl = st->bdl; + + assert(st->be < st->bdl_cnt); + assert(st->bp < bdl[st->be].len); + + left = count; + while (left) { + bdle_desc = &bdl[st->be]; + + if (dir) + *(uint32_t *)buf = \ + hda_dma_ld_dword(bdle_desc->addr + st->bp); + else + hda_dma_st_dword(bdle_desc->addr + st->bp, + *(uint32_t *)buf); + + buf += HDA_DMA_ACCESS_LEN; + st->bp += HDA_DMA_ACCESS_LEN; + lpib += HDA_DMA_ACCESS_LEN; + left -= HDA_DMA_ACCESS_LEN; + + if (st->bp == bdle_desc->len) { + st->bp = 0; + if (bdle_desc->ioc) + irq = 1; + st->be++; + if (st->be == st->bdl_cnt) { + st->be = 0; + lpib = 0; + } + bdle_desc = &bdl[st->be]; + } + } + + hda_set_pib(sc, stream_ind, lpib); + + if (irq) { + hda_set_field_by_offset(sc, off + HDAC_SDSTS, + HDAC_SDSTS_BCIS, HDAC_SDSTS_BCIS); + hda_update_intr(sc); + } + + return (0); +} + +static void +hda_set_pib(struct hda_softc *sc, uint8_t stream_ind, uint32_t pib) +{ + uint32_t off = hda_get_offset_stream(stream_ind); + + hda_set_reg_by_offset(sc, off + HDAC_SDLPIB, pib); + /* LPIB Alias */ + hda_set_reg_by_offset(sc, 0x2000 + off + HDAC_SDLPIB, pib); + if (sc->dma_pib_vaddr) + *(uint32_t *)(sc->dma_pib_vaddr + stream_ind * \ + HDA_DMA_PIB_ENTRY_LEN) = pib; +} + +static uint64_t hda_get_clock_ns(void) +{ + struct timespec ts; + int err; + + err = clock_gettime(CLOCK_MONOTONIC, &ts); + assert(!err); + + return (ts.tv_sec * 1000000000LL + ts.tv_nsec); +} + +/* + * PCI HDA function definitions + */ +static int +pci_hda_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + struct hda_softc *sc = NULL; + + assert(ctx != NULL); + assert(pi != NULL); + + pci_set_cfgdata16(pi, PCIR_VENDOR, INTEL_VENDORID); + pci_set_cfgdata16(pi, PCIR_DEVICE, HDA_INTEL_82801G); + + pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_MULTIMEDIA_HDA); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_MULTIMEDIA); + + /* select the Intel HDA mode */ + pci_set_cfgdata8(pi, PCIR_HDCTL, 0x01); + + /* allocate one BAR register for the Memory address offsets */ + 
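+ /* (a single 32-bit memory BAR, sized to cover the complete register file) */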
pci_emul_alloc_bar(pi, 0, PCIBAR_MEM32, HDA_LAST_OFFSET); + + /* allocate an IRQ pin for our slot */ + pci_lintr_request(pi); + + sc = hda_init(opts); + if (!sc) + return (-1); + + sc->pci_dev = pi; + pi->pi_arg = sc; + + return (0); +} + +static void +pci_hda_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value) +{ + struct hda_softc *sc = pi->pi_arg; + int err; + + assert(sc); + assert(baridx == 0); + assert(size <= 4); + + DPRINTF("offset: 0x%lx value: 0x%lx\n", offset, value); + + err = hda_write(sc, offset, size, value); + assert(!err); +} + +static uint64_t +pci_hda_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size) +{ + struct hda_softc *sc = pi->pi_arg; + uint64_t value = 0; + + assert(sc); + assert(baridx == 0); + assert(size <= 4); + + value = hda_read(sc, offset); + + DPRINTF("offset: 0x%lx value: 0x%lx\n", offset, value); + + return (value); +} diff --git a/usr/src/cmd/bhyve/pci_hda.h b/usr/src/cmd/bhyve/pci_hda.h new file mode 100644 index 0000000000..8ed050cc8f --- /dev/null +++ b/usr/src/cmd/bhyve/pci_hda.h @@ -0,0 +1,92 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 Alex Teaca + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _HDA_EMUL_H_ +#define _HDA_EMUL_H_ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "hda_reg.h" + +/* + * HDA Debug Log + */ +#define DEBUG_HDA 1 +#if DEBUG_HDA == 1 +extern FILE *dbg; +#define DPRINTF(fmt, arg...) \ +do {fprintf(dbg, "%s-%d: " fmt, __func__, __LINE__, ##arg); \ +fflush(dbg); } while (0) +#else +#define DPRINTF(fmt, arg...) 
+#endif + +#define HDA_FIFO_SIZE 0x100 + +struct hda_softc; +struct hda_codec_class; + +struct hda_codec_inst { + uint8_t cad; + struct hda_codec_class *codec; + struct hda_softc *hda; + struct hda_ops *hops; + void *priv; +}; + +struct hda_codec_class { + char *name; + int (*init)(struct hda_codec_inst *hci, const char *play, + const char *rec, const char *opts); + int (*reset)(struct hda_codec_inst *hci); + int (*command)(struct hda_codec_inst *hci, uint32_t cmd_data); + int (*notify)(struct hda_codec_inst *hci, uint8_t run, uint8_t stream, + uint8_t dir); +}; + +struct hda_ops { + int (*signal)(struct hda_codec_inst *hci); + int (*response)(struct hda_codec_inst *hci, uint32_t response, + uint8_t unsol); + int (*transfer)(struct hda_codec_inst *hci, uint8_t stream, + uint8_t dir, void *buf, size_t count); +}; + +#define HDA_EMUL_SET(x) DATA_SET(hda_codec_class_set, x); + +#endif /* _HDA_EMUL_H_ */ diff --git a/usr/src/cmd/bhyve/pci_nvme.c b/usr/src/cmd/bhyve/pci_nvme.c index a56c1d6959..3e6e469ed1 100644 --- a/usr/src/cmd/bhyve/pci_nvme.c +++ b/usr/src/cmd/bhyve/pci_nvme.c @@ -4,6 +4,9 @@ * Copyright (c) 2017 Shunsuke Mie * Copyright (c) 2018 Leon Dang * + * Function crc16 Copyright (c) 2017, Fedor Uporov + * Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -30,7 +33,7 @@ * bhyve PCIe-NVMe device emulation. * * options: - * -s ,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z + * -s ,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=# * * accepted devpath: * /dev/blockdev @@ -42,6 +45,7 @@ * ioslots = max number of concurrent io requests * sectsz = sector size (defaults to blockif sector size) * ser = serial number (20-chars max) + * eui64 = IEEE Extended Unique Identifier (8 byte value) * */ @@ -54,6 +58,10 @@ __FBSDID("$FreeBSD$"); #include +#include +#ifndef __FreeBSD__ +#include +#endif #include #include @@ -94,6 +102,10 @@ static int nvme_debug = 0; #define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t)) #define NVME_MAX_BLOCKIOVS 512 +/* This is a synthetic status code to indicate there is no status */ +#define NVME_NO_STATUS 0xffff +#define NVME_COMPLETION_VALID(c) ((c).status != NVME_NO_STATUS) + /* helpers */ /* Convert a zero-based value into a one-based value */ @@ -164,6 +176,7 @@ struct pci_nvme_blockstore { uint64_t size; uint32_t sectsz; uint32_t sectsz_bits; + uint64_t eui64; }; struct pci_nvme_ioreq { @@ -352,12 +365,61 @@ pci_nvme_init_ctrldata(struct pci_nvme_softc *sc) cd->power_state[0].mp = 10; } -static void -pci_nvme_init_nsdata(struct pci_nvme_softc *sc) +/* + * Calculate the CRC-16 of the given buffer + * See copyright attribution at top of file + */ +static uint16_t +crc16(uint16_t crc, const void *buffer, unsigned int len) { - struct nvme_namespace_data *nd; + const unsigned char *cp = buffer; + /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). 
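+ * Table-driven variant: each input byte is consumed with a single
+ * 256-entry table lookup.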
*/ + static uint16_t const crc16_table[256] = { + 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241, + 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440, + 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40, + 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841, + 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40, + 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41, + 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641, + 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040, + 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240, + 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441, + 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41, + 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840, + 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41, + 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40, + 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640, + 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041, + 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240, + 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441, + 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41, + 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840, + 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41, + 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40, + 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640, + 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041, + 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241, + 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440, + 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40, + 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841, + 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40, + 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41, + 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641, + 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040 + }; + + while (len--) + crc = (((crc >> 8) & 0xffU) ^ + crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU; + return crc; +} - nd = &sc->nsdata; +static void +pci_nvme_init_nsdata(struct pci_nvme_softc *sc, + struct nvme_namespace_data *nd, uint32_t nsid, + uint64_t eui64) +{ nd->nsze = sc->nvstore.size / sc->nvstore.sectsz; nd->ncap = nd->nsze; @@ -365,10 +427,25 @@ pci_nvme_init_nsdata(struct pci_nvme_softc *sc) /* Get LBA and backstore information from backing store */ nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 
1 LBA Format) */ + nd->flbas = 0; + + /* Create an EUI-64 if user did not provide one */ + if (eui64 == 0) { + char *data = NULL; + + asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus, + sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); + + if (data != NULL) { + eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data)); + free(data); + } + eui64 = (eui64 << 16) | (nsid & 0xffff); + } + be64enc(nd->eui64, eui64); + /* LBA data-sz = 2^lbads */ nd->lbaf[0] = sc->nvstore.sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT; - - nd->flbas = 0; } static void @@ -982,6 +1059,7 @@ pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) while (sqhead != atomic_load_acq_short(&sq->tail)) { cmd = &(sq->qbase)[sqhead]; + compl.cdw0 = 0; compl.status = 0; switch (cmd->opc) { @@ -1026,14 +1104,16 @@ pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) /* XXX dont care, unhandled for now do_intr |= nvme_opc_async_event_req(sc, cmd, &compl); */ + compl.status = NVME_NO_STATUS; break; default: WPRINTF(("0x%x command is not implemented\r\n", cmd->opc)); + pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); + do_intr |= 1; } - /* for now skip async event generation */ - if (cmd->opc != NVME_OPC_ASYNC_EVENT_REQUEST) { + if (NVME_COMPLETION_VALID(compl)) { struct nvme_completion *cp; int phase; @@ -1820,6 +1900,8 @@ pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts) free(uopt); return (-1); } + } else if (!strcmp("eui64", xopts)) { + sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0)); } else if (optidx == 0) { snprintf(bident, sizeof(bident), "%d:%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); @@ -1929,12 +2011,18 @@ pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) goto done; } + error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP); + if (error) { + WPRINTF(("%s pci add Express capability failed\r\n", __func__)); + goto done; + } + pthread_mutex_init(&sc->mtx, NULL); sem_init(&sc->iosemlock, 0, sc->ioslots); pci_nvme_reset(sc); pci_nvme_init_ctrldata(sc); - pci_nvme_init_nsdata(sc); + pci_nvme_init_nsdata(sc, &sc->nsdata, 1, sc->nvstore.eui64); pci_nvme_init_logpages(sc); pci_lintr_request(pi); diff --git a/usr/src/cmd/bhyve/pci_virtio_console.c b/usr/src/cmd/bhyve/pci_virtio_console.c index 90437662df..f7038ff40f 100644 --- a/usr/src/cmd/bhyve/pci_virtio_console.c +++ b/usr/src/cmd/bhyve/pci_virtio_console.c @@ -375,8 +375,11 @@ out: if (fd != -1) close(fd); - if (error != 0 && s != -1) - close(s); + if (error != 0) { + if (s != -1) + close(s); + free(sock); + } return (error); } @@ -630,7 +633,7 @@ pci_vtcon_notify_rx(void *vsc, struct vqueue_info *vq) if (!port->vsp_rx_ready) { port->vsp_rx_ready = 1; - vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; + vq_kick_disable(vq); } } diff --git a/usr/src/cmd/bhyve/pci_virtio_net.c b/usr/src/cmd/bhyve/pci_virtio_net.c index aa188a3e59..73f8aa0d6b 100644 --- a/usr/src/cmd/bhyve/pci_virtio_net.c +++ b/usr/src/cmd/bhyve/pci_virtio_net.c @@ -52,7 +52,6 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include #include #ifdef __FreeBSD__ #ifndef NETMAP_WITH_LIBS @@ -89,6 +88,7 @@ __FBSDID("$FreeBSD$"); #include "mevent.h" #endif #include "virtio.h" +#include "net_utils.h" #define VTNET_RINGSZ 1024 @@ -176,14 +176,13 @@ struct pci_vtnet_softc { struct nm_desc *vsc_nmd; int vsc_rx_ready; - volatile int resetting; /* set and checked outside lock */ + int resetting; /* protected by tx_mtx */ uint64_t vsc_features; /* negotiated features */ struct virtio_net_config vsc_config; pthread_mutex_t rx_mtx; - 
int rx_in_progress; int rx_vhdrlen; int rx_merge; /* merged rx bufs in use */ @@ -215,62 +214,39 @@ static struct virtio_consts vtnet_vi_consts = { VTNET_S_HOSTCAPS, /* our capabilities */ }; -/* - * If the transmit thread is active then stall until it is done. - */ static void -pci_vtnet_txwait(struct pci_vtnet_softc *sc) +pci_vtnet_reset(void *vsc) { + struct pci_vtnet_softc *sc = vsc; + + DPRINTF(("vtnet: device reset requested !\n")); + + /* Acquire the RX lock to block RX processing. */ + pthread_mutex_lock(&sc->rx_mtx); + /* Set sc->resetting and give a chance to the TX thread to stop. */ pthread_mutex_lock(&sc->tx_mtx); + sc->resetting = 1; while (sc->tx_in_progress) { pthread_mutex_unlock(&sc->tx_mtx); usleep(10000); pthread_mutex_lock(&sc->tx_mtx); } - pthread_mutex_unlock(&sc->tx_mtx); -} - -/* - * If the receive thread is active then stall until it is done. - */ -static void -pci_vtnet_rxwait(struct pci_vtnet_softc *sc) -{ - - pthread_mutex_lock(&sc->rx_mtx); - while (sc->rx_in_progress) { - pthread_mutex_unlock(&sc->rx_mtx); - usleep(10000); - pthread_mutex_lock(&sc->rx_mtx); - } - pthread_mutex_unlock(&sc->rx_mtx); -} - -static void -pci_vtnet_reset(void *vsc) -{ - struct pci_vtnet_softc *sc = vsc; - - DPRINTF(("vtnet: device reset requested !\n")); - - sc->resetting = 1; - - /* - * Wait for the transmit and receive threads to finish their - * processing. - */ - pci_vtnet_txwait(sc); - pci_vtnet_rxwait(sc); sc->vsc_rx_ready = 0; sc->rx_merge = 1; sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr); - /* now reset rings, MSI-X vectors, and negotiated capabilities */ + /* + * Now reset rings, MSI-X vectors, and negotiated capabilities. + * Do that with the TX lock held, since we need to reset + * sc->resetting. + */ vi_reset_dev(&sc->vsc_vs); sc->resetting = 0; + pthread_mutex_unlock(&sc->tx_mtx); + pthread_mutex_unlock(&sc->rx_mtx); } /* @@ -370,9 +346,9 @@ pci_vtnet_tap_rx(struct pci_vtnet_softc *sc) /* * But, will be called when the rx ring hasn't yet - * been set up or the guest is resetting the device. + * been set up. */ - if (!sc->vsc_rx_ready || sc->resetting) { + if (!sc->vsc_rx_ready) { #ifdef __FreeBSD__ /* * Drop the packet and try later. @@ -580,9 +556,9 @@ pci_vtnet_netmap_rx(struct pci_vtnet_softc *sc) /* * But, will be called when the rx ring hasn't yet - * been set up or the guest is resetting the device. + * been set up. */ - if (!sc->vsc_rx_ready || sc->resetting) { + if (!sc->vsc_rx_ready) { /* * Drop the packet and try later. 
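 * (Editor's note: the 'resetting' check that previously sat here
 * is gone because pci_vtnet_reset() now holds rx_mtx for the whole
 * reset, so this callback can no longer race with a reset.)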
*/ @@ -661,9 +637,7 @@ pci_vtnet_rx_callback(int fd, enum ev_type type, void *param) struct pci_vtnet_softc *sc = param; pthread_mutex_lock(&sc->rx_mtx); - sc->rx_in_progress = 1; sc->pci_vtnet_rx(sc); - sc->rx_in_progress = 0; pthread_mutex_unlock(&sc->rx_mtx); } @@ -685,9 +659,7 @@ pci_vtnet_poll_thread(void *param) continue; } pthread_mutex_lock(&sc->vsc_mtx); - sc->rx_in_progress = 1; pci_vtnet_tap_rx(sc); - sc->rx_in_progress = 0; pthread_mutex_unlock(&sc->vsc_mtx); } @@ -705,7 +677,7 @@ pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq) */ if (sc->vsc_rx_ready == 0) { sc->vsc_rx_ready = 1; - vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; + vq_kick_disable(vq); } } @@ -751,7 +723,7 @@ pci_vtnet_ping_txq(void *vsc, struct vqueue_info *vq) /* Signal the tx thread for processing */ pthread_mutex_lock(&sc->tx_mtx); - vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; + vq_kick_disable(vq); if (sc->tx_in_progress == 0) pthread_cond_signal(&sc->tx_cond); pthread_mutex_unlock(&sc->tx_mtx); @@ -780,8 +752,7 @@ pci_vtnet_tx_thread(void *param) for (;;) { /* note - tx mutex is locked here */ while (sc->resetting || !vq_has_descs(vq)) { - vq->vq_used->vu_flags &= ~VRING_USED_F_NO_NOTIFY; - mb(); + vq_kick_enable(vq); if (!sc->resetting && vq_has_descs(vq)) break; @@ -789,7 +760,7 @@ pci_vtnet_tx_thread(void *param) error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx); assert(error == 0); } - vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; + vq_kick_disable(vq); sc->tx_in_progress = 1; pthread_mutex_unlock(&sc->tx_mtx); @@ -821,31 +792,6 @@ pci_vtnet_ping_ctlq(void *vsc, struct vqueue_info *vq) } #endif /* __FreeBSD__ */ -#ifdef __FreeBSD__ -static int -pci_vtnet_parsemac(char *mac_str, uint8_t *mac_addr) -{ - struct ether_addr *ea; - char *tmpstr; - char zero_addr[ETHER_ADDR_LEN] = { 0, 0, 0, 0, 0, 0 }; - - tmpstr = strsep(&mac_str,"="); - - if ((mac_str != NULL) && (!strcmp(tmpstr,"mac"))) { - ea = ether_aton(mac_str); - - if (ea == NULL || ETHER_IS_MULTICAST(ea->octet) || - memcmp(ea->octet, zero_addr, ETHER_ADDR_LEN) == 0) { - fprintf(stderr, "Invalid MAC %s\n", mac_str); - return (EINVAL); - } else - memcpy(mac_addr, ea->octet, ETHER_ADDR_LEN); - } - - return (0); -} -#endif /* __FreeBSD__ */ - static void pci_vtnet_tap_setup(struct pci_vtnet_softc *sc, char *devname) { @@ -968,11 +914,6 @@ pci_vtnet_netmap_setup(struct pci_vtnet_softc *sc, char *ifname) static int pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { -#ifdef __FreeBSD__ - MD5_CTX mdctx; - unsigned char digest[16]; - char nstr[80]; -#endif char tname[MAXCOMLEN + 1]; struct pci_vtnet_softc *sc; const char *env_msi; @@ -1027,7 +968,7 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) #ifdef __FreBSD__ if (vtopts != NULL) { - err = pci_vtnet_parsemac(vtopts, sc->vsc_config.mac); + err = net_parsemac(vtopts, sc->vsc_config.mac); if (err != 0) { free(devname); return (err); @@ -1048,24 +989,8 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) } #ifdef __FreeBSD__ - /* - * The default MAC address is the standard NetApp OUI of 00-a0-98, - * followed by an MD5 of the PCI slot/func number and dev name - */ if (!mac_provided) { - snprintf(nstr, sizeof(nstr), "%d-%d-%s", pi->pi_slot, - pi->pi_func, vmname); - - MD5Init(&mdctx); - MD5Update(&mdctx, nstr, strlen(nstr)); - MD5Final(digest, &mdctx); - - sc->vsc_config.mac[0] = 0x00; - sc->vsc_config.mac[1] = 0xa0; - sc->vsc_config.mac[2] = 0x98; - sc->vsc_config.mac[3] = digest[0]; - sc->vsc_config.mac[4] = digest[1]; - 
sc->vsc_config.mac[5] = digest[2]; + net_genmac(pi, sc->vsc_config.mac); } #endif @@ -1095,7 +1020,6 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) sc->rx_merge = 1; sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr); - sc->rx_in_progress = 0; pthread_mutex_init(&sc->rx_mtx, NULL); /* diff --git a/usr/src/cmd/bhyve/pci_virtio_scsi.c b/usr/src/cmd/bhyve/pci_virtio_scsi.c index 38e7d918a0..632f920293 100644 --- a/usr/src/cmd/bhyve/pci_virtio_scsi.c +++ b/usr/src/cmd/bhyve/pci_virtio_scsi.c @@ -309,7 +309,8 @@ pci_vtscsi_reset(void *vsc) /* initialize config structure */ sc->vss_config = (struct pci_vtscsi_config){ .num_queues = VTSCSI_REQUESTQ, - .seg_max = VTSCSI_MAXSEG, + /* Leave room for the request and the response. */ + .seg_max = VTSCSI_MAXSEG - 2, .max_sectors = 2, .cmd_per_lun = 1, .event_info_size = sizeof(struct pci_vtscsi_event), @@ -464,7 +465,7 @@ pci_vtscsi_request_handle(struct pci_vtscsi_queue *q, struct iovec *iov_in, int data_niov_in, data_niov_out; void *ext_data_ptr = NULL; uint32_t ext_data_len = 0, ext_sg_entries = 0; - int err; + int err, nxferred; seek_iov(iov_in, niov_in, data_iov_in, &data_niov_in, VTSCSI_IN_HEADER_LEN(sc)); @@ -543,10 +544,11 @@ pci_vtscsi_request_handle(struct pci_vtscsi_queue *q, struct iovec *iov_in, } buf_to_iov(cmd_wr, VTSCSI_OUT_HEADER_LEN(sc), iov_out, niov_out, 0); + nxferred = VTSCSI_OUT_HEADER_LEN(sc) + io->scsiio.ext_data_filled; free(cmd_rd); free(cmd_wr); ctl_scsi_free_io(io); - return (VTSCSI_OUT_HEADER_LEN(sc) + io->scsiio.ext_data_filled); + return (nxferred); } static void @@ -581,7 +583,7 @@ static void pci_vtscsi_eventq_notify(void *vsc, struct vqueue_info *vq) { - vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; + vq_kick_disable(vq); } static void diff --git a/usr/src/cmd/bhyve/pci_xhci.c b/usr/src/cmd/bhyve/pci_xhci.c index 29d56ec32c..324c706c47 100644 --- a/usr/src/cmd/bhyve/pci_xhci.c +++ b/usr/src/cmd/bhyve/pci_xhci.c @@ -1911,6 +1911,11 @@ pci_xhci_device_doorbell(struct pci_xhci_softc *sc, uint32_t slot, return; } + if (epid == 0 || epid >= XHCI_MAX_ENDPOINTS) { + DPRINTF(("pci_xhci: invalid endpoint %u\r\n", epid)); + return; + } + dev = XHCI_SLOTDEV_PTR(sc, slot); devep = &dev->eps[epid]; dev_ctx = pci_xhci_get_dev_ctx(sc, slot); @@ -1942,6 +1947,23 @@ pci_xhci_device_doorbell(struct pci_xhci_softc *sc, uint32_t slot, /* get next trb work item */ if (XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0) != 0) { + struct xhci_stream_ctx *sctx; + + /* + * Stream IDs of 0, 65535 (any stream), and 65534 + * (prime) are invalid. 
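+ * Stream ID 0 instead selects the ordinary (non-stream)
+ * transfer ring in the else-branch below, which is why the
+ * two paths validate it separately.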
+ */ + if (streamid == 0 || streamid == 65534 || streamid == 65535) { + DPRINTF(("pci_xhci: invalid stream %u\r\n", streamid)); + return; + } + + sctx = NULL; + pci_xhci_find_stream(sc, ep_ctx, streamid, &sctx); + if (sctx == NULL) { + DPRINTF(("pci_xhci: invalid stream %u\r\n", streamid)); + return; + } sctx_tr = &devep->ep_sctx_trbs[streamid]; ringaddr = sctx_tr->ringaddr; ccs = sctx_tr->ccs; @@ -1950,6 +1972,10 @@ pci_xhci_device_doorbell(struct pci_xhci_softc *sc, uint32_t slot, streamid, ep_ctx->qwEpCtx2 & XHCI_TRB_3_CYCLE_BIT, trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT)); } else { + if (streamid != 0) { + DPRINTF(("pci_xhci: invalid stream %u\r\n", streamid)); + return; + } ringaddr = devep->ep_ringaddr; ccs = devep->ep_ccs; trb = devep->ep_tr; @@ -2561,7 +2587,7 @@ pci_xhci_dev_intr(struct usb_hci *hci, int epctx) struct pci_xhci_softc *sc; struct pci_xhci_portregs *p; struct xhci_endp_ctx *ep_ctx; - int error; + int error = 0; int dir_in; int epid; diff --git a/usr/src/cmd/bhyve/rfb.c b/usr/src/cmd/bhyve/rfb.c index 39ea1611f9..942c294775 100644 --- a/usr/src/cmd/bhyve/rfb.c +++ b/usr/src/cmd/bhyve/rfb.c @@ -278,8 +278,10 @@ rfb_recv_set_encodings_msg(struct rfb_softc *rc, int cfd) rc->enc_raw_ok = true; break; case RFB_ENCODING_ZLIB: - rc->enc_zlib_ok = true; - deflateInit(&rc->zstream, Z_BEST_SPEED); + if (!rc->enc_zlib_ok) { + deflateInit(&rc->zstream, Z_BEST_SPEED); + rc->enc_zlib_ok = true; + } break; case RFB_ENCODING_RESIZE: rc->enc_resize_ok = true; @@ -978,7 +980,7 @@ rfb_init(char *hostname, int port, int wait, char *password) int e; char servname[6]; struct rfb_softc *rc; - struct addrinfo *ai; + struct addrinfo *ai = NULL; struct addrinfo hints; int on = 1; #ifndef WITHOUT_CAPSICUM @@ -993,6 +995,7 @@ rfb_init(char *hostname, int port, int wait, char *password) sizeof(uint32_t)); rc->crc_width = RFB_MAX_WIDTH; rc->crc_height = RFB_MAX_HEIGHT; + rc->sfd = -1; rc->password = password; @@ -1012,28 +1015,25 @@ rfb_init(char *hostname, int port, int wait, char *password) if ((e = getaddrinfo(hostname, servname, &hints, &ai)) != 0) { fprintf(stderr, "getaddrinfo: %s\n", gai_strerror(e)); - return(-1); + goto error; } rc->sfd = socket(ai->ai_family, ai->ai_socktype, 0); if (rc->sfd < 0) { perror("socket"); - freeaddrinfo(ai); - return (-1); + goto error; } setsockopt(rc->sfd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)); if (bind(rc->sfd, ai->ai_addr, ai->ai_addrlen) < 0) { perror("bind"); - freeaddrinfo(ai); - return (-1); + goto error; } if (listen(rc->sfd, 1) < 0) { perror("listen"); - freeaddrinfo(ai); - return (-1); + goto error; } #ifndef WITHOUT_CAPSICUM @@ -1062,6 +1062,16 @@ rfb_init(char *hostname, int port, int wait, char *password) freeaddrinfo(ai); return (0); + + error: + if (ai != NULL) + freeaddrinfo(ai); + if (rc->sfd != -1) + close(rc->sfd); + free(rc->crc); + free(rc->crc_tmp); + free(rc); + return (-1); } #ifndef __FreeBSD__ diff --git a/usr/src/cmd/bhyve/uart_emul.c b/usr/src/cmd/bhyve/uart_emul.c index c0fff61d00..fc448152ad 100644 --- a/usr/src/cmd/bhyve/uart_emul.c +++ b/usr/src/cmd/bhyve/uart_emul.c @@ -923,9 +923,14 @@ uart_tty_backend(struct uart_softc *sc, const char *opts) int fd; fd = open(opts, O_RDWR | O_NONBLOCK); - if (fd < 0 || !isatty(fd)) + if (fd < 0) return (-1); + if (!isatty(fd)) { + close(fd); + return (-1); + } + sc->tty.rfd = sc->tty.wfd = fd; sc->tty.opened = true; diff --git a/usr/src/cmd/bhyve/virtio.c b/usr/src/cmd/bhyve/virtio.c index 47a3ed29ba..2d78b016c6 100644 --- a/usr/src/cmd/bhyve/virtio.c +++ b/usr/src/cmd/bhyve/virtio.c @@ -428,7 
+428,8 @@ vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen) /* * Ensure the used descriptor is visible before updating the index. - * This is necessary on ISAs with memory ordering less strict than x86. + * This is necessary on ISAs with memory ordering less strict than x86 + * (and even on x86 to act as a compiler barrier). */ atomic_thread_fence_rel(); vuh->vu_idx = uidx; diff --git a/usr/src/cmd/bhyve/virtio.h b/usr/src/cmd/bhyve/virtio.h index a2c3362ec2..521bfac681 100644 --- a/usr/src/cmd/bhyve/virtio.h +++ b/usr/src/cmd/bhyve/virtio.h @@ -32,6 +32,7 @@ #define _VIRTIO_H_ #include +#include /* * These are derived from several virtio specifications. @@ -463,6 +464,26 @@ vq_interrupt(struct virtio_softc *vs, struct vqueue_info *vq) } } +static inline void +vq_kick_enable(struct vqueue_info *vq) +{ + + vq->vq_used->vu_flags &= ~VRING_USED_F_NO_NOTIFY; + /* + * Full memory barrier to make sure the store to vu_flags + * happens before the load from va_idx, which results from + * a subsequent call to vq_has_descs(). + */ + atomic_thread_fence_seq_cst(); +} + +static inline void +vq_kick_disable(struct vqueue_info *vq) +{ + + vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; +} + struct iovec; void vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc, void *dev_softc, struct pci_devinst *pi, diff --git a/usr/src/compat/freebsd/net/ieee_oui.h b/usr/src/compat/freebsd/net/ieee_oui.h new file mode 100644 index 0000000000..068328d833 --- /dev/null +++ b/usr/src/compat/freebsd/net/ieee_oui.h @@ -0,0 +1,85 @@ +/* - + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 The FreeBSD Foundation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + * + * Author: George V. Neville-Neil + * + */ + +/* Organizationally Unique Identifier assigned by IEEE 14 Nov 2013 */ +#define OUI_FREEBSD_BASE 0x589cfc000000 +#define OUI_FREEBSD(nic) (OUI_FREEBSD_BASE | (nic)) + +/* + * OUIs are most often used to uniquely identify network interfaces + * and occupy the first 3 bytes of both destination and source MAC + * addresses. The following allocations exist so that various + * software systems associated with FreeBSD can have unique IDs in the + * absence of hardware. 
The use of OUIs for this purpose is not fully + * fleshed out but is now in common use in virtualization technology. + * + * Allocations from this range are expected to be made using COMMON + * SENSE by developers. Do NOT take a large range just because + * they're currently wide open. Take the smallest useful range for + * your system. We have (2^24 - 2) available addresses (see Reserved + * Values below) but that is far from infinite. + * + * In the event of a conflict arbitration of allocation in this file + * is subject to core@ approval. + * + * Applications are differentiated based on the high order bit(s) of + * the remaining three bytes. Our first allocation has all 0s, the + * next allocation has the highest bit set. Allocating in this way + * gives us 254 allocations of 64K addresses. Address blocks can be + * concatenated if necessary. + * + * Reserved Values: 0x000000 and 0xffffff are reserved and MUST NOT BE + * allocated for any reason. + */ + +/* Allocate 20 bits to bhyve */ +#define OUI_FREEBSD_BHYVE_LOW OUI_FREEBSD(0x000001) +#define OUI_FREEBSD_BHYVE_HIGH OUI_FREEBSD(0x0fffff) + +/* + * Allocate 16 bits for a pool to give to various interfaces that need a + * generated address, but don't quite need to slice off a whole section of + * the OUI (e.g. cloned interfaces, one-off NICs of various vendors). + * + * ether_gen_addr should be used to generate an address from this pool. + */ +#define OUI_FREEBSD_GENERATED_MASK 0x10ffff +#define OUI_FREEBSD_GENERATED_LOW OUI_FREEBSD(0x100000) +#define OUI_FREEBSD_GENERATED_HIGH OUI_FREEBSD(OUI_FREEBSD_GENERATED_MASK) + +/* Allocate 16 bits for emulated NVMe devices */ +#define OUI_FREEBSD_NVME_MASK 0x20ffff +#define OUI_FREEBSD_NVME_LOW OUI_FREEBSD(0x200000) +#define OUI_FREEBSD_NVME_HIGH OUI_FREEBSD(OUI_FREEBSD_NVME_MASK) diff --git a/usr/src/compat/freebsd/sys/param.h b/usr/src/compat/freebsd/sys/param.h index b125f9014f..5ba21a2809 100644 --- a/usr/src/compat/freebsd/sys/param.h +++ b/usr/src/compat/freebsd/sys/param.h @@ -23,7 +23,7 @@ #define MAXPHYS (56 * 1024) #endif #define MAXHOSTNAMELEN 256 -#define SPECNAMELEN 63 +#define SPECNAMELEN 255 #ifdef _KERNEL #include diff --git a/usr/src/compat/freebsd/sys/pcpu.h b/usr/src/compat/freebsd/sys/pcpu.h index f29c9c5018..1bad53c159 100644 --- a/usr/src/compat/freebsd/sys/pcpu.h +++ b/usr/src/compat/freebsd/sys/pcpu.h @@ -16,6 +16,8 @@ #ifndef _COMPAT_FREEBSD_SYS_PCPU_H_ #define _COMPAT_FREEBSD_SYS_PCPU_H_ -#define curcpu (CPU->cpu_id) +#define curcpu (CPU->cpu_id) + +#define get_pcpu() CPU #endif /* _COMPAT_FREEBSD_SYS_PCPU_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/README.sync b/usr/src/uts/i86pc/io/vmm/README.sync index 1cddfd829e..e8aeaaffcf 100644 --- a/usr/src/uts/i86pc/io/vmm/README.sync +++ b/usr/src/uts/i86pc/io/vmm/README.sync @@ -1,18 +1,30 @@ The bhyve kernel module and its associated userland consumers have been updated to the latest upstream FreeBSD sources as of: +commit 37e8a0e0058c226e6bd0ed5c3a07ee15b1146122 +Author: mav +Date: Mon Sep 23 17:53:47 2019 +0000 -commit 3b9cb80b242682690203709aaff4eafae41c138f -Author: jhb -Date: Mon Jun 3 23:17:35 2019 +0000 + Make nvme(4) driver some more NUMA aware. - Emulate the AMD MSR_LS_CFG MSR used for various Ryzen errata. + - For each queue pair precalculate CPU and domain it is bound to. + If queue pairs are not per-CPU, then use the domain of the device. + - Allocate most of queue pair memory from the domain it is bound to. + - Bind callouts to the same CPUs as queue pair to avoid migrations. 
+ - Do not assign queue pairs to each SMT thread. It just wasted + resources and increased lock congestions. + - Remove fixed multiplier of CPUs per queue pair, spread them even. + This allows to use more queue pairs in some hardware configurations. + - If queue pair serves multiple CPUs, bind different NVMe devices to + different CPUs. - Writes are ignored and reads always return zero. + MFC after: 1 month + Sponsored by: iXsystems, Inc. - Submitted by: José Albornoz (write-only version) - Reviewed by: Patrick Mooney, cem - MFC after: 2 weeks - Differential Revision: https://reviews.freebsd.org/D19506 +Which corresponds to SVN revision: 352630 -Which corresponds to SVN revision: 348592 + +NOTE: +This sync ignores commit c8edafdabc27533d9c51eddc2896e772c16d965c. +There are big changes to the virtio net devices that we haven't synced up yet +because SmartOS relies heavily on viona instead. diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm.c b/usr/src/uts/i86pc/io/vmm/amd/svm.c index 80d76ab640..c194e3d818 100644 --- a/usr/src/uts/i86pc/io/vmm/amd/svm.c +++ b/usr/src/uts/i86pc/io/vmm/amd/svm.c @@ -112,11 +112,6 @@ SYSCTL_INT(_hw_vmm_svm, OID_AUTO, vmcb_clean, CTLFLAG_RDTUN, &vmcb_clean, static MALLOC_DEFINE(M_SVM, "svm", "svm"); static MALLOC_DEFINE(M_SVM_VLAPIC, "svm-vlapic", "svm-vlapic"); -#ifdef __FreeBSD__ -/* Per-CPU context area. */ -extern struct pcpu __pcpu[]; -#endif - static uint32_t svm_feature = ~0U; /* AMD SVM features. */ SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, features, CTLFLAG_RDTUN, &svm_feature, 0, "SVM features advertised by CPUID.8000000AH:EDX"); @@ -2160,11 +2155,7 @@ svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap, /* Launch Virtual Machine. */ VCPU_CTR1(vm, vcpu, "Resume execution at %#lx", state->rip); svm_dr_enter_guest(gctx); -#ifdef __FreeBSD__ - svm_launch(vmcb_pa, gctx, &__pcpu[curcpu]); -#else - svm_launch(vmcb_pa, gctx, CPU); -#endif + svm_launch(vmcb_pa, gctx, get_pcpu()); svm_dr_leave_guest(gctx); CPU_CLR_ATOMIC(curcpu, &pmap->pm_active); @@ -2420,25 +2411,24 @@ svm_restorectx(void *arg, int vcpu) #endif /* __FreeBSD__ */ struct vmm_ops vmm_ops_amd = { - svm_init, - svm_cleanup, - svm_restore, - svm_vminit, - svm_vmrun, - svm_vmcleanup, - svm_getreg, - svm_setreg, - vmcb_getdesc, - vmcb_setdesc, - svm_getcap, - svm_setcap, - svm_npt_alloc, - svm_npt_free, - svm_vlapic_init, - svm_vlapic_cleanup, - + .init = svm_init, + .cleanup = svm_cleanup, + .resume = svm_restore, + .vminit = svm_vminit, + .vmrun = svm_vmrun, + .vmcleanup = svm_vmcleanup, + .vmgetreg = svm_getreg, + .vmsetreg = svm_setreg, + .vmgetdesc = vmcb_getdesc, + .vmsetdesc = vmcb_setdesc, + .vmgetcap = svm_getcap, + .vmsetcap = svm_setcap, + .vmspace_alloc = svm_npt_alloc, + .vmspace_free = svm_npt_free, + .vlapic_init = svm_vlapic_init, + .vlapic_cleanup = svm_vlapic_cleanup, #ifndef __FreeBSD__ - svm_savectx, - svm_restorectx, + .vmsavectx = svm_savectx, + .vmrestorectx = svm_restorectx, #endif }; diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx.c b/usr/src/uts/i86pc/io/vmm/intel/vmx.c index eea036b253..ede3a54d66 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmx.c +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx.c @@ -2356,20 +2356,20 @@ ept_fault_type(uint64_t ept_qual) return (fault_type); } -static boolean_t +static bool ept_emulation_fault(uint64_t ept_qual) { int read, write; /* EPT fault on an instruction fetch doesn't make sense here */ if (ept_qual & EPT_VIOLATION_INST_FETCH) - return (FALSE); + return (false); /* EPT fault must be a read fault or a write fault */ read = ept_qual & 
EPT_VIOLATION_DATA_READ ? 1 : 0; write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0; if ((read | write) == 0) - return (FALSE); + return (false); /* * The EPT violation must have been caused by accessing a @@ -2378,10 +2378,10 @@ ept_emulation_fault(uint64_t ept_qual) */ if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 || (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) { - return (FALSE); + return (false); } - return (TRUE); + return (true); } static __inline int @@ -4284,26 +4284,26 @@ vmx_restorectx(void *arg, int vcpu) #endif /* __FreeBSD__ */ struct vmm_ops vmm_ops_intel = { - vmx_init, - vmx_cleanup, - vmx_restore, - vmx_vminit, - vmx_run, - vmx_vmcleanup, - vmx_getreg, - vmx_setreg, - vmx_getdesc, - vmx_setdesc, - vmx_getcap, - vmx_setcap, - ept_vmspace_alloc, - ept_vmspace_free, - vmx_vlapic_init, - vmx_vlapic_cleanup, + .init = vmx_init, + .cleanup = vmx_cleanup, + .resume = vmx_restore, + .vminit = vmx_vminit, + .vmrun = vmx_run, + .vmcleanup = vmx_vmcleanup, + .vmgetreg = vmx_getreg, + .vmsetreg = vmx_setreg, + .vmgetdesc = vmx_getdesc, + .vmsetdesc = vmx_setdesc, + .vmgetcap = vmx_getcap, + .vmsetcap = vmx_setcap, + .vmspace_alloc = ept_vmspace_alloc, + .vmspace_free = ept_vmspace_free, + .vlapic_init = vmx_vlapic_init, + .vlapic_cleanup = vmx_vlapic_cleanup, #ifndef __FreeBSD__ - vmx_savectx, - vmx_restorectx, + .vmsavectx = vmx_savectx, + .vmrestorectx = vmx_restorectx, #endif }; diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.c b/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.c index 4a1a2cd358..9121e46b40 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.c +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.c @@ -48,24 +48,18 @@ __FBSDID("$FreeBSD$"); #include "vmx.h" #include "vmx_msr.h" -static boolean_t +static bool vmx_ctl_allows_one_setting(uint64_t msr_val, int bitpos) { - if (msr_val & (1UL << (bitpos + 32))) - return (TRUE); - else - return (FALSE); + return ((msr_val & (1UL << (bitpos + 32))) != 0); } -static boolean_t +static bool vmx_ctl_allows_zero_setting(uint64_t msr_val, int bitpos) { - if ((msr_val & (1UL << bitpos)) == 0) - return (TRUE); - else - return (FALSE); + return ((msr_val & (1UL << bitpos)) == 0); } uint32_t @@ -92,16 +86,13 @@ vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask, { int i; uint64_t val, trueval; - boolean_t true_ctls_avail, one_allowed, zero_allowed; + bool true_ctls_avail, one_allowed, zero_allowed; /* We cannot ask the same bit to be set to both '1' and '0' */ if ((ones_mask ^ zeros_mask) != (ones_mask | zeros_mask)) return (EINVAL); - if (rdmsr(MSR_VMX_BASIC) & (1UL << 55)) - true_ctls_avail = TRUE; - else - true_ctls_avail = FALSE; + true_ctls_avail = (rdmsr(MSR_VMX_BASIC) & (1UL << 55)) != 0; val = rdmsr(ctl_reg); if (true_ctls_avail) diff --git a/usr/src/uts/i86pc/io/vmm/intel/vtd.c b/usr/src/uts/i86pc/io/vmm/intel/vtd.c index 902080e34c..41c2c5b2f8 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vtd.c +++ b/usr/src/uts/i86pc/io/vmm/intel/vtd.c @@ -53,6 +53,8 @@ __FBSDID("$FreeBSD$"); * Architecture Spec, September 2008. 
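 * (Editor's addendum: the INCLUDE_PCI_ALL flag tested below is bit 0
 * of a DRHD structure's Flags field; when set, that unit covers every
 * PCI device in its segment not explicitly claimed by another unit.)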
*/ +#define VTD_DRHD_INCLUDE_PCI_ALL(Flags) (((Flags) >> 0) & 0x1) + /* Section 10.4 "Register Descriptions" */ struct vtdmap { volatile uint32_t version; @@ -118,10 +120,11 @@ struct domain { static SLIST_HEAD(, domain) domhead; #define DRHD_MAX_UNITS 8 -static int drhd_num; -static struct vtdmap *vtdmaps[DRHD_MAX_UNITS]; -static int max_domains; -typedef int (*drhd_ident_func_t)(void); +static ACPI_DMAR_HARDWARE_UNIT *drhds[DRHD_MAX_UNITS]; +static int drhd_num; +static struct vtdmap *vtdmaps[DRHD_MAX_UNITS]; +static int max_domains; +typedef int (*drhd_ident_func_t)(void); #ifndef __FreeBSD__ static dev_info_t *vtddips[DRHD_MAX_UNITS]; #endif @@ -180,6 +183,69 @@ domain_id(void) return (id); } +static struct vtdmap * +vtd_device_scope(uint16_t rid) +{ + int i, remaining, pathremaining; + char *end, *pathend; + struct vtdmap *vtdmap; + ACPI_DMAR_HARDWARE_UNIT *drhd; + ACPI_DMAR_DEVICE_SCOPE *device_scope; + ACPI_DMAR_PCI_PATH *path; + + for (i = 0; i < drhd_num; i++) { + drhd = drhds[i]; + + if (VTD_DRHD_INCLUDE_PCI_ALL(drhd->Flags)) { + /* + * From Intel VT-d arch spec, version 3.0: + * If a DRHD structure with INCLUDE_PCI_ALL flag Set is reported + * for a Segment, it must be enumerated by BIOS after all other + * DRHD structures for the same Segment. + */ + vtdmap = vtdmaps[i]; + return(vtdmap); + } + + end = (char *)drhd + drhd->Header.Length; + remaining = drhd->Header.Length - sizeof(ACPI_DMAR_HARDWARE_UNIT); + while (remaining > sizeof(ACPI_DMAR_DEVICE_SCOPE)) { + device_scope = (ACPI_DMAR_DEVICE_SCOPE *)(end - remaining); + remaining -= device_scope->Length; + + switch (device_scope->EntryType){ + /* 0x01 and 0x02 are PCI device entries */ + case 0x01: + case 0x02: + break; + default: + continue; + } + + if (PCI_RID2BUS(rid) != device_scope->Bus) + continue; + + pathend = (char *)device_scope + device_scope->Length; + pathremaining = device_scope->Length - sizeof(ACPI_DMAR_DEVICE_SCOPE); + while (pathremaining >= sizeof(ACPI_DMAR_PCI_PATH)) { + path = (ACPI_DMAR_PCI_PATH *)(pathend - pathremaining); + pathremaining -= sizeof(ACPI_DMAR_PCI_PATH); + + if (PCI_RID2SLOT(rid) != path->Device) + continue; + if (PCI_RID2FUNC(rid) != path->Function) + continue; + + vtdmap = vtdmaps[i]; + return (vtdmap); + } + } + } + + /* No matching scope */ + return (NULL); +} + static void vtd_wbflush(struct vtdmap *vtdmap) { @@ -285,7 +351,7 @@ extern dev_info_t *vtd_get_dip(ACPI_DMAR_HARDWARE_UNIT *, int); static int vtd_init(void) { - int i, units, remaining; + int i, units, remaining, tmp; struct vtdmap *vtdmap; vm_paddr_t ctx_paddr; char *end; @@ -342,16 +408,16 @@ vtd_init(void) break; drhd = (ACPI_DMAR_HARDWARE_UNIT *)hdr; + drhds[units] = drhd; #ifdef __FreeBSD__ - vtdmaps[units++] = (struct vtdmap *)PHYS_TO_DMAP(drhd->Address); + vtdmaps[units] = (struct vtdmap *)PHYS_TO_DMAP(drhd->Address); #else vtddips[units] = vtd_get_dip(drhd, units); vtdmaps[units] = (struct vtdmap *)vtd_map(vtddips[units]); if (vtdmaps[units] == NULL) goto fail; - units++; #endif - if (units >= DRHD_MAX_UNITS) + if (++units >= DRHD_MAX_UNITS) break; remaining -= hdr->Length; } @@ -363,12 +429,18 @@ vtd_init(void) skip_dmar: #endif drhd_num = units; - vtdmap = vtdmaps[0]; - if (VTD_CAP_CM(vtdmap->cap) != 0) - panic("vtd_init: invalid caching mode"); + max_domains = 64 * 1024; /* maximum valid value */ + for (i = 0; i < drhd_num; i++){ + vtdmap = vtdmaps[i]; + + if (VTD_CAP_CM(vtdmap->cap) != 0) + panic("vtd_init: invalid caching mode"); - max_domains = vtd_max_domains(vtdmap); + /* take most compatible (minimum) value 
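+ * across all remapping units, since a domain ID must stay
+ * usable on whichever unit translates for a given device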
*/ + if ((tmp = vtd_max_domains(vtdmap)) < max_domains) + max_domains = tmp; + } /* * Set up the root-table to point to the context-entry tables @@ -459,7 +531,6 @@ vtd_add_device(void *arg, uint16_t rid) struct vtdmap *vtdmap; uint8_t bus; - vtdmap = vtdmaps[0]; bus = PCI_RID2BUS(rid); ctxp = ctx_tables[bus]; pt_paddr = vtophys(dom->ptp); @@ -471,6 +542,10 @@ vtd_add_device(void *arg, uint16_t rid) (uint16_t)(ctxp[idx + 1] >> 8)); } + if ((vtdmap = vtd_device_scope(rid)) == NULL) + panic("vtd_add_device: device %x is not in scope for " + "any DMA remapping unit", rid); + /* * Order is important. The 'present' bit is set only after all fields * of the context pointer are initialized. @@ -654,8 +729,6 @@ vtd_create_domain(vm_paddr_t maxaddr) if (drhd_num <= 0) panic("vtd_create_domain: no dma remapping hardware available"); - vtdmap = vtdmaps[0]; - /* * Calculate AGAW. * Section 3.4.2 "Adjusted Guest Address Width", Architecture Spec. @@ -680,7 +753,14 @@ vtd_create_domain(vm_paddr_t maxaddr) pt_levels = 2; sagaw = 30; addrwidth = 0; - tmp = VTD_CAP_SAGAW(vtdmap->cap); + + tmp = ~0; + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + /* take most compatible value */ + tmp &= VTD_CAP_SAGAW(vtdmap->cap); + } + for (i = 0; i < 5; i++) { if ((tmp & (1 << i)) != 0 && sagaw >= agaw) break; @@ -692,8 +772,8 @@ vtd_create_domain(vm_paddr_t maxaddr) } if (i >= 5) { - panic("vtd_create_domain: SAGAW 0x%lx does not support AGAW %d", - VTD_CAP_SAGAW(vtdmap->cap), agaw); + panic("vtd_create_domain: SAGAW 0x%x does not support AGAW %d", + tmp, agaw); } dom = malloc(sizeof(struct domain), M_VTD, M_ZERO | M_WAITOK); @@ -721,7 +801,12 @@ vtd_create_domain(vm_paddr_t maxaddr) * There is not any code to deal with the demotion at the moment * so we disable superpage mappings altogether. */ - dom->spsmask = VTD_CAP_SPS(vtdmap->cap); + dom->spsmask = ~0; + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + /* take most compatible value */ + dom->spsmask &= VTD_CAP_SPS(vtdmap->cap); + } #endif #else /* diff --git a/usr/src/uts/i86pc/io/vmm/io/vatpit.c b/usr/src/uts/i86pc/io/vmm/io/vatpit.c index 9b3e7376d5..03f63798e7 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vatpit.c +++ b/usr/src/uts/i86pc/io/vmm/io/vatpit.c @@ -3,6 +3,7 @@ * Copyright (c) 2014 Tycho Nightingale * Copyright (c) 2011 NetApp, Inc. * All rights reserved. + * Copyright (c) 2018 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -180,20 +181,20 @@ static void pit_timer_start_cntr0(struct vatpit *vatpit) { struct channel *c; + struct bintime now, delta; + sbintime_t precision; c = &vatpit->channel[0]; if (c->initial != 0) { - sbintime_t precision; - struct bintime now, delta; - delta.sec = 0; delta.frac = vatpit->freq_bt.frac * c->initial; bintime_add(&c->callout_bt, &delta); precision = bttosbt(delta) >> tc_precexp; /* - * Reset 'callout_bt' if the time that the callout was supposed - * to fire is more than 'c->initial' ticks in the past. + * Reset 'callout_bt' if the time that the callout + * was supposed to fire is more than 'c->initial' + * ticks in the past. 
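+ * (Editor's note: otherwise the callout would be rescheduled
+ * for a time already in the past and fire immediately, e.g.
+ * after the guest leaves counter 0 idle for a while.)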
*/ binuptime(&now); if (bintime_cmp(&c->callout_bt, &now, <)) { diff --git a/usr/src/uts/i86pc/io/vmm/io/vlapic.c b/usr/src/uts/i86pc/io/vmm/io/vlapic.c index 4e58249c8d..687e0e6a8e 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vlapic.c +++ b/usr/src/uts/i86pc/io/vmm/io/vlapic.c @@ -339,7 +339,7 @@ vlapic_get_lvtptr(struct vlapic *vlapic, uint32_t offset) return (&lapic->lvt_cmci); case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: i = (offset - APIC_OFFSET_TIMER_LVT) >> 2; - return ((&lapic->lvt_timer) + i);; + return ((&lapic->lvt_timer) + i); default: panic("vlapic_get_lvt: invalid LVT\n"); } @@ -854,7 +854,8 @@ vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys, */ CPU_ZERO(dmask); vcpuid = vm_apicid2vcpuid(vm, dest); - if (vcpuid < vm_get_maxcpus(vm)) + amask = vm_active_cpus(vm); + if (vcpuid < vm_get_maxcpus(vm) && CPU_ISSET(vcpuid, &amask)) CPU_SET(vcpuid, dmask); } else { /* diff --git a/usr/src/uts/i86pc/io/vmm/vm/vm_page.h b/usr/src/uts/i86pc/io/vmm/vm/vm_page.h index 4559fe6d4c..deb25a6cc0 100644 --- a/usr/src/uts/i86pc/io/vmm/vm/vm_page.h +++ b/usr/src/uts/i86pc/io/vmm/vm/vm_page.h @@ -19,9 +19,9 @@ #include "vm_glue.h" -void vm_page_lock(vm_page_t); -void vm_page_unhold(vm_page_t); -void vm_page_unlock(vm_page_t); +#define PQ_ACTIVE 1 + +void vm_page_unwire(vm_page_t, uint8_t); #define VM_PAGE_TO_PHYS(page) (mmu_ptob((uintptr_t)((page)->vmp_pfn))) diff --git a/usr/src/uts/i86pc/io/vmm/vmm.c b/usr/src/uts/i86pc/io/vmm/vmm.c index 47a5f26cb7..2238536121 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm.c +++ b/usr/src/uts/i86pc/io/vmm/vmm.c @@ -1008,7 +1008,7 @@ vmm_sysmem_maxaddr(struct vm *vm) } static void -vm_iommu_modify(struct vm *vm, boolean_t map) +vm_iommu_modify(struct vm *vm, bool map) { int i, sz; vm_paddr_t gpa, hpa; @@ -1083,8 +1083,8 @@ vm_iommu_modify(struct vm *vm, boolean_t map) #endif } -#define vm_iommu_unmap(vm) vm_iommu_modify((vm), FALSE) -#define vm_iommu_map(vm) vm_iommu_modify((vm), TRUE) +#define vm_iommu_unmap(vm) vm_iommu_modify((vm), false) +#define vm_iommu_map(vm) vm_iommu_modify((vm), true) #ifdef __FreeBSD__ int @@ -1193,9 +1193,7 @@ vm_gpa_release(void *cookie) { vm_page_t m = cookie; - vm_page_lock(m); - vm_page_unhold(m); - vm_page_unlock(m); + vm_page_unwire(m, PQ_ACTIVE); } int @@ -1234,20 +1232,20 @@ vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val) return (0); } -static boolean_t +static bool is_descriptor_table(int reg) { switch (reg) { case VM_REG_GUEST_IDTR: case VM_REG_GUEST_GDTR: - return (TRUE); + return (true); default: - return (FALSE); + return (false); } } -static boolean_t +static bool is_segment_register(int reg) { @@ -1260,9 +1258,9 @@ is_segment_register(int reg) case VM_REG_GUEST_GS: case VM_REG_GUEST_TR: case VM_REG_GUEST_LDTR: - return (TRUE); + return (true); default: - return (FALSE); + return (false); } } @@ -2622,12 +2620,12 @@ vm_hpet(struct vm *vm) } #ifdef __FreeBSD__ -boolean_t +bool vmm_is_pptdev(int bus, int slot, int func) { - int found, i, n; - int b, s, f; + int b, f, i, n, s; char *val, *cp, *cp2; + bool found; /* * XXX @@ -2641,7 +2639,7 @@ vmm_is_pptdev(int bus, int slot, int func) const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL }; /* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */ - found = 0; + found = false; for (i = 0; names[i] != NULL && !found; i++) { cp = val = kern_getenv(names[i]); while (cp != NULL && *cp != '\0') { @@ -2650,7 +2648,7 @@ vmm_is_pptdev(int bus, int slot, int func) n = sscanf(cp, "%d/%d/%d", &b, &s, &f); if (n == 3 && bus == b && slot == s 
&& func == f) { - found = 1; + found = true; break; } diff --git a/usr/src/uts/i86pc/io/vmm/vmm_host.h b/usr/src/uts/i86pc/io/vmm/vmm_host.h index f12047819d..e0ea1ec927 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_host.h +++ b/usr/src/uts/i86pc/io/vmm/vmm_host.h @@ -100,17 +100,12 @@ vmm_get_host_gdtrbase(void) #endif } -#ifdef __FreeBSD__ -struct pcpu; -extern struct pcpu __pcpu[]; -#endif - static __inline uint64_t vmm_get_host_gsbase(void) { #ifdef __FreeBSD__ - return ((uint64_t)&__pcpu[curcpu]); + return ((uint64_t)get_pcpu()); #else return (rdmsr(MSR_GSBASE)); #endif diff --git a/usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c b/usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c index ea96cd8db0..4a4fb07eba 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c @@ -91,6 +91,7 @@ enum { VIE_OP_TYPE_BITTEST, VIE_OP_TYPE_TWOB_GRP15, VIE_OP_TYPE_ADD, + VIE_OP_TYPE_TEST, VIE_OP_TYPE_LAST }; @@ -235,6 +236,12 @@ static const struct vie_op one_byte_opcodes[256] = { .op_byte = 0x8F, .op_type = VIE_OP_TYPE_POP, }, + [0xF7] = { + /* XXX Group 3 extended opcode - not just TEST */ + .op_byte = 0xF7, + .op_type = VIE_OP_TYPE_TEST, + .op_flags = VIE_OP_F_IMM, + }, [0xFF] = { /* XXX Group 5 extended opcode - not just PUSH */ .op_byte = 0xFF, @@ -465,6 +472,41 @@ getaddflags(int opsize, uint64_t x, uint64_t y) return (getaddflags64(x, y)); } +/* + * Return the status flags that would result from doing (x & y). + */ +#define GETANDFLAGS(sz) \ +static u_long \ +getandflags##sz(uint##sz##_t x, uint##sz##_t y) \ +{ \ + u_long rflags; \ + \ + __asm __volatile("and %2,%1; pushfq; popq %0" : \ + "=r" (rflags), "+r" (x) : "m" (y)); \ + return (rflags); \ +} struct __hack + +GETANDFLAGS(8); +GETANDFLAGS(16); +GETANDFLAGS(32); +GETANDFLAGS(64); + +static u_long +getandflags(int opsize, uint64_t x, uint64_t y) +{ + KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8, + ("getandflags: invalid operand size %d", opsize)); + + if (opsize == 1) + return (getandflags8(x, y)); + else if (opsize == 2) + return (getandflags16(x, y)); + else if (opsize == 4) + return (getandflags32(x, y)); + else + return (getandflags64(x, y)); +} + static int emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) @@ -1233,6 +1275,55 @@ emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, return (error); } +static int +emulate_test(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +{ + int error, size; + uint64_t op1, rflags, rflags2; + + size = vie->opsize; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0xF7: + /* + * F7 /0 test r/m16, imm16 + * F7 /0 test r/m32, imm32 + * REX.W + F7 /0 test r/m64, imm32 sign-extended to 64 + * + * Test mem (ModRM:r/m) with immediate and set status + * flags according to the results. The comparison is + * performed by anding the immediate from the first + * operand and then setting the status flags. + */ + if ((vie->reg & 7) != 0) + return (EINVAL); + + error = memread(vm, vcpuid, gpa, &op1, size, arg); + if (error) + return (error); + + rflags2 = getandflags(size, op1, vie->immediate); + break; + default: + return (EINVAL); + } + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + if (error) + return (error); + + /* + * OF and CF are cleared; the SF, ZF and PF flags are set according + * to the result; AF is undefined. 
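+ * A 16-bit example: TEST of value 0x8000 against mask 0x8000
+ * leaves 0x8000, so SF is set; against mask 0x7fff the result
+ * is 0, setting ZF instead.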
+ */ + rflags &= ~RFLAGS_STATUS_BITS; + rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); + return (error); +} + static int emulate_add(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) @@ -1658,6 +1749,10 @@ vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, error = emulate_add(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; + case VIE_OP_TYPE_TEST: + error = emulate_test(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; default: error = EINVAL; break; diff --git a/usr/src/uts/i86pc/io/vmm/vmm_lapic.c b/usr/src/uts/i86pc/io/vmm/vmm_lapic.c index 43b2bebe97..57e4cfddf3 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_lapic.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_lapic.c @@ -149,13 +149,10 @@ lapic_intr_msi(struct vm *vm, uint64_t addr, uint64_t msg) return (0); } -static boolean_t +static bool x2apic_msr(u_int msr) { - if (msr >= 0x800 && msr <= 0xBFF) - return (TRUE); - else - return (FALSE); + return (msr >= 0x800 && msr <= 0xBFF); } static u_int @@ -165,14 +162,11 @@ x2apic_msr_to_regoff(u_int msr) return ((msr - 0x800) << 4); } -boolean_t +bool lapic_msr(u_int msr) { - if (x2apic_msr(msr) || (msr == MSR_APICBASE)) - return (TRUE); - else - return (FALSE); + return (x2apic_msr(msr) || msr == MSR_APICBASE); } int diff --git a/usr/src/uts/i86pc/io/vmm/vmm_lapic.h b/usr/src/uts/i86pc/io/vmm/vmm_lapic.h index da3b0ff660..58508ad70b 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_lapic.h +++ b/usr/src/uts/i86pc/io/vmm/vmm_lapic.h @@ -45,7 +45,7 @@ struct vm; -boolean_t lapic_msr(u_int num); +bool lapic_msr(u_int num); int lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval, bool *retu); int lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t wval, diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c index 66a67d9529..ddae4202b7 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c @@ -982,35 +982,19 @@ vm_segmap_space(struct vmspace *vms, off_t off, struct as *as, caddr_t *addrp, } void -vm_page_lock(vm_page_t vmp) +vm_page_unwire(vm_page_t vmp, uint8_t nqueue __unused) { ASSERT(!MUTEX_HELD(&vmp->vmp_lock)); - mutex_enter(&vmp->vmp_lock); -} - -void -vm_page_unlock(vm_page_t vmp) -{ - boolean_t purge = (vmp->vmp_pfn == PFN_INVALID); - - ASSERT(MUTEX_HELD(&vmp->vmp_lock)); - - mutex_exit(&vmp->vmp_lock); - if (purge) { - mutex_destroy(&vmp->vmp_lock); - kmem_free(vmp, sizeof (*vmp)); - } -} - -void -vm_page_unhold(vm_page_t vmp) -{ - ASSERT(MUTEX_HELD(&vmp->vmp_lock)); VERIFY(vmp->vmp_pfn != PFN_INVALID); vm_object_deallocate(vmp->vmp_obj_held); vmp->vmp_obj_held = NULL; vmp->vmp_pfn = PFN_INVALID; + + mutex_exit(&vmp->vmp_lock); + + mutex_destroy(&vmp->vmp_lock); + kmem_free(vmp, sizeof (*vmp)); } diff --git a/usr/src/uts/i86pc/io/vmm/vmm_util.c b/usr/src/uts/i86pc/io/vmm/vmm_util.c index 3eadfe57e5..b8acff9bbc 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_util.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_util.c @@ -50,26 +50,20 @@ __FBSDID("$FreeBSD$"); #include "vmm_util.h" -boolean_t +bool vmm_is_intel(void) { - if (strcmp(cpu_vendor, "GenuineIntel") == 0) - return (TRUE); - else - return (FALSE); + return (strcmp(cpu_vendor, "GenuineIntel") == 0); } -boolean_t +bool vmm_is_amd(void) { - if (strcmp(cpu_vendor, "AuthenticAMD") == 0) - return (TRUE); - else - return (FALSE); + return (strcmp(cpu_vendor, "AuthenticAMD") == 0); } -boolean_t 
+bool vmm_supports_1G_pages(void) { unsigned int regs[4]; @@ -82,9 +76,9 @@ vmm_supports_1G_pages(void) if (cpu_exthigh >= 0x80000001) { do_cpuid(0x80000001, regs); if (regs[3] & (1 << 26)) - return (TRUE); + return (true); } - return (FALSE); + return (false); } #ifdef __FreeBSD__ diff --git a/usr/src/uts/i86pc/io/vmm/vmm_util.h b/usr/src/uts/i86pc/io/vmm/vmm_util.h index fc7e7364c7..8c65e7e3a6 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_util.h +++ b/usr/src/uts/i86pc/io/vmm/vmm_util.h @@ -33,9 +33,9 @@ struct trapframe; -boolean_t vmm_is_intel(void); -boolean_t vmm_is_amd(void); -boolean_t vmm_supports_1G_pages(void); +bool vmm_is_intel(void); +bool vmm_is_amd(void); +bool vmm_supports_1G_pages(void); void dump_trapframe(struct trapframe *tf); diff --git a/usr/src/uts/i86pc/sys/vmm.h b/usr/src/uts/i86pc/sys/vmm.h index ac8f14b042..0bbc219b7f 100644 --- a/usr/src/uts/i86pc/sys/vmm.h +++ b/usr/src/uts/i86pc/sys/vmm.h @@ -127,10 +127,39 @@ enum x2apic_state { #define VM_INTINFO_HWEXCEPTION (3 << 8) #define VM_INTINFO_SWINTR (4 << 8) - -#define VM_MAX_NAMELEN 32 +#ifndef __FreeBSD__ +/* + * illumos doesn't have a limitation based on SPECNAMELEN like FreeBSD does. + * Instead of picking an arbitrary value we will just rely on the same + * calculation that's made below. If this calculation ever changes we need to + * update the the VM_MAX_NAMELEN mapping in the bhyve brand's boot.c file. + */ +#else +/* + * The VM name has to fit into the pathname length constraints of devfs, + * governed primarily by SPECNAMELEN. The length is the total number of + * characters in the full path, relative to the mount point and not + * including any leading '/' characters. + * A prefix and a suffix are added to the name specified by the user. + * The prefix is usually "vmm/" or "vmm.io/", but can be a few characters + * longer for future use. + * The suffix is a string that identifies a bootrom image or some similar + * image that is attached to the VM. A separator character gets added to + * the suffix automatically when generating the full path, so it must be + * accounted for, reducing the effective length by 1. + * The effective length of a VM name is 229 bytes for FreeBSD 13 and 37 + * bytes for FreeBSD 12. A minimum length is set for safety and supports + * a SPECNAMELEN as small as 32 on old systems. + */ +#endif +#define VM_MAX_PREFIXLEN 10 +#define VM_MAX_SUFFIXLEN 15 +#define VM_MIN_NAMELEN 6 +#define VM_MAX_NAMELEN \ + (SPECNAMELEN - VM_MAX_PREFIXLEN - VM_MAX_SUFFIXLEN - 1) #ifdef _KERNEL +CTASSERT(VM_MAX_NAMELEN >= VM_MIN_NAMELEN); struct vm; struct vm_exception; @@ -309,12 +338,12 @@ vcpu_reqidle(struct vm_eventinfo *info) int vcpu_debugged(struct vm *vm, int vcpuid); /* - * Return 1 if device indicated by bus/slot/func is supposed to be a + * Return true if device indicated by bus/slot/func is supposed to be a * pci passthrough device. * - * Return 0 otherwise. + * Return false otherwise. 
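+ * (The kernel environment variables this consults exist only on
+ * FreeBSD, so the implementation in vmm.c is compiled under
+ * __FreeBSD__ alone.)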
*/ -int vmm_is_pptdev(int bus, int slot, int func); +bool vmm_is_pptdev(int bus, int slot, int func); void *vm_iommu_domain(struct vm *vm); -- cgit v1.2.3 From aee33e58f237f0d3cfeab24295f491937f0f7c79 Mon Sep 17 00:00:00 2001 From: Patrick Mooney Date: Wed, 20 May 2020 19:34:30 +0000 Subject: 12766 clean up bhyve lints Reviewed by: Andy Fiddaman Approved by: Robert Mustacchi --- usr/src/cmd/bhyve/Makefile | 4 --- usr/src/cmd/bhyve/test/Makefile.com | 2 -- usr/src/cmd/bhyve/test/Makefile.subdirs | 3 +- usr/src/cmd/bhyve/test/Makefile.targ | 5 ---- usr/src/cmd/bhyvectl/Makefile | 2 -- usr/src/cmd/pptadm/Makefile | 3 -- usr/src/lib/libppt/Makefile | 3 +- usr/src/lib/libppt/Makefile.com | 11 +------ usr/src/lib/libppt/common/llib-lppt | 19 ------------ usr/src/lib/libvmm/Makefile | 3 +- usr/src/lib/libvmm/Makefile.com | 2 -- usr/src/lib/libvmmapi/Makefile | 3 +- usr/src/lib/libvmmapi/Makefile.com | 6 +--- usr/src/lib/libvmmapi/common/llib-lvmmapi | 2 -- usr/src/uts/i86pc/io/vmm/amd/svm_support.s | 14 --------- usr/src/uts/i86pc/io/vmm/intel/vmx_support.s | 29 +------------------ usr/src/uts/i86pc/ppt/Makefile | 8 ------ usr/src/uts/i86pc/viona/Makefile | 13 --------- usr/src/uts/i86pc/vmm/Makefile | 43 ---------------------------- 19 files changed, 7 insertions(+), 168 deletions(-) delete mode 100644 usr/src/lib/libppt/common/llib-lppt delete mode 100644 usr/src/lib/libvmmapi/common/llib-lvmmapi (limited to 'usr/src/uts/i86pc') diff --git a/usr/src/cmd/bhyve/Makefile b/usr/src/cmd/bhyve/Makefile index eb7c4def30..f9e8605b4e 100644 --- a/usr/src/cmd/bhyve/Makefile +++ b/usr/src/cmd/bhyve/Makefile @@ -26,7 +26,6 @@ all := TARGET = all install := TARGET = install clean := TARGET = clean clobber := TARGET = clobber -lint := TARGET = lint SRCS = acpi.c \ atkbdc.c \ @@ -129,7 +128,6 @@ SMOFF += all_func_returns,leaks,no_if_block # Force c99 for everything CSTD= $(CSTD_GNU99) C99MODE= -xc99=%all -C99LMODE= -Xc99=%all $(PROG) := LDLIBS += -lsocket -lnsl -ldlpi -ldladm -lmd -luuid -lvmmapi -lz $(MEVENT_TEST_PROG) := LDLIBS += -lsocket @@ -153,8 +151,6 @@ clean: $(SUBDIRS) clobber: clean $(SUBDIRS) $(RM) $(CLOBBERFILES) -lint: lint_SRCS $(SUBDIRS) - $(SUBDIRS): FRC @cd $@; pwd; $(MAKE) $(TARGET) diff --git a/usr/src/cmd/bhyve/test/Makefile.com b/usr/src/cmd/bhyve/test/Makefile.com index f5efacc510..a1a2d79db1 100644 --- a/usr/src/cmd/bhyve/test/Makefile.com +++ b/usr/src/cmd/bhyve/test/Makefile.com @@ -22,7 +22,6 @@ include $(SRC)/cmd/Makefile.cmd.64 # CSTD= $(CSTD_GNU99) C99MODE= -xc99=%all -C99LMODE= -Xc99=%all CFLAGS += $(CCVERBOSE) -_gcc=-Wimplicit-function-declaration \ -_gcc=-Wno-parentheses @@ -58,4 +57,3 @@ ROOTOUT = $(OUTFILES:%=$(ROOTTSTDIR)/%) ROOTTESTS = $(ROOTTSTEXES) $(ROOTTSTSH) $(ROOTOUT) FILEMODE = 0555 LDLIBS = $(LDLIBS.cmd) -LINTEXE = $(EXETESTS:%.exe=%.exe.ln) diff --git a/usr/src/cmd/bhyve/test/Makefile.subdirs b/usr/src/cmd/bhyve/test/Makefile.subdirs index 45f0aa67fa..ff7d6c91cb 100644 --- a/usr/src/cmd/bhyve/test/Makefile.subdirs +++ b/usr/src/cmd/bhyve/test/Makefile.subdirs @@ -19,9 +19,8 @@ all := TARGET += all clean := TARGET += clean clobber := TARGET += clobber install := TARGET += install -lint := TARGET += lint -all clean clobber install lint: $(SUBDIRS) +all clean clobber install: $(SUBDIRS) $(SUBDIRS): FRC @cd $@; pwd; $(MAKE) $(TARGET) diff --git a/usr/src/cmd/bhyve/test/Makefile.targ b/usr/src/cmd/bhyve/test/Makefile.targ index e3ec55cfdb..13d28a0630 100644 --- a/usr/src/cmd/bhyve/test/Makefile.targ +++ b/usr/src/cmd/bhyve/test/Makefile.targ @@ -43,11 +43,6 @@ 
$(ROOTTSTDIR)/%.exe: %.exe $(ROOTTSTDIR) all: install -%.exe.ln: %.c $(SUPOBJS) - $(LINT.c) $< $(LDLIBS) - -lint: $(LINTEXE) - clean: -$(RM) *.o $(CLEANFILES) diff --git a/usr/src/cmd/bhyvectl/Makefile b/usr/src/cmd/bhyvectl/Makefile index 0a8a96cfc9..bbac3dd0cc 100644 --- a/usr/src/cmd/bhyvectl/Makefile +++ b/usr/src/cmd/bhyvectl/Makefile @@ -51,8 +51,6 @@ install: all $(ROOTUSRSBINPROG) clean: $(RM) $(OBJS) $(CLEANFILES) -lint: lint_SRCS - include ../Makefile.targ %.o: $(CONTRIB)/freebsd/lib/libutil/%.c diff --git a/usr/src/cmd/pptadm/Makefile b/usr/src/cmd/pptadm/Makefile index 3be558a7a0..7094f8563c 100644 --- a/usr/src/cmd/pptadm/Makefile +++ b/usr/src/cmd/pptadm/Makefile @@ -21,7 +21,6 @@ include ../Makefile.ctf LDLIBS += -lofmt -lppt -lnvpair CSTD = $(CSTD_GNU99) -C99LMODE = -Xc99=%all CLEANFILES += $(OBJS) @@ -34,8 +33,6 @@ install: all $(ROOTUSRSBINPROG) clean: -$(RM) $(CLEANFILES) -lint: lint_SRCS - %.o: ../%.c $(COMPILE.c) $< $(POST_PROCESS_O) diff --git a/usr/src/lib/libppt/Makefile b/usr/src/lib/libppt/Makefile index 21c26d447e..d8f34163ab 100644 --- a/usr/src/lib/libppt/Makefile +++ b/usr/src/lib/libppt/Makefile @@ -24,11 +24,10 @@ all := TARGET= all clean := TARGET= clean clobber := TARGET= clobber install := TARGET= install -lint := TARGET= lint .KEEP_STATE: -all clean clobber install lint: $(SUBDIRS) +all clean clobber install: $(SUBDIRS) install_h: $(ROOTHDRS) diff --git a/usr/src/lib/libppt/Makefile.com b/usr/src/lib/libppt/Makefile.com index 7b2ff4885f..3d5e96f436 100644 --- a/usr/src/lib/libppt/Makefile.com +++ b/usr/src/lib/libppt/Makefile.com @@ -22,25 +22,16 @@ include $(SRC)/lib/Makefile.lib SRCDIR = ../common -LIBS = $(DYNLIB) $(LINTLIB) +LIBS = $(DYNLIB) SRCS = $(SRCDIR)/libppt.c CSTD= $(CSTD_GNU99) C99LMODE= -Xc99=%all -# -# lint doesn't like %4s in sscanf(). -# -LINTFLAGS += -erroff=E_BAD_FORMAT_ARG_TYPE2 -LINTFLAGS64 += -erroff=E_BAD_FORMAT_ARG_TYPE2 - -$(LINTLIB) := SRCS = $(SRCDIR)/$(LINTSRC) LDLIBS += -lpcidb -ldevinfo -lcmdutils -lnvpair -lc .KEEP_STATE: all: $(LIBS) -lint: lintcheck - include $(SRC)/lib/Makefile.targ diff --git a/usr/src/lib/libppt/common/llib-lppt b/usr/src/lib/libppt/common/llib-lppt deleted file mode 100644 index dadd992a31..0000000000 --- a/usr/src/lib/libppt/common/llib-lppt +++ /dev/null @@ -1,19 +0,0 @@ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - */ - -/* - * Copyright 2018 Joyent, Inc. 
- */
-
-/* LINTLIBRARY */
-/* PROTOLIB1 */
-
-#include

diff --git a/usr/src/lib/libvmm/Makefile b/usr/src/lib/libvmm/Makefile
index 66bd60eb46..0aae533bb0 100644
--- a/usr/src/lib/libvmm/Makefile
+++ b/usr/src/lib/libvmm/Makefile
@@ -26,11 +26,10 @@ all:= TARGET= all
 install:= TARGET= install
 clean:= TARGET= clean
 clobber:= TARGET= clobber
-lint:= TARGET= lint

 .KEEP_STATE:

-all install clean clobber lint: $(SUBDIRS)
+all install clean clobber: $(SUBDIRS)

 install_h: $(ROOTHDRS)

 check: $(CHECKHDRS)

diff --git a/usr/src/lib/libvmm/Makefile.com b/usr/src/lib/libvmm/Makefile.com
index d85abae8ce..bef555aed3 100644
--- a/usr/src/lib/libvmm/Makefile.com
+++ b/usr/src/lib/libvmm/Makefile.com
@@ -37,8 +37,6 @@ LDLIBS += -lc -lvmmapi

 all: $(LIBS)

-lint: lintcheck
-
 pics/%.o: $(SRC)/common/list/%.c
 	$(COMPILE.c) -o $@ $<
 	$(POST_PROCESS_O)

diff --git a/usr/src/lib/libvmmapi/Makefile b/usr/src/lib/libvmmapi/Makefile
index 233fcd5edb..d002a65297 100644
--- a/usr/src/lib/libvmmapi/Makefile
+++ b/usr/src/lib/libvmmapi/Makefile
@@ -27,12 +27,11 @@ all:= TARGET= all
 install:= TARGET= install
 clean:= TARGET= clean
 clobber:= TARGET= clobber
-lint:= TARGET= lint
 _msg:= TARGET= _msg

 .KEEP_STATE:

-all install clean clobber lint: $(SUBDIRS)
+all install clean clobber: $(SUBDIRS)

 # install rule for install_h target

diff --git a/usr/src/lib/libvmmapi/Makefile.com b/usr/src/lib/libvmmapi/Makefile.com
index 34240f4331..1653e8619c 100644
--- a/usr/src/lib/libvmmapi/Makefile.com
+++ b/usr/src/lib/libvmmapi/Makefile.com
@@ -27,7 +27,7 @@ include ../../Makefile.rootfs

 SRCDIR = ../common

-LIBS = $(DYNLIB) $(LINTLIB)
+LIBS = $(DYNLIB)

 CPPFLAGS = -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd \
 	$(CPPFLAGS.master) -I$(SRC)/uts/i86pc
@@ -35,16 +35,12 @@ CPPFLAGS = -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd \
 # not linted
 SMATCH=off

-$(LINTLIB) := SRCS = $(SRCDIR)/$(LINTSRC)
-
 LDLIBS += -lc

 .KEEP_STATE:

 all: $(LIBS)

-lint: lintcheck
-
 pics/%.o: $(CONTRIB)/freebsd/lib/libutil/%.c
 	$(COMPILE.c) -o $@ $<
 	$(POST_PROCESS_O)

diff --git a/usr/src/lib/libvmmapi/common/llib-lvmmapi b/usr/src/lib/libvmmapi/common/llib-lvmmapi
deleted file mode 100644
index 221ed3a23e..0000000000
--- a/usr/src/lib/libvmmapi/common/llib-lvmmapi
+++ /dev/null
@@ -1,2 +0,0 @@
-/* LINTLIBRARY */
-/* PROTOLIB1 */

diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm_support.s b/usr/src/uts/i86pc/io/vmm/amd/svm_support.s
index fad994b09c..27ef1a04af 100644
--- a/usr/src/uts/i86pc/io/vmm/amd/svm_support.s
+++ b/usr/src/uts/i86pc/io/vmm/amd/svm_support.s
@@ -36,18 +36,6 @@

 /* Porting note: This is named 'svm_support.S' upstream. */

-#if defined(lint)
-
-struct svm_regctx;
-struct cpu;
-
-/*ARGSUSED*/
-void
-svm_launch(uint64_t pa, struct svm_regctx *gctx, struct cpu *cpu)
-{}
-
-#else /* lint */
-
 #define VMLOAD .byte 0x0f, 0x01, 0xda
 #define VMRUN .byte 0x0f, 0x01, 0xd8
 #define VMSAVE .byte 0x0f, 0x01, 0xdb
@@ -160,5 +148,3 @@ ENTRY_NP(svm_launch)
 	popq %rbp
 	ret
 	SET_SIZE(svm_launch)
-
-#endif /* lint */

diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s b/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s
index 0130f88dd6..f719e31e30 100644
--- a/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s
+++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s
@@ -45,31 +45,6 @@

 /* Porting note: This is named 'vmx_support.S' upstream.
 */
-
-
-#if defined(lint)
-
-struct vmxctx;
-struct vmx;
-
-/*ARGSUSED*/
-void
-vmx_launch(struct vmxctx *ctx)
-{}
-
-void
-vmx_exit_guest()
-{}
-
-/*ARGSUSED*/
-int
-vmx_enter_guest(struct vmxctx *ctx, struct vmx *vmx, int launched)
-{
-	return (0);
-}
-
-#else /* lint */
-
 #include "vmx_assym.h"
 #include "vmcs.h"
@@ -155,7 +130,7 @@ vmx_enter_guest(struct vmxctx *ctx, struct vmx *vmx, int launched)
 #define VMXSTKSIZE VMXSTK_FP

 /*
- * vmx_enter_guest(struct vmxctx *vmxctx, int launched)
+ * vmx_enter_guest(struct vmxctx *ctx, struct vmx *vmx, int launched)
  * Interrupts must be disabled on entry.
  */
 ENTRY_NP(vmx_enter_guest)
@@ -380,5 +355,3 @@ ENTRY_NP(vmx_call_isr)
 	popq %rbp
 	ret
 	SET_SIZE(vmx_call_isr)
-
-#endif /* lint */

diff --git a/usr/src/uts/i86pc/ppt/Makefile b/usr/src/uts/i86pc/ppt/Makefile
index f231dfddf6..7c41368efd 100644
--- a/usr/src/uts/i86pc/ppt/Makefile
+++ b/usr/src/uts/i86pc/ppt/Makefile
@@ -24,7 +24,6 @@ UTSBASE = ../..
 #
 MODULE = ppt
 OBJECTS = $(PPT_OBJS:%=$(OBJS_DIR)/%)
-LINTS = $(PPT_OBJS:%.o=$(LINTS_DIR)/%.ln)
 ROOTMODULE = $(USR_DRV_DIR)/$(MODULE)
 CONF_SRCDIR = $(UTSBASE)/i86pc/io/vmm/io
 MAPFILE = $(UTSBASE)/i86pc/io/vmm/io/ppt.mapfile
@@ -38,7 +37,6 @@ include $(UTSBASE)/i86pc/Makefile.i86pc
 # Define targets
 #
 ALL_TARGET = $(BINARY)
-LINT_TARGET = $(MODULE).lint
 INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE)

 #
@@ -72,12 +70,6 @@ clean: $(CLEAN_DEPS)

 clobber: $(CLOBBER_DEPS)

-lint: $(LINT_DEPS)
-
-modlintlib: $(MODLINTLIB_DEPS)
-
-clean.lint: $(CLEAN_LINT_DEPS)
-
 install: $(INSTALL_DEPS)

 #

diff --git a/usr/src/uts/i86pc/viona/Makefile b/usr/src/uts/i86pc/viona/Makefile
index dac59c9a45..b7f0fd6f53 100644
--- a/usr/src/uts/i86pc/viona/Makefile
+++ b/usr/src/uts/i86pc/viona/Makefile
@@ -24,7 +24,6 @@ UTSBASE = ../..
 #
 MODULE = viona
 OBJECTS = $(VIONA_OBJS:%=$(OBJS_DIR)/%)
-LINTS = $(VIONA_OBJS:%.o=$(LINTS_DIR)/%.ln)
 ROOTMODULE = $(USR_DRV_DIR)/$(MODULE)
 CONF_SRCDIR = $(UTSBASE)/i86pc/io/viona
 MAPFILE = $(UTSBASE)/i86pc/io/viona/viona.mapfile
@@ -38,17 +37,11 @@ include $(UTSBASE)/i86pc/Makefile.i86pc
 # Define targets
 #
 ALL_TARGET = $(BINARY) $(SRC_CONFILE)
-LINT_TARGET = $(MODULE).lint
 INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE)

 #
 # Overrides
 #
-LINTTAGS += -erroff=E_ASSIGN_NARROW_CONV
-LINTTAGS += -erroff=E_FUNC_ARG_UNUSED
-LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN
-LINTTAGS += -erroff=E_FUNC_RET_MAYBE_IGNORED2
-LINTTAGS += -erroff=E_FUNC_RET_ALWAYS_IGNOR2

 # needs work
 SMOFF += all_func_returns
@@ -74,12 +67,6 @@ clean: $(CLEAN_DEPS)

 clobber: $(CLOBBER_DEPS)

-lint: $(LINT_DEPS)
-
-modlintlib: $(MODLINTLIB_DEPS)
-
-clean.lint: $(CLEAN_LINT_DEPS)
-
 install: $(INSTALL_DEPS)

 #

diff --git a/usr/src/uts/i86pc/vmm/Makefile b/usr/src/uts/i86pc/vmm/Makefile
index d5dc8d7124..c55abf6090 100644
--- a/usr/src/uts/i86pc/vmm/Makefile
+++ b/usr/src/uts/i86pc/vmm/Makefile
@@ -24,7 +24,6 @@ UTSBASE = ../..
 #
 MODULE = vmm
 OBJECTS = $(VMM_OBJS:%=$(OBJS_DIR)/%)
-LINTS = $(VMM_OBJS:%.o=$(LINTS_DIR)/%.ln)
 ROOTMODULE = $(USR_DRV_DIR)/$(MODULE)
 CONF_SRCDIR = $(UTSBASE)/i86pc/io/vmm
 MAPFILE = $(UTSBASE)/i86pc/io/vmm/vmm.mapfile
@@ -38,42 +37,11 @@ include $(UTSBASE)/i86pc/Makefile.i86pc
 # Define targets
 #
 ALL_TARGET = $(BINARY)
-LINT_TARGET = $(MODULE).lint
 INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE)

 #
 # Overrides and additions
 #
-LINTTAGS += -erroff=E_EMPTY_DECLARATION
-LINTTAGS += -erroff=E_OPERANDS_INCOMPATIBLE_TYPES
-LINTTAGS += -erroff=E_VOID_CANT_RETURN_VALUE
-LINTTAGS += -erroff=E_YACC_ERROR
-LINTTAGS += -erroff=E_STATIC_UNUSED
-LINTTAGS += -erroff=E_FUNC_RET_MAYBE_IGNORED2
-LINTTAGS += -erroff=E_FUNC_RET_ALWAYS_IGNOR2
-LINTTAGS += -erroff=E_BAD_FORMAT_ARG_TYPE2
-LINTTAGS += -erroff=E_FUNC_ARG_UNUSED
-LINTTAGS += -erroff=E_FUNC_SET_NOT_USED
-LINTTAGS += -erroff=E_ASSIGN_NARROW_CONV
-LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN
-LINTTAGS += -erroff=E_CONSTANT_CONDITION
-LINTTAGS += -erroff=E_PTR_TO_VOID_IN_ARITHMETIC
-LINTTAGS += -erroff=E_CONST_TRUNCATED_BY_ASSIGN
-LINTTAGS += -erroff=E_NOP_ELSE_STMT
-LINTTAGS += -erroff=E_FUNC_NO_RET_VAL
-LINTTAGS += -erroff=E_OLD_STYLE_DECL_OR_BAD_TYPE
-LINTTAGS += -erroff=E_VAR_USED_BEFORE_SET
-LINTTAGS += -erroff=E_INTEGER_OVERFLOW_DETECTED
-LINTTAGS += -erroff=E_STMT_NOT_REACHED
-LINTTAGS += -erroff=E_FUNC_NO_RET_VAL
-LINTTAGS += -erroff=E_USELESS_DECLARATION
-LINTTAGS += -erroff=E_EXPR_NULL_EFFECT
-LINTTAGS += -erroff=E_CASE_FALLTHRU
-LINTTAGS += -erroff=E_FUNC_DECL_VAR_ARG2
-LINTTAGS += -erroff=E_ASM_IMPOSSIBLE_CONSTRAINT
-LINTTAGS += -erroff=E_ASM_UNUSED_PARAM
-LINTTAGS += -erroff=E_NOP_IF_STMT
-LINTTAGS += -erroff=E_ZERO_OR_NEGATIVE_SUBSCRIPT

 CERRWARN += -_gcc=-Wno-empty-body

@@ -86,11 +54,6 @@ $(OBJS_DIR)/vmm_sol_dev.o := SMOFF += signed_integer_overflow_check

 # a can't happen: vmx_setcap() warn: variable dereferenced before check 'pptr'
 $(OBJS_DIR)/vmx.o := SMOFF += deref_check

-# These sources only compile with gcc. Workaround a confluence of cruft
-# regarding dmake and shadow compilation by neutering the sun compiler.
-#amd64_CC = $(ONBLD_TOOLS)/bin/$(MACH)/cw -_gcc
-#CFLAGS += -_cc=-xdryrun
-
 ALL_BUILDS = $(ALL_BUILDSONLY64)
 DEF_BUILDS = $(DEF_BUILDSONLY64)
 PRE_INC_PATH = -I$(COMPAT)/freebsd -I$(COMPAT)/freebsd/amd64 \
@@ -133,12 +96,6 @@ clean: $(CLEAN_DEPS)

 clobber: $(CLOBBER_DEPS)

-lint: $(LINT_DEPS)
-
-modlintlib: $(MODLINTLIB_DEPS)
-
-clean.lint: $(CLEAN_LINT_DEPS)
-
 install: $(INSTALL_DEPS)

 #
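[Editor's note] The assembly hunks above retire a long-standing illumos idiom: lint could not parse assembler, so each .s file carried C stubs under #if defined(lint) that matched the prototypes of its entry points, with the real code in the #else branch. With lint gone (smatch, via the SMOFF overrides kept in these Makefiles, now fills the static-analysis role), only the assembler branch survives. A minimal sketch of the deleted pattern follows; demo_add is a hypothetical routine for illustration, not the svm_launch or vmx_enter_guest entry points touched by this patch.

#include <sys/types.h>
#include <sys/asm_linkage.h>

#if defined(lint)

/*
 * Seen only by lint: a C definition matching the prototype of the
 * assembler routine below, so that callers could be type-checked.
 * (demo_add is a hypothetical name, not part of this patch.)
 */
/*ARGSUSED*/
uint64_t
demo_add(uint64_t a, uint64_t b)
{
	return (0);
}

#else /* lint */

	ENTRY_NP(demo_add)
	movq	%rdi, %rax
	addq	%rsi, %rax	/* return a + b per the SysV AMD64 ABI */
	ret
	SET_SIZE(demo_add)

#endif /* lint */

Deleting the stub branch wholesale, as the hunks above do for svm_support.s and vmx_support.s, leaves each file equivalent to its #else contents alone.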