-rw-r--r-- | kvm.c | 6019
-rw-r--r-- | kvm_host.h | 87
-rw-r--r-- | kvm_mmu.c | 38
-rw-r--r-- | kvm_mmu.h | 1
-rw-r--r-- | kvm_subr.c | 165
-rw-r--r-- | kvm_vmx.c | 32
-rw-r--r-- | kvm_x86.c | 5419
-rw-r--r-- | kvm_x86host.h | 5
-rw-r--r-- | kvm_x86impl.h | 59
9 files changed, 5811 insertions, 6014 deletions
@@ -49,6 +49,7 @@ #include "kvm_apicdef.h" #include "kvm_iodev.h" #include "kvm.h" +#include "kvm_x86impl.h" #include "kvm_irq.h" #include "kvm_tss.h" #include "kvm_ioapic.h" @@ -56,9 +57,6 @@ #include "kvm_i8254.h" #include "kvm_mmu.h" #include "kvm_cache_regs.h" -#include "kvm_x86impl.h" -#include "kvm_lapic.h" -#include "kvm_vmx.h" #undef DEBUG @@ -66,11 +64,17 @@ * The entire state of the kvm device. */ typedef struct { - struct kvm *kds_kvmp; /* pointer to underlying VM */ - struct kvm_vcpu *kds_vcpu; /* pointer to VCPU */ + struct kvm *kds_kvmp; /* pointer to underlying VM */ + struct kvm_vcpu *kds_vcpu; /* pointer to VCPU */ } kvm_devstate_t; /* + * Globals + */ +page_t *bad_page; +pfn_t bad_pfn; + +/* * Tunables */ static int kvm_hiwat = 0x1000000; @@ -82,9 +86,7 @@ static void *kvm_state; /* DDI state */ static vmem_t *kvm_minor; /* minor number arena */ static dev_info_t *kvm_dip; /* global devinfo hanlde */ static minor_t kvm_base_minor; /* The only minor device that can be opened */ - -static int kvmid; /* monotonically increasing, unique per vm */ - +static int kvmid; /* monotonically increasing, unique per vm */ static int largepages_enabled = 1; static cpuset_t cpus_hardware_enabled; static volatile uint32_t hardware_enable_failed; @@ -92,817 +94,254 @@ static int kvm_usage_count; static list_t vm_list; static kmutex_t kvm_lock; static int ignore_msrs = 0; - -/* - * Driver forward declarations - */ -static int kvm_open(dev_t *devp, int flag, int otyp, cred_t *cred); -static int kvm_close(dev_t dev, int flag, int otyp, cred_t *cred); -static int kvm_read(dev_t dev, struct uio *uiop, cred_t *credp); -static int kvm_write(dev_t dev, struct uio *uiop, cred_t *credp); -static int kvm_ioctl(dev_t dev, int cmd, intptr_t arg, int md, - cred_t *cred_p, int *rv); -static int kvm_devmap(dev_t dev, devmap_cookie_t dhp, offset_t off, - size_t len, size_t *maplen, uint_t model); -static int kvm_segmap(dev_t, off_t, struct as *, caddr_t *, off_t, - unsigned int, unsigned int, unsigned int, cred_t *); -static int kvm_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, - void **result); -static int kvm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); -static int kvm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); - -static struct cb_ops kvm_cb_ops = { - kvm_open, - kvm_close, /* close */ - nodev, - nodev, - nodev, /* dump */ - nodev, /* read */ - nodev, /* write */ - kvm_ioctl, - kvm_devmap, - nodev, /* mmap */ - kvm_segmap, /* segmap */ - nochpoll, /* poll */ - ddi_prop_op, - NULL, - D_NEW | D_MP | D_DEVMAP -}; -static struct dev_ops kvm_ops = { - DEVO_REV, - 0, - kvm_getinfo, - nulldev, /* identify */ - nulldev, /* probe */ - kvm_attach, - kvm_detach, - nodev, /* reset */ - &kvm_cb_ops, - (struct bus_ops *)0 -}; - -static struct modldrv modldrv = { - &mod_driverops, - "kvm driver v0.1", - &kvm_ops -}; - -static struct modlinkage modlinkage = { - MODREV_1, - { &modldrv, NULL } -}; - -/* XXX */ -static int hardware_enable_all(void); -static void hardware_disable_all(void); -static void kvm_destroy_vm(struct kvm *); -static int kvm_avlmmucmp(const void *, const void *); -extern struct kvm_x86_ops vmx_x86_ops; -extern struct kvm_shared_msrs **shared_msrs; -struct kvm_shared_msrs_global shared_msrs_global; -static void kvm_on_user_return(struct kvm_vcpu *, - struct kvm_user_return_notifier *); -page_t *bad_page; -pfn_t bad_pfn; -struct kvm_x86_ops *kvm_x86_ops; - -inline int -kvm_exception_is_soft(unsigned int nr) -{ - return (nr == BP_VECTOR) || (nr == OF_VECTOR); -} - -/* - * EFER 
defaults: - * - enable syscall per default because its emulated by KVM - * - enable LME and LMA per default on 64 bit KVM - */ -#ifdef CONFIG_X86_64 -static uint64_t efer_reserved_bits = 0xfffffffffffffafeULL; -#else -static uint64_t efer_reserved_bits = 0xfffffffffffffffeULL; -#endif +static unsigned long empty_zero_page[PAGESIZE / sizeof (unsigned long)]; void -kvm_enable_efer_bits(uint64_t mask) +kvm_user_return_notifier_register(struct kvm_vcpu *vcpu, + struct kvm_user_return_notifier *urn) { - efer_reserved_bits &= ~mask; + vcpu->urn = urn; } void -kvm_disable_largepages(void) +kvm_user_return_notifier_unregister(struct kvm_vcpu *vcpu, + struct kvm_user_return_notifier *urn) { - largepages_enabled = 0; + vcpu->urn = NULL; } -int -kvm_arch_hardware_setup(void) +void +kvm_fire_urn(struct kvm_vcpu *vcpu) { - return (kvm_x86_ops->hardware_setup()); + if (vcpu->urn) + vcpu->urn->on_user_return(vcpu, vcpu->urn); } +/* + * Called when we've been asked to save our context. i.e. we're being swapped + * out. + */ void -bitmap_zero(unsigned long *dst, int nbits) +kvm_ctx_save(void *arg) { - int len = BITS_TO_LONGS(nbits) * sizeof (unsigned long); - memset(dst, 0, len); + struct kvm_vcpu *vcpu = arg; + kvm_arch_vcpu_put(vcpu); + kvm_fire_urn(vcpu); } -struct kvm_mmu_page * -page_private(kvm_t *kvmp, page_t *page) +/* + * Called when we're being asked to restore our context. i.e. we're returning + * from being swapped out. + */ +void +kvm_ctx_restore(void *arg) { - kvm_mmu_page_t mp, *res; - mp.kmp_avlspt = (uintptr_t)page; - mutex_enter(&kvmp->kvm_avllock); - res = avl_find(&kvmp->kvm_avlmp, &mp, NULL); - mutex_exit(&kvmp->kvm_avllock); - ASSERT(res != NULL); - return (res); -} + int cpu; -inline struct kvm_mmu_page * -page_header(kvm_t *kvmp, hpa_t shadow_page) -{ - return (page_private(kvmp, pfn_to_page(shadow_page >> PAGESHIFT))); + cpu = CPU->cpu_seqid; + struct kvm_vcpu *vcpu = arg; + kvm_arch_vcpu_load(vcpu, cpu); } -struct kvm_memory_slot * -gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn) -{ - int i; #ifdef XXX_KVM_DECLARATION - struct kvm_memslots *slots = rcu_dereference(kvm->memslots); +#define pfn_valid(pfn) ((pfn < physmax) && (pfn != PFN_INVALID)) #else - struct kvm_memslots *slots = kvm->memslots; +#define pfn_valid(pfn) (pfn != PFN_INVALID) #endif - for (i = 0; i < slots->nmemslots; ++i) { - struct kvm_memory_slot *memslot = &slots->memslots[i]; - - if (gfn >= memslot->base_gfn && - gfn < memslot->base_gfn + memslot->npages) - return (memslot); - } - return (NULL); -} - -gfn_t -unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn) +inline int +kvm_is_mmio_pfn(pfn_t pfn) { - int i; - struct kvm_mem_alias *alias; - struct kvm_mem_aliases *aliases; + if (pfn_valid(pfn)) { #ifdef XXX - aliases = rcu_dereference(kvm->arch.aliases); -#else - XXX_KVM_SYNC_PROBE; - aliases = kvm->arch.aliases; -#endif - - for (i = 0; i < aliases->naliases; i++) { - alias = &aliases->aliases[i]; - if (alias->flags & KVM_ALIAS_INVALID) - continue; - if (gfn >= alias->base_gfn && - gfn < alias->base_gfn + alias->npages) - return (alias->target_gfn + gfn - alias->base_gfn); - } - - return (gfn); -} - -int -kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) -{ - int i; -#ifdef XXX_KVM_DECLARATION - struct kvm_memslots *slots = rcu_dereference(kvm->memslots); + struct page *page = compound_head(pfn_to_page(pfn)); + return (PageReserved(page)); #else - struct kvm_memslots *slots = kvm->memslots; + XXX_KVM_PROBE; #endif - - gfn = unalias_gfn_instantiation(kvm, gfn); - - for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { - 
struct kvm_memory_slot *memslot = &slots->memslots[i]; - - if (memslot->flags & KVM_MEMSLOT_INVALID) - continue; - - if (gfn >= memslot->base_gfn && - gfn < memslot->base_gfn + memslot->npages) { - return (1); - } - } - - return (0); + return (0); + } else + return (1); } /* - * List of msr numbers which we expose to userspace through KVM_GET_MSRS - * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. - * - * This list is modified at module load time to reflect the - * capabilities of the host cpu. This capabilities test skips MSRs that are - * kvm-specific. Those are put in the beginning of the list. + * Switches to specified vcpu, until a matching vcpu_put() */ - -#define MSR_KVM_WALL_CLOCK 0x11 -#define MSR_KVM_SYSTEM_TIME 0x12 - -#define KVM_SAVE_MSRS_BEGIN 5 -static uint32_t msrs_to_save[] = { - MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, - HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, - HV_X64_MSR_APIC_ASSIST_PAGE, - MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, - MSR_K6_STAR, -#ifdef CONFIG_X86_64 - MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, -#endif - MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA -}; - -static unsigned num_msrs_to_save; - -static uint32_t emulated_msrs[] = { - MSR_IA32_MISC_ENABLE, -}; - -uint64_t -native_read_msr_safe(unsigned int msr, int *err) -{ - DECLARE_ARGS(val, low, high); - uint64_t ret = 0; - on_trap_data_t otd; - - if (on_trap(&otd, OT_DATA_ACCESS) == 0) { - ret = native_read_msr(msr); - *err = 0; - } else { - *err = EINVAL; /* XXX probably not right... */ - } - no_trap(); - - return (ret); -} - -/* Can be uninlined because referenced by paravirt */ -int -native_write_msr_safe(unsigned int msr, unsigned low, unsigned high) +void +vcpu_load(struct kvm_vcpu *vcpu) { - int err = 0; - on_trap_data_t otd; - - if (on_trap(&otd, OT_DATA_ACCESS) == 0) { - native_write_msr(msr, low, high); - } else { - err = EINVAL; /* XXX probably not right... */ - } - no_trap(); + int cpu; - return (err); + mutex_enter(&vcpu->mutex); + kpreempt_disable(); + cpu = CPU->cpu_seqid; + installctx(curthread, vcpu, kvm_ctx_save, kvm_ctx_restore, NULL, + NULL, NULL, NULL); + kvm_arch_vcpu_load(vcpu, cpu); + kpreempt_enable(); } -static void -kvm_init_msr_list(void) +void +vcpu_put(struct kvm_vcpu *vcpu) { - uint32_t dummy[2]; - unsigned i, j; - - /* skip the first msrs in the list. KVM-specific */ - for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) { - if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) - continue; - if (j < i) - msrs_to_save[j] = msrs_to_save[i]; - j++; - } - num_msrs_to_save = j; + kpreempt_disable(); + kvm_arch_vcpu_put(vcpu); + kvm_fire_urn(vcpu); + removectx(curthread, vcpu, kvm_ctx_save, kvm_ctx_restore, NULL, + NULL, NULL, NULL); + kpreempt_enable(); + mutex_exit(&vcpu->mutex); } -uint64_t cpu_tsc_khz; -extern uint64_t cpu_freq_hz; - static void -kvm_timer_init(void) +ack_flush(void *_completed) { - int cpu; - - /* - * XXX We assume that any machine running solaris kvm - * has constant time stamp counter increment rate. - * This will be true for all but older machines. 
- */ - /* assume pi_clock in mhz */ - cpu_tsc_khz = (cpu_freq_hz / 1000); } int -kvm_arch_init(void *opaque) +make_all_cpus_request(struct kvm *kvm, unsigned int req) { - int r; - struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; + int i; + cpuset_t set; + processorid_t me, cpu; + struct kvm_vcpu *vcpu; - if (ops->cpu_has_kvm_support()) { - cmn_err(CE_WARN, "kvm: no hardware support\n"); - r = ENOTSUP; - goto out; + CPUSET_ZERO(set); + + mutex_enter(&kvm->requests_lock); + me = curthread->t_cpu->cpu_id; + for (i = 0; i < kvm->online_vcpus; i++) { + vcpu = kvm->vcpus[i]; + if (!vcpu) + break; + if (test_and_set_bit(req, &vcpu->requests)) + continue; + cpu = vcpu->cpu; + if (cpu != -1 && cpu != me) + CPUSET_ADD(set, cpu); } - if (ops->disabled_by_bios()) { - cmn_err(CE_WARN, "kvm: disabled by bios\n"); - r = ENOTSUP; - goto out; + if (CPUSET_ISNULL(set)) + kvm_xcall(KVM_CPUALL, ack_flush, NULL); + else { + kpreempt_disable(); + xc_sync((xc_arg_t) ack_flush, (xc_arg_t) NULL, + 0, CPUSET2BV(set), (xc_func_t) kvm_xcall_func); + kpreempt_enable(); } + mutex_exit(&kvm->requests_lock); - r = kvm_mmu_module_init(); - if (r) - goto out; - - kvm_init_msr_list(); - - kvm_x86_ops = ops; - kvm_mmu_set_nonpresent_ptes(0ull, 0ull); - kvm_mmu_set_base_ptes(PT_PRESENT_MASK); - kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, - PT_DIRTY_MASK, PT64_NX_MASK, 0); - - kvm_timer_init(); - - return (0); - -out: - return (r); + return (1); } -page_t * -alloc_page(size_t size, int flag) +void +kvm_flush_remote_tlbs(struct kvm *kvm) { - caddr_t page_addr; - pfn_t pfn; - page_t *pp; - - if ((page_addr = kmem_zalloc(size, flag)) == NULL) - return ((page_t *)NULL); - - pp = page_numtopp_nolock(hat_getpfnum(kas.a_hat, page_addr)); - return (pp); + if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) + KVM_KSTAT_INC(kvm, kvmks_remote_tlb_flush); } void -kvm_arch_check_processor_compat(void *rtn) +kvm_reload_remote_mmus(struct kvm *kvm) { - kvm_x86_ops->check_processor_compatibility(rtn); + make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); } int -kvm_init(void *opaque) +kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) { int r; - int cpu; - - r = kvm_arch_init(opaque); - - if (r != DDI_SUCCESS) - return (r); - - bad_page = alloc_page(PAGESIZE, KM_SLEEP); - bad_pfn = bad_page->p_pagenum; - -#ifdef XXX - if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { - r = -ENOMEM; - goto out_free_0; - } -#else - XXX_KVM_PROBE; -#endif - r = kvm_arch_hardware_setup(); - - if (r != DDI_SUCCESS) - goto out_free_0a; - -#ifdef XXX - for_each_online_cpu(cpu) { - smp_call_function_single(cpu, - kvm_arch_check_processor_compat, - &r, 1); - if (r < 0) - goto out_free_1; - } -#else - r = 0; - kvm_xcall(KVM_CPUALL, kvm_arch_check_processor_compat, &r); - if (r < 0) - goto out_free_1; - XXX_KVM_PROBE; -#endif - + mutex_init(&vcpu->mutex, NULL, MUTEX_DRIVER, 0); + vcpu->cpu = -1; + vcpu->kvm = kvm; + vcpu->vcpu_id = id; #ifdef XXX - r = register_cpu_notifier(&kvm_cpu_notifier); - if (r) - goto out_free_2; - register_reboot_notifier(&kvm_reboot_notifier); - - r = sysdev_class_register(&kvm_sysdev_class); - if (r) - goto out_free_3; - - r = sysdev_register(&kvm_sysdev); - if (r) - goto out_free_4; + init_waitqueue_head(&vcpu->wq); #else XXX_KVM_PROBE; #endif + vcpu->run = ddi_umem_alloc(PAGESIZE * 2, DDI_UMEM_SLEEP, &vcpu->cookie); -#ifdef XXX - kvm_chardev_ops.owner = module; - kvm_vm_fops.owner = module; - kvm_vcpu_fops.owner = module; + r = kvm_arch_vcpu_init(vcpu); - r = misc_register(&kvm_dev); - if (r) { - 
cmn_err(CE_WARN, "kvm: misc device register failed\n"); - goto out_free; + if (r != 0) { + vcpu->run = NULL; + ddi_umem_free(vcpu->cookie); + return (r); } - /* - * XXX - if kernel preemption occurs, we probably need - * to implement these, and add hooks to the preemption code. - * For right now, we'll make the totally unreasonable - * assumption that we won't be preempted while in the - * kernel, i.e., no realtime threads are running - */ - kvm_preempt_ops.sched_in = kvm_sched_in; - kvm_preempt_ops.sched_out = kvm_sched_out; - - kvm_init_debug(); -#else - XXX_KVM_PROBE; -#endif - return (0); - -out_free: -out_free_5: -#ifdef XXX - sysdev_unregister(&kvm_sysdev); -out_free_4: - sysdev_class_unregister(&kvm_sysdev_class); -out_free_3: - unregister_reboot_notifier(&kvm_reboot_notifier); - unregister_cpu_notifier(&kvm_cpu_notifier); -#else - XXX_KVM_PROBE; -#endif -out_free_2: -out_free_1: -#ifdef XXX - kvm_arch_hardware_unsetup(); -#else - XXX_KVM_PROBE; -#endif -out_free_0a: -#ifdef XXX - free_cpumask_var(cpus_hardware_enabled); -#else - XXX_KVM_PROBE; -#endif -out_free_0: -#ifdef XXX - free_page(bad_page, PAGESIZE); -#else - XXX_KVM_PROBE; -#endif -out: -#ifdef XXX - kvm_arch_exit(); -#else - XXX_KVM_PROBE; -#endif -out_fail: - return (r); } void -kvm_define_shared_msr(unsigned slot, uint32_t msr) -{ - if (slot >= shared_msrs_global.nr) - shared_msrs_global.nr = slot + 1; - shared_msrs_global.msrs[slot] = msr; -#ifdef XXX - /* we need ensured the shared_msr_global have been updated */ - smp_wmb(); -#else - XXX_KVM_SYNC_PROBE; -#endif -} - -int -_init(void) -{ - - return (mod_install(&modlinkage)); -} - -int -_fini(void) -{ - return (mod_remove(&modlinkage)); -} - -int -_info(struct modinfo *modinfop) -{ - return (mod_info(&modlinkage, modinfop)); -} - -static int -kvm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) -{ - minor_t instance; - - if (kpm_enable == 0) { - cmn_err(CE_WARN, "kvm: kpm_enable must be true\n"); - return (DDI_FAILURE); - } - - - if (cmd != DDI_ATTACH) - return (DDI_FAILURE); - - if (kvm_dip != NULL) - return (DDI_FAILURE); - - if (ddi_soft_state_init(&kvm_state, sizeof (kvm_devstate_t), 1) != 0) - return (DDI_FAILURE); - - instance = ddi_get_instance(dip); - if (ddi_create_minor_node(dip, "kvm", - S_IFCHR, instance, DDI_PSEUDO, 0) == DDI_FAILURE) { - ddi_soft_state_fini(&kvm_state); - return (DDI_FAILURE); - } - - mutex_init(&kvm_lock, NULL, MUTEX_DRIVER, 0); - kvm_x86_ops = &vmx_x86_ops; - if (vmx_init() != DDI_SUCCESS) { - ddi_soft_state_fini(&kvm_state); - ddi_remove_minor_node(dip, NULL); - mutex_destroy(&kvm_lock); - return (DDI_FAILURE); - } - - if (hardware_enable_all() != 0) { - ddi_soft_state_fini(&kvm_state); - ddi_remove_minor_node(dip, NULL); - mutex_destroy(&kvm_lock); - vmx_fini(); - return (DDI_FAILURE); - } - - kvm_dip = dip; - kvm_base_minor = instance; - - list_create(&vm_list, sizeof (struct kvm), - offsetof(struct kvm, vm_list)); - kvm_minor = vmem_create("kvm_minor", (void *)1, UINT32_MAX - 1, 1, - NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER); - - ddi_report_dev(dip); - - return (DDI_SUCCESS); -} - -static int -kvm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) -{ - int instance; - - if (cmd != DDI_DETACH) - return (DDI_FAILURE); - - VERIFY(kvm_dip != NULL && kvm_dip == dip); - instance = ddi_get_instance(dip); - VERIFY(instance == kvm_base_minor); - ddi_prop_remove_all(dip); - ddi_remove_minor_node(dip, NULL); - list_destroy(&vm_list); - vmem_destroy(kvm_minor); - kvm_dip = NULL; - - hardware_disable_all(); - mutex_destroy(&kvm_lock); - 
ddi_soft_state_fini(&kvm_state); - vmx_fini(); - - return (DDI_SUCCESS); -} - -/*ARGSUSED*/ -static int -kvm_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) -{ - kvm_devstate_t *rsp; - int error = DDI_FAILURE; - - switch (infocmd) { - case DDI_INFO_DEVT2DEVINFO: - *result = kvm_dip; - break; - - case DDI_INFO_DEVT2INSTANCE: - *result = (void *)((uint64_t)getminor((dev_t)arg)); - error = DDI_SUCCESS; - break; - - default: - break; - } - - return (error); -} - -/*ARGSUSED*/ -static int -kvm_open(dev_t *devp, int flag, int otype, cred_t *credp) +kvm_vcpu_uninit(struct kvm_vcpu *vcpu) { - minor_t minor; - kvm_devstate_t *ksp; - - if (flag & FEXCL || flag & FNDELAY) - return (EINVAL); - - if (otype != OTYP_CHR) - return (EINVAL); - - /* - * XXX This should be its own privilage - */ - if (drv_priv(credp) != 0) - return (EPERM); - - if (!(flag & FREAD && flag & FWRITE)) - return (EINVAL); - - if (getminor(*devp) != kvm_base_minor) - return (ENXIO); - - minor = (minor_t)(uintptr_t)vmem_alloc(kvm_minor, - 1, VM_BESTFIT | VM_SLEEP); - - if (ddi_soft_state_zalloc(kvm_state, minor) != 0) { - vmem_free(kvm_minor, (void *)(uintptr_t)minor, 1); - return (ENXIO); - } - - *devp = makedevice(getmajor(*devp), minor); - ksp = ddi_get_soft_state(kvm_state, minor); - VERIFY(ksp != NULL); - - return (0); + kvm_arch_vcpu_uninit(vcpu); + ddi_umem_free(vcpu->cookie); } -/*ARGSUSED*/ -static int -kvm_close(dev_t dev, int flag, int otyp, cred_t *cred) +/* + * Note if we want to implement the kvm mmu notifier components than the + * following two functions will need to be readdressed. + */ +static int kvm_init_mmu_notifier(struct kvm *kvm) { - kvm_devstate_t *ksp; - minor_t minor = getminor(dev); - kvm_t *kvmp; - - VERIFY(getminor(dev) != kvm_base_minor); - ksp = ddi_get_soft_state(kvm_state, minor); - - if ((kvmp = ksp->kds_kvmp) != NULL) { - mutex_enter(&kvm_lock); - - if (kvmp->kvm_clones > 0) { - kvmp->kvm_clones--; - mutex_exit(&kvm_lock); - } else { - mutex_exit(&kvm_lock); - kvm_destroy_vm(kvmp); - } - } - - ddi_soft_state_free(kvm_state, minor); - vmem_free(kvm_minor, (void *)(uintptr_t)minor, 1); - return (0); } static void -hardware_enable(void *junk) +kvm_fini_mmu_notifier(struct kvm *kvm) { - int cpu; - int r; - - cpu = curthread->t_cpu->cpu_id; - - if (CPU_IN_SET(cpus_hardware_enabled, cpu)) - return; - - CPUSET_ADD(cpus_hardware_enabled, cpu); - - r = kvm_arch_hardware_enable(NULL); - - if (r) { - CPUSET_DEL(cpus_hardware_enabled, cpu); - atomic_inc_32(&hardware_enable_failed); - cmn_err(CE_WARN, "kvm: enabling virtualization CPU%d failed\n", - cpu); - } } static void -hardware_disable(void *junk) +kvm_destroy_vm(struct kvm *kvmp) { - int cpu = curthread->t_cpu->cpu_id; + int ii; + void *cookie; - if (!CPU_IN_SET(cpus_hardware_enabled, cpu)) + if (kvmp == NULL) return; - CPUSET_DEL(cpus_hardware_enabled, cpu); - kvm_arch_hardware_disable(NULL); -} - -/* - * The following needs to run on each cpu. Currently, - * wait is always 1, so we use the kvm_xcall() routine which - * calls xc_sync. Later, if needed, the implementation can be - * changed to use xc_call or xc_call_nowait. 
- */ -#define on_each_cpu(func, info, wait) \ - /*CSTYLED*/ \ - ({ \ - kvm_xcall(KVM_CPUALL, func, info); \ - 0; \ - }) - -static void -hardware_disable_all_nolock(void) -{ - kvm_usage_count--; - if (!kvm_usage_count) - on_each_cpu(hardware_disable, NULL, 1); -} - -static void -hardware_disable_all(void) -{ - mutex_enter(&kvm_lock); - hardware_disable_all_nolock(); - mutex_exit(&kvm_lock); -} - -static int -hardware_enable_all(void) -{ - int r = 0; - - mutex_enter(&kvm_lock); - - kvm_usage_count++; - if (kvm_usage_count == 1) { - hardware_enable_failed = 0; - on_each_cpu(hardware_enable, NULL, 1); - - if (hardware_enable_failed) { - hardware_disable_all_nolock(); - r = EBUSY; - } - } + if (kvmp->kvm_kstat != NULL) + kstat_delete(kvmp->kvm_kstat); - mutex_exit(&kvm_lock); + kvm_arch_destroy_vm_comps(kvmp); - return (r); -} +#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET + kvm_coalesced_mmio_free(kvmp); +#endif -/* - * Note if we want to implement the kvm mmu notifier components than the - * following two functions will need to be readdressed. - */ -static int kvm_init_mmu_notifier(struct kvm *kvm) -{ - return (0); -} + list_remove(&vm_list, kvmp); + /* + * XXX: The fact that we're cleaning these up here means that we aren't + * properly cleaning them up somewhere else. + */ + cookie = NULL; + while (avl_destroy_nodes(&kvmp->kvm_avlmp, &cookie) != NULL) + continue; + avl_destroy(&kvmp->kvm_avlmp); + mutex_destroy(&kvmp->kvm_avllock); + mutex_destroy(&kvmp->slots_lock); + mutex_destroy(&kvmp->irq_lock); + mutex_destroy(&kvmp->lock); + mutex_destroy(&kvmp->requests_lock); + mutex_destroy(&kvmp->mmu_lock); + kvmp->mm = NULL; + kvm_fini_mmu_notifier(kvmp); -static void -kvm_fini_mmu_notifier(struct kvm *kvm) -{ -} + for (ii = 0; ii < KVM_NR_BUSES; ii++) + kmem_free(kvmp->buses[ii], sizeof (struct kvm_io_bus)); -void -kvm_arch_flush_shadow(struct kvm *kvm) -{ - kvm_mmu_zap_all(kvm); -#ifdef XXX - kvm_reload_remote_mmus(kvm); -#else - XXX_KVM_PROBE; + rw_destroy(&kvmp->kvm_rwlock); +#ifdef CONFIG_HAVE_KVM_IRQCHIP + /* + * These lists are contained by the pic. However, the pic isn't + */ + list_destroy(&kvmp->irq_ack_notifier_list); + list_destroy(&kvmp->mask_notifier_list); #endif + kvm_arch_destroy_vm(kvmp); } static struct kvm * @@ -997,143 +436,10 @@ kvm_create_vm(void) return (kvmp); } -static void -kvm_destroy_vm(struct kvm *kvmp) -{ - int ii; - void *cookie; - - if (kvmp == NULL) - return; - - if (kvmp->kvm_kstat != NULL) - kstat_delete(kvmp->kvm_kstat); - - kvm_arch_destroy_vm_comps(kvmp); - -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET - kvm_coalesced_mmio_free(kvmp); -#endif - - list_remove(&vm_list, kvmp); - /* - * XXX: The fact that we're cleaning these up here means that we aren't - * properly cleaning them up somewhere else. - */ - cookie = NULL; - while (avl_destroy_nodes(&kvmp->kvm_avlmp, &cookie) != NULL) - continue; - avl_destroy(&kvmp->kvm_avlmp); - mutex_destroy(&kvmp->kvm_avllock); - mutex_destroy(&kvmp->slots_lock); - mutex_destroy(&kvmp->irq_lock); - mutex_destroy(&kvmp->lock); - mutex_destroy(&kvmp->requests_lock); - mutex_destroy(&kvmp->mmu_lock); - kvmp->mm = NULL; - kvm_fini_mmu_notifier(kvmp); - - for (ii = 0; ii < KVM_NR_BUSES; ii++) - kmem_free(kvmp->buses[ii], sizeof (struct kvm_io_bus)); - - rw_destroy(&kvmp->kvm_rwlock); -#ifdef CONFIG_HAVE_KVM_IRQCHIP - /* - * These lists are contained by the pic. 
However, the pic isn't - */ - list_destroy(&kvmp->irq_ack_notifier_list); - list_destroy(&kvmp->mask_notifier_list); -#endif - kvm_arch_destroy_vm(kvmp); -} - -static int -kvm_dev_ioctl_create_vm(kvm_devstate_t *ksp, intptr_t arg, int *rv) -{ - if (ksp->kds_kvmp != NULL) - return (EINVAL); - - ksp->kds_kvmp = kvm_create_vm(); - - if (ksp->kds_kvmp == NULL) { - cmn_err(CE_WARN, "Could not create new vm\n"); - return (EIO); - } - *rv = ksp->kds_kvmp->kvmid; - return (DDI_SUCCESS); -} - -static long -kvm_dev_ioctl_check_extension_generic(long arg, int *rv) -{ - switch (arg) { - case KVM_CAP_USER_MEMORY: - case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: - case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: -#ifdef CONFIG_KVM_APIC_ARCHITECTURE - case KVM_CAP_SET_BOOT_CPU_ID: -#endif - case KVM_CAP_INTERNAL_ERROR_DATA: - *rv = 1; - return (DDI_SUCCESS); -#ifdef CONFIG_HAVE_KVM_IRQCHIP - case KVM_CAP_IRQ_ROUTING: - *rv = KVM_MAX_IRQ_ROUTES; - return (DDI_SUCCESS); -#endif - default: - break; - } - return (kvm_dev_ioctl_check_extension(arg, rv)); -} - - -void -kvm_arch_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, struct kvm_memory_slot old, - int user_alloc) -{ - - int npages = mem->memory_size >> PAGESHIFT; - - if (!user_alloc && !old.user_alloc && old.rmap && !npages) { - int ret = 0; - -#ifdef XXX - down_write(¤t->mm->mmap_sem); - ret = munmap(old.userspace_addr, - old.npages * PAGESIZE); - up_write(¤t->mm->mmap_sem); -#else - XXX_KVM_PROBE; - /* see comment in kvm_arch_prepare_memory_region */ - /* - * XXX this needs to be here, but I'm getting kernel heap - * corruption panics with someone writing to a buffer after it - * is freed - */ - kmem_free((caddr_t)old.userspace_addr, old.npages * PAGESIZE); -#endif - if (ret < 0) { - cmn_err(CE_WARN, "kvm_vm_ioctl_set_memory_region: " - "failed to munmap memory\n"); - } - } - - mutex_enter(&kvm->mmu_lock); - if (!kvm->arch.n_requested_mmu_pages) { - unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); - kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); - } - - kvm_mmu_slot_remove_write_access(kvm, mem->slot); - mutex_exit(&kvm->mmu_lock); -} - /* * Free any memory in @free but not in @dont. */ -void +static void kvm_free_physmem_slot(struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { @@ -1174,6 +480,13 @@ kvm_free_physmem(struct kvm *kvm) kmem_free(kvm->memslots, sizeof (struct kvm_memslots)); } + +void +kvm_get_kvm(struct kvm *kvm) +{ + atomic_inc_32(&kvm->users_count); +} + /* * Allocate some memory and give it an address in the guest physical address * space. @@ -1400,763 +713,293 @@ kvm_set_memory_region(kvm_t *kvm, return (r); } - -static int -kvm_vm_ioctl_set_tss_addr(struct kvm *kvmp, caddr_t addr) +int +kvm_vm_ioctl_set_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, int user_alloc) { - /* - * XXX later, if adding other arch beside x86, need to do something - * else here - */ - return (kvm_x86_ops->set_tss_addr(kvmp, addr)); + if (mem->slot >= KVM_MEMORY_SLOTS) + return (EINVAL); + + return (kvm_set_memory_region(kvm, mem, user_alloc)); } -static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) +void +kvm_disable_largepages(void) { - /* ecx is often an input as well as an output. 
*/ - __asm__ volatile("cpuid" - : "=a" (*eax), - "=b" (*ebx), - "=c" (*ecx), - "=d" (*edx) - : "0" (*eax), "2" (*ecx)); + largepages_enabled = 0; } -#define __cpuid native_cpuid - -/* Some CPUID calls want 'count' to be placed in ecx */ -static inline void -cpuid_count(unsigned int op, int count, unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) +int +is_error_pfn(pfn_t pfn) { - *eax = op; - *ecx = count; - __cpuid(eax, ebx, ecx, edx); + return (pfn == bad_pfn); } -static void -do_cpuid_1_ent(kvm_cpuid_entry2_t *entry, uint32_t function, uint32_t index) +static unsigned long +bad_hva(void) { - entry->function = function; - entry->index = index; - cpuid_count(entry->function, entry->index, - &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); - entry->flags = 0; + return (PAGEOFFSET); } -static int -is_efer_nx(void) +int +kvm_is_error_hva(unsigned long addr) { - unsigned long long efer = 0; - - rdmsrl_safe(MSR_EFER, &efer); - return (efer & EFER_NX); + return (addr == bad_hva()); } -#define F(x) bit(X86_FEATURE_##x) - -static void -do_cpuid_ent(struct kvm_cpuid_entry2 *entry, uint32_t function, - uint32_t index, int *nent, int maxnent) +struct kvm_memory_slot * +gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn) { - unsigned int ddic; - unsigned f_nx = is_efer_nx() ? F(NX) : 0; -#ifdef CONFIG_X86_64 - unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL) - ? F(GBPAGES) : 0; - unsigned f_lm = F(LM); + int i; +#ifdef XXX_KVM_DECLARATION + struct kvm_memslots *slots = rcu_dereference(kvm->memslots); #else - unsigned f_gbpages = 0; - unsigned f_lm = 0; + struct kvm_memslots *slots = kvm->memslots; #endif - unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; - - /* cpuid 1.edx */ - const uint32_t kvm_supported_word0_x86_features = - F(FPU) | F(VME) | F(DE) | F(PSE) | - F(TSC) | F(MSR) | F(PAE) | F(MCE) | - F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | - F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | - F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) | - 0 /* Reserved, DS, ACPI */ | F(MMX) | - F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | - 0 /* HTT, TM, Reserved, PBE */; - /* cpuid 0x80000001.edx */ - const uint32_t kvm_supported_word1_x86_features = - F(FPU) | F(VME) | F(DE) | F(PSE) | - F(TSC) | F(MSR) | F(PAE) | F(MCE) | - F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | - F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | - F(PAT) | F(PSE36) | 0 /* Reserved */ | - f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | - F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp | - 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); - /* cpuid 1.ecx */ - const uint32_t kvm_supported_word4_x86_features = - F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ | - 0 /* DS-CPL, VMX, SMX, EST */ | - 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | - 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | - 0 /* Reserved, DCA */ | F(XMM4_1) | - F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | - 0 /* Reserved, XSAVE, OSXSAVE */; - /* cpuid 0x80000001.ecx */ - const uint32_t kvm_supported_word6_x86_features = - F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | - F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | - F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) | - 0 /* SKINIT */ | 0 /* WDT */; - - /* all calls to cpuid_count() should be made on the same cpu */ - /* XXX - right now, system panics at ddi_exit_critical() */ - /* XXX - to run everything on same cpu, bind qemu at startup */ - kpreempt_disable(); - - do_cpuid_1_ent(entry, function, index); - 
++*nent; + for (i = 0; i < slots->nmemslots; ++i) { + struct kvm_memory_slot *memslot = &slots->memslots[i]; - switch (function) { - case 0: - entry->eax = min(entry->eax, (uint32_t)0xb); - break; - case 1: - entry->edx &= kvm_supported_word0_x86_features; - entry->ecx &= kvm_supported_word4_x86_features; - /* - * we support x2apic emulation even if host does not support - * it since we emulate x2apic in software - */ - entry->ecx |= F(X2APIC); - break; - /* - * function 2 entries are STATEFUL. That is, repeated cpuid commands - * may return different values. This forces us to get_cpu() before - * issuing the first command, and also to emulate this annoying behavior - * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT - */ - case 2: { - int t, times = entry->eax & 0xff; - - entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; - entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; - for (t = 1; t < times && *nent < maxnent; ++t) { - do_cpuid_1_ent(&entry[t], function, 0); - entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; - ++*nent; - } - break; - } - /* function 4 and 0xb have additional index. */ - case 4: { - int i, cache_type; - - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - /* read more entries until cache_type is zero */ - for (i = 1; *nent < maxnent; ++i) { - cache_type = entry[i - 1].eax & 0x1f; - if (!cache_type) - break; - do_cpuid_1_ent(&entry[i], function, i); - entry[i].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - ++*nent; - } - break; - } - case 0xb: { - int i, level_type; - - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - /* read more entries until level_type is zero */ - for (i = 1; *nent < maxnent; ++i) { - level_type = entry[i - 1].ecx & 0xff00; - if (!level_type) - break; - do_cpuid_1_ent(&entry[i], function, i); - entry[i].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - ++*nent; - } - break; - } - case 0x80000000: - entry->eax = min(entry->eax, 0x8000001a); - break; - case 0x80000001: - entry->edx &= kvm_supported_word1_x86_features; - entry->ecx &= kvm_supported_word6_x86_features; - break; + if (gfn >= memslot->base_gfn && + gfn < memslot->base_gfn + memslot->npages) + return (memslot); } - /* - * XXX - see comment above for ddi_enter_critical() - * - * ddi_exit_critical(ddic); - */ - kpreempt_enable(); + return (NULL); } -#undef F - -static int -kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 *entries) +struct kvm_memory_slot * +gfn_to_memslot(struct kvm *kvm, gfn_t gfn) { - struct kvm_cpuid_entry2 *cpuid_entries; - int limit, nent = 0, r = E2BIG; - uint32_t func; - int allocsize = 0; - - if (cpuid->nent < 1) - goto out; - if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) - cpuid->nent = KVM_MAX_CPUID_ENTRIES; - r = ENOMEM; - allocsize = sizeof (struct kvm_cpuid_entry2) * cpuid->nent; - cpuid_entries = kmem_zalloc(allocsize, KM_SLEEP); - - do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent); - limit = cpuid_entries[0].eax; - for (func = 1; func <= limit && nent < cpuid->nent; ++func) - do_cpuid_ent(&cpuid_entries[nent], func, 0, &nent, cpuid->nent); - - r = E2BIG; - if (nent >= cpuid->nent) - goto out_free; - - do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent); - limit = cpuid_entries[nent - 1].eax; - for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) - do_cpuid_ent(&cpuid_entries[nent], func, 0, &nent, cpuid->nent); - r = E2BIG; - if (nent >= cpuid->nent) - goto out_free; - - r = EFAULT; - if (copyout(cpuid_entries, entries, - nent * sizeof (kvm_cpuid_entry2_t))) - goto out_free; - - 
cpuid->nent = nent; - r = 0; - -out_free: - kmem_free(cpuid_entries, allocsize); -out: - return (r); + gfn = unalias_gfn(kvm, gfn); + return (gfn_to_memslot_unaliased(kvm, gfn)); } -static inline void -__vmwrite(unsigned long field, unsigned long value) +int +kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) { - uint8_t err = 0; - - /*CSTYLED*/ - __asm__ volatile ( ASM_VMX_VMWRITE_RAX_RDX "\n\t" "setna %0" - /* XXX: CF==1 or ZF==1 --> crash (ud2) */ - /* "ja 1f ; ud2 ; 1:\n" */ - : "=q"(err) : "a" (value), "d" (field) - : "cc", "memory"); - - /* XXX the following should be ifdef debug... */ - if (err) { -#ifdef XXX - vmcs_read32(VM_INSTRUCTION_ERROR); - cmn_err(CE_WARN, "_vmwrite: error writing %lx to %lx: " - "error number = %d\n", value, field, err & 0xff); + int i; +#ifdef XXX_KVM_DECLARATION + struct kvm_memslots *slots = rcu_dereference(kvm->memslots); #else - XXX_KVM_PROBE; + struct kvm_memslots *slots = kvm->memslots; #endif - } -} - -void -kvm_migrate_timers(struct kvm_vcpu *vcpu) -{ - set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests); -} - -static int -kvm_request_guest_time_update(struct kvm_vcpu *v) -{ - struct kvm_vcpu_arch *vcpu = &v->arch; - if (!vcpu->time_page) - return (0); + gfn = unalias_gfn_instantiation(kvm, gfn); - set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests); + for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { + struct kvm_memory_slot *memslot = &slots->memslots[i]; - return (1); -} + if (memslot->flags & KVM_MEMSLOT_INVALID) + continue; -void -kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) -{ - kvm_x86_ops->vcpu_load(vcpu, cpu); -#ifdef XXX - if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) { - unsigned long khz = cpufreq_quick_get(cpu); - if (!khz) - khz = tsc_khz; - per_cpu(cpu_tsc_khz, cpu) = khz; + if (gfn >= memslot->base_gfn && + gfn < memslot->base_gfn + memslot->npages) { + return (1); + } } -#else - XXX_KVM_PROBE; -#endif - kvm_request_guest_time_update(vcpu); -} - -void -kvm_put_guest_fpu(struct kvm_vcpu *vcpu) -{ - if (!vcpu->guest_fpu_loaded) - return; - - vcpu->guest_fpu_loaded = 0; - kvm_fx_save(&vcpu->arch.guest_fx_image); - kvm_fx_restore(&vcpu->arch.host_fx_image); - KVM_VCPU_KSTAT_INC(vcpu, kvmvs_fpu_reload); - set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests); - KVM_TRACE1(fpu, int, 0); -} - -/* straight from xen code... */ -void -ldt_load(void) -{ - *((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = curproc->p_ldt_desc; - wr_ldtr(ULDT_SEL); -} - -inline int -is_pae(struct kvm_vcpu *vcpu) -{ - return (kvm_read_cr4_bits(vcpu, X86_CR4_PAE)); -} - -void -kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) -{ - kvm_put_guest_fpu(vcpu); - - kvm_x86_ops->vcpu_put(vcpu); -} - -void -kvm_user_return_notifier_register(struct kvm_vcpu *vcpu, - struct kvm_user_return_notifier *urn) -{ - vcpu->urn = urn; -} - -void -kvm_user_return_notifier_unregister(struct kvm_vcpu *vcpu, - struct kvm_user_return_notifier *urn) -{ - vcpu->urn = NULL; -} - -void -kvm_fire_urn(struct kvm_vcpu *vcpu) -{ - if (vcpu->urn) - vcpu->urn->on_user_return(vcpu, vcpu->urn); -} - -/* - * Called when we've been asked to save our context. i.e. we're being swapped - * out. - */ -void -kvm_ctx_save(void *arg) -{ - struct kvm_vcpu *vcpu = arg; - kvm_arch_vcpu_put(vcpu); - kvm_fire_urn(vcpu); -} - -/* - * Called when we're being asked to restore our context. i.e. we're returning - * from being swapped out. 
- */ -void -kvm_ctx_restore(void *arg) -{ - int cpu; - cpu = CPU->cpu_seqid; - struct kvm_vcpu *vcpu = arg; - kvm_arch_vcpu_load(vcpu, cpu); + return (0); } -/* - * Switches to specified vcpu, until a matching vcpu_put() - */ -void -vcpu_load(struct kvm_vcpu *vcpu) +unsigned long +kvm_host_page_size(struct kvm *kvm, gfn_t gfn) { - int cpu; + struct vm_area_struct *vma; + unsigned long addr, size; - mutex_enter(&vcpu->mutex); - kpreempt_disable(); - cpu = CPU->cpu_seqid; - installctx(curthread, vcpu, kvm_ctx_save, kvm_ctx_restore, NULL, - NULL, NULL, NULL); - kvm_arch_vcpu_load(vcpu, cpu); - kpreempt_enable(); -} + size = PAGESIZE; -void -vcpu_put(struct kvm_vcpu *vcpu) -{ - kpreempt_disable(); - kvm_arch_vcpu_put(vcpu); - kvm_fire_urn(vcpu); - removectx(curthread, vcpu, kvm_ctx_save, kvm_ctx_restore, NULL, - NULL, NULL, NULL); - kpreempt_enable(); - mutex_exit(&vcpu->mutex); -} + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return (PAGESIZE); -/* - * find an entry with matching function, matching index (if needed), and that - * should be read next (if it's stateful) - */ -static int -is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, - uint32_t function, uint32_t index) -{ - if (e->function != function) - return (0); - if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) - return (0); - if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && - !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) - return (0); - return (1); -} +#ifdef XXX + down_read(¤t->mm->mmap_sem); + vma = find_vma(current->mm, addr); + if (!vma) + goto out; -static int -move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) -{ - struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; - int j, nent = vcpu->arch.cpuid_nent; - - e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; - /* when no next entry is found, the current entry[i] is reselected */ - for (j = i + 1; ; j = (j + 1) % nent) { - struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; - if (ej->function == e->function) { - ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; - return (j); - } - } + size = vma_kernel_pagesize(vma); - return (0); /* silence gcc, even though control never reaches here */ +out: + up_read(¤t->mm->mmap_sem); + return (size); +#else + XXX_KVM_PROBE; + return (PAGESIZE); +#endif } -struct kvm_cpuid_entry2 * -kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, uint32_t function, uint32_t index) +int +memslot_id(struct kvm *kvm, gfn_t gfn) { int i; - struct kvm_cpuid_entry2 *best = NULL; +#ifdef XXX_KVM_DECLARATION + struct kvm_memslots *slots = rcu_dereference(kvm->memslots); +#else + struct kvm_memslots *slots = kvm->memslots; +#endif + struct kvm_memory_slot *memslot = NULL; - for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { - struct kvm_cpuid_entry2 *e; + gfn = unalias_gfn(kvm, gfn); + for (i = 0; i < slots->nmemslots; ++i) { + memslot = &slots->memslots[i]; - e = &vcpu->arch.cpuid_entries[i]; - if (is_matching_cpuid_entry(e, function, index)) { - if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) - move_to_next_stateful_cpuid_entry(vcpu, i); - best = e; + if (gfn >= memslot->base_gfn && + gfn < memslot->base_gfn + memslot->npages) break; - } - /* - * Both basic or both extended? 
- */ - if (((e->function ^ function) & 0x80000000) == 0) - if (!best || e->function > best->function) - best = e; } - return (best); + return (memslot - slots->memslots); } -static int -kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid) +unsigned long +gfn_to_hva(struct kvm *kvm, gfn_t gfn) { - if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) - return (E2BIG); - - bcopy(cpuid->entries, vcpu->arch.cpuid_entries, - cpuid->nent * sizeof (struct kvm_cpuid_entry2)); + struct kvm_memory_slot *slot; - vcpu_load(vcpu); - vcpu->arch.cpuid_nent = cpuid->nent; - kvm_apic_set_version(vcpu); - kvm_x86_ops->cpuid_update(vcpu); - vcpu_put(vcpu); + gfn = unalias_gfn_instantiation(kvm, gfn); + slot = gfn_to_memslot_unaliased(kvm, gfn); + if (!slot || slot->flags & KVM_MEMSLOT_INVALID) + return (bad_hva()); - return (0); + return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGESIZE); } -static int -kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid) +static pfn_t +hva_to_pfn(struct kvm *kvm, unsigned long addr) { - int r; - struct kvm_cpuid_entry2 *entries = cpuid->entries; - - cpuid->nent = vcpu->arch.cpuid_nent; - - if (cpuid->nent < vcpu->arch.cpuid_nent) - return (E2BIG); + page_t page[1]; + int npages; + pfn_t pfn; + proc_t *procp = ttoproc(curthread); + struct as *as = procp->p_as; - bcopy(&vcpu->arch.cpuid_entries, cpuid->entries, - vcpu->arch.cpuid_nent * sizeof (struct kvm_cpuid_entry2)); +#ifdef XXX - return (0); -} + npages = get_user_pages_fast(addr, 1, 1, page); -unsigned long -kvm_get_rflags(struct kvm_vcpu *vcpu) -{ - unsigned long rflags; + if (unlikely(npages != 1)) { + struct vm_area_struct *vma; - rflags = kvm_x86_ops->get_rflags(vcpu); + down_read(¤t->mm->mmap_sem); + vma = find_vma(current->mm, addr); - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) - rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF); + if (vma == NULL || addr < vma->vm_start || + !(vma->vm_flags & VM_PFNMAP)) { + up_read(¤t->mm->mmap_sem); + get_page(bad_page); + return (page_to_pfn(bad_page)); + } - return (rflags); + pfn = ((addr - vma->vm_start) >> PAGESHIFT) + vma->vm_pgoff; + up_read(¤t->mm->mmap_sem); + BUG_ON(!kvm_is_mmio_pfn(pfn)); + } else + pfn = page_to_pfn(page[0]); +#else + XXX_KVM_PROBE; + if (addr < kernelbase) + pfn = hat_getpfnum(as->a_hat, (caddr_t)addr); + else + pfn = hat_getpfnum(kas.a_hat, (caddr_t)addr); +#endif + return (pfn); } -int -kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +pfn_t +gfn_to_pfn(struct kvm *kvm, gfn_t gfn) { - vcpu_load(vcpu); - - regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); - regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); - regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); - regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX); - regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI); - regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI); - regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); - regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP); -#ifdef CONFIG_X86_64 - regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8); - regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9); - regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10); - regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11); - regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12); - regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13); - regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14); - regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15); -#endif + unsigned long addr; + pfn_t pfn; - regs->rip = kvm_rip_read(vcpu); - regs->rflags = 
kvm_get_rflags(vcpu); + addr = gfn_to_hva(kvm, gfn); - vcpu_put(vcpu); + if (kvm_is_error_hva(addr)) { + get_page(bad_page); + return (page_to_pfn(bad_page)); + } - return (0); -} + pfn = hva_to_pfn(kvm, addr); -void -kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) -{ - kvm_x86_ops->get_segment(vcpu, var, seg); + return (pfn); } -static uint16_t -get_segment_selector(struct kvm_vcpu *vcpu, int seg) +page_t * +gfn_to_page(struct kvm *kvm, gfn_t gfn) { - struct kvm_segment kvm_seg; + pfn_t pfn = gfn_to_pfn(kvm, gfn); - kvm_get_segment(vcpu, &kvm_seg, seg); + if (!kvm_is_mmio_pfn(pfn)) + return (pfn_to_page(pfn)); - return (kvm_seg.selector); + get_page(bad_page); + return (bad_page); } void -kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) -{ - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && - vcpu->arch.singlestep_cs == get_segment_selector(vcpu, - VCPU_SREG_CS) && vcpu->arch.singlestep_rip == kvm_rip_read(vcpu)) { - rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF; - } - - kvm_x86_ops->set_rflags(vcpu, rflags); -} - -int -kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +kvm_release_pfn_clean(pfn_t pfn) { - vcpu_load(vcpu); - - kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); - kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); - kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); - kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx); - kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi); - kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi); - kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp); - kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp); -#ifdef CONFIG_X86_64 - kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8); - kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9); - kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10); - kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11); - kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12); - kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); - kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); - kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); +#ifdef XXX + if (!kvm_is_mmio_pfn(pfn)) + put_page(pfn_to_page(pfn)); +#else + XXX_KVM_PROBE; #endif - - kvm_rip_write(vcpu, regs->rip); - kvm_set_rflags(vcpu, regs->rflags); - - vcpu->arch.exception.pending = 0; - - vcpu_put(vcpu); - - return (0); -} - -int -kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) -{ - struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; - - vcpu_load(vcpu); - - memcpy(fpu->fpr, fxsave->st_space, 128); - fpu->fcw = fxsave->cwd; - fpu->fsw = fxsave->swd; - fpu->ftwx = fxsave->twd; - fpu->last_opcode = fxsave->fop; - fpu->last_ip = fxsave->rip; - fpu->last_dp = fxsave->rdp; - memcpy(fpu->xmm, fxsave->xmm_space, sizeof (fxsave->xmm_space)); - - vcpu_put(vcpu); - - return (0); } -int -kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +void +kvm_release_page_dirty(page_t *page) { - struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; - - vcpu_load(vcpu); - - memcpy(fxsave->st_space, fpu->fpr, 128); - fxsave->cwd = fpu->fcw; - fxsave->swd = fpu->fsw; - fxsave->twd = fpu->ftwx; - fxsave->fop = fpu->last_opcode; - fxsave->rip = fpu->last_ip; - fxsave->rdp = fpu->last_dp; - memcpy(fxsave->xmm_space, fpu->xmm, sizeof (fxsave->xmm_space)); - - vcpu_put(vcpu); - - return (0); + kvm_release_pfn_dirty(page_to_pfn(page)); } -unsigned long -kvm_get_cr8(struct kvm_vcpu *vcpu) +void +kvm_release_pfn_dirty(pfn_t pfn) { - if (irqchip_in_kernel(vcpu->kvm)) { - 
return (kvm_lapic_get_cr8(vcpu)); - } else { - return (vcpu->arch.cr8); - } + kvm_set_pfn_dirty(pfn); + kvm_release_pfn_clean(pfn); } -int -kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +void +kvm_set_pfn_dirty(pfn_t pfn) { - struct descriptor_table dt; - - vcpu_load(vcpu); - - kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); - kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); - kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); - kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); - kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); - kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); - - kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); - kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); - - kvm_x86_ops->get_idt(vcpu, &dt); - sregs->idt.limit = dt.limit; - sregs->idt.base = dt.base; - kvm_x86_ops->get_gdt(vcpu, &dt); - sregs->gdt.limit = dt.limit; - sregs->gdt.base = dt.base; - - sregs->cr0 = kvm_read_cr0(vcpu); - sregs->cr2 = vcpu->arch.cr2; - sregs->cr3 = vcpu->arch.cr3; - sregs->cr4 = kvm_read_cr4(vcpu); - sregs->cr8 = kvm_get_cr8(vcpu); - sregs->efer = vcpu->arch.efer; - sregs->apic_base = kvm_get_apic_base(vcpu); - - memset(sregs->interrupt_bitmap, 0, sizeof (sregs->interrupt_bitmap)); - - if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft) { - set_bit(vcpu->arch.interrupt.nr, - (unsigned long *)sregs->interrupt_bitmap); +#ifdef XXX + if (!kvm_is_mmio_pfn(pfn)) { + struct page *page = pfn_to_page(pfn); + if (!PageReserved(page)) + SetPageDirty(page); /* XXX - not defined in linux?! */ } - - vcpu_put(vcpu); - - return (0); -} - -static void kvm_set_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg) -{ - kvm_x86_ops->set_segment(vcpu, var, seg); -} - - -inline void -kvm_queue_interrupt(struct kvm_vcpu *vcpu, uint8_t vector, int soft) -{ - vcpu->arch.interrupt.pending = 1; - vcpu->arch.interrupt.soft = soft; - vcpu->arch.interrupt.nr = vector; +#else + XXX_KVM_PROBE; +#endif } -inline unsigned long -bad_hva(void) +void +kvm_set_pfn_accessed(struct kvm *kvm, pfn_t pfn) { - return (PAGEOFFSET); +#ifdef XXX + if (!kvm_is_mmio_pfn(pfn)) + mark_page_accessed(pfn_to_page(pfn)); +#else + XXX_KVM_PROBE; +#endif } -unsigned long -gfn_to_hva(struct kvm *kvm, gfn_t gfn) +void +kvm_get_pfn(struct kvm_vcpu *vcpu, pfn_t pfn) { - struct kvm_memory_slot *slot; - - gfn = unalias_gfn_instantiation(kvm, gfn); - slot = gfn_to_memslot_unaliased(kvm, gfn); - if (!slot || slot->flags & KVM_MEMSLOT_INVALID) - return (bad_hva()); - - return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGESIZE); + if (!kvm_is_mmio_pfn(pfn)) + get_page(pfn_to_page(pfn)); } -int -kvm_is_error_hva(unsigned long addr) +static int +next_segment(unsigned long len, int offset) { - return (addr == bad_hva()); + if (len > PAGESIZE - offset) + return (PAGESIZE - offset); + else + return (len); } int @@ -2182,258 +1025,55 @@ kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, int len) return (0); } -/* - * Load the pae pdptrs. Return true is they are all valid. 
- */ int -load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) +kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) { - gfn_t pdpt_gfn = cr3 >> PAGESHIFT; - unsigned offset = ((cr3 & (PAGESIZE-1)) >> 5) << 2; - int i; + gfn_t gfn = gpa >> PAGESHIFT; + int seg; + int offset = offset_in_page(gpa); int ret; - uint64_t pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; - - ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, - pdpte, offset * sizeof (uint64_t), sizeof (pdpte)); - - if (ret < 0) { - ret = 0; - goto out; - } - - for (i = 0; i < ARRAY_SIZE(pdpte); i++) { - if (is_present_gpte(pdpte[i]) && - (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) { - ret = 0; - goto out; - } - } - ret = 1; - - memcpy(vcpu->arch.pdptrs, pdpte, sizeof (vcpu->arch.pdptrs)); - __set_bit(VCPU_EXREG_PDPTR, (unsigned long *)&vcpu->arch.regs_avail); - __set_bit(VCPU_EXREG_PDPTR, (unsigned long *)&vcpu->arch.regs_dirty); -out: - return (ret); -} - -static void -update_cr8_intercept(struct kvm_vcpu *vcpu) -{ - int max_irr, tpr; - - if (!kvm_x86_ops->update_cr8_intercept) - return; - - if (!vcpu->arch.apic) - return; - if (!vcpu->arch.apic->vapic_addr) - max_irr = kvm_lapic_find_highest_irr(vcpu); - else - max_irr = -1; - - if (max_irr != -1) - max_irr >>= 4; - tpr = kvm_lapic_get_cr8(vcpu); - - kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); -} - -inline int -is_protmode(struct kvm_vcpu *vcpu) -{ - return (kvm_read_cr0_bits(vcpu, X86_CR0_PE)); -} - -int -kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) -{ - return (vcpu->kvm->bsp_vcpu_id == vcpu->vcpu_id); -} - -unsigned long -find_next_bit(const unsigned long *addr, - unsigned long size, unsigned long offset) -{ - const unsigned long *p = addr + (offset/64); - unsigned long result = offset & ~(64-1); - unsigned long tmp; - - if (offset >= size) - return (size); - - size -= result; - offset %= 64; - - if (offset) { - tmp = *(p++); - tmp &= (~0UL << offset); - if (size < 64) - goto found_first; - if (tmp) - goto found_middle; - size -= 64; - result += 64; - } - while (size & ~(64-1)) { - if ((tmp = *(p++))) - goto found_middle; - result += 64; - size -= 64; - } - - if (!size) - return (result); - tmp = *p; - -found_first: - tmp &= (~0UL >> (64 - size)); - if (tmp == 0UL) /* Are any bits set? */ - return (result + size); /* Nope. 
*/ -found_middle: - return (result + __ffs(tmp)); -} - -int -kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) -{ - int mmu_reset_needed = 0; - int pending_vec, max_bits; - struct descriptor_table dt; - - vcpu_load(vcpu); - - dt.limit = sregs->idt.limit; - dt.base = sregs->idt.base; - kvm_x86_ops->set_idt(vcpu, &dt); - dt.limit = sregs->gdt.limit; - dt.base = sregs->gdt.base; - kvm_x86_ops->set_gdt(vcpu, &dt); - - vcpu->arch.cr2 = sregs->cr2; - mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; - vcpu->arch.cr3 = sregs->cr3; - - kvm_set_cr8(vcpu, sregs->cr8); - - mmu_reset_needed |= vcpu->arch.efer != sregs->efer; - kvm_x86_ops->set_efer(vcpu, sregs->efer); - kvm_set_apic_base(vcpu, sregs->apic_base); - - mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; - kvm_x86_ops->set_cr0(vcpu, sregs->cr0); - vcpu->arch.cr0 = sregs->cr0; - - mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; - kvm_x86_ops->set_cr4(vcpu, sregs->cr4); - - if (!is_long_mode(vcpu) && is_pae(vcpu)) { - load_pdptrs(vcpu, vcpu->arch.cr3); - mmu_reset_needed = 1; - } - - if (mmu_reset_needed) - kvm_mmu_reset_context(vcpu); - - max_bits = (sizeof (sregs->interrupt_bitmap)) << 3; - pending_vec = - find_next_bit((const unsigned long *)sregs->interrupt_bitmap, - max_bits, 0); + uintptr_t dp = (uintptr_t)data; - if (pending_vec < max_bits) { - kvm_queue_interrupt(vcpu, pending_vec, 0); - if (irqchip_in_kernel(vcpu->kvm)) - kvm_pic_clear_isr_ack(vcpu->kvm); + while ((seg = next_segment(len, offset)) != 0) { + ret = kvm_read_guest_page(kvm, gfn, (void *)dp, offset, seg); + if (ret < 0) + return (ret); + offset = 0; + len -= seg; + dp += seg; + ++gfn; } - - kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); - kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); - kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES); - kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); - kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); - kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); - - kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); - kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); - - update_cr8_intercept(vcpu); - -#ifdef CONFIG_KVM_APIC_ARCHITECTURE - /* Older userspace won't unhalt the vcpu on reset. */ - if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && - sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && - !is_protmode(vcpu)) - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; -#endif /* CONFIG_KVM_APIC_ARCHITECTURE */ - - vcpu_put(vcpu); - return (0); } -static void -kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) +int +kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) { - static int version; - struct pvclock_wall_clock wc; - struct timespec boot; - - if (!wall_clock) - return; - - version++; + int r; + unsigned long addr; + gfn_t gfn = gpa >> PAGESHIFT; + int offset = offset_in_page(gpa); - kvm_write_guest(kvm, wall_clock, &version, sizeof (version)); + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return (-EFAULT); - /* - * The guest calculates current wall clock time by adding - * system time (updated by kvm_write_guest_time below) to the - * wall clock specified here. guest system time equals host - * system time for us, thus we must fill in host boot time here. 
- */ #ifdef XXX - getboottime(&boot); - - wc.sec = boot.tv_sec; - wc.nsec = boot.tv_nsec; - wc.version = version; - - kvm_write_guest(kvm, wall_clock, &wc, sizeof (wc)); - - version++; - kvm_write_guest(kvm, wall_clock, &version, sizeof (version)); + pagefault_disable(); #else XXX_KVM_PROBE; #endif -} - -static int -next_segment(unsigned long len, int offset) -{ - if (len > PAGESIZE - offset) - return (PAGESIZE - offset); - else - return (len); -} -void -mark_page_dirty(struct kvm *kvm, gfn_t gfn) -{ - struct kvm_memory_slot *memslot; - - gfn = unalias_gfn(kvm, gfn); - memslot = gfn_to_memslot_unaliased(kvm, gfn); - - if (memslot && memslot->dirty_bitmap) { - unsigned long rel_gfn = gfn - memslot->base_gfn; - unsigned long *p = memslot->dirty_bitmap + rel_gfn / 64; - int offset = rel_gfn % 64; + r = copyin((caddr_t)addr + offset, data, len); +#ifdef XXX + pagefault_enable(); +#else + XXX_KVM_PROBE; +#endif + if (r) + return (-EFAULT); - /* avoid RMW */ - if (!test_bit(offset, p)) - __set_bit(offset, p); - } + return (0); } int @@ -2484,1010 +1124,277 @@ kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, unsigned long len) return (0); } -static int -xen_hvm_config(struct kvm_vcpu *vcpu, uint64_t data) -{ - struct kvm *kvm = vcpu->kvm; - int lm = is_long_mode(vcpu); - uint8_t *blob_addr = lm ? - (uint8_t *)(long)kvm->arch.xen_hvm_config.blob_addr_64 : - (uint8_t *)(long)kvm->arch.xen_hvm_config.blob_addr_32; - uint8_t blob_size = lm ? - kvm->arch.xen_hvm_config.blob_size_64 : - kvm->arch.xen_hvm_config.blob_size_32; - uint32_t page_num = data & ~PAGEMASK; - uint64_t page_addr = data & PAGEMASK; - uint8_t *page; - int r; - - r = E2BIG; - if (page_num >= blob_size) - goto out; - r = ENOMEM; - page = kmem_alloc(PAGESIZE, KM_SLEEP); - r = EFAULT; - if (copyin(blob_addr + (page_num * PAGESIZE), page, PAGESIZE)) - goto out_free; - if (kvm_write_guest(kvm, page_addr, page, PAGESIZE)) - goto out_free; - r = 0; -out_free: - kmem_free(page, PAGESIZE); -out: - return (r); -} - -static void -set_efer(struct kvm_vcpu *vcpu, uint64_t efer) -{ - if (efer & efer_reserved_bits) { - kvm_inject_gp(vcpu, 0); - return; - } - - if (is_paging(vcpu) && - (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) { - kvm_inject_gp(vcpu, 0); - return; - } - - if (efer & EFER_FFXSR) { - struct kvm_cpuid_entry2 *feat; - - feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); - if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) { - kvm_inject_gp(vcpu, 0); - return; - } - } - - if (efer & EFER_SVME) { - struct kvm_cpuid_entry2 *feat; - - feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); - if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) { - kvm_inject_gp(vcpu, 0); - return; - } - } - - kvm_x86_ops->set_efer(vcpu, efer); - - efer &= ~EFER_LMA; - efer |= vcpu->arch.efer & EFER_LMA; - - vcpu->arch.efer = efer; - - vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; - kvm_mmu_reset_context(vcpu); -} - -static int -msr_mtrr_valid(unsigned msr) -{ - switch (msr) { - case 0x200 ... 
0x200 + 2 * KVM_NR_VAR_MTRR - 1: - case MSR_MTRRfix64K_00000: - case MSR_MTRRfix16K_80000: - case MSR_MTRRfix16K_A0000: - case MSR_MTRRfix4K_C0000: - case MSR_MTRRfix4K_C8000: - case MSR_MTRRfix4K_D0000: - case MSR_MTRRfix4K_D8000: - case MSR_MTRRfix4K_E0000: - case MSR_MTRRfix4K_E8000: - case MSR_MTRRfix4K_F0000: - case MSR_MTRRfix4K_F8000: - case MSR_MTRRdefType: - case MSR_IA32_CR_PAT: - return (1); - case 0x2f8: - return (1); - } - - return (0); -} - -static int -valid_pat_type(unsigned t) -{ - return (t < 8 && (1 << t) & 0xf3); /* 0, 1, 4, 5, 6, 7 */ -} - -static int -valid_mtrr_type(unsigned t) -{ - return (t < 8 && (1 << t) & 0x73); /* 0, 1, 4, 5, 6 */ -} - -static int -mtrr_valid(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data) +int +kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) { - int i; - - if (!msr_mtrr_valid(msr)) - return (0); - - if (msr == MSR_IA32_CR_PAT) { - for (i = 0; i < 8; i++) - if (!valid_pat_type((data >> (i * 8)) & 0xff)) - return (0); - return (1); - } else if (msr == MSR_MTRRdefType) { - if (data & ~0xcff) - return (0); - return (valid_mtrr_type(data & 0xff)); - } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) { - for (i = 0; i < 8; i++) - if (!valid_mtrr_type((data >> (i * 8)) & 0xff)) - return (0); - return (1); - } - - /* variable MTRRs */ - return (valid_mtrr_type(data & 0xff)); + return (kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len)); } -static int -set_msr_mtrr(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data) +void +mark_page_dirty(struct kvm *kvm, gfn_t gfn) { - struct mtrr_state_type *state = &vcpu->arch.mtrr_state; - - uint64_t *p = (uint64_t *)&state->fixed_ranges; + struct kvm_memory_slot *memslot; - if (!mtrr_valid(vcpu, msr, data)) - return (1); + gfn = unalias_gfn(kvm, gfn); + memslot = gfn_to_memslot_unaliased(kvm, gfn); - if (msr == MSR_MTRRdefType) { - state->def_type = data; - state->enabled = (data & 0xc00) >> 10; - } else if (msr == MSR_MTRRfix64K_00000) - p[0] = data; - else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) - p[1 + msr - MSR_MTRRfix16K_80000] = data; - else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) - p[3 + msr - MSR_MTRRfix4K_C0000] = data; - else if (msr == MSR_IA32_CR_PAT) - vcpu->arch.pat = data; - else { /* Variable MTRRs */ - int idx, is_mtrr_mask; - uint64_t *pt; - - idx = (msr - 0x200) / 2; - is_mtrr_mask = msr - 0x200 - 2 * idx; - - if (!is_mtrr_mask) { - pt = (uint64_t *)&state->var_ranges[idx].base_lo; - } else { - pt = (uint64_t *)&state->var_ranges[idx].mask_lo; - } + if (memslot && memslot->dirty_bitmap) { + unsigned long rel_gfn = gfn - memslot->base_gfn; + unsigned long *p = memslot->dirty_bitmap + rel_gfn / 64; + int offset = rel_gfn % 64; - *pt = data; + /* avoid RMW */ + if (!test_bit(offset, p)) + __set_bit(offset, p); } - - kvm_mmu_reset_context(vcpu); - - return (0); -} - -int -clear_user(void *addr, unsigned long size) -{ - caddr_t ka; - int rval = 0; - - ka = kmem_zalloc(size, KM_SLEEP); - rval = copyout(ka, addr, size); - kmem_free(ka, size); - - return (rval); } -static int -set_msr_hyperv(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data) +/* + * The vCPU has executed a HLT instruction with in-kernel mode enabled. 
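+ *
+ * Editor's note: the loop below parks the thread on kvcpu_kick_cv and
+ * re-checks runnability after every wakeup.  The waker side (defined
+ * elsewhere in this change) is expected to signal the CV under the same
+ * lock, roughly:
+ *
+ *	mutex_enter(&vcpu->kvcpu_kick_lock);
+ *	cv_broadcast(&vcpu->kvcpu_kick_cv);
+ *	mutex_exit(&vcpu->kvcpu_kick_lock);
+ *
+ * Illustrative sketch only; see the actual kick routine for the
+ * authoritative sequence.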
+ */
+void
+kvm_vcpu_block(struct kvm_vcpu *vcpu)
{
-	switch (msr) {
-	case HV_X64_MSR_APIC_ASSIST_PAGE: {
-		unsigned long addr;
-
-		if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) {
-			vcpu->arch.hv_vapic = data;
+	for (;;) {
+		if (kvm_arch_vcpu_runnable(vcpu)) {
+			set_bit(KVM_REQ_UNHALT, &vcpu->requests);
			break;
		}
-		addr = gfn_to_hva(vcpu->kvm,
-		    data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT);
-
-		if (kvm_is_error_hva(addr))
-			return (1);
-
-		if (clear_user((void *)addr, PAGESIZE))
-			return (1);
-
-		vcpu->arch.hv_vapic = data;
-		break;
-	}
-
-	case HV_X64_MSR_EOI:
-		return (kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data));
-	case HV_X64_MSR_ICR:
-		return (kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data));
-	case HV_X64_MSR_TPR:
-		return (kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data));
-
-	default:
-		cmn_err(CE_WARN, "HYPER-V unimplemented wrmsr: 0x%x "
-		    "data 0x%lx\n", msr, data);
-		return (1);
-	}
-
-	return (0);
-}
+		if (issig(JUSTLOOKING))
+			break;

-static int
-set_msr_hyperv_pw(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data)
-{
-	struct kvm *kvm = vcpu->kvm;
-
-	switch (msr) {
-	case HV_X64_MSR_GUEST_OS_ID:
-		kvm->arch.hv_guest_os_id = data;
-		/* setting guest os id to zero disables hypercall page */
-		if (!kvm->arch.hv_guest_os_id)
-			kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE;
-		break;
-	case HV_X64_MSR_HYPERCALL: {
-		uint64_t gfn;
-		unsigned long addr;
-		uint8_t instructions[4];
+	mutex_enter(&vcpu->kvcpu_kick_lock);

-		/* if guest os id is not set, hypercall should remain disabled */
-		if (!kvm->arch.hv_guest_os_id)
-			break;
-		if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) {
-			kvm->arch.hv_hypercall = data;
+		if (kvm_cpu_has_pending_timer(vcpu)) {
+			mutex_exit(&vcpu->kvcpu_kick_lock);
			break;
		}
-		gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT;
-		addr = gfn_to_hva(kvm, gfn);
-		if (kvm_is_error_hva(addr))
-			return (1);
-		kvm_x86_ops->patch_hypercall(vcpu, instructions);
-		((unsigned char *)instructions)[3] = 0xc3; /* ret */
-		if (copyout(instructions, (caddr_t)addr, 4))
-			return (1);
-		kvm->arch.hv_hypercall = data;
-		break;
-	}
-	default:
-		cmn_err(CE_WARN, "HYPER-V unimplemented wrmsr: 0x%x "
-		    "data 0x%lx\n", msr, data);
-		return (1);
-	}
-
-	return (0);
-}
-static int
-set_msr_mce(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data)
-{
-	uint64_t mcg_cap = vcpu->arch.mcg_cap;
-	unsigned bank_num = mcg_cap & 0xff;
+	(void) cv_wait_sig_swap(&vcpu->kvcpu_kick_cv,
+	    &vcpu->kvcpu_kick_lock);

-	switch (msr) {
-	case MSR_IA32_MCG_STATUS:
-		vcpu->arch.mcg_status = data;
-		break;
-	case MSR_IA32_MCG_CTL:
-		if (!(mcg_cap & MCG_CTL_P))
-			return (1);
-		if (data != 0 && data != ~(uint64_t)0)
-			return (-1);
-		vcpu->arch.mcg_ctl = data;
-		break;
-	default:
-		if (msr >= MSR_IA32_MC0_CTL &&
-		    msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
-			uint32_t offset = msr - MSR_IA32_MC0_CTL;
-			/*
-			 * only 0 or all 1s can be written to IA32_MCi_CTL;
-			 * some Linux kernels though clear bit 10 in bank 4 to
-			 * work around a BIOS/GART TLB issue on AMD K8s, so
-			 * ignore this to avoid an uncaught #GP in the guest
-			 */
-			if ((offset & 0x3) == 0 &&
-			    data != 0 && (data | (1 << 10)) != ~(uint64_t)0)
-				return (-1);
-			vcpu->arch.mce_banks[offset] = data;
-			break;
-		}
-		return (1);
+		mutex_exit(&vcpu->kvcpu_kick_lock);
	}
-	return (0);
}

-static int
-kvm_hv_msr_partition_wide(uint32_t msr)
+/*
+ * Creates some virtual cpus. Good luck creating more than one. 
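+ *
+ * Editor's note: this is the backend of the KVM_CREATE_VCPU ioctl.  A
+ * minimal (hypothetical) userspace sequence would be:
+ *
+ *	int id = ioctl(vmfd, KVM_CREATE_VCPU, 0);
+ *	if (id == -1)
+ *		err(1, "KVM_CREATE_VCPU");
+ *
+ * Unlike Linux, which hands back a new vcpu file descriptor, this port
+ * returns the new vcpu's id through rval_p.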
+ */ +int +kvm_vm_ioctl_create_vcpu(struct kvm *kvm, uint32_t id, int *rval_p) { - int r = 0; - switch (msr) { - case HV_X64_MSR_GUEST_OS_ID: - case HV_X64_MSR_HYPERCALL: - r = 1; - break; - } + int r, i; + struct kvm_vcpu *vcpu, *v; - return (r); -} + vcpu = kvm_arch_vcpu_create(kvm, id); + if (vcpu == NULL) + return (EINVAL); -#ifdef XXX_KVM_DECLARATION -#define pfn_valid(pfn) ((pfn < physmax) && (pfn != PFN_INVALID)) +#ifdef XXX + preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); #else -#define pfn_valid(pfn) (pfn != PFN_INVALID) + XXX_KVM_PROBE; #endif -inline int -kvm_is_mmio_pfn(pfn_t pfn) -{ - if (pfn_valid(pfn)) { + r = kvm_arch_vcpu_setup(vcpu); + if (r) + return (r); + + mutex_enter(&kvm->lock); + #ifdef XXX - struct page *page = compound_head(pfn_to_page(pfn)); - return (PageReserved(page)); + if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) { #else - XXX_KVM_PROBE; + XXX_KVM_SYNC_PROBE; + if (kvm->online_vcpus == KVM_MAX_VCPUS) { #endif - return (0); - } else - return (1); -} + r = EINVAL; + goto vcpu_destroy; + } -page_t * -gfn_to_page(struct kvm *kvm, gfn_t gfn) -{ - pfn_t pfn = gfn_to_pfn(kvm, gfn); + /* kvm_for_each_vcpu(r, v, kvm) */ + for (i = 0; i < kvm->online_vcpus; i++) { + v = kvm->vcpus[i]; + if (v->vcpu_id == id) { + r = -EEXIST; + goto vcpu_destroy; + } + } - if (!kvm_is_mmio_pfn(pfn)) - return (pfn_to_page(pfn)); + /* BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]); */ - get_page(bad_page); - return (bad_page); -} + /* Now it's all set up, let userspace reach it */ + kvm_get_kvm(kvm); -void -kvm_release_page_dirty(page_t *page) -{ - kvm_release_pfn_dirty(page_to_pfn(page)); -} + *rval_p = kvm->online_vcpus; /* guarantee unique id */ + vcpu->vcpu_id = *rval_p; + + /* XXX need to protect online_vcpus */ + kvm->vcpus[kvm->online_vcpus] = vcpu; -int -kvm_set_msr_common(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data) -{ - switch (msr) { - case MSR_EFER: - set_efer(vcpu, data); - break; - case MSR_K7_HWCR: - data &= ~(uint64_t)0x40; /* ignore flush filter disable */ - if (data != 0) { - cmn_err(CE_NOTE, - "unimplemented HWCR wrmsr: 0x%lx\n", data); - return (1); - } - break; - case MSR_FAM10H_MMIO_CONF_BASE: - if (data != 0) { - cmn_err(CE_NOTE, "unimplemented MMIO_CONF_BASE wrmsr: " - "0x%lx\n", data); - return (1); - } - break; - case MSR_AMD64_NB_CFG: - break; - case MSR_IA32_DEBUGCTLMSR: - if (!data) { - /* We support the non-activated case already */ - break; - } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) { - /* - * Values other than LBR and BTF are vendor-specific, - * thus reserved and should throw a #GP - */ - return (1); - } - cmn_err(CE_NOTE, "%s: MSR_IA32_DEBUGCTLMSR 0x%lx, nop\n", - __func__, data); - break; - case MSR_IA32_UCODE_REV: - case MSR_IA32_UCODE_WRITE: - case MSR_VM_HSAVE_PA: - case MSR_AMD64_PATCH_LOADER: - break; - case 0x200 ... 0x2ff: - return (set_msr_mtrr(vcpu, msr, data)); - case MSR_IA32_APICBASE: - kvm_set_apic_base(vcpu, data); - break; - case APIC_BASE_MSR ... 
APIC_BASE_MSR + 0x3ff: - return (kvm_x2apic_msr_write(vcpu, msr, data)); - case MSR_IA32_MISC_ENABLE: - vcpu->arch.ia32_misc_enable_msr = data; - break; - case MSR_KVM_WALL_CLOCK: - vcpu->kvm->arch.wall_clock = data; - kvm_write_wall_clock(vcpu->kvm, data); - break; - case MSR_KVM_SYSTEM_TIME: { #ifdef XXX - if (vcpu->arch.time_page) { - kvm_release_page_dirty(vcpu->arch.time_page); - vcpu->arch.time_page = NULL; - } + smp_wmb(); #else - XXX_KVM_PROBE; + XXX_KVM_SYNC_PROBE; #endif + atomic_inc_32(&kvm->online_vcpus); - vcpu->arch.time = data; +#ifdef CONFIG_KVM_APIC_ARCHITECTURE + if (kvm->bsp_vcpu_id == id) + kvm->bsp_vcpu = vcpu; +#endif - /* we verify if the enable bit is set... */ - if (!(data & 1)) - break; + mutex_exit(&kvm->lock); + return (r); - /* ...but clean it before doing the actual write */ - vcpu->arch.time_offset = data & ~(PAGEOFFSET | 1); +vcpu_destroy: #ifdef XXX - vcpu->arch.time_page = - gfn_to_page(vcpu->kvm, data >> PAGESHIFT); - - if (is_error_page(vcpu->arch.time_page)) { - kvm_release_page_clean(vcpu->arch.time_page); - vcpu->arch.time_page = NULL; - } - - kvm_request_guest_time_update(vcpu); + mutex_exit(&kvm->lock); + kvm_arch_vcpu_destroy(vcpu); #else - XXX_KVM_PROBE; + XXX_KVM_PROBE; #endif - break; - } - case MSR_IA32_MCG_CTL: - case MSR_IA32_MCG_STATUS: - case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: - return (set_msr_mce(vcpu, msr, data)); - - /* - * Performance counters are not protected by a CPUID bit, so we should - * check all of them in the generic path for the sake of cross vendor - * migration. Writing a zero into the event select MSRs disables them, - * which we perfectly emulate ;-). Any other value should be at least - * reported, some guests depend on them. - */ - case MSR_P6_EVNTSEL0: - case MSR_P6_EVNTSEL1: - case MSR_K7_EVNTSEL0: - case MSR_K7_EVNTSEL1: - case MSR_K7_EVNTSEL2: - case MSR_K7_EVNTSEL3: - if (data != 0) - cmn_err(CE_NOTE, "unimplemented perfctr wrmsr: " - "0x%x data 0x%lx\n", msr, data); - break; - /* - * at least RHEL 4 unconditionally writes to the perfctr registers, - * so we ignore writes to make it happy. - */ - case MSR_P6_PERFCTR0: - case MSR_P6_PERFCTR1: - case MSR_K7_PERFCTR0: - case MSR_K7_PERFCTR1: - case MSR_K7_PERFCTR2: - case MSR_K7_PERFCTR3: - cmn_err(CE_NOTE, "unimplemented perfctr wrmsr: " - "0x%x data 0x%lx\n", msr, data); - break; - case HV_X64_MSR_GUEST_OS_ID ... 
HV_X64_MSR_SINT15: - if (kvm_hv_msr_partition_wide(msr)) { - int r; - mutex_enter(&vcpu->kvm->lock); - r = set_msr_hyperv_pw(vcpu, msr, data); - mutex_exit(&vcpu->kvm->lock); - return (r); - } else - return (set_msr_hyperv(vcpu, msr, data)); - break; - default: - if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) - return (xen_hvm_config(vcpu, data)); - if (!ignore_msrs) { - cmn_err(CE_NOTE, "unhandled wrmsr: 0x%x data %lx\n", - msr, data); - return (1); - } else { - cmn_err(CE_NOTE, "ignored wrmsr: 0x%x data %lx\n", - msr, data); - break; - } - } - - return (0); -} - -static int -get_msr_mtrr(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata) -{ - struct mtrr_state_type *state = &vcpu->arch.mtrr_state; - uint64_t *p = (uint64_t *)&state->fixed_ranges; - - if (!msr_mtrr_valid(msr)) - return (1); - - if (msr == MSR_MTRRdefType) - *pdata = vcpu->arch.mtrr_state.def_type + - (vcpu->arch.mtrr_state.enabled << 10); - else if (msr == MSR_MTRRfix64K_00000) - *pdata = p[0]; - else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) - *pdata = p[1 + msr - MSR_MTRRfix16K_80000]; - else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) - *pdata = p[3 + msr - MSR_MTRRfix4K_C0000]; - else if (msr == MSR_IA32_CR_PAT) - *pdata = vcpu->arch.pat; - else { /* Variable MTRRs */ - int idx, is_mtrr_mask; - uint64_t *pt; - - idx = (msr - 0x200) / 2; - is_mtrr_mask = msr - 0x200 - 2 * idx; - if (!is_mtrr_mask) { - pt = (uint64_t *)&state->var_ranges[idx].base_lo; - } else { - pt = (uint64_t *)&state->var_ranges[idx].mask_lo; - } - - *pdata = *pt; - } - - return (0); + return (r); } static int -get_msr_hyperv(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata) +kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) { - uint64_t data = 0; - - switch (msr) { - case HV_X64_MSR_VP_INDEX: { - int r; - struct kvm_vcpu *v; - kvm_for_each_vcpu(r, v, vcpu->kvm) - if (v == vcpu) - data = r; - break; - } - case HV_X64_MSR_EOI: - return (kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata)); - case HV_X64_MSR_ICR: - return (kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata)); - case HV_X64_MSR_TPR: - return (kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata)); - default: - cmn_err(CE_WARN, "Hyper-V unhandled rdmsr: 0x%x\n", msr); - return (1); - } + if (sigset) { + vcpu->sigset_active = 1; + vcpu->sigset = *sigset; + } else + vcpu->sigset_active = 0; - *pdata = data; return (0); } static int -get_msr_hyperv_pw(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata) +kvm_dev_ioctl_create_vm(kvm_devstate_t *ksp, intptr_t arg, int *rv) { - uint64_t data = 0; - struct kvm *kvm = vcpu->kvm; - - switch (msr) { - case HV_X64_MSR_GUEST_OS_ID: - data = kvm->arch.hv_guest_os_id; - break; - case HV_X64_MSR_HYPERCALL: - data = kvm->arch.hv_hypercall; - break; - default: - cmn_err(CE_WARN, "Hyper-V unhandled rdmsr: 0x%x\n", msr); - return (1); - } - - *pdata = data; + if (ksp->kds_kvmp != NULL) + return (EINVAL); - return (0); -} + ksp->kds_kvmp = kvm_create_vm(); -static int -get_msr_mce(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata) -{ - uint64_t data; - uint64_t mcg_cap = vcpu->arch.mcg_cap; - unsigned bank_num = mcg_cap & 0xff; - - switch (msr) { - case MSR_IA32_P5_MC_ADDR: - case MSR_IA32_P5_MC_TYPE: - data = 0; - break; - case MSR_IA32_MCG_CAP: - data = vcpu->arch.mcg_cap; - break; - case MSR_IA32_MCG_CTL: - if (!(mcg_cap & MCG_CTL_P)) - return (1); - data = vcpu->arch.mcg_ctl; - break; - case MSR_IA32_MCG_STATUS: - data = vcpu->arch.mcg_status; - break; - default: - if (msr >= MSR_IA32_MC0_CTL 
&& - msr < MSR_IA32_MC0_CTL + 4 * bank_num) { - uint32_t offset = msr - MSR_IA32_MC0_CTL; - data = vcpu->arch.mce_banks[offset]; - break; - } - return (1); + if (ksp->kds_kvmp == NULL) { + cmn_err(CE_WARN, "Could not create new vm\n"); + return (EIO); } - *pdata = data; - return (0); + *rv = ksp->kds_kvmp->kvmid; + return (DDI_SUCCESS); } int -kvm_get_msr_common(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata) +kvm_dev_ioctl_check_extension_generic(long arg, int *rv) { - uint64_t data; - - switch (msr) { - case MSR_IA32_PLATFORM_ID: - case MSR_IA32_UCODE_REV: - case MSR_IA32_EBL_CR_POWERON: - case MSR_IA32_DEBUGCTLMSR: - case MSR_IA32_LASTBRANCHFROMIP: - case MSR_IA32_LASTBRANCHTOIP: - case MSR_IA32_LASTINTFROMIP: - case MSR_IA32_LASTINTTOIP: - case MSR_K8_SYSCFG: - case MSR_K7_HWCR: - case MSR_VM_HSAVE_PA: - case MSR_P6_PERFCTR0: - case MSR_P6_PERFCTR1: - case MSR_P6_EVNTSEL0: - case MSR_P6_EVNTSEL1: - case MSR_K7_EVNTSEL0: - case MSR_K7_PERFCTR0: - case MSR_K8_INT_PENDING_MSG: - case MSR_AMD64_NB_CFG: - case MSR_FAM10H_MMIO_CONF_BASE: - data = 0; - break; - case MSR_MTRRcap: - data = 0x500 | KVM_NR_VAR_MTRR; - break; - case 0x200 ... 0x2ff: - return (get_msr_mtrr(vcpu, msr, pdata)); - case 0xcd: /* fsb frequency */ - data = 3; - break; - case MSR_IA32_APICBASE: - data = kvm_get_apic_base(vcpu); - break; - case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: - return (kvm_x2apic_msr_read(vcpu, msr, pdata)); - break; - case MSR_IA32_MISC_ENABLE: - data = vcpu->arch.ia32_misc_enable_msr; - break; - case MSR_IA32_PERF_STATUS: - /* TSC increment by tick */ - data = 1000ULL; - /* CPU multiplier */ - data |= (((uint64_t)4ULL) << 40); - break; - case MSR_EFER: - data = vcpu->arch.efer; - break; - case MSR_KVM_WALL_CLOCK: - data = vcpu->kvm->arch.wall_clock; - break; - case MSR_KVM_SYSTEM_TIME: - data = vcpu->arch.time; - break; - case MSR_IA32_P5_MC_ADDR: - case MSR_IA32_P5_MC_TYPE: - case MSR_IA32_MCG_CAP: - case MSR_IA32_MCG_CTL: - case MSR_IA32_MCG_STATUS: - case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: - return (get_msr_mce(vcpu, msr, pdata)); - case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: - if (kvm_hv_msr_partition_wide(msr)) { - int r; - mutex_enter(&vcpu->kvm->lock); - r = get_msr_hyperv_pw(vcpu, msr, pdata); - mutex_exit(&vcpu->kvm->lock); - return (r); - } else - return (get_msr_hyperv(vcpu, msr, pdata)); - break; + switch (arg) { + case KVM_CAP_USER_MEMORY: + case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: + case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: +#ifdef CONFIG_KVM_APIC_ARCHITECTURE + case KVM_CAP_SET_BOOT_CPU_ID: +#endif + case KVM_CAP_INTERNAL_ERROR_DATA: + *rv = 1; + return (DDI_SUCCESS); +#ifdef CONFIG_HAVE_KVM_IRQCHIP + case KVM_CAP_IRQ_ROUTING: + *rv = KVM_MAX_IRQ_ROUTES; + return (DDI_SUCCESS); +#endif default: - if (!ignore_msrs) { - cmn_err(CE_NOTE, "unhandled rdmsr: 0x%x\n", msr); - return (1); - } else { - cmn_err(CE_NOTE, "ignored rdmsr: 0x%x\n", msr); - data = 0; - } break; } - *pdata = data; - - return (0); + return (kvm_dev_ioctl_check_extension(arg, rv)); } -/* - * Read or write a bunch of msrs. All parameters are kernel addresses. - * - * @return number of msrs set successfully. 
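- *
- * Editor's note: the loop below stops at the first MSR that do_msr()
- * rejects, so a return of r < msrs->nmsrs identifies the failing entry.
- * A (hypothetical) caller could surface that as:
- *
- *	if ((r = __msr_io(vcpu, msrs, entries, do_set_msr)) < msrs->nmsrs)
- *		cmn_err(CE_NOTE, "wrmsr 0x%x rejected", entries[r].index);
- *
- * Illustrative only; the real callers are expected to simply hand the
- * count back to userspace.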
- */
-static int
-__msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
-    struct kvm_msr_entry *entries, int (*do_msr)(struct kvm_vcpu *vcpu,
-    unsigned index, uint64_t *data))
+static void
+hardware_enable(void *junk)
{
-	int i, idx;
-
-	vcpu_load(vcpu);
-
-#ifdef XXX
-	idx = srcu_read_lock(&vcpu->kvm->srcu);
-#else
-	XXX_KVM_SYNC_PROBE;
-#endif
-	for (i = 0; i < msrs->nmsrs; i++) {
-		if (do_msr(vcpu, entries[i].index, &entries[i].data))
-			break;
-	}
-
-#ifdef XXX
-	srcu_read_unlock(&vcpu->kvm->srcu, idx);
-#else
-	XXX_KVM_SYNC_PROBE;
-#endif
-	vcpu_put(vcpu);
-
-	return (i);
-}
+	int cpu;
+	int r;

-/*
- * Reads an msr value (of 'msr_index') into 'pdata'.
- * Returns 0 on success, non-0 otherwise.
- * Assumes vcpu_load() was already called.
- */
-int
-kvm_get_msr(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t *pdata)
-{
-	return (kvm_x86_ops->get_msr(vcpu, msr_index, pdata));
-}
+	cpu = curthread->t_cpu->cpu_id;

-/*
- * Writes an msr value into the appropriate "register".
- * Returns 0 on success, non-0 otherwise.
- * Assumes vcpu_load() was already called.
- */
-int
-kvm_set_msr(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t data)
-{
-	return (kvm_x86_ops->set_msr(vcpu, msr_index, data));
-}
+	if (CPU_IN_SET(cpus_hardware_enabled, cpu))
+		return;

-/*
- * Adapt set_msr() to msr_io()'s calling convention
- */
-static int
-do_set_msr(struct kvm_vcpu *vcpu, unsigned index, uint64_t *data)
-{
-	return (kvm_set_msr(vcpu, index, *data));
-}
+	CPUSET_ADD(cpus_hardware_enabled, cpu);

-#define EXCPT_BENIGN 0
-#define EXCPT_CONTRIBUTORY 1
-#define EXCPT_PF 2
+	r = kvm_arch_hardware_enable(NULL);

-static int
-exception_class(int vector)
-{
-	switch (vector) {
-	case PF_VECTOR:
-		return (EXCPT_PF);
-	case DE_VECTOR:
-	case TS_VECTOR:
-	case NP_VECTOR:
-	case SS_VECTOR:
-	case GP_VECTOR:
-		return (EXCPT_CONTRIBUTORY);
-	default:
-		break;
+	if (r) {
+		CPUSET_DEL(cpus_hardware_enabled, cpu);
+		atomic_inc_32(&hardware_enable_failed);
+		cmn_err(CE_WARN, "kvm: enabling virtualization CPU%d failed\n",
+		    cpu);
	}
-
-	return (EXCPT_BENIGN);
}

static void
-kvm_multiple_exception(struct kvm_vcpu *vcpu,
-    unsigned nr, int has_error, uint32_t error_code)
+hardware_disable(void *junk)
{
-	uint32_t prev_nr;
-	int class1, class2;
-
-	if (!vcpu->arch.exception.pending) {
-queue:
-		vcpu->arch.exception.pending = 1;
-		vcpu->arch.exception.has_error_code = has_error;
-		vcpu->arch.exception.nr = nr;
-		vcpu->arch.exception.error_code = error_code;
-		return;
-	}
+	int cpu = curthread->t_cpu->cpu_id;

-	/* a second exception is pending: decide how the two combine */
-	prev_nr = vcpu->arch.exception.nr;
-	if (prev_nr == DF_VECTOR) {
-		/* triple fault -> shutdown */
-		set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+	if (!CPU_IN_SET(cpus_hardware_enabled, cpu))
		return;
-	}
-	class1 = exception_class(prev_nr);
-	class2 = exception_class(nr);
-	if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) ||
-	    (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
-		/* generate double fault per SDM Table 5-5 */
-		vcpu->arch.exception.pending = 1;
-		vcpu->arch.exception.has_error_code = 1;
-		vcpu->arch.exception.nr = DF_VECTOR;
-		vcpu->arch.exception.error_code = 0;
-	} else {
-		/*
-		 * replace the previous exception with a new one in the hope
-		 * that instruction re-execution will regenerate the lost
-		 * exception
-		 */
-		goto queue;
-	}
-}

-void
-kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
-{
-	kvm_multiple_exception(vcpu, nr, 0, 0);
-}
-void
-kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, uint32_t error_code)
-{
-	
kvm_multiple_exception(vcpu, nr, 1, error_code); + CPUSET_DEL(cpus_hardware_enabled, cpu); + kvm_arch_hardware_disable(NULL); } -inline void -kvm_clear_exception_queue(struct kvm_vcpu *vcpu) -{ - vcpu->arch.exception.pending = 0; -} +/* + * The following needs to run on each cpu. Currently, + * wait is always 1, so we use the kvm_xcall() routine which + * calls xc_sync. Later, if needed, the implementation can be + * changed to use xc_call or xc_call_nowait. + */ +#define on_each_cpu(func, info, wait) \ + /*CSTYLED*/ \ + ({ \ + kvm_xcall(KVM_CPUALL, func, info); \ + 0; \ + }) -inline void -kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu) +static void +hardware_disable_all_nolock(void) { - vcpu->arch.interrupt.pending = 0; + kvm_usage_count--; + if (!kvm_usage_count) + on_each_cpu(hardware_disable, NULL, 1); } - -void -shared_msr_update(unsigned slot, uint32_t msr) +static void +hardware_disable_all(void) { - struct kvm_shared_msrs *smsr; - uint64_t value; - smsr = shared_msrs[CPU->cpu_id]; - - /* - * only read, and nobody should modify it at this time, - * so don't need lock - */ - if (slot >= shared_msrs_global.nr) { - cmn_err(CE_WARN, "kvm: invalid MSR slot!"); - return; - } - - rdmsrl_safe(msr, (unsigned long long *)&value); - smsr->values[slot].host = value; - smsr->values[slot].curr = value; + mutex_enter(&kvm_lock); + hardware_disable_all_nolock(); + mutex_exit(&kvm_lock); } -void -kvm_set_shared_msr(struct kvm_vcpu *vcpu, unsigned slot, uint64_t value, - uint64_t mask) +static int +hardware_enable_all(void) { - struct kvm_shared_msrs *smsr = shared_msrs[CPU->cpu_id]; - - if (((value ^ smsr->values[slot].curr) & mask) == 0) - return; - - smsr->values[slot].curr = value; - wrmsrl(shared_msrs_global.msrs[slot], value); + int r = 0; - if (!smsr->registered) { - smsr->urn.on_user_return = kvm_on_user_return; - kvm_user_return_notifier_register(vcpu, &smsr->urn); - smsr->registered = 1; - } -} + mutex_enter(&kvm_lock); -int -kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) -{ - return (kvm_x86_ops->interrupt_allowed(vcpu)); -} + kvm_usage_count++; + if (kvm_usage_count == 1) { + hardware_enable_failed = 0; + on_each_cpu(hardware_enable, NULL, 1); -static int -kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, uint32_t access, uint32_t *error) -{ - uintptr_t data = (uintptr_t)val; - int r = 0; /* X86EMUL_CONTINUE */ - - while (bytes) { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, - access, error); - unsigned offset = addr & (PAGESIZE-1); - unsigned toread = min(bytes, (unsigned)PAGESIZE - offset); - int ret; - - if (gpa == UNMAPPED_GVA) { - r = 1; /* X86EMUL_PROPAGATE_FAULT */ - goto out; - } - ret = kvm_read_guest(vcpu->kvm, gpa, (void *)data, toread); - if (ret < 0) { - r = 1; /* X86EMUL_UNHANDLEABLE */ - goto out; + if (hardware_enable_failed) { + hardware_disable_all_nolock(); + r = EBUSY; } - - bytes -= toread; - data += toread; - addr += toread; } -out: - return (r); -} - -void -kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, - uint32_t error_code) -{ - KVM_VCPU_KSTAT_INC(vcpu, kvmvs_pf_guest); - vcpu->arch.cr2 = addr; - kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); -} -static int -kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, uint32_t *error) -{ - return (kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error)); -} - -/* used for instruction fetching */ -static int -kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, 
uint32_t *error) -{ - uint32_t access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? - PFERR_USER_MASK : 0; + mutex_exit(&kvm_lock); - return (kvm_read_guest_virt_helper(addr, val, bytes, vcpu, - access | PFERR_FETCH_MASK, error)); + return (r); } /* kvm_io_bus_write - called under kvm->slots_lock */ @@ -3530,2596 +1437,398 @@ kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, return (-EOPNOTSUPP); } -static int -vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, const void *v) -{ - if (vcpu->arch.apic && - !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) - return (0); - - return (kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v)); -} - -static int -vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) -{ - if (vcpu->arch.apic && - !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) - return (0); - - return (kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v)); -} - -gpa_t -kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, uint32_t *error) -{ - uint32_t access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? - PFERR_USER_MASK : 0; - - return (vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error)); -} - -static int -kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, uint32_t *error) -{ - uint32_t access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? - PFERR_USER_MASK : 0; - - return (kvm_read_guest_virt_helper(addr, val, - bytes, vcpu, access, error)); -} - -static int -emulator_read_emulated(unsigned long addr, void *val, - unsigned int bytes, struct kvm_vcpu *vcpu) -{ - gpa_t gpa; - uint32_t error_code; - - if (vcpu->mmio_read_completed) { - memcpy(val, vcpu->mmio_data, bytes); - KVM_TRACE3(mmio__read, unsigned int, bytes, uintptr_t, - vcpu->mmio_phys_addr, uint64_t, *(uint64_t *)val); - - vcpu->mmio_read_completed = 0; - return (X86EMUL_CONTINUE); - } - - gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code); - - if (gpa == UNMAPPED_GVA) { - kvm_inject_page_fault(vcpu, addr, error_code); - return (X86EMUL_PROPAGATE_FAULT); - } - - /* For APIC access vmexit */ - if ((gpa & PAGEMASK) == APIC_DEFAULT_PHYS_BASE) - goto mmio; - - if (kvm_read_guest_virt(addr, val, - bytes, vcpu, NULL) == X86EMUL_CONTINUE) - return (X86EMUL_CONTINUE); - -mmio: - /* - * Is this MMIO handled locally? - */ - if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) { - KVM_TRACE3(mmio__read, unsigned int, bytes, uintptr_t, gpa, - uint64_t, *(uint64_t *)val); - return (X86EMUL_CONTINUE); - } - - KVM_TRACE2(mmio__read__unsatisfied, unsigned int, bytes, - uintptr_t, gpa); - - vcpu->mmio_needed = 1; - vcpu->mmio_phys_addr = gpa; - vcpu->mmio_size = bytes; - vcpu->mmio_is_write = 0; - - return (X86EMUL_UNHANDLEABLE); -} - -int -emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, - const void *val, int bytes) -{ - int ret; - - ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); - - if (ret < 0) - return (0); - - kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1); - - return (1); -} - -gpa_t -kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, uint32_t *error) -{ - uint32_t access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? 
- PFERR_USER_MASK : 0; - - access |= PFERR_WRITE_MASK; - - return (vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error)); -} - -static int -emulator_write_emulated_onepage(unsigned long addr, const void *val, - unsigned int bytes, struct kvm_vcpu *vcpu) -{ - gpa_t gpa; - uint32_t error_code; - - gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code); - - if (gpa == UNMAPPED_GVA) { - kvm_inject_page_fault(vcpu, addr, error_code); - return (X86EMUL_PROPAGATE_FAULT); - } - - /* For APIC access vmexit */ - if ((gpa & PAGEMASK) == APIC_DEFAULT_PHYS_BASE) - goto mmio; - - if (emulator_write_phys(vcpu, gpa, val, bytes)) - return (X86EMUL_CONTINUE); - -mmio: - KVM_TRACE3(mmio__write, unsigned int, bytes, uintptr_t, gpa, - uint64_t, *(uint64_t *)val); - - /* - * Is this MMIO handled locally? - */ - if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) - return (X86EMUL_CONTINUE); - - vcpu->mmio_needed = 1; - vcpu->mmio_phys_addr = gpa; - vcpu->mmio_size = bytes; - vcpu->mmio_is_write = 1; - memcpy(vcpu->mmio_data, val, bytes); - - return (X86EMUL_CONTINUE); -} - +/* Caller must hold slots_lock. */ int -emulator_write_emulated(unsigned long addr, const void *val, - unsigned int bytes, struct kvm_vcpu *vcpu) +kvm_io_bus_register_dev(struct kvm *kvm, + enum kvm_bus bus_idx, struct kvm_io_device *dev) { - uintptr_t data = (uintptr_t)val; - - /* Crossing a page boundary? */ - if (((addr + bytes - 1) ^ addr) & PAGEMASK) { - int rc, now; - - now = -addr & ~PAGEMASK; - rc = emulator_write_emulated_onepage(addr, - (void *)data, now, vcpu); - - if (rc != X86EMUL_CONTINUE) - return (rc); - - addr += now; - data += now; - bytes -= now; - } - - return (emulator_write_emulated_onepage(addr, val, bytes, vcpu)); -} - -static int -emulator_cmpxchg_emulated(unsigned long addr, const void *old, - const void *new, unsigned int bytes, struct kvm_vcpu *vcpu) -{ - cmn_err(CE_WARN, "kvm: emulating exchange as write\n"); -#ifndef CONFIG_X86_64 - /* guests cmpxchg8b have to be emulated atomically */ - if (bytes == 8) { - gpa_t gpa; - page_t page; - char *kaddr; - uint64_t val; - - gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); - - if (gpa == UNMAPPED_GVA || - (gpa & PAGEMASK) == APIC_DEFAULT_PHYS_BASE) - goto emul_write; + struct kvm_io_bus *new_bus, *bus; - if (((gpa + bytes - 1) & PAGEMASK) != (gpa & PAGEMASK)) - goto emul_write; - - val = *(uint64_t *)new; - - page = gfn_to_page(vcpu->kvm, gpa >> PAGESHIFT); - kaddr = kmap_atomic(page, KM_USER0); - - set_64bit((uint64_t *)(kaddr + offset_in_page(gpa)), val); - kunmap_atomic(kaddr, KM_USER0); - kvm_release_page_dirty(page); - } -emul_write: -#endif - - return (emulator_write_emulated(addr, new, bytes, vcpu)); -} - -static struct x86_emulate_ops emulate_ops = { - .read_std = kvm_read_guest_virt_system, - .fetch = kvm_fetch_guest_virt, - .read_emulated = emulator_read_emulated, - .write_emulated = emulator_write_emulated, - .cmpxchg_emulated = emulator_cmpxchg_emulated, -}; - -static void -cache_all_regs(struct kvm_vcpu *vcpu) -{ - kvm_register_read(vcpu, VCPU_REGS_RAX); - kvm_register_read(vcpu, VCPU_REGS_RSP); - kvm_register_read(vcpu, VCPU_REGS_RIP); - vcpu->arch.regs_dirty = ~0; -} - -static unsigned long -get_segment_base(struct kvm_vcpu *vcpu, int seg) -{ - return (kvm_x86_ops->get_segment_base(vcpu, seg)); -} - -void -kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) -{ - uint8_t opcodes[4]; - unsigned long rip = kvm_rip_read(vcpu); - unsigned long rip_linear; + bus = kvm->buses[bus_idx]; + if (bus->dev_count > NR_IOBUS_DEVS-1) + return (-ENOSPC); + 
new_bus = kmem_zalloc(sizeof (struct kvm_io_bus), KM_SLEEP);
+	if (!new_bus)
+		return (-ENOMEM);
+	memcpy(new_bus, bus, sizeof (struct kvm_io_bus));
+	new_bus->devs[new_bus->dev_count++] = dev;
#ifdef XXX
-	if (!printk_ratelimit())
-		return;
+	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
+	synchronize_srcu_expedited(&kvm->srcu);
#else
	XXX_KVM_PROBE;
+	kvm->buses[bus_idx] = new_bus;
#endif
+	if (bus)
+		kmem_free(bus, sizeof (struct kvm_io_bus));

-	rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
-
-	kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL);
-
-	cmn_err(CE_WARN, "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
-	    context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
+	return (0);
}

+/* Caller must hold slots_lock. */
int
-emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,
-    uint16_t error_code, int emulation_type)
+kvm_io_bus_unregister_dev(struct kvm *kvm,
+    enum kvm_bus bus_idx, struct kvm_io_device *dev)
{
-	int r, shadow_mask;
-	struct decode_cache *c;
-	struct kvm_run *run = vcpu->run;
+	int i, r;
+	struct kvm_io_bus *new_bus, *bus;

-	kvm_clear_exception_queue(vcpu);
-	vcpu->arch.mmio_fault_cr2 = cr2;
+	new_bus = kmem_zalloc(sizeof (struct kvm_io_bus), KM_SLEEP);
+	if (!new_bus)
+		return (-ENOMEM);

-	/*
-	 * TODO: fix emulate.c to use guest_read/write_register
-	 * instead of direct ->regs accesses; this can save hundreds of
-	 * cycles on Intel for instructions that don't read/change RSP,
-	 * for example.
-	 */
-	cache_all_regs(vcpu);
+	bus = kvm->buses[bus_idx];
+	memcpy(new_bus, bus, sizeof (struct kvm_io_bus));

-	vcpu->mmio_is_write = 0;
-	vcpu->arch.pio.string = 0;
+	r = -ENOENT;
+	for (i = 0; i < new_bus->dev_count; i++) {
+		if (new_bus->devs[i] == dev) {
+			r = 0;
+			new_bus->devs[i] = new_bus->devs[--new_bus->dev_count];
+			break;
+		}
+	}

-	if (!(emulation_type & EMULTYPE_NO_DECODE)) {
-		int cs_db, cs_l;
-		kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
-
-		vcpu->arch.emulate_ctxt.vcpu = vcpu;
-		vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
-		vcpu->arch.emulate_ctxt.mode = (!is_protmode(vcpu)) ?
-		    X86EMUL_MODE_REAL :
-		    (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) ?
-		    X86EMUL_MODE_VM86 : cs_l ? X86EMUL_MODE_PROT64 :
-		    cs_db ? 
X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - - r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); + bus = kvm->buses[bus_idx]; + memcpy(new_bus, bus, sizeof (struct kvm_io_bus)); - /* - * Only allow emulation of specific instructions on #UD - * (namely VMMCALL, sysenter, sysexit, syscall) - */ - c = &vcpu->arch.emulate_ctxt.decode; - if (emulation_type & EMULTYPE_TRAP_UD) { - if (!c->twobyte) - return (EMULATE_FAIL); - switch (c->b) { - case 0x01: /* VMMCALL */ - if (c->modrm_mod != 3 || c->modrm_rm != 1) - return (EMULATE_FAIL); - break; - case 0x34: /* sysenter */ - case 0x35: /* sysexit */ - if (c->modrm_mod != 0 || c->modrm_rm != 0) - return (EMULATE_FAIL); - break; - case 0x05: /* syscall */ - if (c->modrm_mod != 0 || c->modrm_rm != 0) - return (EMULATE_FAIL); - break; - default: - return (EMULATE_FAIL); - } - - if (!(c->modrm_reg == 0 || c->modrm_reg == 3)) - return (EMULATE_FAIL); - } - - KVM_VCPU_KSTAT_INC(vcpu, kvmvs_insn_emulation); - - if (r) { - KVM_VCPU_KSTAT_INC(vcpu, kvmvs_insn_emulation_fail); - - if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) - return (EMULATE_DONE); - return (EMULATE_FAIL); + r = -ENOENT; + for (i = 0; i < new_bus->dev_count; i++) { + if (new_bus->devs[i] == dev) { + r = 0; + new_bus->devs[i] = new_bus->devs[--new_bus->dev_count]; + break; } } - if (emulation_type & EMULTYPE_SKIP) { - kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip); - return (EMULATE_DONE); - } - - r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); - shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; - - if (r == 0) - kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); - - if (vcpu->arch.pio.string) - return (EMULATE_DO_MMIO); - - if ((r || vcpu->mmio_is_write) && run) { - run->exit_reason = KVM_EXIT_MMIO; - run->mmio.phys_addr = vcpu->mmio_phys_addr; - memcpy(run->mmio.data, vcpu->mmio_data, 8); - run->mmio.len = vcpu->mmio_size; - run->mmio.is_write = vcpu->mmio_is_write; - } - if (r) { - if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) - return (EMULATE_DONE); - if (!vcpu->mmio_needed) { - kvm_report_emulation_failure(vcpu, "mmio"); - return (EMULATE_FAIL); - } - - return (EMULATE_DO_MMIO); - } - - kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); - - if (vcpu->mmio_is_write) { - vcpu->mmio_needed = 0; - return (EMULATE_DO_MMIO); - } - - return (EMULATE_DONE); -} - -int -kvm_event_needs_reinjection(struct kvm_vcpu *vcpu) -{ - return (vcpu->arch.exception.pending || vcpu->arch.interrupt.pending || - vcpu->arch.nmi_injected); -} - -int -kvm_emulate_halt(struct kvm_vcpu *vcpu) -{ - KVM_VCPU_KSTAT_INC(vcpu, kvmvs_halt_exits); - - if (irqchip_in_kernel(vcpu->kvm)) { - vcpu->arch.mp_state = KVM_MP_STATE_HALTED; - return (1); - } else { - vcpu->run->exit_reason = KVM_EXIT_HLT; - return (0); - } -} - -static int -kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, uint32_t *error) -{ - uintptr_t data = (uintptr_t)val; - - while (bytes) { - gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error); - unsigned offset = addr & (PAGESIZE-1); - unsigned towrite = min(bytes, (unsigned)PAGESIZE - offset); - int ret; - - if (gpa == UNMAPPED_GVA) - return (X86EMUL_PROPAGATE_FAULT); - - if (kvm_write_guest(vcpu->kvm, gpa, (void *)data, towrite) < 0) - return (X86EMUL_UNHANDLEABLE); - - bytes -= towrite; - data += towrite; - addr += towrite; - } - - return (0); -} - -static int -pio_copy_data(struct kvm_vcpu *vcpu) -{ - void *p = vcpu->arch.pio_data; - gva_t q = vcpu->arch.pio.guest_gva; - unsigned bytes; - int ret; - uint32_t error_code; 
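-	/*
-	 * Editor's note (illustrative): this routine shuttles at most one
-	 * pio_data page between the guest virtual address and the in-kernel
-	 * buffer; size * cur_count is assumed, by the callers that set up
-	 * string I/O, never to exceed PAGESIZE.
-	 */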
- - bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; - - if (vcpu->arch.pio.in) - ret = kvm_write_guest_virt(q, p, bytes, vcpu, &error_code); - else - ret = kvm_read_guest_virt(q, p, bytes, vcpu, &error_code); - - if (ret == X86EMUL_PROPAGATE_FAULT) - kvm_inject_page_fault(vcpu, q, error_code); - - return (ret); -} - -int -complete_pio(struct kvm_vcpu *vcpu) -{ - struct kvm_pio_request *io = &vcpu->arch.pio; - long delta; - int r; - unsigned long val; - - if (!io->string) { - if (io->in) { - val = kvm_register_read(vcpu, VCPU_REGS_RAX); - memcpy(&val, vcpu->arch.pio_data, io->size); - kvm_register_write(vcpu, VCPU_REGS_RAX, val); - } - } else { - if (io->in) { - r = pio_copy_data(vcpu); - if (r) - goto out; - } - - delta = 1; - if (io->rep) { - delta *= io->cur_count; - /* - * The size of the register should really depend on - * current address size. - */ - val = kvm_register_read(vcpu, VCPU_REGS_RCX); - val -= delta; - kvm_register_write(vcpu, VCPU_REGS_RCX, val); - } - if (io->down) - delta = -delta; - delta *= io->size; - if (io->in) { - val = kvm_register_read(vcpu, VCPU_REGS_RDI); - val += delta; - kvm_register_write(vcpu, VCPU_REGS_RDI, val); - } else { - val = kvm_register_read(vcpu, VCPU_REGS_RSI); - val += delta; - kvm_register_write(vcpu, VCPU_REGS_RSI, val); - } - } -out: - io->count -= io->cur_count; - io->cur_count = 0; - - return (0); -} - -static int -kernel_pio(struct kvm_vcpu *vcpu, void *pd) -{ - /* TODO: String I/O for in kernel device */ - int r; - - if (vcpu->arch.pio.in) { - r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port, - vcpu->arch.pio.size, pd); - } else { - r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, - vcpu->arch.pio.port, vcpu->arch.pio.size, pd); - } - - return (r); -} - -int -kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port) -{ - unsigned long val; - - DTRACE_PROBE4(kvm__pio, int, !in, unsigned, port, int, size, - unsigned long, 1) - - vcpu->run->exit_reason = KVM_EXIT_IO; - vcpu->run->io.direction = in ? 
KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; - vcpu->run->io.size = vcpu->arch.pio.size = size; - vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGESIZE; - vcpu->run->io.count = vcpu->arch.pio.count = - vcpu->arch.pio.cur_count = 1; - vcpu->run->io.port = vcpu->arch.pio.port = port; - vcpu->arch.pio.in = in; - vcpu->arch.pio.string = 0; - vcpu->arch.pio.down = 0; - vcpu->arch.pio.rep = 0; - - if (!vcpu->arch.pio.in) { - val = kvm_register_read(vcpu, VCPU_REGS_RAX); - memcpy(vcpu->arch.pio_data, &val, 4); - } - - if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { - complete_pio(vcpu); - return (1); - } - - return (0); -} - - -void -kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) -{ - cr0 |= X86_CR0_ET; - -#ifdef CONFIG_X86_64 - if (cr0 & 0xffffffff00000000UL) { - kvm_inject_gp(vcpu, 0); - return; - } -#endif - - cr0 &= ~CR0_RESERVED_BITS; - - if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { - kvm_inject_gp(vcpu, 0); - return; - } - - if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { - kvm_inject_gp(vcpu, 0); - return; + kmem_free(new_bus, sizeof (struct kvm_io_bus)); + return (r); } - if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { -#ifdef CONFIG_X86_64 - if ((vcpu->arch.efer & EFER_LME)) { - int cs_db, cs_l; - - if (!is_pae(vcpu)) { - kvm_inject_gp(vcpu, 0); - return; - } - - kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - if (cs_l) { - kvm_inject_gp(vcpu, 0); - return; - - } - } else +#ifdef XXX + rcu_assign_pointer(kvm->buses[bus_idx], new_bus); + synchronize_srcu_expedited(&kvm->srcu); +#else + XXX_KVM_SYNC_PROBE; + kvm->buses[bus_idx] = new_bus; #endif - if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { - kvm_inject_gp(vcpu, 0); - return; - } - - } - - kvm_x86_ops->set_cr0(vcpu, cr0); - vcpu->arch.cr0 = cr0; - kvm_mmu_reset_context(vcpu); -} - -static int -pdptrs_changed(struct kvm_vcpu *vcpu) -{ - uint64_t pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; - - if (is_long_mode(vcpu) || !is_pae(vcpu)) - return (0); - - if (!test_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_avail)) { - return (1); - } - - if (kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, - pdpte, sizeof (pdpte)) < 0) - return (1); - - return (memcmp(pdpte, vcpu->arch.pdptrs, sizeof (pdpte)) != 0); -} - -void -kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) -{ - if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { - kvm_mmu_sync_roots(vcpu); - kvm_mmu_flush_tlb(vcpu); - return; - } - - if (is_long_mode(vcpu)) { - if (cr3 & CR3_L_MODE_RESERVED_BITS) { - kvm_inject_gp(vcpu, 0); - return; - } - } else { - if (is_pae(vcpu)) { - if (cr3 & CR3_PAE_RESERVED_BITS) { - kvm_inject_gp(vcpu, 0); - return; - } - if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { - kvm_inject_gp(vcpu, 0); - return; - } - } - /* - * We don't check reserved bits in nonpae mode, because - * this isn't enforced, and VMware depends on this. - */ - } - - /* - * Does the new cr3 value map to physical memory? (Note, we - * catch an invalid cr3 even in real-mode, because it would - * cause trouble later on when we turn on paging anyway.) - * - * A real CPU would silently accept an invalid cr3 and would - * attempt to use it - with largely undefined (and often hard - * to debug) behavior on the guest side. 
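- *
- * Editor's note: "maps to physical memory" concretely means the
- * gfn_to_memslot() check below.  As a worked (hypothetical) example, a
- * guest with a single 512 MB memslot at gpa 0 would have cr3 = 0x1000
- * accepted (gfn 1 falls in the slot), while cr3 = 0x20001000 (gfn
- * 0x20001, past the slot's last gfn 0x1ffff) takes the kvm_inject_gp()
- * path instead.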
- */ - if ((!gfn_to_memslot(vcpu->kvm, cr3 >> PAGESHIFT))) - kvm_inject_gp(vcpu, 0); - else { - vcpu->arch.cr3 = cr3; - vcpu->arch.mmu.new_cr3(vcpu); - } -} - -void -kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) -{ - unsigned long old_cr4 = kvm_read_cr4(vcpu); - unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; - - if (cr4 & CR4_RESERVED_BITS) { - kvm_inject_gp(vcpu, 0); - return; - } - - if (is_long_mode(vcpu)) { - if (!(cr4 & X86_CR4_PAE)) { - kvm_inject_gp(vcpu, 0); - return; - } - } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) && - ((cr4 ^ old_cr4) & pdptr_bits) && - !load_pdptrs(vcpu, vcpu->arch.cr3)) { - kvm_inject_gp(vcpu, 0); - return; - } - - if (cr4 & X86_CR4_VMXE) { - kvm_inject_gp(vcpu, 0); - return; - } - - kvm_x86_ops->set_cr4(vcpu, cr4); - vcpu->arch.cr4 = cr4; - vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled; - kvm_mmu_reset_context(vcpu); -} - -void -kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) -{ - kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0ful) | (msw & 0x0f)); -} - -/* - * Checks if cpl <= required_cpl; if true, return true. Otherwise queue - * a #GP and return false. - */ -int -kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) -{ - if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl) - return (1); - kvm_queue_exception_e(vcpu, GP_VECTOR, 0); - return (0); -} - -void -kvm_emulate_cpuid(struct kvm_vcpu *vcpu) -{ - uint32_t function, index; - struct kvm_cpuid_entry2 *best; - - function = kvm_register_read(vcpu, VCPU_REGS_RAX); - index = kvm_register_read(vcpu, VCPU_REGS_RCX); - kvm_register_write(vcpu, VCPU_REGS_RAX, 0); - kvm_register_write(vcpu, VCPU_REGS_RBX, 0); - kvm_register_write(vcpu, VCPU_REGS_RCX, 0); - kvm_register_write(vcpu, VCPU_REGS_RDX, 0); - best = kvm_find_cpuid_entry(vcpu, function, index); - if (best) { - kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); - kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); - kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); - kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); - } - kvm_x86_ops->skip_emulated_instruction(vcpu); - - KVM_TRACE5(cpuid, uint32_t, function, - uint32_t, kvm_register_read(vcpu, VCPU_REGS_RAX), - uint32_t, kvm_register_read(vcpu, VCPU_REGS_RBX), - uint32_t, kvm_register_read(vcpu, VCPU_REGS_RCX), - uint32_t, kvm_register_read(vcpu, VCPU_REGS_RDX)); -} - -static int -kvm_hv_hypercall_enabled(struct kvm *kvm) -{ - return (kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE); + kmem_free(bus, sizeof (struct kvm_io_bus)); + return (r); } int -kvm_hv_hypercall(struct kvm_vcpu *vcpu) +kvm_init(void *opaque) { - uint64_t param, ingpa, outgpa, ret; - uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0; - int fast, longmode; - int cs_db, cs_l; - - /* - * hypercall generates UD from non zero cpl and real mode - * per HYPER-V spec - */ - if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return (0); - } - - kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - longmode = is_long_mode(vcpu) && cs_l == 1; - - if (!longmode) { - param = ((uint64_t)kvm_register_read(vcpu, - VCPU_REGS_RDX) << 32) | (kvm_register_read(vcpu, - VCPU_REGS_RAX) & 0xffffffff); - - ingpa = ((uint64_t)kvm_register_read(vcpu, - VCPU_REGS_RBX) << 32) | (kvm_register_read(vcpu, - VCPU_REGS_RCX) & 0xffffffff); + int r; + int cpu; - outgpa = ((uint64_t)kvm_register_read(vcpu, - VCPU_REGS_RDI) << 32) | (kvm_register_read(vcpu, - VCPU_REGS_RSI) & 0xffffffff); - } -#ifdef CONFIG_X86_64 - else { - param 
= kvm_register_read(vcpu, VCPU_REGS_RCX); - ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX); - outgpa = kvm_register_read(vcpu, VCPU_REGS_R8); - } -#endif + r = kvm_arch_init(opaque); - code = param & 0xffff; - fast = (param >> 16) & 0x1; - rep_cnt = (param >> 32) & 0xfff; - rep_idx = (param >> 48) & 0xfff; + if (r != DDI_SUCCESS) + return (r); - KVM_TRACE6(hv__hypercall, uintptr_t, code, uintptr_t, fast, - uintptr_t, rep_cnt, uintptr_t, rep_idx, uintptr_t, ingpa, - uintptr_t, outgpa); + bad_page = alloc_page(PAGESIZE, KM_SLEEP); + bad_pfn = bad_page->p_pagenum; - switch (code) { - case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT: #ifdef XXX - kvm_vcpu_on_spin(vcpu); -#else - XXX_KVM_PROBE; -#endif - break; - default: - res = HV_STATUS_INVALID_HYPERCALL_CODE; - break; - } - - ret = res | (((uint64_t)rep_done & 0xfff) << 32); - - if (longmode) { - kvm_register_write(vcpu, VCPU_REGS_RAX, ret); - } else { - kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32); - kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff); + if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { + r = -ENOMEM; + goto out_free_0; } - - return (1); -} - -/* Return values for hypercalls */ -#define KVM_ENOSYS 1000 -#define KVM_EFAULT EFAULT -#define KVM_E2BIG E2BIG -#define KVM_EPERM EPERM - -#define KVM_HC_VAPIC_POLL_IRQ 1 -#define KVM_HC_MMU_OP 2 - -/* - * hypercalls use architecture specific - */ - -#ifdef _KERNEL -#ifdef CONFIG_KVM_GUEST -void __init kvm_guest_init(void); -#else -#define kvm_guest_init() do { } while (0) -#endif - -static unsigned int -kvm_arch_para_features(void) -{ -#ifdef XXX - return (cpuid_eax(KVM_CPUID_FEATURES)); #else XXX_KVM_PROBE; - return (0); #endif -} - -static inline int -kvm_para_has_feature(unsigned int feature) -{ - if (kvm_arch_para_features() & (1UL << feature)) - return (1); - return (0); -} -#endif /* _KERNEL */ - -int -kvm_emulate_hypercall(struct kvm_vcpu *vcpu) -{ - unsigned long nr, a0, a1, a2, a3, ret; - int r = 1; - - if (kvm_hv_hypercall_enabled(vcpu->kvm)) - return (kvm_hv_hypercall(vcpu)); - - nr = kvm_register_read(vcpu, VCPU_REGS_RAX); - a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); - a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); - a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); - a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); - - KVM_TRACE5(hypercall, uintptr_t, nr, uintptr_t, a0, uintptr_t, a1, - uintptr_t, a2, uintptr_t, a3); - - if (!is_long_mode(vcpu)) { - nr &= 0xFFFFFFFF; - a0 &= 0xFFFFFFFF; - a1 &= 0xFFFFFFFF; - a2 &= 0xFFFFFFFF; - a3 &= 0xFFFFFFFF; - } + r = kvm_arch_hardware_setup(); - if (kvm_x86_ops->get_cpl(vcpu) != 0) { - ret = -EPERM; - goto out; - } + if (r != DDI_SUCCESS) + goto out_free_0a; - switch (nr) { - case KVM_HC_VAPIC_POLL_IRQ: - ret = 0; - break; - case KVM_HC_MMU_OP: #ifdef XXX - r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret); + for_each_online_cpu(cpu) { + smp_call_function_single(cpu, + kvm_arch_check_processor_compat, + &r, 1); + if (r < 0) + goto out_free_1; + } #else - XXX_KVM_PROBE; - ret = -ENOSYS; + r = 0; + kvm_xcall(KVM_CPUALL, kvm_arch_check_processor_compat, &r); + if (r < 0) + goto out_free_1; + XXX_KVM_PROBE; #endif - break; - default: - ret = -ENOSYS; - break; - } -out: - kvm_register_write(vcpu, VCPU_REGS_RAX, ret); - - KVM_VCPU_KSTAT_INC(vcpu, kvmvs_hypercalls); - - return (r); -} - -static int -is_vm86_segment(struct kvm_vcpu *vcpu, int seg) -{ - return (seg != VCPU_SREG_LDTR) && (seg != VCPU_SREG_TR) && - (kvm_get_rflags(vcpu) & X86_EFLAGS_VM); -} -static void -seg_desct_to_kvm_desct(struct desc_struct *seg_desc, uint16_t 
selector,
-    struct kvm_segment *kvm_desct)
-{
-	kvm_desct->base = get_desc_base(seg_desc);
-	kvm_desct->limit = get_desc_limit(seg_desc);
-	if (seg_desc->c.b.g) {
-		kvm_desct->limit <<= 12;
-		kvm_desct->limit |= 0xfff;
-	}
-	kvm_desct->selector = selector;
-	kvm_desct->type = seg_desc->c.b.type;
-	kvm_desct->present = seg_desc->c.b.p;
-	kvm_desct->dpl = seg_desc->c.b.dpl;
-	kvm_desct->db = seg_desc->c.b.d;
-	kvm_desct->s = seg_desc->c.b.s;
-	kvm_desct->l = seg_desc->c.b.l;
-	kvm_desct->g = seg_desc->c.b.g;
-	kvm_desct->avl = seg_desc->c.b.avl;
-	if (!selector)
-		kvm_desct->unusable = 1;
-	else
-		kvm_desct->unusable = 0;
-	kvm_desct->padding = 0;
-}
-static int
-kvm_load_realmode_segment(struct kvm_vcpu *vcpu, uint16_t selector, int seg)
-{
-	struct kvm_segment segvar = {
-		.base = selector << 4,
-		.limit = 0xffff,
-		.selector = selector,
-		.type = 3,
-		.present = 1,
-		.dpl = 3,
-		.db = 0,
-		.s = 1,
-		.l = 0,
-		.g = 0,
-		.avl = 0,
-		.unusable = 0,
-	};
-	kvm_x86_ops->set_segment(vcpu, &segvar, seg);
-	return (0);
-}
-
-static void
-get_segment_descriptor_dtable(struct kvm_vcpu *vcpu, uint16_t selector,
-    struct descriptor_table *dtable)
-{
-	if (selector & 1 << 2) {
-		struct kvm_segment kvm_seg;
-
-		kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
-
-		if (kvm_seg.unusable)
-			dtable->limit = 0;
-		else
-			dtable->limit = kvm_seg.limit;
-		dtable->base = kvm_seg.base;
-	} else
-		kvm_x86_ops->get_gdt(vcpu, dtable);
-}
-
-/* allowed just for 8-byte segment descriptors */
-static int
-load_guest_segment_descriptor(struct kvm_vcpu *vcpu, uint16_t selector,
-    struct desc_struct *seg_desc)
-{
-	struct descriptor_table dtable;
-	uint16_t index = selector >> 3;
-	int ret;
-	uint32_t err;
-	gva_t addr;
-
-	get_segment_descriptor_dtable(vcpu, selector, &dtable);
-
-	if (dtable.limit < index * 8 + 7) {
-		kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
-		return (1);
-	}
-
-	addr = dtable.base + index * 8;
-	ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof (*seg_desc),
-	    vcpu, &err);
-
-	if (ret == 1)
-		kvm_inject_page_fault(vcpu, addr, err);
-
-	return (ret);
-}
-
-/* allowed just for 8-byte segment descriptors */
-static int
-save_guest_segment_descriptor(struct kvm_vcpu *vcpu, uint16_t selector,
-    struct desc_struct *seg_desc)
-{
-	struct descriptor_table dtable;
-	uint16_t index = selector >> 3;
-
-	get_segment_descriptor_dtable(vcpu, selector, &dtable);
-
-	if (dtable.limit < index * 8 + 7)
-		return (1);
-
-	return kvm_write_guest_virt(dtable.base + index * 8, seg_desc,
-	    sizeof (*seg_desc), vcpu, NULL);
-}
-
-int
-kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, uint16_t selector, int seg)
-{
-	struct kvm_segment kvm_seg;
-	struct desc_struct seg_desc;
-	uint8_t dpl, rpl, cpl;
-	unsigned err_vec = GP_VECTOR;
-	uint32_t err_code = 0;
-	int null_selector = !(selector & ~0x3); /* 0000-0003 are null */
-	int ret;
-
-	if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu))
-		return (kvm_load_realmode_segment(vcpu, selector, seg));
-
-	/* NULL selector is not valid for TR, CS and SS */
-	if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS ||
-	    seg == VCPU_SREG_TR) && null_selector)
-		goto exception;
-
-	/* TR should be in GDT only */
-	if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
-		goto exception;
-
-	ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc);
-
-	if (ret)
-		return (ret);
-
-	seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg);
-
-	if (null_selector) { /* for NULL selector skip all following checks */
-		kvm_seg.unusable = 1;
-		goto load;
-	}
-
-	err_code = selector & 0xfffc;
-	err_vec = 
GP_VECTOR;
-
-	/* can't load a system descriptor into a segment selector */
-	if (seg <= VCPU_SREG_GS && !kvm_seg.s)
-		goto exception;
-
-	if (!kvm_seg.present) {
-		err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
-		goto exception;
-	}
-
-	rpl = selector & 3;
-	dpl = kvm_seg.dpl;
-	cpl = kvm_x86_ops->get_cpl(vcpu);
-
-	switch (seg) {
-	case VCPU_SREG_SS:
-		/*
-		 * segment is not a writable data segment, or segment
-		 * selector's RPL != CPL, or segment's DPL != CPL
-		 */
-		if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl)
-			goto exception;
-		break;
-	case VCPU_SREG_CS:
-		if (!(kvm_seg.type & 8))
-			goto exception;
-
-		if (kvm_seg.type & 4) {
-			/* conforming */
-			if (dpl > cpl)
-				goto exception;
-		} else {
-			/* nonconforming */
-			if (rpl > cpl || dpl != cpl)
-				goto exception;
-		}
-		/* CS(RPL) <- CPL */
-		selector = (selector & 0xfffc) | cpl;
-		break;
-	case VCPU_SREG_TR:
-		if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9))
-			goto exception;
-		break;
-	case VCPU_SREG_LDTR:
-		if (kvm_seg.s || kvm_seg.type != 2)
-			goto exception;
-		break;
-	default: /* DS, ES, FS, or GS */
-		/*
-		 * segment is not a data or readable code segment or
-		 * ((segment is a data or nonconforming code segment)
-		 * and (both RPL and CPL > DPL))
-		 */
-		if ((kvm_seg.type & 0xa) == 0x8 ||
-		    (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl)))
-			goto exception;
-		break;
-	}
-
-	if (!kvm_seg.unusable && kvm_seg.s) {
-		/* mark segment as accessed */
-		kvm_seg.type |= 1;
-		seg_desc.c.b.type |= 1;
-		save_guest_segment_descriptor(vcpu, selector, &seg_desc);
-	}
-load:
-	kvm_set_segment(vcpu, &kvm_seg, seg);
-	return (0);
-exception:
-	kvm_queue_exception_e(vcpu, err_vec, err_code);
-	return (1);
-
-}
-
-static void
-save_state_to_tss32(struct kvm_vcpu *vcpu, struct tss_segment_32 *tss)
-{
-	tss->cr3 = vcpu->arch.cr3;
-	tss->eip = kvm_rip_read(vcpu);
-	tss->eflags = kvm_get_rflags(vcpu);
-	tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
-	tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
-	tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
-	tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
-	tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
-	tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
-	tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
-	tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
-	tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
-	tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
-	tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
-	tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
-	tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
-	tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
-	tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
-}
-
-static void
-kvm_load_segment_selector(struct kvm_vcpu *vcpu, uint16_t sel, int seg)
-{
-	struct kvm_segment kvm_seg;
-	kvm_get_segment(vcpu, &kvm_seg, seg);
-	kvm_seg.selector = sel;
-	kvm_set_segment(vcpu, &kvm_seg, seg);
-}
-
-static int
-load_state_from_tss32(struct kvm_vcpu *vcpu, struct tss_segment_32 *tss)
-{
-	kvm_set_cr3(vcpu, tss->cr3);
-
-	kvm_rip_write(vcpu, tss->eip);
-	kvm_set_rflags(vcpu, tss->eflags | 2);
-
-	kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
-	kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
-	kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
-	kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
-	kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
-	kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
-	kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
-	
kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi); - - /* - * SDM says that segment selectors are loaded before segment - * descriptors - */ - kvm_load_segment_selector(vcpu, tss->ldt_selector, VCPU_SREG_LDTR); - kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES); - kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS); - kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS); - kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS); - kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS); - kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS); - - /* - * Now load segment descriptors. If fault happenes at this stage - * it is handled in a context of new task - */ - if (kvm_load_segment_descriptor(vcpu, - tss->ldt_selector, VCPU_SREG_LDTR)) - return (1); - - if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES)) - return (1); - - if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS)) - return (1); - - if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS)) - return (1); - - if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS)) - return (1); - - if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS)) - return (1); - - if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS)) - return (1); - - return (0); -} - -static void -save_state_to_tss16(struct kvm_vcpu *vcpu, struct tss_segment_16 *tss) -{ - tss->ip = kvm_rip_read(vcpu); - tss->flag = kvm_get_rflags(vcpu); - tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX); - tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX); - tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX); - tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX); - tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP); - tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP); - tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI); - tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI); - - tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); - tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); - tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); - tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); - tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR); -} - -static int -load_state_from_tss16(struct kvm_vcpu *vcpu, struct tss_segment_16 *tss) -{ - kvm_rip_write(vcpu, tss->ip); - kvm_set_rflags(vcpu, tss->flag | 2); - kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax); - kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx); - kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx); - kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx); - kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp); - kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp); - kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si); - kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di); - - /* - * SDM says that segment selectors are loaded before segment - * descriptors - */ - kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR); - kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES); - kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS); - kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS); - kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS); - - /* - * Now load segment descriptors. 
If fault happenes at this stage - * it is handled in a context of new task - */ - if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR)) - return (1); - - if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES)) - return (1); - - if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS)) - return (1); - - if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS)) - return (1); - - if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS)) - return (1); - - return (0); -} - -int -kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) -{ - gfn_t gfn = gpa >> PAGESHIFT; - int seg; - int offset = offset_in_page(gpa); - int ret; - uintptr_t dp = (uintptr_t)data; - - while ((seg = next_segment(len, offset)) != 0) { - ret = kvm_read_guest_page(kvm, gfn, (void *)dp, offset, seg); - if (ret < 0) - return (ret); - offset = 0; - len -= seg; - dp += seg; - ++gfn; - } - return (0); -} - -static gpa_t -get_tss_base_addr_write(struct kvm_vcpu *vcpu, struct desc_struct *seg_desc) -{ - uint32_t base_addr = get_desc_base(seg_desc); - - return (kvm_mmu_gva_to_gpa_write(vcpu, base_addr, NULL)); -} - -static gpa_t -get_tss_base_addr_read(struct kvm_vcpu *vcpu, struct desc_struct *seg_desc) -{ - uint32_t base_addr = get_desc_base(seg_desc); - - return (kvm_mmu_gva_to_gpa_read(vcpu, base_addr, NULL)); -} - -static int -kvm_task_switch_16(struct kvm_vcpu *vcpu, uint16_t tss_selector, - uint16_t old_tss_sel, uint32_t old_tss_base, struct desc_struct *nseg_desc) -{ - struct tss_segment_16 tss_segment_16; - int ret = 0; - - if (kvm_read_guest(vcpu->kvm, old_tss_base, - &tss_segment_16, sizeof (tss_segment_16))) - goto out; - - save_state_to_tss16(vcpu, &tss_segment_16); - - if (kvm_write_guest(vcpu->kvm, old_tss_base, - &tss_segment_16, sizeof (tss_segment_16))) - goto out; - - if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc), - &tss_segment_16, sizeof (tss_segment_16))) - goto out; - - if (old_tss_sel != 0xffff) { - tss_segment_16.prev_task_link = old_tss_sel; - - if (kvm_write_guest(vcpu->kvm, get_tss_base_addr_write(vcpu, - nseg_desc), &tss_segment_16.prev_task_link, - sizeof (tss_segment_16.prev_task_link))) - goto out; - } - - if (load_state_from_tss16(vcpu, &tss_segment_16)) - goto out; - - ret = 1; -out: - return (ret); -} - -static int -kvm_task_switch_32(struct kvm_vcpu *vcpu, uint16_t tss_selector, - uint16_t old_tss_sel, uint32_t old_tss_base, struct desc_struct *nseg_desc) -{ - struct tss_segment_32 tss_segment_32; - int ret = 0; - - if (kvm_read_guest(vcpu->kvm, old_tss_base, - &tss_segment_32, sizeof (tss_segment_32))) - goto out; - - save_state_to_tss32(vcpu, &tss_segment_32); - - if (kvm_write_guest(vcpu->kvm, old_tss_base, - &tss_segment_32, sizeof (tss_segment_32))) - goto out; - - if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc), - &tss_segment_32, sizeof (tss_segment_32))) - goto out; - - if (old_tss_sel != 0xffff) { - tss_segment_32.prev_task_link = old_tss_sel; - - if (kvm_write_guest(vcpu->kvm, get_tss_base_addr_write(vcpu, - nseg_desc), &tss_segment_32.prev_task_link, - sizeof (tss_segment_32.prev_task_link))) - goto out; - } - - if (load_state_from_tss32(vcpu, &tss_segment_32)) - goto out; - - ret = 1; -out: - return (ret); -} - -int -kvm_task_switch(struct kvm_vcpu *vcpu, uint16_t tss_selector, int reason) -{ - struct kvm_segment tr_seg; - struct desc_struct cseg_desc; - struct desc_struct nseg_desc; - int ret = 0; - uint32_t old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR); - uint16_t old_tss_sel = 
get_segment_selector(vcpu, VCPU_SREG_TR); - uint32_t desc_limit; - - old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL); - - /* - * FIXME: Handle errors. Failure to read either TSS or their - * descriptors should generate a pagefault. - */ - if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc)) - goto out; - - if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc)) - goto out; - - if (reason != TASK_SWITCH_IRET) { - int cpl; - - cpl = kvm_x86_ops->get_cpl(vcpu); - if ((tss_selector & 3) > nseg_desc.c.b.dpl || - cpl > nseg_desc.c.b.dpl) { - kvm_queue_exception_e(vcpu, GP_VECTOR, 0); - return (1); - } - } - - desc_limit = get_desc_limit(&nseg_desc); - - if (!nseg_desc.c.b.p || ((desc_limit < 0x67 && - (nseg_desc.c.b.type & 8)) || desc_limit < 0x2b)) { - kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc); - return (1); - } - - if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { - cseg_desc.c.b.type &= ~(1 << 1); // clear the B flag - save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc); - } - - if (reason == TASK_SWITCH_IRET) { - uint32_t eflags = kvm_get_rflags(vcpu); - kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT); - } - - /* - * set back link to prev task only if NT bit is set in eflags - * note that old_tss_sel is not used afetr this point - */ - if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) - old_tss_sel = 0xffff; - - if (nseg_desc.c.b.type & 8) { - ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel, - old_tss_base, &nseg_desc); - } else { - ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel, - old_tss_base, &nseg_desc); - } - - if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { - uint32_t eflags = kvm_get_rflags(vcpu); - kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT); - } - - if (reason != TASK_SWITCH_IRET) { - nseg_desc.c.b.type |= (1 << 1); - save_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc); - } +#ifdef XXX + r = register_cpu_notifier(&kvm_cpu_notifier); + if (r) + goto out_free_2; + register_reboot_notifier(&kvm_reboot_notifier); - kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0(vcpu) | X86_CR0_TS); - seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg); - tr_seg.type = 11; - kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); -out: - return (ret); -} + r = sysdev_class_register(&kvm_sysdev_class); + if (r) + goto out_free_3; -void -kvm_guest_exit(void) -{ -#ifdef XXX - account_system_vtime(current); - current->flags &= ~PF_VCPU; + r = sysdev_register(&kvm_sysdev); + if (r) + goto out_free_4; #else XXX_KVM_PROBE; #endif -} -void -kvm_guest_enter(void) -{ #ifdef XXX - account_system_vtime(current); - current->flags |= PF_VCPU; -#else - XXX_KVM_PROBE; -#endif -} - -/* - * Often times we have pages that correspond to addresses that are in a users - * virtual address space. Rather than trying to constantly map them in and out - * of our address space we instead go through and use the kpm segment to - * facilitate this for us. This always returns an address that is always in the - * kernel's virtual address space. 
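A minimal sketch of how the kpm-backed page_address() is typically consumed, assuming a page_t is already in hand; copy_into_page is a hypothetical helper, not a driver function:

static void
copy_into_page(page_t *pp, const void *src, size_t off, size_t len)
{
	caddr_t va = page_address(pp);	/* kpm mapping, no unmap needed */

	ASSERT(off + len <= PAGESIZE);
	bcopy(src, va + off, len);
}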
- */ -caddr_t -page_address(page_t *page) -{ - return (hat_kpm_mapin_pfn(page->p_pagenum)); -} - -static void -inject_pending_event(struct kvm_vcpu *vcpu) -{ - /* try to reinject previous events if any */ - if (vcpu->arch.exception.pending) { - kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, - vcpu->arch.exception.has_error_code, - vcpu->arch.exception.error_code); - return; - } - - if (vcpu->arch.nmi_injected) { - kvm_x86_ops->set_nmi(vcpu); - return; - } - - if (vcpu->arch.interrupt.pending) { - kvm_x86_ops->set_irq(vcpu); - return; - } - - /* try to inject new event if pending */ - if (vcpu->arch.nmi_pending) { - if (kvm_x86_ops->nmi_allowed(vcpu)) { - vcpu->arch.nmi_pending = 0; - vcpu->arch.nmi_injected = 1; - kvm_x86_ops->set_nmi(vcpu); - } - } else if (kvm_cpu_has_interrupt(vcpu)) { - if (kvm_x86_ops->interrupt_allowed(vcpu)) { - kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), - 0); - kvm_x86_ops->set_irq(vcpu); - } - } -} - -void -kvm_load_guest_fpu(struct kvm_vcpu *vcpu) -{ - if (vcpu->guest_fpu_loaded) - return; - - vcpu->guest_fpu_loaded = 1; - kvm_fx_save(&vcpu->arch.host_fx_image); - kvm_fx_restore(&vcpu->arch.guest_fx_image); - KVM_TRACE1(fpu, int, 1); -} - -static inline unsigned long -native_get_debugreg(int regno) -{ - unsigned long val = 0; /* Damn you, gcc! */ - - switch (regno) { - case 0: - __asm__("mov %%db0, %0" :"=r" (val)); - break; - case 1: - __asm__("mov %%db1, %0" :"=r" (val)); - break; - case 2: - __asm__("mov %%db2, %0" :"=r" (val)); - break; - case 3: - __asm__("mov %%db3, %0" :"=r" (val)); - break; - case 6: - __asm__("mov %%db6, %0" :"=r" (val)); - break; - case 7: - __asm__("mov %%db7, %0" :"=r" (val)); - break; - default: - cmn_err(CE_WARN, "kvm: invalid debug register retrieval, " - "regno = %d\n", regno); - } - - return (val); -} + kvm_chardev_ops.owner = module; + kvm_vm_fops.owner = module; + kvm_vcpu_fops.owner = module; -static inline void -native_set_debugreg(int regno, unsigned long value) -{ - switch (regno) { - case 0: - __asm__("mov %0, %%db0" ::"r" (value)); - break; - case 1: - __asm__("mov %0, %%db1" ::"r" (value)); - break; - case 2: - __asm__("mov %0, %%db2" ::"r" (value)); - break; - case 3: - __asm__("mov %0, %%db3" ::"r" (value)); - break; - case 6: - __asm__("mov %0, %%db6" ::"r" (value)); - break; - case 7: - __asm__("mov %0, %%db7" ::"r" (value)); - break; - default: - cmn_err(CE_WARN, "kvm: invalid debug register set, " - "regno = %d\n", regno); + r = misc_register(&kvm_dev); + if (r) { + cmn_err(CE_WARN, "kvm: misc device register failed\n"); + goto out_free; } -} - -static uint32_t -div_frac(uint32_t dividend, uint32_t divisor) -{ - uint32_t quotient, remainder; /* - * Don't try to replace with do_div(), this one calculates - * "(dividend << 32) / divisor" + * XXX - if kernel preemption occurs, we probably need + * to implement these, and add hooks to the preemption code. 
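If those hooks were implemented, sched_in/sched_out would plausibly bracket a thread's time on CPU with the existing arch load/put entry points. A sketch under that assumption (the bodies here are guesses; only the kvm_preempt_ops assignments below are real):

static void
kvm_sched_in_sketch(struct kvm_vcpu *vcpu, int cpu)
{
	kvm_arch_vcpu_load(vcpu, cpu);	/* rebind per-CPU VMCS/MSR state */
}

static void
kvm_sched_out_sketch(struct kvm_vcpu *vcpu)
{
	kvm_arch_vcpu_put(vcpu);	/* flush state before losing the CPU */
}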
+ * For right now, we'll make the totally unreasonable + * assumption that we won't be preempted while in the + * kernel, i.e., no realtime threads are running */ - __asm__("divl %4" - : "=a" (quotient), "=d" (remainder) - : "0" (0), "1" (dividend), "r" (divisor)); - - return (quotient); -} - -static void -kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock) -{ - uint64_t nsecs = 1000000000LL; - int32_t shift = 0; - uint64_t tps64; - uint32_t tps32; - - tps64 = tsc_khz * 1000LL; - while (tps64 > nsecs*2) { - tps64 >>= 1; - shift--; - } - - tps32 = (uint32_t)tps64; - while (tps32 <= (uint32_t)nsecs) { - tps32 <<= 1; - shift++; - } - - hv_clock->tsc_shift = shift; - hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); -} - -static void -kvm_write_guest_time(struct kvm_vcpu *v) -{ - struct timespec ts; - unsigned long flags; - struct kvm_vcpu_arch *vcpu = &v->arch; - void *shared_kaddr; - unsigned long this_tsc_khz; - - if ((!vcpu->time_page)) - return; - - this_tsc_khz = cpu_tsc_khz; - if (vcpu->hv_clock_tsc_khz != this_tsc_khz) { - kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock); - vcpu->hv_clock_tsc_khz = this_tsc_khz; - } - -#ifdef XXX - put_cpu_var(cpu_tsc_khz); -#else - XXX_KVM_PROBE; -#endif + kvm_preempt_ops.sched_in = kvm_sched_in; + kvm_preempt_ops.sched_out = kvm_sched_out; -#ifdef XXX - /* Keep irq disabled to prevent changes to the clock */ - local_irq_save(flags); -#else - /* - * may need to mask interrupts for local_irq_save, and unmask - * for local_irq_restore. cli()/sti() might be done... - */ - XXX_KVM_PROBE; -#endif - kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp); - gethrestime(&ts); -#ifdef XXX - monotonic_to_bootbased(&ts); - local_irq_restore(flags); + kvm_init_debug(); #else XXX_KVM_PROBE; #endif - /* With all the info we got, fill in the values */ - - vcpu->hv_clock.system_time = ts.tv_nsec + (NSEC_PER_SEC * - (uint64_t)ts.tv_sec) + v->kvm->arch.kvmclock_offset; - - /* - * The interface expects us to write an even number signaling that the - * update is finished. Since the guest won't see the intermediate - * state, we just increase by 2 at the end. 
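The guest side of that even/odd version handshake looks roughly like the loop below: retry while an update is in flight (odd version) or while the version changed across the read. A userland-style sketch; the struct is abbreviated from pvclock_vcpu_time_info:

#include <stdint.h>

typedef struct pvclock_time {
	volatile uint32_t version;
	uint64_t tsc_timestamp;
	uint64_t system_time;
	uint32_t tsc_to_system_mul;
	int8_t tsc_shift;
} pvclock_time_t;

static uint64_t
pvclock_read_system_time(const pvclock_time_t *t)
{
	uint32_t v;
	uint64_t st;

	do {
		v = t->version;
		__asm__ volatile("" ::: "memory");	/* compiler barrier */
		st = t->system_time;
		__asm__ volatile("" ::: "memory");
	} while ((v & 1) != 0 || v != t->version);

	return (st);
}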
- */ - vcpu->hv_clock.version += 2; - - shared_kaddr = page_address(vcpu->time_page); - - memcpy((void *)((uintptr_t)shared_kaddr + vcpu->time_offset), - &vcpu->hv_clock, sizeof (vcpu->hv_clock)); - - mark_page_dirty(v->kvm, vcpu->time >> PAGESHIFT); -} - -/* - * These special macros can be used to get or set a debugging register - */ -#define get_debugreg(var, register) \ - (var) = native_get_debugreg(register) -#define set_debugreg(value, register) \ - native_set_debugreg(register, value) - -static int -vcpu_enter_guest(struct kvm_vcpu *vcpu) -{ - int r; - - int req_int_win = !irqchip_in_kernel(vcpu->kvm) && - vcpu->run->request_interrupt_window; - - if (vcpu->requests) { - if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) - kvm_mmu_unload(vcpu); - } - - r = kvm_mmu_reload(vcpu); - - if (r) - goto out; - - if (vcpu->requests) { - if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, - &vcpu->requests)) { - __kvm_migrate_timers(vcpu); - } - if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, - &vcpu->requests)) { - kvm_write_guest_time(vcpu); - } - - if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) - kvm_mmu_sync_roots(vcpu); - - if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) - kvm_x86_ops->tlb_flush(vcpu); - - if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, - &vcpu->requests)) { - vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; - r = 0; - goto out; - } - - if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { - vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; - r = 0; - goto out; - } - - if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, - &vcpu->requests)) { - vcpu->fpu_active = 0; - kvm_x86_ops->fpu_deactivate(vcpu); - } - } - - kpreempt_disable(); - - kvm_x86_ops->prepare_guest_switch(vcpu); - if (vcpu->fpu_active) - kvm_load_guest_fpu(vcpu); - - cli(); + return (0); - clear_bit(KVM_REQ_KICK, &vcpu->requests); +out_free: +out_free_5: #ifdef XXX - smp_mb__after_clear_bit(); + sysdev_unregister(&kvm_sysdev); +out_free_4: + sysdev_class_unregister(&kvm_sysdev_class); +out_free_3: + unregister_reboot_notifier(&kvm_reboot_notifier); + unregister_cpu_notifier(&kvm_cpu_notifier); #else XXX_KVM_PROBE; #endif - - if (vcpu->requests || issig(JUSTLOOKING)) { - set_bit(KVM_REQ_KICK, &vcpu->requests); - sti(); - kpreempt_enable(); - r = 1; - goto out; - } - - inject_pending_event(vcpu); - - /* enable NMI/IRQ window open exits if needed */ - if (vcpu->arch.nmi_pending) - kvm_x86_ops->enable_nmi_window(vcpu); - else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) - kvm_x86_ops->enable_irq_window(vcpu); - - if (kvm_lapic_enabled(vcpu)) { - update_cr8_intercept(vcpu); - kvm_lapic_sync_to_vapic(vcpu); - } +out_free_2: +out_free_1: #ifdef XXX - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + kvm_arch_hardware_unsetup(); #else XXX_KVM_PROBE; #endif - kvm_guest_enter(); - - if (vcpu->arch.switch_db_regs) { - set_debugreg(0, 7); - set_debugreg(vcpu->arch.eff_db[0], 0); - set_debugreg(vcpu->arch.eff_db[1], 1); - set_debugreg(vcpu->arch.eff_db[2], 2); - set_debugreg(vcpu->arch.eff_db[3], 3); - } - - KVM_TRACE1(vm__entry, int, vcpu->vcpu_id); - - kvm_x86_ops->run(vcpu); +out_free_0a: #ifdef XXX - /* - * If the guest has used debug registers, at least dr7 - * will be disabled while returning to the host. - * If we don't have active breakpoints in the host, we don't - * care about the messed up debug address registers. But if - * we have some of them active, restore the old state. 
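Using the get_debugreg()/set_debugreg() macros defined above, such a save/restore pair might look like the following; host_db is a hypothetical holding area, and the real host-side path remains behind the XXX:

static unsigned long host_db[4];

static void
host_dbregs_save(void)
{
	get_debugreg(host_db[0], 0);
	get_debugreg(host_db[1], 1);
	get_debugreg(host_db[2], 2);
	get_debugreg(host_db[3], 3);
}

static void
host_dbregs_restore(void)
{
	set_debugreg(host_db[0], 0);
	set_debugreg(host_db[1], 1);
	set_debugreg(host_db[2], 2);
	set_debugreg(host_db[3], 3);
}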
- */ - if (hw_breakpoint_active()) - hw_breakpoint_restore(); + free_cpumask_var(cpus_hardware_enabled); #else XXX_KVM_PROBE; #endif - set_bit(KVM_REQ_KICK, &vcpu->requests); - - sti(); - +out_free_0: #ifdef XXX - local_irq_enable(); /* XXX - should be ok with kpreempt_enable below */ - - barrier(); + free_page(bad_page, PAGESIZE); #else XXX_KVM_PROBE; #endif - KVM_VCPU_KSTAT_INC(vcpu, kvmvs_exits); - kvm_guest_exit(); - - kpreempt_enable(); +out: #ifdef XXX - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - - /* - * Profile KVM exit RIPs: - */ - if (unlikely(prof_on == KVM_PROFILING)) { - unsigned long rip = kvm_rip_read(vcpu); - profile_hit(KVM_PROFILING, (void *)rip); - } + kvm_arch_exit(); #else XXX_KVM_PROBE; #endif - kvm_lapic_sync_from_vapic(vcpu); - r = kvm_x86_ops->handle_exit(vcpu); - -out: +out_fail: return (r); } -static void -post_kvm_run_save(struct kvm_vcpu *vcpu) -{ - struct kvm_run *kvm_run = vcpu->run; - - kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0; - kvm_run->cr8 = kvm_get_cr8(vcpu); - kvm_run->apic_base = kvm_get_apic_base(vcpu); - if (irqchip_in_kernel(vcpu->kvm)) - kvm_run->ready_for_interrupt_injection = 1; - else - kvm_run->ready_for_interrupt_injection = - kvm_arch_interrupt_allowed(vcpu) && - !kvm_cpu_has_interrupt(vcpu) && - !kvm_event_needs_reinjection(vcpu); -} -/* - * The vCPU has executed a HLT instruction with in-kernel mode enabled. - */ void -kvm_vcpu_block(struct kvm_vcpu *vcpu) +kvm_guest_exit(void) { - for (;;) { - if (kvm_arch_vcpu_runnable(vcpu)) { - set_bit(KVM_REQ_UNHALT, &vcpu->requests); - break; - } - - if (issig(JUSTLOOKING)) - break; - - mutex_enter(&vcpu->kvcpu_kick_lock); - - if (kvm_cpu_has_pending_timer(vcpu)) { - mutex_exit(&vcpu->kvcpu_kick_lock); - break; - } - - (void) cv_wait_sig_swap(&vcpu->kvcpu_kick_cv, - &vcpu->kvcpu_kick_lock); - - mutex_exit(&vcpu->kvcpu_kick_lock); - } +#ifdef XXX + account_system_vtime(current); + current->flags &= ~PF_VCPU; +#else + XXX_KVM_PROBE; +#endif } void -kvm_vcpu_kick(struct kvm_vcpu *vcpu) -{ - processorid_t cpu = vcpu->cpu; - - mutex_enter(&vcpu->kvcpu_kick_lock); - - if (CV_HAS_WAITERS(&vcpu->kvcpu_kick_cv)) - KVM_VCPU_KSTAT_INC(vcpu, kvmvs_halt_wakeup); - - cv_broadcast(&vcpu->kvcpu_kick_cv); - mutex_exit(&vcpu->kvcpu_kick_lock); - - if (cpu != CPU->cpu_id && cpu != -1) { - if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests)) { - /* - * If we haven't already kicked this VCPU, we'll poke - * the the CPU on which it's running. (This will serve - * to induce a VM exit.) 
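The KVM_REQ_KICK bit acts as an "already poked" latch: vcpu_enter_guest() clears it just before VM entry, and a kick only sends a cross-CPU poke if the bit was still clear, so at most one interrupt is sent per guest entry. The same protocol in miniature, with a C11 flag standing in for the request bitmap:

#include <stdatomic.h>

typedef struct vcpu_sketch {
	atomic_flag kicked;	/* set = poke already sent or not needed */
} vcpu_sketch_t;

static void
kick(vcpu_sketch_t *v, void (*poke)(void))
{
	if (!atomic_flag_test_and_set(&v->kicked))
		poke();		/* induce a VM exit on the remote CPU */
}

static void
guest_entry(vcpu_sketch_t *v)
{
	atomic_flag_clear(&v->kicked);	/* re-arm for the next kick */
}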
- */ - poke_cpu(cpu); - } - } -} - -static void -vapic_enter(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - page_t *page; - - if (!apic || !apic->vapic_addr) - return; - - page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGESHIFT); - - vcpu->arch.apic->vapic_page = page; -} - -static void -vapic_exit(struct kvm_vcpu *vcpu) +kvm_guest_enter(void) { - struct kvm_lapic *apic = vcpu->arch.apic; - int idx; - - if (!apic || !apic->vapic_addr) - return; -#ifdef XXX - idx = srcu_read_lock(&vcpu->kvm->srcu); -#else - XXX_KVM_SYNC_PROBE; -#endif - kvm_release_page_dirty(apic->vapic_page); - mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGESHIFT); #ifdef XXX - srcu_read_unlock(&vcpu->kvm->srcu, idx); + account_system_vtime(current); + current->flags |= PF_VCPU; #else - XXX_KVM_SYNC_PROBE; + XXX_KVM_PROBE; #endif } static int -dm_request_for_irq_injection(struct kvm_vcpu *vcpu) -{ - return (!irqchip_in_kernel(vcpu->kvm) && - !kvm_cpu_has_interrupt(vcpu) && - vcpu->run->request_interrupt_window && - kvm_arch_interrupt_allowed(vcpu)); -} - -static int -__vcpu_run(struct kvm_vcpu *vcpu) +kvm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) { - int r; - struct kvm *kvm = vcpu->kvm; + minor_t instance; - if (vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED) { - cmn_err(CE_NOTE, "vcpu %d received sipi with vector # %x\n", - vcpu->vcpu_id, vcpu->arch.sipi_vector); - kvm_lapic_reset(vcpu); - r = kvm_arch_vcpu_reset(vcpu); - if (r) - return (r); - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + if (kpm_enable == 0) { + cmn_err(CE_WARN, "kvm: kpm_enable must be true\n"); + return (DDI_FAILURE); } -#ifdef XXX - vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); -#else - XXX_KVM_SYNC_PROBE; -#endif - vapic_enter(vcpu); - - r = 1; - while (r > 0) { - if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) - r = vcpu_enter_guest(vcpu); - else { -#ifdef XXX - srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); -#else - XXX_KVM_SYNC_PROBE; -#endif - kvm_vcpu_block(vcpu); -#ifdef XXX - vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); -#else - XXX_KVM_SYNC_PROBE; -#endif - if (test_and_clear_bit(KVM_REQ_UNHALT, - &vcpu->requests)) { - switch (vcpu->arch.mp_state) { - case KVM_MP_STATE_HALTED: - vcpu->arch.mp_state = - KVM_MP_STATE_RUNNABLE; - case KVM_MP_STATE_RUNNABLE: - break; - case KVM_MP_STATE_SIPI_RECEIVED: - default: - r = -EINTR; - break; - } - } - } - if (r <= 0) - break; + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); - clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); - if (kvm_cpu_has_pending_timer(vcpu)) - kvm_inject_pending_timer_irqs(vcpu); + if (kvm_dip != NULL) + return (DDI_FAILURE); - if (dm_request_for_irq_injection(vcpu)) { - r = -EINTR; - vcpu->run->exit_reason = KVM_EXIT_INTR; - KVM_VCPU_KSTAT_INC(vcpu, kvmvs_irq_exits); - } + if (ddi_soft_state_init(&kvm_state, sizeof (kvm_devstate_t), 1) != 0) + return (DDI_FAILURE); - if (issig(JUSTLOOKING)) { - r = -EINTR; - vcpu->run->exit_reason = KVM_EXIT_INTR; - KVM_VCPU_KSTAT_INC(vcpu, kvmvs_signal_exits); - } + instance = ddi_get_instance(dip); + if (ddi_create_minor_node(dip, "kvm", + S_IFCHR, instance, DDI_PSEUDO, 0) == DDI_FAILURE) { + ddi_soft_state_fini(&kvm_state); + return (DDI_FAILURE); } -#ifdef XXX - srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); -#else - XXX_KVM_SYNC_PROBE; -#endif - post_kvm_run_save(vcpu); - vapic_exit(vcpu); - - return (r); -} - -int -kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) -{ - int r; - sigset_t sigsaved; - struct kvm_run *kvm_run = vcpu->run; - vcpu_load(vcpu); - - if (vcpu->sigset_active) - 
kvm_sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); - - if (vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED) { - kvm_vcpu_block(vcpu); - clear_bit(KVM_REQ_UNHALT, &vcpu->requests); - r = -EAGAIN; - goto out; + mutex_init(&kvm_lock, NULL, MUTEX_DRIVER, 0); + if (vmx_init() != DDI_SUCCESS) { + ddi_soft_state_fini(&kvm_state); + ddi_remove_minor_node(dip, NULL); + mutex_destroy(&kvm_lock); + return (DDI_FAILURE); } - /* re-sync apic's tpr */ - if (!irqchip_in_kernel(vcpu->kvm)) - kvm_set_cr8(vcpu, kvm_run->cr8); - - if (vcpu->arch.pio.cur_count) { -#ifdef XXX - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); -#else - XXX_KVM_SYNC_PROBE; -#endif - r = complete_pio(vcpu); -#ifdef XXX - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); -#else - XXX_KVM_SYNC_PROBE; -#endif - if (r) - goto out; - } - if (vcpu->mmio_needed) { - memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); - vcpu->mmio_read_completed = 1; - vcpu->mmio_needed = 0; -#ifdef XXX - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); -#else - XXX_KVM_SYNC_PROBE; -#endif - r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0, - EMULTYPE_NO_DECODE); -#ifdef XXX - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); -#else - XXX_KVM_SYNC_PROBE; -#endif - if (r == EMULATE_DO_MMIO) { - /* - * Read-modify-write. Back to userspace. - */ - r = 0; - goto out; - } + if (hardware_enable_all() != 0) { + ddi_soft_state_fini(&kvm_state); + ddi_remove_minor_node(dip, NULL); + mutex_destroy(&kvm_lock); + vmx_fini(); + return (DDI_FAILURE); } - if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) - kvm_register_write(vcpu, VCPU_REGS_RAX, kvm_run->hypercall.ret); - - r = __vcpu_run(vcpu); - -out: - if (vcpu->sigset_active) - kvm_sigprocmask(SIG_SETMASK, &sigsaved, NULL); - - vcpu_put(vcpu); - return (r); -} - -int -kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, - struct kvm_mp_state *mp_state) -{ - vcpu_load(vcpu); - mp_state->mp_state = vcpu->arch.mp_state; - vcpu_put(vcpu); - return (0); -} - -int -kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, - struct kvm_mp_state *mp_state) -{ - vcpu_load(vcpu); - vcpu->arch.mp_state = mp_state->mp_state; - vcpu_put(vcpu); - return (0); -} - -static int -kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, - struct kvm_vcpu_events *events) -{ - vcpu_load(vcpu); - - events->exception.injected = vcpu->arch.exception.pending; - events->exception.nr = vcpu->arch.exception.nr; - events->exception.has_error_code = vcpu->arch.exception.has_error_code; - events->exception.error_code = vcpu->arch.exception.error_code; - - events->interrupt.injected = vcpu->arch.interrupt.pending; - events->interrupt.nr = vcpu->arch.interrupt.nr; - events->interrupt.soft = vcpu->arch.interrupt.soft; - - events->nmi.injected = vcpu->arch.nmi_injected; - events->nmi.pending = vcpu->arch.nmi_pending; - events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu); - - events->sipi_vector = vcpu->arch.sipi_vector; - - events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING | - KVM_VCPUEVENT_VALID_SIPI_VECTOR); - - vcpu_put(vcpu); - - return (0); -} - -static int -kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, - struct kvm_vcpu_events *events) -{ - if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING | - KVM_VCPUEVENT_VALID_SIPI_VECTOR)) - return (-EINVAL); - - vcpu_load(vcpu); - - vcpu->arch.exception.pending = events->exception.injected; - vcpu->arch.exception.nr = events->exception.nr; - vcpu->arch.exception.has_error_code = events->exception.has_error_code; - vcpu->arch.exception.error_code = 
events->exception.error_code; - - vcpu->arch.interrupt.pending = events->interrupt.injected; - vcpu->arch.interrupt.nr = events->interrupt.nr; - vcpu->arch.interrupt.soft = events->interrupt.soft; - - if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm)) - kvm_pic_clear_isr_ack(vcpu->kvm); - - vcpu->arch.nmi_injected = events->nmi.injected; - - if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) - vcpu->arch.nmi_pending = events->nmi.pending; - - kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked); + kvm_dip = dip; + kvm_base_minor = instance; - if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) - vcpu->arch.sipi_vector = events->sipi_vector; + list_create(&vm_list, sizeof (struct kvm), + offsetof(struct kvm, vm_list)); + kvm_minor = vmem_create("kvm_minor", (void *)1, UINT32_MAX - 1, 1, + NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER); - vcpu_put(vcpu); + ddi_report_dev(dip); - return (0); + return (DDI_SUCCESS); } static int -kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, uint64_t ident_addr) -{ - kvm->arch.ept_identity_map_addr = ident_addr; - return (0); -} - -void -kvm_timer_fire(void *arg) +kvm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) { - struct kvm_timer *timer = (struct kvm_timer *)arg; - struct kvm_vcpu *vcpu = timer->vcpu; - - if (vcpu == NULL) - return; - - mutex_enter(&vcpu->kvcpu_kick_lock); - - if (timer->reinject || !timer->pending) { - atomic_add_32(&timer->pending, 1); - set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); - } - - timer->intervals++; - - cv_broadcast(&vcpu->kvcpu_kick_cv); - mutex_exit(&vcpu->kvcpu_kick_lock); -} + int instance; -static int -kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) -{ - vcpu_load(vcpu); - bcopy(vcpu->arch.apic->regs, s->regs, sizeof (*s)); - vcpu_put(vcpu); + if (cmd != DDI_DETACH) + return (DDI_FAILURE); - return (0); -} + VERIFY(kvm_dip != NULL && kvm_dip == dip); + instance = ddi_get_instance(dip); + VERIFY(instance == kvm_base_minor); + ddi_prop_remove_all(dip); + ddi_remove_minor_node(dip, NULL); + list_destroy(&vm_list); + vmem_destroy(kvm_minor); + kvm_dip = NULL; -static int -kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) -{ - vcpu_load(vcpu); - bcopy(s->regs, vcpu->arch.apic->regs, sizeof (*s)); - kvm_apic_post_state_restore(vcpu); - update_cr8_intercept(vcpu); - vcpu_put(vcpu); + hardware_disable_all(); + mutex_destroy(&kvm_lock); + ddi_soft_state_fini(&kvm_state); + vmx_fini(); - return (0); + return (DDI_SUCCESS); } +/*ARGSUSED*/ static int -kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) +kvm_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) { - int r; + kvm_devstate_t *rsp; + int error = DDI_FAILURE; - r = 0; - switch (chip->chip_id) { - case KVM_IRQCHIP_PIC_MASTER: - memcpy(&chip->chip.pic, &pic_irqchip(kvm)->pics[0], - sizeof (struct kvm_pic_state)); - break; - case KVM_IRQCHIP_PIC_SLAVE: - memcpy(&chip->chip.pic, &pic_irqchip(kvm)->pics[1], - sizeof (struct kvm_pic_state)); - break; - case KVM_IRQCHIP_IOAPIC: - r = kvm_get_ioapic(kvm, &chip->chip.ioapic); - break; - default: - r = EINVAL; + switch (infocmd) { + case DDI_INFO_DEVT2DEVINFO: + *result = kvm_dip; break; - } - - return (r); -} - -static int -kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) -{ - int r; - - r = 0; - switch (chip->chip_id) { - case KVM_IRQCHIP_PIC_MASTER: - mutex_enter(&pic_irqchip(kvm)->lock); - memcpy(&pic_irqchip(kvm)->pics[0], &chip->chip.pic, - sizeof (struct kvm_pic_state)); - 
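Each PIC branch here has the same lock/copy/unlock shape; as a hypothetical helper (the driver deliberately keeps these inline per chip_id, and the parameter type below is assumed):

static void
pic_state_copyin(struct kvm_pic *pic, int which,
    const struct kvm_pic_state *src)
{
	mutex_enter(&pic->lock);
	memcpy(&pic->pics[which], src, sizeof (struct kvm_pic_state));
	mutex_exit(&pic->lock);
}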
mutex_exit(&pic_irqchip(kvm)->lock); - break; - case KVM_IRQCHIP_PIC_SLAVE: - mutex_enter(&pic_irqchip(kvm)->lock); - memcpy(&pic_irqchip(kvm)->pics[1], &chip->chip.pic, - sizeof (struct kvm_pic_state)); - mutex_exit(&pic_irqchip(kvm)->lock); - break; - case KVM_IRQCHIP_IOAPIC: - r = kvm_set_ioapic(kvm, &chip->chip.ioapic); + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)((uint64_t)getminor((dev_t)arg)); + error = DDI_SUCCESS; break; + default: - r = EINVAL; break; } - kvm_pic_update_irq(pic_irqchip(kvm)); - - return (r); -} - -static int -kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) -{ - if (irq->irq < 0 || irq->irq >= 256) - return (-EINVAL); - - if (irqchip_in_kernel(vcpu->kvm)) - return (-ENXIO); - - vcpu_load(vcpu); - - kvm_queue_interrupt(vcpu, irq->irq, 0); - - vcpu_put(vcpu); - - return (0); -} - -static int -kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, uint64_t *mcg_capp) -{ - int rval; - uint64_t mcg_cap = *mcg_capp; - unsigned bank_num = mcg_cap & 0xff, bank; - - rval = -EINVAL; - if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) - goto out; - if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000)) - goto out; - rval = 0; - vcpu->arch.mcg_cap = mcg_cap; - /* Init IA32_MCG_CTL to all 1s */ - if (mcg_cap & MCG_CTL_P) - vcpu->arch.mcg_ctl = ~(uint64_t)0; - /* Init IA32_MCi_CTL to all 1s */ - for (bank = 0; bank < bank_num; bank++) - vcpu->arch.mce_banks[bank*4] = ~(uint64_t)0; -out: - return (rval); -} - -static int -kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) -{ - if (sigset) { - vcpu->sigset_active = 1; - vcpu->sigset = *sigset; - } else - vcpu->sigset_active = 0; - - return (0); -} - -static int -kvm_vcpu_ioctl_get_msrs(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, int *rv) -{ - int r; - - if (msrs->nmsrs >= MAX_IO_MSRS) - return (-E2BIG); - - if ((r = __msr_io(vcpu, msrs, msrs->entries, kvm_get_msr)) < 0) - return (r); - - *rv = r; - - return (0); + return (error); } +/*ARGSUSED*/ static int -kvm_vcpu_ioctl_set_msrs(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, int *rv) -{ - int r; - - if (msrs->nmsrs >= MAX_IO_MSRS) - return (-E2BIG); - - if ((r = __msr_io(vcpu, msrs, msrs->entries, do_set_msr)) < 0) - return (-EINVAL); - - *rv = r; - - return (0); -} - -/* - * Get (and clear) the dirty memory log for a memory slot. - */ -int -kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) +kvm_open(dev_t *devp, int flag, int otype, cred_t *credp) { - int r, i; - struct kvm_memory_slot *memslot; - unsigned long n; - unsigned long is_dirty = 0; - unsigned long *dirty_bitmap = NULL; - - mutex_enter(&kvm->slots_lock); - - r = EINVAL; - if (log->slot >= KVM_MEMORY_SLOTS) - goto out; - - memslot = &kvm->memslots->memslots[log->slot]; - r = ENOENT; - if (!memslot->dirty_bitmap) - goto out; - - n = kvm_dirty_bitmap_bytes(memslot); + minor_t minor; + kvm_devstate_t *ksp; - dirty_bitmap = kmem_alloc(n, KM_SLEEP); - memset(dirty_bitmap, 0, n); + if (flag & FEXCL || flag & FNDELAY) + return (EINVAL); - for (i = 0; !is_dirty && i < n / sizeof (long); i++) - is_dirty = memslot->dirty_bitmap[i]; + if (otype != OTYP_CHR) + return (EINVAL); - /* If nothing is dirty, don't bother messing with page tables. 
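The is_dirty scan reduces to a word-wise any-bit test that short-circuits on the first dirty word; in standalone form, with nwords computed from kvm_dirty_bitmap_bytes():

#include <stddef.h>

static int
bitmap_any_set(const unsigned long *bm, size_t nwords)
{
	size_t i;

	for (i = 0; i < nwords; i++) {
		if (bm[i] != 0)
			return (1);
	}
	return (0);
}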
*/ - if (is_dirty) { - struct kvm_memslots *slots, *old_slots; + /* + * XXX This should be its own privilage + */ + if (drv_priv(credp) != 0) + return (EPERM); - mutex_enter(&kvm->mmu_lock); - kvm_mmu_slot_remove_write_access(kvm, log->slot); - mutex_exit(&kvm->mmu_lock); + if (!(flag & FREAD && flag & FWRITE)) + return (EINVAL); - slots = kmem_zalloc(sizeof (struct kvm_memslots), KM_SLEEP); - if (!slots) - goto out_free; + if (getminor(*devp) != kvm_base_minor) + return (ENXIO); - memcpy(slots, kvm->memslots, sizeof (struct kvm_memslots)); - slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; + minor = (minor_t)(uintptr_t)vmem_alloc(kvm_minor, + 1, VM_BESTFIT | VM_SLEEP); - old_slots = kvm->memslots; -#ifdef XXX - rcu_assign_pointer(kvm->memslots, slots); - kvm_synchronize_srcu_expedited(&kvm->srcu); -#else - kvm->memslots = slots; - XXX_KVM_SYNC_PROBE; -#endif - dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; - kmem_free(old_slots, sizeof (struct kvm_memslots)); + if (ddi_soft_state_zalloc(kvm_state, minor) != 0) { + vmem_free(kvm_minor, (void *)(uintptr_t)minor, 1); + return (ENXIO); } - r = 0; - if (copyout(dirty_bitmap, log->v.dirty_bitmap, n) != 0) - r = EFAULT; -out_free: - kmem_free(dirty_bitmap, n); -out: - mutex_exit(&kvm->slots_lock); - return (r); -} - -static int -kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) -{ - struct kvm_pit *vpit = kvm->arch.vpit; - - mutex_enter(&vpit->pit_state.lock); - memcpy(ps->channels, &vpit->pit_state.channels, sizeof (ps->channels)); - ps->flags = vpit->pit_state.flags; - mutex_exit(&vpit->pit_state.lock); + *devp = makedevice(getmajor(*devp), minor); + ksp = ddi_get_soft_state(kvm_state, minor); + VERIFY(ksp != NULL); return (0); } +/*ARGSUSED*/ static int -kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) +kvm_close(dev_t dev, int flag, int otyp, cred_t *cred) { - boolean_t prev_legacy, cur_legacy, start = B_FALSE; - struct kvm_pit *vpit = kvm->arch.vpit; - - mutex_enter(&vpit->pit_state.lock); - prev_legacy = vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; - cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY; + kvm_devstate_t *ksp; + minor_t minor = getminor(dev); + kvm_t *kvmp; - if (!prev_legacy && cur_legacy) - start = B_TRUE; + VERIFY(getminor(dev) != kvm_base_minor); + ksp = ddi_get_soft_state(kvm_state, minor); - memcpy(&vpit->pit_state.channels, &ps->channels, - sizeof (vpit->pit_state.channels)); + if ((kvmp = ksp->kds_kvmp) != NULL) { + mutex_enter(&kvm_lock); - vpit->pit_state.flags = ps->flags; - kvm_pit_load_count(kvm, 0, vpit->pit_state.channels[0].count, start); + if (kvmp->kvm_clones > 0) { + kvmp->kvm_clones--; + mutex_exit(&kvm_lock); + } else { + mutex_exit(&kvm_lock); + kvm_destroy_vm(kvmp); + } + } - mutex_exit(&vpit->pit_state.lock); + ddi_soft_state_free(kvm_state, minor); + vmem_free(kvm_minor, (void *)(uintptr_t)minor, 1); return (0); } @@ -6440,6 +2149,11 @@ kvm_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv) break; } + case KVM_GET_MSR_INDEX_LIST: { + rval = kvm_vm_ioctl_get_msr_index_list(NULL, arg); + *rv = 0; + break; + } case KVM_CREATE_VCPU: { uint32_t id = (uintptr_t)arg; struct kvm *kvmp; @@ -6507,58 +2221,6 @@ kvm_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv) kmem_free(cpuid, sizeof (struct kvm_cpuid2)); break; } - - case KVM_GET_MSR_INDEX_LIST: { - struct kvm_msr_list *user_msr_list = (struct kvm_msr_list *)arg; - struct kvm_msr_list *msr_list; - size_t sz = sizeof (struct kvm_msr_list); - unsigned n; - - 
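From userland, the handler below implies a probe-then-fetch pattern: the driver rewrites nmsrs to the real count before checking the caller's capacity, so a first call with nmsrs == 0 fails with E2BIG but reports the needed size. A sketch with an abbreviated struct layout (field names assumed, error handling trimmed):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>

struct msr_list_hdr {
	uint32_t nmsrs;
	uint32_t indices[];	/* flexible array in the real ABI */
};

static struct msr_list_hdr *
get_msr_index_list(int fd, unsigned long req)
{
	struct msr_list_hdr probe, *list;

	memset(&probe, 0, sizeof (probe));
	(void) ioctl(fd, req, &probe);		/* fails E2BIG, sets nmsrs */

	list = calloc(1, sizeof (*list) + probe.nmsrs * sizeof (uint32_t));
	if (list == NULL)
		return (NULL);
	list->nmsrs = probe.nmsrs;
	if (ioctl(fd, req, list) != 0) {
		free(list);
		return (NULL);
	}
	return (list);
}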
msr_list = kmem_zalloc(sz, KM_SLEEP); - - if (copyin(user_msr_list, msr_list, sz) != 0) { - kmem_free(msr_list, sz); - rval = EFAULT; - break; - } - - n = msr_list->nmsrs; - msr_list->nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); - - if (copyout(msr_list, user_msr_list, sz) != 0) { - kmem_free(msr_list, sz); - rval = EFAULT; - break; - } - - if (n < msr_list->nmsrs) { - kmem_free(msr_list, sz); - rval = E2BIG; - break; - } - - if (copyout(&msrs_to_save, user_msr_list->indices, - num_msrs_to_save * sizeof (uint32_t))) { - kmem_free(msr_list, sz); - rval = EFAULT; - break; - } - - if (copyout(&emulated_msrs, user_msr_list->indices + - num_msrs_to_save, ARRAY_SIZE(emulated_msrs) * - sizeof (uint32_t)) != 0) { - kmem_free(msr_list, sz); - rval = EFAULT; - break; - } - - kmem_free(msr_list, sz); - - rval = 0; - *rv = 0; - break; - } - case KVM_SET_SIGNAL_MASK: { struct kvm_signal_mask *sigmask = argp; struct kvm_signal_mask kvm_sigmask; @@ -7041,43 +2703,64 @@ kvm_segmap(dev_t dev, off_t off, struct as *asp, caddr_t *addrp, off_t len, } -static void -kvm_on_user_return(struct kvm_vcpu *vcpu, struct kvm_user_return_notifier *urn) +static struct cb_ops kvm_cb_ops = { + kvm_open, + kvm_close, /* close */ + nodev, + nodev, + nodev, /* dump */ + nodev, /* read */ + nodev, /* write */ + kvm_ioctl, + kvm_devmap, + nodev, /* mmap */ + kvm_segmap, /* segmap */ + nochpoll, /* poll */ + ddi_prop_op, + NULL, + D_NEW | D_MP | D_DEVMAP +}; + +static struct dev_ops kvm_ops = { + DEVO_REV, + 0, + kvm_getinfo, + nulldev, /* identify */ + nulldev, /* probe */ + kvm_attach, + kvm_detach, + nodev, /* reset */ + &kvm_cb_ops, + (struct bus_ops *)0 +}; + +static struct modldrv modldrv = { + &mod_driverops, + "kvm driver v0.1", + &kvm_ops +}; + +static struct modlinkage modlinkage = { + MODREV_1, + { &modldrv, NULL } +}; + +int +_init(void) { - unsigned slot; - struct kvm_shared_msrs *locals = - (struct kvm_shared_msrs *)(((caddr_t)urn) - - offsetof(struct kvm_shared_msrs, urn)); - struct kvm_shared_msr_values *values; - - for (slot = 0; slot < shared_msrs_global.nr; ++slot) { - values = &locals->values[slot]; - if (values->host != values->curr) { - wrmsrl(shared_msrs_global.msrs[slot], values->host); - values->curr = values->host; - } - } - locals->registered = 0; - kvm_user_return_notifier_unregister(vcpu, urn); + + return (mod_install(&modlinkage)); } -void -kvm_vcpu_uninit(struct kvm_vcpu *vcpu) +int +_fini(void) { - kvm_arch_vcpu_uninit(vcpu); - ddi_umem_free(vcpu->cookie); + return (mod_remove(&modlinkage)); } -static int -kvm_avlmmucmp(const void *arg1, const void *arg2) +int +_info(struct modinfo *modinfop) { - const kvm_mmu_page_t *mp1 = arg1; - const kvm_mmu_page_t *mp2 = arg2; - if (mp1->kmp_avlspt > mp2->kmp_avlspt) - return (1); - if (mp1->kmp_avlspt < mp2->kmp_avlspt) - return (-1); - ASSERT(mp1->kmp_avlspt == mp2->kmp_avlspt); - return (0); + return (mod_info(&modlinkage, modinfop)); } /* END CSTYLED */ @@ -43,6 +43,35 @@ struct kvm; struct kvm_vcpu; +typedef struct kvm_user_return_notifier { + void (*on_user_return)(struct kvm_vcpu *, + struct kvm_user_return_notifier *); +} kvm_user_return_notifier_t; + +void kvm_user_return_notifier_register(struct kvm_vcpu *vcpu, + struct kvm_user_return_notifier *urn); +void kvm_user_return_notifier_unregister(struct kvm_vcpu *vcpu, + struct kvm_user_return_notifier *urn); +void kvm_fire_urn(struct kvm_vcpu *vcpu); + +#define KVM_NR_SHARED_MSRS 16 + +typedef struct kvm_shared_msrs_global { + int nr; + uint32_t msrs[KVM_NR_SHARED_MSRS]; +} 
kvm_shared_msrs_global_t; + +typedef struct kvm_shared_msrs { + struct kvm_user_return_notifier urn; + int registered; + struct kvm_shared_msr_values { + uint64_t host; + uint64_t curr; + } values[KVM_NR_SHARED_MSRS]; +} kvm_shared_msrs_t; + +extern struct kvm_shared_msrs **shared_msrs; + /* * It would be nice to use something smarter than a linear search, TBD... * Thankfully we dont expect many devices to register (famous last words :), @@ -316,12 +345,14 @@ int kvm_get_dirty_log(struct kvm *kvm, int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log); +int kvm_vm_ioctl_get_msr_index_list(struct kvm *kvm, uintptr_t arg); int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, int user_alloc); -long kvm_arch_vm_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg); +int kvm_vm_ioctl_set_tss_addr(struct kvm *kvmp, caddr_t addr); +int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip); +int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip); int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); @@ -340,6 +371,24 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg); int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu); +int kvm_vcpu_ioctl_get_msrs(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, int *rv); +int kvm_vcpu_ioctl_set_msrs(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, int *rv); +int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, uint64_t *mcg_capp); +int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid); +int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid); +int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); +int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); +int kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events); +int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events); +int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq); +int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps); +int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps); +int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, uint64_t ident_addr); +int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *, + struct kvm_cpuid_entry2 *); + int kvm_arch_init(void *opaque); void kvm_arch_exit(void); @@ -349,6 +398,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); +struct kvm_vcpu * kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id); int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu); @@ -444,20 +494,6 @@ void kvm_guest_enter(void); void kvm_guest_exit(void); void kvm_migrate_timers(struct kvm_vcpu *vcpu); -enum kvm_stat_kind { - KVM_STAT_VM, - KVM_STAT_VCPU, -}; - -typedef struct kvm_stats_debugfs_item { - const char *name; - int offset; - enum kvm_stat_kind kind; - struct dentry *dentry; -} kvm_stats_debugfs_item_t; -extern struct kvm_stats_debugfs_item debugfs_entries[]; -extern struct dentry *kvm_debugfs_dir; - #ifndef KVM_ARCH_HAS_UNALIAS_INSTANTIATION #define unalias_gfn_instantiation unalias_gfn #endif @@ -488,27 +524,8 @@ 
void kvm_sigprocmask(int how, sigset_t *, sigset_t *); */ #define offset_in_page(p) ((unsigned long)(p) & ~PAGEMASK) -/* borrowed liberally from linux... */ - -#define MAX_IO_MSRS 256 -#define CR0_RESERVED_BITS \ - (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ - | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ - | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) -#define CR4_RESERVED_BITS \ - (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ - | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ - | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ - | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) - -#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) - -#define MCG_CTL_P (1ULL<<8) /* MCG_CTL register available */ -#define KVM_MAX_MCE_BANKS 32 -#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P #define page_to_pfn(page) (page->p_pagenum) - /* LDT or TSS descriptor in the GDT. 16 bytes. */ struct ldttss_desc64 { unsigned short limit0; @@ -701,6 +701,13 @@ kvm_page_table_hashfn(gfn_t gfn) return (gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1)); } +static void +bitmap_zero(unsigned long *dst, int nbits) +{ + int len = BITS_TO_LONGS(nbits) * sizeof (unsigned long); + memset(dst, 0, len); +} + static struct kvm_mmu_page * kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, uint64_t *parent_pte) { @@ -3000,3 +3007,34 @@ is_present_gpte(unsigned long pte) { return (pte & PT_PRESENT_MASK); } + +static struct kvm_mmu_page * +page_private(kvm_t *kvmp, page_t *page) +{ + kvm_mmu_page_t mp, *res; + mp.kmp_avlspt = (uintptr_t)page; + mutex_enter(&kvmp->kvm_avllock); + res = avl_find(&kvmp->kvm_avlmp, &mp, NULL); + mutex_exit(&kvmp->kvm_avllock); + ASSERT(res != NULL); + return (res); +} + +struct kvm_mmu_page * +page_header(kvm_t *kvmp, hpa_t shadow_page) +{ + return (page_private(kvmp, pfn_to_page(shadow_page >> PAGESHIFT))); +} + +int +kvm_avlmmucmp(const void *arg1, const void *arg2) +{ + const kvm_mmu_page_t *mp1 = arg1; + const kvm_mmu_page_t *mp2 = arg2; + if (mp1->kmp_avlspt > mp2->kmp_avlspt) + return (1); + if (mp1->kmp_avlspt < mp2->kmp_avlspt) + return (-1); + ASSERT(mp1->kmp_avlspt == mp2->kmp_avlspt); + return (0); +} @@ -58,5 +58,6 @@ extern int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *, extern void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); extern int kvm_mmu_reload(struct kvm_vcpu *vcpu); extern int is_present_gpte(unsigned long); +extern int kvm_avlmmucmp(const void *arg1, const void *arg2); #endif @@ -195,21 +195,7 @@ kvm_xcall(processorid_t cpu, kvm_xcall_t func, void *arg) kpreempt_enable(); } -uint32_t -bit(int bitno) -{ - return (1 << (bitno & 31)); -} -int -is_long_mode(struct kvm_vcpu *vcpu) -{ -#ifdef CONFIG_X86_64 - return (vcpu->arch.efer & EFER_LMA); -#else - return (0); -#endif -} unsigned short kvm_read_fs(void) @@ -357,3 +343,154 @@ get_page(page_t *page) { page = compound_head(page); } + + +page_t * +pfn_to_page(pfn_t pfn) +{ + return (page_numtopp_nolock(pfn)); +} + + +inline void +kvm_clear_exception_queue(struct kvm_vcpu *vcpu) +{ + vcpu->arch.exception.pending = 0; +} + +inline void +kvm_queue_interrupt(struct kvm_vcpu *vcpu, uint8_t vector, int soft) +{ + vcpu->arch.interrupt.pending = 1; + vcpu->arch.interrupt.soft = soft; + vcpu->arch.interrupt.nr = vector; +} + +inline void +kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu) +{ + vcpu->arch.interrupt.pending = 0; +} + +int +kvm_event_needs_reinjection(struct kvm_vcpu *vcpu) +{ + return (vcpu->arch.exception.pending || vcpu->arch.interrupt.pending || + vcpu->arch.nmi_injected); +} + +inline int 
+kvm_exception_is_soft(unsigned int nr)
+{
+	return (nr == BP_VECTOR) || (nr == OF_VECTOR);
+}
+
+inline int
+is_protmode(struct kvm_vcpu *vcpu)
+{
+	return (kvm_read_cr0_bits(vcpu, X86_CR0_PE));
+}
+
+int
+is_long_mode(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_X86_64
+	return (vcpu->arch.efer & EFER_LMA);
+#else
+	return (0);
+#endif
+}
+
+inline int
+is_pae(struct kvm_vcpu *vcpu)
+{
+	return (kvm_read_cr4_bits(vcpu, X86_CR4_PAE));
+}
+
+int
+is_pse(struct kvm_vcpu *vcpu)
+{
+	return (kvm_read_cr4_bits(vcpu, X86_CR4_PSE));
+}
+
+int
+is_paging(struct kvm_vcpu *vcpu)
+{
+	return (kvm_read_cr0_bits(vcpu, X86_CR0_PG));
+}
+
+uint64_t
+native_read_msr_safe(unsigned int msr, int *err)
+{
+	DECLARE_ARGS(val, low, high);
+	uint64_t ret = 0;
+	on_trap_data_t otd;
+
+	if (on_trap(&otd, OT_DATA_ACCESS) == 0) {
+		ret = native_read_msr(msr);
+		*err = 0;
+	} else {
+		*err = EINVAL; /* XXX probably not right... */
+	}
+	no_trap();
+
+	return (ret);
+}
+
+/* Can be uninlined because referenced by paravirt */
+int
+native_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
+{
+	int err = 0;
+	on_trap_data_t otd;
+
+	if (on_trap(&otd, OT_DATA_ACCESS) == 0) {
+		native_write_msr(msr, low, high);
+	} else {
+		err = EINVAL; /* XXX probably not right... */
+	}
+	no_trap();
+
+	return (err);
+}
+
+
+/* XXX Where should this live? */
+page_t *
+alloc_page(size_t size, int flag)
+{
+	caddr_t page_addr;
+	pfn_t pfn;
+	page_t *pp;
+
+	if ((page_addr = kmem_zalloc(size, flag)) == NULL)
+		return ((page_t *)NULL);
+
+	pp = page_numtopp_nolock(hat_getpfnum(kas.a_hat, page_addr));
+	return (pp);
+}
+
+int
+kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
+{
+	return (vcpu->kvm->bsp_vcpu_id == vcpu->vcpu_id);
+}
+
+/*
+ * Oftentimes we have pages that correspond to addresses in a user's virtual
+ * address space. Rather than trying to constantly map them in and out of
+ * our address space, we instead go through the kpm segment, which
+ * facilitates this for us. This always returns an address in the kernel's
+ * virtual address space.
+ */
+caddr_t
+page_address(page_t *page)
+{
+	return (hat_kpm_mapin_pfn(page->p_pagenum));
+}
+
+uint32_t
+bit(int bitno)
+{
+	return (1 << (bitno & 31));
+}
@@ -35,6 +35,12 @@
 #include "kvm_mmu.h"
 #include "kvm_vmx.h"
 
+
+/*
+ * Globals
+ */
+struct kvm_shared_msrs **shared_msrs;
+
 #define	VMX_NR_VPIDS	(1 << 16)
 static kmutex_t vmx_vpid_lock;
 static ulong_t *vmx_vpid_bitmap;
@@ -57,8 +63,6 @@ static kmem_cache_t *kvm_vcpu_cache;
 static struct vmcs **vmxarea;	/* 1 per cpu */
 static struct vmcs **current_vmcs;
 
-/* XXX Should shared_msrs be static? */
-struct kvm_shared_msrs **shared_msrs;
 static list_t **vcpus_on_cpu;
 static uint64_t *vmxarea_pa;	/* physical address of each vmxarea */
 
@@ -587,6 +591,30 @@ vmwrite_error(unsigned long field, unsigned long value)
 	    field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
 }
 
+static void
+__vmwrite(unsigned long field, unsigned long value)
+{
+	uint8_t err = 0;
+
+	/*CSTYLED*/
+	__asm__ volatile ( ASM_VMX_VMWRITE_RAX_RDX "\n\t" "setna %0"
+	    /* XXX: CF==1 or ZF==1 --> crash (ud2) */
+	    /* "ja 1f ; ud2 ; 1:\n" */
+	    : "=q"(err) : "a" (value), "d" (field)
+	    : "cc", "memory");
+
+	/* XXX the following should be ifdef debug... */
+	if (err) {
+#ifdef XXX
+		vmcs_read32(VM_INSTRUCTION_ERROR);
+		cmn_err(CE_WARN, "_vmwrite: error writing %lx to %lx: "
+		    "error number = %d\n", value, field, err & 0xff);
+#else
+		XXX_KVM_PROBE;
+#endif
+	}
+}
+
+/* XXX Should be static!
*/ void vmcs_writel(unsigned long field, unsigned long value) @@ -1,38 +1,179 @@ +/* + * + */ + #include <sys/types.h> #include <sys/param.h> -#include <sys/errno.h> -#include <sys/modctl.h> -#include <sys/kmem.h> -#include <sys/conf.h> -#include <sys/cmn_err.h> -#include <sys/stat.h> +#include <sys/mutex.h> +#include <sys/ksynch.h> +#include <sys/condvar_impl.h> #include <sys/ddi.h> -#include <sys/sunddi.h> -#include <sys/cpuvar.h> -#include <vm/hat_i86.h> -#include <sys/segments.h> -#include <sys/mman.h> -#include <sys/mach_mmu.h> -#include <sys/int_limits.h> -#include <sys/x_call.h> + +#include <vm/page.h> +#include <vm/hat.h> + +#include <asm/cpu.h> #include "kvm_bitops.h" -#include "kvm_apicdef.h" -#include "kvm_types.h" +#include "kvm_vmx.h" +#include "msr-index.h" +#include "msr.h" +#include "irqflags.h" #include "kvm_host.h" - -#include "kvm_coalesced_mmio.h" +#include "kvm_lapic.h" +#include "processor-flags.h" +#include "kvm_cpuid.h" +#include "hyperv.h" +#include "kvm_apicdef.h" +#include "kvm_iodev.h" +#include "kvm.h" +#include "kvm_x86impl.h" #include "kvm_irq.h" +#include "kvm_tss.h" +#include "kvm_ioapic.h" +#include "kvm_coalesced_mmio.h" #include "kvm_i8254.h" -#include "kvm_x86impl.h" +#include "kvm_mmu.h" +#include "kvm_cache_regs.h" -#undef DEBUG - -extern struct kvm_shared_msrs_global shared_msrs_global; -extern void shared_msr_update(unsigned slot, uint32_t msr); +/* XXX These don't belong here! */ extern caddr_t smmap64(caddr_t addr, size_t len, int prot, int flags, int fd, off_t pos); +#ifdef XXX_KVM_DECLARATION +unsigned long *vmx_io_bitmap_a; +unsigned long *vmx_io_bitmap_b; +unsigned long *vmx_msr_bitmap_legacy; +unsigned long *vmx_msr_bitmap_longmode; +#else +/* make these arrays to try to force into low 4GB memory... */ +/* also need to be aligned... 
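The alignment matters because the VMCS consumes physical addresses of whole 4 KiB bitmaps: a page-aligned static array occupies exactly one page frame, so a single hat_getpfnum() lookup yields the physical address of the entire bitmap. A hedged sketch (bitmap_pa is not a driver function):

static uint64_t
bitmap_pa(void *bitmap)
{
	pfn_t pfn = hat_getpfnum(kas.a_hat, (caddr_t)bitmap);

	return (((uint64_t)pfn << PAGESHIFT) |
	    ((uintptr_t)bitmap & PAGEOFFSET));
}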
*/
+__attribute__((__aligned__(PAGESIZE)))unsigned long
+    vmx_io_bitmap_a[PAGESIZE / sizeof (unsigned long)];
+__attribute__((__aligned__(PAGESIZE)))unsigned long
+    vmx_io_bitmap_b[PAGESIZE / sizeof (unsigned long)];
+__attribute__((__aligned__(PAGESIZE)))unsigned long
+    vmx_msr_bitmap_legacy[PAGESIZE / sizeof (unsigned long)];
+__attribute__((__aligned__(PAGESIZE)))unsigned long
+    vmx_msr_bitmap_longmode[PAGESIZE / sizeof (unsigned long)];
+#endif
+
+static unsigned long empty_zero_page[PAGESIZE / sizeof (unsigned long)];
+
+#define	MAX_IO_MSRS 256
+#define	CR0_RESERVED_BITS	\
+	(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
+	    | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
+	    | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
+#define	CR4_RESERVED_BITS	\
+	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
+	    | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
+	    | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
+	    | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
+
+#define	CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
+
+/*
+ * EFER defaults:
+ * - enable syscall per default because it's emulated by KVM
+ * - enable LME and LMA per default on 64-bit KVM
+ */
+#ifdef CONFIG_X86_64
+static uint64_t efer_reserved_bits = 0xfffffffffffffafeULL;
+#else
+static uint64_t efer_reserved_bits = 0xfffffffffffffffeULL;
+#endif
+
+static void update_cr8_intercept(struct kvm_vcpu *);
+
+struct kvm_x86_ops *kvm_x86_ops;
+int ignore_msrs = 0;
+
+static struct kvm_shared_msrs_global shared_msrs_global;
+
+static void
+kvm_on_user_return(struct kvm_vcpu *vcpu, struct kvm_user_return_notifier *urn)
+{
+	unsigned slot;
+	struct kvm_shared_msrs *locals =
+	    (struct kvm_shared_msrs *)(((caddr_t)urn) -
+	    offsetof(struct kvm_shared_msrs, urn));
+	struct kvm_shared_msr_values *values;
+
+	for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
+		values = &locals->values[slot];
+		if (values->host != values->curr) {
+			wrmsrl(shared_msrs_global.msrs[slot], values->host);
+			values->curr = values->host;
+		}
+	}
+	locals->registered = 0;
+	kvm_user_return_notifier_unregister(vcpu, urn);
+}
+
+static void
+shared_msr_update(unsigned slot, uint32_t msr)
+{
+	struct kvm_shared_msrs *smsr;
+	uint64_t value;
+	smsr = shared_msrs[CPU->cpu_id];
+
+	/*
+	 * This is only read, and nobody should be modifying it at this
+	 * time, so no lock is needed.
+	 */
+	if (slot >= shared_msrs_global.nr) {
+		cmn_err(CE_WARN, "kvm: invalid MSR slot!");
+		return;
+	}
+
+	rdmsrl_safe(msr, (unsigned long long *)&value);
+	smsr->values[slot].host = value;
+	smsr->values[slot].curr = value;
+}
+
+void
+kvm_define_shared_msr(unsigned slot, uint32_t msr)
+{
+	if (slot >= shared_msrs_global.nr)
+		shared_msrs_global.nr = slot + 1;
+	shared_msrs_global.msrs[slot] = msr;
+#ifdef XXX
+	/* we need to ensure shared_msrs_global has been updated */
+	smp_wmb();
+#else
+	XXX_KVM_SYNC_PROBE;
+#endif
+}
+
+static void
+kvm_shared_msr_cpu_online(void)
+{
+	unsigned i;
+
+	for (i = 0; i < shared_msrs_global.nr; i++)
+		shared_msr_update(i, shared_msrs_global.msrs[i]);
+}
+
+void
+kvm_set_shared_msr(struct kvm_vcpu *vcpu, unsigned slot, uint64_t value,
+    uint64_t mask)
+{
+	struct kvm_shared_msrs *smsr = shared_msrs[CPU->cpu_id];
+
+	if (((value ^ smsr->values[slot].curr) & mask) == 0)
+		return;
+
+	smsr->values[slot].curr = value;
+	wrmsrl(shared_msrs_global.msrs[slot], value);
+
+	if (!smsr->registered) {
+		smsr->urn.on_user_return = kvm_on_user_return;
+		kvm_user_return_notifier_register(vcpu, &smsr->urn);
+		smsr->registered = 1;
+	}
+}
+
 unsigned long
segment_base(uint16_t selector) { @@ -64,253 +205,1353 @@ segment_base(uint16_t selector) return (v); } +uint64_t +kvm_get_apic_base(struct kvm_vcpu *vcpu) +{ + if (irqchip_in_kernel(vcpu->kvm)) + return (vcpu->arch.apic_base); + else + return (vcpu->arch.apic_base); +} -struct kvm * -kvm_arch_create_vm(void) +void +kvm_set_apic_base(struct kvm_vcpu *vcpu, uint64_t data) { - struct kvm *kvm = kmem_zalloc(sizeof (struct kvm), KM_SLEEP); + /* TODO: reserve bits check */ + if (irqchip_in_kernel(vcpu->kvm)) + kvm_lapic_set_base(vcpu, data); + else + vcpu->arch.apic_base = data; +} - if (!kvm) - return (NULL); +#define EXCPT_BENIGN 0 +#define EXCPT_CONTRIBUTORY 1 +#define EXCPT_PF 2 - if ((kvm->arch.aliases = - kmem_zalloc(sizeof (struct kvm_mem_aliases), KM_SLEEP)) == NULL) { - kmem_free(kvm, sizeof (struct kvm)); - return (NULL); +static int +exception_class(int vector) +{ + switch (vector) { + case PF_VECTOR: + return (EXCPT_PF); + case DE_VECTOR: + case TS_VECTOR: + case NP_VECTOR: + case SS_VECTOR: + case GP_VECTOR: + return (EXCPT_CONTRIBUTORY); + default: + break; } - list_create(&kvm->arch.active_mmu_pages, sizeof (struct kvm_mmu_page), - offsetof(struct kvm_mmu_page, link)); + return (EXCPT_BENIGN); +} - list_create(&kvm->arch.assigned_dev_head, - sizeof (struct kvm_assigned_dev_kernel), - offsetof(struct kvm_assigned_dev_kernel, list)); +static void +kvm_multiple_exception(struct kvm_vcpu *vcpu, + unsigned nr, int has_error, uint32_t error_code) +{ + uint32_t prev_nr; + int class1, class2; + + if (!vcpu->arch.exception.pending) { +queue: + vcpu->arch.exception.pending = 1; + vcpu->arch.exception.has_error_code = has_error; + vcpu->arch.exception.nr = nr; + vcpu->arch.exception.error_code = error_code; + return; + } - /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ - set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); + /* to check exception */ + prev_nr = vcpu->arch.exception.nr; + if (prev_nr == DF_VECTOR) { + /* triple fault -> shutdown */ + set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + return; + } + class1 = exception_class(prev_nr); + class2 = exception_class(nr); + if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) || + (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) { + /* generate double fault per SDM Table 5-5 */ + vcpu->arch.exception.pending = 1; + vcpu->arch.exception.has_error_code = 1; + vcpu->arch.exception.nr = DF_VECTOR; + vcpu->arch.exception.error_code = 0; + } else { + /* + * replace previous exception with a new one in a hope + * that instruction re-execution will regenerate lost + * exception + */ + goto queue; + } +} - /* XXX - original is rdtscll() */ - kvm->arch.vm_init_tsc = (uint64_t)gethrtime(); +void +kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) +{ + kvm_multiple_exception(vcpu, nr, 0, 0); +} - return (kvm); +void +kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, + uint32_t error_code) +{ + KVM_VCPU_KSTAT_INC(vcpu, kvmvs_pf_guest); + vcpu->arch.cr2 = addr; + kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); } -inline gpa_t -gfn_to_gpa(gfn_t gfn) +void +kvm_inject_nmi(struct kvm_vcpu *vcpu) { - return ((gpa_t)gfn << PAGESHIFT); + vcpu->arch.nmi_pending = 1; } -page_t *pfn_to_page(pfn_t pfn); +void +kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, uint32_t error_code) +{ + kvm_multiple_exception(vcpu, nr, 1, error_code); +} + +/* + * Checks if cpl <= required_cpl; if true, return true. Otherwise queue + * a #GP and return false. 
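+ * Emulation paths for privileged operations are expected to call this
+ * first and bail out on failure; illustrative use, not code from this
+ * change:
+ *
+ *	if (!kvm_require_cpl(vcpu, 0))
+ *		return;
+ *
+ * since on failure the #GP has already been queued on the vcpu.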
+ */
+int
+kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
+{
+	if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
+		return (1);
+	kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+	return (0);
+}
+
+/*
+ * Load the pae pdptrs. Return true if they are all valid.
+ */
+int
+load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
+{
+	gfn_t pdpt_gfn = cr3 >> PAGESHIFT;
+	unsigned offset = ((cr3 & (PAGESIZE-1)) >> 5) << 2;
+	int i;
+	int ret;
+	uint64_t pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
+
+	ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn,
+	    pdpte, offset * sizeof (uint64_t), sizeof (pdpte));
+
+	if (ret < 0) {
+		ret = 0;
+		goto out;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(pdpte); i++) {
+		if (is_present_gpte(pdpte[i]) &&
+		    (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
+			ret = 0;
+			goto out;
+		}
+	}
+	ret = 1;
+
+	memcpy(vcpu->arch.pdptrs, pdpte, sizeof (vcpu->arch.pdptrs));
+	__set_bit(VCPU_EXREG_PDPTR, (unsigned long *)&vcpu->arch.regs_avail);
+	__set_bit(VCPU_EXREG_PDPTR, (unsigned long *)&vcpu->arch.regs_dirty);
+out:
+	return (ret);
+}
+
+static int
+pdptrs_changed(struct kvm_vcpu *vcpu)
+{
+	uint64_t pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
+
+	if (is_long_mode(vcpu) || !is_pae(vcpu))
+		return (0);
+
+	if (!test_bit(VCPU_EXREG_PDPTR,
+	    (unsigned long *)&vcpu->arch.regs_avail)) {
+		return (1);
+	}
+
+	if (kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u,
+	    pdpte, sizeof (pdpte)) < 0)
+		return (1);
+
+	return (memcmp(pdpte, vcpu->arch.pdptrs, sizeof (pdpte)) != 0);
+}
+
+void
+kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+	cr0 |= X86_CR0_ET;
+
+#ifdef CONFIG_X86_64
+	if (cr0 & 0xffffffff00000000UL) {
+		kvm_inject_gp(vcpu, 0);
+		return;
+	}
+#endif
+
+	cr0 &= ~CR0_RESERVED_BITS;
+
+	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
+		kvm_inject_gp(vcpu, 0);
+		return;
+	}
+
+	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
+		kvm_inject_gp(vcpu, 0);
+		return;
+	}
+
+	if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
+#ifdef CONFIG_X86_64
+		if ((vcpu->arch.efer & EFER_LME)) {
+			int cs_db, cs_l;
+
+			if (!is_pae(vcpu)) {
+				kvm_inject_gp(vcpu, 0);
+				return;
+			}
+
+			kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+			if (cs_l) {
+				kvm_inject_gp(vcpu, 0);
+				return;
+			}
+		} else
+#endif
+		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
+			kvm_inject_gp(vcpu, 0);
+			return;
+		}
+	}
+
+	kvm_x86_ops->set_cr0(vcpu, cr0);
+	vcpu->arch.cr0 = cr0;
+	kvm_mmu_reset_context(vcpu);
+}
+
+void
+kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
+{
+	kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0ful) | (msw & 0x0f));
+}
+
+void
+kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+	unsigned long old_cr4 = kvm_read_cr4(vcpu);
+	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
+
+	if (cr4 & CR4_RESERVED_BITS) {
+		kvm_inject_gp(vcpu, 0);
+		return;
+	}
+
+	if (is_long_mode(vcpu)) {
+		if (!(cr4 & X86_CR4_PAE)) {
+			kvm_inject_gp(vcpu, 0);
+			return;
+		}
+	} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) &&
+	    ((cr4 ^ old_cr4) & pdptr_bits) &&
+	    !load_pdptrs(vcpu, vcpu->arch.cr3)) {
+		kvm_inject_gp(vcpu, 0);
+		return;
+	}
+
+	if (cr4 & X86_CR4_VMXE) {
+		kvm_inject_gp(vcpu, 0);
+		return;
+	}
+
+	kvm_x86_ops->set_cr4(vcpu, cr4);
+	vcpu->arch.cr4 = cr4;
+	vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
+	kvm_mmu_reset_context(vcpu);
+}
+
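+/*
+ * Note on the PAE PDPTR handling above: hardware snapshots the four
+ * PDPTEs only on specific events, which is why load_pdptrs() is called
+ * from kvm_set_cr0() (enabling paging under PAE), from kvm_set_cr4()
+ * (toggling any of PGE/PSE/PAE while paging is on), and from
+ * kvm_set_cr3() below, rather than on every guest memory access.
+ */
+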
+void
+kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+{
+	if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
+		kvm_mmu_sync_roots(vcpu);
+		kvm_mmu_flush_tlb(vcpu);
+		return;
+	}
+
+	if (is_long_mode(vcpu)) {
+		if (cr3 & CR3_L_MODE_RESERVED_BITS) {
+			kvm_inject_gp(vcpu, 0);
+			return;
+		}
+	} else {
+		if (is_pae(vcpu)) {
+			if (cr3 & CR3_PAE_RESERVED_BITS) {
+				kvm_inject_gp(vcpu, 0);
+				return;
+			}
+			if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
+				kvm_inject_gp(vcpu, 0);
+				return;
+			}
+		}
+		/*
+		 * We don't check reserved bits in nonpae mode, because
+		 * this isn't enforced, and VMware depends on this.
+		 */
+	}
+
+	/*
+	 * Does the new cr3 value map to physical memory? (Note, we
+	 * catch an invalid cr3 even in real-mode, because it would
+	 * cause trouble later on when we turn on paging anyway.)
+	 *
+	 * A real CPU would silently accept an invalid cr3 and would
+	 * attempt to use it - with largely undefined (and often hard
+	 * to debug) behavior on the guest side.
+	 */
+	if ((!gfn_to_memslot(vcpu->kvm, cr3 >> PAGESHIFT)))
+		kvm_inject_gp(vcpu, 0);
+	else {
+		vcpu->arch.cr3 = cr3;
+		vcpu->arch.mmu.new_cr3(vcpu);
+	}
+}
+
+void
+kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
+{
+	if (cr8 & CR8_RESERVED_BITS) {
+		kvm_inject_gp(vcpu, 0);
+		return;
+	}
+
+	if (irqchip_in_kernel(vcpu->kvm))
+		kvm_lapic_set_tpr(vcpu, cr8);
+	else
+		vcpu->arch.cr8 = cr8;
+}
+
+unsigned long
+kvm_get_cr8(struct kvm_vcpu *vcpu)
+{
+	if (irqchip_in_kernel(vcpu->kvm)) {
+		return (kvm_lapic_get_cr8(vcpu));
+	} else {
+		return (vcpu->arch.cr8);
+	}
+}
+
+/*
+ * List of msr numbers which we expose to userspace through KVM_GET_MSRS
+ * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
+ *
+ * This list is modified at module load time to reflect the
+ * capabilities of the host cpu. This capabilities test skips MSRs that
+ * are kvm-specific; those are put at the beginning of the list.
+ */
+
+#define	MSR_KVM_WALL_CLOCK	0x11
+#define	MSR_KVM_SYSTEM_TIME	0x12
+
+#define	KVM_SAVE_MSRS_BEGIN	5
+static uint32_t msrs_to_save[] = {
+	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
+	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
+	HV_X64_MSR_APIC_ASSIST_PAGE,
+	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
+	MSR_K6_STAR,
+#ifdef CONFIG_X86_64
+	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
+#endif
+	MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
+};
+
+static unsigned num_msrs_to_save;
+
+static uint32_t emulated_msrs[] = {
+	MSR_IA32_MISC_ENABLE,
+};
+
+static void
+set_efer(struct kvm_vcpu *vcpu, uint64_t efer)
+{
+	if (efer & efer_reserved_bits) {
+		kvm_inject_gp(vcpu, 0);
+		return;
+	}
+
+	if (is_paging(vcpu) &&
+	    (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) {
+		kvm_inject_gp(vcpu, 0);
+		return;
+	}
+
+	if (efer & EFER_FFXSR) {
+		struct kvm_cpuid_entry2 *feat;
+
+		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
+		if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
+			kvm_inject_gp(vcpu, 0);
+			return;
+		}
+	}
+
+	if (efer & EFER_SVME) {
+		struct kvm_cpuid_entry2 *feat;
+
+		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
+		if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
+			kvm_inject_gp(vcpu, 0);
+			return;
+		}
+	}
+
+	kvm_x86_ops->set_efer(vcpu, efer);
+
+	efer &= ~EFER_LMA;
+	efer |= vcpu->arch.efer & EFER_LMA;
+
+	vcpu->arch.efer = efer;
+
+	vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
+	kvm_mmu_reset_context(vcpu);
+}

 void
-kvm_release_pfn_clean(pfn_t pfn)
+kvm_enable_efer_bits(uint64_t mask)
+{
+	efer_reserved_bits &= ~mask;
+}
+
+/*
+ * Writes msr value into the appropriate "register".
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
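+ * The store is delegated to the vendor module through
+ * kvm_x86_ops->set_msr(); for MSRs with no vendor-specific behavior
+ * that entry point is in turn expected to fall back to
+ * kvm_set_msr_common() below.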
+ */ +int +kvm_set_msr(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t data) +{ + return (kvm_x86_ops->set_msr(vcpu, msr_index, data)); +} + +/* + * Adapt set_msr() to msr_io()'s calling convention + */ +static int +do_set_msr(struct kvm_vcpu *vcpu, unsigned index, uint64_t *data) +{ + return (kvm_set_msr(vcpu, index, *data)); +} + +static void +kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) { + static int version; + struct pvclock_wall_clock wc; + struct timespec boot; + + if (!wall_clock) + return; + + version++; + + kvm_write_guest(kvm, wall_clock, &version, sizeof (version)); + + /* + * The guest calculates current wall clock time by adding + * system time (updated by kvm_write_guest_time below) to the + * wall clock specified here. guest system time equals host + * system time for us, thus we must fill in host boot time here. + */ #ifdef XXX - if (!kvm_is_mmio_pfn(pfn)) - put_page(pfn_to_page(pfn)); + getboottime(&boot); + + wc.sec = boot.tv_sec; + wc.nsec = boot.tv_nsec; + wc.version = version; + + kvm_write_guest(kvm, wall_clock, &wc, sizeof (wc)); + + version++; + kvm_write_guest(kvm, wall_clock, &version, sizeof (version)); #else XXX_KVM_PROBE; #endif } -#ifdef IOMMU +static uint32_t +div_frac(uint32_t dividend, uint32_t divisor) +{ + uint32_t quotient, remainder; -paddr_t -iommu_iova_to_phys(struct iommu_domain *domain, unsigned long iova) + /* + * Don't try to replace with do_div(), this one calculates + * "(dividend << 32) / divisor" + */ + __asm__("divl %4" + : "=a" (quotient), "=d" (remainder) + : "0" (0), "1" (dividend), "r" (divisor)); + + return (quotient); +} + +static void +kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock) { - return (iommu_ops->iova_to_phys(domain, iova)); + uint64_t nsecs = 1000000000LL; + int32_t shift = 0; + uint64_t tps64; + uint32_t tps32; + + tps64 = tsc_khz * 1000LL; + while (tps64 > nsecs*2) { + tps64 >>= 1; + shift--; + } + + tps32 = (uint32_t)tps64; + while (tps32 <= (uint32_t)nsecs) { + tps32 <<= 1; + shift++; + } + + hv_clock->tsc_shift = shift; + hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); } +/* XXX Expected to be per cpu */ +static uint64_t cpu_tsc_khz; +/* XXX extern?! */ +extern uint64_t cpu_freq_hz; -static void kvm_iommu_put_pages(struct kvm *kvm, - gfn_t base_gfn, unsigned long npages) +static void +kvm_write_guest_time(struct kvm_vcpu *v) { - gfn_t gfn = base_gfn; - pfn_t pfn; - struct iommu_domain *domain = kvm->arch.iommu_domain; - unsigned long i; - uint64_t phys; + struct timespec ts; + unsigned long flags; + struct kvm_vcpu_arch *vcpu = &v->arch; + void *shared_kaddr; + unsigned long this_tsc_khz; - /* check if iommu exists and in use */ - if (!domain) + if ((!vcpu->time_page)) return; - for (i = 0; i < npages; i++) { - phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn)); - pfn = phys >> PAGESHIFT; - kvm_release_pfn_clean(pfn); - gfn++; + this_tsc_khz = cpu_tsc_khz; + if (vcpu->hv_clock_tsc_khz != this_tsc_khz) { + kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock); + vcpu->hv_clock_tsc_khz = this_tsc_khz; } - iommu_unmap_range(domain, gfn_to_gpa(base_gfn), PAGESIZE * npages); +#ifdef XXX + put_cpu_var(cpu_tsc_khz); +#else + XXX_KVM_PROBE; +#endif + +#ifdef XXX + /* Keep irq disabled to prevent changes to the clock */ + local_irq_save(flags); +#else + /* + * may need to mask interrupts for local_irq_save, and unmask + * for local_irq_restore. cli()/sti() might be done... 
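+	 * The Linux original keeps interrupts off so that the TSC sample
+	 * and the gethrestime() call below are taken back to back; if we
+	 * are preempted between the two reads, the tsc_timestamp and
+	 * system_time pair published to the guest can be skewed.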
+ */ + XXX_KVM_PROBE; +#endif + kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp); + gethrestime(&ts); +#ifdef XXX + monotonic_to_bootbased(&ts); + local_irq_restore(flags); +#else + XXX_KVM_PROBE; +#endif + + /* With all the info we got, fill in the values */ + + vcpu->hv_clock.system_time = ts.tv_nsec + (NSEC_PER_SEC * + (uint64_t)ts.tv_sec) + v->kvm->arch.kvmclock_offset; + + /* + * The interface expects us to write an even number signaling that the + * update is finished. Since the guest won't see the intermediate + * state, we just increase by 2 at the end. + */ + vcpu->hv_clock.version += 2; + + shared_kaddr = page_address(vcpu->time_page); + + memcpy((void *)((uintptr_t)shared_kaddr + vcpu->time_offset), + &vcpu->hv_clock, sizeof (vcpu->hv_clock)); + + mark_page_dirty(v->kvm, vcpu->time >> PAGESHIFT); } static int -kvm_iommu_unmap_memslots(struct kvm *kvm) +kvm_request_guest_time_update(struct kvm_vcpu *v) { - int i; - struct kvm_memslots *slots; + struct kvm_vcpu_arch *vcpu = &v->arch; - slots = kvm->memslots; + if (!vcpu->time_page) + return (0); + + set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests); + + return (1); +} - for (i = 0; i < slots->nmemslots; i++) { - kvm_iommu_put_pages(kvm, slots->memslots[i].base_gfn, - slots->memslots[i].npages); +static int +msr_mtrr_valid(unsigned msr) +{ + switch (msr) { + case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1: + case MSR_MTRRfix64K_00000: + case MSR_MTRRfix16K_80000: + case MSR_MTRRfix16K_A0000: + case MSR_MTRRfix4K_C0000: + case MSR_MTRRfix4K_C8000: + case MSR_MTRRfix4K_D0000: + case MSR_MTRRfix4K_D8000: + case MSR_MTRRfix4K_E0000: + case MSR_MTRRfix4K_E8000: + case MSR_MTRRfix4K_F0000: + case MSR_MTRRfix4K_F8000: + case MSR_MTRRdefType: + case MSR_IA32_CR_PAT: + return (1); + case 0x2f8: + return (1); } return (0); } -int -kvm_iommu_unmap_guest(struct kvm *kvm) +static int +valid_pat_type(unsigned t) { - struct iommu_domain *domain = kvm->arch.iommu_domain; + return (t < 8 && (1 << t) & 0xf3); /* 0, 1, 4, 5, 6, 7 */ +} + +static int +valid_mtrr_type(unsigned t) +{ + return (t < 8 && (1 << t) & 0x73); /* 0, 1, 4, 5, 6 */ +} - /* check if iommu exists and in use */ - if (!domain) +static int +mtrr_valid(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data) +{ + int i; + + if (!msr_mtrr_valid(msr)) return (0); - kvm_iommu_unmap_memslots(kvm); - iommu_domain_free(domain); + if (msr == MSR_IA32_CR_PAT) { + for (i = 0; i < 8; i++) + if (!valid_pat_type((data >> (i * 8)) & 0xff)) + return (0); + return (1); + } else if (msr == MSR_MTRRdefType) { + if (data & ~0xcff) + return (0); + return (valid_mtrr_type(data & 0xff)); + } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) { + for (i = 0; i < 8; i++) + if (!valid_mtrr_type((data >> (i * 8)) & 0xff)) + return (0); + return (1); + } + + /* variable MTRRs */ + return (valid_mtrr_type(data & 0xff)); +} + +static int +set_msr_mtrr(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data) +{ + struct mtrr_state_type *state = &vcpu->arch.mtrr_state; + + uint64_t *p = (uint64_t *)&state->fixed_ranges; + + if (!mtrr_valid(vcpu, msr, data)) + return (1); + + if (msr == MSR_MTRRdefType) { + state->def_type = data; + state->enabled = (data & 0xc00) >> 10; + } else if (msr == MSR_MTRRfix64K_00000) + p[0] = data; + else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) + p[1 + msr - MSR_MTRRfix16K_80000] = data; + else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) + p[3 + msr - MSR_MTRRfix4K_C0000] = data; + else if (msr == MSR_IA32_CR_PAT) + vcpu->arch.pat 
= data; + else { /* Variable MTRRs */ + int idx, is_mtrr_mask; + uint64_t *pt; + + idx = (msr - 0x200) / 2; + is_mtrr_mask = msr - 0x200 - 2 * idx; + + if (!is_mtrr_mask) { + pt = (uint64_t *)&state->var_ranges[idx].base_lo; + } else { + pt = (uint64_t *)&state->var_ranges[idx].mask_lo; + } + + *pt = data; + } + + kvm_mmu_reset_context(vcpu); + return (0); } -#endif /* IOMMU */ -static void -kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) +static int +set_msr_mce(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data) { - vcpu_load(vcpu); - kvm_mmu_unload(vcpu); - vcpu_put(vcpu); + uint64_t mcg_cap = vcpu->arch.mcg_cap; + unsigned bank_num = mcg_cap & 0xff; + + switch (msr) { + case MSR_IA32_MCG_STATUS: + vcpu->arch.mcg_status = data; + break; + case MSR_IA32_MCG_CTL: + if (!(mcg_cap & MCG_CTL_P)) + return (1); + if (data != 0 && data != ~(uint64_t)0) + return (-1); + vcpu->arch.mcg_ctl = data; + break; + default: + if (msr >= MSR_IA32_MC0_CTL && + msr < MSR_IA32_MC0_CTL + 4 * bank_num) { + uint32_t offset = msr - MSR_IA32_MC0_CTL; + /* + * only 0 or all 1s can be written to IA32_MCi_CTL + * some Linux kernels though clear bit 10 in bank 4 to + * workaround a BIOS/GART TBL issue on AMD K8s, ignore + * this to avoid an uncatched #GP in the guest + */ + if ((offset & 0x3) == 0 && + data != 0 && (data | (1 << 10)) != ~(uint64_t)0) + return (-1); + vcpu->arch.mce_banks[offset] = data; + break; + } + return (1); + } + return (0); } -static void -kvm_free_vcpus(struct kvm *kvmp) +static int +xen_hvm_config(struct kvm_vcpu *vcpu, uint64_t data) { - int ii, maxcpus; + struct kvm *kvm = vcpu->kvm; + int lm = is_long_mode(vcpu); + uint8_t *blob_addr = lm ? + (uint8_t *)(long)kvm->arch.xen_hvm_config.blob_addr_64 : + (uint8_t *)(long)kvm->arch.xen_hvm_config.blob_addr_32; + uint8_t blob_size = lm ? + kvm->arch.xen_hvm_config.blob_size_64 : + kvm->arch.xen_hvm_config.blob_size_32; + uint32_t page_num = data & ~PAGEMASK; + uint64_t page_addr = data & PAGEMASK; + uint8_t *page; + int r; - maxcpus = kvmp->online_vcpus; - XXX_KVM_SYNC_PROBE; - for (ii = 0; ii < maxcpus; ii++) - kvm_unload_vcpu_mmu(kvmp->vcpus[ii]); + r = E2BIG; + if (page_num >= blob_size) + goto out; + r = ENOMEM; + page = kmem_alloc(PAGESIZE, KM_SLEEP); + r = EFAULT; + if (copyin(blob_addr + (page_num * PAGESIZE), page, PAGESIZE)) + goto out_free; + if (kvm_write_guest(kvm, page_addr, page, PAGESIZE)) + goto out_free; + r = 0; +out_free: + kmem_free(page, PAGESIZE); +out: + return (r); +} - for (ii = 0; ii < maxcpus; ii++) - kvm_arch_vcpu_free(kvmp->vcpus[ii]); - mutex_enter(&kvmp->lock); - for (ii = 0; ii < maxcpus; ii++) - kvmp->vcpus[ii] = NULL; - kvmp->online_vcpus = 0; - mutex_exit(&kvmp->lock); +static int +kvm_hv_hypercall_enabled(struct kvm *kvm) +{ + return (kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE); } -/* - * This function exists because of a difference in methodologies from our - * ancestor. With our ancestors, there is no imputus to clean up lists and - * mutexes. This is unfortunate, because they seem to even have debug kernels - * which would seemingly check for these kinds of things. But because in the - * common case mutex_exit is currently a #define to do {} while(0), it seems - * that they just ignore this. - * - * This leads to the following behavior: during our time we create a lot of - * auxillary structs potentially related to pits, apics, etc. Tearing down these - * structures relies on having the correct locks, etc. However - * kvm_arch_destroy_vm() is designed to be the final death blow, i.e. 
it's doing - * the kmem_free. Logically these auxillary structures need to be freed and - * dealt with before we go back and do the rest of the tear down related to the - * device. - */ -void -kvm_arch_destroy_vm_comps(struct kvm *kvmp) +static int +kvm_hv_msr_partition_wide(uint32_t msr) { - if (kvmp == NULL) + int r = 0; + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + case HV_X64_MSR_HYPERCALL: + r = 1; + break; + } -#ifdef IOMMU - kvm_iommu_unmap_guest(kvmp); + return (r); +} + +static int +set_msr_hyperv_pw(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data) +{ + struct kvm *kvm = vcpu->kvm; + + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + kvm->arch.hv_guest_os_id = data; + /* setting guest os id to zero disables hypercall page */ + if (!kvm->arch.hv_guest_os_id) + kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE; + break; + case HV_X64_MSR_HYPERCALL: { + uint64_t gfn; + unsigned long addr; + uint8_t instructions[4]; + + /* if guest os id is not set hypercall should remain disabled */ + if (!kvm->arch.hv_guest_os_id) + break; + if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) { + kvm->arch.hv_hypercall = data; + break; + } + gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT; + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return (1); + kvm_x86_ops->patch_hypercall(vcpu, instructions); + ((unsigned char *)instructions)[3] = 0xc3; /* ret */ + if (copyout(instructions, (caddr_t)addr, 4)) + return (1); + kvm->arch.hv_hypercall = data; + break; + } + default: + cmn_err(CE_WARN, "HYPER-V unimplemented wrmsr: 0x%x " + "data 0x%lx\n", msr, data); + return (1); + } + + return (0); +} + +static int +clear_user(void *addr, unsigned long size) +{ + caddr_t ka; + int rval = 0; + + ka = kmem_zalloc(size, KM_SLEEP); + rval = copyout(ka, addr, size); + kmem_free(ka, size); + + return (rval); +} + +static int +set_msr_hyperv(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data) +{ + switch (msr) { + case HV_X64_MSR_APIC_ASSIST_PAGE: { + unsigned long addr; + + if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) { + vcpu->arch.hv_vapic = data; + break; + } + + addr = gfn_to_hva(vcpu->kvm, + data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT); + + if (kvm_is_error_hva(addr)) + return (1); + + if (clear_user((void *)addr, PAGESIZE)) + return (1); + + vcpu->arch.hv_vapic = data; + break; + } + + case HV_X64_MSR_EOI: + return (kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data)); + case HV_X64_MSR_ICR: + return (kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data)); + case HV_X64_MSR_TPR: + return (kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data)); + + default: + cmn_err(CE_WARN, "HYPER-V unimplemented wrmsr: 0x%x " + "data 0x%lx\n", msr, data); + return (1); + } + + return (0); +} + +int +kvm_set_msr_common(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data) +{ + switch (msr) { + case MSR_EFER: + set_efer(vcpu, data); + break; + case MSR_K7_HWCR: + data &= ~(uint64_t)0x40; /* ignore flush filter disable */ + if (data != 0) { + cmn_err(CE_NOTE, + "unimplemented HWCR wrmsr: 0x%lx\n", data); + return (1); + } + break; + case MSR_FAM10H_MMIO_CONF_BASE: + if (data != 0) { + cmn_err(CE_NOTE, "unimplemented MMIO_CONF_BASE wrmsr: " + "0x%lx\n", data); + return (1); + } + break; + case MSR_AMD64_NB_CFG: + break; + case MSR_IA32_DEBUGCTLMSR: + if (!data) { + /* We support the non-activated case already */ + break; + } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) { + /* + * Values other than LBR and BTF are vendor-specific, + * thus reserved and should throw a #GP + */ + return (1); + } + 
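+		/*
+		 * Accepting (and merely logging) LBR/BTF writes here,
+		 * rather than injecting #GP, keeps guests that poke
+		 * DEBUGCTL during boot running; the bits themselves are
+		 * not virtualized.
+		 */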
cmn_err(CE_NOTE, "%s: MSR_IA32_DEBUGCTLMSR 0x%lx, nop\n", + __func__, data); + break; + case MSR_IA32_UCODE_REV: + case MSR_IA32_UCODE_WRITE: + case MSR_VM_HSAVE_PA: + case MSR_AMD64_PATCH_LOADER: + break; + case 0x200 ... 0x2ff: + return (set_msr_mtrr(vcpu, msr, data)); + case MSR_IA32_APICBASE: + kvm_set_apic_base(vcpu, data); + break; + case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: + return (kvm_x2apic_msr_write(vcpu, msr, data)); + case MSR_IA32_MISC_ENABLE: + vcpu->arch.ia32_misc_enable_msr = data; + break; + case MSR_KVM_WALL_CLOCK: + vcpu->kvm->arch.wall_clock = data; + kvm_write_wall_clock(vcpu->kvm, data); + break; + case MSR_KVM_SYSTEM_TIME: { +#ifdef XXX + if (vcpu->arch.time_page) { + kvm_release_page_dirty(vcpu->arch.time_page); + vcpu->arch.time_page = NULL; + } #else - XXX_KVM_PROBE; -#endif /* IOMMU */ - kvm_free_pit(kvmp); - kvm_free_vcpus(kvmp); - kvm_free_physmem(kvmp); + XXX_KVM_PROBE; +#endif + + vcpu->arch.time = data; + + /* we verify if the enable bit is set... */ + if (!(data & 1)) + break; + + /* ...but clean it before doing the actual write */ + vcpu->arch.time_offset = data & ~(PAGEOFFSET | 1); #ifdef XXX -#ifdef APIC - if (kvm->arch.apic_access_page) - put_page(kvm->arch.apic_access_page); - if (kvm->arch.ept_identity_pagetable) - put_page(kvm->arch.ept_identity_pagetable); -#endif /* APIC */ + vcpu->arch.time_page = + gfn_to_page(vcpu->kvm, data >> PAGESHIFT); + + if (is_error_page(vcpu->arch.time_page)) { + kvm_release_page_clean(vcpu->arch.time_page); + vcpu->arch.time_page = NULL; + } + + kvm_request_guest_time_update(vcpu); #else - XXX_KVM_PROBE; -#endif /* XXX */ -#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) - cleanup_srcu_struct(&kvm->srcu); -#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ + XXX_KVM_PROBE; +#endif + break; + } + case MSR_IA32_MCG_CTL: + case MSR_IA32_MCG_STATUS: + case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: + return (set_msr_mce(vcpu, msr, data)); + + /* + * Performance counters are not protected by a CPUID bit, so we should + * check all of them in the generic path for the sake of cross vendor + * migration. Writing a zero into the event select MSRs disables them, + * which we perfectly emulate ;-). Any other value should be at least + * reported, some guests depend on them. + */ + case MSR_P6_EVNTSEL0: + case MSR_P6_EVNTSEL1: + case MSR_K7_EVNTSEL0: + case MSR_K7_EVNTSEL1: + case MSR_K7_EVNTSEL2: + case MSR_K7_EVNTSEL3: + if (data != 0) + cmn_err(CE_NOTE, "unimplemented perfctr wrmsr: " + "0x%x data 0x%lx\n", msr, data); + break; + /* + * at least RHEL 4 unconditionally writes to the perfctr registers, + * so we ignore writes to make it happy. + */ + case MSR_P6_PERFCTR0: + case MSR_P6_PERFCTR1: + case MSR_K7_PERFCTR0: + case MSR_K7_PERFCTR1: + case MSR_K7_PERFCTR2: + case MSR_K7_PERFCTR3: + cmn_err(CE_NOTE, "unimplemented perfctr wrmsr: " + "0x%x data 0x%lx\n", msr, data); + break; + case HV_X64_MSR_GUEST_OS_ID ... 
HV_X64_MSR_SINT15: + if (kvm_hv_msr_partition_wide(msr)) { + int r; + mutex_enter(&vcpu->kvm->lock); + r = set_msr_hyperv_pw(vcpu, msr, data); + mutex_exit(&vcpu->kvm->lock); + return (r); + } else + return (set_msr_hyperv(vcpu, msr, data)); + break; + default: + if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) + return (xen_hvm_config(vcpu, data)); + if (!ignore_msrs) { + cmn_err(CE_NOTE, "unhandled wrmsr: 0x%x data %lx\n", + msr, data); + return (1); + } else { + cmn_err(CE_NOTE, "ignored wrmsr: 0x%x data %lx\n", + msr, data); + break; + } + } + + return (0); } -void -kvm_arch_destroy_vm(struct kvm *kvmp) +/* + * Reads an msr value (of 'msr_index') into 'pdata'. + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. + */ +int +kvm_get_msr(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t *pdata) { - if (kvmp == NULL) - return; /* nothing to do here */ + return (kvm_x86_ops->get_msr(vcpu, msr_index, pdata)); +} - if (kvmp->arch.aliases) { - kmem_free(kvmp->arch.aliases, sizeof (struct kvm_mem_aliases)); - kvmp->arch.aliases = NULL; +static int +get_msr_mtrr(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata) +{ + struct mtrr_state_type *state = &vcpu->arch.mtrr_state; + uint64_t *p = (uint64_t *)&state->fixed_ranges; + + if (!msr_mtrr_valid(msr)) + return (1); + + if (msr == MSR_MTRRdefType) + *pdata = vcpu->arch.mtrr_state.def_type + + (vcpu->arch.mtrr_state.enabled << 10); + else if (msr == MSR_MTRRfix64K_00000) + *pdata = p[0]; + else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) + *pdata = p[1 + msr - MSR_MTRRfix16K_80000]; + else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) + *pdata = p[3 + msr - MSR_MTRRfix4K_C0000]; + else if (msr == MSR_IA32_CR_PAT) + *pdata = vcpu->arch.pat; + else { /* Variable MTRRs */ + int idx, is_mtrr_mask; + uint64_t *pt; + + idx = (msr - 0x200) / 2; + is_mtrr_mask = msr - 0x200 - 2 * idx; + if (!is_mtrr_mask) { + pt = (uint64_t *)&state->var_ranges[idx].base_lo; + } else { + pt = (uint64_t *)&state->var_ranges[idx].mask_lo; + } + + *pdata = *pt; } - kmem_free(kvmp, sizeof (struct kvm)); + + return (0); } -#define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */ -#define MSR_IA32_FEATURE_CONTROL 0x0000003a +static int +get_msr_mce(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata) +{ + uint64_t data; + uint64_t mcg_cap = vcpu->arch.mcg_cap; + unsigned bank_num = mcg_cap & 0xff; + + switch (msr) { + case MSR_IA32_P5_MC_ADDR: + case MSR_IA32_P5_MC_TYPE: + data = 0; + break; + case MSR_IA32_MCG_CAP: + data = vcpu->arch.mcg_cap; + break; + case MSR_IA32_MCG_CTL: + if (!(mcg_cap & MCG_CTL_P)) + return (1); + data = vcpu->arch.mcg_ctl; + break; + case MSR_IA32_MCG_STATUS: + data = vcpu->arch.mcg_status; + break; + default: + if (msr >= MSR_IA32_MC0_CTL && + msr < MSR_IA32_MC0_CTL + 4 * bank_num) { + uint32_t offset = msr - MSR_IA32_MC0_CTL; + data = vcpu->arch.mce_banks[offset]; + break; + } + return (1); + } + *pdata = data; + return (0); +} -#define FEATURE_CONTROL_LOCKED (1<<0) -#define FEATURE_CONTROL_VMXON_ENABLED (1<<2) +static int +get_msr_hyperv_pw(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata) +{ + uint64_t data = 0; + struct kvm *kvm = vcpu->kvm; -#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30" + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + data = kvm->arch.hv_guest_os_id; + break; + case HV_X64_MSR_HYPERCALL: + data = kvm->arch.hv_hypercall; + break; + default: + cmn_err(CE_WARN, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + return (1); + } -void 
-kvm_shared_msr_cpu_online(void) + *pdata = data; + + return (0); +} + +static int +get_msr_hyperv(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata) { - unsigned i; + uint64_t data = 0; + + switch (msr) { + case HV_X64_MSR_VP_INDEX: { + int r; + struct kvm_vcpu *v; + kvm_for_each_vcpu(r, v, vcpu->kvm) + if (v == vcpu) + data = r; + break; + } + case HV_X64_MSR_EOI: + return (kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata)); + case HV_X64_MSR_ICR: + return (kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata)); + case HV_X64_MSR_TPR: + return (kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata)); + default: + cmn_err(CE_WARN, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + return (1); + } - for (i = 0; i < shared_msrs_global.nr; i++) - shared_msr_update(i, shared_msrs_global.msrs[i]); + *pdata = data; + return (0); } int -kvm_arch_hardware_enable(void *garbage) +kvm_get_msr_common(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata) +{ + uint64_t data; + + switch (msr) { + case MSR_IA32_PLATFORM_ID: + case MSR_IA32_UCODE_REV: + case MSR_IA32_EBL_CR_POWERON: + case MSR_IA32_DEBUGCTLMSR: + case MSR_IA32_LASTBRANCHFROMIP: + case MSR_IA32_LASTBRANCHTOIP: + case MSR_IA32_LASTINTFROMIP: + case MSR_IA32_LASTINTTOIP: + case MSR_K8_SYSCFG: + case MSR_K7_HWCR: + case MSR_VM_HSAVE_PA: + case MSR_P6_PERFCTR0: + case MSR_P6_PERFCTR1: + case MSR_P6_EVNTSEL0: + case MSR_P6_EVNTSEL1: + case MSR_K7_EVNTSEL0: + case MSR_K7_PERFCTR0: + case MSR_K8_INT_PENDING_MSG: + case MSR_AMD64_NB_CFG: + case MSR_FAM10H_MMIO_CONF_BASE: + data = 0; + break; + case MSR_MTRRcap: + data = 0x500 | KVM_NR_VAR_MTRR; + break; + case 0x200 ... 0x2ff: + return (get_msr_mtrr(vcpu, msr, pdata)); + case 0xcd: /* fsb frequency */ + data = 3; + break; + case MSR_IA32_APICBASE: + data = kvm_get_apic_base(vcpu); + break; + case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: + return (kvm_x2apic_msr_read(vcpu, msr, pdata)); + break; + case MSR_IA32_MISC_ENABLE: + data = vcpu->arch.ia32_misc_enable_msr; + break; + case MSR_IA32_PERF_STATUS: + /* TSC increment by tick */ + data = 1000ULL; + /* CPU multiplier */ + data |= (((uint64_t)4ULL) << 40); + break; + case MSR_EFER: + data = vcpu->arch.efer; + break; + case MSR_KVM_WALL_CLOCK: + data = vcpu->kvm->arch.wall_clock; + break; + case MSR_KVM_SYSTEM_TIME: + data = vcpu->arch.time; + break; + case MSR_IA32_P5_MC_ADDR: + case MSR_IA32_P5_MC_TYPE: + case MSR_IA32_MCG_CAP: + case MSR_IA32_MCG_CTL: + case MSR_IA32_MCG_STATUS: + case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: + return (get_msr_mce(vcpu, msr, pdata)); + case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + if (kvm_hv_msr_partition_wide(msr)) { + int r; + mutex_enter(&vcpu->kvm->lock); + r = get_msr_hyperv_pw(vcpu, msr, pdata); + mutex_exit(&vcpu->kvm->lock); + return (r); + } else + return (get_msr_hyperv(vcpu, msr, pdata)); + break; + default: + if (!ignore_msrs) { + cmn_err(CE_NOTE, "unhandled rdmsr: 0x%x\n", msr); + return (1); + } else { + cmn_err(CE_NOTE, "ignored rdmsr: 0x%x\n", msr); + data = 0; + } + break; + } + *pdata = data; + + return (0); +} + +/* + * Read or write a bunch of msrs. All parameters are kernel addresses. + * + * @return number of msrs set successfully. + */ +static int +__msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, + struct kvm_msr_entry *entries, int (*do_msr)(struct kvm_vcpu *vcpu, + unsigned index, uint64_t *data)) { + int i, idx; + + vcpu_load(vcpu); + #ifdef XXX - /* - * Since this may be called from a hotplug notifcation, - * we can't get the CPU frequency directly. 
- */ - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { - int cpu = raw_smp_processor_id(); - per_cpu(cpu_tsc_khz, cpu) = 0; + idx = srcu_read_lock(&vcpu->kvm->srcu); +#else + XXX_KVM_SYNC_PROBE; +#endif + for (i = 0; i < msrs->nmsrs; i++) { + if (do_msr(vcpu, entries[i].index, &entries[i].data)) + break; } + +#ifdef XXX + srcu_read_unlock(&vcpu->kvm->srcu, idx); #else - XXX_KVM_PROBE; + XXX_KVM_SYNC_PROBE; #endif - kvm_shared_msr_cpu_online(); + vcpu_put(vcpu); - return (kvm_x86_ops->hardware_enable(garbage)); + return (i); } -void -kvm_arch_hardware_disable(void *garbage) +int +kvm_vcpu_ioctl_get_msrs(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, int *rv) { - kvm_x86_ops->hardware_disable(garbage); -#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) - drop_user_return_notifiers(garbage); -#endif + int r; + + if (msrs->nmsrs >= MAX_IO_MSRS) + return (-E2BIG); + + if ((r = __msr_io(vcpu, msrs, msrs->entries, kvm_get_msr)) < 0) + return (r); + + *rv = r; + + return (0); } -static inline int -iommu_found(void) +int +kvm_vcpu_ioctl_set_msrs(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, int *rv) { + int r; + + if (msrs->nmsrs >= MAX_IO_MSRS) + return (-E2BIG); + + if ((r = __msr_io(vcpu, msrs, msrs->entries, do_set_msr)) < 0) + return (-EINVAL); + + *rv = r; + return (0); } @@ -374,7 +1615,7 @@ kvm_dev_ioctl_check_extension(long ext, int *rval_p) r = EINVAL; break; case KVM_CAP_IOMMU: - *rval_p = iommu_found(); + *rval_p = 0; r = DDI_SUCCESS; break; case KVM_CAP_MCE: @@ -389,598 +1630,3166 @@ kvm_dev_ioctl_check_extension(long ext, int *rval_p) return (r); } -static inline int -apic_x2apic_mode(struct kvm_lapic *apic) +/* XXX Some part of kvm_ioctl goes here? */ + +void +kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - return (apic->vcpu->arch.apic_base & X2APIC_ENABLE); + kvm_x86_ops->vcpu_load(vcpu, cpu); +#ifdef XXX + if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) { + unsigned long khz = cpufreq_quick_get(cpu); + if (!khz) + khz = tsc_khz; + per_cpu(cpu_tsc_khz, cpu) = khz; + } +#else + XXX_KVM_PROBE; +#endif + kvm_request_guest_time_update(vcpu); } void -kvm_inject_nmi(struct kvm_vcpu *vcpu) +kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { - vcpu->arch.nmi_pending = 1; + kvm_put_guest_fpu(vcpu); + + kvm_x86_ops->vcpu_put(vcpu); +} + +static int +is_efer_nx(void) +{ + unsigned long long efer = 0; + + rdmsrl_safe(MSR_EFER, &efer); + return (efer & EFER_NX); } int -kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) +kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid) +{ + if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) + return (E2BIG); + + bcopy(cpuid->entries, vcpu->arch.cpuid_entries, + cpuid->nent * sizeof (struct kvm_cpuid_entry2)); + + vcpu_load(vcpu); + vcpu->arch.cpuid_nent = cpuid->nent; + kvm_apic_set_version(vcpu); + kvm_x86_ops->cpuid_update(vcpu); + vcpu_put(vcpu); + + return (0); +} + +int +kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid) { - page_t *page; - struct kvm *kvm; int r; + struct kvm_cpuid_entry2 *entries = cpuid->entries; - kvm = vcpu->kvm; + cpuid->nent = vcpu->arch.cpuid_nent; - vcpu->arch.mmu.root_hpa = INVALID_PAGE; + if (cpuid->nent < vcpu->arch.cpuid_nent) + return (E2BIG); - if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; - else - vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; + bcopy(&vcpu->arch.cpuid_entries, cpuid->entries, + vcpu->arch.cpuid_nent * sizeof (struct kvm_cpuid_entry2)); + return (0); +} + +static inline void native_cpuid(unsigned int 
*eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + /* ecx is often an input as well as an output. */ + __asm__ volatile("cpuid" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx)); +} + +#define __cpuid native_cpuid + +/* Some CPUID calls want 'count' to be placed in ecx */ +static inline void +cpuid_count(unsigned int op, int count, unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + *eax = op; + *ecx = count; + __cpuid(eax, ebx, ecx, edx); +} + +static void +do_cpuid_1_ent(kvm_cpuid_entry2_t *entry, uint32_t function, uint32_t index) +{ + entry->function = function; + entry->index = index; + cpuid_count(entry->function, entry->index, + &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); + entry->flags = 0; +} + + +#define F(x) bit(X86_FEATURE_##x) + +static void +do_cpuid_ent(struct kvm_cpuid_entry2 *entry, uint32_t function, + uint32_t index, int *nent, int maxnent) +{ + unsigned int ddic; + unsigned f_nx = is_efer_nx() ? F(NX) : 0; +#ifdef CONFIG_X86_64 + unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL) + ? F(GBPAGES) : 0; + unsigned f_lm = F(LM); +#else + unsigned f_gbpages = 0; + unsigned f_lm = 0; +#endif + unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; + + /* cpuid 1.edx */ + const uint32_t kvm_supported_word0_x86_features = + F(FPU) | F(VME) | F(DE) | F(PSE) | + F(TSC) | F(MSR) | F(PAE) | F(MCE) | + F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | + F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | + F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) | + 0 /* Reserved, DS, ACPI */ | F(MMX) | + F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | + 0 /* HTT, TM, Reserved, PBE */; + /* cpuid 0x80000001.edx */ + const uint32_t kvm_supported_word1_x86_features = + F(FPU) | F(VME) | F(DE) | F(PSE) | + F(TSC) | F(MSR) | F(PAE) | F(MCE) | + F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | + F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | + F(PAT) | F(PSE36) | 0 /* Reserved */ | + f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | + F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp | + 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); + /* cpuid 1.ecx */ + const uint32_t kvm_supported_word4_x86_features = + F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ | + 0 /* DS-CPL, VMX, SMX, EST */ | + 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | + 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | + 0 /* Reserved, DCA */ | F(XMM4_1) | + F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | + 0 /* Reserved, XSAVE, OSXSAVE */; + /* cpuid 0x80000001.ecx */ + const uint32_t kvm_supported_word6_x86_features = + F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | + F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | + F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) | + 0 /* SKINIT */ | 0 /* WDT */; + + /* all calls to cpuid_count() should be made on the same cpu */ + /* XXX - right now, system panics at ddi_exit_critical() */ + /* XXX - to run everything on same cpu, bind qemu at startup */ + + kpreempt_disable(); + + do_cpuid_1_ent(entry, function, index); + ++*nent; + + switch (function) { + case 0: + entry->eax = min(entry->eax, (uint32_t)0xb); + break; + case 1: + entry->edx &= kvm_supported_word0_x86_features; + entry->ecx &= kvm_supported_word4_x86_features; + /* + * we support x2apic emulation even if host does not support + * it since we emulate x2apic in software + */ + entry->ecx |= F(X2APIC); + break; /* - * page = alloc_page(PAGESIZE, KM_SLEEP); - * if (!page) { - * r = ENOMEM; - 
* goto fail; - * } - * vcpu->arch.pio_data = page_address(page); + * function 2 entries are STATEFUL. That is, repeated cpuid commands + * may return different values. This forces us to get_cpu() before + * issuing the first command, and also to emulate this annoying behavior + * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */ - vcpu->arch.pio_data = (caddr_t)vcpu->run + - (KVM_PIO_PAGE_OFFSET * PAGESIZE); + case 2: { + int t, times = entry->eax & 0xff; + + entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; + entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; + for (t = 1; t < times && *nent < maxnent; ++t) { + do_cpuid_1_ent(&entry[t], function, 0); + entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; + ++*nent; + } + break; + } + /* function 4 and 0xb have additional index. */ + case 4: { + int i, cache_type; + + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + /* read more entries until cache_type is zero */ + for (i = 1; *nent < maxnent; ++i) { + cache_type = entry[i - 1].eax & 0x1f; + if (!cache_type) + break; + do_cpuid_1_ent(&entry[i], function, i); + entry[i].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + ++*nent; + } + break; + } + case 0xb: { + int i, level_type; + + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + /* read more entries until level_type is zero */ + for (i = 1; *nent < maxnent; ++i) { + level_type = entry[i - 1].ecx & 0xff00; + if (!level_type) + break; + do_cpuid_1_ent(&entry[i], function, i); + entry[i].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + ++*nent; + } + break; + } + case 0x80000000: + entry->eax = min(entry->eax, 0x8000001a); + break; + case 0x80000001: + entry->edx &= kvm_supported_word1_x86_features; + entry->ecx &= kvm_supported_word6_x86_features; + break; + } + /* + * XXX - see comment above for ddi_enter_critical() + * + * ddi_exit_critical(ddic); + */ + kpreempt_enable(); +} - r = kvm_mmu_create(vcpu); - if (r < 0) - goto fail; +#undef F - if (irqchip_in_kernel(kvm)) { - r = kvm_create_lapic(vcpu); - if (r < 0) - goto fail_mmu_destroy; - } +int +kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 *entries) +{ + struct kvm_cpuid_entry2 *cpuid_entries; + int limit, nent = 0, r = E2BIG; + uint32_t func; + int allocsize = 0; - vcpu->arch.mce_banks = kmem_zalloc(KVM_MAX_MCE_BANKS * - sizeof (uint64_t) * 4, KM_SLEEP); + if (cpuid->nent < 1) + goto out; + if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) + cpuid->nent = KVM_MAX_CPUID_ENTRIES; + r = ENOMEM; + allocsize = sizeof (struct kvm_cpuid_entry2) * cpuid->nent; + cpuid_entries = kmem_zalloc(allocsize, KM_SLEEP); + + do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent); + limit = cpuid_entries[0].eax; + for (func = 1; func <= limit && nent < cpuid->nent; ++func) + do_cpuid_ent(&cpuid_entries[nent], func, 0, &nent, cpuid->nent); + + r = E2BIG; + if (nent >= cpuid->nent) + goto out_free; + + do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent); + limit = cpuid_entries[nent - 1].eax; + for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) + do_cpuid_ent(&cpuid_entries[nent], func, 0, &nent, cpuid->nent); + r = E2BIG; + if (nent >= cpuid->nent) + goto out_free; + + r = EFAULT; + if (copyout(cpuid_entries, entries, + nent * sizeof (kvm_cpuid_entry2_t))) + goto out_free; + + cpuid->nent = nent; + r = 0; + +out_free: + kmem_free(cpuid_entries, allocsize); +out: + return (r); +} - if (!vcpu->arch.mce_banks) { - r = ENOMEM; - goto fail_free_lapic; - } +int +kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) +{ + 
vcpu_load(vcpu); + bcopy(vcpu->arch.apic->regs, s->regs, sizeof (*s)); + vcpu_put(vcpu); - vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; + return (0); +} + +int +kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) +{ + vcpu_load(vcpu); + bcopy(s->regs, vcpu->arch.apic->regs, sizeof (*s)); + kvm_apic_post_state_restore(vcpu); + update_cr8_intercept(vcpu); + vcpu_put(vcpu); return (0); -fail_free_lapic: - kvm_free_lapic(vcpu); -fail_mmu_destroy: - kvm_mmu_destroy(vcpu); -fail: - return (r); } -void -kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) +int +kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) { - kmem_free(vcpu->arch.mce_banks, sizeof (uint64_t) * 4 * - KVM_MAX_MCE_BANKS); - kvm_free_lapic(vcpu); - kvm_mmu_destroy(vcpu); + if (irq->irq < 0 || irq->irq >= 256) + return (-EINVAL); + + if (irqchip_in_kernel(vcpu->kvm)) + return (-ENXIO); + + vcpu_load(vcpu); + + kvm_queue_interrupt(vcpu, irq->irq, 0); + + vcpu_put(vcpu); + + return (0); +} + +int +kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, uint64_t *mcg_capp) +{ + int rval; + uint64_t mcg_cap = *mcg_capp; + unsigned bank_num = mcg_cap & 0xff, bank; + + rval = -EINVAL; + if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) + goto out; + if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000)) + goto out; + rval = 0; + vcpu->arch.mcg_cap = mcg_cap; + /* Init IA32_MCG_CTL to all 1s */ + if (mcg_cap & MCG_CTL_P) + vcpu->arch.mcg_ctl = ~(uint64_t)0; + /* Init IA32_MCi_CTL to all 1s */ + for (bank = 0; bank < bank_num; bank++) + vcpu->arch.mce_banks[bank*4] = ~(uint64_t)0; +out: + return (rval); } +int +kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events) +{ + vcpu_load(vcpu); + + events->exception.injected = vcpu->arch.exception.pending; + events->exception.nr = vcpu->arch.exception.nr; + events->exception.has_error_code = vcpu->arch.exception.has_error_code; + events->exception.error_code = vcpu->arch.exception.error_code; + + events->interrupt.injected = vcpu->arch.interrupt.pending; + events->interrupt.nr = vcpu->arch.interrupt.nr; + events->interrupt.soft = vcpu->arch.interrupt.soft; + + events->nmi.injected = vcpu->arch.nmi_injected; + events->nmi.pending = vcpu->arch.nmi_pending; + events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu); + + events->sipi_vector = vcpu->arch.sipi_vector; + + events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING | + KVM_VCPUEVENT_VALID_SIPI_VECTOR); + + vcpu_put(vcpu); + + return (0); +} int -kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) +kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events) { - int r; + if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING | + KVM_VCPUEVENT_VALID_SIPI_VECTOR)) + return (-EINVAL); + + vcpu_load(vcpu); + + vcpu->arch.exception.pending = events->exception.injected; + vcpu->arch.exception.nr = events->exception.nr; + vcpu->arch.exception.has_error_code = events->exception.has_error_code; + vcpu->arch.exception.error_code = events->exception.error_code; + + vcpu->arch.interrupt.pending = events->interrupt.injected; + vcpu->arch.interrupt.nr = events->interrupt.nr; + vcpu->arch.interrupt.soft = events->interrupt.soft; - mutex_init(&vcpu->mutex, NULL, MUTEX_DRIVER, 0); - vcpu->cpu = -1; - vcpu->kvm = kvm; - vcpu->vcpu_id = id; + if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm)) + kvm_pic_clear_isr_ack(vcpu->kvm); + + vcpu->arch.nmi_injected = events->nmi.injected; + + if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) + 
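+		/*
+		 * Note that only the pending-NMI and SIPI-vector fields
+		 * are gated by events->flags; the exception and
+		 * interrupt state above is always taken from userland.
+		 */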
vcpu->arch.nmi_pending = events->nmi.pending; + + kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked); + + if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) + vcpu->arch.sipi_vector = events->sipi_vector; + + vcpu_put(vcpu); + + return (0); +} + +int +kvm_vm_ioctl_set_tss_addr(struct kvm *kvmp, caddr_t addr) +{ + /* + * XXX They have some other code here to check the validity of the + * address + */ + return (kvm_x86_ops->set_tss_addr(kvmp, addr)); +} + +int +kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, uint64_t ident_addr) +{ + kvm->arch.ept_identity_map_addr = ident_addr; + return (0); +} + +gfn_t +unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn) +{ + int i; + struct kvm_mem_alias *alias; + struct kvm_mem_aliases *aliases; #ifdef XXX - init_waitqueue_head(&vcpu->wq); + aliases = rcu_dereference(kvm->arch.aliases); #else - XXX_KVM_PROBE; + XXX_KVM_SYNC_PROBE; + aliases = kvm->arch.aliases; #endif - vcpu->run = ddi_umem_alloc(PAGESIZE * 2, DDI_UMEM_SLEEP, &vcpu->cookie); - r = kvm_arch_vcpu_init(vcpu); + for (i = 0; i < aliases->naliases; i++) { + alias = &aliases->aliases[i]; + if (alias->flags & KVM_ALIAS_INVALID) + continue; + if (gfn >= alias->base_gfn && + gfn < alias->base_gfn + alias->npages) + return (alias->target_gfn + gfn - alias->base_gfn); + } - if (r != 0) { - vcpu->run = NULL; - ddi_umem_free(vcpu->cookie); - return (r); + return (gfn); +} + +gfn_t +unalias_gfn(struct kvm *kvm, gfn_t gfn) +{ + int i; + struct kvm_mem_alias *alias; + struct kvm_mem_aliases *aliases; + + /* XXX need protection */ + aliases = kvm->arch.aliases; + + for (i = 0; i < aliases->naliases; ++i) { + alias = &aliases->aliases[i]; + if (gfn >= alias->base_gfn && + gfn < alias->base_gfn + alias->npages) + return (alias->target_gfn + gfn - alias->base_gfn); } + return (gfn); +} + +int +kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) +{ + int r; + + r = 0; + switch (chip->chip_id) { + case KVM_IRQCHIP_PIC_MASTER: + memcpy(&chip->chip.pic, &pic_irqchip(kvm)->pics[0], + sizeof (struct kvm_pic_state)); + break; + case KVM_IRQCHIP_PIC_SLAVE: + memcpy(&chip->chip.pic, &pic_irqchip(kvm)->pics[1], + sizeof (struct kvm_pic_state)); + break; + case KVM_IRQCHIP_IOAPIC: + r = kvm_get_ioapic(kvm, &chip->chip.ioapic); + break; + default: + r = EINVAL; + break; + } + + return (r); +} + +int +kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) +{ + int r; + + r = 0; + + switch (chip->chip_id) { + case KVM_IRQCHIP_PIC_MASTER: + mutex_enter(&pic_irqchip(kvm)->lock); + memcpy(&pic_irqchip(kvm)->pics[0], &chip->chip.pic, + sizeof (struct kvm_pic_state)); + mutex_exit(&pic_irqchip(kvm)->lock); + break; + case KVM_IRQCHIP_PIC_SLAVE: + mutex_enter(&pic_irqchip(kvm)->lock); + memcpy(&pic_irqchip(kvm)->pics[1], &chip->chip.pic, + sizeof (struct kvm_pic_state)); + mutex_exit(&pic_irqchip(kvm)->lock); + break; + case KVM_IRQCHIP_IOAPIC: + r = kvm_set_ioapic(kvm, &chip->chip.ioapic); + break; + default: + r = EINVAL; + break; + } + + kvm_pic_update_irq(pic_irqchip(kvm)); + + return (r); +} + +int +kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) +{ + struct kvm_pit *vpit = kvm->arch.vpit; + + mutex_enter(&vpit->pit_state.lock); + memcpy(ps->channels, &vpit->pit_state.channels, sizeof (ps->channels)); + ps->flags = vpit->pit_state.flags; + mutex_exit(&vpit->pit_state.lock); + + return (0); +} + +int +kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) +{ + boolean_t prev_legacy, cur_legacy, start = B_FALSE; + struct kvm_pit *vpit = kvm->arch.vpit; + + 
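+	/*
+	 * Channel 0 is restarted only when HPET legacy routing is being
+	 * switched on (prev_legacy clear, cur_legacy set); otherwise the
+	 * restored count is loaded without re-arming the timer.
+	 */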
mutex_enter(&vpit->pit_state.lock); + prev_legacy = vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; + cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY; + + if (!prev_legacy && cur_legacy) + start = B_TRUE; + + memcpy(&vpit->pit_state.channels, &ps->channels, + sizeof (vpit->pit_state.channels)); + + vpit->pit_state.flags = ps->flags; + kvm_pit_load_count(kvm, 0, vpit->pit_state.channels[0].count, start); + + mutex_exit(&vpit->pit_state.lock); + + return (0); +} + +/* TODO: As Pascal would say, we can do better */ +int +kvm_vm_ioctl_get_msr_index_list(struct kvm *kvm, uintptr_t arg) +{ + + struct kvm_msr_list *user_msr_list = (struct kvm_msr_list *)arg; + struct kvm_msr_list *msr_list; + size_t sz = sizeof (struct kvm_msr_list); + unsigned n; + + msr_list = kmem_zalloc(sz, KM_SLEEP); + + if (copyin(user_msr_list, msr_list, sz) != 0) { + kmem_free(msr_list, sz); + return (EFAULT); + } + + n = msr_list->nmsrs; + msr_list->nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); + + if (copyout(msr_list, user_msr_list, sz) != 0) { + kmem_free(msr_list, sz); + return (EFAULT); + } + + if (n < msr_list->nmsrs) { + kmem_free(msr_list, sz); + return (E2BIG); + } + + if (copyout(&msrs_to_save, user_msr_list->indices, + num_msrs_to_save * sizeof (uint32_t))) { + kmem_free(msr_list, sz); + return (EFAULT); + } + + if (copyout(&emulated_msrs, user_msr_list->indices + + num_msrs_to_save, ARRAY_SIZE(emulated_msrs) * + sizeof (uint32_t)) != 0) { + kmem_free(msr_list, sz); + return (EFAULT); + } + + kmem_free(msr_list, sz); return (0); } /* - * For pages for which vmx needs physical addresses, - * linux allocates pages from an area that maps virtual - * addresses 1-1 with physical memory. In this way, - * translating virtual to physical just involves subtracting - * the start of the area from the virtual address. - * This solaris version uses kmem_alloc, so there is no - * direct mapping of virtual to physical. We'll change this - * later if performance is an issue. For now, we'll use - * hat_getpfnum() to do the conversion. Also note that - * we're assuming 64-bit address space (we won't run on - * 32-bit hardware). + * Get (and clear) the dirty memory log for a memory slot. */ -uint64_t -kvm_va2pa(caddr_t va) +int +kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { - uint64_t pa; + int r, i; + struct kvm_memory_slot *memslot; + unsigned long n; + unsigned long is_dirty = 0; + unsigned long *dirty_bitmap = NULL; - pa = (hat_getpfnum(kas.a_hat, va)<<PAGESHIFT)|((uint64_t)va&PAGEOFFSET); - return (pa); -} + mutex_enter(&kvm->slots_lock); -#ifdef XXX_KVM_DECLARATION -unsigned long *vmx_io_bitmap_a; -unsigned long *vmx_io_bitmap_b; -unsigned long *vmx_msr_bitmap_legacy; -unsigned long *vmx_msr_bitmap_longmode; + r = EINVAL; + if (log->slot >= KVM_MEMORY_SLOTS) + goto out; + + memslot = &kvm->memslots->memslots[log->slot]; + r = ENOENT; + if (!memslot->dirty_bitmap) + goto out; + + n = kvm_dirty_bitmap_bytes(memslot); + + dirty_bitmap = kmem_alloc(n, KM_SLEEP); + memset(dirty_bitmap, 0, n); + + for (i = 0; !is_dirty && i < n / sizeof (long); i++) + is_dirty = memslot->dirty_bitmap[i]; + + /* If nothing is dirty, don't bother messing with page tables. 
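+	 * Otherwise a zeroed replacement bitmap is published in a fresh
+	 * memslots copy, and the old bitmap, which holds the dirty bits,
+	 * is what gets copied out to userland and freed.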
*/ + if (is_dirty) { + struct kvm_memslots *slots, *old_slots; + + mutex_enter(&kvm->mmu_lock); + kvm_mmu_slot_remove_write_access(kvm, log->slot); + mutex_exit(&kvm->mmu_lock); + + slots = kmem_zalloc(sizeof (struct kvm_memslots), KM_SLEEP); + if (!slots) + goto out_free; + + memcpy(slots, kvm->memslots, sizeof (struct kvm_memslots)); + slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; + + old_slots = kvm->memslots; +#ifdef XXX + rcu_assign_pointer(kvm->memslots, slots); + kvm_synchronize_srcu_expedited(&kvm->srcu); #else -/* make these arrays to try to force into low 4GB memory... */ -/* also need to be aligned... */ -__attribute__((__aligned__(PAGESIZE)))unsigned long - vmx_io_bitmap_a[PAGESIZE / sizeof (unsigned long)]; -__attribute__((__aligned__(PAGESIZE)))unsigned long - vmx_io_bitmap_b[PAGESIZE / sizeof (unsigned long)]; -__attribute__((__aligned__(PAGESIZE)))unsigned long - vmx_msr_bitmap_legacy[PAGESIZE / sizeof (unsigned long)]; -__attribute__((__aligned__(PAGESIZE)))unsigned long - vmx_msr_bitmap_longmode[PAGESIZE / sizeof (unsigned long)]; + kvm->memslots = slots; + XXX_KVM_SYNC_PROBE; #endif + dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; + kmem_free(old_slots, sizeof (struct kvm_memslots)); + } -struct kvm_vcpu * -kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) -{ - char buf[32]; - struct kvm_vcpu *vcpu; - kstat_t *kstat; + r = 0; + if (copyout(dirty_bitmap, log->v.dirty_bitmap, n) != 0) + r = EFAULT; +out_free: + kmem_free(dirty_bitmap, n); +out: + mutex_exit(&kvm->slots_lock); + return (r); +} - (void) snprintf(buf, sizeof (buf), "vcpu-%d", kvm->kvmid); +/* XXX kvm_arch_vm_ioctl */ - if ((kstat = kstat_create("kvm", id, buf, "misc", KSTAT_TYPE_NAMED, - sizeof (kvm_vcpu_stats_t) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL)) == NULL) { - return (NULL); +static void +kvm_init_msr_list(void) +{ + uint32_t dummy[2]; + unsigned i, j; + + /* skip the first msrs in the list. KVM-specific */ + for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) { + if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) + continue; + if (j < i) + msrs_to_save[j] = msrs_to_save[i]; + j++; } + num_msrs_to_save = j; +} - vcpu = kvm_x86_ops->vcpu_create(kvm, id); +static int +vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, const void *v) +{ + if (vcpu->arch.apic && + !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) + return (0); - if (vcpu == NULL) { - kstat_delete(kstat); - return (NULL); - } + return (kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v)); +} - vcpu->kvcpu_kstat = kstat; - vcpu->kvcpu_kstat->ks_data = &vcpu->kvcpu_stats; +static int +vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) +{ + if (vcpu->arch.apic && + !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) + return (0); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_id, "id"); - vcpu->kvcpu_stats.kvmvs_id.value.ui64 = kvm->kvmid; + return (kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v)); +} - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_id, "pid"); - vcpu->kvcpu_stats.kvmvs_id.value.ui64 = kvm->kvm_pid; +gpa_t +kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, uint32_t *error) +{ + uint32_t access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? 
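+	    /*
+	     * The PFERR_* bits select which permission check the
+	     * software walker (arch.mmu.gva_to_gpa) applies: USER when
+	     * the guest is at CPL 3, with WRITE or FETCH or'ed in by
+	     * the sibling helpers.
+	     */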
+ PFERR_USER_MASK : 0; - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_nmi_injections, "nmi-injections"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_irq_injections, "irq-injections"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_fpu_reload, "fpu-reload"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_host_state_reload, "host-state-reload"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_insn_emulation, "insn-emulation"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_insn_emulation_fail, - "inst-emulation-fail"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_exits, "exits"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_halt_exits, "halt-exits"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_irq_exits, "irq-exits"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_io_exits, "io-exits"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_mmio_exits, "mmio-exits"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_nmi_window_exits, "nmi-window-exits"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_irq_window_exits, "irq-window-exits"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_request_irq_exits, "request-irq-exits"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_signal_exits, "signal-exits"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_halt_wakeup, "halt-wakeup"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_invlpg, "invlpg"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_pf_guest, "pf-guest"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_pf_fixed, "pf-fixed"); - KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_hypercalls, "hypercalls"); + return (vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error)); +} - kstat_install(vcpu->kvcpu_kstat); +gpa_t +kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, uint32_t *error) +{ + uint32_t access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? + PFERR_USER_MASK : 0; - return (vcpu); + access |= PFERR_WRITE_MASK; + + return (vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error)); } -void -kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) +static int +kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, uint32_t access, uint32_t *error) { - if (vcpu->arch.time_page) { - /* XXX We aren't doing anything with the time page */ - XXX_KVM_PROBE; - vcpu->arch.time_page = NULL; + uintptr_t data = (uintptr_t)val; + int r = 0; /* X86EMUL_CONTINUE */ + + while (bytes) { + gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, + access, error); + unsigned offset = addr & (PAGESIZE-1); + unsigned toread = min(bytes, (unsigned)PAGESIZE - offset); + int ret; + + if (gpa == UNMAPPED_GVA) { + r = 1; /* X86EMUL_PROPAGATE_FAULT */ + goto out; + } + ret = kvm_read_guest(vcpu->kvm, gpa, (void *)data, toread); + if (ret < 0) { + r = 1; /* X86EMUL_UNHANDLEABLE */ + goto out; + } + + bytes -= toread; + data += toread; + addr += toread; } +out: + return (r); +} - if (vcpu->kvcpu_kstat != NULL) - kstat_delete(vcpu->kvcpu_kstat); +/* used for instruction fetching */ +static int +kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, uint32_t *error) +{ + uint32_t access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? + PFERR_USER_MASK : 0; - kvm_x86_ops->vcpu_free(vcpu); + return (kvm_read_guest_virt_helper(addr, val, bytes, vcpu, + access | PFERR_FETCH_MASK, error)); } +static int +kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, uint32_t *error) +{ + uint32_t access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? 
+ PFERR_USER_MASK : 0; -uint64_t -kvm_get_apic_base(struct kvm_vcpu *vcpu) + return (kvm_read_guest_virt_helper(addr, val, + bytes, vcpu, access, error)); +} + +static int +kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, uint32_t *error) { - if (irqchip_in_kernel(vcpu->kvm)) - return (vcpu->arch.apic_base); - else - return (vcpu->arch.apic_base); + return (kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error)); } -void -kvm_set_apic_base(struct kvm_vcpu *vcpu, uint64_t data) +static int +kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, uint32_t *error) { - /* TODO: reserve bits check */ - if (irqchip_in_kernel(vcpu->kvm)) - kvm_lapic_set_base(vcpu, data); - else - vcpu->arch.apic_base = data; + uintptr_t data = (uintptr_t)val; + + while (bytes) { + gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error); + unsigned offset = addr & (PAGESIZE-1); + unsigned towrite = min(bytes, (unsigned)PAGESIZE - offset); + int ret; + + if (gpa == UNMAPPED_GVA) + return (X86EMUL_PROPAGATE_FAULT); + + if (kvm_write_guest(vcpu->kvm, gpa, (void *)data, towrite) < 0) + return (X86EMUL_UNHANDLEABLE); + + bytes -= towrite; + data += towrite; + addr += towrite; + } + + return (0); } -void -kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) +static int +emulator_read_emulated(unsigned long addr, void *val, + unsigned int bytes, struct kvm_vcpu *vcpu) { - if (cr8 & CR8_RESERVED_BITS) { - kvm_inject_gp(vcpu, 0); - return; + gpa_t gpa; + uint32_t error_code; + + if (vcpu->mmio_read_completed) { + memcpy(val, vcpu->mmio_data, bytes); + KVM_TRACE3(mmio__read, unsigned int, bytes, uintptr_t, + vcpu->mmio_phys_addr, uint64_t, *(uint64_t *)val); + + vcpu->mmio_read_completed = 0; + return (X86EMUL_CONTINUE); } - if (irqchip_in_kernel(vcpu->kvm)) - kvm_lapic_set_tpr(vcpu, cr8); - else - vcpu->arch.cr8 = cr8; + gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code); + + if (gpa == UNMAPPED_GVA) { + kvm_inject_page_fault(vcpu, addr, error_code); + return (X86EMUL_PROPAGATE_FAULT); + } + + /* For APIC access vmexit */ + if ((gpa & PAGEMASK) == APIC_DEFAULT_PHYS_BASE) + goto mmio; + + if (kvm_read_guest_virt(addr, val, + bytes, vcpu, NULL) == X86EMUL_CONTINUE) + return (X86EMUL_CONTINUE); + +mmio: + /* + * Is this MMIO handled locally? 
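+	 * That is, does an in-kernel device model (such as the local APIC)
+	 * claim this physical address? If not, record the access in the
+	 * vcpu->mmio_* fields and return X86EMUL_UNHANDLEABLE so that the
+	 * read can be completed in userspace.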
+ */ + if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) { + KVM_TRACE3(mmio__read, unsigned int, bytes, uintptr_t, gpa, + uint64_t, *(uint64_t *)val); + return (X86EMUL_CONTINUE); + } + + KVM_TRACE2(mmio__read__unsatisfied, unsigned int, bytes, + uintptr_t, gpa); + + vcpu->mmio_needed = 1; + vcpu->mmio_phys_addr = gpa; + vcpu->mmio_size = bytes; + vcpu->mmio_is_write = 0; + + return (X86EMUL_UNHANDLEABLE); } int -is_paging(struct kvm_vcpu *vcpu) +emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, + const void *val, int bytes) { - return (kvm_read_cr0_bits(vcpu, X86_CR0_PG)); + int ret; + + ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); + + if (ret < 0) + return (0); + + kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1); + + return (1); } -unsigned long empty_zero_page[PAGESIZE / sizeof (unsigned long)]; +static int +emulator_write_emulated_onepage(unsigned long addr, const void *val, + unsigned int bytes, struct kvm_vcpu *vcpu) +{ + gpa_t gpa; + uint32_t error_code; + + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code); + + if (gpa == UNMAPPED_GVA) { + kvm_inject_page_fault(vcpu, addr, error_code); + return (X86EMUL_PROPAGATE_FAULT); + } + + /* For APIC access vmexit */ + if ((gpa & PAGEMASK) == APIC_DEFAULT_PHYS_BASE) + goto mmio; + + if (emulator_write_phys(vcpu, gpa, val, bytes)) + return (X86EMUL_CONTINUE); + +mmio: + KVM_TRACE3(mmio__write, unsigned int, bytes, uintptr_t, gpa, + uint64_t, *(uint64_t *)val); + + /* + * Is this MMIO handled locally? + */ + if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) + return (X86EMUL_CONTINUE); + + vcpu->mmio_needed = 1; + vcpu->mmio_phys_addr = gpa; + vcpu->mmio_size = bytes; + vcpu->mmio_is_write = 1; + memcpy(vcpu->mmio_data, val, bytes); + + return (X86EMUL_CONTINUE); +} int -kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) +emulator_write_emulated(unsigned long addr, const void *val, + unsigned int bytes, struct kvm_vcpu *vcpu) { - return (kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len)); + uintptr_t data = (uintptr_t)val; + + /* Crossing a page boundary? 
*/ + if (((addr + bytes - 1) ^ addr) & PAGEMASK) { + int rc, now; + + now = -addr & ~PAGEMASK; + rc = emulator_write_emulated_onepage(addr, + (void *)data, now, vcpu); + + if (rc != X86EMUL_CONTINUE) + return (rc); + + addr += now; + data += now; + bytes -= now; + } + + return (emulator_write_emulated_onepage(addr, val, bytes, vcpu)); +} + +static int +emulator_cmpxchg_emulated(unsigned long addr, const void *old, + const void *new, unsigned int bytes, struct kvm_vcpu *vcpu) +{ + cmn_err(CE_WARN, "kvm: emulating exchange as write\n"); +#ifndef CONFIG_X86_64 + /* guests cmpxchg8b have to be emulated atomically */ + if (bytes == 8) { + gpa_t gpa; + page_t page; + char *kaddr; + uint64_t val; + + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); + + if (gpa == UNMAPPED_GVA || + (gpa & PAGEMASK) == APIC_DEFAULT_PHYS_BASE) + goto emul_write; + + if (((gpa + bytes - 1) & PAGEMASK) != (gpa & PAGEMASK)) + goto emul_write; + + val = *(uint64_t *)new; + + page = gfn_to_page(vcpu->kvm, gpa >> PAGESHIFT); + kaddr = kmap_atomic(page, KM_USER0); + + set_64bit((uint64_t *)(kaddr + offset_in_page(gpa)), val); + kunmap_atomic(kaddr, KM_USER0); + kvm_release_page_dirty(page); + } +emul_write: +#endif + + return (emulator_write_emulated(addr, new, bytes, vcpu)); +} + +static unsigned long +get_segment_base(struct kvm_vcpu *vcpu, int seg) +{ + return (kvm_x86_ops->get_segment_base(vcpu, seg)); } void -fx_init(struct kvm_vcpu *vcpu) +kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) { - unsigned after_mxcsr_mask; + uint8_t opcodes[4]; + unsigned long rip = kvm_rip_read(vcpu); + unsigned long rip_linear; + #ifdef XXX - /* - * Touch the fpu the first time in non atomic context as if - * this is the first fpu instruction the exception handler - * will fire before the instruction returns and it'll have to - * allocate ram with GFP_KERNEL. 
- */ - if (!used_math()) + if (!printk_ratelimit()) + return; #else XXX_KVM_PROBE; #endif - kvm_fx_save(&vcpu->arch.host_fx_image); - /* Initialize guest FPU by resetting ours and saving into guest's */ - kpreempt_disable(); - kvm_fx_save(&vcpu->arch.host_fx_image); - kvm_fx_finit(); - kvm_fx_save(&vcpu->arch.guest_fx_image); - kvm_fx_restore(&vcpu->arch.host_fx_image); - kpreempt_enable(); + rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); - vcpu->arch.cr0 |= X86_CR0_ET; - after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); - vcpu->arch.guest_fx_image.mxcsr = 0x1f80; - memset((void *)((uintptr_t)&vcpu->arch.guest_fx_image + - after_mxcsr_mask), 0, sizeof (struct i387_fxsave_struct) - - after_mxcsr_mask); + kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL); + + cmn_err(CE_WARN, "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", + context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); +} + +static struct x86_emulate_ops emulate_ops = { + .read_std = kvm_read_guest_virt_system, + .fetch = kvm_fetch_guest_virt, + .read_emulated = emulator_read_emulated, + .write_emulated = emulator_write_emulated, + .cmpxchg_emulated = emulator_cmpxchg_emulated, +}; + +static void +cache_all_regs(struct kvm_vcpu *vcpu) +{ + kvm_register_read(vcpu, VCPU_REGS_RAX); + kvm_register_read(vcpu, VCPU_REGS_RSP); + kvm_register_read(vcpu, VCPU_REGS_RIP); + vcpu->arch.regs_dirty = ~0; } int -kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) +emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, + uint16_t error_code, int emulation_type) { - vcpu->arch.nmi_pending = 0; - vcpu->arch.nmi_injected = 0; + int r, shadow_mask; + struct decode_cache *c; + struct kvm_run *run = vcpu->run; - vcpu->arch.switch_db_regs = 0; - memset(vcpu->arch.db, 0, sizeof (vcpu->arch.db)); - vcpu->arch.dr6 = DR6_FIXED_1; - vcpu->arch.dr7 = DR7_FIXED_1; + kvm_clear_exception_queue(vcpu); + vcpu->arch.mmio_fault_cr2 = cr2; - return (kvm_x86_ops->vcpu_reset(vcpu)); + /* + * TODO: fix emulate.c to use guest_read/write_register + * instead of direct ->regs accesses, can save hundred cycles + * on Intel for instructions that don't read/change RSP, for + * for example. + */ + cache_all_regs(vcpu); + + vcpu->mmio_is_write = 0; + vcpu->arch.pio.string = 0; + + if (!(emulation_type & EMULTYPE_NO_DECODE)) { + int cs_db, cs_l; + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + + vcpu->arch.emulate_ctxt.vcpu = vcpu; + vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu); + vcpu->arch.emulate_ctxt.mode = (!is_protmode(vcpu)) ? + X86EMUL_MODE_REAL : + (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) ? + X86EMUL_MODE_VM86 : cs_l ? X86EMUL_MODE_PROT64 : + cs_db ? 
X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; + + r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); + + /* + * Only allow emulation of specific instructions on #UD + * (namely VMMCALL, sysenter, sysexit, syscall) + */ + c = &vcpu->arch.emulate_ctxt.decode; + if (emulation_type & EMULTYPE_TRAP_UD) { + if (!c->twobyte) + return (EMULATE_FAIL); + switch (c->b) { + case 0x01: /* VMMCALL */ + if (c->modrm_mod != 3 || c->modrm_rm != 1) + return (EMULATE_FAIL); + break; + case 0x34: /* sysenter */ + case 0x35: /* sysexit */ + if (c->modrm_mod != 0 || c->modrm_rm != 0) + return (EMULATE_FAIL); + break; + case 0x05: /* syscall */ + if (c->modrm_mod != 0 || c->modrm_rm != 0) + return (EMULATE_FAIL); + break; + default: + return (EMULATE_FAIL); + } + + if (!(c->modrm_reg == 0 || c->modrm_reg == 3)) + return (EMULATE_FAIL); + } + + KVM_VCPU_KSTAT_INC(vcpu, kvmvs_insn_emulation); + + if (r) { + KVM_VCPU_KSTAT_INC(vcpu, kvmvs_insn_emulation_fail); + + if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) + return (EMULATE_DONE); + return (EMULATE_FAIL); + } + } + + if (emulation_type & EMULTYPE_SKIP) { + kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip); + return (EMULATE_DONE); + } + + r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); + shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; + + if (r == 0) + kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); + + if (vcpu->arch.pio.string) + return (EMULATE_DO_MMIO); + + if ((r || vcpu->mmio_is_write) && run) { + run->exit_reason = KVM_EXIT_MMIO; + run->mmio.phys_addr = vcpu->mmio_phys_addr; + memcpy(run->mmio.data, vcpu->mmio_data, 8); + run->mmio.len = vcpu->mmio_size; + run->mmio.is_write = vcpu->mmio_is_write; + } + + if (r) { + if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) + return (EMULATE_DONE); + if (!vcpu->mmio_needed) { + kvm_report_emulation_failure(vcpu, "mmio"); + return (EMULATE_FAIL); + } + + return (EMULATE_DO_MMIO); + } + + kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + + if (vcpu->mmio_is_write) { + vcpu->mmio_needed = 0; + return (EMULATE_DO_MMIO); + } + + return (EMULATE_DONE); } -struct kvm_memory_slot * -gfn_to_memslot(struct kvm *kvm, gfn_t gfn) +static int +pio_copy_data(struct kvm_vcpu *vcpu) { - gfn = unalias_gfn(kvm, gfn); - return (gfn_to_memslot_unaliased(kvm, gfn)); + void *p = vcpu->arch.pio_data; + gva_t q = vcpu->arch.pio.guest_gva; + unsigned bytes; + int ret; + uint32_t error_code; + + bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; + + if (vcpu->arch.pio.in) + ret = kvm_write_guest_virt(q, p, bytes, vcpu, &error_code); + else + ret = kvm_read_guest_virt(q, p, bytes, vcpu, &error_code); + + if (ret == X86EMUL_PROPAGATE_FAULT) + kvm_inject_page_fault(vcpu, q, error_code); + + return (ret); } -unsigned long -kvm_host_page_size(struct kvm *kvm, gfn_t gfn) +int +complete_pio(struct kvm_vcpu *vcpu) { - struct vm_area_struct *vma; - unsigned long addr, size; + struct kvm_pio_request *io = &vcpu->arch.pio; + long delta; + int r; + unsigned long val; - size = PAGESIZE; + if (!io->string) { + if (io->in) { + val = kvm_register_read(vcpu, VCPU_REGS_RAX); + memcpy(&val, vcpu->arch.pio_data, io->size); + kvm_register_write(vcpu, VCPU_REGS_RAX, val); + } + } else { + if (io->in) { + r = pio_copy_data(vcpu); + if (r) + goto out; + } - addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) - return (PAGESIZE); + delta = 1; + if (io->rep) { + delta *= io->cur_count; + /* + * The size of the register should really depend on + * current address size. 
+ */ + val = kvm_register_read(vcpu, VCPU_REGS_RCX); + val -= delta; + kvm_register_write(vcpu, VCPU_REGS_RCX, val); + } + if (io->down) + delta = -delta; + delta *= io->size; + if (io->in) { + val = kvm_register_read(vcpu, VCPU_REGS_RDI); + val += delta; + kvm_register_write(vcpu, VCPU_REGS_RDI, val); + } else { + val = kvm_register_read(vcpu, VCPU_REGS_RSI); + val += delta; + kvm_register_write(vcpu, VCPU_REGS_RSI, val); + } + } +out: + io->count -= io->cur_count; + io->cur_count = 0; -#ifdef XXX - down_read(¤t->mm->mmap_sem); - vma = find_vma(current->mm, addr); - if (!vma) + return (0); +} + +static int +kernel_pio(struct kvm_vcpu *vcpu, void *pd) +{ + /* TODO: String I/O for in kernel device */ + int r; + + if (vcpu->arch.pio.in) { + r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port, + vcpu->arch.pio.size, pd); + } else { + r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, + vcpu->arch.pio.port, vcpu->arch.pio.size, pd); + } + + return (r); +} + +int +kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port) +{ + unsigned long val; + + DTRACE_PROBE4(kvm__pio, int, !in, unsigned, port, int, size, + unsigned long, 1) + + vcpu->run->exit_reason = KVM_EXIT_IO; + vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; + vcpu->run->io.size = vcpu->arch.pio.size = size; + vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGESIZE; + vcpu->run->io.count = vcpu->arch.pio.count = + vcpu->arch.pio.cur_count = 1; + vcpu->run->io.port = vcpu->arch.pio.port = port; + vcpu->arch.pio.in = in; + vcpu->arch.pio.string = 0; + vcpu->arch.pio.down = 0; + vcpu->arch.pio.rep = 0; + + if (!vcpu->arch.pio.in) { + val = kvm_register_read(vcpu, VCPU_REGS_RAX); + memcpy(vcpu->arch.pio_data, &val, 4); + } + + if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { + complete_pio(vcpu); + return (1); + } + + return (0); +} + +void +kvm_timer_fire(void *arg) +{ + struct kvm_timer *timer = (struct kvm_timer *)arg; + struct kvm_vcpu *vcpu = timer->vcpu; + + if (vcpu == NULL) + return; + + mutex_enter(&vcpu->kvcpu_kick_lock); + + if (timer->reinject || !timer->pending) { + atomic_add_32(&timer->pending, 1); + set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); + } + + timer->intervals++; + + cv_broadcast(&vcpu->kvcpu_kick_cv); + mutex_exit(&vcpu->kvcpu_kick_lock); +} + +static void +kvm_timer_init(void) +{ + int cpu; + + /* + * XXX We assume that any machine running solaris kvm + * has constant time stamp counter increment rate. + * This will be true for all but older machines. 
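+	 * (For example, a 2.4 GHz constant-TSC CPU yields
+	 * cpu_tsc_khz = 2400000000 / 1000 = 2400000.)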
+ */ + /* assume pi_clock in mhz */ + cpu_tsc_khz = (cpu_freq_hz / 1000); +} + +int +kvm_arch_init(void *opaque) +{ + int r; + struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; + + if (ops->cpu_has_kvm_support()) { + cmn_err(CE_WARN, "kvm: no hardware support\n"); + r = ENOTSUP; goto out; + } + if (ops->disabled_by_bios()) { + cmn_err(CE_WARN, "kvm: disabled by bios\n"); + r = ENOTSUP; + goto out; + } - size = vma_kernel_pagesize(vma); + r = kvm_mmu_module_init(); + if (r) + goto out; + + kvm_init_msr_list(); + + kvm_x86_ops = ops; + kvm_mmu_set_nonpresent_ptes(0ull, 0ull); + kvm_mmu_set_base_ptes(PT_PRESENT_MASK); + kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, + PT_DIRTY_MASK, PT64_NX_MASK, 0); + + kvm_timer_init(); + + return (0); out: - up_read(¤t->mm->mmap_sem); - return (size); + return (r); +} + +int +kvm_emulate_halt(struct kvm_vcpu *vcpu) +{ + KVM_VCPU_KSTAT_INC(vcpu, kvmvs_halt_exits); + + if (irqchip_in_kernel(vcpu->kvm)) { + vcpu->arch.mp_state = KVM_MP_STATE_HALTED; + return (1); + } else { + vcpu->run->exit_reason = KVM_EXIT_HLT; + return (0); + } +} + +int +kvm_hv_hypercall(struct kvm_vcpu *vcpu) +{ + uint64_t param, ingpa, outgpa, ret; + uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0; + int fast, longmode; + int cs_db, cs_l; + + /* + * hypercall generates UD from non zero cpl and real mode + * per HYPER-V spec + */ + if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return (0); + } + + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + longmode = is_long_mode(vcpu) && cs_l == 1; + + if (!longmode) { + param = ((uint64_t)kvm_register_read(vcpu, + VCPU_REGS_RDX) << 32) | (kvm_register_read(vcpu, + VCPU_REGS_RAX) & 0xffffffff); + + ingpa = ((uint64_t)kvm_register_read(vcpu, + VCPU_REGS_RBX) << 32) | (kvm_register_read(vcpu, + VCPU_REGS_RCX) & 0xffffffff); + + outgpa = ((uint64_t)kvm_register_read(vcpu, + VCPU_REGS_RDI) << 32) | (kvm_register_read(vcpu, + VCPU_REGS_RSI) & 0xffffffff); + } +#ifdef CONFIG_X86_64 + else { + param = kvm_register_read(vcpu, VCPU_REGS_RCX); + ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX); + outgpa = kvm_register_read(vcpu, VCPU_REGS_R8); + } +#endif + + code = param & 0xffff; + fast = (param >> 16) & 0x1; + rep_cnt = (param >> 32) & 0xfff; + rep_idx = (param >> 48) & 0xfff; + + KVM_TRACE6(hv__hypercall, uintptr_t, code, uintptr_t, fast, + uintptr_t, rep_cnt, uintptr_t, rep_idx, uintptr_t, ingpa, + uintptr_t, outgpa); + + switch (code) { + case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT: +#ifdef XXX + kvm_vcpu_on_spin(vcpu); #else - XXX_KVM_PROBE; - return (PAGESIZE); + XXX_KVM_PROBE; #endif + break; + default: + res = HV_STATUS_INVALID_HYPERCALL_CODE; + break; + } + + ret = res | (((uint64_t)rep_done & 0xfff) << 32); + + if (longmode) { + kvm_register_write(vcpu, VCPU_REGS_RAX, ret); + } else { + kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32); + kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff); + } + + return (1); } -static pfn_t -hva_to_pfn(struct kvm *kvm, unsigned long addr) +#define KVM_HC_VAPIC_POLL_IRQ 1 +#define KVM_HC_MMU_OP 2 + +int +kvm_emulate_hypercall(struct kvm_vcpu *vcpu) { - page_t page[1]; - int npages; - pfn_t pfn; - proc_t *procp = ttoproc(curthread); - struct as *as = procp->p_as; + unsigned long nr, a0, a1, a2, a3, ret; + int r = 1; + + if (kvm_hv_hypercall_enabled(vcpu->kvm)) + return (kvm_hv_hypercall(vcpu)); + + nr = kvm_register_read(vcpu, VCPU_REGS_RAX); + a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); + a1 = 
kvm_register_read(vcpu, VCPU_REGS_RCX); + a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); + a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); + + KVM_TRACE5(hypercall, uintptr_t, nr, uintptr_t, a0, uintptr_t, a1, + uintptr_t, a2, uintptr_t, a3); + + if (!is_long_mode(vcpu)) { + nr &= 0xFFFFFFFF; + a0 &= 0xFFFFFFFF; + a1 &= 0xFFFFFFFF; + a2 &= 0xFFFFFFFF; + a3 &= 0xFFFFFFFF; + } + + if (kvm_x86_ops->get_cpl(vcpu) != 0) { + ret = -EPERM; + goto out; + } + switch (nr) { + case KVM_HC_VAPIC_POLL_IRQ: + ret = 0; + break; + case KVM_HC_MMU_OP: #ifdef XXX + r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret); +#else + XXX_KVM_PROBE; + ret = -ENOSYS; +#endif + break; + default: + ret = -ENOSYS; + break; + } +out: + kvm_register_write(vcpu, VCPU_REGS_RAX, ret); - npages = get_user_pages_fast(addr, 1, 1, page); + KVM_VCPU_KSTAT_INC(vcpu, kvmvs_hypercalls); - if (unlikely(npages != 1)) { - struct vm_area_struct *vma; + return (r); +} - down_read(¤t->mm->mmap_sem); - vma = find_vma(current->mm, addr); +static int +move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) +{ + struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; + int j, nent = vcpu->arch.cpuid_nent; + + e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; + /* when no next entry is found, the current entry[i] is reselected */ + for (j = i + 1; ; j = (j + 1) % nent) { + struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; + if (ej->function == e->function) { + ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; + return (j); + } + } - if (vma == NULL || addr < vma->vm_start || - !(vma->vm_flags & VM_PFNMAP)) { - up_read(¤t->mm->mmap_sem); - get_page(bad_page); - return (page_to_pfn(bad_page)); + return (0); /* silence gcc, even though control never reaches here */ +} + +/* + * find an entry with matching function, matching index (if needed), and that + * should be read next (if it's stateful) + */ +static int +is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, + uint32_t function, uint32_t index) +{ + if (e->function != function) + return (0); + if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) + return (0); + if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && + !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) + return (0); + return (1); +} + +struct kvm_cpuid_entry2 * +kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, uint32_t function, uint32_t index) +{ + int i; + struct kvm_cpuid_entry2 *best = NULL; + + for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { + struct kvm_cpuid_entry2 *e; + + e = &vcpu->arch.cpuid_entries[i]; + if (is_matching_cpuid_entry(e, function, index)) { + if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) + move_to_next_stateful_cpuid_entry(vcpu, i); + best = e; + break; } + /* + * Both basic or both extended? + */ + if (((e->function ^ function) & 0x80000000) == 0) + if (!best || e->function > best->function) + best = e; + } - pfn = ((addr - vma->vm_start) >> PAGESHIFT) + vma->vm_pgoff; - up_read(¤t->mm->mmap_sem); - BUG_ON(!kvm_is_mmio_pfn(pfn)); - } else - pfn = page_to_pfn(page[0]); -#else - XXX_KVM_PROBE; - if (addr < kernelbase) - pfn = hat_getpfnum(as->a_hat, (caddr_t)addr); + return (best); +} + +int +cpuid_maxphyaddr(struct kvm_vcpu *vcpu) +{ + return (36); /* from linux. number of bits, perhaps? 
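+	 * (Yes: the guest's physical-address width in bits. Linux reads it
+	 * from CPUID leaf 0x80000008, EAX bits 7:0, and uses 36 only as the
+	 * fallback; this port simply hardcodes that conservative default.)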
*/ +} + +void +kvm_emulate_cpuid(struct kvm_vcpu *vcpu) +{ + uint32_t function, index; + struct kvm_cpuid_entry2 *best; + + function = kvm_register_read(vcpu, VCPU_REGS_RAX); + index = kvm_register_read(vcpu, VCPU_REGS_RCX); + kvm_register_write(vcpu, VCPU_REGS_RAX, 0); + kvm_register_write(vcpu, VCPU_REGS_RBX, 0); + kvm_register_write(vcpu, VCPU_REGS_RCX, 0); + kvm_register_write(vcpu, VCPU_REGS_RDX, 0); + best = kvm_find_cpuid_entry(vcpu, function, index); + if (best) { + kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); + kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); + kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); + kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); + } + kvm_x86_ops->skip_emulated_instruction(vcpu); + + KVM_TRACE5(cpuid, uint32_t, function, + uint32_t, kvm_register_read(vcpu, VCPU_REGS_RAX), + uint32_t, kvm_register_read(vcpu, VCPU_REGS_RBX), + uint32_t, kvm_register_read(vcpu, VCPU_REGS_RCX), + uint32_t, kvm_register_read(vcpu, VCPU_REGS_RDX)); +} + +static int +dm_request_for_irq_injection(struct kvm_vcpu *vcpu) +{ + return (!irqchip_in_kernel(vcpu->kvm) && + !kvm_cpu_has_interrupt(vcpu) && + vcpu->run->request_interrupt_window && + kvm_arch_interrupt_allowed(vcpu)); +} + +static void +post_kvm_run_save(struct kvm_vcpu *vcpu) +{ + struct kvm_run *kvm_run = vcpu->run; + + kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0; + kvm_run->cr8 = kvm_get_cr8(vcpu); + kvm_run->apic_base = kvm_get_apic_base(vcpu); + if (irqchip_in_kernel(vcpu->kvm)) + kvm_run->ready_for_interrupt_injection = 1; else - pfn = hat_getpfnum(kas.a_hat, (caddr_t)addr); + kvm_run->ready_for_interrupt_injection = + kvm_arch_interrupt_allowed(vcpu) && + !kvm_cpu_has_interrupt(vcpu) && + !kvm_event_needs_reinjection(vcpu); +} + +static void +vapic_enter(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + page_t *page; + + if (!apic || !apic->vapic_addr) + return; + + page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGESHIFT); + + vcpu->arch.apic->vapic_page = page; +} + +static void +vapic_exit(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + int idx; + + if (!apic || !apic->vapic_addr) + return; +#ifdef XXX + idx = srcu_read_lock(&vcpu->kvm->srcu); +#else + XXX_KVM_SYNC_PROBE; +#endif + kvm_release_page_dirty(apic->vapic_page); + mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGESHIFT); +#ifdef XXX + srcu_read_unlock(&vcpu->kvm->srcu, idx); +#else + XXX_KVM_SYNC_PROBE; #endif - return (pfn); } -pfn_t -gfn_to_pfn(struct kvm *kvm, gfn_t gfn) +static void +update_cr8_intercept(struct kvm_vcpu *vcpu) { - unsigned long addr; - pfn_t pfn; + int max_irr, tpr; - addr = gfn_to_hva(kvm, gfn); + if (!kvm_x86_ops->update_cr8_intercept) + return; - if (kvm_is_error_hva(addr)) { - get_page(bad_page); - return (page_to_pfn(bad_page)); - } + if (!vcpu->arch.apic) + return; + if (!vcpu->arch.apic->vapic_addr) + max_irr = kvm_lapic_find_highest_irr(vcpu); + else + max_irr = -1; - pfn = hva_to_pfn(kvm, addr); + if (max_irr != -1) + max_irr >>= 4; + tpr = kvm_lapic_get_cr8(vcpu); - return (pfn); + kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); } +static void +inject_pending_event(struct kvm_vcpu *vcpu) +{ + /* try to reinject previous events if any */ + if (vcpu->arch.exception.pending) { + kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, + vcpu->arch.exception.has_error_code, + vcpu->arch.exception.error_code); + return; + } -int -is_error_pfn(pfn_t pfn) + if (vcpu->arch.nmi_injected) { + kvm_x86_ops->set_nmi(vcpu); + return; + } 
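+	/*
+	 * As with the exception and NMI cases above, reinjection of an
+	 * already-delivered interrupt takes priority over injecting any
+	 * new event below.
+	 */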
+ + if (vcpu->arch.interrupt.pending) { + kvm_x86_ops->set_irq(vcpu); + return; + } + + /* try to inject new event if pending */ + if (vcpu->arch.nmi_pending) { + if (kvm_x86_ops->nmi_allowed(vcpu)) { + vcpu->arch.nmi_pending = 0; + vcpu->arch.nmi_injected = 1; + kvm_x86_ops->set_nmi(vcpu); + } + } else if (kvm_cpu_has_interrupt(vcpu)) { + if (kvm_x86_ops->interrupt_allowed(vcpu)) { + kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), + 0); + kvm_x86_ops->set_irq(vcpu); + } + } +} + +static inline unsigned long +native_get_debugreg(int regno) { - return (pfn == bad_pfn); + unsigned long val = 0; /* Damn you, gcc! */ + + switch (regno) { + case 0: + __asm__("mov %%db0, %0" :"=r" (val)); + break; + case 1: + __asm__("mov %%db1, %0" :"=r" (val)); + break; + case 2: + __asm__("mov %%db2, %0" :"=r" (val)); + break; + case 3: + __asm__("mov %%db3, %0" :"=r" (val)); + break; + case 6: + __asm__("mov %%db6, %0" :"=r" (val)); + break; + case 7: + __asm__("mov %%db7, %0" :"=r" (val)); + break; + default: + cmn_err(CE_WARN, "kvm: invalid debug register retrieval, " + "regno = %d\n", regno); + } + + return (val); } -page_t * -pfn_to_page(pfn_t pfn) +static inline void +native_set_debugreg(int regno, unsigned long value) { - return (page_numtopp_nolock(pfn)); + switch (regno) { + case 0: + __asm__("mov %0, %%db0" ::"r" (value)); + break; + case 1: + __asm__("mov %0, %%db1" ::"r" (value)); + break; + case 2: + __asm__("mov %0, %%db2" ::"r" (value)); + break; + case 3: + __asm__("mov %0, %%db3" ::"r" (value)); + break; + case 6: + __asm__("mov %0, %%db6" ::"r" (value)); + break; + case 7: + __asm__("mov %0, %%db7" ::"r" (value)); + break; + default: + cmn_err(CE_WARN, "kvm: invalid debug register set, " + "regno = %d\n", regno); + } } -void -kvm_set_pfn_accessed(struct kvm *kvm, pfn_t pfn) +/* + * These special macros can be used to get or set a debugging register + */ +#define get_debugreg(var, register) \ + (var) = native_get_debugreg(register) +#define set_debugreg(value, register) \ + native_set_debugreg(register, value) + +static int +vcpu_enter_guest(struct kvm_vcpu *vcpu) { + int r; + + int req_int_win = !irqchip_in_kernel(vcpu->kvm) && + vcpu->run->request_interrupt_window; + + if (vcpu->requests) { + if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) + kvm_mmu_unload(vcpu); + } + + r = kvm_mmu_reload(vcpu); + + if (r) + goto out; + + if (vcpu->requests) { + if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, + &vcpu->requests)) { + __kvm_migrate_timers(vcpu); + } + if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, + &vcpu->requests)) { + kvm_write_guest_time(vcpu); + } + + if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) + kvm_mmu_sync_roots(vcpu); + + if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) + kvm_x86_ops->tlb_flush(vcpu); + + if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, + &vcpu->requests)) { + vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; + r = 0; + goto out; + } + + if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { + vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; + r = 0; + goto out; + } + + if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, + &vcpu->requests)) { + vcpu->fpu_active = 0; + kvm_x86_ops->fpu_deactivate(vcpu); + } + } + + kpreempt_disable(); + + kvm_x86_ops->prepare_guest_switch(vcpu); + if (vcpu->fpu_active) + kvm_load_guest_fpu(vcpu); + + cli(); + + clear_bit(KVM_REQ_KICK, &vcpu->requests); #ifdef XXX - if (!kvm_is_mmio_pfn(pfn)) - mark_page_accessed(pfn_to_page(pfn)); + smp_mb__after_clear_bit(); #else XXX_KVM_PROBE; #endif -} + 
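+
+	/*
+	 * Re-check for pending requests and signals now that interrupts
+	 * are disabled and KVM_REQ_KICK has been cleared: anything that
+	 * raced in must abort the entry here, or it would be ignored
+	 * until the next VM exit.
+	 */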
if (vcpu->requests || issig(JUSTLOOKING)) { + set_bit(KVM_REQ_KICK, &vcpu->requests); + sti(); + kpreempt_enable(); + r = 1; + goto out; + } -void -kvm_set_pfn_dirty(pfn_t pfn) -{ + inject_pending_event(vcpu); + + /* enable NMI/IRQ window open exits if needed */ + if (vcpu->arch.nmi_pending) + kvm_x86_ops->enable_nmi_window(vcpu); + else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) + kvm_x86_ops->enable_irq_window(vcpu); + + if (kvm_lapic_enabled(vcpu)) { + update_cr8_intercept(vcpu); + kvm_lapic_sync_to_vapic(vcpu); + } #ifdef XXX - if (!kvm_is_mmio_pfn(pfn)) { - struct page *page = pfn_to_page(pfn); - if (!PageReserved(page)) - SetPageDirty(page); /* XXX - not defined in linux?! */ + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); +#else + XXX_KVM_PROBE; +#endif + kvm_guest_enter(); + + if (vcpu->arch.switch_db_regs) { + set_debugreg(0, 7); + set_debugreg(vcpu->arch.eff_db[0], 0); + set_debugreg(vcpu->arch.eff_db[1], 1); + set_debugreg(vcpu->arch.eff_db[2], 2); + set_debugreg(vcpu->arch.eff_db[3], 3); } + + KVM_TRACE1(vm__entry, int, vcpu->vcpu_id); + + kvm_x86_ops->run(vcpu); +#ifdef XXX + /* + * If the guest has used debug registers, at least dr7 + * will be disabled while returning to the host. + * If we don't have active breakpoints in the host, we don't + * care about the messed up debug address registers. But if + * we have some of them active, restore the old state. + */ + if (hw_breakpoint_active()) + hw_breakpoint_restore(); #else XXX_KVM_PROBE; #endif -} + set_bit(KVM_REQ_KICK, &vcpu->requests); + sti(); -int -memslot_id(struct kvm *kvm, gfn_t gfn) -{ - int i; -#ifdef XXX_KVM_DECLARATION - struct kvm_memslots *slots = rcu_dereference(kvm->memslots); +#ifdef XXX + local_irq_enable(); /* XXX - should be ok with kpreempt_enable below */ + + barrier(); #else - struct kvm_memslots *slots = kvm->memslots; + XXX_KVM_PROBE; #endif - struct kvm_memory_slot *memslot = NULL; + KVM_VCPU_KSTAT_INC(vcpu, kvmvs_exits); + kvm_guest_exit(); - gfn = unalias_gfn(kvm, gfn); - for (i = 0; i < slots->nmemslots; ++i) { - memslot = &slots->memslots[i]; + kpreempt_enable(); +#ifdef XXX + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - if (gfn >= memslot->base_gfn && - gfn < memslot->base_gfn + memslot->npages) - break; + /* + * Profile KVM exit RIPs: + */ + if (unlikely(prof_on == KVM_PROFILING)) { + unsigned long rip = kvm_rip_read(vcpu); + profile_hit(KVM_PROFILING, (void *)rip); } +#else + XXX_KVM_PROBE; +#endif + kvm_lapic_sync_from_vapic(vcpu); + r = kvm_x86_ops->handle_exit(vcpu); - return (memslot - slots->memslots); +out: + return (r); } -void -kvm_release_pfn_dirty(pfn_t pfn) +static int +__vcpu_run(struct kvm_vcpu *vcpu) { - kvm_set_pfn_dirty(pfn); - kvm_release_pfn_clean(pfn); -} + int r; + struct kvm *kvm = vcpu->kvm; -int -cpuid_maxphyaddr(struct kvm_vcpu *vcpu) -{ - return (36); /* from linux. number of bits, perhaps? 
*/ -} + if (vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED) { + cmn_err(CE_NOTE, "vcpu %d received sipi with vector # %x\n", + vcpu->vcpu_id, vcpu->arch.sipi_vector); + kvm_lapic_reset(vcpu); + r = kvm_arch_vcpu_reset(vcpu); + if (r) + return (r); + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + } +#ifdef XXX + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); +#else + XXX_KVM_SYNC_PROBE; +#endif + vapic_enter(vcpu); + + r = 1; + while (r > 0) { + if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) + r = vcpu_enter_guest(vcpu); + else { +#ifdef XXX + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); +#else + XXX_KVM_SYNC_PROBE; +#endif + kvm_vcpu_block(vcpu); +#ifdef XXX + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); +#else + XXX_KVM_SYNC_PROBE; +#endif + if (test_and_clear_bit(KVM_REQ_UNHALT, + &vcpu->requests)) { + switch (vcpu->arch.mp_state) { + case KVM_MP_STATE_HALTED: + vcpu->arch.mp_state = + KVM_MP_STATE_RUNNABLE; + case KVM_MP_STATE_RUNNABLE: + break; + case KVM_MP_STATE_SIPI_RECEIVED: + default: + r = -EINTR; + break; + } + } + } + + if (r <= 0) + break; + + clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); + if (kvm_cpu_has_pending_timer(vcpu)) + kvm_inject_pending_timer_irqs(vcpu); + + if (dm_request_for_irq_injection(vcpu)) { + r = -EINTR; + vcpu->run->exit_reason = KVM_EXIT_INTR; + KVM_VCPU_KSTAT_INC(vcpu, kvmvs_irq_exits); + } + + if (issig(JUSTLOOKING)) { + r = -EINTR; + vcpu->run->exit_reason = KVM_EXIT_INTR; + KVM_VCPU_KSTAT_INC(vcpu, kvmvs_signal_exits); + } + } +#ifdef XXX + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); +#else + XXX_KVM_SYNC_PROBE; +#endif + post_kvm_run_save(vcpu); + vapic_exit(vcpu); + + return (r); +} int -kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) +kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { int r; - unsigned long addr; - gfn_t gfn = gpa >> PAGESHIFT; - int offset = offset_in_page(gpa); + sigset_t sigsaved; + struct kvm_run *kvm_run = vcpu->run; - addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) - return (-EFAULT); + vcpu_load(vcpu); + if (vcpu->sigset_active) + kvm_sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); + + if (vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED) { + kvm_vcpu_block(vcpu); + clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + r = -EAGAIN; + goto out; + } + + /* re-sync apic's tpr */ + if (!irqchip_in_kernel(vcpu->kvm)) + kvm_set_cr8(vcpu, kvm_run->cr8); + + if (vcpu->arch.pio.cur_count) { #ifdef XXX - pagefault_disable(); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); #else - XXX_KVM_PROBE; + XXX_KVM_SYNC_PROBE; #endif - - r = copyin((caddr_t)addr + offset, data, len); + r = complete_pio(vcpu); #ifdef XXX - pagefault_enable(); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); #else - XXX_KVM_PROBE; + XXX_KVM_SYNC_PROBE; #endif - if (r) - return (-EFAULT); + if (r) + goto out; + } + if (vcpu->mmio_needed) { + memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); + vcpu->mmio_read_completed = 1; + vcpu->mmio_needed = 0; +#ifdef XXX + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); +#else + XXX_KVM_SYNC_PROBE; +#endif + r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0, + EMULTYPE_NO_DECODE); +#ifdef XXX + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); +#else + XXX_KVM_SYNC_PROBE; +#endif + if (r == EMULATE_DO_MMIO) { + /* + * Read-modify-write. Back to userspace. 
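+			 * The re-run emulation produced another MMIO access
+			 * (e.g. the write half of a read-modify-write), so
+			 * we must return to userspace to satisfy it before
+			 * the instruction can complete.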
+ */ + r = 0; + goto out; + } + } + + if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) + kvm_register_write(vcpu, VCPU_REGS_RAX, kvm_run->hypercall.ret); + + r = __vcpu_run(vcpu); + +out: + if (vcpu->sigset_active) + kvm_sigprocmask(SIG_SETMASK, &sigsaved, NULL); + + vcpu_put(vcpu); + return (r); +} + +int +kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + vcpu_load(vcpu); + + regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); + regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); + regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); + regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX); + regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI); + regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI); + regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); + regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP); +#ifdef CONFIG_X86_64 + regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8); + regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9); + regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10); + regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11); + regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12); + regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13); + regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14); + regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15); +#endif + + regs->rip = kvm_rip_read(vcpu); + regs->rflags = kvm_get_rflags(vcpu); + + vcpu_put(vcpu); return (0); } +int +kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + vcpu_load(vcpu); + + kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); + kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); + kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); + kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx); + kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi); + kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi); + kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp); + kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp); +#ifdef CONFIG_X86_64 + kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8); + kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9); + kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10); + kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11); + kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12); + kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); + kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); + kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); +#endif + + kvm_rip_write(vcpu, regs->rip); + kvm_set_rflags(vcpu, regs->rflags); + + vcpu->arch.exception.pending = 0; + + vcpu_put(vcpu); + + return (0); +} + +void +kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) +{ + kvm_x86_ops->get_segment(vcpu, var, seg); +} + +int +kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + struct descriptor_table dt; + + vcpu_load(vcpu); + + kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); + kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); + kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); + kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); + kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); + kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); + + kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); + kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); + + kvm_x86_ops->get_idt(vcpu, &dt); + sregs->idt.limit = dt.limit; + sregs->idt.base = dt.base; + kvm_x86_ops->get_gdt(vcpu, &dt); + sregs->gdt.limit = dt.limit; + sregs->gdt.base = dt.base; + + sregs->cr0 = kvm_read_cr0(vcpu); + sregs->cr2 = vcpu->arch.cr2; + sregs->cr3 = vcpu->arch.cr3; + sregs->cr4 = 
kvm_read_cr4(vcpu); + sregs->cr8 = kvm_get_cr8(vcpu); + sregs->efer = vcpu->arch.efer; + sregs->apic_base = kvm_get_apic_base(vcpu); + + memset(sregs->interrupt_bitmap, 0, sizeof (sregs->interrupt_bitmap)); + + if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft) { + set_bit(vcpu->arch.interrupt.nr, + (unsigned long *)sregs->interrupt_bitmap); + } + + vcpu_put(vcpu); + + return (0); +} + +int +kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state) +{ + vcpu_load(vcpu); + mp_state->mp_state = vcpu->arch.mp_state; + vcpu_put(vcpu); + return (0); +} + +int +kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state) +{ + vcpu_load(vcpu); + vcpu->arch.mp_state = mp_state->mp_state; + vcpu_put(vcpu); + return (0); +} + +static void +kvm_set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + kvm_x86_ops->set_segment(vcpu, var, seg); +} + static void -ack_flush(void *_completed) +seg_desct_to_kvm_desct(struct desc_struct *seg_desc, uint16_t selector, + struct kvm_segment *kvm_desct) +{ + kvm_desct->base = get_desc_base(seg_desc); + kvm_desct->limit = get_desc_limit(seg_desc); + if (seg_desc->c.b.g) { + kvm_desct->limit <<= 12; + kvm_desct->limit |= 0xfff; + } + kvm_desct->selector = selector; + kvm_desct->type = seg_desc->c.b.type; + kvm_desct->present = seg_desc->c.b.p; + kvm_desct->dpl = seg_desc->c.b.dpl; + kvm_desct->db = seg_desc->c.b.d; + kvm_desct->s = seg_desc->c.b.s; + kvm_desct->l = seg_desc->c.b.l; + kvm_desct->g = seg_desc->c.b.g; + kvm_desct->avl = seg_desc->c.b.avl; + if (!selector) + kvm_desct->unusable = 1; + else + kvm_desct->unusable = 0; + kvm_desct->padding = 0; +} + +static void +get_segment_descriptor_dtable(struct kvm_vcpu *vcpu, uint16_t selector, + struct descriptor_table *dtable) +{ + if (selector & 1 << 2) { + struct kvm_segment kvm_seg; + + kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR); + + if (kvm_seg.unusable) + dtable->limit = 0; + else + dtable->limit = kvm_seg.limit; + dtable->base = kvm_seg.base; + } else + kvm_x86_ops->get_gdt(vcpu, dtable); +} + +/* allowed just for 8 bytes segments */ +static int +load_guest_segment_descriptor(struct kvm_vcpu *vcpu, uint16_t selector, + struct desc_struct *seg_desc) +{ + struct descriptor_table dtable; + uint16_t index = selector >> 3; + int ret; + uint32_t err; + gva_t addr; + + get_segment_descriptor_dtable(vcpu, selector, &dtable); + + if (dtable.limit < index * 8 + 7) { + kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); + return (1); + } + + addr = dtable.base + index * 8; + ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof (*seg_desc), + vcpu, &err); + + if (ret == 1) + kvm_inject_page_fault(vcpu, addr, err); + + return (ret); +} + +/* allowed just for 8 bytes segments */ +static int +save_guest_segment_descriptor(struct kvm_vcpu *vcpu, uint16_t selector, + struct desc_struct *seg_desc) +{ + struct descriptor_table dtable; + uint16_t index = selector >> 3; + + get_segment_descriptor_dtable(vcpu, selector, &dtable); + + if (dtable.limit < index * 8 + 7) + return (1); + + return kvm_write_guest_virt(dtable.base + index * 8, seg_desc, + sizeof (*seg_desc), vcpu, NULL); +} + +static gpa_t +get_tss_base_addr_write(struct kvm_vcpu *vcpu, struct desc_struct *seg_desc) +{ + uint32_t base_addr = get_desc_base(seg_desc); + + return (kvm_mmu_gva_to_gpa_write(vcpu, base_addr, NULL)); +} + +static gpa_t +get_tss_base_addr_read(struct kvm_vcpu *vcpu, struct desc_struct *seg_desc) +{ + uint32_t base_addr = 
get_desc_base(seg_desc); + + return (kvm_mmu_gva_to_gpa_read(vcpu, base_addr, NULL)); +} + +static uint16_t +get_segment_selector(struct kvm_vcpu *vcpu, int seg) { + struct kvm_segment kvm_seg; + + kvm_get_segment(vcpu, &kvm_seg, seg); + + return (kvm_seg.selector); +} + +static int +kvm_load_realmode_segment(struct kvm_vcpu *vcpu, uint16_t selector, int seg) +{ + struct kvm_segment segvar = { + .base = selector << 4, + .limit = 0xffff, + .selector = selector, + .type = 3, + .present = 1, + .dpl = 3, + .db = 0, + .s = 1, + .l = 0, + .g = 0, + .avl = 0, + .unusable = 0, + }; + kvm_x86_ops->set_segment(vcpu, &segvar, seg); + return (0); +} + +static int +is_vm86_segment(struct kvm_vcpu *vcpu, int seg) +{ + return (seg != VCPU_SREG_LDTR) && (seg != VCPU_SREG_TR) && + (kvm_get_rflags(vcpu) & X86_EFLAGS_VM); } int -make_all_cpus_request(struct kvm *kvm, unsigned int req) +kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, uint16_t selector, int seg) { - int i; - cpuset_t set; - processorid_t me, cpu; - struct kvm_vcpu *vcpu; + struct kvm_segment kvm_seg; + struct desc_struct seg_desc; + uint8_t dpl, rpl, cpl; + unsigned err_vec = GP_VECTOR; + uint32_t err_code = 0; + int null_selector = !(selector & ~0x3); /* 0000-0003 are null */ + int ret; - CPUSET_ZERO(set); + if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu)) + return (kvm_load_realmode_segment(vcpu, selector, seg)); - mutex_enter(&kvm->requests_lock); - me = curthread->t_cpu->cpu_id; - for (i = 0; i < kvm->online_vcpus; i++) { - vcpu = kvm->vcpus[i]; - if (!vcpu) - break; - if (test_and_set_bit(req, &vcpu->requests)) - continue; - cpu = vcpu->cpu; - if (cpu != -1 && cpu != me) - CPUSET_ADD(set, cpu); + /* NULL selector is not valid for TR, CS and SS */ + if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || + seg == VCPU_SREG_TR) && null_selector) + goto exception; + + /* TR should be in GDT only */ + if (seg == VCPU_SREG_TR && (selector & (1 << 2))) + goto exception; + + ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc); + + if (ret) + return (ret); + + seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg); + + if (null_selector) { /* for NULL selector skip all following checks */ + kvm_seg.unusable = 1; + goto load; } - if (CPUSET_ISNULL(set)) - kvm_xcall(KVM_CPUALL, ack_flush, NULL); - else { - kpreempt_disable(); - xc_sync((xc_arg_t) ack_flush, (xc_arg_t) NULL, - 0, CPUSET2BV(set), (xc_func_t) kvm_xcall_func); - kpreempt_enable(); + + err_code = selector & 0xfffc; + err_vec = GP_VECTOR; + + /* can't load system descriptor into segment selecor */ + if (seg <= VCPU_SREG_GS && !kvm_seg.s) + goto exception; + + if (!kvm_seg.present) { + err_vec = (seg == VCPU_SREG_SS) ? 
SS_VECTOR : NP_VECTOR; + goto exception; } - mutex_exit(&kvm->requests_lock); + rpl = selector & 3; + dpl = kvm_seg.dpl; + cpl = kvm_x86_ops->get_cpl(vcpu); + + switch (seg) { + case VCPU_SREG_SS: + /* + * segment is not a writable data segment or segment + * selector's RPL != CPL or segment selector's RPL != CPL + */ + if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl) + goto exception; + break; + case VCPU_SREG_CS: + if (!(kvm_seg.type & 8)) + goto exception; + + if (kvm_seg.type & 4) { + /* conforming */ + if (dpl > cpl) + goto exception; + } else { + /* nonconforming */ + if (rpl > cpl || dpl != cpl) + goto exception; + } + /* CS(RPL) <- CPL */ + selector = (selector & 0xfffc) | cpl; + break; + case VCPU_SREG_TR: + if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9)) + goto exception; + break; + case VCPU_SREG_LDTR: + if (kvm_seg.s || kvm_seg.type != 2) + goto exception; + break; + default: /* DS, ES, FS, or GS */ + /* + * segment is not a data or readable code segment or + * ((segment is a data or nonconforming code segment) + * and (both RPL and CPL > DPL)) + */ + if ((kvm_seg.type & 0xa) == 0x8 || + (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl))) + goto exception; + break; + } + + if (!kvm_seg.unusable && kvm_seg.s) { + /* mark segment as accessed */ + kvm_seg.type |= 1; + seg_desc.c.b.type |= 1; + save_guest_segment_descriptor(vcpu, selector, &seg_desc); + } +load: + kvm_set_segment(vcpu, &kvm_seg, seg); + return (0); +exception: + kvm_queue_exception_e(vcpu, err_vec, err_code); return (1); + } -void -kvm_flush_remote_tlbs(struct kvm *kvm) +static void +save_state_to_tss32(struct kvm_vcpu *vcpu, struct tss_segment_32 *tss) { - if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) - KVM_KSTAT_INC(kvm, kvmks_remote_tlb_flush); + tss->cr3 = vcpu->arch.cr3; + tss->eip = kvm_rip_read(vcpu); + tss->eflags = kvm_get_rflags(vcpu); + tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX); + tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); + tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX); + tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX); + tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP); + tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP); + tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI); + tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI); + tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); + tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); + tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); + tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); + tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS); + tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS); + tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); } -gfn_t -unalias_gfn(struct kvm *kvm, gfn_t gfn) +static void +kvm_load_segment_selector(struct kvm_vcpu *vcpu, uint16_t sel, int seg) { - int i; - struct kvm_mem_alias *alias; - struct kvm_mem_aliases *aliases; + struct kvm_segment kvm_seg; + kvm_get_segment(vcpu, &kvm_seg, seg); + kvm_seg.selector = sel; + kvm_set_segment(vcpu, &kvm_seg, seg); +} - /* XXX need protection */ - aliases = kvm->arch.aliases; +static int +load_state_from_tss32(struct kvm_vcpu *vcpu, struct tss_segment_32 *tss) +{ + kvm_set_cr3(vcpu, tss->cr3); - for (i = 0; i < aliases->naliases; ++i) { - alias = &aliases->aliases[i]; - if (gfn >= alias->base_gfn && - gfn < alias->base_gfn + alias->npages) - return (alias->target_gfn + gfn - alias->base_gfn); + kvm_rip_write(vcpu, tss->eip); + kvm_set_rflags(vcpu, tss->eflags | 2); + + kvm_register_write(vcpu, VCPU_REGS_RAX, 
tss->eax);
+	kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
+	kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
+	kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
+	kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
+	kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
+	kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
+	kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
+
+	/*
+	 * SDM says that segment selectors are loaded before segment
+	 * descriptors
+	 */
+	kvm_load_segment_selector(vcpu, tss->ldt_selector, VCPU_SREG_LDTR);
+	kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
+	kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
+	kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
+	kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
+	kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS);
+	kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS);
+
+	/*
+	 * Now load segment descriptors. If a fault happens at this stage,
+	 * it is handled in the context of the new task.
+	 */
+	if (kvm_load_segment_descriptor(vcpu,
+	    tss->ldt_selector, VCPU_SREG_LDTR))
+		return (1);
+
+	if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
+		return (1);
+
+	if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
+		return (1);
+
+	if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
+		return (1);
+
+	if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
+		return (1);
+
+	if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS))
+		return (1);
+
+	if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS))
+		return (1);
+
+	return (0);
+}
+
+static void
+save_state_to_tss16(struct kvm_vcpu *vcpu, struct tss_segment_16 *tss)
+{
+	tss->ip = kvm_rip_read(vcpu);
+	tss->flag = kvm_get_rflags(vcpu);
+	tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
+	tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+	tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
+	tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
+	tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
+	tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
+	tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
+	tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
+
+	tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
+	tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
+	tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
+	tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
+	tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
+}
+
+static int
+load_state_from_tss16(struct kvm_vcpu *vcpu, struct tss_segment_16 *tss)
+{
+	kvm_rip_write(vcpu, tss->ip);
+	kvm_set_rflags(vcpu, tss->flag | 2);
+	kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
+	kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
+	kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
+	kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
+	kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
+	kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
+	kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
+	kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
+
+	/*
+	 * SDM says that segment selectors are loaded before segment
+	 * descriptors
+	 */
+	kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR);
+	kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
+	kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
+	kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
+	kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
+
+	/*
+	 * Now load segment descriptors. If a fault happens at this stage,
+	 * it is handled in the context of the new task.
+	 */
+	if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR))
+		return (1);
+
+	if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
+		return (1);
+
+	if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
+		return (1);
+
+	if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
+		return (1);
+
+	if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
+		return (1);
+
+	return (0);
+}
+
+static int
+kvm_task_switch_16(struct kvm_vcpu *vcpu, uint16_t tss_selector,
+    uint16_t old_tss_sel, uint32_t old_tss_base, struct desc_struct *nseg_desc)
+{
+	struct tss_segment_16 tss_segment_16;
+	int ret = 0;
+
+	if (kvm_read_guest(vcpu->kvm, old_tss_base,
+	    &tss_segment_16, sizeof (tss_segment_16)))
+		goto out;
+
+	save_state_to_tss16(vcpu, &tss_segment_16);
+
+	if (kvm_write_guest(vcpu->kvm, old_tss_base,
+	    &tss_segment_16, sizeof (tss_segment_16)))
+		goto out;
+
+	if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
+	    &tss_segment_16, sizeof (tss_segment_16)))
+		goto out;
+
+	if (old_tss_sel != 0xffff) {
+		tss_segment_16.prev_task_link = old_tss_sel;
+
+		if (kvm_write_guest(vcpu->kvm, get_tss_base_addr_write(vcpu,
+		    nseg_desc), &tss_segment_16.prev_task_link,
+		    sizeof (tss_segment_16.prev_task_link)))
+			goto out;
+	}
+
+	if (load_state_from_tss16(vcpu, &tss_segment_16))
+		goto out;
+
+	ret = 1;
+out:
+	return (ret);
+}
+
+static int
+kvm_task_switch_32(struct kvm_vcpu *vcpu, uint16_t tss_selector,
+    uint16_t old_tss_sel, uint32_t old_tss_base, struct desc_struct *nseg_desc)
+{
+	struct tss_segment_32 tss_segment_32;
+	int ret = 0;
+
+	if (kvm_read_guest(vcpu->kvm, old_tss_base,
+	    &tss_segment_32, sizeof (tss_segment_32)))
+		goto out;
+
+	save_state_to_tss32(vcpu, &tss_segment_32);
+
+	if (kvm_write_guest(vcpu->kvm, old_tss_base,
+	    &tss_segment_32, sizeof (tss_segment_32)))
+		goto out;
+
+	if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
+	    &tss_segment_32, sizeof (tss_segment_32)))
+		goto out;
+
+	if (old_tss_sel != 0xffff) {
+		tss_segment_32.prev_task_link = old_tss_sel;
+
+		if (kvm_write_guest(vcpu->kvm, get_tss_base_addr_write(vcpu,
+		    nseg_desc), &tss_segment_32.prev_task_link,
+		    sizeof (tss_segment_32.prev_task_link)))
+			goto out;
+	}
+
+	if (load_state_from_tss32(vcpu, &tss_segment_32))
+		goto out;
+
+	ret = 1;
+out:
+	return (ret);
+}
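+
+/*
+ * Hardware task-switch dispatch: kvm_task_switch() below saves the current
+ * CPU state into the outgoing TSS, loads the incoming TSS (using the 32-bit
+ * format when the new descriptor's type field has bit 3 set, the 16-bit
+ * format otherwise), and maintains the busy bit, EFLAGS.NT and the TSS back
+ * link according to the reason for the switch (jmp, call, iret or task
+ * gate). As an illustrative call (not taken from this commit), a task
+ * switch through a TSS selector of, say, 0x28 by a jmp would arrive here
+ * as kvm_task_switch(vcpu, 0x28, TASK_SWITCH_JMP).
+ */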
 
 int
-is_pse(struct kvm_vcpu *vcpu)
+kvm_task_switch(struct kvm_vcpu *vcpu, uint16_t tss_selector, int reason)
 {
-	return (kvm_read_cr4_bits(vcpu, X86_CR4_PSE));
+	struct kvm_segment tr_seg;
+	struct desc_struct cseg_desc;
+	struct desc_struct nseg_desc;
+	int ret = 0;
+	uint32_t old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
+	uint16_t old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
+	uint32_t desc_limit;
+
+	old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL);
+
+	/*
+	 * FIXME: Handle errors. Failure to read either TSS or their
+	 * descriptors should generate a pagefault.
+	 */
+	if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
+		goto out;
+
+	if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
+		goto out;
+
+	if (reason != TASK_SWITCH_IRET) {
+		int cpl;
+
+		cpl = kvm_x86_ops->get_cpl(vcpu);
+		if ((tss_selector & 3) > nseg_desc.c.b.dpl ||
+		    cpl > nseg_desc.c.b.dpl) {
+			kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+			return (1);
+		}
+	}
+
+	desc_limit = get_desc_limit(&nseg_desc);
+
+	if (!nseg_desc.c.b.p || ((desc_limit < 0x67 &&
+	    (nseg_desc.c.b.type & 8)) || desc_limit < 0x2b)) {
+		kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
+		return (1);
+	}
+
+	if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
+		cseg_desc.c.b.type &= ~(1 << 1); /* clear the B flag */
+		save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
+	}
+
+	if (reason == TASK_SWITCH_IRET) {
+		uint32_t eflags = kvm_get_rflags(vcpu);
+		kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
+	}
+
+	/*
+	 * Set the back link to the previous task only if the NT bit is set
+	 * in eflags; note that old_tss_sel is not used after this point.
+	 */
+	if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
+		old_tss_sel = 0xffff;
+
+	if (nseg_desc.c.b.type & 8) {
+		ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
+		    old_tss_base, &nseg_desc);
+	} else {
+		ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
+		    old_tss_base, &nseg_desc);
+	}
+
+	if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
+		uint32_t eflags = kvm_get_rflags(vcpu);
+		kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT);
+	}
+
+	if (reason != TASK_SWITCH_IRET) {
+		nseg_desc.c.b.type |= (1 << 1);
+		save_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc);
+	}
+
+	kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0(vcpu) | X86_CR0_TS);
+	seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
+	tr_seg.type = 11;
+	kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
+out:
+	return (ret);
+}
+
+static unsigned long
+find_next_bit(const unsigned long *addr,
+    unsigned long size, unsigned long offset)
+{
+	const unsigned long *p = addr + (offset/64);
+	unsigned long result = offset & ~(64-1);
+	unsigned long tmp;
+
+	if (offset >= size)
+		return (size);
+
+	size -= result;
+	offset %= 64;
+
+	if (offset) {
+		tmp = *(p++);
+		tmp &= (~0UL << offset);
+		if (size < 64)
+			goto found_first;
+		if (tmp)
+			goto found_middle;
+		size -= 64;
+		result += 64;
+	}
+	while (size & ~(64-1)) {
+		if ((tmp = *(p++)))
+			goto found_middle;
+		result += 64;
+		size -= 64;
+	}
+
+	if (!size)
+		return (result);
+	tmp = *p;
+
+found_first:
+	tmp &= (~0UL >> (64 - size));
+	if (tmp == 0UL)			/* Are any bits set? */
+		return (result + size);	/* Nope. */
+found_middle:
+	return (result + __ffs(tmp));
+}
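find_next_bit() returns the index of the first set bit at or beyond offset, or size when no bit is set; note that the 64-bit word size is hard-wired. A minimal caller (hypothetical, for illustration), scanning the way kvm_arch_vcpu_ioctl_set_sregs() does below:

	static int
	lowest_pending_vector(const unsigned long *bitmap, unsigned long nbits)
	{
		unsigned long vec = find_next_bit(bitmap, nbits, 0);

		return (vec < nbits ? (int)vec : -1);	/* -1: nothing pending */
	}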
+
+int
+kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+	int mmu_reset_needed = 0;
+	int pending_vec, max_bits;
+	struct descriptor_table dt;
+
+	vcpu_load(vcpu);
+
+	dt.limit = sregs->idt.limit;
+	dt.base = sregs->idt.base;
+	kvm_x86_ops->set_idt(vcpu, &dt);
+	dt.limit = sregs->gdt.limit;
+	dt.base = sregs->gdt.base;
+	kvm_x86_ops->set_gdt(vcpu, &dt);
+
+	vcpu->arch.cr2 = sregs->cr2;
+	mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
+	vcpu->arch.cr3 = sregs->cr3;
+
+	kvm_set_cr8(vcpu, sregs->cr8);
+
+	mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
+	kvm_x86_ops->set_efer(vcpu, sregs->efer);
+	kvm_set_apic_base(vcpu, sregs->apic_base);
+
+	mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
+	kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
+	vcpu->arch.cr0 = sregs->cr0;
+
+	mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
+	kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
+
+	if (!is_long_mode(vcpu) && is_pae(vcpu)) {
+		load_pdptrs(vcpu, vcpu->arch.cr3);
+		mmu_reset_needed = 1;
+	}
+
+	if (mmu_reset_needed)
+		kvm_mmu_reset_context(vcpu);
+
+	max_bits = (sizeof (sregs->interrupt_bitmap)) << 3;
+	pending_vec =
+	    find_next_bit((const unsigned long *)sregs->interrupt_bitmap,
+	    max_bits, 0);
+
+	if (pending_vec < max_bits) {
+		kvm_queue_interrupt(vcpu, pending_vec, 0);
+		if (irqchip_in_kernel(vcpu->kvm))
+			kvm_pic_clear_isr_ack(vcpu->kvm);
+	}
+
+	kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
+	kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
+	kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
+	kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
+	kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
+	kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
+
+	kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
+	kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
+
+	update_cr8_intercept(vcpu);
+
+#ifdef CONFIG_KVM_APIC_ARCHITECTURE
+	/* Older userspace won't unhalt the vcpu on reset. */
+	if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
+	    sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
+	    !is_protmode(vcpu))
+		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+#endif /* CONFIG_KVM_APIC_ARCHITECTURE */
+
+	vcpu_put(vcpu);
+
+	return (0);
+}
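From userland this handler is reached through the vcpu's KVM_SET_SREGS ioctl, normally as the write half of a read-modify-write against KVM_GET_SREGS. A hedged sketch of that pattern (set_guest_cr4_bit() is hypothetical; the ioctl numbers and struct kvm_sregs are assumed to come from the driver's exported headers):

	#include <sys/ioctl.h>

	static int
	set_guest_cr4_bit(int vcpu_fd, uint64_t bit)
	{
		struct kvm_sregs sregs;

		if (ioctl(vcpu_fd, KVM_GET_SREGS, &sregs) != 0)
			return (-1);
		sregs.cr4 |= bit;	/* e.g. X86_CR4_PSE */
		return (ioctl(vcpu_fd, KVM_SET_SREGS, &sregs));
	}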
+
+/*
+ * fxsave fpu state. Taken from x86_64/processor.h. To be killed when
+ * we have asm/x86/processor.h
+ *
+ * 8*16 bytes for each FP-reg = 128 bytes
+ * 16*16 bytes for each XMM-reg = 256 bytes
+ */
+typedef struct fxsave {
+	uint16_t	cwd;
+	uint16_t	swd;
+	uint16_t	twd;
+	uint16_t	fop;
+	uint64_t	rip;
+	uint64_t	rdp;
+	uint32_t	mxcsr;
+	uint32_t	mxcsr_mask;
+	uint32_t	st_space[32];
+#ifdef CONFIG_X86_64
+	uint32_t	xmm_space[64];
+#else
+	uint32_t	xmm_space[32];
+#endif
+} fxsave_t;
+
+int
+kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+	struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
+
+	vcpu_load(vcpu);
+
+	memcpy(fpu->fpr, fxsave->st_space, 128);
+	fpu->fcw = fxsave->cwd;
+	fpu->fsw = fxsave->swd;
+	fpu->ftwx = fxsave->twd;
+	fpu->last_opcode = fxsave->fop;
+	fpu->last_ip = fxsave->rip;
+	fpu->last_dp = fxsave->rdp;
+	memcpy(fpu->xmm, fxsave->xmm_space, sizeof (fxsave->xmm_space));
+
+	vcpu_put(vcpu);
+
+	return (0);
+}
+
+int
+kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+	struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
+
+	vcpu_load(vcpu);
+
+	memcpy(fxsave->st_space, fpu->fpr, 128);
+	fxsave->cwd = fpu->fcw;
+	fxsave->swd = fpu->fsw;
+	fxsave->twd = fpu->ftwx;
+	fxsave->fop = fpu->last_opcode;
+	fxsave->rip = fpu->last_ip;
+	fxsave->rdp = fpu->last_dp;
+	memcpy(fxsave->xmm_space, fpu->xmm, sizeof (fxsave->xmm_space));
+
+	vcpu_put(vcpu);
+
+	return (0);
 }
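These two handlers back the KVM_GET_FPU/KVM_SET_FPU vcpu ioctls. A hedged userland sketch of the round-trip they enable (clear_x87_exceptions() is hypothetical; the low six bits of the x87 status word are the IE/DE/ZE/OE/UE/PE exception flags):

	#include <sys/ioctl.h>

	static int
	clear_x87_exceptions(int vcpu_fd)
	{
		struct kvm_fpu fpu;

		if (ioctl(vcpu_fd, KVM_GET_FPU, &fpu) != 0)
			return (-1);
		fpu.fsw &= ~0x3f;	/* clear sticky exception flags */
		return (ioctl(vcpu_fd, KVM_SET_FPU, &fpu));
	}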
 
 void
-kvm_get_pfn(struct kvm_vcpu *vcpu, pfn_t pfn)
+fx_init(struct kvm_vcpu *vcpu)
 {
-	if (!kvm_is_mmio_pfn(pfn))
-		get_page(pfn_to_page(pfn));
+	unsigned after_mxcsr_mask;
+#ifdef XXX
+	/*
+	 * Touch the fpu the first time in non-atomic context: if this is
+	 * the first fpu instruction, the exception handler will fire
+	 * before the instruction returns and it'll have to allocate ram
+	 * with GFP_KERNEL.
+	 */
+	if (!used_math())
+#else
+	XXX_KVM_PROBE;
+#endif
+	kvm_fx_save(&vcpu->arch.host_fx_image);
+
+	/* Initialize guest FPU by resetting ours and saving into guest's */
+	kpreempt_disable();
+	kvm_fx_save(&vcpu->arch.host_fx_image);
+	kvm_fx_finit();
+	kvm_fx_save(&vcpu->arch.guest_fx_image);
+	kvm_fx_restore(&vcpu->arch.host_fx_image);
+	kpreempt_enable();
+
+	vcpu->arch.cr0 |= X86_CR0_ET;
+	after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
+	vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
+	memset((void *)((uintptr_t)&vcpu->arch.guest_fx_image +
+	    after_mxcsr_mask), 0, sizeof (struct i387_fxsave_struct) -
+	    after_mxcsr_mask);
+}
+
+void
+kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->guest_fpu_loaded)
+		return;
+
+	vcpu->guest_fpu_loaded = 1;
+	kvm_fx_save(&vcpu->arch.host_fx_image);
+	kvm_fx_restore(&vcpu->arch.guest_fx_image);
+	KVM_TRACE1(fpu, int, 1);
+}
+
+void
+kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
+{
+	if (!vcpu->guest_fpu_loaded)
+		return;
+
+	vcpu->guest_fpu_loaded = 0;
+	kvm_fx_save(&vcpu->arch.guest_fx_image);
+	kvm_fx_restore(&vcpu->arch.host_fx_image);
+	KVM_VCPU_KSTAT_INC(vcpu, kvmvs_fpu_reload);
+	set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests);
+	KVM_TRACE1(fpu, int, 0);
+}
+
+void
+kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->arch.time_page) {
+		/* XXX We aren't doing anything with the time page */
+		XXX_KVM_PROBE;
+		vcpu->arch.time_page = NULL;
+	}
+
+	if (vcpu->kvcpu_kstat != NULL)
+		kstat_delete(vcpu->kvcpu_kstat);
+
+	kvm_x86_ops->vcpu_free(vcpu);
+}
+
+struct kvm_vcpu *
+kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
+{
+	char buf[32];
+	struct kvm_vcpu *vcpu;
+	kstat_t *kstat;
+
+	(void) snprintf(buf, sizeof (buf), "vcpu-%d", kvm->kvmid);
+
+	if ((kstat = kstat_create("kvm", id, buf, "misc", KSTAT_TYPE_NAMED,
+	    sizeof (kvm_vcpu_stats_t) / sizeof (kstat_named_t),
+	    KSTAT_FLAG_VIRTUAL)) == NULL) {
+		return (NULL);
+	}
+
+	vcpu = kvm_x86_ops->vcpu_create(kvm, id);
+
+	if (vcpu == NULL) {
+		kstat_delete(kstat);
+		return (NULL);
+	}
+
+	vcpu->kvcpu_kstat = kstat;
+	vcpu->kvcpu_kstat->ks_data = &vcpu->kvcpu_stats;
+
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_id, "id");
+	vcpu->kvcpu_stats.kvmvs_id.value.ui64 = kvm->kvmid;
+
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_pid, "pid");
+	vcpu->kvcpu_stats.kvmvs_pid.value.ui64 = kvm->kvm_pid;
+
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_nmi_injections, "nmi-injections");
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_irq_injections, "irq-injections");
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_fpu_reload, "fpu-reload");
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_host_state_reload, "host-state-reload");
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_insn_emulation, "insn-emulation");
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_insn_emulation_fail,
+	    "insn-emulation-fail");
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_exits, "exits");
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_halt_exits, "halt-exits");
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_irq_exits, "irq-exits");
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_io_exits, "io-exits");
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_mmio_exits, "mmio-exits");
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_nmi_window_exits, "nmi-window-exits");
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_irq_window_exits, "irq-window-exits");
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_request_irq_exits, "request-irq-exits");
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_signal_exits, "signal-exits");
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_halt_wakeup, "halt-wakeup");
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_invlpg, "invlpg");
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_pf_guest, "pf-guest");
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_pf_fixed, "pf-fixed");
+	KVM_VCPU_KSTAT_INIT(vcpu, kvmvs_hypercalls, "hypercalls");
+
+	kstat_install(vcpu->kvcpu_kstat);
+
+	return (vcpu);
+}
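Once kstat_install() has run, these per-VCPU counters are visible from userland through the standard kstat tooling (module "kvm", name as constructed by the snprintf() above, instance = the VCPU id). For example, assuming a first VM whose kvmid produced the name vcpu-0:

	# kstat -m kvm -n vcpu-0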
 
 int
@@ -1016,94 +4825,243 @@ free_vcpu:
 	return (r);
 }
 
-void
-kvm_get_kvm(struct kvm *kvm)
+int
+kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
 {
-	atomic_inc_32(&kvm->users_count);
+	vcpu->arch.nmi_pending = 0;
+	vcpu->arch.nmi_injected = 0;
+
+	vcpu->arch.switch_db_regs = 0;
+	memset(vcpu->arch.db, 0, sizeof (vcpu->arch.db));
+	vcpu->arch.dr6 = DR6_FIXED_1;
+	vcpu->arch.dr7 = DR7_FIXED_1;
+
+	return (kvm_x86_ops->vcpu_reset(vcpu));
 }
 
-/*
- * Creates some virtual cpus. Good luck creating more than one.
- */
 int
-kvm_vm_ioctl_create_vcpu(struct kvm *kvm, uint32_t id, int *rval_p)
+kvm_arch_hardware_enable(void *garbage)
 {
-	int r, i;
-	struct kvm_vcpu *vcpu, *v;
-
-	vcpu = kvm_arch_vcpu_create(kvm, id);
-	if (vcpu == NULL)
-		return (EINVAL);
-
 #ifdef XXX
-	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
+	/*
+	 * Since this may be called from a hotplug notification,
+	 * we can't get the CPU frequency directly.
+	 */
+	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+		int cpu = raw_smp_processor_id();
+		per_cpu(cpu_tsc_khz, cpu) = 0;
+	}
 #else
 	XXX_KVM_PROBE;
 #endif
+	kvm_shared_msr_cpu_online();
 
-	r = kvm_arch_vcpu_setup(vcpu);
-	if (r)
-		return (r);
-
-	mutex_enter(&kvm->lock);
+	return (kvm_x86_ops->hardware_enable(garbage));
+}
 
-#ifdef XXX
-	if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
-#else
-	XXX_KVM_SYNC_PROBE;
-	if (kvm->online_vcpus == KVM_MAX_VCPUS) {
+void
+kvm_arch_hardware_disable(void *garbage)
+{
+	kvm_x86_ops->hardware_disable(garbage);
+#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+	drop_user_return_notifiers(garbage);
 #endif
-		r = EINVAL;
-		goto vcpu_destroy;
+}
+
+int
+kvm_arch_hardware_setup(void)
+{
+	return (kvm_x86_ops->hardware_setup());
+}
+
+void
+kvm_arch_check_processor_compat(void *rtn)
+{
+	kvm_x86_ops->check_processor_compatibility(rtn);
+}
+
+int
+kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
+{
+	page_t *page;
+	struct kvm *kvm;
+	int r;
+
+	kvm = vcpu->kvm;
+
+	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+
+	if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
+		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+	else
+		vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
+
+	/*
+	 * page = alloc_page(PAGESIZE, KM_SLEEP);
+	 * if (!page) {
+	 *	r = ENOMEM;
+	 *	goto fail;
+	 * }
+	 * vcpu->arch.pio_data = page_address(page);
+	 */
+	vcpu->arch.pio_data = (caddr_t)vcpu->run +
+	    (KVM_PIO_PAGE_OFFSET * PAGESIZE);
+
+	r = kvm_mmu_create(vcpu);
+	if (r < 0)
+		goto fail;
+
+	if (irqchip_in_kernel(kvm)) {
+		r = kvm_create_lapic(vcpu);
+		if (r < 0)
+			goto fail_mmu_destroy;
 	}
 
-	/* kvm_for_each_vcpu(r, v, kvm) */
-	for (i = 0; i < kvm->online_vcpus; i++) {
-		v = kvm->vcpus[i];
-		if (v->vcpu_id == id) {
-			r = -EEXIST;
-			goto vcpu_destroy;
-		}
+	vcpu->arch.mce_banks = kmem_zalloc(KVM_MAX_MCE_BANKS *
+	    sizeof (uint64_t) * 4, KM_SLEEP);
+
+	if (!vcpu->arch.mce_banks) {
+		r = ENOMEM;
+		goto fail_free_lapic;
+	}
+
+	vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
+
+	return (0);
+fail_free_lapic:
+	kvm_free_lapic(vcpu);
+fail_mmu_destroy:
+	kvm_mmu_destroy(vcpu);
+fail:
+	return (r);
+}
+
+void
+kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+	kmem_free(vcpu->arch.mce_banks, sizeof (uint64_t) * 4 *
+	    KVM_MAX_MCE_BANKS);
+	kvm_free_lapic(vcpu);
+	kvm_mmu_destroy(vcpu);
+}
+
+struct kvm *
+kvm_arch_create_vm(void)
+{
+	struct kvm *kvm = kmem_zalloc(sizeof (struct kvm), KM_SLEEP);
+
+	if (!kvm)
+		return (NULL);
+
+	if ((kvm->arch.aliases =
+	    kmem_zalloc(sizeof (struct kvm_mem_aliases), KM_SLEEP)) == NULL) {
+		kmem_free(kvm, sizeof (struct kvm));
+		return (NULL);
 	}
 
-	/* BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]); */
+	list_create(&kvm->arch.active_mmu_pages, sizeof (struct kvm_mmu_page),
+	    offsetof(struct kvm_mmu_page, link));
 
-	/* Now it's all set up, let userspace reach it */
-	kvm_get_kvm(kvm);
+	list_create(&kvm->arch.assigned_dev_head,
+	    sizeof (struct kvm_assigned_dev_kernel),
+	    offsetof(struct kvm_assigned_dev_kernel, list));
 
-	*rval_p = kvm->online_vcpus;  /* guarantee unique id */
-	vcpu->vcpu_id = *rval_p;
+	/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
+	set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
 
-	/* XXX need to protect online_vcpus */
-	kvm->vcpus[kvm->online_vcpus] = vcpu;
+	/* XXX - original is rdtscll() */
+	kvm->arch.vm_init_tsc = (uint64_t)gethrtime();
 
-#ifdef XXX
-	smp_wmb();
-#else
+	return (kvm);
+}
+
+static void
+kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
+{
+	vcpu_load(vcpu);
+	kvm_mmu_unload(vcpu);
+	vcpu_put(vcpu);
+}
+
+static void
+kvm_free_vcpus(struct kvm *kvmp)
+{
+	int ii, maxcpus;
+
+	maxcpus = kvmp->online_vcpus;
 	XXX_KVM_SYNC_PROBE;
-#endif
-	atomic_inc_32(&kvm->online_vcpus);
+	for (ii = 0; ii < maxcpus; ii++)
+		kvm_unload_vcpu_mmu(kvmp->vcpus[ii]);
 
-#ifdef CONFIG_KVM_APIC_ARCHITECTURE
-	if (kvm->bsp_vcpu_id == id)
-		kvm->bsp_vcpu = vcpu;
-#endif
+	for (ii = 0; ii < maxcpus; ii++)
+		kvm_arch_vcpu_free(kvmp->vcpus[ii]);
 
-	mutex_exit(&kvm->lock);
-	return (r);
+	mutex_enter(&kvmp->lock);
+	for (ii = 0; ii < maxcpus; ii++)
+		kvmp->vcpus[ii] = NULL;
+	kvmp->online_vcpus = 0;
+	mutex_exit(&kvmp->lock);
+}
+
+/*
+ * This function exists because of a difference in methodologies from our
+ * ancestor. With our ancestors, there is no impetus to clean up lists and
+ * mutexes. This is unfortunate, because they seem to even have debug kernels
+ * which would seemingly check for these kinds of things. But because in the
+ * common case mutex_exit is currently a #define to do {} while(0), it seems
+ * that they just ignore this.
+ *
+ * This leads to the following behavior: during our time we create a lot of
+ * auxiliary structs potentially related to pits, apics, etc. Tearing down
+ * these structures relies on having the correct locks, etc. However
+ * kvm_arch_destroy_vm() is designed to be the final death blow, i.e. it's
+ * doing the kmem_free. Logically these auxiliary structures need to be freed
+ * and dealt with before we go back and do the rest of the tear down related
+ * to the device.
+ */
+void
+kvm_arch_destroy_vm_comps(struct kvm *kvmp)
+{
+	if (kvmp == NULL)
+		return;
 
-vcpu_destroy:
+#ifdef IOMMU
+	kvm_iommu_unmap_guest(kvmp);
+#else
+	XXX_KVM_PROBE;
+#endif /* IOMMU */
+	kvm_free_pit(kvmp);
+	kvm_free_vcpus(kvmp);
+	kvm_free_physmem(kvmp);
 #ifdef XXX
-	mutex_exit(&kvm->lock);
-	kvm_arch_vcpu_destroy(vcpu);
+#ifdef APIC
+	if (kvm->arch.apic_access_page)
+		put_page(kvm->arch.apic_access_page);
+	if (kvm->arch.ept_identity_pagetable)
+		put_page(kvm->arch.ept_identity_pagetable);
+#endif /* APIC */
 #else
 	XXX_KVM_PROBE;
-#endif
-	return (r);
+#endif /* XXX */
+#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+	cleanup_srcu_struct(&kvm->srcu);
+#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
 }
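The ordering that the comment above argues for, sketched as a hypothetical caller (the wrapper name is illustrative; the real call site lives in the VM teardown path in kvm.c):

	static void
	kvm_teardown(struct kvm *kvmp)
	{
		/* free pit/vcpus/physmem while their locks still exist ... */
		kvm_arch_destroy_vm_comps(kvmp);
		/* ... then deliver the final death blow (see below) */
		kvm_arch_destroy_vm(kvmp);
	}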
 
+void
+kvm_arch_destroy_vm(struct kvm *kvmp)
+{
+	if (kvmp == NULL)
+		return;	/* nothing to do here */
 
-int kvm_arch_prepare_memory_region(struct kvm *kvm,
+	if (kvmp->arch.aliases) {
+		kmem_free(kvmp->arch.aliases, sizeof (struct kvm_mem_aliases));
+		kvmp->arch.aliases = NULL;
+	}
+	kmem_free(kvmp, sizeof (struct kvm));
+}
+
+int
+kvm_arch_prepare_memory_region(struct kvm *kvm,
 	struct kvm_memory_slot *memslot, struct kvm_memory_slot old,
 	struct kvm_userspace_memory_region *mem, int user_alloc)
 {
@@ -1162,171 +5120,154 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 	return (0);
 }
 
-/*
- * Allocate some memory and give it an address in the guest physical address
- * space.
- *
- * Discontiguous memory is allowed, mostly for framebuffers.
- *
- * Must be called holding mmap_sem for write.
- */
-int
-kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
-    struct kvm_userspace_memory_region *mem, int user_alloc)
+void
+kvm_arch_commit_memory_region(struct kvm *kvm,
+    struct kvm_userspace_memory_region *mem, struct kvm_memory_slot old,
+    int user_alloc)
 {
-	if (mem->slot >= KVM_MEMORY_SLOTS)
-		return (EINVAL);
-	return (kvm_set_memory_region(kvm, mem, user_alloc));
-}
+	int npages = mem->memory_size >> PAGESHIFT;
 
+	if (!user_alloc && !old.user_alloc && old.rmap && !npages) {
+		int ret = 0;
 
-/* Caller must hold slots_lock. */
-int
-kvm_io_bus_register_dev(struct kvm *kvm,
-    enum kvm_bus bus_idx, struct kvm_io_device *dev)
-{
-	struct kvm_io_bus *new_bus, *bus;
+#ifdef XXX
+		down_write(&current->mm->mmap_sem);
+		ret = munmap(old.userspace_addr,
+		    old.npages * PAGESIZE);
+		up_write(&current->mm->mmap_sem);
+#else
+		XXX_KVM_PROBE;
+		/* see comment in kvm_arch_prepare_memory_region */
+		/*
+		 * XXX this needs to be here, but I'm getting kernel heap
+		 * corruption panics with someone writing to a buffer after it
+		 * is freed
+		 */
+		kmem_free((caddr_t)old.userspace_addr, old.npages * PAGESIZE);
+#endif
+		if (ret < 0) {
+			cmn_err(CE_WARN, "kvm_vm_ioctl_set_memory_region: "
+			    "failed to munmap memory\n");
+		}
+	}
 
-	bus = kvm->buses[bus_idx];
-	if (bus->dev_count > NR_IOBUS_DEVS-1)
-		return (-ENOSPC);
+	mutex_enter(&kvm->mmu_lock);
+	if (!kvm->arch.n_requested_mmu_pages) {
+		unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
+		kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
+	}
+
+	kvm_mmu_slot_remove_write_access(kvm, mem->slot);
+	mutex_exit(&kvm->mmu_lock);
+}
 
-	new_bus = kmem_zalloc(sizeof (struct kvm_io_bus), KM_SLEEP);
-	if (!new_bus)
-		return (-ENOMEM);
-	memcpy(new_bus, bus, sizeof (struct kvm_io_bus));
-	new_bus->devs[new_bus->dev_count++] = dev;
+void
+kvm_arch_flush_shadow(struct kvm *kvm)
+{
+	kvm_mmu_zap_all(kvm);
 #ifdef XXX
-	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
-	synchronize_srcu_expedited(&kvm->srcu);
+	kvm_reload_remote_mmus(kvm);
 #else
 	XXX_KVM_PROBE;
-	kvm->buses[bus_idx] = new_bus;
 #endif
-	if (bus)
-		kmem_free(bus, sizeof (struct kvm_io_bus));
-
-	return (0);
 }
 
-/* Caller must hold slots_lock. */
 int
-kvm_io_bus_unregister_dev(struct kvm *kvm,
-    enum kvm_bus bus_idx, struct kvm_io_device *dev)
+kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
-	int i, r;
-	struct kvm_io_bus *new_bus, *bus;
+	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE ||
+	    vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
+	    vcpu->arch.nmi_pending ||
+	    (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu)));
+}
 
-	new_bus = kmem_zalloc(sizeof (struct kvm_io_bus), KM_SLEEP);
-	if (!new_bus)
-		return (-ENOMEM);
+void
+kvm_vcpu_kick(struct kvm_vcpu *vcpu)
+{
+	processorid_t cpu = vcpu->cpu;
 
-	bus = kvm->buses[bus_idx];
-	memcpy(new_bus, bus, sizeof (struct kvm_io_bus));
+	mutex_enter(&vcpu->kvcpu_kick_lock);
 
-	r = -ENOENT;
-	for (i = 0; i < new_bus->dev_count; i++) {
-		if (new_bus->devs[i] == dev) {
-			r = 0;
-			new_bus->devs[i] = new_bus->devs[--new_bus->dev_count];
-			break;
-		}
-	}
+	if (CV_HAS_WAITERS(&vcpu->kvcpu_kick_cv))
+		KVM_VCPU_KSTAT_INC(vcpu, kvmvs_halt_wakeup);
 
-	if (r) {
-		kmem_free(new_bus, sizeof (struct kvm_io_bus));
-		return (r);
-	}
+	cv_broadcast(&vcpu->kvcpu_kick_cv);
+	mutex_exit(&vcpu->kvcpu_kick_lock);
 
-#ifdef XXX
-	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
-	synchronize_srcu_expedited(&kvm->srcu);
-#else
-	XXX_KVM_SYNC_PROBE;
-	kvm->buses[bus_idx] = new_bus;
-#endif
-	kmem_free(bus, sizeof (struct kvm_io_bus));
-	return (r);
+	if (cpu != CPU->cpu_id && cpu != -1) {
+		if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests)) {
+			/*
+			 * If we haven't already kicked this VCPU, we'll poke
+			 * the CPU on which it's running.  (This will serve
+			 * to induce a VM exit.)
+			 */
+			poke_cpu(cpu);
+		}
+	}
 }
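kvm_vcpu_kick() pairs with a halted VCPU sleeping on kvcpu_kick_cv. A hedged sketch of that waiter side, assuming a kvm_vcpu_block()-style loop elsewhere in the driver (the function name and details here are illustrative, not the actual implementation):

	static void
	vcpu_halt_wait(struct kvm_vcpu *vcpu)
	{
		mutex_enter(&vcpu->kvcpu_kick_lock);
		while (!kvm_arch_vcpu_runnable(vcpu)) {
			/* cv_broadcast() in kvm_vcpu_kick() wakes us */
			cv_wait(&vcpu->kvcpu_kick_cv, &vcpu->kvcpu_kick_lock);
		}
		mutex_exit(&vcpu->kvcpu_kick_lock);
	}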
 
-long
-kvm_vm_ioctl(struct kvm *kvmp, unsigned int ioctl, unsigned long arg, int mode)
+int
+kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
-	void *argp = (void *)arg;
-	int r;
-	proc_t *p;
+	return (kvm_x86_ops->interrupt_allowed(vcpu));
+}
 
-	if (kvmp->mm != curproc->p_as)
-		return (EIO);
+unsigned long
+kvm_get_rflags(struct kvm_vcpu *vcpu)
+{
+	unsigned long rflags;
 
-	switch (ioctl) {
-#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
-	case KVM_REGISTER_COALESCED_MMIO: {
-		struct kvm_coalesced_mmio_zone zone;
-		r = EFAULT;
-		if (copyin(argp, &zone, sizeof (zone)))
-			goto out;
-		r = ENXIO;
-		r = kvm_vm_ioctl_register_coalesced_mmio(kvmp, &zone);
-		if (r)
-			goto out;
-		r = 0;
-		break;
-	}
-	case KVM_UNREGISTER_COALESCED_MMIO: {
-		struct kvm_coalesced_mmio_zone zone;
-		r = EFAULT;
-		if (copyin(argp, &zone, sizeof (zone)))
-			goto out;
-		r = ENXIO;
-		r = kvm_vm_ioctl_unregister_coalesced_mmio(kvmp, &zone);
-		if (r)
-			goto out;
-		r = 0;
-		break;
-	}
-#endif
-#ifdef XXX_KVM_DECLARATION
-	case KVM_IRQFD: {
-		struct kvm_irqfd data;
+	rflags = kvm_x86_ops->get_rflags(vcpu);
 
-		if (ddi_copyin(argp, &data, sizeof (data), mode))
-			return (EFAULT);
-		r = kvm_irqfd(kvmp, data.fd, data.gsi, data.flags);
-		break;
-	}
+	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+		rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF);
 
-	case KVM_IOEVENTFD: {
-		struct kvm_ioeventfd data;
+	return (rflags);
+}
 
-		r = -EFAULT;
-		if (copy_from_user(&data, argp, sizeof (data)))
-			goto out;
-		r = kvm_ioeventfd(kvmp, &data);
-		break;
+void
+kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
+	    vcpu->arch.singlestep_cs == get_segment_selector(vcpu,
+	    VCPU_SREG_CS) && vcpu->arch.singlestep_rip == kvm_rip_read(vcpu)) {
+		rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
 	}
-#endif
-	default:
-		return (EINVAL);
-	}
+	kvm_x86_ops->set_rflags(vcpu, rflags);
+}
 
-out:
-	return (r);
+inline gpa_t
+gfn_to_gpa(gfn_t gfn)
+{
+	return ((gpa_t)gfn << PAGESHIFT);
 }
 
-int
-kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
+/*
+ * For pages for which vmx needs physical addresses,
+ * Linux allocates pages from an area that maps virtual
+ * addresses 1-1 with physical memory. In this way,
+ * translating virtual to physical just involves subtracting
+ * the start of the area from the virtual address.
+ * This Solaris version uses kmem_alloc, so there is no
+ * direct mapping of virtual to physical. We'll change this
+ * later if performance is an issue. For now, we'll use
+ * hat_getpfnum() to do the conversion. Also note that
+ * we're assuming 64-bit address space (we won't run on
+ * 32-bit hardware).
+ */
+uint64_t
+kvm_va2pa(caddr_t va)
 {
-	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE ||
-	    vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
-	    vcpu->arch.nmi_pending ||
-	    (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu)));
+	uint64_t pa;
+
+	pa = (hat_getpfnum(kas.a_hat, va)<<PAGESHIFT)|((uint64_t)va&PAGEOFFSET);
+	return (pa);
 }
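A hedged usage sketch for kvm_va2pa() (the helper name is hypothetical): since the pfn is shifted up and the intra-page offset is OR'd back in, the low bits of the virtual address survive translation, which a caller can sanity-check:

	static void
	check_va2pa(caddr_t va)
	{
		uint64_t pa = kvm_va2pa(va);

		/* the intra-page offset of va is preserved in pa */
		ASSERT((pa & PAGEOFFSET) == ((uintptr_t)va & PAGEOFFSET));
	}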
 
 void
-kvm_reload_remote_mmus(struct kvm *kvm)
+kvm_migrate_timers(struct kvm_vcpu *vcpu)
 {
-	make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
+	set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests);
 }
diff --git a/kvm_x86host.h b/kvm_x86host.h
index 96056b6..6549ac4 100644
--- a/kvm_x86host.h
+++ b/kvm_x86host.h
@@ -16,6 +16,10 @@
 #define	offsetof(s, m) ((size_t)(&((s *)0)->m))
 #endif
 
+#define	MCG_CTL_P (1ULL<<8)	/* MCG_CTL register available */
+#define	KVM_MAX_MCE_BANKS 32
+#define	KVM_MCE_CAP_SUPPORTED MCG_CTL_P
+
 #define	KVM_MAX_VCPUS 64
 #define	KVM_MEMORY_SLOTS 32
 /* memory slots that are not exposed to userspace */
@@ -741,7 +745,6 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
 void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
-void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
 
 int kvm_get_msr_common(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata);
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data);
diff --git a/kvm_x86impl.h b/kvm_x86impl.h
index 4e90237..b7c5726 100644
--- a/kvm_x86impl.h
+++ b/kvm_x86impl.h
@@ -29,7 +29,6 @@ inline int is_paging(struct kvm_vcpu *vcpu);
 caddr_t page_address(page_t *page);
 extern page_t *alloc_page(size_t, int);
 extern uint64_t kvm_va2pa(caddr_t va);
-extern void bitmap_zero(unsigned long *, int);
 extern page_t *pfn_to_page(pfn_t);
 extern int zero_constructor(void *, void *, int);
 
@@ -39,56 +38,6 @@ typedef void (*kvm_xcall_t)(void *);
 extern void kvm_xcall(processorid_t cpu, kvm_xcall_t func, void *arg);
 extern int kvm_xcall_func(kvm_xcall_t func, void *arg);
 
-/*
- * All the following definitions are ones that are expected to just be in
- * x86/x86.c by Linux. However we currently have the things that need them
- * spread out across two files. For now we are putting them here, but this
- * should not last very long.
- */
-#define	KVM_NR_SHARED_MSRS 16
-
-typedef struct kvm_shared_msrs_global {
-	int nr;
-	uint32_t msrs[KVM_NR_SHARED_MSRS];
-} kvm_shared_msrs_global_t;
-
-struct kvm_vcpu;
-
-typedef struct kvm_user_return_notifier {
-	void (*on_user_return)(struct kvm_vcpu *,
-	    struct kvm_user_return_notifier *);
-} kvm_user_return_notifier_t;
-
-typedef struct kvm_shared_msrs {
-	struct kvm_user_return_notifier urn;
-	int registered;
-	struct kvm_shared_msr_values {
-		uint64_t host;
-		uint64_t curr;
-	} values[KVM_NR_SHARED_MSRS];
-} kvm_shared_msrs_t;
-
-/*
- * fxsave fpu state. Taken from x86_64/processor.h. To be killed when
- * we have asm/x86/processor.h
- */
-typedef struct fxsave {
-	uint16_t	cwd;
-	uint16_t	swd;
-	uint16_t	twd;
-	uint16_t	fop;
-	uint64_t	rip;
-	uint64_t	rdp;
-	uint32_t	mxcsr;
-	uint32_t	mxcsr_mask;
-	uint32_t	st_space[32];	/* 8*16 bytes for each FP-reg = 128 bytes */
-#ifdef CONFIG_X86_64
-	uint32_t	xmm_space[64];	/* 16*16 bytes for each XMM-reg = 256 bytes */
-#else
-	uint32_t	xmm_space[32];	/* 8*16 bytes for each XMM-reg = 128 bytes */
-#endif
-} fxsave_t;
-
 unsigned long native_read_cr0(void);
 #define	read_cr0()	(native_read_cr0())
 unsigned long native_read_cr4(void);
@@ -96,11 +45,11 @@ unsigned long native_read_cr4(void);
 unsigned long native_read_cr3(void);
 #define	read_cr3()	(native_read_cr3())
 
-uint32_t bit(int bitno);
-inline unsigned long get_desc_limit(const struct desc_struct *desc);
-unsigned long get_desc_base(const struct desc_struct *desc);
-
 inline page_t *compound_head(page_t *page);
 inline void get_page(page_t *page);
+inline unsigned long get_desc_limit(const struct desc_struct *desc);
+
+extern unsigned long get_desc_base(const struct desc_struct *);
+uint32_t bit(int);
 
 #endif