author     max <max@maxpad.(none)>    2010-12-03 19:19:31 +0100
committer  max <max@maxpad.(none)>    2010-12-03 19:19:31 +0100
commit     aaf4078a2967dbd67bf0efad9c3f4b81ab35e665 (patch)
tree       5bfa0a8d72f2fa2b5c8f3b38880e0d3eb5ce01d4
parent     00233f503e3241dd6361421e306acbba7454c99f (diff)
download   illumos-kvm-aaf4078a2967dbd67bf0efad9c3f4b81ab35e665.tar.gz
Lots of new code, and lots of code turned on. Doesn't compile...
-rw-r--r--  Makefile       13
-rw-r--r--  kvm.c        6799
-rw-r--r--  kvm.h         683
-rw-r--r--  kvm_host.h   1201
-rw-r--r--  kvm_x86.c    1144
-rw-r--r--  msr.h          37

6 files changed, 8115 insertions, 1762 deletions
@@ -9,16 +9,19 @@ LD=/usr/bin/ld CTFCONVERT=$(KERNEL_SOURCE)/usr/src/tools/proto/opt/onbld/bin/i386/ctfconvert CTFMERGE=$(KERNEL_SOURCE)/usr/src/tools/proto/opt/onbld/bin/i386/ctfmerge -CFLAGS += -D_KERNEL -D_MACHDEP -Dx86 -DCONFIG_X86_64 -DDEBUG -c -O -g -INCLUDEDIR= -I $(KERNEL_SOURCE)/usr/src/uts/intel -I $(KERNEL_SOURCE)/usr/src/uts/i86pc +CFLAGS += -D_KERNEL -D_MACHDEP -Dx86 -DCONFIG_X86_64 -DDEBUG -c -g -DCONFIG_SOLARIS -DCONFIG_KVM_MMIO -kvm: kvm.c kvm_x86.c kvm.h +INCLUDEDIR= -I $(KERNEL_SOURCE)/usr/src/uts/intel -I $(KERNEL_SOURCE)/usr/src/uts/i86pc -I $(KERNEL_SOURCE)/usr/src/uts/common + +kvm: kvm.c kvm_x86.c emulate.c kvm.h kvm_x86host.h $(CC) $(CFLAGS) $(INCLUDEDIR) kvm.c $(CC) $(CFLAGS) $(INCLUDEDIR) kvm_x86.c + $(CC) $(CFLAGS) $(INCLUDEDIR) emulate.c $(CTFCONVERT) -i -L VERSION kvm.o $(CTFCONVERT) -i -L VERSION kvm_x86.o - $(LD) -r -o kvm kvm.o kvm_x86.o - $(CTFMERGE) -L VERSION -o kvm kvm.o kvm_x86.o + $(CTFCONVERT) -i -L VERSION emulate.o + $(LD) -r -o kvm kvm.o kvm_x86.o emulate.o + $(CTFMERGE) -L VERSION -o kvm kvm.o kvm_x86.o emulate.o install: kvm @echo "==> Installing kvm module" @@ -24,7 +24,15 @@ #include "msr.h" #include "irqflags.h" #include "kvm_host.h" +#include "kvm_x86host.h" +#include "processor-flags.h" +#include "hyperv.h" +#include "apicdef.h" +#include "segment.h" +#include "iodev.h" #include "kvm.h" +#include "irq.h" +#include "tss.h" int kvmid; /* monotonically increasing, unique per vm */ int largepages_enabled = 1; @@ -126,6 +134,69 @@ extern void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); static int vmx_set_tss_addr(struct kvm *kvmp, uintptr_t addr); static int vmx_hardware_setup(void); extern int vmx_hardware_enable(void *garbage); +extern unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu); +void vmcs_writel(unsigned long field, unsigned long value); +unsigned long vmcs_readl(unsigned long field); +extern void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); +void vmx_get_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg); +static void vmx_set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg); +static void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr); +static int vmx_get_msr(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t *pdata); +static int vmx_set_msr(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t data); +static void vmx_vcpu_run(struct kvm_vcpu *vcpu); +static void vmx_save_host_state(struct kvm_vcpu *vcpu); + +struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) +{ + return container_of(vcpu, struct vcpu_vmx, vcpu); +} + +static int vmx_handle_exit(struct kvm_vcpu *vcpu); +int vmx_interrupt_allowed(struct kvm_vcpu *vcpu); +static int vmx_get_lpage_level(void); +static int vmx_rdtscp_supported(void); +void vmx_set_efer(struct kvm_vcpu *vcpu, uint64_t efer); +static uint64_t vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg); +static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt); +static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt); +static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt); +static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt); +static int vmx_get_cpl(struct kvm_vcpu *vcpu); +int get_ept_level(void); + +static void vmx_flush_tlb(struct kvm_vcpu *vcpu) +{ +#ifdef XXX + vpid_sync_vcpu_all(to_vmx(vcpu)); + if (enable_ept) + ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); +#endif +} + +static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +{ 
+ unsigned long guest_cr3; + uint64_t eptp; + + guest_cr3 = cr3; +#ifdef XXX + if (enable_ept) { + /* + * ept not implemented right now... + */ + eptp = construct_eptp(cr3); + vmcs_write64(EPT_POINTER, eptp); + guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 : + vcpu->kvm->arch.ept_identity_map_addr; + ept_load_pdptrs(vcpu); + } +#endif /*XXX*/ + + vmx_flush_tlb(vcpu); + vmcs_writel(GUEST_CR3, guest_cr3); +} static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = nulldev/*cpu_has_kvm_support*/, @@ -141,38 +212,38 @@ static struct kvm_x86_ops vmx_x86_ops = { .vcpu_free = nulldev /*vmx_free_vcpu*/, .vcpu_reset = vmx_vcpu_reset, - .prepare_guest_switch = nulldev /*vmx_save_host_state*/, + .prepare_guest_switch = vmx_save_host_state /*vmx_save_host_state*/, .vcpu_load = vmx_vcpu_load, .vcpu_put = vmx_vcpu_put, .set_guest_debug = nulldev /*set_guest_debug*/, - .get_msr = nulldev /*vmx_get_msr*/, - .set_msr = nulldev /*vmx_set_msr*/, - .get_segment_base = nulldev /*vmx_get_segment_base*/, - .get_segment = nulldev /*vmx_get_segment*/, - .set_segment = nulldev /*vmx_set_segment*/, - .get_cpl = nulldev /*vmx_get_cpl*/, + .get_msr = vmx_get_msr /*vmx_get_msr*/, + .set_msr = vmx_set_msr /*vmx_set_msr*/, + .get_segment_base = vmx_get_segment_base /*vmx_get_segment_base*/, + .get_segment = vmx_get_segment /*vmx_get_segment*/, + .set_segment = vmx_set_segment /*vmx_set_segment*/, + .get_cpl = vmx_get_cpl /*vmx_get_cpl*/, .get_cs_db_l_bits = nulldev /*vmx_get_cs_db_l_bits*/, .decache_cr0_guest_bits = nulldev /*vmx_decache_cr0_guest_bits*/, .decache_cr4_guest_bits = nulldev /*vmx_decache_cr4_guest_bits*/, .set_cr0 = vmx_set_cr0, - .set_cr3 = nulldev /*vmx_set_cr3*/, + .set_cr3 = vmx_set_cr3 /*vmx_set_cr3*/, .set_cr4 = vmx_set_cr4, - .set_efer = nulldev /*vmx_set_efer*/, - .get_idt = nulldev /*vmx_get_idt*/, - .set_idt = nulldev /*vmx_set_idt*/, - .get_gdt = nulldev /*vmx_get_gdt*/, - .set_gdt = nulldev /*vmx_set_gdt*/, + .set_efer = vmx_set_efer /*vmx_set_efer*/, + .get_idt = vmx_get_idt /*vmx_get_idt*/, + .set_idt = vmx_set_idt /*vmx_set_idt*/, + .get_gdt = vmx_get_gdt /*vmx_get_gdt*/, + .set_gdt = vmx_set_gdt /*vmx_set_gdt*/, .cache_reg = nulldev /*vmx_cache_reg*/, - .get_rflags = nulldev /*vmx_get_rflags*/, - .set_rflags = nulldev /*vmx_set_rflags*/, + .get_rflags = vmx_get_rflags /*vmx_get_rflags*/, + .set_rflags = vmx_set_rflags /*vmx_set_rflags*/, .fpu_activate = nulldev /*vmx_fpu_activate*/, .fpu_deactivate = nulldev /*vmx_fpu_deactivate*/, .tlb_flush = nulldev /*vmx_flush_tlb*/, - .run = nulldev /*vmx_vcpu_run*/, - .handle_exit = nulldev /*vmx_handle_exit*/, + .run = vmx_vcpu_run /*vmx_vcpu_run*/, + .handle_exit = vmx_handle_exit /*vmx_handle_exit*/, .skip_emulated_instruction = nulldev /*skip_emulated_instruction*/, .set_interrupt_shadow = nulldev /*vmx_set_interrupt_shadow*/, .get_interrupt_shadow = nulldev /*vmx_get_interrupt_shadow*/, @@ -180,28 +251,62 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_irq = nulldev /*vmx_inject_irq*/, .set_nmi = nulldev /*vmx_inject_nmi*/, .queue_exception = nulldev /*vmx_queue_exception*/, - .interrupt_allowed = nulldev /*vmx_interrupt_allowed*/, + .interrupt_allowed = vmx_interrupt_allowed /*vmx_interrupt_allowed*/, .nmi_allowed = nulldev /*vmx_nmi_allowed*/, .get_nmi_mask = nulldev /*vmx_get_nmi_mask*/, .set_nmi_mask = nulldev /*vmx_set_nmi_mask*/, .enable_nmi_window = nulldev /*enable_nmi_window*/, .enable_irq_window = nulldev /*enable_irq_window*/, - .update_cr8_intercept = nulldev /*update_cr8_intercept*/, + .update_cr8_intercept = 
vmx_update_cr8_intercept /*update_cr8_intercept*/, .set_tss_addr = vmx_set_tss_addr, - .get_tdp_level = nulldev /*get_ept_level*/, + .get_tdp_level = get_ept_level /*get_ept_level*/, .get_mt_mask = nulldev /*vmx_get_mt_mask*/, .exit_reasons_str = nulldev /*vmx_exit_reasons_str*/, - .get_lpage_level = nulldev /*vmx_get_lpage_level*/, + .get_lpage_level = vmx_get_lpage_level /*vmx_get_lpage_level*/, .cpuid_update = nulldev /*vmx_cpuid_update*/, - .rdtscp_supported = nulldev /*vmx_rdtscp_supported*/, + .rdtscp_supported = vmx_rdtscp_supported /*vmx_rdtscp_supported*/, }; struct kvm_x86_ops *kvm_x86_ops; +uint32_t vmcs_read32(unsigned long field) +{ + return vmcs_readl(field); +} + +void vmcs_write32(unsigned long field, uint32_t value) +{ + vmcs_writel(field, value); +} + +static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +{ + dt->limit = vmcs_read32(GUEST_IDTR_LIMIT); + dt->base = vmcs_readl(GUEST_IDTR_BASE); +} + +static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +{ + vmcs_write32(GUEST_IDTR_LIMIT, dt->limit); + vmcs_writel(GUEST_IDTR_BASE, dt->base); +} + +static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +{ + dt->limit = vmcs_read32(GUEST_GDTR_LIMIT); + dt->base = vmcs_readl(GUEST_GDTR_BASE); +} + +static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +{ + vmcs_write32(GUEST_GDTR_LIMIT, dt->limit); + vmcs_writel(GUEST_GDTR_BASE, dt->base); +} + /* * In linux, there is a separate vmx kernel module from the kvm driver. * That may be a good idea, but we're going to do everything in @@ -212,8 +317,8 @@ struct kvm_x86_ops *kvm_x86_ops; struct vmcs **vmxarea; /* 1 per cpu */ -static int alloc_kvm_area(void) -{ +static int alloc_kvm_area(void){ + int i, j; /* @@ -244,6 +349,39 @@ static int alloc_kvm_area(void) extern struct vmcs_config vmcs_config; +static int adjust_vmx_controls(uint32_t ctl_min, uint32_t ctl_opt, + uint32_t msr, uint32_t *result) +{ + uint32_t vmx_msr_low, vmx_msr_high; + uint32_t ctl = ctl_min | ctl_opt; + + rdmsr(msr, vmx_msr_low, vmx_msr_high); + + ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ + ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ + + /* Ensure minimum (required) set of control bits are supported. 
*/ + if (ctl_min & ~ctl) + return EIO; + + *result = ctl; + return DDI_SUCCESS; +} + +/* Pure 2^n version of get_order */ +static inline int get_order(unsigned long size) +{ + int order; + + size = (size - 1) >> (PAGESHIFT - 1); + order = -1; + do { + size >>= 1; + order++; + } while (size); + return order; +} + static int setup_vmcs_config(struct vmcs_config *vmcs_conf) { uint32_t vmx_msr_low, vmx_msr_high; @@ -254,12 +392,11 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf) uint32_t _vmexit_control = 0; uint32_t _vmentry_control = 0; -#ifdef XXX min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; opt = PIN_BASED_VIRTUAL_NMIS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, - &_pin_based_exec_control) < 0) - return -EIO; + &_pin_based_exec_control) != DDI_SUCCESS) + return EIO; min = CPU_BASED_HLT_EXITING | #ifdef CONFIG_X86_64 @@ -278,8 +415,8 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf) CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, - &_cpu_based_exec_control) < 0) - return -EIO; + &_cpu_based_exec_control) != DDI_SUCCESS) + return EIO; #ifdef CONFIG_X86_64 if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING & @@ -296,8 +433,8 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_RDTSCP; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, - &_cpu_based_2nd_exec_control) < 0) - return -EIO; + &_cpu_based_2nd_exec_control) != DDI_SUCCESS) + return EIO; } #ifndef CONFIG_X86_64 if (!(_cpu_based_2nd_exec_control & @@ -320,15 +457,14 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf) #endif opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, - &_vmexit_control) < 0) - return -EIO; + &_vmexit_control) != DDI_SUCCESS) + return EIO; min = 0; opt = VM_ENTRY_LOAD_IA32_PAT; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, - &_vmentry_control) < 0) - return -EIO; -#endif /*XXX*/ + &_vmentry_control) != DDI_SUCCESS) + return EIO; rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); @@ -347,29 +483,100 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf) return EIO; vmcs_conf->size = vmx_msr_high & 0x1fff; -#ifdef XXX vmcs_conf->order = get_order(vmcs_config.size); -#endif vmcs_conf->revision_id = vmx_msr_low; -#ifdef XXX vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; vmcs_conf->vmexit_ctrl = _vmexit_control; vmcs_conf->vmentry_ctrl = _vmentry_control; -#endif + return 0; } +/* EFER defaults: + * - enable syscall per default because its emulated by KVM + * - enable LME and LMA per default on 64 bit KVM + */ +#ifdef CONFIG_X86_64 +static uint64_t efer_reserved_bits = 0xfffffffffffffafeULL; +#else +static uint64_t efer_reserved_bits = 0xfffffffffffffffeULL; +#endif + +static int bypass_guest_pf = 1; +int enable_vpid = 1; +static int flexpriority_enabled = 1; +int enable_ept = 0; +int enable_unrestricted_guest = 1; +int emulate_invalid_guest_state = 0; + +void kvm_enable_efer_bits(uint64_t mask) +{ + efer_reserved_bits &= ~mask; +} + +static inline int cpu_has_vmx_vpid(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENABLE_VPID; +} + +static inline int cpu_has_vmx_ept(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + 
SECONDARY_EXEC_ENABLE_EPT; +} +static inline int cpu_has_vmx_unrestricted_guest(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_UNRESTRICTED_GUEST; +} + +static inline int cpu_has_vmx_tpr_shadow(void) +{ + return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW; +} + +static inline int cpu_has_vmx_virtualize_apic_accesses(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; +} + +static inline int cpu_has_vmx_flexpriority(void) +{ + return cpu_has_vmx_tpr_shadow() && + cpu_has_vmx_virtualize_apic_accesses(); +} + +static inline int cpu_has_vmx_ept_2m_page(void) +{ + return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT); +} + +void kvm_disable_largepages(void) +{ + largepages_enabled = 0; +} + +static inline int cpu_has_vmx_ple(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_PAUSE_LOOP_EXITING; +} + static int vmx_hardware_setup(void) { - if (setup_vmcs_config(&vmcs_config) < 0) + if (setup_vmcs_config(&vmcs_config) != DDI_SUCCESS) return EIO; #ifdef XXX if (boot_cpu_has(X86_FEATURE_NX)) kvm_enable_efer_bits(EFER_NX); +#endif /*XXX*/ + if (!cpu_has_vmx_vpid()) enable_vpid = 0; @@ -390,10 +597,11 @@ static int vmx_hardware_setup(void) if (enable_ept && !cpu_has_vmx_ept_2m_page()) kvm_disable_largepages(); - +#ifdef XXX if (!cpu_has_vmx_ple()) ple_gap = 0; -#endif /*XXX*/ +#endif + return alloc_kvm_area(); } @@ -402,42 +610,520 @@ int kvm_arch_hardware_setup(void) return kvm_x86_ops->hardware_setup(); } -int kvm_mmu_module_init(void) +struct kmem_cache *pte_chain_cache; +struct kmem_cache *rmap_desc_cache; +struct kmem_cache *mmu_page_header_cache; + +int tdp_enabled = 0; + +#define PT_WRITABLE_SHIFT 1 +#define PT_PRESENT_MASK (1ULL << 0) +#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(uint64_t)(PAGESIZE-1)) +#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT) +#define PT_USER_MASK (1ULL << 2) +#define ACC_EXEC_MASK 1 +#define ACC_WRITE_MASK PT_WRITABLE_MASK +#define ACC_USER_MASK PT_USER_MASK +#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) + +static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, + size_t size) { + void *p; + + p = mc->objects[--mc->nobjs]; + return p; +} + +static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, + uint64_t *parent_pte) +{ + struct kvm_mmu_page *sp; + + sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp); + sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGESIZE); + sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGESIZE); + set_page_private(virt_to_page(sp->spt), (unsigned long)sp); + list_insert_head(&vcpu->kvm->arch.active_mmu_pages, sp); #ifdef XXX + /* XXX don't see this used anywhere */ + INIT_LIST_HEAD(&sp->oos_link); +#endif /*XXX*/ + bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); + sp->multimapped = 0; + sp->parent_pte = parent_pte; + --vcpu->kvm->arch.n_free_mmu_pages; + return sp; +} + +typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp); + +struct kvm_mmu_page * +shadow_hpa_to_kvmpage(hpa_t shadow_page) +{ + /* + * XXX - We'll probably need a faster way to do this... 
+ * For right now, search all kvm_mmu_page for matching hpa + */ + +} + +struct kvm_mmu_page * +page_header(hpa_t shadow_page) +{ + return (struct kvm_mmu_page *)shadow_hpa_to_kvmpage(shadow_page); +} + +static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + mmu_parent_walk_fn fn) +{ + struct kvm_pte_chain *pte_chain; + struct hlist_node *node; + struct kvm_mmu_page *parent_sp; + int i; + + if (!sp->multimapped && sp->parent_pte) { + parent_sp = page_header(__pa(sp->parent_pte)); + fn(vcpu, parent_sp); + mmu_parent_walk(vcpu, parent_sp, fn); + return; + } + for(pte_chain = list_head(sp->parent_ptes); pte_chain; + pte_chain = list_next(sp->parent_ptes, pte_chain)) { + for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { + if (!pte_chain->parent_ptes[i]) + break; + parent_sp = page_header(__pa(pte_chain->parent_ptes[i])); + fn(vcpu, parent_sp); + mmu_parent_walk(vcpu, parent_sp, fn); + } + } +} + +static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp) +{ + mmu_parent_walk(vcpu, sp, unsync_walk_fn); + kvm_mmu_update_parents_unsync(sp); +} + +static unsigned kvm_page_table_hashfn(gfn_t gfn) +{ + return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1); +} + +static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, + gfn_t gfn, + gva_t gaddr, + unsigned level, + int direct, + unsigned access, + uint64_t *parent_pte) +{ + union kvm_mmu_page_role role; + unsigned index; + unsigned quadrant; + struct hlist_head *bucket; + struct kvm_mmu_page *sp; + struct hlist_node *node, *tmp; + + role = vcpu->arch.mmu.base_role; + role.level = level; + role.direct = direct; + role.access = access; + if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { + quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); + quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; + role.quadrant = quadrant; + } + index = kvm_page_table_hashfn(gfn); + bucket = &vcpu->kvm->arch.mmu_page_hash[index]; + for (sp = list_head(&vcpu->kvm->arch.mmu_page_hash[index]); sp; + sp = list_next(&vcpu->kvm->arch.mmu_page_hash[index], sp)) { + if (sp->gfn == gfn) { + if (sp->unsync) + if (kvm_sync_page(vcpu, sp)) + continue; + + if (sp->role.word != role.word) + continue; + + mmu_page_add_parent_pte(vcpu, sp, parent_pte); + if (sp->unsync_children) { + BT_SET(&vcpu->requests, KVM_REQ_MMU_SYNC); + kvm_mmu_mark_parents_unsync(vcpu, sp); + } + return sp; + } + } +#ifdef XXX + ++vcpu->kvm->stat.mmu_cache_miss; +#endif + sp = kvm_mmu_alloc_page(vcpu, parent_pte); + if (!sp) + return sp; + sp->gfn = gfn; + sp->role = role; + list_insert_head(bucket, &sp); + if (!direct) { + if (rmap_write_protect(vcpu->kvm, gfn)) + kvm_flush_remote_tlbs(vcpu->kvm); +#ifdef XXX + account_shadowed(vcpu->kvm, gfn); +#endif /*XXX*/ + } + if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) + vcpu->arch.mmu.prefetch_page(vcpu, sp); + else + nonpaging_prefetch_page(vcpu, sp); +#ifdef XXX + trace_kvm_mmu_get_page(sp, true); +#endif /*XXX*/ + return sp; +} + +static int mmu_alloc_roots(struct kvm_vcpu *vcpu) +{ + int i; + gfn_t root_gfn; + struct kvm_mmu_page *sp; + int direct = 0; + uint64_t pdptr; + + root_gfn = vcpu->arch.cr3 >> PAGESHIFT; + + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + hpa_t root = vcpu->arch.mmu.root_hpa; + + ASSERT(!VALID_PAGE(root)); + if (tdp_enabled) + direct = 1; + if (mmu_check_root(vcpu, root_gfn)) + return 1; + sp = kvm_mmu_get_page(vcpu, root_gfn, 0, + PT64_ROOT_LEVEL, direct, + ACC_ALL, NULL); + root = kvm_va2pa(sp->spt); + ++sp->root_count; + 
vcpu->arch.mmu.root_hpa = root; + return 0; + } + direct = !is_paging(vcpu); + if (tdp_enabled) + direct = 1; + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->arch.mmu.pae_root[i]; + + ASSERT(!VALID_PAGE(root)); + if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { + pdptr = kvm_pdptr_read(vcpu, i); + if (!is_present_gpte(pdptr)) { + vcpu->arch.mmu.pae_root[i] = 0; + continue; + } + root_gfn = pdptr >> PAGESHIFT; + } else if (vcpu->arch.mmu.root_level == 0) + root_gfn = 0; + if (mmu_check_root(vcpu, root_gfn)) + return 1; + sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, + PT32_ROOT_LEVEL, direct, + ACC_ALL, NULL); + root = __pa(sp->spt); + ++sp->root_count; + vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; + } + vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); + return 0; +} + +static void mmu_sync_roots(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_mmu_page *sp; + + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return; + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + hpa_t root = vcpu->arch.mmu.root_hpa; + sp = page_header(root); + mmu_sync_children(vcpu, sp); + return; + } + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->arch.mmu.pae_root[i]; + + if (root && VALID_PAGE(root)) { + root &= PT64_BASE_ADDR_MASK; + sp = page_header(root); + mmu_sync_children(vcpu, sp); + } + } +} + +void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) +{ + spin_lock(&vcpu->kvm->mmu_lock); + mmu_sync_roots(vcpu); + spin_unlock(&vcpu->kvm->mmu_lock); +} + +static void mmu_destroy_caches(void) +{ + if (pte_chain_cache) + kmem_cache_destroy(pte_chain_cache); + if (rmap_desc_cache) + kmem_cache_destroy(rmap_desc_cache); + if (mmu_page_header_cache) + kmem_cache_destroy(mmu_page_header_cache); +} + +int +zero_constructor(void *buf, void *arg, int tags) +{ + bzero(buf, (size_t)arg); +} + +int kvm_mmu_module_init(void) +{ pte_chain_cache = kmem_cache_create("kvm_pte_chain", - sizeof(struct kvm_pte_chain), - 0, 0, NULL); + sizeof(struct kvm_pte_chain), 0, + zero_constructor, NULL, NULL, + sizeof(struct kvm_pte_chain), NULL, 0); if (!pte_chain_cache) goto nomem; rmap_desc_cache = kmem_cache_create("kvm_rmap_desc", - sizeof(struct kvm_rmap_desc), - 0, 0, NULL); + sizeof(struct kvm_rmap_desc), 0, + zero_constructor, NULL, NULL, + sizeof(struct kvm_rmap_desc), NULL, 0); if (!rmap_desc_cache) goto nomem; mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", - sizeof(struct kvm_mmu_page), - 0, 0, NULL); + sizeof(struct kvm_mmu_page), 0, + zero_constructor, NULL, NULL, + sizeof(struct kvm_mmu_page), NULL, 0); if (!mmu_page_header_cache) goto nomem; +#ifdef XXX + /* this looks like a garbage collector/reaper. Implement later if needed */ register_shrinker(&mmu_shrinker); +#endif /*XXX*/ return 0; nomem: mmu_destroy_caches(); - return -ENOMEM; + return ENOMEM; +} + +/* + * List of msr numbers which we expose to userspace through KVM_GET_MSRS + * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. + * + * This list is modified at module load time to reflect the + * capabilities of the host cpu. This capabilities test skips MSRs that are + * kvm-specific. Those are put in the beginning of the list. 
+ */ + +#define MSR_KVM_WALL_CLOCK 0x11 +#define MSR_KVM_SYSTEM_TIME 0x12 + +#define KVM_SAVE_MSRS_BEGIN 5 +static uint32_t msrs_to_save[] = { + MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, + HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, + HV_X64_MSR_APIC_ASSIST_PAGE, + MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, + MSR_K6_STAR, +#ifdef CONFIG_X86_64 + MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, +#endif + MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA +}; + +static unsigned num_msrs_to_save; + +static uint32_t emulated_msrs[] = { + MSR_IA32_MISC_ENABLE, +}; + +#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0])) + +uint64_t native_read_msr_safe(unsigned int msr, + int *err) +{ + DECLARE_ARGS(val, low, high); + +#ifdef CONFIG_SOLARIS + { + on_trap_data_t otd; + + if (on_trap(&otd, OT_DATA_ACCESS) == 0) { + native_read_msr(msr); + } else { + *err = EINVAL; /* XXX probably not right... */ + } + no_trap(); + } #else - return DDI_SUCCESS; -#endif /*XXX*/ + asm volatile("2: rdmsr ; xor %[err],%[err]\n" + "1:\n\t" + ".section .fixup,\"ax\"\n\t" + "3: mov %[fault],%[err] ; jmp 1b\n\t" + ".previous\n\t" + _ASM_EXTABLE(2b, 3b) + : [err] "=r" (*err), EAX_EDX_RET(val, low, high) + : "c" (msr), [fault] "i" (-EIO)); +#endif /*CONFIG_SOLARIS*/ + return EAX_EDX_VAL(val, low, high); +} + +/* Can be uninlined because referenced by paravirt */ +int native_write_msr_safe(unsigned int msr, + unsigned low, unsigned high) +{ + int err; +#ifdef CONFIG_SOLARIS + { + on_trap_data_t otd; + + if (on_trap(&otd, OT_DATA_ACCESS) == 0) { + native_write_msr(msr, low, high); + } else { + err = EINVAL; /* XXX probably not right... */ + } + no_trap(); + } +#else + asm volatile("2: wrmsr ; xor %[err],%[err]\n" + "1:\n\t" + ".section .fixup,\"ax\"\n\t" + "3: mov %[fault],%[err] ; jmp 1b\n\t" + ".previous\n\t" + _ASM_EXTABLE(2b, 3b) + : [err] "=a" (err) + : "c" (msr), "0" (low), "d" (high), + [fault] "i" (-EIO) + : "memory"); +#endif /*CONFIG_SOLARIS*/ + return err; +} + +static void kvm_init_msr_list(void) +{ + uint32_t dummy[2]; + unsigned i, j; + + /* skip the first msrs in the list. 
KVM-specific */ + for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) { + if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) + continue; + if (j < i) + msrs_to_save[j] = msrs_to_save[i]; + j++; + } + num_msrs_to_save = j; +} + +static uint64_t shadow_trap_nonpresent_pte; +static uint64_t shadow_notrap_nonpresent_pte; +static uint64_t shadow_base_present_pte; +static uint64_t shadow_nx_mask; +static uint64_t shadow_x_mask; /* mutual exclusive with nx_mask */ +static uint64_t shadow_user_mask; +static uint64_t shadow_accessed_mask; +static uint64_t shadow_dirty_mask; + +void kvm_mmu_set_nonpresent_ptes(uint64_t trap_pte, uint64_t notrap_pte) +{ + shadow_trap_nonpresent_pte = trap_pte; + shadow_notrap_nonpresent_pte = notrap_pte; +} + +void kvm_mmu_set_base_ptes(uint64_t base_pte) +{ + shadow_base_present_pte = base_pte; +} + +void kvm_mmu_set_mask_ptes(uint64_t user_mask, uint64_t accessed_mask, + uint64_t dirty_mask, uint64_t nx_mask, uint64_t x_mask) +{ + shadow_user_mask = user_mask; + shadow_accessed_mask = accessed_mask; + shadow_dirty_mask = dirty_mask; + shadow_nx_mask = nx_mask; + shadow_x_mask = x_mask; +} + +#define PT64_PT_BITS 9 +#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) +#define PT32_PT_BITS 10 +#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS) + +#define PT_WRITABLE_SHIFT 1 + +#define PT_PRESENT_MASK (1ULL << 0) +#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT) +#define PT_USER_MASK (1ULL << 2) +#define PT_PWT_MASK (1ULL << 3) +#define PT_PCD_MASK (1ULL << 4) +#define PT_ACCESSED_SHIFT 5 +#define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT) +#define PT_DIRTY_MASK (1ULL << 6) +#define PT_PAGE_SIZE_MASK (1ULL << 7) +#define PT_PAT_MASK (1ULL << 7) +#define PT_GLOBAL_MASK (1ULL << 8) +#define PT64_NX_SHIFT 63 +#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT) + +#define PT_PAT_SHIFT 7 +#define PT_DIR_PAT_SHIFT 12 +#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT) + +#define PT32_DIR_PSE36_SIZE 4 +#define PT32_DIR_PSE36_SHIFT 13 +#define PT32_DIR_PSE36_MASK \ + (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) + +#define PT64_ROOT_LEVEL 4 +#define PT32_ROOT_LEVEL 2 +#define PT32E_ROOT_LEVEL 3 + +#define PT_PDPE_LEVEL 3 +#define PT_DIRECTORY_LEVEL 2 +#define PT_PAGE_TABLE_LEVEL 1 + +#define PFERR_PRESENT_MASK (1U << 0) +#define PFERR_WRITE_MASK (1U << 1) +#define PFERR_USER_MASK (1U << 2) +#define PFERR_RSVD_MASK (1U << 3) +#define PFERR_FETCH_MASK (1U << 4) + +static void kvm_timer_init(void) +{ + int cpu; + + /* + * XXX We assume that any machine running solaris kvm + * has constant time stamp counter increment rate. + * This will be true for all but older machines. + */ +#ifndef CONFIG_SOLARIS + for_each_possible_cpu(cpu) + per_cpu(cpu_tsc_khz, cpu) = tsc_khz; +#else + /* assume pi_clock in mhz */ + /* cpu_tsc_khz = (CPU)->cpu_type_info.pi_clock * 1000;*/ +#endif /*CONFIG_SOLARIS*/ } int kvm_arch_init(void *opaque) { int r; struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; + volatile int x; /* XXX - dtrace return probe missing */ if (ops->cpu_has_kvm_support()) { cmn_err(CE_WARN, "kvm: no hardware support\n"); @@ -454,7 +1140,6 @@ int kvm_arch_init(void *opaque) if (r) goto out; -#ifdef XXX kvm_init_msr_list(); kvm_x86_ops = ops; @@ -463,16 +1148,20 @@ int kvm_arch_init(void *opaque) kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0); - kvm_timer_init(); -#endif + x = 10; /*XXX*/ return 0; out: + x = 20; /*XXX*/ return r; } +caddr_t bad_page; /* XXX page_t on linux... 
*/ +pfn_t bad_pfn; +kmem_cache_t *kvm_vcpu_cache; + int kvm_init(void *opaque, unsigned int vcpu_size) { int r; @@ -482,33 +1171,28 @@ int kvm_init(void *opaque, unsigned int vcpu_size) if (r != DDI_SUCCESS) return (r); -#ifdef XXX - if (r) - goto out_fail; - bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO); + bad_page = kmem_zalloc(PAGESIZE, KM_SLEEP); if (bad_page == NULL) { - r = -ENOMEM; + r = ENOMEM; goto out; } - bad_pfn = page_to_pfn(bad_page); + bad_pfn = hat_getpfnum(kas.a_hat, bad_page); +#ifdef XXX if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { r = -ENOMEM; goto out_free_0; } - #endif /*XXX*/ - r = kvm_arch_hardware_setup(); - return (r); -#ifdef XXX - if (r < 0) + if (r != DDI_SUCCESS) goto out_free_0a; +#ifdef XXX for_each_online_cpu(cpu) { smp_call_function_single(cpu, kvm_arch_check_processor_compat, @@ -516,7 +1200,10 @@ int kvm_init(void *opaque, unsigned int vcpu_size) if (r < 0) goto out_free_1; } +#endif /*XXX*/ + +#ifdef XXX r = register_cpu_notifier(&kvm_cpu_notifier); if (r) goto out_free_2; @@ -529,64 +1216,84 @@ int kvm_init(void *opaque, unsigned int vcpu_size) r = sysdev_register(&kvm_sysdev); if (r) goto out_free_4; - +#endif /*XXX*/ /* A kmem cache lets us meet the alignment requirements of fx_save. */ kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, __alignof__(struct kvm_vcpu), - 0, NULL); + NULL, NULL, NULL, NULL, NULL, 0); if (!kvm_vcpu_cache) { - r = -ENOMEM; + r = ENOMEM; goto out_free_5; } +#ifdef XXX kvm_chardev_ops.owner = module; kvm_vm_fops.owner = module; kvm_vcpu_fops.owner = module; r = misc_register(&kvm_dev); if (r) { - printk(KERN_ERR "kvm: misc device register failed\n"); + cmn_err(CE_WARN, "kvm: misc device register failed\n"); goto out_free; } + /* + * XXX - if kernel preemption occurs, we probably need + * to implement these, and add hooks to the preemption code. 
+ * For right now, we'll make the totally unreasonable + * assumption that we won't be preempted while in the + * kernel, i.e., no realtime threads are running + */ kvm_preempt_ops.sched_in = kvm_sched_in; kvm_preempt_ops.sched_out = kvm_sched_out; kvm_init_debug(); +#endif /*XXX*/ return 0; out_free: kmem_cache_destroy(kvm_vcpu_cache); out_free_5: +#ifdef XXX sysdev_unregister(&kvm_sysdev); out_free_4: sysdev_class_unregister(&kvm_sysdev_class); out_free_3: unregister_reboot_notifier(&kvm_reboot_notifier); unregister_cpu_notifier(&kvm_cpu_notifier); +#endif /*XXX*/ out_free_2: out_free_1: +#ifdef XXX kvm_arch_hardware_unsetup(); +#endif /*XXX*/ out_free_0a: +#ifdef XXX free_cpumask_var(cpus_hardware_enabled); +#endif /*XXX*/ out_free_0: - __free_page(bad_page); + kmem_free(bad_page, PAGESIZE); out: +#ifdef XXX kvm_arch_exit(); +#endif out_fail: return r; -#endif /*XXX*/ } -extern unsigned long *vmx_io_bitmap_a; -extern unsigned long *vmx_io_bitmap_b; -extern unsigned long *vmx_msr_bitmap_legacy; -extern unsigned long *vmx_msr_bitmap_longmode; +extern unsigned long vmx_io_bitmap_a[]; +extern unsigned long vmx_io_bitmap_b[]; +extern unsigned long vmx_msr_bitmap_legacy[]; +extern unsigned long vmx_msr_bitmap_longmode[]; + +static inline int cpu_has_vmx_msr_bitmap(void) +{ + return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS; +} static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, uint32_t msr) { -#ifdef XXX int f = sizeof(unsigned long); if (!cpu_has_vmx_msr_bitmap()) @@ -598,14 +1305,13 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, uint32_t * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. */ if (msr <= 0x1fff) { - __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */ - __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */ + BT_CLEAR(msr_bitmap + 0x000 / f, msr); /* read-low */ + BT_CLEAR(msr_bitmap + 0x800 / f, msr); /* write-low */ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { msr &= 0x1fff; - __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */ - __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */ + BT_CLEAR(msr_bitmap + 0x400 / f, msr); /* read-high */ + BT_CLEAR(msr_bitmap + 0xc00 / f, msr); /* write-high */ } -#endif /*XXX*/ } static void vmx_disable_intercept_for_msr(uint32_t msr, int longmode_only) @@ -615,16 +1321,52 @@ static void vmx_disable_intercept_for_msr(uint32_t msr, int longmode_only) __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr); } +static struct kvm_shared_msrs_global shared_msrs_global; + +void kvm_define_shared_msr(unsigned slot, uint32_t msr) +{ + if (slot >= shared_msrs_global.nr) + shared_msrs_global.nr = slot + 1; + shared_msrs_global.msrs[slot] = msr; +#ifdef XXX + /* we need ensured the shared_msr_global have been updated */ + smp_wmb(); +#endif /*XXX*/ +} + +static uint64_t host_efer; + +/* + * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it + * away by decrementing the array size. 
+ */ +static const uint32_t vmx_msr_index[] = { +#ifdef CONFIG_X86_64 + MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, +#endif + MSR_EFER, MSR_TSC_AUX, MSR_K6_STAR, +}; +#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) +#define VMX_NR_VPIDS (1 << 16) +ulong_t *vmx_vpid_bitmap; +size_t vpid_bitmap_words; +kmutex_t vmx_vpid_lock; + +void kvm_disable_tdp(void) +{ + tdp_enabled = 0; +} + static int vmx_init(void) { int r, i; -#ifdef XXX + rdmsrl_safe(MSR_EFER, &host_efer); for (i = 0; i < NR_VMX_MSR; ++i) kvm_define_shared_msr(i, vmx_msr_index[i]); -#endif /*XXX*/ +#ifdef XXX vmx_io_bitmap_a = (unsigned long *)kmem_zalloc(PAGESIZE, KM_SLEEP); if (!vmx_io_bitmap_a) return ENOMEM; @@ -643,10 +1385,9 @@ static int vmx_init(void) vmx_msr_bitmap_longmode = (unsigned long *)kmem_zalloc(PAGESIZE, KM_SLEEP); if (!vmx_msr_bitmap_longmode) { - r = ENOMEM; - goto out2; + r = ENOMEM; goto out2; } - +#endif /* * Allow direct access to the PC debug port (it is often used for I/O * delays, but the vmexits simply slow things down). @@ -659,9 +1400,7 @@ static int vmx_init(void) memset(vmx_msr_bitmap_legacy, 0xff, PAGESIZE); memset(vmx_msr_bitmap_longmode, 0xff, PAGESIZE); -#ifdef XXX - set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ -#endif /*XXX*/ + BT_SET(vmx_vpid_bitmap, 0); /* 0 is reserved for host */ r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx)); @@ -675,7 +1414,6 @@ static int vmx_init(void) vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, 0); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, 0); -#ifdef XXX if (enable_ept) { bypass_guest_pf = 0; kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | @@ -686,10 +1424,10 @@ static int vmx_init(void) } else kvm_disable_tdp(); +#ifdef XXX if (bypass_guest_pf) kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); #endif /*XXX*/ - return 0; out3: @@ -718,10 +1456,20 @@ _init(void) ddi_soft_state_fini(&kvm_state); } + if (enable_vpid) { + vpid_bitmap_words = howmany(VMX_NR_VPIDS, BT_NBIPUL); + vmx_vpid_bitmap = kmem_zalloc(sizeof(ulong_t)*vpid_bitmap_words, KM_SLEEP); + mutex_init(&vmx_vpid_lock, NULL, MUTEX_DRIVER, NULL); + } + mutex_init(&kvm_lock, NULL, MUTEX_DRIVER, 0); /* XXX */ kvm_x86_ops = &vmx_x86_ops; if ((r = vmx_init()) != DDI_SUCCESS) { mutex_destroy(&kvm_lock); + if (vmx_vpid_bitmap) { + kmem_free(vmx_vpid_bitmap, sizeof(ulong_t)*vpid_bitmap_words); + mutex_destroy(&vmx_vpid_lock); + } mod_remove(&modlinkage); ddi_soft_state_fini(&kvm_state); return (r); @@ -953,7 +1701,6 @@ static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) { return container_of(mn, struct kvm, mmu_notifier); } -#endif static void kvm_mmu_pages_init(struct kvm_mmu_page *parent, @@ -981,21 +1728,8 @@ mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, return (pvec->nr == KVM_PAGE_ARRAY_NR); } -static uint64_t shadow_trap_nonpresent_pte; -static uint64_t shadow_notrap_nonpresent_pte; - extern pfn_t hat_getpfnum(struct hat *hat, caddr_t); -#ifdef XXX - -static inline struct kvm_mmu_page * -page_header(hpa_t shadow_page) -{ - struct page *page = pfn_to_page(shadow_page >> PAGESHIFT); - - return (struct kvm_mmu_page *)page_private(page); -} - static int is_large_pte(uint64_t pte) { @@ -1009,6 +1743,7 @@ is_shadow_present_pte(uint64_t pte) && pte != shadow_notrap_nonpresent_pte; } + static int __mmu_unsync_walk(struct kvm_mmu_page *sp, struct kvm_mmu_pages *pvec) { @@ -1042,7 +1777,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, } } - if (find_first_bit(sp->unsync_child_bitmap, 512) == 512) + if (bt_getlowbit(sp->unsync_child_bitmap, 0, 512) 
== 512) sp->unsync_children = 0; return nr_unsync_leaf; @@ -1228,7 +1963,7 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, kvm->mmu_notifier_count--; spin_unlock(&kvm->mmu_lock); - BUG_ON(kvm->mmu_notifier_count < 0); + assert(kvm->mmu_notifier_count >= 0); } static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, @@ -1257,18 +1992,14 @@ kvm_arch_flush_shadow(struct kvm *kvm) kvm_reload_remote_mmus(kvm); } -#ENDIF /*XXX*/ - static void kvm_mmu_notifier_release(struct mmu_notifier *mn, struct mm_struct *mm) { struct kvm *kvm = mmu_notifier_to_kvm(mn); int idx; -#ifdef XXX idx = srcu_read_lock(&kvm->srcu); kvm_arch_flush_shadow(kvm); srcu_read_unlock(&kvm->srcu, idx); -#endif /*XXX*/ } static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { @@ -1285,7 +2016,6 @@ static int kvm_init_mmu_notifier(struct kvm *kvm) kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; return mmu_notifier_register(&kvm->mmu_notifier, current->mm); } - #else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */ static int kvm_init_mmu_notifier(struct kvm *kvm) @@ -1345,7 +2075,8 @@ kvm_create_vm(void) (void *)ipltospl(DISP_LEVEL)); #ifdef XXX kvm_eventfd_init(kvmp); -#endif +#endif /*XXX*/ + mutex_init(&kvmp->lock, NULL, MUTEX_DRIVER, NULL); mutex_init(&kvmp->irq_lock, NULL, MUTEX_DRIVER, NULL); mutex_init(&kvmp->slots_lock, NULL, MUTEX_DRIVER, NULL); @@ -1436,10 +2167,9 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; used_pages = max(0, used_pages); -#ifdef XXX /* for the time being, assume that address space will only grow */ /* larger. The following code will be added later. */ - +#ifdef XXX /* * If we set the number of mmu pages to be smaller be than the * number of actived pages , we must to free some mmu pages before we @@ -1448,7 +2178,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) if (used_pages > kvm_nr_mmu_pages) { while (used_pages > kvm_nr_mmu_pages && - !list_empty(&kvm->arch.active_mmu_pages)) { + !list_is_empty(&kvm->arch.active_mmu_pages)) { struct kvm_mmu_page *page; page = container_of(kvm->arch.active_mmu_pages.prev, @@ -1483,7 +2213,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, old.npages * PAGESIZE); up_write(¤t->mm->mmap_sem); if (ret < 0) - printk(KERN_WARNING + cmn_err(CE_WARN, "kvm_vm_ioctl_set_memory_region: " "failed to munmap memory\n"); } @@ -2004,6 +2734,31 @@ static inline uint32_t bit(int bitno) return 1 << (bitno & 31); } +static inline int cpu_has_vmx_ept_1g_page(void) +{ + return !!(vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT); +} + +static int vmx_get_lpage_level(void) +{ + if (enable_ept && !cpu_has_vmx_ept_1g_page()) + return PT_DIRECTORY_LEVEL; + else + /* For shadow and EPT supported 1GB page */ + return PT_PDPE_LEVEL; +} + +static inline int cpu_has_vmx_rdtscp(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_RDTSCP; +} + +static int vmx_rdtscp_supported(void) +{ + return cpu_has_vmx_rdtscp(); +} + #define F(x) bit(X86_FEATURE_##x) @@ -2058,10 +2813,12 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, uint32_t function, F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) | 0 /* SKINIT */ | 0 /* WDT */; + volatile int x; /* XXX - dtrace return probe missing */ + /* all calls to cpuid_count() should be made on the same cpu */ /* XXX - right now, system panics at ddi_exit_critical() */ /* XXX - to run everything on same cpu, bind qemu at startup 
*/ - /*ddic = ddi_enter_critical(); */ + kpreempt_disable(); do_cpuid_1_ent(entry, function, index); ++*nent; @@ -2135,6 +2892,8 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, uint32_t function, } /*XXX - see comment above for ddi_enter_critical() */ /*ddi_exit_critical(ddic);*/ + kpreempt_enable(); + x = 10; /*XXX*/ } #undef F @@ -2146,13 +2905,15 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 *cpuid_entries; int limit, nent = 0, r = E2BIG; uint32_t func; + int allocsize = 0; if (cpuid->nent < 1) goto out; if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) cpuid->nent = KVM_MAX_CPUID_ENTRIES; r = ENOMEM; - cpuid_entries = kmem_alloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent, KM_SLEEP); + allocsize = sizeof(struct kvm_cpuid_entry2)*cpuid->nent; + cpuid_entries = kmem_alloc(allocsize, KM_SLEEP); if (!cpuid_entries) goto out; @@ -2182,16 +2943,11 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, r = 0; out_free: - kmem_free(cpuid_entries, sizeof(struct kvm_cpuid_entry2) * cpuid->nent); + kmem_free(cpuid_entries, allocsize); out: return r; } -struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) -{ - return container_of(vcpu, struct vcpu_vmx, vcpu); -} - #define __ex(x) __kvm_handle_fault_on_reboot(x) @@ -2199,6 +2955,7 @@ void vmcs_clear(struct vmcs *vmcs) { unsigned char error; uint64_t phys_addr = (hat_getpfnum(kas.a_hat, (char *)vmcs)<<PAGESHIFT)|((uint64_t)vmcs&PAGEOFFSET); + volatile int x; /*XXX - dtrace return probe missing */ asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "\n\tsetna %0\n" : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) @@ -2206,6 +2963,7 @@ void vmcs_clear(struct vmcs *vmcs) if (error) cmn_err(CE_PANIC, "kvm: vmclear fail: %p/%llx\n", vmcs, phys_addr); + x = 10; /*XXX*/ } static void __vcpu_clear(void *arg) @@ -2236,16 +2994,21 @@ static void vcpu_clear(struct vcpu_vmx *vmx) } + +static void vmwrite_error(unsigned long field, unsigned long value) +{ + cmn_err(CE_WARN, "vmwrite error: reg %lx value %lx (err %d)\n", + field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); +} + void vmcs_writel(unsigned long field, unsigned long value) { unsigned char error; asm volatile (ASM_VMX_VMWRITE_RAX_RDX "\n\tsetna %0" : "=q"(error) : "a"(value), "d"(field) : "cc"); -#ifdef XXX - if (unlikely(error)) + if ((error)) vmwrite_error(field, value); -#endif } unsigned long vmcs_readl(unsigned long field) @@ -2257,7 +3020,6 @@ unsigned long vmcs_readl(unsigned long field) return value; } - uint64_t vmcs_read64(unsigned long field) { #ifdef CONFIG_X86_64 @@ -2267,6 +3029,11 @@ uint64_t vmcs_read64(unsigned long field) #endif } +uint16_t vmcs_read16(unsigned long field) +{ + return vmcs_readl(field); +} + void vmcs_write64(unsigned long field, uint64_t value) { vmcs_writel(field, value); @@ -2276,15 +3043,208 @@ void vmcs_write64(unsigned long field, uint64_t value) #endif } + +void vmcs_write16(unsigned long field, uint16_t value) +{ + vmcs_writel(field, value); +} + +/* + * writes 'guest_tsc' into guest's timestamp counter "register" + * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc + */ +static void guest_write_tsc(uint64_t guest_tsc, uint64_t host_tsc) +{ + vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc); +} + +static inline int cpu_has_secondary_exec_ctrls(void) +{ + return vmcs_config.cpu_based_exec_ctrl & + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; +} + +int vm_need_virtualize_apic_accesses(struct kvm *kvm) +{ + return flexpriority_enabled && irqchip_in_kernel(kvm); +} + +extern uint64_t 
kvm_va2pa(caddr_t va); +/* + * Sets up the vmcs for emulated real mode. + */ +int vmx_vcpu_setup(struct vcpu_vmx *vmx) +{ + uint32_t host_sysenter_cs, msr_low, msr_high; + uint32_t junk; + uint64_t host_pat, tsc_this, tsc_base; + unsigned long a; + struct descriptor_table dt; + int i; + unsigned long kvm_vmx_return; + uint32_t exec_control; + + /* I/O */ + vmcs_write64(IO_BITMAP_A, kvm_va2pa((caddr_t)vmx_io_bitmap_a)); + vmcs_write64(IO_BITMAP_B, kvm_va2pa((caddr_t)vmx_io_bitmap_b)); + + if (cpu_has_vmx_msr_bitmap()) + vmcs_write64(MSR_BITMAP, kvm_va2pa((caddr_t)vmx_msr_bitmap_legacy)); + + vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ + + /* Control */ + vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, + vmcs_config.pin_based_exec_ctrl); + + exec_control = vmcs_config.cpu_based_exec_ctrl; +#ifdef XXX + if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) { + exec_control &= ~CPU_BASED_TPR_SHADOW; +#ifdef CONFIG_X86_64 + exec_control |= CPU_BASED_CR8_STORE_EXITING | + CPU_BASED_CR8_LOAD_EXITING; +#endif + } +#endif /*XXX*/ + + if (!enable_ept) + exec_control |= CPU_BASED_CR3_STORE_EXITING | + CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_INVLPG_EXITING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); + + if (cpu_has_secondary_exec_ctrls()) { + exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; + if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) + exec_control &= + ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + if (vmx->vpid == 0) + exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; + if (!enable_ept) { + exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; + enable_unrestricted_guest = 0; + } + if (!enable_unrestricted_guest) + exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; +#ifdef XXX + if (!ple_gap) +#endif /*XXX*/ + exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + } + +#ifdef XXX + if (ple_gap) { + vmcs_write32(PLE_GAP, ple_gap); + vmcs_write32(PLE_WINDOW, ple_window); + } +#endif /*XXX*/ + + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); + vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ + + vmcs_writel(HOST_CR0, getcr0()); /* 22.2.3 */ + vmcs_writel(HOST_CR4, getcr4()); /* 22.2.3, 22.2.5 */ + vmcs_writel(HOST_CR3, getcr3()); /* 22.2.3 FIXME: shadow tables */ + + vmcs_write16(HOST_CS_SELECTOR, GDT_KCODE); /* 22.2.4 */ + vmcs_write16(HOST_DS_SELECTOR, GDT_KDATA); /* 22.2.4 */ + vmcs_write16(HOST_ES_SELECTOR, GDT_KDATA); /* 22.2.4 */ + vmcs_write16(HOST_FS_SELECTOR, kvm_read_fs()); /* 22.2.4 */ + vmcs_write16(HOST_GS_SELECTOR, kvm_read_gs()); /* 22.2.4 */ + vmcs_write16(HOST_SS_SELECTOR, GDT_KDATA); /* 22.2.4 */ +#ifdef CONFIG_X86_64 + rdmsrl(MSR_FS_BASE, a); + vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ + rdmsrl(MSR_GS_BASE, a); + vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */ +#else + vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ + vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ +#endif + + vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ + + kvm_get_idt(&dt); + vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ + + asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); + vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */ + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); + + rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk); + vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs); + rdmsrl(MSR_IA32_SYSENTER_ESP, a); + vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */ + 
rdmsrl(MSR_IA32_SYSENTER_EIP, a); + vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */ + + if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { + rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); + host_pat = msr_low | ((uint64_t) msr_high << 32); + vmcs_write64(HOST_IA32_PAT, host_pat); + } + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { + rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); + host_pat = msr_low | ((uint64_t) msr_high << 32); + /* Write the default value follow host pat */ + vmcs_write64(GUEST_IA32_PAT, host_pat); + /* Keep arch.pat sync with GUEST_IA32_PAT */ + vmx->vcpu.arch.pat = host_pat; + } + + for (i = 0; i < NR_VMX_MSR; ++i) { + uint32_t index = vmx_msr_index[i]; + uint32_t data_low, data_high; + int j = vmx->nmsrs; + + if (rdmsr_safe(index, &data_low, &data_high) < 0) + continue; + if (wrmsr_safe(index, data_low, data_high) < 0) + continue; + vmx->guest_msrs[j].index = i; + vmx->guest_msrs[j].data = 0; + vmx->guest_msrs[j].mask = -1ull; + ++vmx->nmsrs; + } + + vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); + + /* 22.2.1, 20.8.1 */ + vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); + + vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); + vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; + if (enable_ept) + vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; + vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); + + tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; + rdtscll(tsc_this); + if (tsc_this < vmx->vcpu.kvm->arch.vm_init_tsc) + tsc_base = tsc_this; + + guest_write_tsc(0, tsc_base); + return 0; +} + /* * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken. */ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); + /* XXX - the following assignment assumes vmx contains vcpu */ + /* at the beginning of the structure */ + + struct vcpu_vmx *vmx = (struct vcpu_vmx *)vcpu; uint64_t phys_addr = (hat_getpfnum(kas.a_hat, (char *)vmx->vmcs)<<PAGESHIFT)|((uint64_t)(vmx->vmcs)&0xfff); uint64_t tsc_this, delta, new_offset; + volatile int x; /* XXX - dtrace return probe missing */ if (vcpu->cpu != cpu) { vcpu_clear(vmx); @@ -2293,10 +3253,10 @@ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) #endif /*XXX*/ BT_SET(&vcpu->requests, KVM_REQ_TLB_FLUSH); #ifdef XXX - local_irq_disable(); + kpreempt_disable(); list_add(&vmx->local_vcpus_link, &per_cpu(vcpus_on_cpu, cpu)); - local_irq_enable(); + kpreempt_enable(); #endif /*XXX*/ } @@ -2351,6 +3311,8 @@ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vmcs_write64(TSC_OFFSET, new_offset); } } + x = 10; + return; } void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) @@ -2382,6 +3344,15 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) #endif /*XXX*/ } +/* straight from xen code... 
*/ +void +ldt_load(void) +{ + *((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = curproc->p_ldt_desc; + wr_ldtr(ULDT_SEL); +} + + static void reload_tss(void) { /* @@ -2396,7 +3367,7 @@ static void reload_tss(void) load_TR_desc(); } -static inline int is_long_mode(struct kvm_vcpu *vcpu) +int is_long_mode(struct kvm_vcpu *vcpu) { #ifdef CONFIG_X86_64 return vcpu->arch.efer & EFER_LMA; @@ -2405,6 +3376,27 @@ static inline int is_long_mode(struct kvm_vcpu *vcpu) #endif } +#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS +#define KVM_POSSIBLE_CR4_GUEST_BITS \ + (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_OSXMMEXCPT | X86_CR4_PGE) + +ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) +{ + uint64_t tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS; +#ifdef XXX + if (tmask & vcpu->arch.cr4_guest_owned_bits) + kvm_x86_ops->decache_cr4_guest_bits(vcpu); +#endif /*XXX*/ + return vcpu->arch.cr4 & mask; +} + +static inline int is_pae(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); +} + + static void __vmx_load_host_state(struct vcpu_vmx *vmx) { @@ -2425,18 +3417,15 @@ __vmx_load_host_state(struct vcpu_vmx *vmx) * If we have to reload gs, we must take care to * preserve our gs base. */ -#ifdef XXX - local_irq_save(flags); -#endif /*XXX*/ + kpreempt_disable(); kvm_load_gs(vmx->host_state.gs_sel); #ifdef CONFIG_X86_64 wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); #endif -#ifdef XXX - local_irq_restore(flags); -#endif /*XXX*/ + kpreempt_enable(); } reload_tss(); + #ifdef CONFIG_X86_64 if (is_long_mode(&vmx->vcpu)) { rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); @@ -2445,9 +3434,16 @@ __vmx_load_host_state(struct vcpu_vmx *vmx) #endif } +static void vmx_load_host_state(struct vcpu_vmx *vmx) +{ + kpreempt_disable(); + __vmx_load_host_state(vmx); + kpreempt_enable(); +} + void vmx_vcpu_put(struct kvm_vcpu *vcpu) { - __vmx_load_host_state(to_vmx(vcpu)); + __vmx_load_host_state((struct vcpu_vmx *)vcpu); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) @@ -2488,9 +3484,75 @@ void vcpu_put(struct kvm_vcpu *vcpu) mutex_exit(&vcpu->mutex); } +/* find an entry with matching function, matching index (if needed), and that + * should be read next (if it's stateful) */ +static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, + uint32_t function, uint32_t index) +{ + if (e->function != function) + return 0; + if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) + return 0; + if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && + !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) + return 0; + return 1; +} + +struct kvm_pic *pic_irqchip(struct kvm *kvm); +extern int irqchip_in_kernel(struct kvm *kvm); + +static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) +{ + struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; + int j, nent = vcpu->arch.cpuid_nent; + + e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; + /* when no next entry is found, the current entry[i] is reselected */ + for (j = i + 1; ; j = (j + 1) % nent) { + struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; + if (ej->function == e->function) { + ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; + return j; + } + } + return 0; /* silence gcc, even though control never reaches here */ +} + +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, + uint32_t function, uint32_t index) +{ + int i; + struct kvm_cpuid_entry2 *best = NULL; + + for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { + struct kvm_cpuid_entry2 *e; + + e = &vcpu->arch.cpuid_entries[i]; + 
if (is_matching_cpuid_entry(e, function, index)) { + if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) + move_to_next_stateful_cpuid_entry(vcpu, i); + best = e; + break; + } + /* + * Both basic or both extended? + */ + if (((e->function ^ function) & 0x80000000) == 0) + if (!best || e->function > best->function) + best = e; + } + return best; +} + +#define APIC_LVT_NUM 6 +/* 14 is the version for Xeon and Pentium 8.4.8*/ +#define APIC_VERSION (0x14UL | ((APIC_LVT_NUM - 1) << 16)) + +extern void apic_set_reg(struct kvm_lapic *apic, int reg_off, uint32_t val); + void kvm_apic_set_version(struct kvm_vcpu *vcpu) { -#ifdef XXX struct kvm_lapic *apic = vcpu->arch.apic; struct kvm_cpuid_entry2 *feat; uint32_t v = APIC_VERSION; @@ -2502,7 +3564,6 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu) if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31)))) v |= APIC_LVR_DIRECTED_EOI; apic_set_reg(apic, APIC_LVR, v); -#endif /*XXX*/ } @@ -2552,48 +3613,5166 @@ out: return r; } +static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, + enum kvm_reg reg) +{ +#ifdef XXX + if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail)) + kvm_x86_ops->cache_reg(vcpu, reg); +#endif /*XXX*/ + + return vcpu->arch.regs[reg]; +} + +void kvm_register_write(struct kvm_vcpu *vcpu, + enum kvm_reg reg, + unsigned long val) +{ + vcpu->arch.regs[reg] = val; +#ifdef XXX + __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); + __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); +#endif +} + +unsigned long kvm_rip_read(struct kvm_vcpu *vcpu) +{ + return kvm_register_read(vcpu, VCPU_REGS_RIP); +} + +void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val) +{ + kvm_register_write(vcpu, VCPU_REGS_RIP, val); +} + +unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu) +{ + unsigned long rflags; + + rflags = kvm_x86_ops->get_rflags(vcpu); +#ifdef XXX + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF); +#endif /*XXX*/ + return rflags; +} + +int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + vcpu_load(vcpu); + + regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); + regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); + regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); + regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX); + regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI); + regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI); + regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); + regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP); +#ifdef CONFIG_X86_64 + regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8); + regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9); + regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10); + regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11); + regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12); + regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13); + regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14); + regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15); +#endif + + regs->rip = kvm_rip_read(vcpu); + regs->rflags = kvm_get_rflags(vcpu); + + vcpu_put(vcpu); + + return 0; +} + +#define VMX_SEGMENT_FIELD(seg) \ + [VCPU_SREG_##seg] = { \ + .selector = GUEST_##seg##_SELECTOR, \ + .base = GUEST_##seg##_BASE, \ + .limit = GUEST_##seg##_LIMIT, \ + .ar_bytes = GUEST_##seg##_AR_BYTES, \ + } + +static struct kvm_vmx_segment_field { + unsigned selector; + unsigned base; + unsigned limit; + unsigned ar_bytes; +} kvm_vmx_segment_fields[] = { + VMX_SEGMENT_FIELD(CS), + VMX_SEGMENT_FIELD(DS), + 
VMX_SEGMENT_FIELD(ES), + VMX_SEGMENT_FIELD(FS), + VMX_SEGMENT_FIELD(GS), + VMX_SEGMENT_FIELD(SS), + VMX_SEGMENT_FIELD(TR), + VMX_SEGMENT_FIELD(LDTR), +}; + +void vmx_get_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + uint32_t ar; + + var->base = vmcs_readl(sf->base); + var->limit = vmcs_read32(sf->limit); + var->selector = vmcs_read16(sf->selector); + ar = vmcs_read32(sf->ar_bytes); +#ifdef XXX + if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state) + ar = 0; +#endif /*XXX*/ + var->type = ar & 15; + var->s = (ar >> 4) & 1; + var->dpl = (ar >> 5) & 3; + var->present = (ar >> 7) & 1; + var->avl = (ar >> 12) & 1; + var->l = (ar >> 13) & 1; + var->db = (ar >> 14) & 1; + var->g = (ar >> 15) & 1; + var->unusable = (ar >> 16) & 1; +} + +static uint32_t vmx_segment_access_rights(struct kvm_segment *var) +{ + uint32_t ar; + + if (var->unusable) + ar = 1 << 16; + else { + ar = var->type & 15; + ar |= (var->s & 1) << 4; + ar |= (var->dpl & 3) << 5; + ar |= (var->present & 1) << 7; + ar |= (var->avl & 1) << 12; + ar |= (var->l & 1) << 13; + ar |= (var->db & 1) << 14; + ar |= (var->g & 1) << 15; + } + if (ar == 0) /* a 0 value means unusable */ + ar = AR_UNUSABLE_MASK; + + return ar; +} + +static void vmx_set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + struct vcpu_vmx *vmx = (struct vcpu_vmx *)vcpu; + struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + uint32_t ar; + + if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { + vmx->rmode.tr.selector = var->selector; + vmx->rmode.tr.base = var->base; + vmx->rmode.tr.limit = var->limit; + vmx->rmode.tr.ar = vmx_segment_access_rights(var); + return; + } + vmcs_writel(sf->base, var->base); + vmcs_write32(sf->limit, var->limit); + vmcs_write16(sf->selector, var->selector); + if (vmx->rmode.vm86_active && var->s) { + /* + * Hack real-mode segments into vm86 compatibility. + */ + if (var->base == 0xffff0000 && var->selector == 0xf000) + vmcs_writel(sf->base, 0xf0000); + ar = 0xf3; + } else + ar = vmx_segment_access_rights(var); + + /* + * Fix the "Accessed" bit in AR field of segment registers for older + * qemu binaries. + * IA32 arch specifies that at the time of processor reset the + * "Accessed" bit in the AR field of segment registers is 1. And qemu + * is setting it to 0 in the usedland code. This causes invalid guest + * state vmexit when "unrestricted guest" mode is turned on. + * Fix for this setup issue in cpu_reset is being pushed in the qemu + * tree. Newer qemu binaries with that qemu fix would not need this + * kvm hack. 
+ */ +#ifdef XXX + if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR)) + ar |= 0x1; /* Accessed */ +#endif /*XXX*/ + + vmcs_write32(sf->ar_bytes, ar); +} + +void kvm_get_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + kvm_x86_ops->get_segment(vcpu, var, seg); +} + +static uint16_t get_segment_selector(struct kvm_vcpu *vcpu, int seg) +{ + struct kvm_segment kvm_seg; + + kvm_get_segment(vcpu, &kvm_seg, seg); + return kvm_seg.selector; +} + +void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +{ +#ifdef XXX + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && + vcpu->arch.singlestep_cs == + get_segment_selector(vcpu, VCPU_SREG_CS) && + vcpu->arch.singlestep_rip == kvm_rip_read(vcpu)) + rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF; +#endif /*XXX*/ + kvm_x86_ops->set_rflags(vcpu, rflags); +} + +int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + vcpu_load(vcpu); + + kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); + kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); + kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); + kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx); + kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi); + kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi); + kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp); + kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp); +#ifdef CONFIG_X86_64 + kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8); + kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9); + kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10); + kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11); + kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12); + kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); + kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); + kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); +#endif + + kvm_rip_write(vcpu, regs->rip); + kvm_set_rflags(vcpu, regs->rflags); + + vcpu->arch.exception.pending = 0; + + vcpu_put(vcpu); + + return 0; +} + +int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ +#ifdef XXX + struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; +#endif /*XXX*/ + + vcpu_load(vcpu); +#ifdef XXX + memcpy(fpu->fpr, fxsave->st_space, 128); + fpu->fcw = fxsave->cwd; + fpu->fsw = fxsave->swd; + fpu->ftwx = fxsave->twd; + fpu->last_opcode = fxsave->fop; + fpu->last_ip = fxsave->rip; + fpu->last_dp = fxsave->rdp; + memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); +#endif /*XXX*/ + vcpu_put(vcpu); + + return 0; +} + +int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ +#ifdef XXX + struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; +#endif + + vcpu_load(vcpu); +#ifdef XXX + memcpy(fxsave->st_space, fpu->fpr, 128); + fxsave->cwd = fpu->fcw; + fxsave->swd = fpu->fsw; + fxsave->twd = fpu->ftwx; + fxsave->fop = fpu->last_opcode; + fxsave->rip = fpu->last_ip; + fxsave->rdp = fpu->last_dp; + memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); +#endif /*XXX*/ + vcpu_put(vcpu); + + return 0; +} + + +ulong kvm_read_cr4(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr4_bits(vcpu, ~0UL); +} + +static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) +{ + ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS; +#ifdef XXX + if (tmask & vcpu->arch.cr0_guest_owned_bits) + kvm_x86_ops->decache_cr0_guest_bits(vcpu); +#endif /*XXX*/ + return vcpu->arch.cr0 & mask; +} + + +ulong kvm_read_cr0(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr0_bits(vcpu, ~0UL); +} + +unsigned long 
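/*
 * Editor's note (illustrative sketch, not part of this patch): the
 * kvm_read_cr0_bits()/kvm_read_cr4_bits() helpers above only need to ask
 * the VMX layer to refresh ("decache") the shadowed value when the caller
 * cares about a bit the guest may own directly (KVM_POSSIBLE_CR*_GUEST_BITS);
 * for everything else the cached vcpu->arch.cr0/cr4 copy is authoritative.
 * Typical callers pass a single bit:
 *
 *	if (kvm_read_cr0_bits(vcpu, X86_CR0_TS))	// lazy-FPU check
 *		...
 *	if (kvm_read_cr4_bits(vcpu, X86_CR4_PAE))	// is_pae(), above
 *		...
 */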
kvm_get_cr8(struct kvm_vcpu *vcpu) +{ +#ifdef XXX + if (irqchip_in_kernel(vcpu->kvm)) + return kvm_lapic_get_cr8(vcpu); + else +#endif /*XXX*/ + return vcpu->arch.cr8; +} + +extern uint64_t kvm_get_apic_base(struct kvm_vcpu *vcpu); + +int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + struct descriptor_table dt; + + vcpu_load(vcpu); + + kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); + kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); + kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); + kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); + kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); + kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); + + kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); + kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); + + kvm_x86_ops->get_idt(vcpu, &dt); + sregs->idt.limit = dt.limit; + sregs->idt.base = dt.base; + kvm_x86_ops->get_gdt(vcpu, &dt); + sregs->gdt.limit = dt.limit; + sregs->gdt.base = dt.base; + + sregs->cr0 = kvm_read_cr0(vcpu); + sregs->cr2 = vcpu->arch.cr2; + sregs->cr3 = vcpu->arch.cr3; + sregs->cr4 = kvm_read_cr4(vcpu); + sregs->cr8 = kvm_get_cr8(vcpu); + sregs->efer = vcpu->arch.efer; + sregs->apic_base = kvm_get_apic_base(vcpu); + + memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap); + + if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft) + BT_SET((unsigned long *)sregs->interrupt_bitmap, + vcpu->arch.interrupt.nr); + + vcpu_put(vcpu); + + return 0; +} + +static void kvm_set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + kvm_x86_ops->set_segment(vcpu, var, seg); +} + +#define VALID_PAGE(x) ((x) != INVALID_PAGE) + +static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) +{ + ASSERT(vcpu); + if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) { + vcpu->arch.mmu.free(vcpu); + vcpu->arch.mmu.root_hpa = INVALID_PAGE; + } +} + +extern int init_kvm_mmu(struct kvm_vcpu *vcpu); + +int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) +{ + destroy_kvm_mmu(vcpu); + return init_kvm_mmu(vcpu); +} + +static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, uint8_t vector, + int soft) +{ + vcpu->arch.interrupt.pending = 1; + vcpu->arch.interrupt.soft = soft; + vcpu->arch.interrupt.nr = vector; +} + + +static inline int is_present_gpte(unsigned long pte) +{ + return pte & PT_PRESENT_MASK; +} + +gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn) +{ + int i; + struct kvm_mem_alias *alias; + struct kvm_mem_aliases *aliases; +#ifdef XXX + aliases = rcu_dereference(kvm->arch.aliases); + + for (i = 0; i < aliases->naliases; ++i) { + alias = &aliases->aliases[i]; + if (alias->flags & KVM_ALIAS_INVALID) + continue; + if (gfn >= alias->base_gfn + && gfn < alias->base_gfn + alias->npages) + return alias->target_gfn + gfn - alias->base_gfn; + } +#endif /*XXX*/ + return gfn; +} + +struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn) +{ + int i; +#ifdef XXX + struct kvm_memslots *slots = rcu_dereference(kvm->memslots); +#else + struct kvm_memslots *slots = kvm->memslots; +#endif /*XXX*/ + + for (i = 0; i < slots->nmemslots; ++i) { + struct kvm_memory_slot *memslot = &slots->memslots[i]; + + if (gfn >= memslot->base_gfn + && gfn < memslot->base_gfn + memslot->npages) + return memslot; + } + return NULL; +} + +static inline unsigned long bad_hva(void) +{ + return PAGEOFFSET; +} + +unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) +{ + struct kvm_memory_slot *slot; + + gfn = unalias_gfn_instantiation(kvm, gfn); + slot = gfn_to_memslot_unaliased(kvm, gfn); + if (!slot || 
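/*
 * Editor's note (illustrative sketch, not part of this patch): a memslot
 * maps a contiguous range of guest frame numbers onto a contiguous range of
 * user addresses, so the gfn-to-hva translation here is pure arithmetic.
 * For a slot with base_gfn = 0x100, npages = 0x80 and
 * userspace_addr = 0x2000000, gfn 0x110 resolves to
 *
 *	hva = 0x2000000 + (0x110 - 0x100) * PAGESIZE = 0x2010000
 *
 * while any gfn that falls outside every valid slot yields bad_hva().
 */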
slot->flags & KVM_MEMSLOT_INVALID) + return bad_hva(); + return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGESIZE); +} + + +int kvm_is_error_hva(unsigned long addr) +{ + return addr == bad_hva(); +} + +int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, + int len) +{ + int r; + unsigned long addr; + + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return EFAULT; + r = copyin((caddr_t)(addr + offset), data, len); + if (r) + return EFAULT; + return 0; +} + + +/* + * Load the pae pdptrs. Return true is they are all valid. + */ +int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) +{ + gfn_t pdpt_gfn = cr3 >> PAGESHIFT; + unsigned offset = ((cr3 & (PAGESIZE-1)) >> 5) << 2; + int i; + int ret; + uint64_t pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; + + ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, + offset * sizeof(uint64_t), sizeof(pdpte)); + if (ret < 0) { + ret = 0; + goto out; + } + for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { + if (is_present_gpte(pdpte[i]) && + (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) { + ret = 0; + goto out; + } + } + ret = 1; + + memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); + BT_SET((unsigned long *)&vcpu->arch.regs_avail, + VCPU_EXREG_PDPTR); + BT_SET((unsigned long *)&vcpu->arch.regs_dirty, + VCPU_EXREG_PDPTR); +out: + + return ret; +} + +static void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) +{ + if (irr == -1 || tpr < irr) { + vmcs_write32(TPR_THRESHOLD, 0); + return; + } + + vmcs_write32(TPR_THRESHOLD, irr); +} + +static void update_cr8_intercept(struct kvm_vcpu *vcpu) +{ + int max_irr, tpr; + + if (!kvm_x86_ops->update_cr8_intercept) + return; + + if (!vcpu->arch.apic) + return; +#ifdef XXX + if (!vcpu->arch.apic->vapic_addr) + max_irr = kvm_lapic_find_highest_irr(vcpu); + else +#endif /*XXX*/ + max_irr = -1; + + if (max_irr != -1) + max_irr >>= 4; +#ifdef XXX + tpr = kvm_lapic_get_cr8(vcpu); + + kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); +#endif /*XXX*/ +} + +static int __find_msr_index(struct vcpu_vmx *vmx, uint32_t msr) +{ + int i; + + for (i = 0; i < vmx->nmsrs; ++i) + if (vmx_msr_index[vmx->guest_msrs[i].index] == msr) + return i; + return -1; +} + +static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, uint32_t msr) +{ + int i; + + i = __find_msr_index(vmx, msr); + if (i >= 0) + return &vmx->guest_msrs[i]; + return NULL; +} + +/* + * Swap MSR entry in host/guest MSR entry array. + */ +static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) +{ + struct shared_msr_entry tmp; + + tmp = vmx->guest_msrs[to]; + vmx->guest_msrs[to] = vmx->guest_msrs[from]; + vmx->guest_msrs[from] = tmp; +} + +static int update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) +{ + uint64_t guest_efer; + uint64_t ignore_bits; + + guest_efer = vmx->vcpu.arch.efer; + + /* + * NX is emulated; LMA and LME handled by hardware; SCE meaninless + * outside long mode + */ + ignore_bits = EFER_NX | EFER_SCE; +#ifdef CONFIG_X86_64 + ignore_bits |= EFER_LMA | EFER_LME; + /* SCE is meaningful only in long mode on Intel */ + if (guest_efer & EFER_LMA) + ignore_bits &= ~(uint64_t)EFER_SCE; +#endif + guest_efer &= ~ignore_bits; + guest_efer |= host_efer & ignore_bits; + vmx->guest_msrs[efer_offset].data = guest_efer; + vmx->guest_msrs[efer_offset].mask = ~ignore_bits; + return 1; +} + +/* + * Set up the vmcs to automatically save and restore system + * msrs. Don't touch the 64-bit msrs if the guest is in legacy + * mode, as fiddling with msrs is very expensive. 
+ */ +void setup_msrs(struct vcpu_vmx *vmx) +{ + int save_nmsrs, index; + unsigned long *msr_bitmap; + + vmx_load_host_state(vmx); + save_nmsrs = 0; +#ifdef CONFIG_X86_64 + if (is_long_mode(&vmx->vcpu)) { + index = __find_msr_index(vmx, MSR_SYSCALL_MASK); + if (index >= 0) + move_msr_up(vmx, index, save_nmsrs++); + index = __find_msr_index(vmx, MSR_LSTAR); + if (index >= 0) + move_msr_up(vmx, index, save_nmsrs++); + index = __find_msr_index(vmx, MSR_CSTAR); + if (index >= 0) + move_msr_up(vmx, index, save_nmsrs++); + index = __find_msr_index(vmx, MSR_TSC_AUX); + if (index >= 0 && vmx->rdtscp_enabled) + move_msr_up(vmx, index, save_nmsrs++); + /* + * MSR_K6_STAR is only needed on long mode guests, and only + * if efer.sce is enabled. + */ + index = __find_msr_index(vmx, MSR_K6_STAR); + if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE)) + move_msr_up(vmx, index, save_nmsrs++); + } +#endif + index = __find_msr_index(vmx, MSR_EFER); + if (index >= 0 && update_transition_efer(vmx, index)) + move_msr_up(vmx, index, save_nmsrs++); + + vmx->save_nmsrs = save_nmsrs; + + if (cpu_has_vmx_msr_bitmap()) { + if (is_long_mode(&vmx->vcpu)) + msr_bitmap = vmx_msr_bitmap_longmode; + else + msr_bitmap = vmx_msr_bitmap_legacy; + + vmcs_write64(MSR_BITMAP, kvm_va2pa((caddr_t)msr_bitmap)); + } +} + +void vmx_set_efer(struct kvm_vcpu *vcpu, uint64_t efer) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); + + if (!msr) + return; + + /* + * Force kernel_gs_base reloading before EFER changes, as control + * of this msr depends on is_long_mode(). + */ + vmx_load_host_state(to_vmx(vcpu)); + vcpu->arch.efer = efer; + if (efer & EFER_LMA) { + vmcs_write32(VM_ENTRY_CONTROLS, + vmcs_read32(VM_ENTRY_CONTROLS) | + VM_ENTRY_IA32E_MODE); + msr->data = efer; + } else { + vmcs_write32(VM_ENTRY_CONTROLS, + vmcs_read32(VM_ENTRY_CONTROLS) & + ~VM_ENTRY_IA32E_MODE); + + msr->data = efer & ~EFER_LME; + } + setup_msrs(vmx); +} + +static inline int is_protmode(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr0_bits(vcpu, X86_CR0_PE); +} + + +#ifdef CONFIG_KVM_APIC_ARCHITECTURE +int kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) +{ + return vcpu->kvm->bsp_vcpu_id == vcpu->vcpu_id; +} +#endif + +void kvm_pic_clear_isr_ack(struct kvm *kvm) +{ + struct kvm_pic *s = pic_irqchip(kvm); + + mutex_enter(&s->lock); + s->pics[0].isr_ack = 0xff; + s->pics[1].isr_ack = 0xff; + mutex_exit(&s->lock); +} + +int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + int mmu_reset_needed = 0; + int pending_vec, max_bits; + struct descriptor_table dt; + + vcpu_load(vcpu); + + dt.limit = sregs->idt.limit; + dt.base = sregs->idt.base; + kvm_x86_ops->set_idt(vcpu, &dt); + dt.limit = sregs->gdt.limit; + dt.base = sregs->gdt.base; + kvm_x86_ops->set_gdt(vcpu, &dt); + + vcpu->arch.cr2 = sregs->cr2; + mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; + vcpu->arch.cr3 = sregs->cr3; + + kvm_set_cr8(vcpu, sregs->cr8); + + mmu_reset_needed |= vcpu->arch.efer != sregs->efer; + kvm_x86_ops->set_efer(vcpu, sregs->efer); + kvm_set_apic_base(vcpu, sregs->apic_base); + + mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; + kvm_x86_ops->set_cr0(vcpu, sregs->cr0); + vcpu->arch.cr0 = sregs->cr0; + + mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; + kvm_x86_ops->set_cr4(vcpu, sregs->cr4); + if (!is_long_mode(vcpu) && is_pae(vcpu)) { + load_pdptrs(vcpu, vcpu->arch.cr3); + mmu_reset_needed = 1; + } + + if (mmu_reset_needed) + kvm_mmu_reset_context(vcpu); + + max_bits = (sizeof 
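/*
 * Editor's note (illustrative sketch, not part of this patch): the
 * interrupt_bitmap handed in by userspace has one bit per interrupt vector
 * (256 bits for the usual KVM layout), so max_bits is just
 * sizeof (interrupt_bitmap) * 8.  bt_getlowbit() then locates the
 * lowest-numbered pending vector; if, say, only bit 0x30 is set, the code
 * below re-queues vector 0x30 as a hard (non-soft) interrupt via
 * kvm_queue_interrupt() and clears the in-kernel PIC's ISR-ack state.
 */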
sregs->interrupt_bitmap) << 3; + pending_vec = bt_getlowbit( + (const unsigned long *)sregs->interrupt_bitmap, 0, max_bits); + if (pending_vec < max_bits) { + kvm_queue_interrupt(vcpu, pending_vec, 0); + cmn_err(CE_NOTE, "Set back pending irq %d\n", pending_vec); + if (irqchip_in_kernel(vcpu->kvm)) + kvm_pic_clear_isr_ack(vcpu->kvm); + } + + kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); + kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); + kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES); + kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); + kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); + kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); + + kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); + kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); + + update_cr8_intercept(vcpu); + +#ifdef CONFIG_KVM_APIC_ARCHITECTURE + /* Older userspace won't unhalt the vcpu on reset. */ + if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && + sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && + !is_protmode(vcpu)) + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; +#endif /*CONFIG_KVM_APIC_ARCHITECTURE*/ + + vcpu_put(vcpu); + + return 0; +} + +static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) +{ + static int version; + struct pvclock_wall_clock wc; + struct timespec boot; + +#ifdef XXX + if (!wall_clock) + return; + + version++; + + kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); + + /* + * The guest calculates current wall clock time by adding + * system time (updated by kvm_write_guest_time below) to the + * wall clock specified here. guest system time equals host + * system time for us, thus we must fill in host boot time here. + */ + getboottime(&boot); + + wc.sec = boot.tv_sec; + wc.nsec = boot.tv_nsec; + wc.version = version; + + kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); + + version++; + kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); +#endif /*XXX*/ +} + +static int next_segment(unsigned long len, int offset) +{ + if (len > PAGESIZE - offset) + return PAGESIZE - offset; + else + return len; +} + + +void mark_page_dirty(struct kvm *kvm, gfn_t gfn) +{ + struct kvm_memory_slot *memslot; + +#ifdef XXX + gfn = unalias_gfn(kvm, gfn); + memslot = gfn_to_memslot_unaliased(kvm, gfn); + if (memslot && memslot->dirty_bitmap) { + unsigned long rel_gfn = gfn - memslot->base_gfn; + unsigned long *p = memslot->dirty_bitmap + + rel_gfn / BT_NBIPUL; + int offset = rel_gfn % BT_NBIPUL; + + /* avoid RMW */ + if (!generic_test_le_bit(offset, p)) + generic___set_le_bit(offset, p); + } +#endif /*XXX*/ +} + +int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, + int offset, int len) +{ + int r; + unsigned long addr; + + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return -EFAULT; + r = copyout(data, (caddr_t)((uint64_t)addr + offset), len); + if (r) + return -EFAULT; + mark_page_dirty(kvm, gfn); + return 0; +} + +int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, + unsigned long len) +{ + gfn_t gfn = gpa >> PAGESHIFT; + int seg; + int offset = offset_in_page(gpa); + int ret; + + while ((seg = next_segment(len, offset)) != 0) { + ret = kvm_write_guest_page(kvm, gfn, data, offset, seg); + if (ret < 0) + return ret; + offset = 0; + len -= seg; + data += seg; + ++gfn; + } + return 0; +} + +static int xen_hvm_config(struct kvm_vcpu *vcpu, uint64_t data) +{ + struct kvm *kvm = vcpu->kvm; + int lm = is_long_mode(vcpu); + uint8_t *blob_addr = lm ? 
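/*
 * Editor's note (illustrative sketch, not part of this patch):
 * kvm_write_guest() above copies an arbitrarily sized, arbitrarily aligned
 * buffer by clamping each chunk to the end of the current guest page with
 * next_segment().  For a 6000-byte write starting at gpa 0x1f00
 * (PAGESIZE = 0x1000):
 *
 *	chunk 1: gfn 0x1, offset 0xf00, len 0x100   (up to the page boundary)
 *	chunk 2: gfn 0x2, offset 0,     len 0x1000  (a full page)
 *	chunk 3: gfn 0x3, offset 0,     len 0x670   (the remainder)
 *
 * Each chunk goes through gfn_to_hva() + copyout() in kvm_write_guest_page()
 * and marks its page dirty.  xen_hvm_config() below relies on this to copy
 * a whole blob page into the guest.
 */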
(uint8_t *)(long)kvm->arch.xen_hvm_config.blob_addr_64 + : (uint8_t *)(long)kvm->arch.xen_hvm_config.blob_addr_32; + uint8_t blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64 + : kvm->arch.xen_hvm_config.blob_size_32; + uint32_t page_num = data & ~PAGEMASK; + uint64_t page_addr = data & PAGEMASK; + uint8_t *page; + int r; + + r = E2BIG; + if (page_num >= blob_size) + goto out; + r = ENOMEM; + page = kmem_alloc(PAGESIZE, KM_SLEEP); + if (!page) + goto out; + r = EFAULT; + if (copyin(blob_addr + (page_num * PAGESIZE), page, PAGESIZE)) + goto out_free; + if (kvm_write_guest(kvm, page_addr, page, PAGESIZE)) + goto out_free; + r = 0; +out_free: + kmem_free(page, PAGESIZE); +out: + return r; +} + +int ignore_msrs = 0; +extern int is_paging(struct kvm_vcpu *vcpu); + +static void set_efer(struct kvm_vcpu *vcpu, uint64_t efer) +{ + if (efer & efer_reserved_bits) { + kvm_inject_gp(vcpu, 0); + return; + } + + if (is_paging(vcpu) + && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) { + kvm_inject_gp(vcpu, 0); + return; + } + + if (efer & EFER_FFXSR) { + struct kvm_cpuid_entry2 *feat; + + feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) { + kvm_inject_gp(vcpu, 0); + return; + } + } + + if (efer & EFER_SVME) { + struct kvm_cpuid_entry2 *feat; + + feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) { + kvm_inject_gp(vcpu, 0); + return; + } + } + + kvm_x86_ops->set_efer(vcpu, efer); + + efer &= ~EFER_LMA; + efer |= vcpu->arch.efer & EFER_LMA; + + vcpu->arch.efer = efer; + + vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; + kvm_mmu_reset_context(vcpu); +} + +static int msr_mtrr_valid(unsigned msr) +{ + switch (msr) { + case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1: + case MSR_MTRRfix64K_00000: + case MSR_MTRRfix16K_80000: + case MSR_MTRRfix16K_A0000: + case MSR_MTRRfix4K_C0000: + case MSR_MTRRfix4K_C8000: + case MSR_MTRRfix4K_D0000: + case MSR_MTRRfix4K_D8000: + case MSR_MTRRfix4K_E0000: + case MSR_MTRRfix4K_E8000: + case MSR_MTRRfix4K_F0000: + case MSR_MTRRfix4K_F8000: + case MSR_MTRRdefType: + case MSR_IA32_CR_PAT: + return 1; + case 0x2f8: + return 1; + } + return 0; +} + + +static int valid_pat_type(unsigned t) +{ + return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */ +} + +static int valid_mtrr_type(unsigned t) +{ + return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */ +} + +static int mtrr_valid(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data) +{ + int i; + + if (!msr_mtrr_valid(msr)) + return 0; + + if (msr == MSR_IA32_CR_PAT) { + for (i = 0; i < 8; i++) + if (!valid_pat_type((data >> (i * 8)) & 0xff)) + return 0; + return 1; + } else if (msr == MSR_MTRRdefType) { + if (data & ~0xcff) + return 0; + return valid_mtrr_type(data & 0xff); + } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) { + for (i = 0; i < 8 ; i++) + if (!valid_mtrr_type((data >> (i * 8)) & 0xff)) + return 0; + return 1; + } + + /* variable MTRRs */ + return valid_mtrr_type(data & 0xff); +} + + +static int set_msr_mtrr(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data) +{ + uint64_t *p = (uint64_t *)&vcpu->arch.mtrr_state.fixed_ranges; + + if (!mtrr_valid(vcpu, msr, data)) + return 1; + + if (msr == MSR_MTRRdefType) { + vcpu->arch.mtrr_state.def_type = data; + vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10; + } else if (msr == MSR_MTRRfix64K_00000) + p[0] = data; + else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) + p[1 + msr - MSR_MTRRfix16K_80000] = 
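/*
 * Editor's note (illustrative sketch, not part of this patch): the
 * variable-range MTRRs come in base/mask pairs starting at MSR 0x200
 * (MTRRphysBase0, MTRRphysMask0, MTRRphysBase1, ...), which is what the
 * idx/is_mtrr_mask arithmetic further below decodes:
 *
 *	msr 0x202 (MTRRphysBase1): idx = (0x202 - 0x200) / 2 = 1, is_mtrr_mask = 0
 *	msr 0x203 (MTRRphysMask1): idx = 1, is_mtrr_mask = 1
 *
 * The fixed-range registers are packed into mtrr_state.fixed_ranges:
 * slot 0 for MTRRfix64K_00000, slots 1-2 for the two 16K registers, and
 * slots 3-10 for the eight 4K registers.
 */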
data; + else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) + p[3 + msr - MSR_MTRRfix4K_C0000] = data; + else if (msr == MSR_IA32_CR_PAT) + vcpu->arch.pat = data; + else { /* Variable MTRRs */ + int idx, is_mtrr_mask; + uint64_t *pt; + + idx = (msr - 0x200) / 2; + is_mtrr_mask = msr - 0x200 - 2 * idx; + if (!is_mtrr_mask) + pt = + (uint64_t *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; + else + pt = + (uint64_t *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; + *pt = data; + } + +#ifdef XXX + kvm_mmu_reset_context(vcpu); +#endif /*XXX*/ + return 0; +} + +static int set_msr_hyperv(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data) +{ + switch (msr) { +#ifdef XXX + case HV_X64_MSR_APIC_ASSIST_PAGE: { + unsigned long addr; + + if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) { + vcpu->arch.hv_vapic = data; + break; + } + addr = gfn_to_hva(vcpu->kvm, data >> + HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT); + if (kvm_is_error_hva(addr)) + return 1; + if (clear_user((void __user *)addr, PAGESIZE)) + return 1; + vcpu->arch.hv_vapic = data; + break; + } + case HV_X64_MSR_EOI: + return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data); + case HV_X64_MSR_ICR: + return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data); + case HV_X64_MSR_TPR: + return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); +#endif /*XXX*/ + default: + cmn_err(CE_WARN, "HYPER-V unimplemented wrmsr: 0x%x " + "data 0x%llx\n", msr, data); + return 1; + } + + return 0; +} + +static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data) +{ + struct kvm *kvm = vcpu->kvm; + + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + kvm->arch.hv_guest_os_id = data; + /* setting guest os id to zero disables hypercall page */ + if (!kvm->arch.hv_guest_os_id) + kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE; + break; + case HV_X64_MSR_HYPERCALL: { + uint64_t gfn; + unsigned long addr; + uint8_t instructions[4]; + + /* if guest os id is not set hypercall should remain disabled */ + if (!kvm->arch.hv_guest_os_id) + break; + if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) { + kvm->arch.hv_hypercall = data; + break; + } + gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT; + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return 1; + kvm_x86_ops->patch_hypercall(vcpu, instructions); + ((unsigned char *)instructions)[3] = 0xc3; /* ret */ + if (copyout(instructions, (caddr_t)addr, 4)) + return 1; + kvm->arch.hv_hypercall = data; + break; + } + default: + cmn_err(CE_WARN, "HYPER-V unimplemented wrmsr: 0x%x " + "data 0x%llx\n", msr, data); + return 1; + } + return 0; +} + +static int set_msr_mce(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data) +{ + uint64_t mcg_cap = vcpu->arch.mcg_cap; + unsigned bank_num = mcg_cap & 0xff; + + switch (msr) { + case MSR_IA32_MCG_STATUS: + vcpu->arch.mcg_status = data; + break; + case MSR_IA32_MCG_CTL: + if (!(mcg_cap & MCG_CTL_P)) + return 1; + if (data != 0 && data != ~(uint64_t)0) + return -1; + vcpu->arch.mcg_ctl = data; + break; + default: + if (msr >= MSR_IA32_MC0_CTL && + msr < MSR_IA32_MC0_CTL + 4 * bank_num) { + uint32_t offset = msr - MSR_IA32_MC0_CTL; + /* only 0 or all 1s can be written to IA32_MCi_CTL + * some Linux kernels though clear bit 10 in bank 4 to + * workaround a BIOS/GART TBL issue on AMD K8s, ignore + * this to avoid an uncatched #GP in the guest + */ + if ((offset & 0x3) == 0 && + data != 0 && (data | (1 << 10)) != ~(uint64_t)0) + return -1; + vcpu->arch.mce_banks[offset] = data; + break; + } + return 1; + } + return 0; +} + +static int 
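/*
 * Editor's note (illustrative sketch, not part of this patch): Hyper-V
 * enlightenment MSRs split into partition-wide registers (GUEST_OS_ID,
 * HYPERCALL), which live in struct kvm and therefore need kvm->lock, and
 * per-vcpu registers (EOI, ICR, TPR, VP index), which only touch the vcpu.
 * The predicate defined just below lets the common MSR paths pick the
 * right handler and locking, as kvm_set_msr_common() does later:
 *
 *	if (kvm_hv_msr_partition_wide(msr)) {
 *		mutex_enter(&vcpu->kvm->lock);
 *		r = set_msr_hyperv_pw(vcpu, msr, data);
 *		mutex_exit(&vcpu->kvm->lock);
 *	} else
 *		r = set_msr_hyperv(vcpu, msr, data);
 */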
kvm_hv_msr_partition_wide(uint32_t msr) +{ + int r = 0; + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + case HV_X64_MSR_HYPERCALL: + r = 1; + break; + } + + return r; +} + + +static inline void get_page(caddr_t page) +{ +} + +struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) +{ + pfn_t pfn; + + pfn = gfn_to_pfn(kvm, gfn); +#ifdef XXX + if (!kvm_is_mmio_pfn(pfn)) + return pfn_to_page(pfn); +#endif /*XXX*/ + + get_page(bad_page); + return (struct page *)bad_page; +} + + +int kvm_set_msr_common(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data) +{ + volatile int x; + + switch (msr) { + case MSR_EFER: + set_efer(vcpu, data); + break; + case MSR_K7_HWCR: + data &= ~(uint64_t)0x40; /* ignore flush filter disable */ + if (data != 0) { + cmn_err(CE_NOTE, "unimplemented HWCR wrmsr: 0x%llx\n", + data); + return 1; + } + break; + case MSR_FAM10H_MMIO_CONF_BASE: + if (data != 0) { + cmn_err(CE_NOTE, "unimplemented MMIO_CONF_BASE wrmsr: " + "0x%llx\n", data); + return 1; + } + break; + case MSR_AMD64_NB_CFG: + break; + case MSR_IA32_DEBUGCTLMSR: + if (!data) { + /* We support the non-activated case already */ + break; + } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) { + /* Values other than LBR and BTF are vendor-specific, + thus reserved and should throw a #GP */ + return 1; + } + cmn_err(CE_NOTE, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", + __func__, data); + break; + case MSR_IA32_UCODE_REV: + case MSR_IA32_UCODE_WRITE: + case MSR_VM_HSAVE_PA: + case MSR_AMD64_PATCH_LOADER: + break; + case 0x200 ... 0x2ff: + return set_msr_mtrr(vcpu, msr, data); + case MSR_IA32_APICBASE: + kvm_set_apic_base(vcpu, data); + break; +#ifdef XXX + case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: + return kvm_x2apic_msr_write(vcpu, msr, data); +#endif /*XXX*/ + case MSR_IA32_MISC_ENABLE: + vcpu->arch.ia32_misc_enable_msr = data; + break; + case MSR_KVM_WALL_CLOCK: + vcpu->kvm->arch.wall_clock = data; + kvm_write_wall_clock(vcpu->kvm, data); + break; + case MSR_KVM_SYSTEM_TIME: { +#ifdef XXX + if (vcpu->arch.time_page) { + kvm_release_page_dirty(vcpu->arch.time_page); + vcpu->arch.time_page = NULL; + } +#endif /*XXX*/ + + vcpu->arch.time = data; + + /* we verify if the enable bit is set... */ + if (!(data & 1)) + break; + + /* ...but clean it before doing the actual write */ + vcpu->arch.time_offset = data & ~(PAGEOFFSET | 1); +#ifdef XXX + vcpu->arch.time_page = + gfn_to_page(vcpu->kvm, data >> PAGESHIFT); + + if (is_error_page(vcpu->arch.time_page)) { + kvm_release_page_clean(vcpu->arch.time_page); + vcpu->arch.time_page = NULL; + } + + kvm_request_guest_time_update(vcpu); +#endif /*XXX*/ + break; + } + case MSR_IA32_MCG_CTL: + case MSR_IA32_MCG_STATUS: + case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: + return set_msr_mce(vcpu, msr, data); + + /* Performance counters are not protected by a CPUID bit, + * so we should check all of them in the generic path for the sake of + * cross vendor migration. + * Writing a zero into the event select MSRs disables them, + * which we perfectly emulate ;-). Any other value should be at least + * reported, some guests depend on them. + */ + case MSR_P6_EVNTSEL0: + case MSR_P6_EVNTSEL1: + case MSR_K7_EVNTSEL0: + case MSR_K7_EVNTSEL1: + case MSR_K7_EVNTSEL2: + case MSR_K7_EVNTSEL3: + if (data != 0) + cmn_err(CE_NOTE, "unimplemented perfctr wrmsr: " + "0x%x data 0x%llx\n", msr, data); + break; + /* at least RHEL 4 unconditionally writes to the perfctr registers, + * so we ignore writes to make it happy. 
+ */ + case MSR_P6_PERFCTR0: + case MSR_P6_PERFCTR1: + case MSR_K7_PERFCTR0: + case MSR_K7_PERFCTR1: + case MSR_K7_PERFCTR2: + case MSR_K7_PERFCTR3: + cmn_err(CE_NOTE, "unimplemented perfctr wrmsr: " + "0x%x data 0x%llx\n", msr, data); + break; + case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + if (kvm_hv_msr_partition_wide(msr)) { + int r; + mutex_enter(&vcpu->kvm->lock); + r = set_msr_hyperv_pw(vcpu, msr, data); + mutex_exit(&vcpu->kvm->lock); + return r; + } else + return set_msr_hyperv(vcpu, msr, data); + break; + default: + if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) + return xen_hvm_config(vcpu, data); + if (!ignore_msrs) { + cmn_err(CE_NOTE, "unhandled wrmsr: 0x%x data %llx\n", + msr, data); + return 1; + } else { + cmn_err(CE_NOTE, "ignored wrmsr: 0x%x data %llx\n", + msr, data); + break; + } + } + x = 10; /*XXX*/ + return 0; +} + + + +static int get_msr_mtrr(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata) +{ + uint64_t *p = (uint64_t *)&vcpu->arch.mtrr_state.fixed_ranges; + + if (!msr_mtrr_valid(msr)) + return 1; + + if (msr == MSR_MTRRdefType) + *pdata = vcpu->arch.mtrr_state.def_type + + (vcpu->arch.mtrr_state.enabled << 10); + else if (msr == MSR_MTRRfix64K_00000) + *pdata = p[0]; + else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) + *pdata = p[1 + msr - MSR_MTRRfix16K_80000]; + else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) + *pdata = p[3 + msr - MSR_MTRRfix4K_C0000]; + else if (msr == MSR_IA32_CR_PAT) + *pdata = vcpu->arch.pat; + else { /* Variable MTRRs */ + int idx, is_mtrr_mask; + uint64_t *pt; + + idx = (msr - 0x200) / 2; + is_mtrr_mask = msr - 0x200 - 2 * idx; + if (!is_mtrr_mask) + pt = + (uint64_t *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; + else + pt = + (uint64_t *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; + *pdata = *pt; + } + + return 0; +} + + + +static int get_msr_hyperv(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata) +{ + uint64_t data = 0; + + switch (msr) { +#ifdef XXX + case HV_X64_MSR_VP_INDEX: { + int r; + struct kvm_vcpu *v; + kvm_for_each_vcpu(r, v, vcpu->kvm) + if (v == vcpu) + data = r; + break; + } + case HV_X64_MSR_EOI: + return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata); + case HV_X64_MSR_ICR: + return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata); + case HV_X64_MSR_TPR: + return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); +#endif /*XXX*/ + default: + cmn_err(CE_WARN, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + return 1; + } + *pdata = data; + return 0; +} + +static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata) +{ + uint64_t data = 0; + struct kvm *kvm = vcpu->kvm; + + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + data = kvm->arch.hv_guest_os_id; + break; + case HV_X64_MSR_HYPERCALL: + data = kvm->arch.hv_hypercall; + break; + default: + cmn_err(CE_WARN, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + return 1; + } + + *pdata = data; + return 0; +} + +static int get_msr_mce(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata) +{ + uint64_t data; + uint64_t mcg_cap = vcpu->arch.mcg_cap; + unsigned bank_num = mcg_cap & 0xff; + + switch (msr) { + case MSR_IA32_P5_MC_ADDR: + case MSR_IA32_P5_MC_TYPE: + data = 0; + break; + case MSR_IA32_MCG_CAP: + data = vcpu->arch.mcg_cap; + break; + case MSR_IA32_MCG_CTL: + if (!(mcg_cap & MCG_CTL_P)) + return 1; + data = vcpu->arch.mcg_ctl; + break; + case MSR_IA32_MCG_STATUS: + data = vcpu->arch.mcg_status; + break; + default: + if (msr >= MSR_IA32_MC0_CTL && + msr < MSR_IA32_MC0_CTL + 4 * bank_num) { + 
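/*
 * Editor's note (illustrative sketch, not part of this patch): the
 * machine-check bank MSRs are laid out as four consecutive registers per
 * bank starting at MSR_IA32_MC0_CTL, so "offset" below indexes directly
 * into vcpu->arch.mce_banks[]:
 *
 *	offset = msr - MSR_IA32_MC0_CTL;
 *	bank   = offset / 4;
 *	reg    = offset % 4;	// 0 = CTL, 1 = STATUS, 2 = ADDR, 3 = MISC
 *
 * e.g. MC2_STATUS is offset 9: bank 2, register 1.  This is also why the
 * write side only accepts 0 or all-ones for the (offset & 0x3) == 0 CTL
 * registers.
 */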
uint32_t offset = msr - MSR_IA32_MC0_CTL; + data = vcpu->arch.mce_banks[offset]; + break; + } + return 1; + } + *pdata = data; + return 0; +} + + +int kvm_get_msr_common(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata) +{ + uint64_t data; + volatile int x; /*XXX - dtrace return probe is not there... */ + + switch (msr) { + case MSR_IA32_PLATFORM_ID: + case MSR_IA32_UCODE_REV: + case MSR_IA32_EBL_CR_POWERON: + case MSR_IA32_DEBUGCTLMSR: + case MSR_IA32_LASTBRANCHFROMIP: + case MSR_IA32_LASTBRANCHTOIP: + case MSR_IA32_LASTINTFROMIP: + case MSR_IA32_LASTINTTOIP: + case MSR_K8_SYSCFG: + case MSR_K7_HWCR: + case MSR_VM_HSAVE_PA: + case MSR_P6_PERFCTR0: + case MSR_P6_PERFCTR1: + case MSR_P6_EVNTSEL0: + case MSR_P6_EVNTSEL1: + case MSR_K7_EVNTSEL0: + case MSR_K7_PERFCTR0: + case MSR_K8_INT_PENDING_MSG: + case MSR_AMD64_NB_CFG: + case MSR_FAM10H_MMIO_CONF_BASE: + data = 0; + break; + case MSR_MTRRcap: + data = 0x500 | KVM_NR_VAR_MTRR; + break; + case 0x200 ... 0x2ff: + return get_msr_mtrr(vcpu, msr, pdata); + case 0xcd: /* fsb frequency */ + data = 3; + break; + case MSR_IA32_APICBASE: + data = kvm_get_apic_base(vcpu); + break; +#ifdef XXX + case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: + return kvm_x2apic_msr_read(vcpu, msr, pdata); + break; +#endif /*XXX*/ + case MSR_IA32_MISC_ENABLE: + data = vcpu->arch.ia32_misc_enable_msr; + break; + case MSR_IA32_PERF_STATUS: + /* TSC increment by tick */ + data = 1000ULL; + /* CPU multiplier */ + data |= (((uint64_t)4ULL) << 40); + break; + case MSR_EFER: + data = vcpu->arch.efer; + break; + case MSR_KVM_WALL_CLOCK: + data = vcpu->kvm->arch.wall_clock; + break; + case MSR_KVM_SYSTEM_TIME: + data = vcpu->arch.time; + break; + case MSR_IA32_P5_MC_ADDR: + case MSR_IA32_P5_MC_TYPE: + case MSR_IA32_MCG_CAP: + case MSR_IA32_MCG_CTL: + case MSR_IA32_MCG_STATUS: + case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: + return get_msr_mce(vcpu, msr, pdata); + case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + if (kvm_hv_msr_partition_wide(msr)) { + int r; + mutex_enter(&vcpu->kvm->lock); + r = get_msr_hyperv_pw(vcpu, msr, pdata); + mutex_exit(&vcpu->kvm->lock); + return r; + } else + return get_msr_hyperv(vcpu, msr, pdata); + break; + default: + if (!ignore_msrs) { + cmn_err(CE_NOTE, "unhandled rdmsr: 0x%x\n", msr); + return 1; + } else { + cmn_err(CE_NOTE, "ignored rdmsr: 0x%x\n", msr); + data = 0; + } + break; + } + *pdata = data; + x = 10; /*XXX*/ + return 0; +} + +/* + * Read or write a bunch of msrs. All parameters are kernel addresses. + * + * @return number of msrs set successfully. + */ +static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, + struct kvm_msr_entry *entries, + int (*do_msr)(struct kvm_vcpu *vcpu, + unsigned index, uint64_t *data)) +{ + int i, idx; + + vcpu_load(vcpu); + +#ifdef XXX + idx = srcu_read_lock(&vcpu->kvm->srcu); +#endif + for (i = 0; i < msrs->nmsrs; ++i) + if (do_msr(vcpu, entries[i].index, &entries[i].data)) + break; +#ifdef XXX + srcu_read_unlock(&vcpu->kvm->srcu, idx); +#endif + vcpu_put(vcpu); + + return i; +} + +/* + * reads and returns guest's timestamp counter "register" + * guest_tsc = host_tsc + tsc_offset -- 21.3 + */ +static uint64_t guest_read_tsc(void) +{ + uint64_t host_tsc, tsc_offset; + + rdtscll(host_tsc); + tsc_offset = vmcs_read64(TSC_OFFSET); + return host_tsc + tsc_offset; +} + + +/* + * Reads an msr value (of 'msr_index') into 'pdata'. + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. 
+ */ +static int vmx_get_msr(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t *pdata) +{ + uint64_t data; + struct shared_msr_entry *msr; + + if (!pdata) { + cmn_err(CE_WARN, "BUG: get_msr called with NULL pdata\n"); + return EINVAL; + } + + switch (msr_index) { +#ifdef CONFIG_X86_64 + case MSR_FS_BASE: + data = vmcs_readl(GUEST_FS_BASE); + break; + case MSR_GS_BASE: + data = vmcs_readl(GUEST_GS_BASE); + break; + case MSR_KERNEL_GS_BASE: + vmx_load_host_state((struct vcpu_vmx *)vcpu); + data = ((struct vcpu_vmx *)(vcpu))->msr_guest_kernel_gs_base; + break; +#endif + case MSR_EFER: + return kvm_get_msr_common(vcpu, msr_index, pdata); + case MSR_IA32_TSC: + data = guest_read_tsc(); + break; + case MSR_IA32_SYSENTER_CS: + data = vmcs_read32(GUEST_SYSENTER_CS); + break; + case MSR_IA32_SYSENTER_EIP: + data = vmcs_readl(GUEST_SYSENTER_EIP); + break; + case MSR_IA32_SYSENTER_ESP: + data = vmcs_readl(GUEST_SYSENTER_ESP); + break; + case MSR_TSC_AUX: + if (!((struct vcpu_vmx *)(vcpu))->rdtscp_enabled) + return 1; + /* Otherwise falls through */ + default: + vmx_load_host_state((struct vcpu_vmx *)vcpu); + msr = find_msr_entry((struct vcpu_vmx *)vcpu, msr_index); + if (msr) { + vmx_load_host_state((struct vcpu_vmx *)vcpu); + data = msr->data; + break; + } + return kvm_get_msr_common(vcpu, msr_index, pdata); + } + + *pdata = data; + return 0; +} + +/* + * Reads an msr value (of 'msr_index') into 'pdata'. + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. + */ +int kvm_get_msr(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t *pdata) +{ + return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); +} + + +/* + * Writes msr value into into the appropriate "register". + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. + */ +static int vmx_set_msr(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t data) +{ + struct vcpu_vmx *vmx = (struct vcpu_vmx *)vcpu; + struct shared_msr_entry *msr; + uint64_t host_tsc; + int ret = 0; + + switch (msr_index) { + case MSR_EFER: + vmx_load_host_state(vmx); + ret = kvm_set_msr_common(vcpu, msr_index, data); + break; +#ifdef CONFIG_X86_64 + case MSR_FS_BASE: + vmcs_writel(GUEST_FS_BASE, data); + break; + case MSR_GS_BASE: + vmcs_writel(GUEST_GS_BASE, data); + break; + case MSR_KERNEL_GS_BASE: + vmx_load_host_state(vmx); + vmx->msr_guest_kernel_gs_base = data; + break; +#endif + case MSR_IA32_SYSENTER_CS: + vmcs_write32(GUEST_SYSENTER_CS, data); + break; + case MSR_IA32_SYSENTER_EIP: + vmcs_writel(GUEST_SYSENTER_EIP, data); + break; + case MSR_IA32_SYSENTER_ESP: + vmcs_writel(GUEST_SYSENTER_ESP, data); + break; + case MSR_IA32_TSC: + rdtscll(host_tsc); + guest_write_tsc(data, host_tsc); + break; + case MSR_IA32_CR_PAT: + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { + vmcs_write64(GUEST_IA32_PAT, data); + vcpu->arch.pat = data; + break; + } + ret = kvm_set_msr_common(vcpu, msr_index, data); + break; + case MSR_TSC_AUX: + if (!vmx->rdtscp_enabled) + return 1; + /* Check reserved bit, higher 32 bits should be zero */ + if ((data >> 32) != 0) + return 1; + /* Otherwise falls through */ + default: + msr = find_msr_entry(vmx, msr_index); + if (msr) { + vmx_load_host_state(vmx); + msr->data = data; + break; + } + ret = kvm_set_msr_common(vcpu, msr_index, data); + } + + return ret; +} + +/* + * Writes msr value into into the appropriate "register". + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. 
+ */ +int kvm_set_msr(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t data) +{ + return kvm_x86_ops->set_msr(vcpu, msr_index, data); +} + +/* + * Adapt set_msr() to msr_io()'s calling convention + */ +static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, uint64_t *data) +{ + return kvm_set_msr(vcpu, index, *data); +} + +static inline int is_machine_check(uint32_t intr_info) +{ + return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | + INTR_INFO_VALID_MASK)) == + (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK); +} + +/* + * Trigger machine check on the host. We assume all the MSRs are already set up + * by the CPU and that we still run on the same CPU as the MCE occurred on. + * We pass a fake environment to the machine check handler because we want + * the guest to be always treated like user space, no matter what context + * it used internally. + */ +static void kvm_machine_check(void) +{ +#ifdef XXX +#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64) + struct pt_regs regs = { + .cs = 3, /* Fake ring 3 no matter what the guest ran on */ + .flags = X86_EFLAGS_IF, + }; + + do_machine_check(®s, 0); +#endif +#endif /*XXX*/ +} + +static void vmcs_clear_bits(unsigned long field, uint32_t mask) +{ + vmcs_writel(field, vmcs_readl(field) & ~mask); +} + +static void vmcs_set_bits(unsigned long field, uint32_t mask) +{ + vmcs_writel(field, vmcs_readl(field) | mask); +} + +#define EXCPT_BENIGN 0 +#define EXCPT_CONTRIBUTORY 1 +#define EXCPT_PF 2 + +static int exception_class(int vector) +{ + switch (vector) { + case PF_VECTOR: + return EXCPT_PF; + case DE_VECTOR: + case TS_VECTOR: + case NP_VECTOR: + case SS_VECTOR: + case GP_VECTOR: + return EXCPT_CONTRIBUTORY; + default: + break; + } + return EXCPT_BENIGN; +} + +static void kvm_multiple_exception(struct kvm_vcpu *vcpu, + unsigned nr, int has_error, uint32_t error_code) +{ + uint32_t prev_nr; + int class1, class2; + + if (!vcpu->arch.exception.pending) { + queue: + vcpu->arch.exception.pending = 1; + vcpu->arch.exception.has_error_code = has_error; + vcpu->arch.exception.nr = nr; + vcpu->arch.exception.error_code = error_code; + return; + } + + /* to check exception */ + prev_nr = vcpu->arch.exception.nr; + if (prev_nr == DF_VECTOR) { + /* triple fault -> shutdown */ + BT_SET(&vcpu->requests, KVM_REQ_TRIPLE_FAULT); + return; + } + class1 = exception_class(prev_nr); + class2 = exception_class(nr); + if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) + || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) { + /* generate double fault per SDM Table 5-5 */ + vcpu->arch.exception.pending = 1; + vcpu->arch.exception.has_error_code = 1; + vcpu->arch.exception.nr = DF_VECTOR; + vcpu->arch.exception.error_code = 0; + } else + /* replace previous exception with a new one in a hope + that instruction re-execution will regenerate lost + exception */ + goto queue; +} + +void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) +{ + kvm_multiple_exception(vcpu, nr, 0, 0); +} + +void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, uint32_t error_code) +{ + kvm_multiple_exception(vcpu, nr, 1, error_code); +} + + +static void vmx_complete_interrupts(struct vcpu_vmx *vmx) +{ + uint32_t exit_intr_info; + uint32_t idt_vectoring_info = vmx->idt_vectoring_info; + int unblock_nmi; + uint8_t vector; + int type; + int idtv_info_valid; + + exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + + vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); + + /* Handle machine checks before interrupts are enabled */ + if 
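/*
 * Editor's note (illustrative sketch, not part of this patch):
 * kvm_multiple_exception() above encodes the SDM Table 5-5 rules for
 * merging a newly raised exception with one already pending.  Worked
 * examples with this code:
 *
 *	pending #PF, new #GP          -> queue #DF with error code 0
 *	pending #GP, new #TS          -> queue #DF (both contributory)
 *	pending #DF, new anything     -> request KVM_REQ_TRIPLE_FAULT (shutdown)
 *	pending #DB (benign), new #PF -> the #PF simply replaces the #DB,
 *	                                 relying on re-execution to regenerate it
 */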
((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY) + || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI + && is_machine_check(exit_intr_info))) + kvm_machine_check(); + + /* We need to handle NMIs before interrupts are enabled */ + if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && + (exit_intr_info & INTR_INFO_VALID_MASK)) + asm("int $2"); + + idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; + +#ifdef XXX + if (cpu_has_virtual_nmis()) { + unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; + vector = exit_intr_info & INTR_INFO_VECTOR_MASK; + /* + * SDM 3: 27.7.1.2 (September 2008) + * Re-set bit "block by NMI" before VM entry if vmexit caused by + * a guest IRET fault. + * SDM 3: 23.2.2 (September 2008) + * Bit 12 is undefined in any of the following cases: + * If the VM exit sets the valid bit in the IDT-vectoring + * information field. + * If the VM exit is due to a double fault. + */ + if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && + vector != DF_VECTOR && !idtv_info_valid) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + } else if (unlikely(vmx->soft_vnmi_blocked)) + vmx->vnmi_blocked_time += + ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); +#endif /*XXX*/ + vmx->vcpu.arch.nmi_injected = 0; +#ifdef XXX + kvm_clear_exception_queue(&vmx->vcpu); + kvm_clear_interrupt_queue(&vmx->vcpu); + + if (!idtv_info_valid) + return; +#endif /*XXX*/ + vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; + type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; + + switch (type) { + case INTR_TYPE_NMI_INTR: + vmx->vcpu.arch.nmi_injected = 1; + /* + * SDM 3: 27.7.1.2 (September 2008) + * Clear bit "block by NMI" before VM entry if a NMI + * delivery faulted. + */ + vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + break; + case INTR_TYPE_SOFT_EXCEPTION: + vmx->vcpu.arch.event_exit_inst_len = + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + /* fall through */ + case INTR_TYPE_HARD_EXCEPTION: +#ifdef XXX + if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { + uint32_t err = vmcs_read32(IDT_VECTORING_ERROR_CODE); + kvm_queue_exception_e(&vmx->vcpu, vector, err); + } else + kvm_queue_exception(&vmx->vcpu, vector); +#endif /*XXX*/ + break; + case INTR_TYPE_SOFT_INTR: + vmx->vcpu.arch.event_exit_inst_len = + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + /* fall through */ + case INTR_TYPE_EXT_INTR: +#ifdef XXX + kvm_queue_interrupt(&vmx->vcpu, vector, + type == INTR_TYPE_SOFT_INTR); +#endif /*XXX*/ + break; + default: + break; + } +} + +#ifdef CONFIG_X86_64 +#define R "r" +#define Q "q" +#else +#define R "e" +#define Q "l" +#endif + +/* + * Volatile isn't enough to prevent the compiler from reordering the + * read/write functions for the control registers and messing everything up. + * A memory clobber would solve the problem, but would prevent reordering of + * all loads stores around it, which can hurt performance. Solution is to + * use a variable and mimic reads and writes to it to enforce serialization + */ +static unsigned long __force_order; + +static inline unsigned long native_read_cr0(void) +{ + unsigned long val; + asm volatile("mov %%cr0,%0\n\t" : "=r" (val), "=m" (__force_order)); + return val; +} + +#define read_cr0() (native_read_cr0()) + +static void vmx_vcpu_run(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = (struct vcpu_vmx *)vcpu; + + /* Record the guest's net vcpu time for enforced NMI injections. 
*/ +#ifdef XXX + if (!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked) + vmx->entry_time = ktime_get(); + + /* Don't enter VMX if guest state is invalid, let the exit handler + start emulation until we arrive back to a valid state */ + if (vmx->emulation_required && emulate_invalid_guest_state) + return; + + if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) + vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); + if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) + vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); + + /* When single-stepping over STI and MOV SS, we must clear the + * corresponding interruptibility bits in the guest state. Otherwise + * vmentry fails as it then expects bit 14 (BS) in pending debug + * exceptions being set, but that's not correct for the guest debugging + * case. */ + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + vmx_set_interrupt_shadow(vcpu, 0); +#endif /*XXX*/ + + /* + * Loading guest fpu may have cleared host cr0.ts + */ + vmcs_writel(HOST_CR0, read_cr0()); + + asm( + /* Store host registers */ + "push %%"R"dx; push %%"R"bp;" + "push %%"R"cx \n\t" + "cmp %%"R"sp, %c[host_rsp](%0) \n\t" + "je 1f \n\t" + "mov %%"R"sp, %c[host_rsp](%0) \n\t" + __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" + "1: \n\t" + /* Reload cr2 if changed */ + "mov %c[cr2](%0), %%"R"ax \n\t" + "mov %%cr2, %%"R"dx \n\t" + "cmp %%"R"ax, %%"R"dx \n\t" + "je 2f \n\t" + "mov %%"R"ax, %%cr2 \n\t" + "2: \n\t" + /* Check if vmlaunch of vmresume is needed */ + "cmpl $0, %c[launched](%0) \n\t" + /* Load guest registers. Don't clobber flags. */ + "mov %c[rax](%0), %%"R"ax \n\t" + "mov %c[rbx](%0), %%"R"bx \n\t" + "mov %c[rdx](%0), %%"R"dx \n\t" + "mov %c[rsi](%0), %%"R"si \n\t" + "mov %c[rdi](%0), %%"R"di \n\t" + "mov %c[rbp](%0), %%"R"bp \n\t" +#ifdef CONFIG_X86_64 + "mov %c[r8](%0), %%r8 \n\t" + "mov %c[r9](%0), %%r9 \n\t" + "mov %c[r10](%0), %%r10 \n\t" + "mov %c[r11](%0), %%r11 \n\t" + "mov %c[r12](%0), %%r12 \n\t" + "mov %c[r13](%0), %%r13 \n\t" + "mov %c[r14](%0), %%r14 \n\t" + "mov %c[r15](%0), %%r15 \n\t" +#endif + "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */ + + /* Enter guest mode */ + "jne .Llaunched \n\t" + __ex(ASM_VMX_VMLAUNCH) "\n\t" + "jmp .Lkvm_vmx_return \n\t" + ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" + ".Lkvm_vmx_return: " + /* Save guest registers, load host registers, keep flags */ + "xchg %0, (%%"R"sp) \n\t" + "mov %%"R"ax, %c[rax](%0) \n\t" + "mov %%"R"bx, %c[rbx](%0) \n\t" + "push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t" + "mov %%"R"dx, %c[rdx](%0) \n\t" + "mov %%"R"si, %c[rsi](%0) \n\t" + "mov %%"R"di, %c[rdi](%0) \n\t" + "mov %%"R"bp, %c[rbp](%0) \n\t" +#ifdef CONFIG_X86_64 + "mov %%r8, %c[r8](%0) \n\t" + "mov %%r9, %c[r9](%0) \n\t" + "mov %%r10, %c[r10](%0) \n\t" + "mov %%r11, %c[r11](%0) \n\t" + "mov %%r12, %c[r12](%0) \n\t" + "mov %%r13, %c[r13](%0) \n\t" + "mov %%r14, %c[r14](%0) \n\t" + "mov %%r15, %c[r15](%0) \n\t" +#endif + "mov %%cr2, %%"R"ax \n\t" + "mov %%"R"ax, %c[cr2](%0) \n\t" + + "pop %%"R"bp; pop %%"R"bp; pop %%"R"dx \n\t" + "setbe %c[fail](%0) \n\t" + : : "c"(vmx), "d"((unsigned long)HOST_RSP), + [launched]"i"(offsetof(struct vcpu_vmx, launched)), + [fail]"i"(offsetof(struct vcpu_vmx, fail)), + [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), + [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), + [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), + [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), + [rdx]"i"(offsetof(struct vcpu_vmx, 
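/*
 * Editor's note (illustrative sketch, not part of this patch): the asm
 * block above keys off two vcpu_vmx fields.  host_rsp caches the stack
 * pointer last written to the VMCS HOST_RSP field, so the cmp/VMWRITE pair
 * only rewrites it when the stack has moved; launched selects between the
 * first and subsequent entries on this VMCS:
 *
 *	if (!vmx->launched)
 *		VMLAUNCH;	// first entry after VMCLEAR/VMPTRLD
 *	else
 *		VMRESUME;	// every later entry
 *
 * On the way out, "setbe" records a failed VMLAUNCH/VMRESUME (CF or ZF set)
 * in vmx->fail, and vmx->launched is set to 1 after the asm completes.
 */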
vcpu.arch.regs[VCPU_REGS_RDX])), + [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])), + [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])), + [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])), +#ifdef CONFIG_X86_64 + [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])), + [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])), + [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])), + [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])), + [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])), + [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])), + [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), + [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), +#endif + [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) + : "cc", "memory" + , R"bx", R"di", R"si" +#ifdef CONFIG_X86_64 + , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" +#endif + ); + + vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) + | (1 << VCPU_EXREG_PDPTR)); + vcpu->arch.regs_dirty = 0; + + vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); + +#ifdef XXX + if (vmx->rmode.irq.pending) + fixup_rmode_irq(vmx); +#endif /*XXX*/ + + asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); + vmx->launched = 1; + + vmx_complete_interrupts(vmx); +} + +#undef R +#undef Q + +void kvm_set_shared_msr(unsigned slot, uint64_t value, uint64_t mask) +{ +#ifdef XXX + struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs); + + if (((value ^ smsr->values[slot].curr) & mask) == 0) + return; + smsr->values[slot].curr = value; + wrmsrl(shared_msrs_global.msrs[slot], value); + if (!smsr->registered) { + smsr->urn.on_user_return = kvm_on_user_return; + user_return_notifier_register(&smsr->urn); + smsr->registered = 1; + } +#endif /*XXX*/ +} +static void vmx_save_host_state(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int i; + + if (vmx->host_state.loaded) + return; + + vmx->host_state.loaded = 1; + /* + * Set host fs and gs selectors. Unfortunately, 22.2.3 does not + * allow segment selectors with cpl > 0 or ti == 1. 
+ */ + vmx->host_state.ldt_sel = kvm_read_ldt(); + vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; + vmx->host_state.fs_sel = kvm_read_fs(); + if (!(vmx->host_state.fs_sel & 7)) { + vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); + vmx->host_state.fs_reload_needed = 0; + } else { + vmcs_write16(HOST_FS_SELECTOR, 0); + vmx->host_state.fs_reload_needed = 1; + } + vmx->host_state.gs_sel = kvm_read_gs(); + if (!(vmx->host_state.gs_sel & 7)) + vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); + else { + vmcs_write16(HOST_GS_SELECTOR, 0); + vmx->host_state.gs_ldt_reload_needed = 1; + } + +#ifdef CONFIG_X86_64 + vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE)); + vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE)); +#else + vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel)); + vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel)); +#endif + +#ifdef CONFIG_X86_64 + if (is_long_mode(&vmx->vcpu)) { + rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); + wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + } +#endif + for (i = 0; i < vmx->save_nmsrs; ++i) + kvm_set_shared_msr(vmx->guest_msrs[i].index, + vmx->guest_msrs[i].data, + vmx->guest_msrs[i].mask); +} + +int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) +{ + return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && + !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); +} + +int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) +{ + return kvm_x86_ops->interrupt_allowed(vcpu); +} + +static int handle_machine_check(struct kvm_vcpu *vcpu) +{ + /* already handled by vcpu_run */ + return 1; +} + + +static inline int is_page_fault(uint32_t intr_info) +{ + return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | + INTR_INFO_VALID_MASK)) == + (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK); +} + + +static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, uint32_t access, + uint32_t *error) +{ + void *data = val; + int r = /*X86EMUL_CONTINUE*/ 0; + + while (bytes) { + gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error); + unsigned offset = addr & (PAGESIZE-1); + unsigned toread = min(bytes, (unsigned)PAGESIZE - offset); + int ret; + + if (gpa == UNMAPPED_GVA) { + r = /*X86EMUL_PROPAGATE_FAULT*/1; + goto out; + } + ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); + if (ret < 0) { + r = /*X86EMUL_UNHANDLEABLE*/ 1; + goto out; + } + + bytes -= toread; + data += toread; + addr += toread; + } +out: + return r; +} + +void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, + uint32_t error_code) +{ +#ifdef XXX + ++vcpu->stat.pf_guest; +#endif /*XXX*/ + vcpu->arch.cr2 = addr; + kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); +} + +static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, uint32_t *error) +{ + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error); +} + +static int vmx_get_cpl(struct kvm_vcpu *vcpu) +{ + if (!is_protmode(vcpu)) + return 0; + + if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ + return 3; + + return vmcs_read16(GUEST_CS_SELECTOR) & 3; +} + + +/* used for instruction fetching */ +static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, uint32_t *error) +{ + uint32_t access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? 
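/*
 * Editor's note (illustrative sketch, not part of this patch): the "access"
 * word handed to mmu.gva_to_gpa() is built from page-fault error-code bits
 * so the software walker applies the same permission checks the hardware
 * would.  With the helpers in this file:
 *
 *	instruction fetch at CPL 3: PFERR_USER_MASK | PFERR_FETCH_MASK
 *	ordinary read at CPL 0:     0
 *	kvm_read_guest_virt_system: always 0 (kernel-privileged read,
 *	                            wired up as emulate_ops.read_std below)
 */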
PFERR_USER_MASK : 0; + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, + access | PFERR_FETCH_MASK, error); +} + +static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, + const void *v) +{ +#ifdef XXX + if (vcpu->arch.apic && + !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) + return 0; + + return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); +#else + return 0; +#endif /*XXX*/ +} + +static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) +{ +#ifdef XXX + if (vcpu->arch.apic && + !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) + return 0; + + return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); +#else + return 0; +#endif /*XXX*/ +} + +gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, uint32_t *error) +{ +#ifdef XXX + uint32_t access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); +#else + return UNMAPPED_GVA; +#endif +} + +static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, uint32_t *error) +{ + uint32_t access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, + error); +} + +static int emulator_read_emulated(unsigned long addr, + void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu) +{ + gpa_t gpa; + uint32_t error_code; + + if (vcpu->mmio_read_completed) { + memcpy(val, vcpu->mmio_data, bytes); +#ifdef XXX + trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, + vcpu->mmio_phys_addr, *(uint64_t *)val); +#endif /*XXX*/ + vcpu->mmio_read_completed = 0; + return X86EMUL_CONTINUE; + } + + gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code); + + if (gpa == UNMAPPED_GVA) { + kvm_inject_page_fault(vcpu, addr, error_code); + return X86EMUL_PROPAGATE_FAULT; + } + + /* For APIC access vmexit */ + if ((gpa & PAGEMASK) == APIC_DEFAULT_PHYS_BASE) + goto mmio; + + if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL) + == X86EMUL_CONTINUE) + return X86EMUL_CONTINUE; + +mmio: + /* + * Is this MMIO handled locally? + */ + if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) { +#ifdef XXX + trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(uint64_t *)val); +#endif /*XXX*/ + return X86EMUL_CONTINUE; + } + +#ifdef XXX + trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); +#endif /*XXX*/ + + vcpu->mmio_needed = 1; + vcpu->mmio_phys_addr = gpa; + vcpu->mmio_size = bytes; + vcpu->mmio_is_write = 0; + + return X86EMUL_UNHANDLEABLE; +} + +int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, + const void *val, int bytes) +{ + int ret; + + ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); + if (ret < 0) + return 0; +#ifdef XXX + kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1); +#endif /*XXX*/ + return 1; +} + +gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, uint32_t *error) +{ +#ifdef XXX + uint32_t access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? 
PFERR_USER_MASK : 0; + access |= PFERR_WRITE_MASK; + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); +#else + return UNMAPPED_GVA; +#endif +} + +static int emulator_write_emulated_onepage(unsigned long addr, + const void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu) +{ + gpa_t gpa; + uint32_t error_code; + + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code); + + if (gpa == UNMAPPED_GVA) { + kvm_inject_page_fault(vcpu, addr, error_code); + return X86EMUL_PROPAGATE_FAULT; + } + + /* For APIC access vmexit */ + if ((gpa & PAGEMASK) == APIC_DEFAULT_PHYS_BASE) + goto mmio; + + if (emulator_write_phys(vcpu, gpa, val, bytes)) + return X86EMUL_CONTINUE; + +mmio: +#ifdef XXX + trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(uint64_t *)val); +#endif /*XXX*/ + /* + * Is this MMIO handled locally? + */ + if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) + return X86EMUL_CONTINUE; + + vcpu->mmio_needed = 1; + vcpu->mmio_phys_addr = gpa; + vcpu->mmio_size = bytes; + vcpu->mmio_is_write = 1; + memcpy(vcpu->mmio_data, val, bytes); + + return X86EMUL_CONTINUE; +} + +int emulator_write_emulated(unsigned long addr, + const void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu) +{ + /* Crossing a page boundary? */ + if (((addr + bytes - 1) ^ addr) & PAGEMASK) { + int rc, now; + + now = -addr & ~PAGEMASK; + rc = emulator_write_emulated_onepage(addr, val, now, vcpu); + if (rc != X86EMUL_CONTINUE) + return rc; + addr += now; + val += now; + bytes -= now; + } + return emulator_write_emulated_onepage(addr, val, bytes, vcpu); +} + +static int emulator_cmpxchg_emulated(unsigned long addr, + const void *old, + const void *new, + unsigned int bytes, + struct kvm_vcpu *vcpu) +{ + cmn_err(CE_WARN, "kvm: emulating exchange as write\n"); +#ifndef CONFIG_X86_64 + /* guests cmpxchg8b have to be emulated atomically */ + if (bytes == 8) { + gpa_t gpa; + struct page *page; + char *kaddr; + uint64_t val; + + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); + + if (gpa == UNMAPPED_GVA || + (gpa & PAGEMASK) == APIC_DEFAULT_PHYS_BASE) + goto emul_write; + + if (((gpa + bytes - 1) & PAGEMASK) != (gpa & PAGEMASK)) + goto emul_write; + + val = *(uint64_t *)new; + + page = gfn_to_page(vcpu->kvm, gpa >> PAGESHIFT); + + kaddr = kmap_atomic(page, KM_USER0); + set_64bit((uint64_t *)(kaddr + offset_in_page(gpa)), val); + kunmap_atomic(kaddr, KM_USER0); + kvm_release_page_dirty(page); + } +emul_write: +#endif + + return emulator_write_emulated(addr, new, bytes, vcpu); +} + +static struct x86_emulate_ops emulate_ops = { + .read_std = kvm_read_guest_virt_system, + .fetch = kvm_fetch_guest_virt, + .read_emulated = emulator_read_emulated, + .write_emulated = emulator_write_emulated, + .cmpxchg_emulated = emulator_cmpxchg_emulated, +}; + +static void cache_all_regs(struct kvm_vcpu *vcpu) +{ + kvm_register_read(vcpu, VCPU_REGS_RAX); + kvm_register_read(vcpu, VCPU_REGS_RSP); + kvm_register_read(vcpu, VCPU_REGS_RIP); + vcpu->arch.regs_dirty = ~0; +} + +int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) +{ + gpa_t gpa; + int r; +#ifdef XXX + if (tdp_enabled) + return 0; + + gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); + + mutex_enter(&vcpu->kvm->mmu_lock); + r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGESHIFT); + mutex_exit(&vcpu->kvm->mmu_lock); + return r; +#else + return 0; +#endif /*XXX*/ +} + +int emulate_instruction(struct kvm_vcpu *vcpu, + unsigned long cr2, + uint16_t error_code, + int emulation_type) +{ + int r, shadow_mask; + struct decode_cache *c; + struct kvm_run *run = vcpu->run; + 
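+	/*
+	 * Emulation runs in two steps: x86_decode_insn() fills the decode
+	 * cache, then x86_emulate_insn() executes it against emulate_ops.
+	 * For EMULTYPE_TRAP_UD only VMMCALL, SYSENTER, SYSEXIT and SYSCALL
+	 * are allowed through; anything else returns EMULATE_FAIL.  MMIO
+	 * that cannot be completed in the kernel is reported to userspace
+	 * as KVM_EXIT_MMIO via EMULATE_DO_MMIO.
+	 */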
+#ifdef XXX + kvm_clear_exception_queue(vcpu); +#endif /*XXX*/ + vcpu->arch.mmio_fault_cr2 = cr2; + /* + * TODO: fix emulate.c to use guest_read/write_register + * instead of direct ->regs accesses, can save hundred cycles + * on Intel for instructions that don't read/change RSP, for + * for example. + */ + cache_all_regs(vcpu); + + vcpu->mmio_is_write = 0; + vcpu->arch.pio.string = 0; + + if (!(emulation_type & EMULTYPE_NO_DECODE)) { + int cs_db, cs_l; + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + + vcpu->arch.emulate_ctxt.vcpu = vcpu; + vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu); + vcpu->arch.emulate_ctxt.mode = + (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : + (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) + ? X86EMUL_MODE_VM86 : cs_l + ? X86EMUL_MODE_PROT64 : cs_db + ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; + + r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); + + /* Only allow emulation of specific instructions on #UD + * (namely VMMCALL, sysenter, sysexit, syscall)*/ + c = &vcpu->arch.emulate_ctxt.decode; + if (emulation_type & EMULTYPE_TRAP_UD) { + if (!c->twobyte) + return EMULATE_FAIL; + switch (c->b) { + case 0x01: /* VMMCALL */ + if (c->modrm_mod != 3 || c->modrm_rm != 1) + return EMULATE_FAIL; + break; + case 0x34: /* sysenter */ + case 0x35: /* sysexit */ + if (c->modrm_mod != 0 || c->modrm_rm != 0) + return EMULATE_FAIL; + break; + case 0x05: /* syscall */ + if (c->modrm_mod != 0 || c->modrm_rm != 0) + return EMULATE_FAIL; + break; + default: + return EMULATE_FAIL; + } + + if (!(c->modrm_reg == 0 || c->modrm_reg == 3)) + return EMULATE_FAIL; + } + +#ifdef XXX + ++vcpu->stat.insn_emulation; +#endif /*XXX*/ + if (r) { +#ifdef XXX + ++vcpu->stat.insn_emulation_fail; +#endif /*XXX*/ + if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) + return EMULATE_DONE; + return EMULATE_FAIL; + } + } + + if (emulation_type & EMULTYPE_SKIP) { + kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip); + return EMULATE_DONE; + } + + r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); + shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; + + if (r == 0) + kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); + + if (vcpu->arch.pio.string) + return EMULATE_DO_MMIO; + + if ((r || vcpu->mmio_is_write) && run) { + run->exit_reason = KVM_EXIT_MMIO; + run->mmio.phys_addr = vcpu->mmio_phys_addr; + memcpy(run->mmio.data, vcpu->mmio_data, 8); + run->mmio.len = vcpu->mmio_size; + run->mmio.is_write = vcpu->mmio_is_write; + } + + if (r) { + if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) + return EMULATE_DONE; + if (!vcpu->mmio_needed) { +#ifdef XXX + kvm_report_emulation_failure(vcpu, "mmio"); +#endif /*XXX*/ + return EMULATE_FAIL; + } + return EMULATE_DO_MMIO; + } + + kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + + if (vcpu->mmio_is_write) { + vcpu->mmio_needed = 0; + return EMULATE_DO_MMIO; + } + + return EMULATE_DONE; +} + +/* + * The guest has exited. See if we can fix it or if we need userspace + * assistance. 
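+ *
+ * handle_exception() covers exits caused by exceptions and NMIs:
+ * machine checks and NMIs are treated as already handled, page faults
+ * and debug exceptions are classified by vector, and any unrecognized
+ * vector is reported to userspace as KVM_EXIT_EXCEPTION.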
+ */ +static int handle_exception(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_run *kvm_run = vcpu->run; + uint32_t intr_info, ex_no, error_code; + unsigned long cr2, rip, dr6; + uint32_t vect_info; + enum emulation_result er; + + vect_info = vmx->idt_vectoring_info; + intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + + if (is_machine_check(intr_info)) + return handle_machine_check(vcpu); + + if ((vect_info & VECTORING_INFO_VALID_MASK) && + !is_page_fault(intr_info)) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; + vcpu->run->internal.ndata = 2; + vcpu->run->internal.data[0] = vect_info; + vcpu->run->internal.data[1] = intr_info; + return 0; + } + + if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) + return 1; /* already handled by vmx_vcpu_run() */ + +#ifdef XXX + if (is_no_device(intr_info)) { + vmx_fpu_activate(vcpu); + return 1; + } + + if (is_invalid_opcode(intr_info)) { + er = emulate_instruction(vcpu, 0, 0, EMULTYPE_TRAP_UD); + if (er != EMULATE_DONE) + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } +#endif /*XXX*/ + + error_code = 0; + rip = kvm_rip_read(vcpu); + if (intr_info & INTR_INFO_DELIVER_CODE_MASK) + error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + if (is_page_fault(intr_info)) { + /* EPT won't cause page fault directly */ + if (enable_ept) + cmn_err(CE_PANIC, "page fault with ept enabled\n"); + cr2 = vmcs_readl(EXIT_QUALIFICATION); +#ifdef XXX + trace_kvm_page_fault(cr2, error_code); + + if (kvm_event_needs_reinjection(vcpu)) + kvm_mmu_unprotect_page_virt(vcpu, cr2); + return kvm_mmu_page_fault(vcpu, cr2, error_code); +#else + return -1; +#endif /*XXX*/ + } + +#ifdef XXX + if (vmx->rmode.vm86_active && + handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, + error_code)) { + if (vcpu->arch.halt_request) { + vcpu->arch.halt_request = 0; + return kvm_emulate_halt(vcpu); + } + return 1; + } +#endif /*XXX*/ + + ex_no = intr_info & INTR_INFO_VECTOR_MASK; + switch (ex_no) { + case DB_VECTOR: +#ifdef XXX + dr6 = vmcs_readl(EXIT_QUALIFICATION); + if (!(vcpu->guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { + vcpu->arch.dr6 = dr6 | DR6_FIXED_1; + kvm_queue_exception(vcpu, DB_VECTOR); + return 1; + } + kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1; + kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); + /* fall through */ +#endif /*XXX*/ + case BP_VECTOR: +#ifdef XXX + /* + * Update instruction length as we may reinject #BP from + * user space while in guest debugging mode. Reading it for + * #DB as well causes no harm, it is not used in that case. 
+ */ + vmx->vcpu.arch.event_exit_inst_len = + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + kvm_run->exit_reason = KVM_EXIT_DEBUG; + kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; + kvm_run->debug.arch.exception = ex_no; +#endif /*XXX*/ + break; + default: + kvm_run->exit_reason = KVM_EXIT_EXCEPTION; + kvm_run->ex.exception = ex_no; + kvm_run->ex.error_code = error_code; + break; + } + return 0; +} + +static int handle_external_interrupt(struct kvm_vcpu *vcpu) +{ +#ifdef XXX + ++vcpu->stat.irq_exits; +#endif /*XXX*/ + return 1; +} + +static int handle_triple_fault(struct kvm_vcpu *vcpu) +{ + vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; + return 0; +} + +static int handle_io(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification; + int size, in, string; + unsigned port; + +#ifdef XXX + ++vcpu->stat.io_exits; +#endif /*XXX*/ + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + string = (exit_qualification & 16) != 0; + + if (string) { + if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO) + return 0; + return 1; + } + + size = (exit_qualification & 7) + 1; + in = (exit_qualification & 8) != 0; + port = exit_qualification >> 16; +#ifdef XXX + skip_emulated_instruction(vcpu); + return kvm_emulate_pio(vcpu, in, size, port); +#endif /*XXX*/ +} + +static int handle_nmi_window(struct kvm_vcpu *vcpu) +{ + uint32_t cpu_based_vm_exec_control; + + /* clear pending NMI */ + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); +#ifdef XXX + ++vcpu->stat.nmi_window_exits; +#endif /*XXX*/ + + return 1; +} + +static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + enum emulation_result err = EMULATE_DONE; + int ret = 1; + +#ifdef XXX + while (!guest_state_valid(vcpu)) { + err = emulate_instruction(vcpu, 0, 0, 0); + + if (err == EMULATE_DO_MMIO) { + ret = 0; + goto out; + } + + if (err != EMULATE_DONE) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + ret = 0; + goto out; + } + if (signal_pending(current)) + goto out; + if (need_resched()) + schedule(); + } +#endif /*XXX*/ + + vmx->emulation_required = 0; +out: + return ret; +} + +void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + cr0 |= X86_CR0_ET; + +#ifdef CONFIG_X86_64 + if (cr0 & 0xffffffff00000000UL) { +#ifdef XXX + kvm_inject_gp(vcpu, 0); +#endif + return; + } +#endif + + cr0 &= ~CR0_RESERVED_BITS; + + if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { +#ifdef XXX + kvm_inject_gp(vcpu, 0); +#endif + return; + } + + if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { +#ifdef XXX + kvm_inject_gp(vcpu, 0); +#endif + return; + } + + if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { +#ifdef CONFIG_X86_64 +#ifdef XXX + if ((vcpu->arch.efer & EFER_LME)) { + int cs_db, cs_l; + + if (!is_pae(vcpu)) { + kvm_inject_gp(vcpu, 0); + return; + } + + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + if (cs_l) { + kvm_inject_gp(vcpu, 0); + return; + + } + } else +#endif /*XXX*/ +#endif +#ifdef XXX + if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { + kvm_inject_gp(vcpu, 0); + return; + } +#endif /*XXX*/ + + } + + kvm_x86_ops->set_cr0(vcpu, cr0); + vcpu->arch.cr0 = cr0; +#ifdef XXX + kvm_mmu_reset_context(vcpu); +#endif /*XXX*/ + return; +} + +static inline int constant_test_bit(int nr, const void *addr) +{ + const uint32_t *p = (const 
uint32_t *)addr; + return ((1UL << (nr & 31)) & (p[nr >> 5])) != 0; +} +static inline int variable_test_bit(int nr, const void *addr) +{ + uint8_t v; + const uint32_t *p = (const uint32_t *)addr; + + asm("btl %2,%1; setc %0" : "=qm" (v) : "m" (*p), "Ir" (nr)); + return v; +} + +#define test_bit(nr,addr) \ +(__builtin_constant_p(nr) ? \ + constant_test_bit((nr),(addr)) : \ + variable_test_bit((nr),(addr))) + +static int pdptrs_changed(struct kvm_vcpu *vcpu) +{ + uint64_t pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; + int changed = 1; + int r; + + if (is_long_mode(vcpu) || !is_pae(vcpu)) + return 0; + + if (!test_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_avail)) + return 1; + + r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); + if (r < 0) + goto out; + changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; +out: + + return changed; +} + +void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +{ + if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { +#ifdef XXX + kvm_mmu_sync_roots(vcpu); + kvm_mmu_flush_tlb(vcpu); +#endif /*XXX*/ + return; + } + + if (is_long_mode(vcpu)) { + if (cr3 & CR3_L_MODE_RESERVED_BITS) { +#ifdef XXX + kvm_inject_gp(vcpu, 0); +#endif /*XXX*/ + return; + } + } else { +#ifdef XXX + if (is_pae(vcpu)) { + if (cr3 & CR3_PAE_RESERVED_BITS) { + kvm_inject_gp(vcpu, 0); + return; + } + if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { + kvm_inject_gp(vcpu, 0); + return; + } + } +#endif /*XXX*/ + /* + * We don't check reserved bits in nonpae mode, because + * this isn't enforced, and VMware depends on this. + */ + } + + /* + * Does the new cr3 value map to physical memory? (Note, we + * catch an invalid cr3 even in real-mode, because it would + * cause trouble later on when we turn on paging anyway.) + * + * A real CPU would silently accept an invalid cr3 and would + * attempt to use it - with largely undefined (and often hard + * to debug) behavior on the guest side. 
+ */ +#ifdef XXX + if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGESHIFT))) + kvm_inject_gp(vcpu, 0); + else { +#endif /*XXX*/ + vcpu->arch.cr3 = cr3; +#ifdef XXX + vcpu->arch.mmu.new_cr3(vcpu); + } +#endif /*XXX*/ +} + +void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + unsigned long old_cr4 = kvm_read_cr4(vcpu); + unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; + + if (cr4 & CR4_RESERVED_BITS) { +#ifdef XXX + kvm_inject_gp(vcpu, 0); +#endif /*XXX*/ + return; + } + + if (is_long_mode(vcpu)) { + if (!(cr4 & X86_CR4_PAE)) { +#ifdef XXX + kvm_inject_gp(vcpu, 0); +#endif /*XXX*/ + return; + } +#ifdef XXX + } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) + && ((cr4 ^ old_cr4) & pdptr_bits) + && !load_pdptrs(vcpu, vcpu->arch.cr3)) { + kvm_inject_gp(vcpu, 0); + return; +#endif /*XXX*/ + } + + if (cr4 & X86_CR4_VMXE) { +#ifdef XXX + kvm_inject_gp(vcpu, 0); +#endif /*XXX*/ + return; + } + kvm_x86_ops->set_cr4(vcpu, cr4); + vcpu->arch.cr4 = cr4; + vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled; + kvm_mmu_reset_context(vcpu); +} + +static int handle_cr(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification, val; + int cr; + int reg; + + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + cr = exit_qualification & 15; + reg = (exit_qualification >> 8) & 15; + switch ((exit_qualification >> 4) & 3) { + case 0: /* mov to cr */ + val = kvm_register_read(vcpu, reg); +#ifdef XXX + trace_kvm_cr_write(cr, val); +#endif /*XXX*/ + switch (cr) { + case 0: + kvm_set_cr0(vcpu, val); +#ifdef XXX + skip_emulated_instruction(vcpu); +#endif /*XXX*/ + return 1; + case 3: + kvm_set_cr3(vcpu, val); +#ifdef XXX + skip_emulated_instruction(vcpu); +#endif /*XXX*/ + return 1; + case 4: + kvm_set_cr4(vcpu, val); +#ifdef XXX + skip_emulated_instruction(vcpu); +#endif /*XXX*/ + return 1; + case 8: { + uint8_t cr8_prev = kvm_get_cr8(vcpu); + uint8_t cr8 = kvm_register_read(vcpu, reg); + kvm_set_cr8(vcpu, cr8); +#ifdef XXX + skip_emulated_instruction(vcpu); +#endif /*XXX*/ + if (irqchip_in_kernel(vcpu->kvm)) + return 1; + if (cr8_prev <= cr8) + return 1; + vcpu->run->exit_reason = KVM_EXIT_SET_TPR; + return 0; + } + }; + break; + case 2: /* clts */ + vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); +#ifdef XXX + trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); + skip_emulated_instruction(vcpu); + vmx_fpu_activate(vcpu); +#endif /*XXX*/ + return 1; + case 1: /*mov from cr*/ + switch (cr) { + case 3: + kvm_register_write(vcpu, reg, vcpu->arch.cr3); +#ifdef XXX + trace_kvm_cr_read(cr, vcpu->arch.cr3); + skip_emulated_instruction(vcpu); +#endif /*XXX*/ + return 1; + case 8: + val = kvm_get_cr8(vcpu); + kvm_register_write(vcpu, reg, val); +#ifdef XXX + trace_kvm_cr_read(cr, val); + skip_emulated_instruction(vcpu); +#endif /*XXX*/ + return 1; + } + break; + case 3: /* lmsw */ + val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; +#ifdef XXX + trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val); + kvm_lmsw(vcpu, val); + + skip_emulated_instruction(vcpu); +#endif /*XXX*/ + return 1; + default: + break; + } + vcpu->run->exit_reason = 0; + cmn_err(CE_WARN, "unhandled control register: op %d cr %d\n", + (int)(exit_qualification >> 4) & 3, cr); + return 0; +} + +static int handle_dr(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification; + unsigned long val; + int dr, reg; + +#ifdef XXX + /* Do not handle if the CPL > 0, will trigger GP on re-entry */ + if (!kvm_require_cpl(vcpu, 0)) + return 1; + dr = vmcs_readl(GUEST_DR7); + + if (dr & DR7_GD) 
{ + /* + * As the vm-exit takes precedence over the debug trap, we + * need to emulate the latter, either for the host or the + * guest debugging itself. + */ + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { + vcpu->run->debug.arch.dr6 = vcpu->arch.dr6; + vcpu->run->debug.arch.dr7 = dr; + vcpu->run->debug.arch.pc = + vmcs_readl(GUEST_CS_BASE) + + vmcs_readl(GUEST_RIP); + vcpu->run->debug.arch.exception = DB_VECTOR; + vcpu->run->exit_reason = KVM_EXIT_DEBUG; + return 0; + } else { + vcpu->arch.dr7 &= ~DR7_GD; + vcpu->arch.dr6 |= DR6_BD; + vmcs_writel(GUEST_DR7, vcpu->arch.dr7); + kvm_queue_exception(vcpu, DB_VECTOR); + return 1; + } + } +#endif /*XXX*/ + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + dr = exit_qualification & DEBUG_REG_ACCESS_NUM; + reg = DEBUG_REG_ACCESS_REG(exit_qualification); + if (exit_qualification & TYPE_MOV_FROM_DR) { + switch (dr) { + case 0 ... 3: + val = vcpu->arch.db[dr]; + break; + case 4: +#ifdef XXX + if (check_dr_alias(vcpu) < 0) +#endif /*XXX*/ + return 1; + /* fall through */ + case 6: + val = vcpu->arch.dr6; + break; + case 5: +#ifdef XXX + if (check_dr_alias(vcpu) < 0) +#endif /*XXX*/ + return 1; + /* fall through */ + default: /* 7 */ + val = vcpu->arch.dr7; + break; + } + kvm_register_write(vcpu, reg, val); + } else { + val = vcpu->arch.regs[reg]; + switch (dr) { + case 0 ... 3: + vcpu->arch.db[dr] = val; +#ifdef XXX + if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) +#endif + vcpu->arch.eff_db[dr] = val; + break; + case 4: +#ifdef XXX + if (check_dr_alias(vcpu) < 0) +#endif /*XXX*/ + return 1; + /* fall through */ + case 6: + if (val & 0xffffffff00000000ULL) { + kvm_inject_gp(vcpu, 0); + return 1; + } + vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; + break; + case 5: +#ifdef XXX + if (check_dr_alias(vcpu) < 0) +#endif /*XXX*/ + return 1; + /* fall through */ + default: /* 7 */ + if (val & 0xffffffff00000000ULL) { + kvm_inject_gp(vcpu, 0); + return 1; + } + vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; +#ifdef XXX + if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { +#endif /*XXX*/ + vmcs_writel(GUEST_DR7, vcpu->arch.dr7); + vcpu->arch.switch_db_regs = + (val & DR7_BP_EN_MASK); +#ifdef XXX + } +#endif /*XXX*/ + break; + } + } +#ifdef XXX + skip_emulated_instruction(vcpu); +#endif /*XXX*/ + return 1; +} + +static int handle_cpuid(struct kvm_vcpu *vcpu) +{ +#ifdef XXX + kvm_emulate_cpuid(vcpu); +#endif /*XXX*/ + return 1; +} + +static int handle_rdmsr(struct kvm_vcpu *vcpu) +{ + uint32_t ecx = vcpu->arch.regs[VCPU_REGS_RCX]; + uint64_t data; + + if (vmx_get_msr(vcpu, ecx, &data)) { +#ifdef XXX + trace_kvm_msr_read_ex(ecx); +#endif /*XXX*/ + kvm_inject_gp(vcpu, 0); + return 1; + } + +#ifdef XXX + trace_kvm_msr_read(ecx, data); +#endif /*XXX*/ + + /* FIXME: handling of bits 32:63 of rax, rdx */ + vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u; + vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u; +#ifdef XXX + skip_emulated_instruction(vcpu); +#endif /*XXX*/ + return 1; +} + +static int handle_wrmsr(struct kvm_vcpu *vcpu) +{ + uint32_t ecx = vcpu->arch.regs[VCPU_REGS_RCX]; + uint64_t data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) + | ((uint64_t)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); + + if (vmx_set_msr(vcpu, ecx, data) != 0) { +#ifdef XXX + trace_kvm_msr_write_ex(ecx, data); +#endif /*XXX*/ + kvm_inject_gp(vcpu, 0); + return 1; + } + +#ifdef XXX + trace_kvm_msr_write(ecx, data); + skip_emulated_instruction(vcpu); +#endif /*XXX*/ + return 1; +} + +static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) +{ + return 1; 
+} + +static int kvm_hv_hypercall_enabled(struct kvm *kvm) +{ + return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE; +} + +int kvm_hv_hypercall(struct kvm_vcpu *vcpu) +{ + uint64_t param, ingpa, outgpa, ret; + uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0; + int fast, longmode; + int cs_db, cs_l; + + /* + * hypercall generates UD from non zero cpl and real mode + * per HYPER-V spec + */ + if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 0; + } + + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + longmode = is_long_mode(vcpu) && cs_l == 1; + + if (!longmode) { + param = ((uint64_t)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff); + ingpa = ((uint64_t)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff); + outgpa = ((uint64_t)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff); + } +#ifdef CONFIG_X86_64 + else { + param = kvm_register_read(vcpu, VCPU_REGS_RCX); + ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX); + outgpa = kvm_register_read(vcpu, VCPU_REGS_R8); + } +#endif + + code = param & 0xffff; + fast = (param >> 16) & 0x1; + rep_cnt = (param >> 32) & 0xfff; + rep_idx = (param >> 48) & 0xfff; + +#ifdef XXX + trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa); +#endif /*XXX*/ + + switch (code) { + case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT: +#ifdef XXX + kvm_vcpu_on_spin(vcpu); +#endif /*XXX*/ + break; + default: + res = HV_STATUS_INVALID_HYPERCALL_CODE; + break; + } + + ret = res | (((uint64_t)rep_done & 0xfff) << 32); + if (longmode) { + kvm_register_write(vcpu, VCPU_REGS_RAX, ret); + } else { + kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32); + kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff); + } + + return 1; +} + + +/* Return values for hypercalls */ +#define KVM_ENOSYS 1000 +#define KVM_EFAULT EFAULT +#define KVM_E2BIG E2BIG +#define KVM_EPERM EPERM + +#define KVM_HC_VAPIC_POLL_IRQ 1 +#define KVM_HC_MMU_OP 2 + +/* + * hypercalls use architecture specific + */ + +#ifdef _KERNEL +#ifdef CONFIG_KVM_GUEST +void __init kvm_guest_init(void); +#else +#define kvm_guest_init() do { } while (0) +#endif + +static inline int kvm_para_has_feature(unsigned int feature) +{ + if (kvm_arch_para_features() & (1UL << feature)) + return 1; + return 0; +} +#endif /* _KERNEL */ + +int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) +{ + unsigned long nr, a0, a1, a2, a3, ret; + int r = 1; + + if (kvm_hv_hypercall_enabled(vcpu->kvm)) + return kvm_hv_hypercall(vcpu); + + nr = kvm_register_read(vcpu, VCPU_REGS_RAX); + a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); + a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); + a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); + a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); + +#ifdef XXX + trace_kvm_hypercall(nr, a0, a1, a2, a3); +#endif /*XXX*/ + + if (!is_long_mode(vcpu)) { + nr &= 0xFFFFFFFF; + a0 &= 0xFFFFFFFF; + a1 &= 0xFFFFFFFF; + a2 &= 0xFFFFFFFF; + a3 &= 0xFFFFFFFF; + } + + if (kvm_x86_ops->get_cpl(vcpu) != 0) { + ret = -EPERM; + goto out; + } + + switch (nr) { + case KVM_HC_VAPIC_POLL_IRQ: + ret = 0; + break; + case KVM_HC_MMU_OP: +#ifdef XXX + r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret); +#endif /*XXX*/ + break; + default: + ret = -ENOSYS; + break; + } +out: + kvm_register_write(vcpu, VCPU_REGS_RAX, ret); +#ifdef XXX + ++vcpu->stat.hypercalls; +#endif /*XXX*/ + return r; +} + +static int 
handle_halt(struct kvm_vcpu *vcpu) +{ +#ifdef XXX + skip_emulated_instruction(vcpu); + return kvm_emulate_halt(vcpu); +#else + return 0; +#endif /*XXX*/ +} + +static int handle_vmcall(struct kvm_vcpu *vcpu) +{ +#ifdef XXX + skip_emulated_instruction(vcpu); +#endif /*XXX*/ + kvm_emulate_hypercall(vcpu); + return 1; +} + +static int handle_vmx_insn(struct kvm_vcpu *vcpu) +{ +#ifdef XXX + kvm_queue_exception(vcpu, UD_VECTOR); +#endif /*XXX*/ + return 1; +} + +static int handle_invlpg(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + +#ifdef XXX + kvm_mmu_invlpg(vcpu, exit_qualification); + skip_emulated_instruction(vcpu); +#endif /*XXX*/ + return 1; +} + +static int handle_wbinvd(struct kvm_vcpu *vcpu) +{ +#ifdef XXX + skip_emulated_instruction(vcpu); +#endif /*XXX*/ + /* TODO: Add support for VT-d/pass-through device */ + return 1; +} + +static int handle_apic_access(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification; + enum emulation_result er; + unsigned long offset; + + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + offset = exit_qualification & 0xffful; + + er = emulate_instruction(vcpu, 0, 0, 0); + + if (er != EMULATE_DONE) { + cmn_err(CE_PANIC, + "Fail to handle apic access vmexit! Offset is 0x%lx\n", + offset); + } + return 1; +} + +static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg) +{ + return (seg != VCPU_SREG_LDTR) && + (seg != VCPU_SREG_TR) && + (kvm_get_rflags(vcpu) & X86_EFLAGS_VM); +} + +static inline unsigned long get_desc_limit(const struct desc_struct *desc) +{ + return desc->c.b.limit0 | (desc->c.b.limit << 16); +} + +static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, uint16_t selector, + struct kvm_segment *kvm_desct) +{ + kvm_desct->base = get_desc_base(seg_desc); + kvm_desct->limit = get_desc_limit(seg_desc); + if (seg_desc->c.b.g) { + kvm_desct->limit <<= 12; + kvm_desct->limit |= 0xfff; + } + kvm_desct->selector = selector; + kvm_desct->type = seg_desc->c.b.type; + kvm_desct->present = seg_desc->c.b.p; + kvm_desct->dpl = seg_desc->c.b.dpl; + kvm_desct->db = seg_desc->c.b.d; + kvm_desct->s = seg_desc->c.b.s; + kvm_desct->l = seg_desc->c.b.l; + kvm_desct->g = seg_desc->c.b.g; + kvm_desct->avl = seg_desc->c.b.avl; + if (!selector) + kvm_desct->unusable = 1; + else + kvm_desct->unusable = 0; + kvm_desct->padding = 0; +} + +static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, uint16_t selector, int seg) +{ + struct kvm_segment segvar = { + .base = selector << 4, + .limit = 0xffff, + .selector = selector, + .type = 3, + .present = 1, + .dpl = 3, + .db = 0, + .s = 1, + .l = 0, + .g = 0, + .avl = 0, + .unusable = 0, + }; + kvm_x86_ops->set_segment(vcpu, &segvar, seg); + return 0; +} + +static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu, + uint16_t selector, + struct descriptor_table *dtable) +{ + if (selector & 1 << 2) { + struct kvm_segment kvm_seg; + + kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR); + + if (kvm_seg.unusable) + dtable->limit = 0; + else + dtable->limit = kvm_seg.limit; + dtable->base = kvm_seg.base; + } + else + kvm_x86_ops->get_gdt(vcpu, dtable); +} + +/* allowed just for 8 bytes segments */ +static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, uint16_t selector, + struct desc_struct *seg_desc) +{ + struct descriptor_table dtable; + uint16_t index = selector >> 3; + int ret; + uint32_t err; + gva_t addr; + + get_segment_descriptor_dtable(vcpu, selector, &dtable); + + if (dtable.limit < index * 8 + 7) { + kvm_queue_exception_e(vcpu, 
GP_VECTOR, selector & 0xfffc); + return 1; + } + addr = dtable.base + index * 8; + ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof(*seg_desc), + vcpu, &err); + if (ret == 1) + kvm_inject_page_fault(vcpu, addr, err); + + return ret; +} + +static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, uint32_t *error) +{ + void *data = val; + int r = 0; + +#ifdef XXX + while (bytes) { + gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error); + unsigned offset = addr & (PAGESIZE-1); + unsigned towrite = min(bytes, (unsigned)PAGESIZE - offset); + int ret; + + if (gpa == UNMAPPED_GVA) { + r = X86EMUL_PROPAGATE_FAULT; + goto out; + } + ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); + if (ret < 0) { + r = X86EMUL_UNHANDLEABLE; + goto out; + } + + bytes -= towrite; + data += towrite; + addr += towrite; + } +out: +#endif /*XXX*/ + return r; +} + +/* allowed just for 8 bytes segments */ +static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, uint16_t selector, + struct desc_struct *seg_desc) +{ + struct descriptor_table dtable; + uint16_t index = selector >> 3; + + get_segment_descriptor_dtable(vcpu, selector, &dtable); + + if (dtable.limit < index * 8 + 7) + return 1; + return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu, NULL); +} + +int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, uint16_t selector, int seg) +{ + struct kvm_segment kvm_seg; + struct desc_struct seg_desc; + uint8_t dpl, rpl, cpl; + unsigned err_vec = GP_VECTOR; + uint32_t err_code = 0; + int null_selector = !(selector & ~0x3); /* 0000-0003 are null */ + int ret; + + if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu)) + return kvm_load_realmode_segment(vcpu, selector, seg); + + /* NULL selector is not valid for TR, CS and SS */ + if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR) + && null_selector) + goto exception; + + /* TR should be in GDT only */ + if (seg == VCPU_SREG_TR && (selector & (1 << 2))) + goto exception; + + ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc); + if (ret) + return ret; + + seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg); + + if (null_selector) { /* for NULL selector skip all following checks */ + kvm_seg.unusable = 1; + goto load; + } + + err_code = selector & 0xfffc; + err_vec = GP_VECTOR; + + /* can't load system descriptor into segment selecor */ + if (seg <= VCPU_SREG_GS && !kvm_seg.s) + goto exception; + + if (!kvm_seg.present) { + err_vec = (seg == VCPU_SREG_SS) ? 
SS_VECTOR : NP_VECTOR; + goto exception; + } + + rpl = selector & 3; + dpl = kvm_seg.dpl; + cpl = kvm_x86_ops->get_cpl(vcpu); + + switch (seg) { + case VCPU_SREG_SS: + /* + * segment is not a writable data segment or segment + * selector's RPL != CPL or segment selector's RPL != CPL + */ + if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl) + goto exception; + break; + case VCPU_SREG_CS: + if (!(kvm_seg.type & 8)) + goto exception; + + if (kvm_seg.type & 4) { + /* conforming */ + if (dpl > cpl) + goto exception; + } else { + /* nonconforming */ + if (rpl > cpl || dpl != cpl) + goto exception; + } + /* CS(RPL) <- CPL */ + selector = (selector & 0xfffc) | cpl; + break; + case VCPU_SREG_TR: + if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9)) + goto exception; + break; + case VCPU_SREG_LDTR: + if (kvm_seg.s || kvm_seg.type != 2) + goto exception; + break; + default: /* DS, ES, FS, or GS */ + /* + * segment is not a data or readable code segment or + * ((segment is a data or nonconforming code segment) + * and (both RPL and CPL > DPL)) + */ + if ((kvm_seg.type & 0xa) == 0x8 || + (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl))) + goto exception; + break; + } + + if (!kvm_seg.unusable && kvm_seg.s) { + /* mark segment as accessed */ + kvm_seg.type |= 1; + seg_desc.c.b.type |= 1; + save_guest_segment_descriptor(vcpu, selector, &seg_desc); + } +load: + kvm_set_segment(vcpu, &kvm_seg, seg); + return 0; +exception: +#ifdef XXX + kvm_queue_exception_e(vcpu, err_vec, err_code); +#endif /*XXX*/ + return 1; +} + +static void save_state_to_tss32(struct kvm_vcpu *vcpu, + struct tss_segment_32 *tss) +{ + tss->cr3 = vcpu->arch.cr3; + tss->eip = kvm_rip_read(vcpu); + tss->eflags = kvm_get_rflags(vcpu); + tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX); + tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); + tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX); + tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX); + tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP); + tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP); + tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI); + tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI); + tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); + tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); + tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); + tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); + tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS); + tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS); + tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); +} + +static void kvm_load_segment_selector(struct kvm_vcpu *vcpu, uint16_t sel, int seg) +{ + struct kvm_segment kvm_seg; + kvm_get_segment(vcpu, &kvm_seg, seg); + kvm_seg.selector = sel; + kvm_set_segment(vcpu, &kvm_seg, seg); +} + +static int load_state_from_tss32(struct kvm_vcpu *vcpu, + struct tss_segment_32 *tss) +{ + kvm_set_cr3(vcpu, tss->cr3); + + kvm_rip_write(vcpu, tss->eip); + kvm_set_rflags(vcpu, tss->eflags | 2); + + kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax); + kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx); + kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx); + kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx); + kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp); + kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp); + kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi); + kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi); + + /* + * SDM says that segment selectors are loaded before segment + * descriptors + */ + kvm_load_segment_selector(vcpu, 
tss->ldt_selector, VCPU_SREG_LDTR); + kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES); + kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS); + kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS); + kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS); + kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS); + kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS); + + /* + * Now load segment descriptors. If fault happenes at this stage + * it is handled in a context of new task + */ + if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, VCPU_SREG_LDTR)) + return 1; + + if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES)) + return 1; + + if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS)) + return 1; + + if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS)) + return 1; + + if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS)) + return 1; + + if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS)) + return 1; + + if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS)) + return 1; + return 0; +} + +static void save_state_to_tss16(struct kvm_vcpu *vcpu, + struct tss_segment_16 *tss) +{ + tss->ip = kvm_rip_read(vcpu); + tss->flag = kvm_get_rflags(vcpu); + tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX); + tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX); + tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX); + tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX); + tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP); + tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP); + tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI); + tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI); + + tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); + tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); + tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); + tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); + tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR); +} + +static int load_state_from_tss16(struct kvm_vcpu *vcpu, + struct tss_segment_16 *tss) +{ + kvm_rip_write(vcpu, tss->ip); + kvm_set_rflags(vcpu, tss->flag | 2); + kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax); + kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx); + kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx); + kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx); + kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp); + kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp); + kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si); + kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di); + + /* + * SDM says that segment selectors are loaded before segment + * descriptors + */ + kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR); + kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES); + kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS); + kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS); + kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS); + + /* + * Now load segment descriptors. 
If fault happenes at this stage + * it is handled in a context of new task + */ + if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR)) + return 1; + + if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES)) + return 1; + + if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS)) + return 1; + + if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS)) + return 1; + + if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS)) + return 1; + return 0; +} + +int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) +{ + gfn_t gfn = gpa >> PAGESHIFT; + int seg; + int offset = offset_in_page(gpa); + int ret; + + while ((seg = next_segment(len, offset)) != 0) { + ret = kvm_read_guest_page(kvm, gfn, data, offset, seg); + if (ret < 0) + return ret; + offset = 0; + len -= seg; + data += seg; + ++gfn; + } + return 0; +} + +static int kvm_task_switch_16(struct kvm_vcpu *vcpu, uint16_t tss_selector, + uint16_t old_tss_sel, uint32_t old_tss_base, + struct desc_struct *nseg_desc) +{ + struct tss_segment_16 tss_segment_16; + int ret = 0; + + if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16, + sizeof tss_segment_16)) + goto out; + + save_state_to_tss16(vcpu, &tss_segment_16); + + if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16, + sizeof tss_segment_16)) + goto out; + +#ifdef XXX + if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc), + &tss_segment_16, sizeof tss_segment_16)) + goto out; +#endif /*XXX*/ + + if (old_tss_sel != 0xffff) { + tss_segment_16.prev_task_link = old_tss_sel; +#ifdef XXX + if (kvm_write_guest(vcpu->kvm, + get_tss_base_addr_write(vcpu, nseg_desc), + &tss_segment_16.prev_task_link, + sizeof tss_segment_16.prev_task_link)) + goto out; +#endif /*XXX*/ + } + + if (load_state_from_tss16(vcpu, &tss_segment_16)) + goto out; + + ret = 1; +out: + return ret; +} + +static int kvm_task_switch_32(struct kvm_vcpu *vcpu, uint16_t tss_selector, + uint16_t old_tss_sel, uint32_t old_tss_base, + struct desc_struct *nseg_desc) +{ + struct tss_segment_32 tss_segment_32; + int ret = 0; + + if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32, + sizeof tss_segment_32)) + goto out; + + save_state_to_tss32(vcpu, &tss_segment_32); + + if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32, + sizeof tss_segment_32)) + goto out; + +#ifdef XXX + if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc), + &tss_segment_32, sizeof tss_segment_32)) + goto out; +#endif /*XXX*/ + + if (old_tss_sel != 0xffff) { + tss_segment_32.prev_task_link = old_tss_sel; + +#ifdef XXX + if (kvm_write_guest(vcpu->kvm, + get_tss_base_addr_write(vcpu, nseg_desc), + &tss_segment_32.prev_task_link, + sizeof tss_segment_32.prev_task_link)) + goto out; +#endif /*XXX*/ + } + + if (load_state_from_tss32(vcpu, &tss_segment_32)) + goto out; + + ret = 1; +out: + return ret; +} + +static uint64_t vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) +{ + struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + + return vmcs_readl(sf->base); +} + +static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) +{ + return kvm_x86_ops->get_segment_base(vcpu, seg); +} + +int kvm_task_switch(struct kvm_vcpu *vcpu, uint16_t tss_selector, int reason) +{ + struct kvm_segment tr_seg; + struct desc_struct cseg_desc; + struct desc_struct nseg_desc; + int ret = 0; + uint32_t old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR); + uint16_t old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR); + uint32_t desc_limit; + +#ifdef 
XXX + old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL); +#endif /*XXX*/ + + /* FIXME: Handle errors. Failure to read either TSS or their + * descriptors should generate a pagefault. + */ + if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc)) + goto out; + + if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc)) + goto out; + + if (reason != TASK_SWITCH_IRET) { + int cpl; + + cpl = kvm_x86_ops->get_cpl(vcpu); + if ((tss_selector & 3) > nseg_desc.c.b.dpl || cpl > nseg_desc.c.b.dpl) { +#ifdef XXX + kvm_queue_exception_e(vcpu, GP_VECTOR, 0); +#endif /*XXX*/ + return 1; + } + } + + desc_limit = get_desc_limit(&nseg_desc); + if (!nseg_desc.c.b.p || + ((desc_limit < 0x67 && (nseg_desc.c.b.type & 8)) || + desc_limit < 0x2b)) { +#ifdef XXX + kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc); +#endif /*XXX*/ + return 1; + } + + if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { + cseg_desc.c.b.type &= ~(1 << 1); //clear the B flag + save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc); + } + + if (reason == TASK_SWITCH_IRET) { + uint32_t eflags = kvm_get_rflags(vcpu); + kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT); + } + + /* set back link to prev task only if NT bit is set in eflags + note that old_tss_sel is not used afetr this point */ + if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) + old_tss_sel = 0xffff; + + if (nseg_desc.c.b.type & 8) + ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel, + old_tss_base, &nseg_desc); + else + ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel, + old_tss_base, &nseg_desc); + + if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { + uint32_t eflags = kvm_get_rflags(vcpu); + kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT); + } + + if (reason != TASK_SWITCH_IRET) { + nseg_desc.c.b.type |= (1 << 1); + save_guest_segment_descriptor(vcpu, tss_selector, + &nseg_desc); + } + + kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0(vcpu) | X86_CR0_TS); + seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg); + tr_seg.type = 11; + kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); +out: + return ret; +} + +static int handle_task_switch(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long exit_qualification; + uint16_t tss_selector; + int reason, type, idt_v; + + idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); + type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); + + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + + reason = (uint32_t)exit_qualification >> 30; + if (reason == TASK_SWITCH_GATE && idt_v) { + switch (type) { + case INTR_TYPE_NMI_INTR: + vcpu->arch.nmi_injected = 0; +#ifdef XXX + if (cpu_has_virtual_nmis()) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); +#endif + break; + case INTR_TYPE_EXT_INTR: + case INTR_TYPE_SOFT_INTR: +#ifdef XXX + kvm_clear_interrupt_queue(vcpu); +#endif /*XXX*/ + break; + case INTR_TYPE_HARD_EXCEPTION: + case INTR_TYPE_SOFT_EXCEPTION: +#ifdef XXX + kvm_clear_exception_queue(vcpu); +#endif /*XXX*/ + break; + default: + break; + } + } + tss_selector = exit_qualification; +#ifdef XXX + if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION && + type != INTR_TYPE_EXT_INTR && + type != INTR_TYPE_NMI_INTR)) + skip_emulated_instruction(vcpu); +#endif /*XXX*/ + + if (!kvm_task_switch(vcpu, tss_selector, reason)) + return 0; + + /* clear all local breakpoint enable flags */ + vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55); + + /* + * TODO: What about debug traps on tss 
switch? + * Are we supposed to inject them and update dr6? + */ + + return 1; +} + +static int handle_ept_violation(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification; + gpa_t gpa; + int gla_validity; + + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + + if (exit_qualification & (1 << 6)) { + cmn_err(CE_PANIC, "EPT: GPA exceeds GAW!\n"); + } + + gla_validity = (exit_qualification >> 7) & 0x3; + if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) { + cmn_err(CE_WARN, "EPT: Handling EPT violation failed!\n"); + cmn_err(CE_CONT, "EPT: GPA: 0x%lx, GVA: 0x%lx\n", + (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS), + vmcs_readl(GUEST_LINEAR_ADDRESS)); + cmn_err(CE_PANIC, "EPT: Exit qualification is 0x%lx\n", + (long unsigned int)exit_qualification); + vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; + vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION; + return 0; + } + + gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); +#ifdef XXX + trace_kvm_page_fault(gpa, exit_qualification); + return kvm_mmu_page_fault(vcpu, gpa & PAGEMASK, 0); +#else + return 0; +#endif +} + +static int handle_ept_misconfig(struct kvm_vcpu *vcpu) +{ + uint64_t sptes[4]; + int nr_sptes, i; + gpa_t gpa; + + gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); + + cmn_err(CE_WARN, "EPT: Misconfiguration.\n"); + cmn_err(CE_CONT, "EPT: GPA: 0x%llx\n", gpa); +#ifdef XXX + nr_sptes = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes); + + for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i) + ept_misconfig_inspect_spte(vcpu, sptes[i-1], i); +#endif /*XXX*/ + + vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; + vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG; + + return 0; +} + +/* + * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE + * exiting, so only get here on cpu with PAUSE-Loop-Exiting. + */ +static int handle_pause(struct kvm_vcpu *vcpu) +{ +#ifdef XXX + skip_emulated_instruction(vcpu); + kvm_vcpu_on_spin(vcpu); +#endif /*XXX*/ + + return 1; +} + +static int handle_invalid_op(struct kvm_vcpu *vcpu) +{ +#ifdef XXX + kvm_queue_exception(vcpu, UD_VECTOR); +#endif /*XXX*/ + return 1; +} + +static int handle_interrupt_window(struct kvm_vcpu *vcpu) +{ + uint32_t cpu_based_vm_exec_control; + + /* clear pending irq */ + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + +#ifdef XXX + ++vcpu->stat.irq_window_exits; + + /* + * If the user space waits to inject interrupts, exit as soon as + * possible + */ + if (!irqchip_in_kernel(vcpu->kvm) && + vcpu->run->request_interrupt_window && + !kvm_cpu_has_interrupt(vcpu)) { + vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; + return 0; + } +#endif /*XXX*/ + return 1; +} + +/* + * The exit handlers return 1 if the exit was handled fully and guest execution + * may resume. Otherwise they set the kvm_run parameter to indicate what needs + * to be done to userspace and return 0. 
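+ *
+ * The table below is indexed by the basic exit reason read from the
+ * VMCS; vmx_handle_exit() dispatches through it and reports
+ * KVM_EXIT_UNKNOWN for any reason that has no handler.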
+ */ +static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { + [EXIT_REASON_EXCEPTION_NMI] = handle_exception, + [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, + [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, + [EXIT_REASON_NMI_WINDOW] = handle_nmi_window, + [EXIT_REASON_IO_INSTRUCTION] = handle_io, + [EXIT_REASON_CR_ACCESS] = handle_cr, + [EXIT_REASON_DR_ACCESS] = handle_dr, + [EXIT_REASON_CPUID] = handle_cpuid, + [EXIT_REASON_MSR_READ] = handle_rdmsr, + [EXIT_REASON_MSR_WRITE] = handle_wrmsr, + [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, + [EXIT_REASON_HLT] = handle_halt, + [EXIT_REASON_INVLPG] = handle_invlpg, + [EXIT_REASON_VMCALL] = handle_vmcall, + [EXIT_REASON_VMCLEAR] = handle_vmx_insn, + [EXIT_REASON_VMLAUNCH] = handle_vmx_insn, + [EXIT_REASON_VMPTRLD] = handle_vmx_insn, + [EXIT_REASON_VMPTRST] = handle_vmx_insn, + [EXIT_REASON_VMREAD] = handle_vmx_insn, + [EXIT_REASON_VMRESUME] = handle_vmx_insn, + [EXIT_REASON_VMWRITE] = handle_vmx_insn, + [EXIT_REASON_VMOFF] = handle_vmx_insn, + [EXIT_REASON_VMON] = handle_vmx_insn, + [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, + [EXIT_REASON_APIC_ACCESS] = handle_apic_access, + [EXIT_REASON_WBINVD] = handle_wbinvd, + [EXIT_REASON_TASK_SWITCH] = handle_task_switch, + [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, + [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, + [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, + [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, + [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op, + [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op, +}; + +static const int kvm_vmx_max_exit_handlers = + ARRAY_SIZE(kvm_vmx_exit_handlers); + +/* + * The guest has exited. See if we can fix it or if we need userspace + * assistance. + */ + +static int vmx_handle_exit(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + uint32_t exit_reason = vmx->exit_reason; + uint32_t vectoring_info = vmx->idt_vectoring_info; + + /* If guest state is invalid, start emulating */ + if (vmx->emulation_required && emulate_invalid_guest_state) + return handle_invalid_guest_state(vcpu); + + /* Access CR3 don't cause VMExit in paging mode, so we need + * to sync with guest real CR3. */ + if (enable_ept && is_paging(vcpu)) + vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); + + if (vmx->fail) { + vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; + vcpu->run->fail_entry.hardware_entry_failure_reason + = vmcs_read32(VM_INSTRUCTION_ERROR); + return 0; + } + + if ((vectoring_info & VECTORING_INFO_VALID_MASK) && + (exit_reason != EXIT_REASON_EXCEPTION_NMI && + exit_reason != EXIT_REASON_EPT_VIOLATION && + exit_reason != EXIT_REASON_TASK_SWITCH)) + cmn_err(CE_WARN, "%s: unexpected, valid vectoring info " + "(0x%x) and exit reason is 0x%x\n", + __func__, vectoring_info, exit_reason); + +#ifdef XXX + if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) { + if (vmx_interrupt_allowed(vcpu)) { + vmx->soft_vnmi_blocked = 0; + } else if (vmx->vnmi_blocked_time > 1000000000LL && + vcpu->arch.nmi_pending) { + /* + * This CPU don't support us in finding the end of an + * NMI-blocked window if the guest runs with IRQs + * disabled. So we pull the trigger after 1 s of + * futile waiting, but inform the user about this. 
+ */ + cmn_err(CE_WARN, "%s: Breaking out of NMI-blocked " + "state on VCPU %d after 1 s timeout\n", + __func__, vcpu->vcpu_id); + vmx->soft_vnmi_blocked = 0; + } + } +#endif /*XXX*/ + + if (exit_reason < kvm_vmx_max_exit_handlers + && kvm_vmx_exit_handlers[exit_reason]) + return kvm_vmx_exit_handlers[exit_reason](vcpu); + else { + vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; + vcpu->run->hw.hardware_exit_reason = exit_reason; + } + return 0; +} + +static inline void kvm_guest_exit(void) +{ +#ifdef XXX + account_system_vtime(current); + current->flags &= ~PF_VCPU; +#endif /*XXX*/ +} + +static inline void kvm_guest_enter(void) +{ +#ifdef XXX + account_system_vtime(current); + current->flags |= PF_VCPU; +#endif /*XXX*/ +} + +int mmu_topup_memory_caches(struct kvm_vcpu *vcpu); + +int kvm_mmu_load(struct kvm_vcpu *vcpu) +{ + int r; + + r = mmu_topup_memory_caches(vcpu); + if (r) + goto out; + mutex_enter(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu); + r = mmu_alloc_roots(vcpu); + mmu_sync_roots(vcpu); + mutex_exit(&vcpu->kvm->mmu_lock); + if (r) + goto out; + /* set_cr3() should ensure TLB has been flushed */ + kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); +out: + return r; +} + +static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.mmu.root_hpa != INVALID_PAGE) + return 0; + + return kvm_mmu_load(vcpu); +} + +static int vcpu_enter_guest(struct kvm_vcpu *vcpu) +{ + int r; + + int req_int_win = !irqchip_in_kernel(vcpu->kvm) && + vcpu->run->request_interrupt_window; + + if (vcpu->requests) + if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) + kvm_mmu_unload(vcpu); + + r = kvm_mmu_reload(vcpu); + if (r) + goto out; + if (vcpu->requests) { + if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) + __kvm_migrate_timers(vcpu); + if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests)) + kvm_write_guest_time(vcpu); + if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) + kvm_mmu_sync_roots(vcpu); + if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) + kvm_x86_ops->tlb_flush(vcpu); + if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, + &vcpu->requests)) { + vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; + r = 0; + goto out; + } + if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { + vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; + r = 0; + goto out; + } + if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) { + vcpu->fpu_active = 0; + kvm_x86_ops->fpu_deactivate(vcpu); + } + } + + kpreempt_disable(); + + kvm_x86_ops->prepare_guest_switch(vcpu); +#ifdef XXX + if (vcpu->fpu_active) + kvm_load_guest_fpu(vcpu); +#endif /*XXX*/ + kpreempt_disable(); + + BT_CLEAR(&vcpu->requests, KVM_REQ_KICK); +#ifdef XXX + smp_mb__after_clear_bit(); +#endif /*XXX*/ + + if (vcpu->requests /*XXX || need_resched() || signal_pending(current)*/) { + BT_SET(&vcpu->requests, KVM_REQ_KICK); + kpreempt_enable(); + r = 1; + goto out; + } +#ifdef XXX + inject_pending_event(vcpu); + + /* enable NMI/IRQ window open exits if needed */ + if (vcpu->arch.nmi_pending) + kvm_x86_ops->enable_nmi_window(vcpu); + else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) + kvm_x86_ops->enable_irq_window(vcpu); + + if (kvm_lapic_enabled(vcpu)) { + update_cr8_intercept(vcpu); +#ifdef XXX + kvm_lapic_sync_to_vapic(vcpu); +#endif /*XXX*/ + } + + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); +#endif /*XXX*/ + kvm_guest_enter(); + +#ifdef XXX + if (unlikely(vcpu->arch.switch_db_regs)) { + set_debugreg(0, 7); + set_debugreg(vcpu->arch.eff_db[0], 0); + 
set_debugreg(vcpu->arch.eff_db[1], 1); + set_debugreg(vcpu->arch.eff_db[2], 2); + set_debugreg(vcpu->arch.eff_db[3], 3); + } + + trace_kvm_entry(vcpu->vcpu_id); +#endif /*XXX*/ + kvm_x86_ops->run(vcpu); +#ifdef XXX + /* + * If the guest has used debug registers, at least dr7 + * will be disabled while returning to the host. + * If we don't have active breakpoints in the host, we don't + * care about the messed up debug address registers. But if + * we have some of them active, restore the old state. + */ + if (hw_breakpoint_active()) + hw_breakpoint_restore(); +#endif /*XXX*/ + BT_SET(&vcpu->requests, KVM_REQ_KICK); + +#ifdef XXX + ++vcpu->stat.exits; +#endif /*XXX*/ + kvm_guest_exit(); + + kpreempt_enable(); +#ifdef XXX + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + + /* + * Profile KVM exit RIPs: + */ + if (unlikely(prof_on == KVM_PROFILING)) { + unsigned long rip = kvm_rip_read(vcpu); + profile_hit(KVM_PROFILING, (void *)rip); + } + + kvm_lapic_sync_from_vapic(vcpu); +#endif /*XXX*/ + r = kvm_x86_ops->handle_exit(vcpu); +out: + return r; +} + + +static void post_kvm_run_save(struct kvm_vcpu *vcpu) +{ + struct kvm_run *kvm_run = vcpu->run; + + kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0; + kvm_run->cr8 = kvm_get_cr8(vcpu); + kvm_run->apic_base = kvm_get_apic_base(vcpu); + if (irqchip_in_kernel(vcpu->kvm)) + kvm_run->ready_for_interrupt_injection = 1; +#ifdef XXX + else + kvm_run->ready_for_interrupt_injection = + kvm_arch_interrupt_allowed(vcpu) && + !kvm_cpu_has_interrupt(vcpu) && + !kvm_event_needs_reinjection(vcpu); +#endif /*XXX*/ +} + +/* + * The vCPU has executed a HLT instruction with in-kernel mode enabled. + */ +void kvm_vcpu_block(struct kvm_vcpu *vcpu) +{ +#ifdef XXX + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); + + if (kvm_arch_vcpu_runnable(vcpu)) { + set_bit(KVM_REQ_UNHALT, &vcpu->requests); + break; + } + if (kvm_cpu_has_pending_timer(vcpu)) + break; + if (signal_pending(current)) + break; + + schedule(); + } + + finish_wait(&vcpu->wq, &wait); +#endif /*XXX*/ +} + +static void vapic_enter(struct kvm_vcpu *vcpu) +{ +#ifdef XXX + struct kvm_lapic *apic = vcpu->arch.apic; + struct page *page; + + if (!apic || !apic->vapic_addr) + return; + + page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGESHIFT); + + vcpu->arch.apic->vapic_page = page; +#endif /*XXX*/ +} + +extern int kvm_apic_id(struct kvm_lapic *apic); + +static void vapic_exit(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + int idx; +#ifdef XXX + if (!apic || !apic->vapic_addr) +#endif /*XXX*/ + return; +#ifdef XXX + idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_release_page_dirty(apic->vapic_page); + mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGESHIFT); + srcu_read_unlock(&vcpu->kvm->srcu, idx); +#endif /*XXX*/ +} + +void kvm_lapic_reset(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic; + int i; + + ASSERT(vcpu); + apic = vcpu->arch.apic; + ASSERT(apic != NULL); + +#ifdef XXX + /* Stop the timer in case it's a reset to an active apic */ + hrtimer_cancel(&apic->lapic_timer.timer); +#endif /*XXX*/ + + apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24); + kvm_apic_set_version(apic->vcpu); + + for (i = 0; i < APIC_LVT_NUM; i++) + apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); + apic_set_reg(apic, APIC_LVT0, + SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); + + apic_set_reg(apic, APIC_DFR, 0xffffffffU); + apic_set_reg(apic, APIC_SPIV, 0xff); + apic_set_reg(apic, APIC_TASKPRI, 0); + apic_set_reg(apic, APIC_LDR, 
0); + apic_set_reg(apic, APIC_ESR, 0); + apic_set_reg(apic, APIC_ICR, 0); + apic_set_reg(apic, APIC_ICR2, 0); + apic_set_reg(apic, APIC_TDCR, 0); + apic_set_reg(apic, APIC_TMICT, 0); + for (i = 0; i < 8; i++) { + apic_set_reg(apic, APIC_IRR + 0x10 * i, 0); + apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); + apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); + } + apic->irr_pending = 0; +#ifdef XXX + update_divide_count(apic); + atomic_set(&apic->lapic_timer.pending, 0); + if (kvm_vcpu_is_bsp(vcpu)) + vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; + apic_update_ppr(apic); +#endif /*XXX*/ + + vcpu->arch.apic_arb_prio = 0; + + cmn_err(CE_NOTE, "%s: vcpu=%p, id=%d, base_msr= 0x%016 PRIx64 base_address=0x%0lx.\n", + __func__, vcpu, kvm_apic_id(apic), vcpu->arch.apic_base, apic->base_address); +} + +static int __vcpu_run(struct kvm_vcpu *vcpu) +{ + int r; + struct kvm *kvm = vcpu->kvm; + + if (vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED) { + cmn_err(CE_NOTE, "vcpu %d received sipi with vector # %x\n", + vcpu->vcpu_id, vcpu->arch.sipi_vector); + kvm_lapic_reset(vcpu); + r = kvm_arch_vcpu_reset(vcpu); + if (r) + return r; + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + } + +#ifdef XXX + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); +#endif /*XXX*/ + vapic_enter(vcpu); + + r = 1; + while (r > 0) { + if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) + r = vcpu_enter_guest(vcpu); + else { +#ifdef XXX + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); +#endif /*XXX*/ + kvm_vcpu_block(vcpu); +#ifdef XXX + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); +#endif /*XXX*/ + /* + * XXX - the following should use a bitset_t + * and do bitset_atomic_test_and_del(). + * but I am lazy, and will get to it later + */ + if (BT_TEST(&vcpu->requests, KVM_REQ_UNHALT)) + { + BT_CLEAR(&vcpu->requests, KVM_REQ_UNHALT); + switch(vcpu->arch.mp_state) { + case KVM_MP_STATE_HALTED: + vcpu->arch.mp_state = + KVM_MP_STATE_RUNNABLE; + case KVM_MP_STATE_RUNNABLE: + break; + case KVM_MP_STATE_SIPI_RECEIVED: + default: + r = -EINTR; + break; + } + } + } + + if (r <= 0) + break; + +#ifdef XXX + clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); + if (kvm_cpu_has_pending_timer(vcpu)) + kvm_inject_pending_timer_irqs(vcpu); + if (dm_request_for_irq_injection(vcpu)) { + r = -EINTR; + vcpu->run->exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.request_irq_exits; + } + + if (signal_pending(current)) { + r = -EINTR; + vcpu->run->exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.signal_exits; + } + if (need_resched()) { + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); + kvm_resched(vcpu); + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); + } +#endif /*XXX*/ + } +#ifdef XXX + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); +#endif /*XXX*/ + post_kvm_run_save(vcpu); + vapic_exit(vcpu); + return r; +} + + +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + int r; + sigset_t sigsaved; + + vcpu_load(vcpu); + + if (vcpu->sigset_active) + sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); + + if (vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED) { + kvm_vcpu_block(vcpu); + BT_CLEAR(&vcpu->requests, KVM_REQ_UNHALT); + r = -EAGAIN; + goto out; + } + + /* re-sync apic's tpr */ + if (!irqchip_in_kernel(vcpu->kvm)) + kvm_set_cr8(vcpu, kvm_run->cr8); + + + if (vcpu->arch.pio.cur_count) { +#ifdef XXX + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + r = complete_pio(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); +#endif /*XXX*/ + if (r) + goto out; + } + if (vcpu->mmio_needed) { + memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); + 
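+		/*
+		 * Editorial note: userspace has serviced the MMIO read exit;
+		 * the eight data bytes it returned in kvm_run->mmio.data were
+		 * just staged into vcpu->mmio_data so that the
+		 * emulate_instruction() call below (currently compiled out
+		 * under XXX) can complete the faulting instruction once
+		 * emulation is re-enabled.
+		 */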
vcpu->mmio_read_completed = 1; + vcpu->mmio_needed = 0; +#ifdef XXX + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0, + EMULTYPE_NO_DECODE); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + if (r == EMULATE_DO_MMIO) { + /* + * Read-modify-write. Back to userspace. + */ + r = 0; + goto out; + } +#endif /*XXX*/ + } + if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) + kvm_register_write(vcpu, VCPU_REGS_RAX, + kvm_run->hypercall.ret); + + r = __vcpu_run(vcpu); + +out: + if (vcpu->sigset_active) + sigprocmask(SIG_SETMASK, &sigsaved, NULL); + + vcpu_put(vcpu); + return r; +} + static int kvm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred_p, int *rval_p) { - int rval = EINVAL; + int rval = DDI_SUCCESS; + volatile int x; /* XXX - dtrace was not getting fbt return probe */ switch(cmd) { case KVM_GET_API_VERSION: cmn_err(CE_NOTE, "kvm_ioctl: KVM_GET_API_VERSION"); - if (arg != NULL) - return (rval); + if (arg != NULL) { + rval = EINVAL; + break; + } *rval_p = KVM_API_VERSION; - cmn_err(CE_NOTE, "kvm_ioctl: set rval_p to %d\n", *rval_p); - rval = DDI_SUCCESS; break; case KVM_CREATE_VM: - if (arg == NULL) - return (rval); + if (arg == NULL) { + rval = EINVAL; + break; + } rval = kvm_dev_ioctl_create_vm(arg, mode); - return (rval); + break; + case KVM_RUN: { + struct kvm_run_ioc kvm_run_ioc; + struct kvm *kvmp; + struct kvm_vcpu *vcpu; + + if (!arg) { + rval = EINVAL; + break; + } + + if (ddi_copyin((caddr_t)arg, &kvm_run_ioc, sizeof kvm_run_ioc, mode)) { + rval = EFAULT; + break; + } + + kvmp = find_kvm_id(kvm_run_ioc.kvm_kvmid); + if (kvmp == NULL) { + rval = EINVAL; + break; + } + if (!kvmp || kvm_run_ioc.kvm_cpu_index >= kvmp->online_vcpus) { + rval = EINVAL; + break; + } + vcpu = kvmp->vcpus[kvm_run_ioc.kvm_cpu_index]; + + rval = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); + break; + } case KVM_CHECK_EXTENSION: rval = kvm_dev_ioctl_check_extension_generic(arg, rval_p); - if (rval != DDI_SUCCESS) - return (rval); break; + case KVM_GET_MSRS: { + struct kvm_msrs_ioc kvm_msrs_ioc; + struct kvm_msrs kvm_msrs; + struct kvm *kvmp; + struct kvm_vcpu *vcpu; + struct kvm_msr_entry *entries; + unsigned size; + int n; + + if (ddi_copyin((const void *)arg, &kvm_msrs_ioc, + sizeof(kvm_msrs_ioc), mode) != 0) { + rval = EFAULT; + break; + } + kvmp = find_kvm_id(kvm_msrs_ioc.kvm_kvmid); + rval = EINVAL; + if (kvmp == NULL) + break; + if (!kvmp || kvm_msrs_ioc.kvm_cpu_index >= kvmp->online_vcpus) + break; + + vcpu = kvmp->vcpus[kvm_msrs_ioc.kvm_cpu_index]; + + if (ddi_copyin(kvm_msrs_ioc.kvm_msrs, &kvm_msrs, sizeof(kvm_msrs), mode)) { + rval = EFAULT; + break; + } + + if (kvm_msrs.nmsrs >= MAX_IO_MSRS) { + rval = E2BIG; + break; + } + + size = sizeof(struct kvm_msr_entry) * kvm_msrs.nmsrs; + entries = (struct kvm_msr_entry *) kmem_alloc(size, KM_SLEEP); + if (!entries) { + rval = ENOMEM; + break; + } + + if (ddi_copyin((caddr_t)(((uint64_t)kvm_msrs_ioc.kvm_msrs)+(sizeof (struct kvm_msrs))), entries, size, mode)) { + kmem_free(entries, size); + rval = EFAULT; + break; + } + + rval = n = __msr_io(vcpu, &kvm_msrs, entries, kvm_get_msr); + + if (rval < 0) { + kmem_free(entries, size); + rval = EINVAL; + break; + } + + rval = ddi_copyout(entries, (caddr_t)(((uint64_t)kvm_msrs_ioc.kvm_msrs)+(sizeof (struct kvm_msrs))), size, mode); + kmem_free(entries, size); + + *rval_p = n; + + break; + } + + case KVM_SET_MSRS: { + struct kvm_msrs_ioc kvm_msrs_ioc; + struct kvm_msrs kvm_msrs; + struct kvm *kvmp; + struct kvm_vcpu *vcpu; + struct 
kvm_msr_entry *entries; + unsigned size; + int n; + + if (ddi_copyin((const void *)arg, &kvm_msrs_ioc, + sizeof(kvm_msrs_ioc), mode) != 0) { + rval = EFAULT; + break; + } + + rval = EINVAL; + kvmp = find_kvm_id(kvm_msrs_ioc.kvm_kvmid); + if (kvmp == NULL) + break; + if (!kvmp || kvm_msrs_ioc.kvm_cpu_index >= kvmp->online_vcpus) + break; + + vcpu = kvmp->vcpus[kvm_msrs_ioc.kvm_cpu_index]; + + if (ddi_copyin(kvm_msrs_ioc.kvm_msrs, &kvm_msrs, sizeof(kvm_msrs), mode)) { + rval = EFAULT; + break; + } + + if (kvm_msrs.nmsrs >= MAX_IO_MSRS) { + rval = E2BIG; + break; + } + + size = sizeof(struct kvm_msr_entry) * kvm_msrs.nmsrs; + entries = (struct kvm_msr_entry *)kmem_alloc(size, KM_SLEEP); + if (!entries) { + rval = ENOMEM; + break; + } + + if (ddi_copyin((caddr_t)(((uint64_t)kvm_msrs_ioc.kvm_msrs)+(sizeof (struct kvm_msrs))), entries, size, mode)) { + kmem_free(entries, size); + rval = EFAULT; + break; + } + + rval = n = __msr_io(vcpu, &kvm_msrs, entries, do_set_msr); + + if (rval < 0) { + kmem_free(entries, size); + rval = EINVAL; + break; + } + kmem_free(entries, size); + *rval_p = n; + break; + } + case KVM_CREATE_VCPU: { struct kvm_vcpu_ioc kvm_vcpu; struct kvm *kvmp; if (ddi_copyin((const void *)arg, &kvm_vcpu, - sizeof(kvm_vcpu), mode) != 0) - return (EFAULT); + sizeof(kvm_vcpu), mode) != 0) { + rval = EFAULT; + break; + } + rval = EINVAL; kvmp = find_kvm_id(kvm_vcpu.kvmid); if (kvmp == NULL) - return(EINVAL); + break; rval = kvm_vm_ioctl_create_vcpu(kvmp, kvm_vcpu.id, &kvm_vcpu, rval_p); - if (rval != 0) - return (rval); + if (rval != 0) { + rval = EINVAL; + break; + } + if (ddi_copyout(&kvm_vcpu, (void *)arg, sizeof(kvm_vcpu), mode) != 0) - return EFAULT; + rval = EFAULT; break; } @@ -2602,54 +8781,261 @@ kvm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred_p, int *rval_ struct kvm *kvmp; if (ddi_copyin((const void *)arg, &kvmioc, - sizeof(kvmioc), mode) != 0) - return (EFAULT); + sizeof(kvmioc), mode) != 0) { + rval = EFAULT; + break; + } kvmp = find_kvm_id(kvmioc.kvmid); - if (kvmp == NULL) - return(EINVAL); + if (kvmp == NULL) { + rval = EINVAL; + break; + } rval = kvm_vm_ioctl_set_memory_region(kvmp, &kvmioc.kvm_userspace_map, 1); - if (rval != 0) - return (rval); + if (rval != 0) { + rval = EINVAL; + break; + } break; } case KVM_GET_SUPPORTED_CPUID: { struct kvm_cpuid2 *cpuid_arg = (struct kvm_cpuid2 *)arg; struct kvm_cpuid2 cpuid; - if (ddi_copyin(cpuid_arg, &cpuid, sizeof (cpuid), mode)) - return (EFAULT); + if (ddi_copyin(cpuid_arg, &cpuid, sizeof (cpuid), mode)) { + rval = EFAULT; + break; + } rval = kvm_dev_ioctl_get_supported_cpuid(&cpuid, cpuid_arg->entries, mode); if (rval) - return (rval); + break; if (ddi_copyout(&cpuid, cpuid_arg, sizeof (cpuid), mode)) - return (EFAULT); + rval = EFAULT; + break; + } + + case KVM_GET_MSR_INDEX_LIST: { + struct kvm_msr_list *user_msr_list = (struct kvm_msr_list *)arg; + struct kvm_msr_list msr_list; + unsigned n; + + if (ddi_copyin(user_msr_list, &msr_list, sizeof msr_list, mode)) { + rval = EFAULT; + break; + } + + n = msr_list.nmsrs; + msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); + if (ddi_copyout(&msr_list, user_msr_list, sizeof msr_list, mode)) { + rval = EFAULT; + break; + } + if (n < msr_list.nmsrs) { + rval = E2BIG; + break; + } + rval = EFAULT; + if (ddi_copyout(&msrs_to_save, user_msr_list->indices, + num_msrs_to_save * sizeof(uint32_t), mode)) + break; + if (ddi_copyout(&emulated_msrs, + user_msr_list->indices + num_msrs_to_save, + ARRAY_SIZE(emulated_msrs) * sizeof(uint32_t), mode)) + break; + 
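+		/*
+		 * Editorial note: both the saved and emulated MSR index
+		 * arrays were copied out to user_msr_list->indices above,
+		 * so the EFAULT default set earlier is replaced with success.
+		 */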
rval = 0; + *rval_p = 0; + break; + } + case KVM_GET_REGS: { + struct kvm_regs_ioc kvm_regs_ioc; + struct kvm *kvmp; + struct kvm_vcpu *vcpu; + + if (ddi_copyin((caddr_t)arg, &kvm_regs_ioc, sizeof (kvm_regs_ioc), mode)) { + rval = EFAULT; + break; + } + + kvmp = find_kvm_id(kvm_regs_ioc.kvm_kvmid); + + if (!kvmp || kvm_regs_ioc.kvm_cpu_index >= kvmp->online_vcpus) { + rval = EINVAL; + break; + } + + vcpu = kvmp->vcpus[kvm_regs_ioc.kvm_cpu_index]; + + rval = kvm_arch_vcpu_ioctl_get_regs(vcpu, &kvm_regs_ioc.kvm_regs); + if (rval) { + rval = EINVAL; + break; + } + if (ddi_copyout(&kvm_regs_ioc, (caddr_t)arg, sizeof(kvm_regs_ioc), mode)) + rval = EFAULT; + *rval_p = 0; + break; + } + case KVM_SET_REGS: { + struct kvm_regs_ioc kvm_regs_ioc; + struct kvm *kvmp; + struct kvm_vcpu *vcpu; + + if (ddi_copyin((caddr_t)arg, &kvm_regs_ioc, sizeof (kvm_regs_ioc), mode)) { + rval = EFAULT; + break; + } + + kvmp = find_kvm_id(kvm_regs_ioc.kvm_kvmid); + if (!kvmp || kvm_regs_ioc.kvm_cpu_index >= kvmp->online_vcpus) { + rval = EINVAL; + break; + } + + vcpu = kvmp->vcpus[kvm_regs_ioc.kvm_cpu_index]; + + cmn_err(CE_NOTE, "KVM_SET_REGS: rax = %lx, rbx = %lx, rcx = %lx, rdx = %lx\n", + kvm_regs_ioc.kvm_regs.rax, kvm_regs_ioc.kvm_regs.rbx, kvm_regs_ioc.kvm_regs.rcx, kvm_regs_ioc.kvm_regs.rdx); + + rval = kvm_arch_vcpu_ioctl_set_regs(vcpu, &kvm_regs_ioc.kvm_regs); + if (rval) + rval = EINVAL; + *rval_p = 0; + break; + } + case KVM_GET_FPU: { + struct kvm_fpu_ioc kvm_fpu_ioc; + struct kvm *kvmp; + struct kvm_vcpu *vcpu; + + if (ddi_copyin((caddr_t)arg, &kvm_fpu_ioc, sizeof(kvm_fpu_ioc), mode)) { + rval = EFAULT; + break; + } + + kvmp = find_kvm_id(kvm_fpu_ioc.kvm_kvmid); + if (!kvmp || kvm_fpu_ioc.kvm_cpu_index >= kvmp->online_vcpus) { + rval = EINVAL; + break; + } + + vcpu = kvmp->vcpus[kvm_fpu_ioc.kvm_cpu_index]; + + rval = kvm_arch_vcpu_ioctl_get_fpu(vcpu, &kvm_fpu_ioc.fpu); + if (rval) { + rval = EINVAL; + break; + } + + if (ddi_copyout(&kvm_fpu_ioc, (caddr_t)arg, sizeof(struct kvm_fpu), mode)) + rval = EFAULT; + + *rval_p = 0; + break; + } + case KVM_SET_FPU: { + struct kvm_fpu_ioc kvm_fpu_ioc; + struct kvm *kvmp; + struct kvm_vcpu *vcpu; + + if (ddi_copyin((caddr_t)arg, &kvm_fpu_ioc, sizeof(kvm_fpu_ioc), mode)) { + rval = EFAULT; + break; + } + + kvmp = find_kvm_id(kvm_fpu_ioc.kvm_kvmid); + if (!kvmp || kvm_fpu_ioc.kvm_cpu_index >= kvmp->online_vcpus) { + rval = EINVAL; + break; + } + + vcpu = kvmp->vcpus[kvm_fpu_ioc.kvm_cpu_index]; + + rval = kvm_arch_vcpu_ioctl_set_fpu(vcpu, &kvm_fpu_ioc.fpu); + if (rval) + rval = EINVAL; + *rval_p = 0; break; } + case KVM_GET_SREGS: { + struct kvm_sregs_ioc kvm_sregs_ioc; + struct kvm *kvmp; + struct kvm_vcpu *vcpu; + if (ddi_copyin((caddr_t)arg, &kvm_sregs_ioc, sizeof (kvm_sregs_ioc), mode)) { + rval = EFAULT; + break; + } + + kvmp = find_kvm_id(kvm_sregs_ioc.kvm_kvmid); + if (!kvmp || kvm_sregs_ioc.kvm_cpu_index >= kvmp->online_vcpus) { + rval = EINVAL; + break; + } + + vcpu = kvmp->vcpus[kvm_sregs_ioc.kvm_cpu_index]; + + rval = kvm_arch_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs_ioc.sregs); + if (rval) { + rval = EINVAL; + break; + } + if (ddi_copyout(&kvm_sregs_ioc, (caddr_t)arg, sizeof(kvm_sregs_ioc), mode)) + rval = EFAULT; + *rval_p = 0; + break; + } + case KVM_SET_SREGS: { + struct kvm_sregs_ioc kvm_sregs_ioc; + struct kvm *kvmp; + struct kvm_vcpu *vcpu; + + if (ddi_copyin((caddr_t)arg, &kvm_sregs_ioc, sizeof (kvm_sregs_ioc), mode)) { + rval = EFAULT; + break; + } + + kvmp = find_kvm_id(kvm_sregs_ioc.kvm_kvmid); + if (!kvmp || kvm_sregs_ioc.kvm_cpu_index >= 
kvmp->online_vcpus) { + rval = EINVAL; + break; + } + + vcpu = kvmp->vcpus[kvm_sregs_ioc.kvm_cpu_index]; + + rval = kvm_arch_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs_ioc.sregs); + if (rval) + rval = EINVAL; + *rval_p = 0; + break; + } case KVM_SET_CPUID2: { struct kvm_cpuid2_ioc cpuid_ioc; struct kvm_cpuid2 cpuid_data; struct kvm_vcpu *vcpu; - rval = EFAULT; - if (ddi_copyin((const char *)arg, &cpuid_ioc, sizeof cpuid_ioc, mode)) - return (EFAULT); - if (cpuid_ioc.kvm_vcpu_addr == NULL) - return (EINVAL); + if (ddi_copyin((const char *)arg, &cpuid_ioc, sizeof cpuid_ioc, mode)) { + rval = EFAULT; + break; + } + if (cpuid_ioc.kvm_vcpu_addr == NULL) { + rval = EINVAL; + break; + } vcpu = (struct kvm_vcpu *)(cpuid_ioc.kvm_vcpu_addr); if (ddi_copyin((const char *)(cpuid_ioc.cpuid_data), (char *)&cpuid_data, - sizeof(cpuid_data), mode)) - return (EFAULT); + sizeof(cpuid_data), mode)) { + rval = EFAULT; + break; + } rval = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid_data, cpuid_data.entries, mode); if (rval) - return (rval); + rval = EINVAL; break; } @@ -2658,33 +9044,41 @@ kvm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred_p, int *rval_ struct kvm_cpuid2 cpuid_data; struct kvm_vcpu *vcpu; - rval = EFAULT; - if (ddi_copyin((const char *)arg, &cpuid_ioc, sizeof cpuid_ioc, mode)) - return (EFAULT); + if (ddi_copyin((const char *)arg, &cpuid_ioc, sizeof cpuid_ioc, mode)) { + rval = EFAULT; + break; + } - if (cpuid_ioc.kvm_vcpu_addr == NULL) - return (EINVAL); + if (cpuid_ioc.kvm_vcpu_addr == NULL) { + rval = EINVAL; + break; + } vcpu = (struct kvm_vcpu *)cpuid_ioc.kvm_vcpu_addr; if (ddi_copyin((const char *)(cpuid_ioc.cpuid_data), (char *)&cpuid_data, - sizeof(cpuid_data), mode)) - return (EFAULT); + sizeof(cpuid_data), mode)) { + rval = EFAULT; + break; + } rval = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid_data, cpuid_data.entries, mode); - if (rval) - return (rval); - rval = EFAULT; + if (rval) { + rval = EINVAL; + break; + } + if (ddi_copyout(&cpuid_ioc, (char *)arg, sizeof cpuid_ioc, mode)) - return (EFAULT); - rval = 0; + rval = EFAULT; break; } case KVM_GET_VCPU_MMAP_SIZE: - if (arg != NULL) - return (rval); + if (arg != NULL) { + rval = EINVAL; + break; + } *rval_p = ptob(1); break; case KVM_SET_TSS_ADDR: @@ -2692,22 +9086,27 @@ kvm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred_p, int *rval_ struct kvm_tss kvm_tss; struct kvm *kvmp; if (ddi_copyin((const void *)arg, &kvm_tss, - sizeof(kvm_tss), mode) != 0) - return (EFAULT); + sizeof(kvm_tss), mode) != 0) { + rval = EFAULT; + break; + } kvmp = find_kvm_id(kvm_tss.kvmid); - if (kvmp == NULL) - return(EINVAL); + if (kvmp == NULL) { + rval = EINVAL; + break; + } rval = kvm_vm_ioctl_set_tss_addr(kvmp, kvm_tss.addr); - if (rval != DDI_SUCCESS) - return (rval); + break; } default: - return (rval); /* x64, others may do other things... */ + rval = EINVAL; /* x64, others may do other things... */ } + + x = 10; /*XXX do something...*/ if (*rval_p == -1) return (EINVAL); - return (DDI_SUCCESS); + return (rval); } static int @@ -5,6 +5,7 @@ #include <sys/ddi.h> #include <sys/sunddi.h> +#include "kvm_types.h" #include <sys/bitmap.h> #define KVM_API_VERSION 12 /* same as linux (for qemu compatability...) */ @@ -13,6 +14,8 @@ #define offsetof(s, m) ((size_t)(&((s *)0)->m)) #endif +#define offset_in_page(p) ((unsigned long)(p) & ~PAGEMASK) + /* borrowed liberally from linux... 
*/ #define MAX_IO_MSRS 256 @@ -30,6 +33,8 @@ #define KVM_MAX_VCPUS 64 +#define MCG_CTL_P (1ULL<<8) /* MCG_CTL register available */ + #define KVM_MAX_MCE_BANKS 32 #define KVM_MCE_CAP_SUPPORTED MCG_CTL_P #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ @@ -154,12 +159,6 @@ #define KVM_NR_PAGE_SIZES 3 /* XXX assumes x86 */ -enum kvm_bus { - KVM_MMIO_BUS, - KVM_PIO_BUS, - KVM_NR_BUSES -}; - struct kvm_vcpu_data { char vcpu_vhpt[VHPT_SIZE]; char vcpu_vtlb[VTLB_SIZE]; @@ -175,105 +174,32 @@ struct kvm_vm_data { }; /* - * Address types: - * - * gva - guest virtual address - * gpa - guest physical address - * gfn - guest frame number - * hva - host virtual address - * hpa - host physical address - * hfn - host frame number - */ - -typedef unsigned long gva_t; -typedef uint64_t gpa_t; -typedef unsigned long gfn_t; - -typedef unsigned long hva_t; -typedef uint64_t hpa_t; -typedef unsigned long hfn_t; - -/* - * kvm_mmu_page_role, below, is defined as: - * - * bits 0:3 - total guest paging levels (2-4, or zero for real mode) - * bits 4:7 - page table level for this shadow (1-4) - * bits 8:9 - page table quadrant for 2-level guests - * bit 16 - direct mapping of virtual to physical mapping at gfn - * used for real mode and two-dimensional paging - * bits 17:19 - common access permissions for all ptes in this shadow page - */ -union kvm_mmu_page_role { - unsigned word; - struct { - unsigned glevels:4; - unsigned level:4; - unsigned quadrant:2; - unsigned pad_for_nice_hex_output:6; - unsigned direct:1; - unsigned access:3; - unsigned invalid:1; - unsigned cr4_pge:1; - unsigned nxe:1; - }w; -}; - - -/* - * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level - * 32-bit). The kvm_mmu structure abstracts the details of the current mmu - * mode. - */ - -struct kvm_vcpu; -struct kvm_mmu_page; - -struct kvm_mmu { - void (*new_cr3)(struct kvm_vcpu *vcpu); - int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, uint32_t err); - void (*free)(struct kvm_vcpu *vcpu); - gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, uint32_t access, - uint32_t *error); - void (*prefetch_page)(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *page); - int (*sync_page)(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp); - void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); - hpa_t root_hpa; - int root_level; - int shadow_root_level; - union kvm_mmu_page_role base_role; - - uint64_t *pae_root; - uint64_t rsvd_bits_mask[2][4]; -}; - -struct kvm_pv_mmu_op_buffer { - void *ptr; - unsigned len; - unsigned processed; - char pad[2]; - char buf[512]; /* XXX aligned */ -}; - -/* * We don't want allocation failures within the mmu code, so we preallocate * enough memory for a single page fault in a cache. */ -#define KVM_NR_MEM_OBJS 40 -#define KVM_NR_DB_REGS 4 -struct kvm_mmu_memory_cache { - int nobjs; - void *objects[KVM_NR_MEM_OBJS]; -}; +#define KVM_NR_DB_REGS 4 -/* Type, address-of, and value of an instruction's operand. */ -struct operand { - enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type; - unsigned int bytes; - unsigned long val, orig_val, *ptr; +/* + * fxsave fpu state. Taken from x86_64/processor.h. 
To be killed when + * we have asm/x86/processor.h + */ +struct fxsave { + uint16_t cwd; + uint16_t swd; + uint16_t twd; + uint16_t fop; + uint64_t rip; + uint64_t rdp; + uint32_t mxcsr; + uint32_t mxcsr_mask; + uint32_t st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ +#ifdef CONFIG_X86_64 + uint32_t xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ +#else + uint32_t xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ +#endif }; struct i387_fxsave_struct { @@ -311,109 +237,11 @@ struct i387_fxsave_struct { } __attribute__((aligned(16))); -struct kvm_pio_request { - unsigned long count; - int cur_count; - gva_t guest_gva; - int in; - int port; - int size; - int string; - int down; - int rep; -}; - #define KVM_MAX_CPUID_ENTRIES 40 -enum kvm_reg { - VCPU_REGS_RAX = 0, - VCPU_REGS_RCX = 1, - VCPU_REGS_RDX = 2, - VCPU_REGS_RBX = 3, - VCPU_REGS_RSP = 4, - VCPU_REGS_RBP = 5, - VCPU_REGS_RSI = 6, - VCPU_REGS_RDI = 7, -#ifdef CONFIG_X86_64 - VCPU_REGS_R8 = 8, - VCPU_REGS_R9 = 9, - VCPU_REGS_R10 = 10, - VCPU_REGS_R11 = 11, - VCPU_REGS_R12 = 12, - VCPU_REGS_R13 = 13, - VCPU_REGS_R14 = 14, - VCPU_REGS_R15 = 15, -#endif - VCPU_REGS_RIP, - NR_VCPU_REGS -}; -enum kvm_reg_ex { - VCPU_EXREG_PDPTR = NR_VCPU_REGS, -}; - -struct kvm_cpuid_entry2 { - uint32_t function; - uint32_t index; - uint32_t flags; - uint32_t eax; - uint32_t ebx; - uint32_t ecx; - uint32_t edx; - uint32_t padding[3]; -}; - -struct fetch_cache { - unsigned char data[15]; - unsigned long start; - unsigned long end; -}; - -struct decode_cache { - unsigned char twobyte; - unsigned char b; - unsigned char lock_prefix; - unsigned char rep_prefix; - unsigned char op_bytes; - unsigned char ad_bytes; - unsigned char rex_prefix; - struct operand src; - struct operand src2; - struct operand dst; - unsigned char has_seg_override; - unsigned char seg_override; - unsigned int d; - unsigned long regs[NR_VCPU_REGS]; - unsigned long eip, eip_orig; - /* modrm */ - unsigned char modrm; - unsigned char modrm_mod; - unsigned char modrm_reg; - unsigned char modrm_rm; - unsigned char use_modrm_ea; - unsigned char rip_relative; - unsigned long modrm_ea; - void *modrm_ptr; - unsigned long modrm_val; - struct fetch_cache fetch; -}; +#include "kvm_emulate.h" -struct x86_emulate_ctxt { - /* Register state before/after emulation. */ - struct kvm_vcpu *vcpu; - - unsigned long eflags; - /* Emulated execution mode, represented by an X86EMUL_MODE value. */ - int mode; - uint32_t cs_base; - - /* interruptibility state, as a result of execution of STI or MOV SS */ - int interruptibility; - - /* decode cache */ - struct decode_cache decode; -}; - /* * These structs MUST NOT be changed. * They are the ABI between hypervisor and guest OS. 
@@ -444,34 +272,12 @@ struct pvclock_vcpu_time_info { unsigned char pad[3]; } __attribute__((__packed__)); /* 32 bytes */ -/* In the Intel processor's MTRR interface, the MTRR type is always held in - an 8 bit field: */ -typedef unsigned char mtrr_type; - -#define MTRR_NUM_FIXED_RANGES 88 -#define MTRR_MAX_VAR_RANGES 256 - -struct mtrr_var_range { - uint32_t base_lo; - uint32_t base_hi; - uint32_t mask_lo; - uint32_t mask_hi; -}; - -struct mtrr_state_type { - struct mtrr_var_range var_ranges[MTRR_MAX_VAR_RANGES]; - mtrr_type fixed_ranges[MTRR_NUM_FIXED_RANGES]; - unsigned char enabled; - unsigned char have_fixed; - mtrr_type def_type; -}; - #define APIC_LDR 0xD0 - +#ifdef _KERNEL struct kvm_lapic { unsigned long base_address; -#ifdef XXX struct kvm_io_device dev; +#ifdef XXX struct kvm_timer lapic_timer; #endif /*XXX*/ uint32_t divide_count; @@ -484,114 +290,7 @@ struct kvm_lapic { struct page *vapic_page; }; -struct kvm_vcpu_arch { - uint64_t host_tsc; - /* - * rip and regs accesses must go through - * kvm_{register,rip}_{read,write} functions. - */ - unsigned long regs[NR_VCPU_REGS]; - uint32_t regs_avail; - uint32_t regs_dirty; - - unsigned long cr0; - unsigned long cr0_guest_owned_bits; - unsigned long cr2; - unsigned long cr3; - unsigned long cr4; - unsigned long cr4_guest_owned_bits; - unsigned long cr8; - uint32_t hflags; - uint64_t pdptrs[4]; /* pae */ - uint64_t efer; - uint64_t apic_base; - struct kvm_lapic *apic; /* kernel irqchip context */ - int32_t apic_arb_prio; - int mp_state; - int sipi_vector; - uint64_t ia32_misc_enable_msr; - char tpr_access_reporting; - - struct kvm_mmu mmu; - /* only needed in kvm_pv_mmu_op() path, but it's hot so - * put it here to avoid allocation */ - struct kvm_pv_mmu_op_buffer mmu_op_buffer; - - struct kvm_mmu_memory_cache mmu_pte_chain_cache; - struct kvm_mmu_memory_cache mmu_rmap_desc_cache; - struct kvm_mmu_memory_cache mmu_page_cache; - struct kvm_mmu_memory_cache mmu_page_header_cache; - - gfn_t last_pt_write_gfn; - int last_pt_write_count; - uint64_t *last_pte_updated; - gfn_t last_pte_gfn; - - struct { - gfn_t gfn; /* presumed gfn during guest pte update */ - pfn_t pfn; /* pfn corresponding to that gfn */ - unsigned long mmu_seq; - } update_pte; - - struct i387_fxsave_struct host_fx_image; - struct i387_fxsave_struct guest_fx_image; - - gva_t mmio_fault_cr2; - struct kvm_pio_request pio; - void *pio_data; - - unsigned char event_exit_inst_len; - - struct kvm_queued_exception { - char pending; - char has_error_code; - unsigned char nr; - uint32_t error_code; - } exception; - - struct kvm_queued_interrupt { - char pending; - char soft; - unsigned char nr; - } interrupt; - - int halt_request; /* real mode on Intel only */ - - int cpuid_nent; - struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES]; - /* emulate context */ - - struct x86_emulate_ctxt emulate_ctxt; - - gpa_t time; - struct pvclock_vcpu_time_info hv_clock; - unsigned int hv_clock_tsc_khz; - unsigned int time_offset; - struct page *time_page; - - char nmi_pending; - char nmi_injected; - - struct mtrr_state_type mtrr_state; - uint32_t pat; - - int switch_db_regs; - unsigned long db[KVM_NR_DB_REGS]; - unsigned long dr6; - unsigned long dr7; - unsigned long eff_db[KVM_NR_DB_REGS]; - - uint64_t mcg_cap; - uint64_t mcg_status; - uint64_t mcg_ctl; - uint64_t *mce_banks; - - /* used for guest single stepping over the given code position */ - unsigned short singlestep_cs; - unsigned long singlestep_rip; - /* fields used by HYPER-V emulation */ - uint64_t hv_vapic; -}; +struct 
vcpu_vmx; struct kvm_vcpu { struct kvm *kvm; @@ -615,18 +314,37 @@ struct kvm_vcpu { sigset_t sigset; struct kstat stat; -#ifdef CONFIG_HAS_IOMEM + /*#ifdef CONFIG_HAS_IOMEM*/ int mmio_needed; int mmio_read_completed; int mmio_is_write; int mmio_size; unsigned char mmio_data[8]; gpa_t mmio_phys_addr; -#endif + /*#endif*/ struct kvm_vcpu_arch arch; }; + +#define KVM_NR_SHARED_MSRS 16 + +struct kvm_shared_msrs_global { + int nr; + uint32_t msrs[KVM_NR_SHARED_MSRS]; +}; + +struct kvm_shared_msrs { +#ifdef XXX + struct user_return_notifier urn; +#endif /*XXX*/ + int registered; + struct kvm_shared_msr_values { + uint64_t host; + uint64_t curr; + } values[KVM_NR_SHARED_MSRS]; +}; + struct kvm_memory_slot { gfn_t base_gfn; unsigned long npages; @@ -641,12 +359,6 @@ struct kvm_memory_slot { int user_alloc; }; -#define KVM_MEMORY_SLOTS 32 /* XXX assumes x86 */ -#define KVM_PRIVATE_MEM_SLOTS 4 /* XXX assumes x86 */ -#define TSS_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 0) -#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 1) -#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 2) - struct kvm_memslots { int nmemslots; @@ -654,6 +366,7 @@ struct kvm_memslots { KVM_PRIVATE_MEM_SLOTS]; }; +#endif /*_KERNEL*/ #ifdef x86 @@ -784,21 +497,18 @@ struct kvm_regs { uint64_t rip, rflags; }; +struct kvm_regs_ioc { + struct kvm_regs kvm_regs; + int kvm_cpu_index; + int kvm_kvmid; +}; + /* for KVM_GET_LAPIC and KVM_SET_LAPIC */ #define KVM_APIC_REG_SIZE 0x400 struct kvm_lapic_state { char regs[KVM_APIC_REG_SIZE]; }; -struct kvm_segment { - uint64_t base; - uint32_t limit; - unsigned short selector; - unsigned char type; - unsigned char present, dpl, db, s, l, g, avl; - unsigned char unusable; - unsigned char padding; -}; struct kvm_dtable { uint64_t base; @@ -822,6 +532,12 @@ struct kvm_sregs { uint64_t interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64]; }; +struct kvm_sregs_ioc { + struct kvm_sregs sregs; + int kvm_cpu_index; + int kvm_kvmid; +}; + /* When set in flags, include corresponding fields on KVM_SET_VCPU_EVENTS */ #define KVM_VCPUEVENT_VALID_NMI_PENDING 0x00000001 #define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002 @@ -930,13 +646,6 @@ struct kvm_assigned_dev_kernel { kmutex_t assigned_dev_lock; }; -#define NR_PTE_CHAIN_ENTRIES 5 - -struct kvm_pte_chain { - uint64_t *parent_ptes[NR_PTE_CHAIN_ENTRIES]; - list_t link; -}; - /** * container_of - cast a member of a structure out to the containing structure * @ptr: the pointer to the member. @@ -948,39 +657,6 @@ struct kvm_pte_chain { const typeof( ((type *)0)->member ) *__mptr = (ptr); \ (type *)( (char *)__mptr - offsetof(type,member) );}) - -struct kvm_mmu_page { - struct list_node link; - struct list_node hash_link; - - struct list_node oos_link; - - /* - * The following two entries are used to key the shadow page in the - * hash table. - */ - gfn_t gfn; - union kvm_mmu_page_role role; - - uint64_t *spt; - /* hold the gfn of each spte inside spt */ - gfn_t *gfns; - /* - * One bit set per slot which has memory - * in this shadow page. - */ - unsigned long slot_bitmap[BT_BITOUL(KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)]; - int multimapped; /* More than one parent_pte? 
*/ - int root_count; /* Currently serving as active root */ - char unsync; - unsigned int unsync_children; - union { - uint64_t *parent_pte; /* !multimapped */ - list_t parent_ptes; /* hash list, multimapped, kvm_pte_chain */ - }v; - unsigned long unsync_child_bitmap[BT_BITOUL(512)]; -}; - #define PT64_ROOT_LEVEL 4 #define PT32_ROOT_LEVEL 2 #define PT32E_ROOT_LEVEL 3 @@ -1069,6 +745,11 @@ struct kvm_fpu { uint32_t pad2; }; +struct kvm_fpu_ioc { + struct kvm_fpu fpu; + int kvm_cpu_index; + int kvm_kvmid; +}; struct kvm_msr_entry { uint32_t index; @@ -1084,6 +765,12 @@ struct kvm_msrs { struct kvm_msr_entry entries[1]; }; +struct kvm_msrs_ioc { + struct kvm_msrs *kvm_msrs; + int kvm_cpu_index; + int kvm_kvmid; +}; + /* for KVM_GET_MSR_INDEX_LIST */ struct kvm_msr_list { uint32_t nmsrs; /* number of msrs in entries */ @@ -1122,73 +809,10 @@ struct pvclock_wall_clock { #define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg)) #define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1) - -struct kvm_mem_alias { - gfn_t base_gfn; - unsigned long npages; - gfn_t target_gfn; -#define KVM_ALIAS_INVALID 1UL - unsigned long flags; -}; - -#define KVM_ARCH_HAS_UNALIAS_INSTANTIATION - -struct kvm_mem_aliases { - struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; - int naliases; -}; - -struct kvm_xen_hvm_config { - uint32_t flags; - uint32_t msr; - uint64_t blob_addr_32; - uint64_t blob_addr_64; - unsigned char blob_size_32; - unsigned char blob_size_64; - unsigned char pad2[30]; -}; - -struct kvm_arch { - struct kvm_mem_aliases *aliases; - - unsigned int n_free_mmu_pages; - unsigned int n_requested_mmu_pages; - unsigned int n_alloc_mmu_pages; - list_t mmu_page_hash[KVM_NUM_MMU_PAGES]; - /* - * Hash table of struct kvm_mmu_page. - */ - list_t active_mmu_pages; - list_t assigned_dev_head; - struct iommu_domain *iommu_domain; - int iommu_flags; - struct kvm_pic *vpic; - struct kvm_ioapic *vioapic; - struct kvm_pit *vpit; - int vapics_in_nmi_mode; - - unsigned int tss_addr; - struct page *apic_access_page; - - gpa_t wall_clock; - - struct page *ept_identity_pagetable; - char ept_identity_pagetable_done; - gpa_t ept_identity_map_addr; - - unsigned long irq_sources_bitmap; - uint64_t vm_init_tsc; - int64_t kvmclock_offset; - - struct kvm_xen_hvm_config xen_hvm_config; - - /* fields used by HYPER-V emulation */ - uint64_t hv_guest_os_id; - uint64_t hv_hypercall; -}; - #endif /*x86*/ +#ifdef _KERNEL + struct kvm { kmutex_t mmu_lock; kmutex_t requests_lock; @@ -1217,10 +841,10 @@ struct kvm { struct kstat kvm_kstat; struct kvm_arch arch; volatile int users_count; -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET + /*#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET*/ struct kvm_coalesced_mmio_dev *coalesced_mmio_dev; struct kvm_coalesced_mmio_ring *coalesced_mmio_ring; -#endif + /*#endif*/ kmutex_t irq_lock; #ifdef CONFIG_HAVE_KVM_IRQCHIP @@ -1238,6 +862,7 @@ struct kvm { #endif /*XXX*/ int kvmid; /* unique identifier for this kvm */ }; +#endif /*_KERNEL*/ #define KVM_EXIT_UNKNOWN 0 #define KVM_EXIT_EXCEPTION 1 @@ -1486,19 +1111,32 @@ static inline void native_load_tr_desc(void) #define _IO(x, y) ((x<<8)|y) /* original is in /usr/include/sys/ioccom.h */ #define KVMIO 0xAE +/* for KVM_SET_CPUID2/KVM_GET_CPUID2 */ +struct kvm_cpuid2_ioc { + struct cpuid_data *cpuid_data; + uint64_t kvm_vcpu_addr; + int kvm_cpu_index; +}; + +/* for KVM_RUN */ +struct kvm_run_ioc { + int kvm_kvmid; + int kvm_cpu_index; +}; + /* * ioctls for vcpu fds */ #define KVM_RUN _IO(KVMIO, 0x80) -#define KVM_GET_REGS _IOR(KVMIO, 0x81, struct kvm_regs) -#define KVM_SET_REGS 
_IOW(KVMIO, 0x82, struct kvm_regs) -#define KVM_GET_SREGS _IOR(KVMIO, 0x83, struct kvm_sregs) -#define KVM_SET_SREGS _IOW(KVMIO, 0x84, struct kvm_sregs) +#define KVM_GET_REGS _IOR(KVMIO, 0x81, struct kvm_regs_ioc) +#define KVM_SET_REGS _IOW(KVMIO, 0x82, struct kvm_regs_ioc) +#define KVM_GET_SREGS _IOR(KVMIO, 0x83, struct kvm_sregs_ioc) +#define KVM_SET_SREGS _IOW(KVMIO, 0x84, struct kvm_sregs_ioc) #define KVM_INTERRUPT _IOW(KVMIO, 0x86, struct kvm_interrupt) -#define KVM_GET_FPU _IOR(KVMIO, 0x8c, struct kvm_fpu) -#define KVM_SET_FPU _IOW(KVMIO, 0x8d, struct kvm_fpu) -#define KVM_GET_MSRS _IOWR(KVMIO, 0x88, struct kvm_msrs) -#define KVM_SET_MSRS _IOW(KVMIO, 0x89, struct kvm_msrs) +#define KVM_GET_FPU _IOR(KVMIO, 0x8c, struct kvm_fpu_ioc) +#define KVM_SET_FPU _IOW(KVMIO, 0x8d, struct kvm_fpu_ioc) +#define KVM_GET_MSRS _IOWR(KVMIO, 0x88, struct kvm_msrs_ioc) +#define KVM_SET_MSRS _IOW(KVMIO, 0x89, struct kvm_msrs_ioc) #define KVM_GET_MP_STATE _IOR(KVMIO, 0x98, struct kvm_mp_state) #define KVM_SET_MP_STATE _IOW(KVMIO, 0x99, struct kvm_mp_state) /* Available with KVM_CAP_VCPU_EVENTS */ @@ -1554,6 +1192,20 @@ struct vmcs_config { uint32_t vmexit_ctrl; uint32_t vmentry_ctrl; }; + +#define RMAP_EXT 4 + +struct kvm_rmap_desc { + uint64_t *sptes[RMAP_EXT]; + struct kvm_rmap_desc *more; +}; + + +static struct vmx_capability { + uint32_t ept; + uint32_t vpid; +} vmx_capability; + struct vmcs { uint32_t revision_id; uint32_t abort; @@ -1576,13 +1228,6 @@ struct kvm_dirty_log { }v; }; -/* for KVM_REGISTER_COALESCED_MMIO / KVM_UNREGISTER_COALESCED_MMIO */ - -struct kvm_coalesced_mmio_zone { - uint64_t addr; - uint32_t size; - uint32_t pad; -}; struct kvm_coalesced_mmio { uint64_t phys_addr; @@ -1613,8 +1258,8 @@ struct kvm_mp_state { uint32_t mp_state; }; -#define KVM_SET_CPUID2 _IOW(KVMIO, 0x90, struct kvm_cpuid2) -#define KVM_GET_CPUID2 _IOWR(KVMIO, 0x91, struct kvm_cpuid2) +#define KVM_SET_CPUID2 _IOW(KVMIO, 0x90, struct kvm_cpuid2_ioc) +#define KVM_GET_CPUID2 _IOWR(KVMIO, 0x91, struct kvm_cpuid2_ioc) /* for kvm_memory_region::flags */ #define KVM_MEM_LOG_DIRTY_PAGES 1UL @@ -1668,11 +1313,6 @@ struct kvm_vcpu_ioc { }; -/* for KVM_SET_CPUID2/KVM_GET_CPUID2 */ -struct kvm_cpuid2_ioc { - struct cpuid_data *cpuid_data; - uint64_t kvm_vcpu_addr; -}; /* LDT or TSS descriptor in the GDT. 16 bytes. 
*/ struct ldttss_desc64 { @@ -1684,6 +1324,13 @@ struct ldttss_desc64 { uint32_t zero1; } __attribute__((packed)); +struct shared_msr_entry { + unsigned index; + uint64_t data; + uint64_t mask; +}; + +#ifdef _KERNEL struct vcpu_vmx { struct kvm_vcpu vcpu; list_t local_vcpus_link; @@ -1732,8 +1379,6 @@ struct vcpu_vmx { char rdtscp_enabled; }; -#ifdef _KERNEL - /* * vcpu->requests bit members */ @@ -1826,87 +1471,5 @@ static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memsl #define INVALID_PAGE (~(hpa_t)0) -struct kvm_x86_ops { - int (*cpu_has_kvm_support)(void); /* __init */ - int (*disabled_by_bios)(void); /* __init */ - int (*hardware_enable)(void *dummy); - void (*hardware_disable)(void *dummy); - void (*check_processor_compatibility)(void *rtn); - int (*hardware_setup)(void); /* __init */ - void (*hardware_unsetup)(void); /* __exit */ - int (*cpu_has_accelerated_tpr)(void); - void (*cpuid_update)(struct kvm_vcpu *vcpu); - - /* Create, but do not attach this VCPU */ - struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); - void (*vcpu_free)(struct kvm_vcpu *vcpu); - int (*vcpu_reset)(struct kvm_vcpu *vcpu); - - void (*prepare_guest_switch)(struct kvm_vcpu *vcpu); - void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); - void (*vcpu_put)(struct kvm_vcpu *vcpu); - void (*set_guest_debug)(struct kvm_vcpu *vcpu, -#ifdef XXX - struct kvm_guest_debug *dbg); -#else - void *dbg); -#endif - - int (*get_msr)(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t *pdata); - int (*set_msr)(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t data); - uint64_t (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); - void (*get_segment)(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg); - int (*get_cpl)(struct kvm_vcpu *vcpu); - void (*set_segment)(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg); - void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); - void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu); - void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); - void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); - void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); - void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); - void (*set_efer)(struct kvm_vcpu *vcpu, uint64_t efer); - void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); - void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); - void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); - void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); - int (*get_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long *dest); - int (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value); - void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); - unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); - void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); - void (*fpu_activate)(struct kvm_vcpu *vcpu); - void (*fpu_deactivate)(struct kvm_vcpu *vcpu); - - void (*tlb_flush)(struct kvm_vcpu *vcpu); - - void (*run)(struct kvm_vcpu *vcpu); - int (*handle_exit)(struct kvm_vcpu *vcpu); - void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); - void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); - uint32_t (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); - void (*patch_hypercall)(struct kvm_vcpu *vcpu, - unsigned char *hypercall_addr); - void (*set_irq)(struct kvm_vcpu *vcpu); - void (*set_nmi)(struct kvm_vcpu *vcpu); - void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr, - int 
has_error_code, uint32_t error_code); - int (*interrupt_allowed)(struct kvm_vcpu *vcpu); - int (*nmi_allowed)(struct kvm_vcpu *vcpu); - int (*get_nmi_mask)(struct kvm_vcpu *vcpu); - void (*set_nmi_mask)(struct kvm_vcpu *vcpu, int masked); - void (*enable_nmi_window)(struct kvm_vcpu *vcpu); - void (*enable_irq_window)(struct kvm_vcpu *vcpu); - void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); - int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); - int (*get_tdp_level)(void); - uint64_t (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, int is_mmio); - int (*get_lpage_level)(void); - int (*rdtscp_supported)(void); - - const struct trace_print_flags *exit_reasons_str; -}; - #endif @@ -1,849 +1,570 @@ +#ifndef __KVM_HOST_H +#define __KVM_HOST_H + /* - * Kernel-based Virtual Machine driver for Linux - * - * This header defines architecture specific interfaces, x86 version - * * This work is licensed under the terms of the GNU GPL, version 2. See * the COPYING file in the top-level directory. - * */ -#ifndef _ASM_X86_KVM_HOST_H -#define _ASM_X86_KVM_HOST_H - #ifdef XXX #include <linux/types.h> +#include <linux/hardirq.h> +#include <linux/list.h> +#include <linux/mutex.h> +#include <linux/spinlock.h> +#include <linux/signal.h> +#include <linux/sched.h> #include <linux/mm.h> -#include <linux/mmu_notifier.h> -#include <linux/tracepoint.h> +#include <linux/preempt.h> +#include <linux/msi.h> +#include <asm/signal.h> #include <linux/kvm.h> #include <linux/kvm_para.h> -#include <linux/kvm_types.h> - -#include <asm/pvclock-abi.h> -#include <asm/desc.h> -#include <asm/mtrr.h> -#include <asm/msr-index.h> - -#endif - -#define KVM_PIO_PAGE_OFFSET 1 -#define KVM_COALESCED_MMIO_PAGE_OFFSET 2 - -#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) -#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) -#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ - 0xFFFFFF0000000000ULL) - -#define INVALID_PAGE (~(hpa_t)0) -#define UNMAPPED_GVA (~(gpa_t)0) - -/* KVM Hugepage definitions for x86 */ -#define KVM_NR_PAGE_SIZES 3 -#define KVM_HPAGE_SHIFT(x) (PAGESHIFT + (((x) - 1) * 9)) -#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) -#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) -#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGESIZE) - -#define DE_VECTOR 0 -#define DB_VECTOR 1 -#define BP_VECTOR 3 -#define OF_VECTOR 4 -#define BR_VECTOR 5 -#define UD_VECTOR 6 -#define NM_VECTOR 7 -#define DF_VECTOR 8 -#define TS_VECTOR 10 -#define NP_VECTOR 11 -#define SS_VECTOR 12 -#define GP_VECTOR 13 -#define PF_VECTOR 14 -#define MF_VECTOR 16 -#define MC_VECTOR 18 - -#define SELECTOR_TI_MASK (1 << 2) -#define SELECTOR_RPL_MASK 0x03 - -#define IOPL_SHIFT 12 - -#define KVM_ALIAS_SLOTS 4 - -#define KVM_PERMILLE_MMU_PAGES 20 -#define KVM_MIN_ALLOC_MMU_PAGES 64 -#define KVM_MMU_HASH_SHIFT 10 -#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT) -#define KVM_MIN_FREE_MMU_PAGES 5 -#define KVM_REFILL_PAGES 25 -#define KVM_MAX_CPUID_ENTRIES 40 -#define KVM_NR_FIXED_MTRR_REGION 88 -#define KVM_NR_VAR_MTRR 8 - -extern kmutex_t kvm_lock; -extern list_t vm_list; - -struct kvm_vcpu; -struct kvm; - -enum { - VCPU_SREG_ES, - VCPU_SREG_CS, - VCPU_SREG_SS, - VCPU_SREG_DS, - VCPU_SREG_FS, - VCPU_SREG_GS, - VCPU_SREG_TR, - VCPU_SREG_LDTR, -}; - -#ifdef XXX -#include <asm/kvm_emulate.h> #endif /*XXX*/ -#define KVM_NR_MEM_OBJS 40 +#include "kvm_types.h" -#define KVM_NR_DB_REGS 4 +#define KVM_MEMORY_SLOTS 32 /* XXX assumes x86 */ +#define KVM_PRIVATE_MEM_SLOTS 4 /* 
XXX assumes x86 */ +#define TSS_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 0) +#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 1) +#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 2) -#define DR6_BD (1 << 13) -#define DR6_BS (1 << 14) -#define DR6_FIXED_1 0xffff0ff0 -#define DR6_VOLATILE 0x0000e00f +#include "kvm_x86host.h" -#define DR7_BP_EN_MASK 0x000000ff -#define DR7_GE (1 << 9) -#define DR7_GD (1 << 13) -#define DR7_FIXED_1 0x00000400 -#define DR7_VOLATILE 0xffff23ff - -#ifdef XXX /* - * We don't want allocation failures within the mmu code, so we preallocate - * enough memory for a single page fault in a cache. + * vcpu->requests bit members */ -struct kvm_mmu_memory_cache { - int nobjs; - void *objects[KVM_NR_MEM_OBJS]; -}; - -#define NR_PTE_CHAIN_ENTRIES 5 +#define KVM_REQ_TLB_FLUSH 0 +#define KVM_REQ_MIGRATE_TIMER 1 +#define KVM_REQ_REPORT_TPR_ACCESS 2 +#define KVM_REQ_MMU_RELOAD 3 +#define KVM_REQ_TRIPLE_FAULT 4 +#define KVM_REQ_PENDING_TIMER 5 +#define KVM_REQ_UNHALT 6 +#define KVM_REQ_MMU_SYNC 7 +#define KVM_REQ_KVMCLOCK_UPDATE 8 +#define KVM_REQ_KICK 9 +#define KVM_REQ_DEACTIVATE_FPU 10 + +#define KVM_USERSPACE_IRQ_SOURCE_ID 0 -struct kvm_pte_chain { - u64 *parent_ptes[NR_PTE_CHAIN_ENTRIES]; - struct hlist_node link; -}; +struct kvm; +struct kvm_vcpu; +extern struct kmem_cache *kvm_vcpu_cache; /* - * kvm_mmu_page_role, below, is defined as: - * - * bits 0:3 - total guest paging levels (2-4, or zero for real mode) - * bits 4:7 - page table level for this shadow (1-4) - * bits 8:9 - page table quadrant for 2-level guests - * bit 16 - direct mapping of virtual to physical mapping at gfn - * used for real mode and two-dimensional paging - * bits 17:19 - common access permissions for all ptes in this shadow page + * It would be nice to use something smarter than a linear search, TBD... + * Thankfully we dont expect many devices to register (famous last words :), + * so until then it will suffice. At least its abstracted so we can change + * in one place. */ -union kvm_mmu_page_role { - unsigned word; - struct { - unsigned glevels:4; - unsigned level:4; - unsigned quadrant:2; - unsigned pad_for_nice_hex_output:6; - unsigned direct:1; - unsigned access:3; - unsigned invalid:1; - unsigned cr4_pge:1; - unsigned nxe:1; - }; -}; - -struct kvm_mmu_page { - struct list_head link; - struct hlist_node hash_link; - - struct list_head oos_link; - - /* - * The following two entries are used to key the shadow page in the - * hash table. - */ - gfn_t gfn; - union kvm_mmu_page_role role; - - u64 *spt; - /* hold the gfn of each spte inside spt */ - gfn_t *gfns; - /* - * One bit set per slot which has memory - * in this shadow page. - */ - DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); - int multimapped; /* More than one parent_pte? 
*/ - int root_count; /* Currently serving as active root */ - bool unsync; - unsigned int unsync_children; - union { - u64 *parent_pte; /* !multimapped */ - struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */ - }; - DECLARE_BITMAP(unsync_child_bitmap, 512); +struct kvm_io_bus { + int dev_count; +#define NR_IOBUS_DEVS 200 + struct kvm_io_device *devs[NR_IOBUS_DEVS]; }; -struct kvm_pv_mmu_op_buffer { - void *ptr; - unsigned len; - unsigned processed; - char buf[512] __aligned(sizeof(long)); +enum kvm_bus { + KVM_MMIO_BUS, + KVM_PIO_BUS, + KVM_NR_BUSES }; -struct kvm_pio_request { - unsigned long count; - int cur_count; - gva_t guest_gva; - int in; - int port; - int size; - int string; - int down; - int rep; -}; - -/* - * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level - * 32-bit). The kvm_mmu structure abstracts the details of the current mmu - * mode. - */ -struct kvm_mmu { - void (*new_cr3)(struct kvm_vcpu *vcpu); - int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); - void (*free)(struct kvm_vcpu *vcpu); - gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, - u32 *error); - void (*prefetch_page)(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *page); - int (*sync_page)(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp); - void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); - hpa_t root_hpa; - int root_level; - int shadow_root_level; - union kvm_mmu_page_role base_role; - - u64 *pae_root; - u64 rsvd_bits_mask[2][4]; -}; - -struct kvm_vcpu_arch { - u64 host_tsc; - /* - * rip and regs accesses must go through - * kvm_{register,rip}_{read,write} functions. - */ - unsigned long regs[NR_VCPU_REGS]; - u32 regs_avail; - u32 regs_dirty; - - unsigned long cr0; - unsigned long cr0_guest_owned_bits; - unsigned long cr2; - unsigned long cr3; - unsigned long cr4; - unsigned long cr4_guest_owned_bits; - unsigned long cr8; - u32 hflags; - u64 pdptrs[4]; /* pae */ - u64 efer; - u64 apic_base; - struct kvm_lapic *apic; /* kernel irqchip context */ - int32_t apic_arb_prio; - int mp_state; - int sipi_vector; - u64 ia32_misc_enable_msr; - bool tpr_access_reporting; - - struct kvm_mmu mmu; - /* only needed in kvm_pv_mmu_op() path, but it's hot so - * put it here to avoid allocation */ - struct kvm_pv_mmu_op_buffer mmu_op_buffer; - - struct kvm_mmu_memory_cache mmu_pte_chain_cache; - struct kvm_mmu_memory_cache mmu_rmap_desc_cache; - struct kvm_mmu_memory_cache mmu_page_cache; - struct kvm_mmu_memory_cache mmu_page_header_cache; - - gfn_t last_pt_write_gfn; - int last_pt_write_count; - u64 *last_pte_updated; - gfn_t last_pte_gfn; +int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, + int len, const void *val); +int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, + void *val); +int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, + struct kvm_io_device *dev); +int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, + struct kvm_io_device *dev); +#ifdef XXX +struct kvm_vcpu { + struct kvm *kvm; +#ifdef CONFIG_PREEMPT_NOTIFIERS + struct preempt_notifier preempt_notifier; +#endif + int vcpu_id; + struct mutex mutex; + int cpu; + struct kvm_run *run; + unsigned long requests; + unsigned long guest_debug; + int srcu_idx; + + int fpu_active; + int guest_fpu_loaded; + wait_queue_head_t wq; + int sigset_active; + sigset_t sigset; + struct kvm_vcpu_stat stat; + +#ifdef CONFIG_HAS_IOMEM + int mmio_needed; + int mmio_read_completed; + int mmio_is_write; + int mmio_size; + unsigned char 
mmio_data[8]; + gpa_t mmio_phys_addr; +#endif - struct { - gfn_t gfn; /* presumed gfn during guest pte update */ - pfn_t pfn; /* pfn corresponding to that gfn */ - unsigned long mmu_seq; - } update_pte; - - struct i387_fxsave_struct host_fx_image; - struct i387_fxsave_struct guest_fx_image; - - gva_t mmio_fault_cr2; - struct kvm_pio_request pio; - void *pio_data; - - u8 event_exit_inst_len; - - struct kvm_queued_exception { - bool pending; - bool has_error_code; - u8 nr; - u32 error_code; - } exception; - - struct kvm_queued_interrupt { - bool pending; - bool soft; - u8 nr; - } interrupt; - - int halt_request; /* real mode on Intel only */ - - int cpuid_nent; - struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES]; - /* emulate context */ - - struct x86_emulate_ctxt emulate_ctxt; - - gpa_t time; - struct pvclock_vcpu_time_info hv_clock; - unsigned int hv_clock_tsc_khz; - unsigned int time_offset; - struct page *time_page; - - bool nmi_pending; - bool nmi_injected; - - struct mtrr_state_type mtrr_state; - u32 pat; - - int switch_db_regs; - unsigned long db[KVM_NR_DB_REGS]; - unsigned long dr6; - unsigned long dr7; - unsigned long eff_db[KVM_NR_DB_REGS]; - - u64 mcg_cap; - u64 mcg_status; - u64 mcg_ctl; - u64 *mce_banks; - - /* used for guest single stepping over the given code position */ - u16 singlestep_cs; - unsigned long singlestep_rip; - /* fields used by HYPER-V emulation */ - u64 hv_vapic; + struct kvm_vcpu_arch arch; }; -struct kvm_mem_alias { +struct kvm_memory_slot { gfn_t base_gfn; unsigned long npages; - gfn_t target_gfn; -#define KVM_ALIAS_INVALID 1UL unsigned long flags; + unsigned long *rmap; + unsigned long *dirty_bitmap; + struct { + unsigned long rmap_pde; + int write_count; + } *lpage_info[KVM_NR_PAGE_SIZES - 1]; + unsigned long userspace_addr; + int user_alloc; }; -#define KVM_ARCH_HAS_UNALIAS_INSTANTIATION +static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memslot) +{ + return ALIGN(memslot->npages, BITS_PER_LONG) / 8; +} -struct kvm_mem_aliases { - struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; - int naliases; +struct kvm_kernel_irq_routing_entry { + uint32_t gsi; + uint32_t type; + int (*set)(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level); + union { + struct { + unsigned irqchip; + unsigned pin; + } irqchip; + struct msi_msg msi; + }; + struct hlist_node link; }; -struct kvm_arch { - struct kvm_mem_aliases *aliases; +#ifdef __KVM_HAVE_IOAPIC - unsigned int n_free_mmu_pages; - unsigned int n_requested_mmu_pages; - unsigned int n_alloc_mmu_pages; - struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; +struct kvm_irq_routing_table { + int chip[KVM_NR_IRQCHIPS][KVM_IOAPIC_NUM_PINS]; + struct kvm_kernel_irq_routing_entry *rt_entries; + uint32_t nr_rt_entries; /* - * Hash table of struct kvm_mmu_page. + * Array indexed by gsi. Each entry contains list of irq chips + * the gsi is connected to. 
*/ - struct list_head active_mmu_pages; - struct list_head assigned_dev_head; - struct iommu_domain *iommu_domain; - int iommu_flags; - struct kvm_pic *vpic; - struct kvm_ioapic *vioapic; - struct kvm_pit *vpit; - int vapics_in_nmi_mode; - - unsigned int tss_addr; - struct page *apic_access_page; - - gpa_t wall_clock; + struct hlist_head map[0]; +}; - struct page *ept_identity_pagetable; - bool ept_identity_pagetable_done; - gpa_t ept_identity_map_addr; +#else - unsigned long irq_sources_bitmap; - u64 vm_init_tsc; - s64 kvmclock_offset; +struct kvm_irq_routing_table {}; - struct kvm_xen_hvm_config xen_hvm_config; +#endif - /* fields used by HYPER-V emulation */ - u64 hv_guest_os_id; - u64 hv_hypercall; -}; +static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) +{ + smp_rmb(); + return kvm->vcpus[i]; +} -struct kvm_vm_stat { - u32 mmu_shadow_zapped; - u32 mmu_pte_write; - u32 mmu_pte_updated; - u32 mmu_pde_zapped; - u32 mmu_flooded; - u32 mmu_recycled; - u32 mmu_cache_miss; - u32 mmu_unsync; - u32 remote_tlb_flush; - u32 lpages; +#define kvm_for_each_vcpu(idx, vcpup, kvm) \ + for (idx = 0, vcpup = kvm_get_vcpu(kvm, idx); \ + idx < atomic_read(&kvm->online_vcpus) && vcpup; \ + vcpup = kvm_get_vcpu(kvm, ++idx)) + +int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id); +void kvm_vcpu_uninit(struct kvm_vcpu *vcpu); + +void vcpu_load(struct kvm_vcpu *vcpu); +void vcpu_put(struct kvm_vcpu *vcpu); + +int kvm_init(void *opaque, unsigned int vcpu_size, + struct module *module); +void kvm_exit(void); + +void kvm_get_kvm(struct kvm *kvm); +void kvm_put_kvm(struct kvm *kvm); + +#define HPA_MSB ((sizeof(hpa_t) * 8) - 1) +#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB) +static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; } +struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva); + +extern struct page *bad_page; +extern pfn_t bad_pfn; + +int is_error_page(struct page *page); +int is_error_pfn(pfn_t pfn); +int kvm_is_error_hva(unsigned long addr); +int kvm_set_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + int user_alloc); +int __kvm_set_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + int user_alloc); +int kvm_arch_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, + struct kvm_memory_slot old, + struct kvm_userspace_memory_region *mem, + int user_alloc); +void kvm_arch_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot old, + int user_alloc); +void kvm_disable_largepages(void); +void kvm_arch_flush_shadow(struct kvm *kvm); +gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn); +gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn); + +struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); +unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn); +void kvm_release_page_clean(struct page *page); +void kvm_release_page_dirty(struct page *page); +void kvm_set_page_dirty(struct page *page); +void kvm_set_page_accessed(struct page *page); + +pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); +pfn_t gfn_to_pfn_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn); +int memslot_id(struct kvm *kvm, gfn_t gfn); +void kvm_release_pfn_dirty(pfn_t); +void kvm_release_pfn_clean(pfn_t pfn); +void kvm_set_pfn_dirty(pfn_t pfn); +void kvm_set_pfn_accessed(pfn_t pfn); +void kvm_get_pfn(pfn_t pfn); + +int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, + int len); +int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, 
void *data, + unsigned long len); +int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); +int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, + int offset, int len); +int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, + unsigned long len); +int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); +int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); +struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); +int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn); +unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn); +void mark_page_dirty(struct kvm *kvm, gfn_t gfn); + +void kvm_vcpu_block(struct kvm_vcpu *vcpu); +void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu); +void kvm_resched(struct kvm_vcpu *vcpu); +void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); +void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); +void kvm_flush_remote_tlbs(struct kvm *kvm); +void kvm_reload_remote_mmus(struct kvm *kvm); + +long kvm_arch_dev_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg); +long kvm_arch_vcpu_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg); + +int kvm_dev_ioctl_check_extension(long ext); + +int kvm_get_dirty_log(struct kvm *kvm, + struct kvm_dirty_log *log, int *is_dirty); +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, + struct kvm_dirty_log *log); + +int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, + struct + kvm_userspace_memory_region *mem, + int user_alloc); +long kvm_arch_vm_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg); + +int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); +int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); + +int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, + struct kvm_translation *tr); + +int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs); +int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs); +int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs); +int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs); +int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state); +int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state); +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg); +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run); + +int kvm_arch_init(void *opaque); +void kvm_arch_exit(void); + +int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu); +void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu); + +void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu); +void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); +void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); +struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id); +int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu); +void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu); + +int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu); +int kvm_arch_hardware_enable(void *garbage); +void kvm_arch_hardware_disable(void *garbage); +int kvm_arch_hardware_setup(void); +void kvm_arch_hardware_unsetup(void); +void kvm_arch_check_processor_compat(void *rtn); +int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); + +void kvm_free_physmem(struct kvm *kvm); + +struct kvm *kvm_arch_create_vm(void); +void kvm_arch_destroy_vm(struct kvm *kvm); +void kvm_free_all_assigned_devices(struct kvm *kvm); +void 
kvm_arch_sync_events(struct kvm *kvm); + +int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); +void kvm_vcpu_kick(struct kvm_vcpu *vcpu); + +int kvm_is_mmio_pfn(pfn_t pfn); + +struct kvm_irq_ack_notifier { + struct hlist_node link; + unsigned gsi; + void (*irq_acked)(struct kvm_irq_ack_notifier *kian); }; -struct kvm_vcpu_stat { - u32 pf_fixed; - u32 pf_guest; - u32 tlb_flush; - u32 invlpg; - - u32 exits; - u32 io_exits; - u32 mmio_exits; - u32 signal_exits; - u32 irq_window_exits; - u32 nmi_window_exits; - u32 halt_exits; - u32 halt_wakeup; - u32 request_irq_exits; - u32 irq_exits; - u32 host_state_reload; - u32 efer_reload; - u32 fpu_reload; - u32 insn_emulation; - u32 insn_emulation_fail; - u32 hypercalls; - u32 irq_injections; - u32 nmi_injections; +#define KVM_ASSIGNED_MSIX_PENDING 0x1 +struct kvm_guest_msix_entry { + u32 vector; + u16 entry; + u16 flags; }; - -struct kvm_x86_ops { - int (*cpu_has_kvm_support)(void); /* __init */ - int (*disabled_by_bios)(void); /* __init */ - int (*hardware_enable)(void *dummy); - void (*hardware_disable)(void *dummy); - void (*check_processor_compatibility)(void *rtn); - int (*hardware_setup)(void); /* __init */ - void (*hardware_unsetup)(void); /* __exit */ - bool (*cpu_has_accelerated_tpr)(void); - void (*cpuid_update)(struct kvm_vcpu *vcpu); - - /* Create, but do not attach this VCPU */ - struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); - void (*vcpu_free)(struct kvm_vcpu *vcpu); - int (*vcpu_reset)(struct kvm_vcpu *vcpu); - - void (*prepare_guest_switch)(struct kvm_vcpu *vcpu); - void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); - void (*vcpu_put)(struct kvm_vcpu *vcpu); - - void (*set_guest_debug)(struct kvm_vcpu *vcpu, - struct kvm_guest_debug *dbg); - int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); - int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); - u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); - void (*get_segment)(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg); - int (*get_cpl)(struct kvm_vcpu *vcpu); - void (*set_segment)(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg); - void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); - void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu); - void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); - void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); - void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); - void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); - void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); - void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); - void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); - void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); - void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); - int (*get_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long *dest); - int (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value); - void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); - unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); - void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); - void (*fpu_activate)(struct kvm_vcpu *vcpu); - void (*fpu_deactivate)(struct kvm_vcpu *vcpu); - - void (*tlb_flush)(struct kvm_vcpu *vcpu); - - void (*run)(struct kvm_vcpu *vcpu); - int (*handle_exit)(struct kvm_vcpu *vcpu); - void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); - void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); - u32 (*get_interrupt_shadow)(struct 
kvm_vcpu *vcpu, int mask); - void (*patch_hypercall)(struct kvm_vcpu *vcpu, - unsigned char *hypercall_addr); - void (*set_irq)(struct kvm_vcpu *vcpu); - void (*set_nmi)(struct kvm_vcpu *vcpu); - void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr, - bool has_error_code, u32 error_code); - int (*interrupt_allowed)(struct kvm_vcpu *vcpu); - int (*nmi_allowed)(struct kvm_vcpu *vcpu); - bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); - void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked); - void (*enable_nmi_window)(struct kvm_vcpu *vcpu); - void (*enable_irq_window)(struct kvm_vcpu *vcpu); - void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); - int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); - int (*get_tdp_level)(void); - u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); - int (*get_lpage_level)(void); - bool (*rdtscp_supported)(void); - - const struct trace_print_flags *exit_reasons_str; +struct kvm_assigned_dev_kernel { + struct kvm_irq_ack_notifier ack_notifier; + struct work_struct interrupt_work; + struct list_head list; + int assigned_dev_id; + int host_segnr; + int host_busnr; + int host_devfn; + unsigned int entries_nr; + int host_irq; + bool host_irq_disabled; + struct msix_entry *host_msix_entries; + int guest_irq; + struct kvm_guest_msix_entry *guest_msix_entries; + unsigned long irq_requested_type; + int irq_source_id; + int flags; + struct pci_dev *dev; + struct kvm *kvm; + spinlock_t assigned_dev_lock; }; -extern struct kvm_x86_ops *kvm_x86_ops; - -int kvm_mmu_module_init(void); -void kvm_mmu_module_exit(void); - -void kvm_mmu_destroy(struct kvm_vcpu *vcpu); -int kvm_mmu_create(struct kvm_vcpu *vcpu); -int kvm_mmu_setup(struct kvm_vcpu *vcpu); -void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); -void kvm_mmu_set_base_ptes(u64 base_pte); -void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask); - -int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); -void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); -void kvm_mmu_zap_all(struct kvm *kvm); -unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); -void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); - -int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); - -int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, - const void *val, int bytes); -int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, - gpa_t addr, unsigned long *ret); -u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn); - -extern bool tdp_enabled; - -enum emulation_result { - EMULATE_DONE, /* no further processing */ - EMULATE_DO_MMIO, /* kvm_run filled with mmio request */ - EMULATE_FAIL, /* can't emulate this instruction */ +struct kvm_irq_mask_notifier { + void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked); + int irq; + struct hlist_node link; }; -#define EMULTYPE_NO_DECODE (1 << 0) -#define EMULTYPE_TRAP_UD (1 << 1) -#define EMULTYPE_SKIP (1 << 2) -int emulate_instruction(struct kvm_vcpu *vcpu, - unsigned long cr2, u16 error_code, int emulation_type); -void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context); -void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); -void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); -void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, - unsigned long *rflags); - -unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr); -void realmode_set_cr(struct kvm_vcpu 
*vcpu, int cr, unsigned long value, - unsigned long *rflags); -void kvm_enable_efer_bits(u64); -int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data); -int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); - -struct x86_emulate_ctxt; - -int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, - int size, unsigned port); -int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in, - int size, unsigned long count, int down, - gva_t address, int rep, unsigned port); -void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); -int kvm_emulate_halt(struct kvm_vcpu *vcpu); -int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); -int emulate_clts(struct kvm_vcpu *vcpu); -int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, - unsigned long *dest); -int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, - unsigned long value); - -void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); -int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); - -int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason); - -void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); -void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); -void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); -void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); -unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); -void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); -void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); - -int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); -int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); - -unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu); -void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); - -#ifdef XXX -void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); -void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); -#endif /*XXX*/ -void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, - u32 error_code); -bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); - -int kvm_pic_set_irq(void *opaque, int irq, int level); - -void kvm_inject_nmi(struct kvm_vcpu *vcpu); - -void fx_init(struct kvm_vcpu *vcpu); - -int emulator_write_emulated(unsigned long addr, - const void *val, - unsigned int bytes, - struct kvm_vcpu *vcpu); - - -void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); -void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *new, int bytes, - bool guest_initiated); -int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); -void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); -int kvm_mmu_load(struct kvm_vcpu *vcpu); -void kvm_mmu_unload(struct kvm_vcpu *vcpu); -void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); -gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); -gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); -gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); -gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); - -int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); - -int kvm_fix_hypercall(struct kvm_vcpu *vcpu); - -int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code); -void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); - -void kvm_enable_tdp(void); -void kvm_disable_tdp(void); - -int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); -int complete_pio(struct kvm_vcpu *vcpu); -bool kvm_check_iopl(struct kvm_vcpu *vcpu); - -struct 
kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn); +void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, + struct kvm_irq_mask_notifier *kimn); +void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, + struct kvm_irq_mask_notifier *kimn); +void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask); -static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) +#ifdef __KVM_HAVE_IOAPIC +void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic, + union kvm_ioapic_redirect_entry *entry, + unsigned long *deliver_bitmask); +#endif +int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level); +void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); +void kvm_register_irq_ack_notifier(struct kvm *kvm, + struct kvm_irq_ack_notifier *kian); +void kvm_unregister_irq_ack_notifier(struct kvm *kvm, + struct kvm_irq_ack_notifier *kian); +int kvm_request_irq_source_id(struct kvm *kvm); +void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); + +/* For vcpu->arch.iommu_flags */ +#define KVM_IOMMU_CACHE_COHERENCY 0x1 + +#ifdef CONFIG_IOMMU_API +int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot); +int kvm_iommu_map_guest(struct kvm *kvm); +int kvm_iommu_unmap_guest(struct kvm *kvm); +int kvm_assign_device(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev); +int kvm_deassign_device(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev); +#else /* CONFIG_IOMMU_API */ +static inline int kvm_iommu_map_pages(struct kvm *kvm, + gfn_t base_gfn, + unsigned long npages) { - struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); - - return (struct kvm_mmu_page *)page_private(page); + return 0; } -#endif /*XXX*/ - -static inline unsigned short kvm_read_fs(void) +static inline int kvm_iommu_map_guest(struct kvm *kvm) { - unsigned short seg; - asm("mov %%fs, %0" : "=g"(seg)); - return seg; + return -ENODEV; } -static inline unsigned short kvm_read_gs(void) +static inline int kvm_iommu_unmap_guest(struct kvm *kvm) { - unsigned short seg; - asm("mov %%gs, %0" : "=g"(seg)); - return seg; + return 0; } -static inline unsigned short kvm_read_ldt(void) +static inline int kvm_assign_device(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev) { - unsigned short ldt; - asm("sldt %0" : "=g"(ldt)); - return ldt; + return 0; } -static inline void kvm_load_fs(unsigned short sel) +static inline int kvm_deassign_device(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev) { - asm("mov %0, %%fs" : : "rm"(sel)); + return 0; } +#endif /* CONFIG_IOMMU_API */ -static inline void kvm_load_gs(unsigned short sel) +static inline void kvm_guest_enter(void) { - asm("mov %0, %%gs" : : "rm"(sel)); + account_system_vtime(current); + current->flags |= PF_VCPU; } -static inline void kvm_load_ldt(unsigned short sel) +static inline void kvm_guest_exit(void) { - asm("lldt %0" : : "rm"(sel)); + account_system_vtime(current); + current->flags &= ~PF_VCPU; } -struct descriptor_table { - unsigned short limit; - unsigned long base; -} __attribute__((packed)); - -static inline void kvm_get_idt(struct descriptor_table *table) +static inline gpa_t gfn_to_gpa(gfn_t gfn) { - asm("sidt %0" : "=m"(*table)); + return (gpa_t)gfn << PAGE_SHIFT; } -static inline void kvm_get_gdt(struct descriptor_table *table) +static inline hpa_t pfn_to_hpa(pfn_t pfn) { - asm("sgdt %0" : "=m"(*table)); + return (hpa_t)pfn << PAGE_SHIFT; } -/* - * FIXME: Accessing the desc_struct through its fields is more elegant, - * 
and should be the one valid thing to do. However, a lot of open code - * still touches the a and b accessors, and doing this allow us to do it - * incrementally. We keep the signature as a struct, rather than an union, - * so we can get rid of it transparently in the future -- glommer - */ -/* 8 byte segment descriptor */ -struct desc_struct { - union { - struct { - unsigned int a; - unsigned int b; - }a; - struct { - unsigned short limit0; - unsigned short base0; - unsigned base1: 8, type: 4, s: 1, dpl: 2, p: 1; - unsigned limit: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8; - }b; - }c; -} __attribute__((packed)); - -static inline unsigned long get_desc_base(const struct desc_struct *desc) +static inline void kvm_migrate_timers(struct kvm_vcpu *vcpu) { - return (unsigned)(desc->c.b.base0 | ((desc->c.b.base1) << 16) | ((desc->c.b.base2) << 24)); + set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests); } -extern unsigned long segment_base(uint16_t selector); +enum kvm_stat_kind { + KVM_STAT_VM, + KVM_STAT_VCPU, +}; + +struct kvm_stats_debugfs_item { + const char *name; + int offset; + enum kvm_stat_kind kind; + struct dentry *dentry; +}; +extern struct kvm_stats_debugfs_item debugfs_entries[]; +extern struct dentry *kvm_debugfs_dir; -static inline unsigned long kvm_read_tr_base(void) +#ifdef KVM_ARCH_WANT_MMU_NOTIFIER +static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_seq) { - unsigned short tr; - asm("str %0" : "=g"(tr)); - return segment_base(tr); + if (unlikely(vcpu->kvm->mmu_notifier_count)) + return 1; + /* + * Both reads happen under the mmu_lock and both values are + * modified under mmu_lock, so there's no need of smb_rmb() + * here in between, otherwise mmu_notifier_count should be + * read before mmu_notifier_seq, see + * mmu_notifier_invalidate_range_end write side. 
+ */ + if (vcpu->kvm->mmu_notifier_seq != mmu_seq) + return 1; + return 0; } +#endif -#ifdef CONFIG_X86_64 -static inline unsigned long read_msr(unsigned long msr) -{ - uint64_t value; +#ifndef KVM_ARCH_HAS_UNALIAS_INSTANTIATION +#define unalias_gfn_instantiation unalias_gfn +#endif + +#ifdef CONFIG_HAVE_KVM_IRQCHIP + +#define KVM_MAX_IRQ_ROUTES 1024 + +int kvm_setup_default_irq_routing(struct kvm *kvm); +int kvm_set_irq_routing(struct kvm *kvm, + const struct kvm_irq_routing_entry *entries, + unsigned nr, + unsigned flags); +void kvm_free_irq_routing(struct kvm *kvm); + +#else + +static inline void kvm_free_irq_routing(struct kvm *kvm) {} - rdmsrl(msr, value); - return value; -} #endif -#ifdef XXX -static inline void kvm_fx_save(struct i387_fxsave_struct *image) -{ - asm("fxsave (%0)":: "r" (image)); -} +#undef CONFIG_HAVE_KVM_EVENTFD -static inline void kvm_fx_restore(struct i387_fxsave_struct *image) -{ - asm("fxrstor (%0)":: "r" (image)); -} +#ifdef CONFIG_HAVE_KVM_EVENTFD + +void kvm_eventfd_init(struct kvm *kvm); +int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags); +void kvm_irqfd_release(struct kvm *kvm); +int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args); -static inline void kvm_fx_finit(void) +#else + +static inline void kvm_eventfd_init(struct kvm *kvm) {} +static inline int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags) { - asm("finit"); + return -EINVAL; } -#endif /*XXX*/ -static inline uint32_t get_rdx_init_val(void) + +static inline void kvm_irqfd_release(struct kvm *kvm) {} +static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) { - return 0x600; /* P6 family */ + return -ENOSYS; } -static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, uint32_t error_code) +#endif /* CONFIG_HAVE_KVM_EVENTFD */ + +#ifdef CONFIG_KVM_APIC_ARCHITECTURE +static inline bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) { -#ifdef XXX - kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); -#endif /*XXX*/ + return vcpu->kvm->bsp_vcpu_id == vcpu->vcpu_id; } +#endif -#define TSS_IOPB_BASE_OFFSET 0x66 -#define TSS_BASE_SIZE 0x68 -#define TSS_IOPB_SIZE (65536 / 8) -#define TSS_REDIRECTION_SIZE (256 / 8) -#define RMODE_TSS_SIZE \ - (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1) - -enum { - TASK_SWITCH_CALL = 0, - TASK_SWITCH_IRET = 1, - TASK_SWITCH_JMP = 2, - TASK_SWITCH_GATE = 3, -}; - -#define HF_GIF_MASK (1 << 0) -#define HF_HIF_MASK (1 << 1) -#define HF_VINTR_MASK (1 << 2) -#define HF_NMI_MASK (1 << 3) -#define HF_IRET_MASK (1 << 4) +#ifdef __KVM_HAVE_DEVICE_ASSIGNMENT -/* - * Hardware virtualization extension instructions may fault if a - * reboot turns off virtualization while processes are running. - * Trap the fault and ignore the instruction if that happens. 
- */ +long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, + unsigned long arg); -#ifdef XXX -#include "linkage.h" +#else -asmlinkage void kvm_handle_fault_on_reboot(void); +static inline long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, + unsigned long arg) +{ + return -ENOTTY; +} #endif - -#define __kvm_handle_fault_on_reboot(insn) \ - "666: " insn "\n\t" \ - ".pushsection .fixup, \"ax\" \n" \ - "667: \n\t" \ - __ASM_SIZE(push) " $666b \n\t" \ - ".popsection \n\t" \ - ".pushsection __ex_table, \"a\" \n\t" \ - _ASM_PTR " 666b, 667b \n\t" \ - ".popsection \n\t" - -#define KVM_ARCH_WANT_MMU_NOTIFIER - -#ifdef XXX -int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); -int kvm_age_hva(struct kvm *kvm, unsigned long hva); -void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); -int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); -int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); -int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); -int kvm_cpu_get_interrupt(struct kvm_vcpu *v); - -void kvm_define_shared_msr(unsigned index, uint32_t msr); -void kvm_set_shared_msr(unsigned index, uint64_t val, uint64_t mask); #endif /*XXX*/ -#endif /* _ASM_X86_KVM_HOST_H */ +#endif + @@ -18,6 +18,7 @@ #include <sys/thread.h> #include <sys/cpuvar.h> #include <vm/hat_i86.h> +#include <sys/segments.h> #include "msr-index.h" #include "msr.h" @@ -25,16 +26,24 @@ #include "processor-flags.h" #include "apicdef.h" #include "kvm_host.h" +#include "kvm_x86host.h" +#include "iodev.h" #define PER_CPU_ATTRIBUTES #define PER_CPU_DEF_ATTRIBUTES #define PER_CPU_BASE_SECTION ".data" #include "percpu-defs.h" +#include "coalesced_mmio.h" #include "kvm.h" +#include "irq.h" extern struct vmcs **vmxarea; static int vcpuid; +extern uint64_t native_read_msr_safe(unsigned int msr, + int *err); +extern int native_write_msr_safe(unsigned int msr, + unsigned low, unsigned high); unsigned long segment_base(uint16_t selector) { @@ -124,7 +133,7 @@ static void kvm_iommu_put_pages(struct kvm *kvm, gfn++; } - iommu_unmap_range(domain, gfn_to_gpa(base_gfn), PAGE_SIZE * npages); + iommu_unmap_range(domain, gfn_to_gpa(base_gfn), PAGESIZE * npages); } static int @@ -219,6 +228,7 @@ vmx_hardware_enable(void *garbage) uint64_t phys_addr = kvtop(per_cpu(vmxarea, cpu)); #else uint64_t phys_addr; + volatile int x; /* XXX - dtrace return probe missing */ pfn = hat_getpfnum(kas.a_hat, (caddr_t)vmxarea[cpu]); phys_addr = ((uint64_t)pfn << PAGESHIFT)|((uint64_t)vmxarea[cpu] & PAGEOFFSET); #endif @@ -249,9 +259,35 @@ vmx_hardware_enable(void *garbage) ept_sync_global(); #endif /*XXX*/ + x = 10; /*XXX*/ return 0; } +extern struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu); +extern void vmcs_writel(unsigned long field, unsigned long value); +extern unsigned long vmcs_readl(unsigned long field); + +unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) +{ + unsigned long rflags, save_rflags; + + rflags = vmcs_readl(GUEST_RFLAGS); + if (to_vmx(vcpu)->rmode.vm86_active) { + rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; + save_rflags = to_vmx(vcpu)->rmode.save_rflags; + rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; + } + return rflags; +} +void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +{ + if (to_vmx(vcpu)->rmode.vm86_active) { + to_vmx(vcpu)->rmode.save_rflags = rflags; + rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; + } + vmcs_writel(GUEST_RFLAGS, rflags); +} + int kvm_arch_hardware_enable(void *garbage) { #ifdef LATER @@ -362,19 +398,14 @@ kvm_dev_ioctl_check_extension(long ext, int *rval_p) return r; } 
-static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) -{ - return kvm->arch.vpic; -} - -static inline int irqchip_in_kernel(struct kvm *kvm) +int irqchip_in_kernel(struct kvm *kvm) { int ret; ret = (pic_irqchip(kvm) != NULL); #ifdef XXX smp_rmb(); -#endif /*XXX*/ +#endif return ret; } @@ -390,12 +421,16 @@ kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) vcpu->arch.mmu.root_hpa = INVALID_PAGE; #ifdef XXX if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) -#endif vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; -#ifdef XXX + else + vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; +#else + if (!irqchip_in_kernel(kvm) /* || kvm_vcpu_is_bsp(vcpu) */) + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; else vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; #endif + page = kmem_zalloc(PAGESIZE, KM_SLEEP); if (!page) { r = ENOMEM; @@ -414,6 +449,7 @@ kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) goto fail_mmu_destroy; } #endif /*XXX*/ + vcpu->arch.mce_banks = kmem_zalloc(KVM_MAX_MCE_BANKS * sizeof(uint64_t) * 4, KM_SLEEP); if (!vcpu->arch.mce_banks) { @@ -487,8 +523,7 @@ fail: * 32-bit hardware). */ -uint64_t -kvm_va2pa(caddr_t va) +uint64_t kvm_va2pa(caddr_t va) { uint64_t pa; @@ -496,12 +531,21 @@ kvm_va2pa(caddr_t va) return (pa); } +#ifdef XXX unsigned long *vmx_io_bitmap_a; unsigned long *vmx_io_bitmap_b; unsigned long *vmx_msr_bitmap_legacy; unsigned long *vmx_msr_bitmap_longmode; +#else +/* make these arrays to try to force into low 4GB memory...*/ +/* also need to be aligned... */ +__attribute__((__aligned__(PAGESIZE)))unsigned long vmx_io_bitmap_a[PAGESIZE/sizeof(unsigned long)]; +__attribute__((__aligned__(PAGESIZE)))unsigned long vmx_io_bitmap_b[PAGESIZE/sizeof(unsigned long)]; +__attribute__((__aligned__(PAGESIZE)))unsigned long vmx_msr_bitmap_legacy[PAGESIZE/sizeof(unsigned long)]; +__attribute__((__aligned__(PAGESIZE)))unsigned long vmx_msr_bitmap_longmode[PAGESIZE/sizeof(unsigned long)]; +#endif /*XXX*/ + -extern void vmcs_writel(unsigned long field, unsigned long value); static void vmcs_write16(unsigned long field, uint16_t value) { vmcs_writel(field, value); @@ -521,169 +565,64 @@ static void vmcs_write64(unsigned long field, uint64_t value) #endif } -extern unsigned long vmcs_readl(unsigned long field); - - -/* - * Sets up the vmcs for emulated real mode. 
- */ -static int vmx_vcpu_setup(struct vcpu_vmx *vmx) -{ - uint32_t host_sysenter_cs, msr_low, msr_high; - uint32_t junk; - uint64_t host_pat, tsc_this, tsc_base; - unsigned long a; - struct descriptor_table dt; - int i; - unsigned long kvm_vmx_return; - uint32_t exec_control; - -#ifdef XXX - /* I/O */ - vmcs_write64(IO_BITMAP_A, kvm_va2pa((caddr_t)vmx_io_bitmap_a)); - vmcs_write64(IO_BITMAP_B, kvm_va2pa((caddr_t)vmx_io_bitmap_b)); - - if (cpu_has_vmx_msr_bitmap()) - vmcs_write64(MSR_BITMAP, kvm_pa2va(vmx_msr_bitmap_legacy)); - - vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ - - /* Control */ - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, - vmcs_config.pin_based_exec_ctrl); - - exec_control = vmcs_config.cpu_based_exec_ctrl; - if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) { - exec_control &= ~CPU_BASED_TPR_SHADOW; -#ifdef CONFIG_X86_64 - exec_control |= CPU_BASED_CR8_STORE_EXITING | - CPU_BASED_CR8_LOAD_EXITING; -#endif - } - if (!enable_ept) - exec_control |= CPU_BASED_CR3_STORE_EXITING | - CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_INVLPG_EXITING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); - - if (cpu_has_secondary_exec_ctrls()) { - exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; - if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) - exec_control &= - ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; - if (vmx->vpid == 0) - exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; - if (!enable_ept) { - exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; - enable_unrestricted_guest = 0; - } - if (!enable_unrestricted_guest) - exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; - if (!ple_gap) - exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); - } - - if (ple_gap) { - vmcs_write32(PLE_GAP, ple_gap); - vmcs_write32(PLE_WINDOW, ple_window); - } - - vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf); - vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); - vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ - - vmcs_writel(HOST_CR0, getcr0()); /* 22.2.3 */ - vmcs_writel(HOST_CR4, getcr4()); /* 22.2.3, 22.2.5 */ - vmcs_writel(HOST_CR3, getcr3()); /* 22.2.3 FIXME: shadow tables */ - - vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ - vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ - vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ - vmcs_write16(HOST_FS_SELECTOR, kvm_read_fs()); /* 22.2.4 */ - vmcs_write16(HOST_GS_SELECTOR, kvm_read_gs()); /* 22.2.4 */ - vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ -#ifdef CONFIG_X86_64 - rdmsrl(MSR_FS_BASE, a); - vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ - rdmsrl(MSR_GS_BASE, a); - vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */ -#else - vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ - vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ -#endif +extern int enable_ept; +extern int enable_unrestricted_guest; +extern int emulate_invalid_guest_state; - vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ +static int bypass_guest_pf = 1; - kvm_get_idt(&dt); - vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ +extern void vmcs_clear(struct vmcs *vmcs); +extern void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu); +extern void vmx_vcpu_put(struct kvm_vcpu *vcpu); - asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); - vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */ - vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); +extern int vmx_vcpu_setup(struct vcpu_vmx *vmx); +extern int 
enable_vpid; - rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk); - vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs); - rdmsrl(MSR_IA32_SYSENTER_ESP, a); - vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */ - rdmsrl(MSR_IA32_SYSENTER_EIP, a); - vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */ +extern ulong_t *vmx_vpid_bitmap; +extern kmutex_t vmx_vpid_lock; - if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { - rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); - host_pat = msr_low | ((uint64_t) msr_high << 32); - vmcs_write64(HOST_IA32_PAT, host_pat); - } - if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { - rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); - host_pat = msr_low | ((uint64_t) msr_high << 32); - /* Write the default value follow host pat */ - vmcs_write64(GUEST_IA32_PAT, host_pat); - /* Keep arch.pat sync with GUEST_IA32_PAT */ - vmx->vcpu.arch.pat = host_pat; - } +static void allocate_vpid(struct vcpu_vmx *vmx) +{ + int vpid; - for (i = 0; i < NR_VMX_MSR; ++i) { - uint32_t index = vmx_msr_index[i]; - uint32_t data_low, data_high; - int j = vmx->nmsrs; - - if (rdmsr_safe(index, &data_low, &data_high) < 0) - continue; - if (wrmsr_safe(index, data_low, data_high) < 0) - continue; - vmx->guest_msrs[j].index = i; - vmx->guest_msrs[j].data = 0; - vmx->guest_msrs[j].mask = -1ull; - ++vmx->nmsrs; + vmx->vpid = 0; + if (!enable_vpid) + return; + mutex_enter(&vmx_vpid_lock); + vpid = bt_availbit(vmx_vpid_bitmap, VMX_NR_VPIDS); + if (vpid < VMX_NR_VPIDS) { + vmx->vpid = vpid; + BT_SET(vmx_vpid_bitmap, vpid); } + mutex_exit(&vmx_vpid_lock); +} - vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); - - /* 22.2.1, 20.8.1 */ - vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); - - vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); - vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; - if (enable_ept) - vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; - vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); +#ifdef XXX +static int alloc_identity_pagetable(struct kvm *kvm) +{ + struct kvm_userspace_memory_region kvm_userspace_mem; + int r = 0; - tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; - rdtscll(tsc_this); - if (tsc_this < vmx->vcpu.kvm->arch.vm_init_tsc) - tsc_base = tsc_this; + mutex_enter(&kvm->slots_lock); + if (kvm->arch.ept_identity_pagetable) + goto out; + kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; + kvm_userspace_mem.flags = 0; + kvm_userspace_mem.guest_phys_addr = + kvm->arch.ept_identity_map_addr; + kvm_userspace_mem.memory_size = PAGESIZE; + r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); + if (r) + goto out; - guest_write_tsc(0, tsc_base); -#endif /*XXX*/ - return 0; + kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, + kvm->arch.ept_identity_map_addr >> PAGESHIFT); +out: + mutex_exit(&kvm->slots_lock); + return r; } -extern void vmcs_clear(struct vmcs *vmcs); -extern void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu); -extern void vmx_vcpu_put(struct kvm_vcpu *vcpu); +#endif /*XXX*/ struct kvm_vcpu * vmx_create_vcpu(struct kvm *kvm, struct kvm_vcpu_ioc *arg, unsigned int id) @@ -694,9 +633,8 @@ vmx_create_vcpu(struct kvm *kvm, struct kvm_vcpu_ioc *arg, unsigned int id) if (!vmx) return NULL; -#ifdef NOTNOW + allocate_vpid(vmx); -#endif /*NOTNOW*/ err = kvm_vcpu_init(&vmx->vcpu, kvm, arg, id); if (err) { #ifdef NOTNOW @@ -713,8 +651,11 @@ vmx_create_vcpu(struct kvm *kvm, struct kvm_vcpu_ioc *arg, unsigned int id) } vmx->vmcs = kmem_zalloc(PAGESIZE, KM_SLEEP); - if (!vmx->vmcs) + if (!vmx->vmcs) { + 
kmem_free(vmx, sizeof(struct vcpu_vmx)); + vmx = NULL; return NULL; + } kpreempt_disable(); @@ -734,11 +675,16 @@ vmx_create_vcpu(struct kvm *kvm, struct kvm_vcpu_ioc *arg, unsigned int id) kpreempt_enable(); if (err) vmx->vmcs = NULL; -#ifdef NOTNOW if (vm_need_virtualize_apic_accesses(kvm)) +#ifdef XXX if (alloc_apic_access_page(kvm) != 0) +#endif /*XXX*/ goto free_vmcs; +#ifdef XXX + /* + * XXX For right now, we don't implement ept + */ if (enable_ept) { if (!kvm->arch.ept_identity_map_addr) kvm->arch.ept_identity_map_addr = @@ -746,13 +692,14 @@ vmx_create_vcpu(struct kvm *kvm, struct kvm_vcpu_ioc *arg, unsigned int id) if (alloc_identity_pagetable(kvm) != 0) goto free_vmcs; } +#endif /*XXX*/ -#endif /*NOTNOW*/ return &vmx->vcpu; -#ifdef XXX free_vmcs: - free_vmcs(vmx->vmcs); + kmem_free(vmx->vmcs, PAGESIZE); + vmx->vmcs = 0; +#ifdef XXX free_msrs: kfree(vmx->guest_msrs); uninit_vcpu: @@ -772,10 +719,8 @@ kvm_arch_vcpu_create(struct kvm *kvm, struct kvm_vcpu_ioc *arg, unsigned int id) return vmx_create_vcpu(kvm, arg, id); } -extern struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu); - +extern int enable_ept; -static int enable_ept = 1; static void update_exception_bitmap(struct kvm_vcpu *vcpu) { uint32_t eb; @@ -788,7 +733,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) eb |= 1u << BP_VECTOR; #endif /*XXX*/ - if (to_vmx(vcpu)->rmode.vm86_active) + if (((struct vcpu_vmx *)vcpu)->rmode.vm86_active) eb = ~0; if (enable_ept) eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ @@ -801,12 +746,12 @@ static inline uint32_t apic_get_reg(struct kvm_lapic *apic, int reg_off) { return *((uint32_t *) (apic->regs + reg_off)); } -static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, uint32_t val) +void apic_set_reg(struct kvm_lapic *apic, int reg_off, uint32_t val) { *((uint32_t *) (apic->regs + reg_off)) = val; } -static inline int kvm_apic_id(struct kvm_lapic *apic) +int kvm_apic_id(struct kvm_lapic *apic) { return (apic_get_reg(apic, APIC_ID) >> 24) & 0xff; } @@ -874,7 +819,7 @@ void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) vcpu->arch.cr8 = cr8; } -static int is_paging(struct kvm_vcpu *vcpu) +int is_paging(struct kvm_vcpu *vcpu) { #ifdef XXX return kvm_getcr0_bits(vcpu, X86_CR0_PG); @@ -885,8 +830,8 @@ static int is_paging(struct kvm_vcpu *vcpu) void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ? - KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); + unsigned long hw_cr4 = cr4 | (((struct vcpu_vmx *)vcpu)->rmode.vm86_active ? 
+ KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); vcpu->arch.cr4 = cr4; if (enable_ept) { @@ -904,7 +849,7 @@ void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { - struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vcpu_vmx *vmx = (struct vcpu_vmx *)vcpu; unsigned long hw_cr0; #ifdef XXX if (enable_unrestricted_guest) @@ -913,7 +858,6 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) else #endif /*XXX*/ hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON; - #ifdef XXX if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) enter_pmode(vcpu); @@ -933,10 +877,8 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } #endif -#ifdef XXX if (enable_ept) ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); -#endif /*XXX*/ if (!vcpu->fpu_active) hw_cr0 |= X86_CR0_TS | X86_CR0_MP; @@ -978,33 +920,145 @@ static void seg_setup(int seg) vmcs_write16(sf->selector, 0); vmcs_writel(sf->base, 0); vmcs_write32(sf->limit, 0xffff); -#ifdef XXX + if (enable_unrestricted_guest) { ar = 0x93; if (seg == VCPU_SREG_CS) ar |= 0x08; /* code segment */ } else -#endif /*XXX*/ ar = 0xf3; vmcs_write32(sf->ar_bytes, ar); } +static gva_t rmode_tss_base(struct kvm *kvm) +{ + if (!kvm->arch.tss_addr) { + struct kvm_memslots *slots; + gfn_t base_gfn; + +#ifdef XXX + slots = rcu_dereference(kvm->memslots); +#else + slots = kvm->memslots; +#endif /*XXX*/ + base_gfn = kvm->memslots->memslots[0].base_gfn + + kvm->memslots->memslots[0].npages - 3; + return base_gfn << PAGESHIFT; + } + return kvm->arch.tss_addr; +} + +extern int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, + int offset, int len); + +unsigned long empty_zero_page[PAGESIZE / sizeof(unsigned long)]; + +int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) +{ + return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len); +} + +static int init_rmode_tss(struct kvm *kvm) +{ + gfn_t fn = rmode_tss_base(kvm) >> PAGESHIFT; + uint16_t data = 0; + int ret = 0; + int r; + + r = kvm_clear_guest_page(kvm, fn, 0, PAGESIZE); + if (r < 0) + goto out; + data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; + r = kvm_write_guest_page(kvm, fn++, &data, + TSS_IOPB_BASE_OFFSET, sizeof(uint16_t)); + if (r < 0) + goto out; + r = kvm_clear_guest_page(kvm, fn++, 0, PAGESIZE); + if (r < 0) + goto out; + r = kvm_clear_guest_page(kvm, fn, 0, PAGESIZE); + if (r < 0) + goto out; + data = ~0; + r = kvm_write_guest_page(kvm, fn, &data, + RMODE_TSS_SIZE - 2 * PAGESIZE - 1, + sizeof(uint8_t)); + if (r < 0) + goto out; + + ret = 1; +out: + return ret; +} + +static int init_rmode_identity_map(struct kvm *kvm) +{ + int i, r, ret; + pfn_t identity_map_pfn; + uint32_t tmp; + + if (!enable_ept) + return 1; + if ((!kvm->arch.ept_identity_pagetable)) { + cmn_err(CE_WARN, "EPT: identity-mapping pagetable haven't been allocated!\n"); + return 0; + } + if ((kvm->arch.ept_identity_pagetable_done)) + return 1; + ret = 0; + identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGESHIFT; + r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGESIZE); + if (r < 0) + goto out; +#ifdef XXX + /* Set up identity-mapping pagetable for EPT in real mode */ + for (i = 0; i < PT32_ENT_PER_PAGE; i++) { + tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | + _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); + r = kvm_write_guest_page(kvm, identity_map_pfn, + &tmp, i * sizeof(tmp), sizeof(tmp)); + if (r < 0) + goto out; + } +#endif /*XXX*/ + kvm->arch.ept_identity_pagetable_done = 1; + ret = 1; 
+out: + return ret; +} + +static int init_rmode(struct kvm *kvm) +{ + if (!init_rmode_tss(kvm)) + return 0; + if (!init_rmode_identity_map(kvm)) + return 0; + return 1; +} + +extern void vmx_set_efer(struct kvm_vcpu *vcpu, uint64_t efer); +extern void kvm_register_write(struct kvm_vcpu *vcpu, + enum kvm_reg reg, + unsigned long val); +extern ulong kvm_read_cr0(struct kvm_vcpu *vcpu); +extern void setup_msrs(struct vcpu_vmx *vmx); + int vmx_vcpu_reset(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = (struct vcpu_vmx *)to_vmx(vcpu); + struct vcpu_vmx *vmx = (struct vcpu_vmx *)vcpu; uint64_t msr; int ret, idx; vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); #ifdef XXX idx = srcu_read_lock(&vcpu->kvm->srcu); +#endif /*XXX*/ if (!init_rmode(vmx->vcpu.kvm)) { ret = -ENOMEM; goto out; } -#endif vmx->rmode.vm86_active = 0; vmx->soft_vnmi_blocked = 0; @@ -1027,12 +1081,12 @@ int vmx_vcpu_reset(struct kvm_vcpu *vcpu) * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. */ -#ifdef XXX +#ifdef CONFIG_KVM_APIC_ARCHITECTURE if (kvm_vcpu_is_bsp(&vmx->vcpu)) { vmcs_write16(GUEST_CS_SELECTOR, 0xf000); vmcs_writel(GUEST_CS_BASE, 0x000f0000); } else { -#endif /*XXX*/ +#endif /*CONFIG_KVM_APIC_ARCHITECTURE*/ vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8); vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12); #ifdef XXX @@ -1064,9 +1118,9 @@ int vmx_vcpu_reset(struct kvm_vcpu *vcpu) kvm_rip_write(vcpu, 0xfff0); else kvm_rip_write(vcpu, 0); - - kvm_register_write(vcpu, VCPU_REGS_RSP, 0); #endif /*XXX*/ + kvm_register_write(vcpu, VCPU_REGS_RSP, 0); + vmcs_writel(GUEST_DR7, 0x400); vmcs_writel(GUEST_GDTR_BASE, 0); @@ -1082,9 +1136,7 @@ int vmx_vcpu_reset(struct kvm_vcpu *vcpu) /* Special registers */ vmcs_write64(GUEST_IA32_DEBUGCTL, 0); -#ifdef XXX setup_msrs(vmx); -#endif /*XXX*/ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ @@ -1101,20 +1153,18 @@ int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(vmx->vcpu.kvm->arch.apic_access_page)); #endif /*XXX*/ + if (vmx->vpid != 0) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; -#ifdef XXX vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */ -#endif /*XXX*/ vmx_set_cr4(&vmx->vcpu, 0); -#ifdef XXX vmx_set_efer(&vmx->vcpu, 0); +#ifdef XXX vmx_fpu_activate(&vmx->vcpu); #endif /*XXX*/ update_exception_bitmap(&vmx->vcpu); - #ifdef XXX vpid_sync_vcpu_all(vmx); #endif /*XXX*/ @@ -1148,18 +1198,424 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) extern void vcpu_load(struct kvm_vcpu *vcpu); -static int init_kvm_mmu(struct kvm_vcpu *vcpu) +static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) +{ +} + + +static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, + struct kmem_cache *base_cache, int min) +{ + caddr_t obj; + + if (cache->nobjs >= min) + return 0; + while (cache->nobjs < ARRAY_SIZE(cache->objects)) { + obj = kmem_cache_alloc(base_cache, KM_SLEEP); + if (!obj) + return -ENOMEM; + cache->objects[cache->nobjs++] = obj; + } + return 0; +} + +extern struct kmem_cache *pte_chain_cache; +extern struct kmem_cache *rmap_desc_cache; +extern struct kmem_cache *mmu_page_header_cache; + +/*XXX the following is called for tdp (two dimensional hardware paging */ +/* we dont support this right now */ +int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) +{ + int r = 0; + + r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache, + 
pte_chain_cache, 4); + if (r) + goto out; + r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, + rmap_desc_cache, 4); + if (r) + goto out; + r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); + if (r) + goto out; + r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, + mmu_page_header_cache, 4); +out: + return r; +} + + +static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) +{ + struct kvm_memory_slot *slot; + int host_level, level, max_level; +#ifdef XXX + slot = gfn_to_memslot(vcpu->kvm, large_gfn); + if (slot && slot->dirty_bitmap) + return PT_PAGE_TABLE_LEVEL; + + host_level = host_mapping_level(vcpu->kvm, large_gfn); + + if (host_level == PT_PAGE_TABLE_LEVEL) + return host_level; + + max_level = kvm_x86_ops->get_lpage_level() < host_level ? + kvm_x86_ops->get_lpage_level() : host_level; + + for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) + if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) + break; + + return level - 1; +#else + return 0; +#endif /*XXX*/ +} + +extern struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn); +unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) +{ + struct kvm_memory_slot *slot; + + gfn = unalias_gfn_instantiation(kvm, gfn); + slot = gfn_to_memslot_unaliased(kvm, gfn); + if (!slot || slot->flags & KVM_MEMSLOT_INVALID) + return bad_hva(); + return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGESIZE); +} + +int kvm_is_error_hva(unsigned long addr) +{ + return addr == bad_hva(); +} + +extern caddr_t bad_page; + +pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) +{ + unsigned long addr; + + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) { + get_page(bad_page); + return page_to_pfn(bad_page); + } + + return hva_to_pfn(kvm, addr); +} + +static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, + uint32_t error_code) +{ +#ifdef XXX + pfn_t pfn; + int r; + int level; + gfn_t gfn = gpa >> PAGESHIFT; + unsigned long mmu_seq; + + ASSERT(vcpu); + ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); + + r = mmu_topup_memory_caches(vcpu); + if (r) + return r; + + level = mapping_level(vcpu, gfn); + + gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); + + mmu_seq = vcpu->kvm->mmu_notifier_seq; + smp_rmb(); + + pfn = gfn_to_pfn(vcpu->kvm, gfn); + if (is_error_pfn(pfn)) { + kvm_release_pfn_clean(pfn); + return 1; + } + mutex_enter(&vcpu->kvm->mmu_lock); + if (mmu_notifier_retry(vcpu, mmu_seq)) + goto out_unlock; + kvm_mmu_free_some_pages(vcpu); + r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, + level, gfn, pfn); + mutex_exit(&vcpu->kvm->mmu_lock); + + return r; + +out_unlock: + spin_unlock(&vcpu->kvm->mmu_lock); + kvm_release_pfn_clean(pfn); +#endif /*XXX*/ + return 0; +} + +static void mmu_free_roots(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_mmu_page *sp; + + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return; + mutex_enter(&vcpu->kvm->mmu_lock); + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + hpa_t root = vcpu->arch.mmu.root_hpa; + + sp = page_header(root); + --sp->root_count; + if (!sp->root_count && sp->role.invalid) + kvm_mmu_zap_page(vcpu->kvm, sp); + vcpu->arch.mmu.root_hpa = INVALID_PAGE; + spin_unlock(&vcpu->kvm->mmu_lock); + return; + } + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->arch.mmu.pae_root[i]; + + if (root) { + root &= PT64_BASE_ADDR_MASK; + sp = page_header(root); + --sp->root_count; + if (!sp->root_count && sp->role.invalid) + kvm_mmu_zap_page(vcpu->kvm, sp); + } + vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; + } + 
spin_unlock(&vcpu->kvm->mmu_lock); + vcpu->arch.mmu.root_hpa = INVALID_PAGE; +} + +static void nonpaging_free(struct kvm_vcpu *vcpu) +{ + mmu_free_roots(vcpu); +} + +static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp) +{ + int i; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) + sp->spt[i] = shadow_trap_nonpresent_pte; +} + +static int nonpaging_sync_page(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp) +{ + return 1; +} + +static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) +{ +} + +int get_ept_level(void) +{ + return VMX_EPT_DEFAULT_GAW + 1; +} + +static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, + u32 access, u32 *error) +{ + if (error) + *error = 0; + return vaddr; +} + +static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) +{ + struct kvm_mmu *context = &vcpu->arch.mmu; + int maxphyaddr = cpuid_maxphyaddr(vcpu); + u64 exb_bit_rsvd = 0; + + if (!is_nx(vcpu)) + exb_bit_rsvd = rsvd_bits(63, 63); + switch (level) { + case PT32_ROOT_LEVEL: + /* no rsvd bits for 2 level 4K page table entries */ + context->rsvd_bits_mask[0][1] = 0; + context->rsvd_bits_mask[0][0] = 0; + if (is_cpuid_PSE36()) + /* 36bits PSE 4MB page */ + context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); + else + /* 32 bits PSE 4MB page */ + context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); + context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0]; + break; + case PT32E_ROOT_LEVEL: + context->rsvd_bits_mask[0][2] = + rsvd_bits(maxphyaddr, 63) | + rsvd_bits(7, 8) | rsvd_bits(1, 2); /* PDPTE */ + context->rsvd_bits_mask[0][1] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 62); /* PDE */ + context->rsvd_bits_mask[0][0] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 62); /* PTE */ + context->rsvd_bits_mask[1][1] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 62) | + rsvd_bits(13, 20); /* large page */ + context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0]; + break; + case PT64_ROOT_LEVEL: + context->rsvd_bits_mask[0][3] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8); + context->rsvd_bits_mask[0][2] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8); + context->rsvd_bits_mask[0][1] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51); + context->rsvd_bits_mask[0][0] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51); + context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; + context->rsvd_bits_mask[1][2] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51) | + rsvd_bits(13, 29); + context->rsvd_bits_mask[1][1] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51) | + rsvd_bits(13, 20); /* large page */ + context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0]; + break; + } +} + +static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *context = &vcpu->arch.mmu; + + context->new_cr3 = nonpaging_new_cr3; + context->page_fault = tdp_page_fault; + context->free = nonpaging_free; + context->prefetch_page = nonpaging_prefetch_page; + context->sync_page = nonpaging_sync_page; + context->invlpg = nonpaging_invlpg; + context->shadow_root_level = kvm_x86_ops->get_tdp_level(); + context->root_hpa = INVALID_PAGE; + + if (!is_paging(vcpu)) { + context->gva_to_gpa = nonpaging_gva_to_gpa; + context->root_level = 0; + } else if (is_long_mode(vcpu)) { + reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); + context->gva_to_gpa = paging64_gva_to_gpa; + context->root_level = PT64_ROOT_LEVEL; + } else if (is_pae(vcpu)) { + reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); + context->gva_to_gpa = paging64_gva_to_gpa; + context->root_level = PT32E_ROOT_LEVEL; + } 
else { + reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); + context->gva_to_gpa = paging32_gva_to_gpa; + context->root_level = PT32_ROOT_LEVEL; + } + + return 0; +} + +static int nonpaging_init_context(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *context = &vcpu->arch.mmu; + + context->new_cr3 = nonpaging_new_cr3; + context->page_fault = nonpaging_page_fault; + context->gva_to_gpa = nonpaging_gva_to_gpa; + context->free = nonpaging_free; + context->prefetch_page = nonpaging_prefetch_page; + context->sync_page = nonpaging_sync_page; + context->invlpg = nonpaging_invlpg; + context->root_level = 0; + context->shadow_root_level = PT32E_ROOT_LEVEL; + context->root_hpa = INVALID_PAGE; + return 0; +} + +static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) +{ + struct kvm_mmu *context = &vcpu->arch.mmu; + + ASSERT(is_pae(vcpu)); + context->new_cr3 = paging_new_cr3; + context->page_fault = paging64_page_fault; + context->gva_to_gpa = paging64_gva_to_gpa; + context->prefetch_page = paging64_prefetch_page; + context->sync_page = paging64_sync_page; + context->invlpg = paging64_invlpg; + context->free = paging_free; + context->root_level = level; + context->shadow_root_level = level; + context->root_hpa = INVALID_PAGE; + return 0; +} + +static int paging64_init_context(struct kvm_vcpu *vcpu) +{ + reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); + return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL); +} + +static int paging32_init_context(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *context = &vcpu->arch.mmu; + + reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); + context->new_cr3 = paging_new_cr3; + context->page_fault = paging32_page_fault; + context->gva_to_gpa = paging32_gva_to_gpa; + context->free = paging_free; + context->prefetch_page = paging32_prefetch_page; + context->sync_page = paging32_sync_page; + context->invlpg = paging32_invlpg; + context->root_level = PT32_ROOT_LEVEL; + context->shadow_root_level = PT32E_ROOT_LEVEL; + context->root_hpa = INVALID_PAGE; + return 0; +} + +static int paging32E_init_context(struct kvm_vcpu *vcpu) +{ + reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); + return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); +} + +static int init_kvm_softmmu(struct kvm_vcpu *vcpu) +{ + int r; + + ASSERT(vcpu); + ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + + if (!is_paging(vcpu)) + r = nonpaging_init_context(vcpu); + else if (is_long_mode(vcpu)) + r = paging64_init_context(vcpu); + else if (is_pae(vcpu)) + r = paging32E_init_context(vcpu); + else + r = paging32_init_context(vcpu); + + vcpu->arch.mmu.base_role.glevels = vcpu->arch.mmu.root_level; + + return r; +} + +int init_kvm_mmu(struct kvm_vcpu *vcpu) { vcpu->arch.update_pte.pfn = -1; /* bad_pfn */ #ifdef XXX + /* + * XXX currently, we won't support 2 dimensional paging. + * So the hardware will not do guest-virtual to guest-physical + * and guest-physical to host physical. So we'll need to + * implement "shadow" paging... + */ + if (tdp_enabled) return init_kvm_tdp_mmu(vcpu); else +#endif return init_kvm_softmmu(vcpu); -#else return 0; -#endif /*XXX*/ } int kvm_mmu_setup(struct kvm_vcpu *vcpu) @@ -1196,6 +1652,11 @@ free_vcpu: return r; } +void kvm_get_kvm(struct kvm *kvm) +{ + atomic_inc_32(&kvm->users_count); +} + /* * Creates some virtual cpus. Good luck creating more than one. 
*/ @@ -1217,9 +1678,8 @@ kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int32_t id, struct kvm_vcpu_ioc *arg, if (r) return r; -#ifdef NOTNOW - - mutex_lock(&kvm->lock); + mutex_enter(&kvm->lock); +#ifdef XXX if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) { r = -EINVAL; goto vcpu_destroy; @@ -1233,30 +1693,33 @@ kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int32_t id, struct kvm_vcpu_ioc *arg, BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]); +#endif /*XXX*/ + /* Now it's all set up, let userspace reach it */ kvm_get_kvm(kvm); -#endif /*NOTNOW*/ - *rval_p = vcpuid++; /* guarantee unique id */ + + *rval_p = kvm->online_vcpus; /* guarantee unique id */ + vcpu->vcpu_id = *rval_p; /* XXX need to protect online_vcpus */ - kvm->vcpus[kvm->online_vcpus] = vcpu; + kvm->vcpus[kvm->online_vcpus++] = vcpu; -#ifdef NOTNOW +#ifdef XXX smp_wmb(); - atomic_inc(&kvm->online_vcpus); +#endif /*XXX*/ + atomic_inc_32(&kvm->online_vcpus); #ifdef CONFIG_KVM_APIC_ARCHITECTURE if (kvm->bsp_vcpu_id == id) kvm->bsp_vcpu = vcpu; #endif - mutex_unlock(&kvm->lock); -#endif /*NOTNOW*/ + mutex_exit(&kvm->lock); return r; vcpu_destroy: #ifdef NOTNOW - mutex_unlock(&kvm->lock); + mutex_exit(&kvm->lock); kvm_arch_vcpu_destroy(vcpu); #endif /*NOTNOW*/ return r; @@ -1281,7 +1744,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, unsigned long userspace_addr; down_write(¤t->mm->mmap_sem); userspace_addr = do_mmap(NULL, 0, - npages * PAGE_SIZE, + npages * PAGESIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0); @@ -1330,6 +1793,240 @@ int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, return kvm_set_memory_region(kvm, mem, user_alloc); } +static inline struct kvm_coalesced_mmio_dev *to_mmio(struct kvm_io_device *dev) +{ + return container_of(dev, struct kvm_coalesced_mmio_dev, dev); +} + +static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev, + gpa_t addr, int len) +{ + struct kvm_coalesced_mmio_zone *zone; + struct kvm_coalesced_mmio_ring *ring; + unsigned avail; + int i; + + /* Are we able to batch it ? */ + + /* last is the first free entry + * check if we don't meet the first used entry + * there is always one unused entry in the buffer + */ + ring = dev->kvm->coalesced_mmio_ring; + avail = (ring->first - ring->last - 1) % KVM_COALESCED_MMIO_MAX; + if (avail < KVM_MAX_VCPUS) { + /* full */ + return 0; + } + + /* is it in a batchable area ? */ + + for (i = 0; i < dev->nb_zones; i++) { + zone = &dev->zone[i]; + + /* (addr,len) is fully included in + * (zone->addr, zone->size) + */ + + if (zone->addr <= addr && + addr + len <= zone->addr + zone->size) + return 1; + } + return 0; +} + +/* Caller must hold slots_lock. */ +int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, + struct kvm_io_device *dev) +{ + struct kvm_io_bus *new_bus, *bus; + + bus = kvm->buses[bus_idx]; + if (bus->dev_count > NR_IOBUS_DEVS-1) + return -ENOSPC; + + new_bus = kmem_zalloc(sizeof(struct kvm_io_bus), KM_SLEEP); + if (!new_bus) + return -ENOMEM; + memcpy(new_bus, bus, sizeof(struct kvm_io_bus)); + new_bus->devs[new_bus->dev_count++] = dev; +#ifdef XXX + rcu_assign_pointer(kvm->buses[bus_idx], new_bus); + synchronize_srcu_expedited(&kvm->srcu); +#endif /*XXX*/ + kmem_free(bus, sizeof(struct kvm_io_bus)); + + return 0; +} + +/* Caller must hold slots_lock. 
+/* Caller must hold slots_lock. */
+int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+			    struct kvm_io_device *dev)
+{
+	struct kvm_io_bus *new_bus, *bus;
+
+	bus = kvm->buses[bus_idx];
+	if (bus->dev_count > NR_IOBUS_DEVS-1)
+		return -ENOSPC;
+
+	new_bus = kmem_zalloc(sizeof(struct kvm_io_bus), KM_SLEEP);
+	if (!new_bus)
+		return -ENOMEM;
+	memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
+	new_bus->devs[new_bus->dev_count++] = dev;
+#ifdef XXX
+	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
+	synchronize_srcu_expedited(&kvm->srcu);
+#endif /*XXX*/
+	kmem_free(bus, sizeof(struct kvm_io_bus));
+
+	return 0;
+}
+
+/* Caller must hold slots_lock. */
+int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+			      struct kvm_io_device *dev)
+{
+	int i, r;
+	struct kvm_io_bus *new_bus, *bus;
+
+	new_bus = kmem_zalloc(sizeof(struct kvm_io_bus), KM_SLEEP);
+	if (!new_bus)
+		return -ENOMEM;
+
+	bus = kvm->buses[bus_idx];
+	memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
+
+	r = -ENOENT;
+	for (i = 0; i < new_bus->dev_count; i++)
+		if (new_bus->devs[i] == dev) {
+			r = 0;
+			new_bus->devs[i] = new_bus->devs[--new_bus->dev_count];
+			break;
+		}
+
+	if (r) {
+		kmem_free(new_bus, sizeof(struct kvm_io_bus));
+		return r;
+	}
+
+#ifdef XXX
+	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
+	synchronize_srcu_expedited(&kvm->srcu);
+#endif
+	kmem_free(bus, sizeof(struct kvm_io_bus));
+	return r;
+}
+
+static int coalesced_mmio_write(struct kvm_io_device *this,
+				gpa_t addr, int len, const void *val)
+{
+	struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
+	struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring;
+	if (!coalesced_mmio_in_range(dev, addr, len))
+		return -EOPNOTSUPP;
+
+	mutex_enter(&dev->lock);
+
+	/* copy data in first free entry of the ring */
+
+	ring->coalesced_mmio[ring->last].phys_addr = addr;
+	ring->coalesced_mmio[ring->last].len = len;
+	memcpy(ring->coalesced_mmio[ring->last].data, val, len);
+#ifdef XXX
+	smp_wmb();
+#endif /*XXX*/
+	ring->last = (ring->last + 1) % KVM_COALESCED_MMIO_MAX;
+	mutex_exit(&dev->lock);
+	return 0;
+}
+
+static void coalesced_mmio_destructor(struct kvm_io_device *this)
+{
+	struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
+
+	kmem_free(dev, sizeof(struct kvm_coalesced_mmio_dev));
+}
+
+static const struct kvm_io_device_ops coalesced_mmio_ops = {
+	.write = coalesced_mmio_write,
+	.destructor = coalesced_mmio_destructor,
+};
+
+
+int kvm_coalesced_mmio_init(struct kvm *kvm)
+{
+	struct kvm_coalesced_mmio_dev *dev;
+	caddr_t *page;
+	int ret;
+
+	ret = -ENOMEM;
+	page = kmem_zalloc(PAGESIZE, KM_SLEEP);
+	if (!page)
+		goto out_err;
+	kvm->coalesced_mmio_ring = (struct kvm_coalesced_mmio_ring *)page;
+
+	ret = -ENOMEM;
+	dev = kmem_alloc(sizeof(struct kvm_coalesced_mmio_dev), KM_SLEEP);
+	if (!dev)
+		goto out_free_page;
+	mutex_init(&dev->lock, NULL, MUTEX_DRIVER, 0);
+	kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops);
+	dev->kvm = kvm;
+	kvm->coalesced_mmio_dev = dev;
+
+	mutex_enter(&kvm->slots_lock);
+	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &dev->dev);
+	mutex_exit(&kvm->slots_lock);
+	if (ret < 0)
+		goto out_free_dev;
+
+	return ret;
+
+out_free_dev:
+	kmem_free(dev, sizeof(struct kvm_coalesced_mmio_dev));
+out_free_page:
+	kmem_free(page, PAGESIZE);
+out_err:
+	return ret;
+}
+
+void kvm_coalesced_mmio_free(struct kvm *kvm)
+{
+	if (kvm->coalesced_mmio_ring)
+		kmem_free(kvm->coalesced_mmio_ring, PAGESIZE);
+}
+
+int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm,
+					 struct kvm_coalesced_mmio_zone *zone)
+{
+	struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev;
+
+	if (dev == NULL)
+		return -EINVAL;
+
+	mutex_enter(&kvm->slots_lock);
+	if (dev->nb_zones >= KVM_COALESCED_MMIO_ZONE_MAX) {
+		mutex_exit(&kvm->slots_lock);
+		return -ENOBUFS;
+	}
+
+	dev->zone[dev->nb_zones] = *zone;
+	dev->nb_zones++;
+
+	mutex_exit(&kvm->slots_lock);
+	return 0;
+}
+
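For orientation (not part of this commit): in the Linux KVM ABI, the ring filled by coalesced_mmio_write() is mmap'd by userspace at KVM_COALESCED_MMIO_PAGE_OFFSET and drained by advancing ring->first. Assuming this port keeps the same ring layout, the consumer side is roughly the loop below; flush_coalesced_mmio() and handle_mmio_write() are made-up names standing in for the VMM's own dispatch.

	/* Sketch of a userspace drain loop over the shared ring. */
	void
	flush_coalesced_mmio(struct kvm_coalesced_mmio_ring *ring)
	{
		while (ring->first != ring->last) {
			struct kvm_coalesced_mmio *ent =
			    &ring->coalesced_mmio[ring->first];

			handle_mmio_write(ent->phys_addr, ent->data, ent->len);
			/* release the slot only after the entry has been consumed */
			ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
		}
	}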
+	i = dev->nb_zones;
+	while (i) {
+		z = &dev->zone[i - 1];
+
+		/* unregister all zones
+		 * included in (zone->addr, zone->size)
+		 */
+
+		if (zone->addr <= z->addr &&
+		    z->addr + z->size <= zone->addr + zone->size) {
+			dev->nb_zones--;
+			*z = dev->zone[dev->nb_zones];
+		}
+		i--;
+	}
+
+	mutex_exit(&kvm->slots_lock);
+
+	return 0;
+}
+
long
kvm_vm_ioctl(struct kvm *kvmp, unsigned int ioctl, unsigned long arg, int mode)
{
@@ -1358,6 +2055,7 @@ kvm_vm_ioctl(struct kvm *kvmp, unsigned int ioctl, unsigned long arg, int mode)
			goto out;
		break;
	}
+#endif /*NOTNOW*/

#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	case KVM_REGISTER_COALESCED_MMIO: {
@@ -1385,13 +2083,12 @@ kvm_vm_ioctl(struct kvm *kvmp, unsigned int ioctl, unsigned long arg, int mode)
		break;
	}
#endif
-
+#ifdef XXX
	case KVM_IRQFD: {
		struct kvm_irqfd data;

-		r = -EFAULT;
-		if (copy_from_user(&data, argp, sizeof data))
-			goto out;
+		if (ddi_copyin(argp, &data, sizeof data, mode))
+			return (EFAULT);
		r = kvm_irqfd(kvmp, data.fd, data.gsi, data.flags);
		break;
	}
@@ -1404,24 +2101,21 @@ kvm_vm_ioctl(struct kvm *kvmp, unsigned int ioctl, unsigned long arg, int mode)
		r = kvm_ioeventfd(kvmp, &data);
		break;
	}
-#endif /*NOTNOW*/
+
#ifdef CONFIG_KVM_APIC_ARCHITECTURE
	case KVM_SET_BOOT_CPU_ID:
		r = 0;
-		mutex_lock(&kvmp->lock);
+		mutex_enter(&kvmp->lock);
		if (atomic_read(&kvmp->online_vcpus) != 0)
			r = -EBUSY;
		else
			kvmp->bsp_vcpu_id = arg;
-		mutex_unlock(&kvmp->lock);
+		mutex_exit(&kvmp->lock);
		break;
#endif
-#ifdef NOTNOW
+#endif /*XXX*/
	default:
-		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
-		if (r == -ENOTTY)
-			r = kvm_vm_ioctl_assigned_device(kvmp, ioctl, arg);
-#endif /*NOTNOW*/
+		return EINVAL;
	}
out:

diff --git a/msr.h b/msr.h
@@ -12,7 +12,7 @@
#ifdef _KERNEL

#include "asm.h"
-
+#include <sys/ontrap.h>
#include <sys/errno.h>

#ifdef XXX
@@ -76,21 +76,11 @@ static inline unsigned long long native_read_msr(unsigned int msr)
}

-static inline unsigned long long native_read_msr_safe(unsigned int msr,
-						       int *err)
-{
-	DECLARE_ARGS(val, low, high);
+extern uint64_t native_read_msr_safe(unsigned int msr,
+				     int *err);
+extern int native_write_msr_safe(unsigned int msr,
+				 unsigned low, unsigned high);

-	asm volatile("2: rdmsr ; xor %[err],%[err]\n"
-		     "1:\n\t"
-		     ".section .fixup,\"ax\"\n\t"
-		     "3: mov %[fault],%[err] ; jmp 1b\n\t"
-		     ".previous\n\t"
-		     _ASM_EXTABLE(2b, 3b)
-		     : [err] "=r" (*err), EAX_EDX_RET(val, low, high)
-		     : "c" (msr), [fault] "i" (-EIO));
-	return EAX_EDX_VAL(val, low, high);
-}

static inline void native_write_msr(unsigned int msr,
				    unsigned low, unsigned high)
@@ -98,23 +88,6 @@ static inline void native_write_msr(unsigned int msr,
	asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory");
}

-/* Can be uninlined because referenced by paravirt */
-static inline int native_write_msr_safe(unsigned int msr,
-					unsigned low, unsigned high)
-{
-	int err;
-	asm volatile("2: wrmsr ; xor %[err],%[err]\n"
-		     "1:\n\t"
-		     ".section .fixup,\"ax\"\n\t"
-		     "3: mov %[fault],%[err] ; jmp 1b\n\t"
-		     ".previous\n\t"
-		     _ASM_EXTABLE(2b, 3b)
-		     : [err] "=a" (err)
-		     : "c" (msr), "0" (low), "d" (high),
-		       [fault] "i" (-EIO)
-		     : "memory");
-	return err;
-}

extern unsigned long long native_read_tsc(void);
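The msr.h hunks above drop the Linux exception-table asm for the "safe" MSR accessors, declare them extern instead, and pull in <sys/ontrap.h>, which suggests the out-of-line versions are meant to trap-protect the raw instructions. Their definitions are not part of this hunk; the following is only a sketch of what an on_trap()-based version could look like, assuming illumos's rdmsr(uint_t)/wrmsr(uint_t, uint64_t) kernel routines and OT_DATA_ACCESS protection as used elsewhere in the kernel for probing MSRs.

	/* Sketch only -- not the commit's implementation. */
	uint64_t
	native_read_msr_safe(unsigned int msr, int *err)
	{
		on_trap_data_t otd;
		uint64_t val = 0;

		if (on_trap(&otd, OT_DATA_ACCESS) == 0) {
			val = rdmsr(msr);	/* #GP on a bad MSR unwinds to on_trap() */
			*err = 0;
		} else {
			*err = -EIO;
		}
		no_trap();
		return (val);
	}

	int
	native_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
	{
		on_trap_data_t otd;
		int err = 0;

		if (on_trap(&otd, OT_DATA_ACCESS) == 0)
			wrmsr(msr, ((uint64_t)high << 32) | low);
		else
			err = -EIO;
		no_trap();
		return (err);
	}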