-rw-r--r--  Makefile    |   13
-rw-r--r--  kvm.c       | 6799
-rw-r--r--  kvm.h       |  683
-rw-r--r--  kvm_host.h  | 1201
-rw-r--r--  kvm_x86.c   | 1144
-rw-r--r--  msr.h       |   37
6 files changed, 8115 insertions, 1762 deletions
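
The kvm.c hunks below replace Linux's exception-table-based rdmsr_safe()/wrmsr_safe() with the illumos on_trap() facility (see native_read_msr_safe() in the diff, which guards the access with on_trap(&otd, OT_DATA_ACCESS)). As a minimal sketch of that pattern — the wrapper name is hypothetical and it assumes the standard <sys/ontrap.h> and <sys/x86_archext.h> kernel interfaces — a fault-tolerant MSR read looks roughly like this:

    #include <sys/types.h>
    #include <sys/errno.h>
    #include <sys/ontrap.h>
    #include <sys/x86_archext.h>    /* rdmsr() */

    /* Hypothetical helper: read an MSR that may #GP on this CPU. */
    static int
    example_rdmsr_safe(uint_t msr, uint64_t *valp)
    {
            on_trap_data_t otd;
            int err = 0;

            /*
             * on_trap() returns 0 on the initial call; if the protected
             * code traps, control returns here with a nonzero value and
             * we fall into the error arm instead of panicking.
             */
            if (on_trap(&otd, OT_DATA_ACCESS) == 0)
                    *valp = rdmsr(msr);
            else
                    err = EINVAL;   /* faulted: treat as unimplemented MSR */
            no_trap();
            return (err);
    }

The same guard-then-no_trap() shape shows up in both the MSR read and write paths added by this commit.
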
diff --git a/Makefile b/Makefile
index 70cb946..b74e15d 100644
--- a/Makefile
+++ b/Makefile
@@ -9,16 +9,19 @@ LD=/usr/bin/ld
CTFCONVERT=$(KERNEL_SOURCE)/usr/src/tools/proto/opt/onbld/bin/i386/ctfconvert
CTFMERGE=$(KERNEL_SOURCE)/usr/src/tools/proto/opt/onbld/bin/i386/ctfmerge
-CFLAGS += -D_KERNEL -D_MACHDEP -Dx86 -DCONFIG_X86_64 -DDEBUG -c -O -g
-INCLUDEDIR= -I $(KERNEL_SOURCE)/usr/src/uts/intel -I $(KERNEL_SOURCE)/usr/src/uts/i86pc
+CFLAGS += -D_KERNEL -D_MACHDEP -Dx86 -DCONFIG_X86_64 -DDEBUG -c -g -DCONFIG_SOLARIS -DCONFIG_KVM_MMIO
-kvm: kvm.c kvm_x86.c kvm.h
+INCLUDEDIR= -I $(KERNEL_SOURCE)/usr/src/uts/intel -I $(KERNEL_SOURCE)/usr/src/uts/i86pc -I $(KERNEL_SOURCE)/usr/src/uts/common
+
+kvm: kvm.c kvm_x86.c emulate.c kvm.h kvm_x86host.h
$(CC) $(CFLAGS) $(INCLUDEDIR) kvm.c
$(CC) $(CFLAGS) $(INCLUDEDIR) kvm_x86.c
+ $(CC) $(CFLAGS) $(INCLUDEDIR) emulate.c
$(CTFCONVERT) -i -L VERSION kvm.o
$(CTFCONVERT) -i -L VERSION kvm_x86.o
- $(LD) -r -o kvm kvm.o kvm_x86.o
- $(CTFMERGE) -L VERSION -o kvm kvm.o kvm_x86.o
+ $(CTFCONVERT) -i -L VERSION emulate.o
+ $(LD) -r -o kvm kvm.o kvm_x86.o emulate.o
+ $(CTFMERGE) -L VERSION -o kvm kvm.o kvm_x86.o emulate.o
install: kvm
@echo "==> Installing kvm module"
diff --git a/kvm.c b/kvm.c
index 30508bc..e8a9d27 100644
--- a/kvm.c
+++ b/kvm.c
@@ -24,7 +24,15 @@
#include "msr.h"
#include "irqflags.h"
#include "kvm_host.h"
+#include "kvm_x86host.h"
+#include "processor-flags.h"
+#include "hyperv.h"
+#include "apicdef.h"
+#include "segment.h"
+#include "iodev.h"
#include "kvm.h"
+#include "irq.h"
+#include "tss.h"
int kvmid; /* monotonically increasing, unique per vm */
int largepages_enabled = 1;
@@ -126,6 +134,69 @@ extern void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
static int vmx_set_tss_addr(struct kvm *kvmp, uintptr_t addr);
static int vmx_hardware_setup(void);
extern int vmx_hardware_enable(void *garbage);
+extern unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu);
+void vmcs_writel(unsigned long field, unsigned long value);
+unsigned long vmcs_readl(unsigned long field);
+extern void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
+void vmx_get_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg);
+static void vmx_set_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg);
+static void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr);
+static int vmx_get_msr(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t *pdata);
+static int vmx_set_msr(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t data);
+static void vmx_vcpu_run(struct kvm_vcpu *vcpu);
+static void vmx_save_host_state(struct kvm_vcpu *vcpu);
+
+struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
+{
+ return container_of(vcpu, struct vcpu_vmx, vcpu);
+}
+
+static int vmx_handle_exit(struct kvm_vcpu *vcpu);
+int vmx_interrupt_allowed(struct kvm_vcpu *vcpu);
+static int vmx_get_lpage_level(void);
+static int vmx_rdtscp_supported(void);
+void vmx_set_efer(struct kvm_vcpu *vcpu, uint64_t efer);
+static uint64_t vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg);
+static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
+static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
+static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
+static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
+static int vmx_get_cpl(struct kvm_vcpu *vcpu);
+int get_ept_level(void);
+
+static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
+{
+#ifdef XXX
+ vpid_sync_vcpu_all(to_vmx(vcpu));
+ if (enable_ept)
+ ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
+#endif
+}
+
+static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+{
+ unsigned long guest_cr3;
+ uint64_t eptp;
+
+ guest_cr3 = cr3;
+#ifdef XXX
+ if (enable_ept) {
+ /*
+ * ept not implemented right now...
+ */
+ eptp = construct_eptp(cr3);
+ vmcs_write64(EPT_POINTER, eptp);
+ guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 :
+ vcpu->kvm->arch.ept_identity_map_addr;
+ ept_load_pdptrs(vcpu);
+ }
+#endif /*XXX*/
+
+ vmx_flush_tlb(vcpu);
+ vmcs_writel(GUEST_CR3, guest_cr3);
+}
static struct kvm_x86_ops vmx_x86_ops = {
.cpu_has_kvm_support = nulldev/*cpu_has_kvm_support*/,
@@ -141,38 +212,38 @@ static struct kvm_x86_ops vmx_x86_ops = {
.vcpu_free = nulldev /*vmx_free_vcpu*/,
.vcpu_reset = vmx_vcpu_reset,
- .prepare_guest_switch = nulldev /*vmx_save_host_state*/,
+ .prepare_guest_switch = vmx_save_host_state /*vmx_save_host_state*/,
.vcpu_load = vmx_vcpu_load,
.vcpu_put = vmx_vcpu_put,
.set_guest_debug = nulldev /*set_guest_debug*/,
- .get_msr = nulldev /*vmx_get_msr*/,
- .set_msr = nulldev /*vmx_set_msr*/,
- .get_segment_base = nulldev /*vmx_get_segment_base*/,
- .get_segment = nulldev /*vmx_get_segment*/,
- .set_segment = nulldev /*vmx_set_segment*/,
- .get_cpl = nulldev /*vmx_get_cpl*/,
+ .get_msr = vmx_get_msr /*vmx_get_msr*/,
+ .set_msr = vmx_set_msr /*vmx_set_msr*/,
+ .get_segment_base = vmx_get_segment_base /*vmx_get_segment_base*/,
+ .get_segment = vmx_get_segment /*vmx_get_segment*/,
+ .set_segment = vmx_set_segment /*vmx_set_segment*/,
+ .get_cpl = vmx_get_cpl /*vmx_get_cpl*/,
.get_cs_db_l_bits = nulldev /*vmx_get_cs_db_l_bits*/,
.decache_cr0_guest_bits = nulldev /*vmx_decache_cr0_guest_bits*/,
.decache_cr4_guest_bits = nulldev /*vmx_decache_cr4_guest_bits*/,
.set_cr0 = vmx_set_cr0,
- .set_cr3 = nulldev /*vmx_set_cr3*/,
+ .set_cr3 = vmx_set_cr3 /*vmx_set_cr3*/,
.set_cr4 = vmx_set_cr4,
- .set_efer = nulldev /*vmx_set_efer*/,
- .get_idt = nulldev /*vmx_get_idt*/,
- .set_idt = nulldev /*vmx_set_idt*/,
- .get_gdt = nulldev /*vmx_get_gdt*/,
- .set_gdt = nulldev /*vmx_set_gdt*/,
+ .set_efer = vmx_set_efer /*vmx_set_efer*/,
+ .get_idt = vmx_get_idt /*vmx_get_idt*/,
+ .set_idt = vmx_set_idt /*vmx_set_idt*/,
+ .get_gdt = vmx_get_gdt /*vmx_get_gdt*/,
+ .set_gdt = vmx_set_gdt /*vmx_set_gdt*/,
.cache_reg = nulldev /*vmx_cache_reg*/,
- .get_rflags = nulldev /*vmx_get_rflags*/,
- .set_rflags = nulldev /*vmx_set_rflags*/,
+ .get_rflags = vmx_get_rflags /*vmx_get_rflags*/,
+ .set_rflags = vmx_set_rflags /*vmx_set_rflags*/,
.fpu_activate = nulldev /*vmx_fpu_activate*/,
.fpu_deactivate = nulldev /*vmx_fpu_deactivate*/,
.tlb_flush = nulldev /*vmx_flush_tlb*/,
- .run = nulldev /*vmx_vcpu_run*/,
- .handle_exit = nulldev /*vmx_handle_exit*/,
+ .run = vmx_vcpu_run /*vmx_vcpu_run*/,
+ .handle_exit = vmx_handle_exit /*vmx_handle_exit*/,
.skip_emulated_instruction = nulldev /*skip_emulated_instruction*/,
.set_interrupt_shadow = nulldev /*vmx_set_interrupt_shadow*/,
.get_interrupt_shadow = nulldev /*vmx_get_interrupt_shadow*/,
@@ -180,28 +251,62 @@ static struct kvm_x86_ops vmx_x86_ops = {
.set_irq = nulldev /*vmx_inject_irq*/,
.set_nmi = nulldev /*vmx_inject_nmi*/,
.queue_exception = nulldev /*vmx_queue_exception*/,
- .interrupt_allowed = nulldev /*vmx_interrupt_allowed*/,
+ .interrupt_allowed = vmx_interrupt_allowed /*vmx_interrupt_allowed*/,
.nmi_allowed = nulldev /*vmx_nmi_allowed*/,
.get_nmi_mask = nulldev /*vmx_get_nmi_mask*/,
.set_nmi_mask = nulldev /*vmx_set_nmi_mask*/,
.enable_nmi_window = nulldev /*enable_nmi_window*/,
.enable_irq_window = nulldev /*enable_irq_window*/,
- .update_cr8_intercept = nulldev /*update_cr8_intercept*/,
+ .update_cr8_intercept = vmx_update_cr8_intercept /*update_cr8_intercept*/,
.set_tss_addr = vmx_set_tss_addr,
- .get_tdp_level = nulldev /*get_ept_level*/,
+ .get_tdp_level = get_ept_level /*get_ept_level*/,
.get_mt_mask = nulldev /*vmx_get_mt_mask*/,
.exit_reasons_str = nulldev /*vmx_exit_reasons_str*/,
- .get_lpage_level = nulldev /*vmx_get_lpage_level*/,
+ .get_lpage_level = vmx_get_lpage_level /*vmx_get_lpage_level*/,
.cpuid_update = nulldev /*vmx_cpuid_update*/,
- .rdtscp_supported = nulldev /*vmx_rdtscp_supported*/,
+ .rdtscp_supported = vmx_rdtscp_supported /*vmx_rdtscp_supported*/,
};
struct kvm_x86_ops *kvm_x86_ops;
+uint32_t vmcs_read32(unsigned long field)
+{
+ return vmcs_readl(field);
+}
+
+void vmcs_write32(unsigned long field, uint32_t value)
+{
+ vmcs_writel(field, value);
+}
+
+static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+{
+ dt->limit = vmcs_read32(GUEST_IDTR_LIMIT);
+ dt->base = vmcs_readl(GUEST_IDTR_BASE);
+}
+
+static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+{
+ vmcs_write32(GUEST_IDTR_LIMIT, dt->limit);
+ vmcs_writel(GUEST_IDTR_BASE, dt->base);
+}
+
+static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+{
+ dt->limit = vmcs_read32(GUEST_GDTR_LIMIT);
+ dt->base = vmcs_readl(GUEST_GDTR_BASE);
+}
+
+static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+{
+ vmcs_write32(GUEST_GDTR_LIMIT, dt->limit);
+ vmcs_writel(GUEST_GDTR_BASE, dt->base);
+}
+
/*
* In linux, there is a separate vmx kernel module from the kvm driver.
* That may be a good idea, but we're going to do everything in
@@ -212,8 +317,8 @@ struct kvm_x86_ops *kvm_x86_ops;
struct vmcs **vmxarea; /* 1 per cpu */
-static int alloc_kvm_area(void)
-{
+static int alloc_kvm_area(void){
+
int i, j;
/*
@@ -244,6 +349,39 @@ static int alloc_kvm_area(void)
extern struct vmcs_config vmcs_config;
+static int adjust_vmx_controls(uint32_t ctl_min, uint32_t ctl_opt,
+ uint32_t msr, uint32_t *result)
+{
+ uint32_t vmx_msr_low, vmx_msr_high;
+ uint32_t ctl = ctl_min | ctl_opt;
+
+ rdmsr(msr, vmx_msr_low, vmx_msr_high);
+
+ ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
+ ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */
+
+ /* Ensure minimum (required) set of control bits are supported. */
+ if (ctl_min & ~ctl)
+ return EIO;
+
+ *result = ctl;
+ return DDI_SUCCESS;
+}
+
+/* Pure 2^n version of get_order */
+static inline int get_order(unsigned long size)
+{
+ int order;
+
+ size = (size - 1) >> (PAGESHIFT - 1);
+ order = -1;
+ do {
+ size >>= 1;
+ order++;
+ } while (size);
+ return order;
+}
+
static int setup_vmcs_config(struct vmcs_config *vmcs_conf)
{
uint32_t vmx_msr_low, vmx_msr_high;
@@ -254,12 +392,11 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf)
uint32_t _vmexit_control = 0;
uint32_t _vmentry_control = 0;
-#ifdef XXX
min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
opt = PIN_BASED_VIRTUAL_NMIS;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
- &_pin_based_exec_control) < 0)
- return -EIO;
+ &_pin_based_exec_control) != DDI_SUCCESS)
+ return EIO;
min = CPU_BASED_HLT_EXITING |
#ifdef CONFIG_X86_64
@@ -278,8 +415,8 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf)
CPU_BASED_USE_MSR_BITMAPS |
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
- &_cpu_based_exec_control) < 0)
- return -EIO;
+ &_cpu_based_exec_control) != DDI_SUCCESS)
+ return EIO;
#ifdef CONFIG_X86_64
if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
_cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
@@ -296,8 +433,8 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf)
SECONDARY_EXEC_RDTSCP;
if (adjust_vmx_controls(min2, opt2,
MSR_IA32_VMX_PROCBASED_CTLS2,
- &_cpu_based_2nd_exec_control) < 0)
- return -EIO;
+ &_cpu_based_2nd_exec_control) != DDI_SUCCESS)
+ return EIO;
}
#ifndef CONFIG_X86_64
if (!(_cpu_based_2nd_exec_control &
@@ -320,15 +457,14 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf)
#endif
opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
- &_vmexit_control) < 0)
- return -EIO;
+ &_vmexit_control) != DDI_SUCCESS)
+ return EIO;
min = 0;
opt = VM_ENTRY_LOAD_IA32_PAT;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
- &_vmentry_control) < 0)
- return -EIO;
-#endif /*XXX*/
+ &_vmentry_control) != DDI_SUCCESS)
+ return EIO;
rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
@@ -347,29 +483,100 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf)
return EIO;
vmcs_conf->size = vmx_msr_high & 0x1fff;
-#ifdef XXX
vmcs_conf->order = get_order(vmcs_config.size);
-#endif
vmcs_conf->revision_id = vmx_msr_low;
-#ifdef XXX
vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
vmcs_conf->vmexit_ctrl = _vmexit_control;
vmcs_conf->vmentry_ctrl = _vmentry_control;
-#endif
+
return 0;
}
+/* EFER defaults:
+ * - enable syscall per default because its emulated by KVM
+ * - enable LME and LMA per default on 64 bit KVM
+ */
+#ifdef CONFIG_X86_64
+static uint64_t efer_reserved_bits = 0xfffffffffffffafeULL;
+#else
+static uint64_t efer_reserved_bits = 0xfffffffffffffffeULL;
+#endif
+
+static int bypass_guest_pf = 1;
+int enable_vpid = 1;
+static int flexpriority_enabled = 1;
+int enable_ept = 0;
+int enable_unrestricted_guest = 1;
+int emulate_invalid_guest_state = 0;
+
+void kvm_enable_efer_bits(uint64_t mask)
+{
+ efer_reserved_bits &= ~mask;
+}
+
+static inline int cpu_has_vmx_vpid(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENABLE_VPID;
+}
+
+static inline int cpu_has_vmx_ept(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENABLE_EPT;
+}
+static inline int cpu_has_vmx_unrestricted_guest(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_UNRESTRICTED_GUEST;
+}
+
+static inline int cpu_has_vmx_tpr_shadow(void)
+{
+ return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
+}
+
+static inline int cpu_has_vmx_virtualize_apic_accesses(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+}
+
+static inline int cpu_has_vmx_flexpriority(void)
+{
+ return cpu_has_vmx_tpr_shadow() &&
+ cpu_has_vmx_virtualize_apic_accesses();
+}
+
+static inline int cpu_has_vmx_ept_2m_page(void)
+{
+ return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT);
+}
+
+void kvm_disable_largepages(void)
+{
+ largepages_enabled = 0;
+}
+
+static inline int cpu_has_vmx_ple(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_PAUSE_LOOP_EXITING;
+}
+
static int vmx_hardware_setup(void)
{
- if (setup_vmcs_config(&vmcs_config) < 0)
+ if (setup_vmcs_config(&vmcs_config) != DDI_SUCCESS)
return EIO;
#ifdef XXX
if (boot_cpu_has(X86_FEATURE_NX))
kvm_enable_efer_bits(EFER_NX);
+#endif /*XXX*/
+
if (!cpu_has_vmx_vpid())
enable_vpid = 0;
@@ -390,10 +597,11 @@ static int vmx_hardware_setup(void)
if (enable_ept && !cpu_has_vmx_ept_2m_page())
kvm_disable_largepages();
-
+#ifdef XXX
if (!cpu_has_vmx_ple())
ple_gap = 0;
-#endif /*XXX*/
+#endif
+
return alloc_kvm_area();
}
@@ -402,42 +610,520 @@ int kvm_arch_hardware_setup(void)
return kvm_x86_ops->hardware_setup();
}
-int kvm_mmu_module_init(void)
+struct kmem_cache *pte_chain_cache;
+struct kmem_cache *rmap_desc_cache;
+struct kmem_cache *mmu_page_header_cache;
+
+int tdp_enabled = 0;
+
+#define PT_WRITABLE_SHIFT 1
+#define PT_PRESENT_MASK (1ULL << 0)
+#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(uint64_t)(PAGESIZE-1))
+#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
+#define PT_USER_MASK (1ULL << 2)
+#define ACC_EXEC_MASK 1
+#define ACC_WRITE_MASK PT_WRITABLE_MASK
+#define ACC_USER_MASK PT_USER_MASK
+#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
+
+static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
+ size_t size)
{
+ void *p;
+
+ p = mc->objects[--mc->nobjs];
+ return p;
+}
+
+static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
+ uint64_t *parent_pte)
+{
+ struct kvm_mmu_page *sp;
+
+ sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
+ sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGESIZE);
+ sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGESIZE);
+ set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+ list_insert_head(&vcpu->kvm->arch.active_mmu_pages, sp);
#ifdef XXX
+ /* XXX don't see this used anywhere */
+ INIT_LIST_HEAD(&sp->oos_link);
+#endif /*XXX*/
+ bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
+ sp->multimapped = 0;
+ sp->parent_pte = parent_pte;
+ --vcpu->kvm->arch.n_free_mmu_pages;
+ return sp;
+}
+
+typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
+
+struct kvm_mmu_page *
+shadow_hpa_to_kvmpage(hpa_t shadow_page)
+{
+ /*
+ * XXX - We'll probably need a faster way to do this...
+ * For right now, search all kvm_mmu_page for matching hpa
+ */
+
+}
+
+struct kvm_mmu_page *
+page_header(hpa_t shadow_page)
+{
+ return (struct kvm_mmu_page *)shadow_hpa_to_kvmpage(shadow_page);
+}
+
+static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ mmu_parent_walk_fn fn)
+{
+ struct kvm_pte_chain *pte_chain;
+ struct hlist_node *node;
+ struct kvm_mmu_page *parent_sp;
+ int i;
+
+ if (!sp->multimapped && sp->parent_pte) {
+ parent_sp = page_header(__pa(sp->parent_pte));
+ fn(vcpu, parent_sp);
+ mmu_parent_walk(vcpu, parent_sp, fn);
+ return;
+ }
+ for(pte_chain = list_head(sp->parent_ptes); pte_chain;
+ pte_chain = list_next(sp->parent_ptes, pte_chain)) {
+ for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
+ if (!pte_chain->parent_ptes[i])
+ break;
+ parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
+ fn(vcpu, parent_sp);
+ mmu_parent_walk(vcpu, parent_sp, fn);
+ }
+ }
+}
+
+static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp)
+{
+ mmu_parent_walk(vcpu, sp, unsync_walk_fn);
+ kvm_mmu_update_parents_unsync(sp);
+}
+
+static unsigned kvm_page_table_hashfn(gfn_t gfn)
+{
+ return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
+}
+
+static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
+ gfn_t gfn,
+ gva_t gaddr,
+ unsigned level,
+ int direct,
+ unsigned access,
+ uint64_t *parent_pte)
+{
+ union kvm_mmu_page_role role;
+ unsigned index;
+ unsigned quadrant;
+ struct hlist_head *bucket;
+ struct kvm_mmu_page *sp;
+ struct hlist_node *node, *tmp;
+
+ role = vcpu->arch.mmu.base_role;
+ role.level = level;
+ role.direct = direct;
+ role.access = access;
+ if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
+ quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
+ quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
+ role.quadrant = quadrant;
+ }
+ index = kvm_page_table_hashfn(gfn);
+ bucket = &vcpu->kvm->arch.mmu_page_hash[index];
+ for (sp = list_head(&vcpu->kvm->arch.mmu_page_hash[index]); sp;
+ sp = list_next(&vcpu->kvm->arch.mmu_page_hash[index], sp)) {
+ if (sp->gfn == gfn) {
+ if (sp->unsync)
+ if (kvm_sync_page(vcpu, sp))
+ continue;
+
+ if (sp->role.word != role.word)
+ continue;
+
+ mmu_page_add_parent_pte(vcpu, sp, parent_pte);
+ if (sp->unsync_children) {
+ BT_SET(&vcpu->requests, KVM_REQ_MMU_SYNC);
+ kvm_mmu_mark_parents_unsync(vcpu, sp);
+ }
+ return sp;
+ }
+ }
+#ifdef XXX
+ ++vcpu->kvm->stat.mmu_cache_miss;
+#endif
+ sp = kvm_mmu_alloc_page(vcpu, parent_pte);
+ if (!sp)
+ return sp;
+ sp->gfn = gfn;
+ sp->role = role;
+ list_insert_head(bucket, &sp);
+ if (!direct) {
+ if (rmap_write_protect(vcpu->kvm, gfn))
+ kvm_flush_remote_tlbs(vcpu->kvm);
+#ifdef XXX
+ account_shadowed(vcpu->kvm, gfn);
+#endif /*XXX*/
+ }
+ if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
+ vcpu->arch.mmu.prefetch_page(vcpu, sp);
+ else
+ nonpaging_prefetch_page(vcpu, sp);
+#ifdef XXX
+ trace_kvm_mmu_get_page(sp, true);
+#endif /*XXX*/
+ return sp;
+}
+
+static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
+{
+ int i;
+ gfn_t root_gfn;
+ struct kvm_mmu_page *sp;
+ int direct = 0;
+ uint64_t pdptr;
+
+ root_gfn = vcpu->arch.cr3 >> PAGESHIFT;
+
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+ hpa_t root = vcpu->arch.mmu.root_hpa;
+
+ ASSERT(!VALID_PAGE(root));
+ if (tdp_enabled)
+ direct = 1;
+ if (mmu_check_root(vcpu, root_gfn))
+ return 1;
+ sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
+ PT64_ROOT_LEVEL, direct,
+ ACC_ALL, NULL);
+ root = kvm_va2pa(sp->spt);
+ ++sp->root_count;
+ vcpu->arch.mmu.root_hpa = root;
+ return 0;
+ }
+ direct = !is_paging(vcpu);
+ if (tdp_enabled)
+ direct = 1;
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+ ASSERT(!VALID_PAGE(root));
+ if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
+ pdptr = kvm_pdptr_read(vcpu, i);
+ if (!is_present_gpte(pdptr)) {
+ vcpu->arch.mmu.pae_root[i] = 0;
+ continue;
+ }
+ root_gfn = pdptr >> PAGESHIFT;
+ } else if (vcpu->arch.mmu.root_level == 0)
+ root_gfn = 0;
+ if (mmu_check_root(vcpu, root_gfn))
+ return 1;
+ sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
+ PT32_ROOT_LEVEL, direct,
+ ACC_ALL, NULL);
+ root = __pa(sp->spt);
+ ++sp->root_count;
+ vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
+ }
+ vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+ return 0;
+}
+
+static void mmu_sync_roots(struct kvm_vcpu *vcpu)
+{
+ int i;
+ struct kvm_mmu_page *sp;
+
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return;
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+ hpa_t root = vcpu->arch.mmu.root_hpa;
+ sp = page_header(root);
+ mmu_sync_children(vcpu, sp);
+ return;
+ }
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+ if (root && VALID_PAGE(root)) {
+ root &= PT64_BASE_ADDR_MASK;
+ sp = page_header(root);
+ mmu_sync_children(vcpu, sp);
+ }
+ }
+}
+
+void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
+{
+ spin_lock(&vcpu->kvm->mmu_lock);
+ mmu_sync_roots(vcpu);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+}
+
+static void mmu_destroy_caches(void)
+{
+ if (pte_chain_cache)
+ kmem_cache_destroy(pte_chain_cache);
+ if (rmap_desc_cache)
+ kmem_cache_destroy(rmap_desc_cache);
+ if (mmu_page_header_cache)
+ kmem_cache_destroy(mmu_page_header_cache);
+}
+
+int
+zero_constructor(void *buf, void *arg, int tags)
+{
+ bzero(buf, (size_t)arg);
+}
+
+int kvm_mmu_module_init(void)
+{
pte_chain_cache = kmem_cache_create("kvm_pte_chain",
- sizeof(struct kvm_pte_chain),
- 0, 0, NULL);
+ sizeof(struct kvm_pte_chain), 0,
+ zero_constructor, NULL, NULL,
+ sizeof(struct kvm_pte_chain), NULL, 0);
if (!pte_chain_cache)
goto nomem;
rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
- sizeof(struct kvm_rmap_desc),
- 0, 0, NULL);
+ sizeof(struct kvm_rmap_desc), 0,
+ zero_constructor, NULL, NULL,
+ sizeof(struct kvm_rmap_desc), NULL, 0);
if (!rmap_desc_cache)
goto nomem;
mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
- sizeof(struct kvm_mmu_page),
- 0, 0, NULL);
+ sizeof(struct kvm_mmu_page), 0,
+ zero_constructor, NULL, NULL,
+ sizeof(struct kvm_mmu_page), NULL, 0);
if (!mmu_page_header_cache)
goto nomem;
+#ifdef XXX
+ /* this looks like a garbage collector/reaper. Implement later if needed */
register_shrinker(&mmu_shrinker);
+#endif /*XXX*/
return 0;
nomem:
mmu_destroy_caches();
- return -ENOMEM;
+ return ENOMEM;
+}
+
+/*
+ * List of msr numbers which we expose to userspace through KVM_GET_MSRS
+ * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
+ *
+ * This list is modified at module load time to reflect the
+ * capabilities of the host cpu. This capabilities test skips MSRs that are
+ * kvm-specific. Those are put in the beginning of the list.
+ */
+
+#define MSR_KVM_WALL_CLOCK 0x11
+#define MSR_KVM_SYSTEM_TIME 0x12
+
+#define KVM_SAVE_MSRS_BEGIN 5
+static uint32_t msrs_to_save[] = {
+ MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
+ HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
+ HV_X64_MSR_APIC_ASSIST_PAGE,
+ MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
+ MSR_K6_STAR,
+#ifdef CONFIG_X86_64
+ MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
+#endif
+ MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
+};
+
+static unsigned num_msrs_to_save;
+
+static uint32_t emulated_msrs[] = {
+ MSR_IA32_MISC_ENABLE,
+};
+
+#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0]))
+
+uint64_t native_read_msr_safe(unsigned int msr,
+ int *err)
+{
+ DECLARE_ARGS(val, low, high);
+
+#ifdef CONFIG_SOLARIS
+ {
+ on_trap_data_t otd;
+
+ if (on_trap(&otd, OT_DATA_ACCESS) == 0) {
+ native_read_msr(msr);
+ } else {
+ *err = EINVAL; /* XXX probably not right... */
+ }
+ no_trap();
+ }
#else
- return DDI_SUCCESS;
-#endif /*XXX*/
+ asm volatile("2: rdmsr ; xor %[err],%[err]\n"
+ "1:\n\t"
+ ".section .fixup,\"ax\"\n\t"
+ "3: mov %[fault],%[err] ; jmp 1b\n\t"
+ ".previous\n\t"
+ _ASM_EXTABLE(2b, 3b)
+ : [err] "=r" (*err), EAX_EDX_RET(val, low, high)
+ : "c" (msr), [fault] "i" (-EIO));
+#endif /*CONFIG_SOLARIS*/
+ return EAX_EDX_VAL(val, low, high);
+}
+
+/* Can be uninlined because referenced by paravirt */
+int native_write_msr_safe(unsigned int msr,
+ unsigned low, unsigned high)
+{
+ int err;
+#ifdef CONFIG_SOLARIS
+ {
+ on_trap_data_t otd;
+
+ if (on_trap(&otd, OT_DATA_ACCESS) == 0) {
+ native_write_msr(msr, low, high);
+ } else {
+ err = EINVAL; /* XXX probably not right... */
+ }
+ no_trap();
+ }
+#else
+ asm volatile("2: wrmsr ; xor %[err],%[err]\n"
+ "1:\n\t"
+ ".section .fixup,\"ax\"\n\t"
+ "3: mov %[fault],%[err] ; jmp 1b\n\t"
+ ".previous\n\t"
+ _ASM_EXTABLE(2b, 3b)
+ : [err] "=a" (err)
+ : "c" (msr), "0" (low), "d" (high),
+ [fault] "i" (-EIO)
+ : "memory");
+#endif /*CONFIG_SOLARIS*/
+ return err;
+}
+
+static void kvm_init_msr_list(void)
+{
+ uint32_t dummy[2];
+ unsigned i, j;
+
+ /* skip the first msrs in the list. KVM-specific */
+ for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) {
+ if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
+ continue;
+ if (j < i)
+ msrs_to_save[j] = msrs_to_save[i];
+ j++;
+ }
+ num_msrs_to_save = j;
+}
+
+static uint64_t shadow_trap_nonpresent_pte;
+static uint64_t shadow_notrap_nonpresent_pte;
+static uint64_t shadow_base_present_pte;
+static uint64_t shadow_nx_mask;
+static uint64_t shadow_x_mask; /* mutual exclusive with nx_mask */
+static uint64_t shadow_user_mask;
+static uint64_t shadow_accessed_mask;
+static uint64_t shadow_dirty_mask;
+
+void kvm_mmu_set_nonpresent_ptes(uint64_t trap_pte, uint64_t notrap_pte)
+{
+ shadow_trap_nonpresent_pte = trap_pte;
+ shadow_notrap_nonpresent_pte = notrap_pte;
+}
+
+void kvm_mmu_set_base_ptes(uint64_t base_pte)
+{
+ shadow_base_present_pte = base_pte;
+}
+
+void kvm_mmu_set_mask_ptes(uint64_t user_mask, uint64_t accessed_mask,
+ uint64_t dirty_mask, uint64_t nx_mask, uint64_t x_mask)
+{
+ shadow_user_mask = user_mask;
+ shadow_accessed_mask = accessed_mask;
+ shadow_dirty_mask = dirty_mask;
+ shadow_nx_mask = nx_mask;
+ shadow_x_mask = x_mask;
+}
+
+#define PT64_PT_BITS 9
+#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
+#define PT32_PT_BITS 10
+#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
+
+#define PT_WRITABLE_SHIFT 1
+
+#define PT_PRESENT_MASK (1ULL << 0)
+#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
+#define PT_USER_MASK (1ULL << 2)
+#define PT_PWT_MASK (1ULL << 3)
+#define PT_PCD_MASK (1ULL << 4)
+#define PT_ACCESSED_SHIFT 5
+#define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT)
+#define PT_DIRTY_MASK (1ULL << 6)
+#define PT_PAGE_SIZE_MASK (1ULL << 7)
+#define PT_PAT_MASK (1ULL << 7)
+#define PT_GLOBAL_MASK (1ULL << 8)
+#define PT64_NX_SHIFT 63
+#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT)
+
+#define PT_PAT_SHIFT 7
+#define PT_DIR_PAT_SHIFT 12
+#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
+
+#define PT32_DIR_PSE36_SIZE 4
+#define PT32_DIR_PSE36_SHIFT 13
+#define PT32_DIR_PSE36_MASK \
+ (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
+
+#define PT64_ROOT_LEVEL 4
+#define PT32_ROOT_LEVEL 2
+#define PT32E_ROOT_LEVEL 3
+
+#define PT_PDPE_LEVEL 3
+#define PT_DIRECTORY_LEVEL 2
+#define PT_PAGE_TABLE_LEVEL 1
+
+#define PFERR_PRESENT_MASK (1U << 0)
+#define PFERR_WRITE_MASK (1U << 1)
+#define PFERR_USER_MASK (1U << 2)
+#define PFERR_RSVD_MASK (1U << 3)
+#define PFERR_FETCH_MASK (1U << 4)
+
+static void kvm_timer_init(void)
+{
+ int cpu;
+
+ /*
+ * XXX We assume that any machine running solaris kvm
+ * has constant time stamp counter increment rate.
+ * This will be true for all but older machines.
+ */
+#ifndef CONFIG_SOLARIS
+ for_each_possible_cpu(cpu)
+ per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
+#else
+ /* assume pi_clock in mhz */
+ /* cpu_tsc_khz = (CPU)->cpu_type_info.pi_clock * 1000;*/
+#endif /*CONFIG_SOLARIS*/
}
int kvm_arch_init(void *opaque)
{
int r;
struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
+ volatile int x; /* XXX - dtrace return probe missing */
if (ops->cpu_has_kvm_support()) {
cmn_err(CE_WARN, "kvm: no hardware support\n");
@@ -454,7 +1140,6 @@ int kvm_arch_init(void *opaque)
if (r)
goto out;
-#ifdef XXX
kvm_init_msr_list();
kvm_x86_ops = ops;
@@ -463,16 +1148,20 @@ int kvm_arch_init(void *opaque)
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
PT_DIRTY_MASK, PT64_NX_MASK, 0);
-
kvm_timer_init();
-#endif
+ x = 10; /*XXX*/
return 0;
out:
+ x = 20; /*XXX*/
return r;
}
+caddr_t bad_page; /* XXX page_t on linux... */
+pfn_t bad_pfn;
+kmem_cache_t *kvm_vcpu_cache;
+
int kvm_init(void *opaque, unsigned int vcpu_size)
{
int r;
@@ -482,33 +1171,28 @@ int kvm_init(void *opaque, unsigned int vcpu_size)
if (r != DDI_SUCCESS)
return (r);
-#ifdef XXX
- if (r)
- goto out_fail;
- bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ bad_page = kmem_zalloc(PAGESIZE, KM_SLEEP);
if (bad_page == NULL) {
- r = -ENOMEM;
+ r = ENOMEM;
goto out;
}
- bad_pfn = page_to_pfn(bad_page);
+ bad_pfn = hat_getpfnum(kas.a_hat, bad_page);
+#ifdef XXX
if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
r = -ENOMEM;
goto out_free_0;
}
-
#endif /*XXX*/
-
r = kvm_arch_hardware_setup();
- return (r);
-#ifdef XXX
- if (r < 0)
+ if (r != DDI_SUCCESS)
goto out_free_0a;
+#ifdef XXX
for_each_online_cpu(cpu) {
smp_call_function_single(cpu,
kvm_arch_check_processor_compat,
@@ -516,7 +1200,10 @@ int kvm_init(void *opaque, unsigned int vcpu_size)
if (r < 0)
goto out_free_1;
}
+#endif /*XXX*/
+
+#ifdef XXX
r = register_cpu_notifier(&kvm_cpu_notifier);
if (r)
goto out_free_2;
@@ -529,64 +1216,84 @@ int kvm_init(void *opaque, unsigned int vcpu_size)
r = sysdev_register(&kvm_sysdev);
if (r)
goto out_free_4;
-
+#endif /*XXX*/
/* A kmem cache lets us meet the alignment requirements of fx_save. */
kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
__alignof__(struct kvm_vcpu),
- 0, NULL);
+ NULL, NULL, NULL, NULL, NULL, 0);
if (!kvm_vcpu_cache) {
- r = -ENOMEM;
+ r = ENOMEM;
goto out_free_5;
}
+#ifdef XXX
kvm_chardev_ops.owner = module;
kvm_vm_fops.owner = module;
kvm_vcpu_fops.owner = module;
r = misc_register(&kvm_dev);
if (r) {
- printk(KERN_ERR "kvm: misc device register failed\n");
+ cmn_err(CE_WARN, "kvm: misc device register failed\n");
goto out_free;
}
+ /*
+ * XXX - if kernel preemption occurs, we probably need
+ * to implement these, and add hooks to the preemption code.
+ * For right now, we'll make the totally unreasonable
+ * assumption that we won't be preempted while in the
+ * kernel, i.e., no realtime threads are running
+ */
kvm_preempt_ops.sched_in = kvm_sched_in;
kvm_preempt_ops.sched_out = kvm_sched_out;
kvm_init_debug();
+#endif /*XXX*/
return 0;
out_free:
kmem_cache_destroy(kvm_vcpu_cache);
out_free_5:
+#ifdef XXX
sysdev_unregister(&kvm_sysdev);
out_free_4:
sysdev_class_unregister(&kvm_sysdev_class);
out_free_3:
unregister_reboot_notifier(&kvm_reboot_notifier);
unregister_cpu_notifier(&kvm_cpu_notifier);
+#endif /*XXX*/
out_free_2:
out_free_1:
+#ifdef XXX
kvm_arch_hardware_unsetup();
+#endif /*XXX*/
out_free_0a:
+#ifdef XXX
free_cpumask_var(cpus_hardware_enabled);
+#endif /*XXX*/
out_free_0:
- __free_page(bad_page);
+ kmem_free(bad_page, PAGESIZE);
out:
+#ifdef XXX
kvm_arch_exit();
+#endif
out_fail:
return r;
-#endif /*XXX*/
}
-extern unsigned long *vmx_io_bitmap_a;
-extern unsigned long *vmx_io_bitmap_b;
-extern unsigned long *vmx_msr_bitmap_legacy;
-extern unsigned long *vmx_msr_bitmap_longmode;
+extern unsigned long vmx_io_bitmap_a[];
+extern unsigned long vmx_io_bitmap_b[];
+extern unsigned long vmx_msr_bitmap_legacy[];
+extern unsigned long vmx_msr_bitmap_longmode[];
+
+static inline int cpu_has_vmx_msr_bitmap(void)
+{
+ return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
+}
static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, uint32_t msr)
{
-#ifdef XXX
int f = sizeof(unsigned long);
if (!cpu_has_vmx_msr_bitmap())
@@ -598,14 +1305,13 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, uint32_t
* We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
*/
if (msr <= 0x1fff) {
- __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
- __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
+ BT_CLEAR(msr_bitmap + 0x000 / f, msr); /* read-low */
+ BT_CLEAR(msr_bitmap + 0x800 / f, msr); /* write-low */
} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
msr &= 0x1fff;
- __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
- __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
+ BT_CLEAR(msr_bitmap + 0x400 / f, msr); /* read-high */
+ BT_CLEAR(msr_bitmap + 0xc00 / f, msr); /* write-high */
}
-#endif /*XXX*/
}
static void vmx_disable_intercept_for_msr(uint32_t msr, int longmode_only)
@@ -615,16 +1321,52 @@ static void vmx_disable_intercept_for_msr(uint32_t msr, int longmode_only)
__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
}
+static struct kvm_shared_msrs_global shared_msrs_global;
+
+void kvm_define_shared_msr(unsigned slot, uint32_t msr)
+{
+ if (slot >= shared_msrs_global.nr)
+ shared_msrs_global.nr = slot + 1;
+ shared_msrs_global.msrs[slot] = msr;
+#ifdef XXX
+ /* we need ensured the shared_msr_global have been updated */
+ smp_wmb();
+#endif /*XXX*/
+}
+
+static uint64_t host_efer;
+
+/*
+ * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
+ * away by decrementing the array size.
+ */
+static const uint32_t vmx_msr_index[] = {
+#ifdef CONFIG_X86_64
+ MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
+#endif
+ MSR_EFER, MSR_TSC_AUX, MSR_K6_STAR,
+};
+#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
+#define VMX_NR_VPIDS (1 << 16)
+ulong_t *vmx_vpid_bitmap;
+size_t vpid_bitmap_words;
+kmutex_t vmx_vpid_lock;
+
+void kvm_disable_tdp(void)
+{
+ tdp_enabled = 0;
+}
+
static int vmx_init(void)
{
int r, i;
-#ifdef XXX
+
rdmsrl_safe(MSR_EFER, &host_efer);
for (i = 0; i < NR_VMX_MSR; ++i)
kvm_define_shared_msr(i, vmx_msr_index[i]);
-#endif /*XXX*/
+#ifdef XXX
vmx_io_bitmap_a = (unsigned long *)kmem_zalloc(PAGESIZE, KM_SLEEP);
if (!vmx_io_bitmap_a)
return ENOMEM;
@@ -643,10 +1385,9 @@ static int vmx_init(void)
vmx_msr_bitmap_longmode = (unsigned long *)kmem_zalloc(PAGESIZE, KM_SLEEP);
if (!vmx_msr_bitmap_longmode) {
- r = ENOMEM;
- goto out2;
+ r = ENOMEM; goto out2;
}
-
+#endif
/*
* Allow direct access to the PC debug port (it is often used for I/O
* delays, but the vmexits simply slow things down).
@@ -659,9 +1400,7 @@ static int vmx_init(void)
memset(vmx_msr_bitmap_legacy, 0xff, PAGESIZE);
memset(vmx_msr_bitmap_longmode, 0xff, PAGESIZE);
-#ifdef XXX
- set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
-#endif /*XXX*/
+ BT_SET(vmx_vpid_bitmap, 0); /* 0 is reserved for host */
r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx));
@@ -675,7 +1414,6 @@ static int vmx_init(void)
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, 0);
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, 0);
-#ifdef XXX
if (enable_ept) {
bypass_guest_pf = 0;
kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
@@ -686,10 +1424,10 @@ static int vmx_init(void)
} else
kvm_disable_tdp();
+#ifdef XXX
if (bypass_guest_pf)
kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
#endif /*XXX*/
-
return 0;
out3:
@@ -718,10 +1456,20 @@ _init(void)
ddi_soft_state_fini(&kvm_state);
}
+ if (enable_vpid) {
+ vpid_bitmap_words = howmany(VMX_NR_VPIDS, BT_NBIPUL);
+ vmx_vpid_bitmap = kmem_zalloc(sizeof(ulong_t)*vpid_bitmap_words, KM_SLEEP);
+ mutex_init(&vmx_vpid_lock, NULL, MUTEX_DRIVER, NULL);
+ }
+
mutex_init(&kvm_lock, NULL, MUTEX_DRIVER, 0); /* XXX */
kvm_x86_ops = &vmx_x86_ops;
if ((r = vmx_init()) != DDI_SUCCESS) {
mutex_destroy(&kvm_lock);
+ if (vmx_vpid_bitmap) {
+ kmem_free(vmx_vpid_bitmap, sizeof(ulong_t)*vpid_bitmap_words);
+ mutex_destroy(&vmx_vpid_lock);
+ }
mod_remove(&modlinkage);
ddi_soft_state_fini(&kvm_state);
return (r);
@@ -953,7 +1701,6 @@ static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
{
return container_of(mn, struct kvm, mmu_notifier);
}
-#endif
static void
kvm_mmu_pages_init(struct kvm_mmu_page *parent,
@@ -981,21 +1728,8 @@ mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
return (pvec->nr == KVM_PAGE_ARRAY_NR);
}
-static uint64_t shadow_trap_nonpresent_pte;
-static uint64_t shadow_notrap_nonpresent_pte;
-
extern pfn_t hat_getpfnum(struct hat *hat, caddr_t);
-#ifdef XXX
-
-static inline struct kvm_mmu_page *
-page_header(hpa_t shadow_page)
-{
- struct page *page = pfn_to_page(shadow_page >> PAGESHIFT);
-
- return (struct kvm_mmu_page *)page_private(page);
-}
-
static int
is_large_pte(uint64_t pte)
{
@@ -1009,6 +1743,7 @@ is_shadow_present_pte(uint64_t pte)
&& pte != shadow_notrap_nonpresent_pte;
}
+
static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
struct kvm_mmu_pages *pvec)
{
@@ -1042,7 +1777,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
}
}
- if (find_first_bit(sp->unsync_child_bitmap, 512) == 512)
+ if (bt_getlowbit(sp->unsync_child_bitmap, 0, 512) == 512)
sp->unsync_children = 0;
return nr_unsync_leaf;
@@ -1228,7 +1963,7 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
kvm->mmu_notifier_count--;
spin_unlock(&kvm->mmu_lock);
- BUG_ON(kvm->mmu_notifier_count < 0);
+ assert(kvm->mmu_notifier_count >= 0);
}
static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
@@ -1257,18 +1992,14 @@ kvm_arch_flush_shadow(struct kvm *kvm)
kvm_reload_remote_mmus(kvm);
}
-#ENDIF /*XXX*/
-
static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
struct mm_struct *mm)
{
struct kvm *kvm = mmu_notifier_to_kvm(mn);
int idx;
-#ifdef XXX
idx = srcu_read_lock(&kvm->srcu);
kvm_arch_flush_shadow(kvm);
srcu_read_unlock(&kvm->srcu, idx);
-#endif /*XXX*/
}
static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
@@ -1285,7 +2016,6 @@ static int kvm_init_mmu_notifier(struct kvm *kvm)
kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
}
-
#else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
static int kvm_init_mmu_notifier(struct kvm *kvm)
@@ -1345,7 +2075,8 @@ kvm_create_vm(void)
(void *)ipltospl(DISP_LEVEL));
#ifdef XXX
kvm_eventfd_init(kvmp);
-#endif
+#endif /*XXX*/
+
mutex_init(&kvmp->lock, NULL, MUTEX_DRIVER, NULL);
mutex_init(&kvmp->irq_lock, NULL, MUTEX_DRIVER, NULL);
mutex_init(&kvmp->slots_lock, NULL, MUTEX_DRIVER, NULL);
@@ -1436,10 +2167,9 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
used_pages = max(0, used_pages);
-#ifdef XXX
/* for the time being, assume that address space will only grow */
/* larger. The following code will be added later. */
-
+#ifdef XXX
/*
* If we set the number of mmu pages to be smaller be than the
* number of actived pages , we must to free some mmu pages before we
@@ -1448,7 +2178,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
if (used_pages > kvm_nr_mmu_pages) {
while (used_pages > kvm_nr_mmu_pages &&
- !list_empty(&kvm->arch.active_mmu_pages)) {
+ !list_is_empty(&kvm->arch.active_mmu_pages)) {
struct kvm_mmu_page *page;
page = container_of(kvm->arch.active_mmu_pages.prev,
@@ -1483,7 +2213,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
old.npages * PAGESIZE);
up_write(&current->mm->mmap_sem);
if (ret < 0)
- printk(KERN_WARNING
+ cmn_err(CE_WARN,
"kvm_vm_ioctl_set_memory_region: "
"failed to munmap memory\n");
}
@@ -2004,6 +2734,31 @@ static inline uint32_t bit(int bitno)
return 1 << (bitno & 31);
}
+static inline int cpu_has_vmx_ept_1g_page(void)
+{
+ return !!(vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT);
+}
+
+static int vmx_get_lpage_level(void)
+{
+ if (enable_ept && !cpu_has_vmx_ept_1g_page())
+ return PT_DIRECTORY_LEVEL;
+ else
+ /* For shadow and EPT supported 1GB page */
+ return PT_PDPE_LEVEL;
+}
+
+static inline int cpu_has_vmx_rdtscp(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_RDTSCP;
+}
+
+static int vmx_rdtscp_supported(void)
+{
+ return cpu_has_vmx_rdtscp();
+}
+
#define F(x) bit(X86_FEATURE_##x)
@@ -2058,10 +2813,12 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, uint32_t function,
F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) |
0 /* SKINIT */ | 0 /* WDT */;
+ volatile int x; /* XXX - dtrace return probe missing */
+
/* all calls to cpuid_count() should be made on the same cpu */
/* XXX - right now, system panics at ddi_exit_critical() */
/* XXX - to run everything on same cpu, bind qemu at startup */
- /*ddic = ddi_enter_critical(); */
+ kpreempt_disable();
do_cpuid_1_ent(entry, function, index);
++*nent;
@@ -2135,6 +2892,8 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, uint32_t function,
}
/*XXX - see comment above for ddi_enter_critical() */
/*ddi_exit_critical(ddic);*/
+ kpreempt_enable();
+ x = 10; /*XXX*/
}
#undef F
@@ -2146,13 +2905,15 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
struct kvm_cpuid_entry2 *cpuid_entries;
int limit, nent = 0, r = E2BIG;
uint32_t func;
+ int allocsize = 0;
if (cpuid->nent < 1)
goto out;
if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
cpuid->nent = KVM_MAX_CPUID_ENTRIES;
r = ENOMEM;
- cpuid_entries = kmem_alloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent, KM_SLEEP);
+ allocsize = sizeof(struct kvm_cpuid_entry2)*cpuid->nent;
+ cpuid_entries = kmem_alloc(allocsize, KM_SLEEP);
if (!cpuid_entries)
goto out;
@@ -2182,16 +2943,11 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
r = 0;
out_free:
- kmem_free(cpuid_entries, sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
+ kmem_free(cpuid_entries, allocsize);
out:
return r;
}
-struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
-{
- return container_of(vcpu, struct vcpu_vmx, vcpu);
-}
-
#define __ex(x) __kvm_handle_fault_on_reboot(x)
@@ -2199,6 +2955,7 @@ void vmcs_clear(struct vmcs *vmcs)
{
unsigned char error;
uint64_t phys_addr = (hat_getpfnum(kas.a_hat, (char *)vmcs)<<PAGESHIFT)|((uint64_t)vmcs&PAGEOFFSET);
+ volatile int x; /*XXX - dtrace return probe missing */
asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "\n\tsetna %0\n"
: "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
@@ -2206,6 +2963,7 @@ void vmcs_clear(struct vmcs *vmcs)
if (error)
cmn_err(CE_PANIC, "kvm: vmclear fail: %p/%llx\n",
vmcs, phys_addr);
+ x = 10; /*XXX*/
}
static void __vcpu_clear(void *arg)
@@ -2236,16 +2994,21 @@ static void vcpu_clear(struct vcpu_vmx *vmx)
}
+
+static void vmwrite_error(unsigned long field, unsigned long value)
+{
+ cmn_err(CE_WARN, "vmwrite error: reg %lx value %lx (err %d)\n",
+ field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
+}
+
void vmcs_writel(unsigned long field, unsigned long value)
{
unsigned char error;
asm volatile (ASM_VMX_VMWRITE_RAX_RDX "\n\tsetna %0"
: "=q"(error) : "a"(value), "d"(field) : "cc");
-#ifdef XXX
- if (unlikely(error))
+ if ((error))
vmwrite_error(field, value);
-#endif
}
unsigned long vmcs_readl(unsigned long field)
@@ -2257,7 +3020,6 @@ unsigned long vmcs_readl(unsigned long field)
return value;
}
-
uint64_t vmcs_read64(unsigned long field)
{
#ifdef CONFIG_X86_64
@@ -2267,6 +3029,11 @@ uint64_t vmcs_read64(unsigned long field)
#endif
}
+uint16_t vmcs_read16(unsigned long field)
+{
+ return vmcs_readl(field);
+}
+
void vmcs_write64(unsigned long field, uint64_t value)
{
vmcs_writel(field, value);
@@ -2276,15 +3043,208 @@ void vmcs_write64(unsigned long field, uint64_t value)
#endif
}
+
+void vmcs_write16(unsigned long field, uint16_t value)
+{
+ vmcs_writel(field, value);
+}
+
+/*
+ * writes 'guest_tsc' into guest's timestamp counter "register"
+ * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc
+ */
+static void guest_write_tsc(uint64_t guest_tsc, uint64_t host_tsc)
+{
+ vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc);
+}
+
+static inline int cpu_has_secondary_exec_ctrls(void)
+{
+ return vmcs_config.cpu_based_exec_ctrl &
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+}
+
+int vm_need_virtualize_apic_accesses(struct kvm *kvm)
+{
+ return flexpriority_enabled && irqchip_in_kernel(kvm);
+}
+
+extern uint64_t kvm_va2pa(caddr_t va);
+/*
+ * Sets up the vmcs for emulated real mode.
+ */
+int vmx_vcpu_setup(struct vcpu_vmx *vmx)
+{
+ uint32_t host_sysenter_cs, msr_low, msr_high;
+ uint32_t junk;
+ uint64_t host_pat, tsc_this, tsc_base;
+ unsigned long a;
+ struct descriptor_table dt;
+ int i;
+ unsigned long kvm_vmx_return;
+ uint32_t exec_control;
+
+ /* I/O */
+ vmcs_write64(IO_BITMAP_A, kvm_va2pa((caddr_t)vmx_io_bitmap_a));
+ vmcs_write64(IO_BITMAP_B, kvm_va2pa((caddr_t)vmx_io_bitmap_b));
+
+ if (cpu_has_vmx_msr_bitmap())
+ vmcs_write64(MSR_BITMAP, kvm_va2pa((caddr_t)vmx_msr_bitmap_legacy));
+
+ vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
+
+ /* Control */
+ vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
+ vmcs_config.pin_based_exec_ctrl);
+
+ exec_control = vmcs_config.cpu_based_exec_ctrl;
+#ifdef XXX
+ if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
+ exec_control &= ~CPU_BASED_TPR_SHADOW;
+#ifdef CONFIG_X86_64
+ exec_control |= CPU_BASED_CR8_STORE_EXITING |
+ CPU_BASED_CR8_LOAD_EXITING;
+#endif
+ }
+#endif /*XXX*/
+
+ if (!enable_ept)
+ exec_control |= CPU_BASED_CR3_STORE_EXITING |
+ CPU_BASED_CR3_LOAD_EXITING |
+ CPU_BASED_INVLPG_EXITING;
+ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
+
+ if (cpu_has_secondary_exec_ctrls()) {
+ exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
+ if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
+ exec_control &=
+ ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ if (vmx->vpid == 0)
+ exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
+ if (!enable_ept) {
+ exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
+ enable_unrestricted_guest = 0;
+ }
+ if (!enable_unrestricted_guest)
+ exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
+#ifdef XXX
+ if (!ple_gap)
+#endif /*XXX*/
+ exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+ }
+
+#ifdef XXX
+ if (ple_gap) {
+ vmcs_write32(PLE_GAP, ple_gap);
+ vmcs_write32(PLE_WINDOW, ple_window);
+ }
+#endif /*XXX*/
+
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
+ vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
+
+ vmcs_writel(HOST_CR0, getcr0()); /* 22.2.3 */
+ vmcs_writel(HOST_CR4, getcr4()); /* 22.2.3, 22.2.5 */
+ vmcs_writel(HOST_CR3, getcr3()); /* 22.2.3 FIXME: shadow tables */
+
+ vmcs_write16(HOST_CS_SELECTOR, GDT_KCODE); /* 22.2.4 */
+ vmcs_write16(HOST_DS_SELECTOR, GDT_KDATA); /* 22.2.4 */
+ vmcs_write16(HOST_ES_SELECTOR, GDT_KDATA); /* 22.2.4 */
+ vmcs_write16(HOST_FS_SELECTOR, kvm_read_fs()); /* 22.2.4 */
+ vmcs_write16(HOST_GS_SELECTOR, kvm_read_gs()); /* 22.2.4 */
+ vmcs_write16(HOST_SS_SELECTOR, GDT_KDATA); /* 22.2.4 */
+#ifdef CONFIG_X86_64
+ rdmsrl(MSR_FS_BASE, a);
+ vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
+ rdmsrl(MSR_GS_BASE, a);
+ vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
+#else
+ vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
+ vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
+#endif
+
+ vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
+
+ kvm_get_idt(&dt);
+ vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */
+
+ asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
+ vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
+ vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
+
+ rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
+ vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
+ rdmsrl(MSR_IA32_SYSENTER_ESP, a);
+ vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */
+ rdmsrl(MSR_IA32_SYSENTER_EIP, a);
+ vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */
+
+ if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
+ rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
+ host_pat = msr_low | ((uint64_t) msr_high << 32);
+ vmcs_write64(HOST_IA32_PAT, host_pat);
+ }
+ if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
+ rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
+ host_pat = msr_low | ((uint64_t) msr_high << 32);
+ /* Write the default value follow host pat */
+ vmcs_write64(GUEST_IA32_PAT, host_pat);
+ /* Keep arch.pat sync with GUEST_IA32_PAT */
+ vmx->vcpu.arch.pat = host_pat;
+ }
+
+ for (i = 0; i < NR_VMX_MSR; ++i) {
+ uint32_t index = vmx_msr_index[i];
+ uint32_t data_low, data_high;
+ int j = vmx->nmsrs;
+
+ if (rdmsr_safe(index, &data_low, &data_high) < 0)
+ continue;
+ if (wrmsr_safe(index, data_low, data_high) < 0)
+ continue;
+ vmx->guest_msrs[j].index = i;
+ vmx->guest_msrs[j].data = 0;
+ vmx->guest_msrs[j].mask = -1ull;
+ ++vmx->nmsrs;
+ }
+
+ vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
+
+ /* 22.2.1, 20.8.1 */
+ vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
+
+ vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
+ vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
+ if (enable_ept)
+ vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
+ vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
+
+ tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc;
+ rdtscll(tsc_this);
+ if (tsc_this < vmx->vcpu.kvm->arch.vm_init_tsc)
+ tsc_base = tsc_this;
+
+ guest_write_tsc(0, tsc_base);
+ return 0;
+}
+
/*
* Switches to specified vcpu, until a matching vcpu_put(), but assumes
* vcpu mutex is already taken.
*/
void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
+ /* XXX - the following assignment assumes vmx contains vcpu */
+ /* at the beginning of the structure */
+
+ struct vcpu_vmx *vmx = (struct vcpu_vmx *)vcpu;
uint64_t phys_addr = (hat_getpfnum(kas.a_hat, (char *)vmx->vmcs)<<PAGESHIFT)|((uint64_t)(vmx->vmcs)&0xfff);
uint64_t tsc_this, delta, new_offset;
+ volatile int x; /* XXX - dtrace return probe missing */
if (vcpu->cpu != cpu) {
vcpu_clear(vmx);
@@ -2293,10 +3253,10 @@ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
#endif /*XXX*/
BT_SET(&vcpu->requests, KVM_REQ_TLB_FLUSH);
#ifdef XXX
- local_irq_disable();
+ kpreempt_disable();
list_add(&vmx->local_vcpus_link,
&per_cpu(vcpus_on_cpu, cpu));
- local_irq_enable();
+ kpreempt_enable();
#endif /*XXX*/
}
@@ -2351,6 +3311,8 @@ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
vmcs_write64(TSC_OFFSET, new_offset);
}
}
+ x = 10;
+ return;
}
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -2382,6 +3344,15 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
#endif /*XXX*/
}
+/* straight from xen code... */
+void
+ldt_load(void)
+{
+ *((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = curproc->p_ldt_desc;
+ wr_ldtr(ULDT_SEL);
+}
+
+
static void reload_tss(void)
{
/*
@@ -2396,7 +3367,7 @@ static void reload_tss(void)
load_TR_desc();
}
-static inline int is_long_mode(struct kvm_vcpu *vcpu)
+int is_long_mode(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_X86_64
return vcpu->arch.efer & EFER_LMA;
@@ -2405,6 +3376,27 @@ static inline int is_long_mode(struct kvm_vcpu *vcpu)
#endif
}
+#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS
+#define KVM_POSSIBLE_CR4_GUEST_BITS \
+ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
+ | X86_CR4_OSXMMEXCPT | X86_CR4_PGE)
+
+ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
+{
+ uint64_t tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS;
+#ifdef XXX
+ if (tmask & vcpu->arch.cr4_guest_owned_bits)
+ kvm_x86_ops->decache_cr4_guest_bits(vcpu);
+#endif /*XXX*/
+ return vcpu->arch.cr4 & mask;
+}
+
+static inline int is_pae(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr4_bits(vcpu, X86_CR4_PAE);
+}
+
+
static void
__vmx_load_host_state(struct vcpu_vmx *vmx)
{
@@ -2425,18 +3417,15 @@ __vmx_load_host_state(struct vcpu_vmx *vmx)
* If we have to reload gs, we must take care to
* preserve our gs base.
*/
-#ifdef XXX
- local_irq_save(flags);
-#endif /*XXX*/
+ kpreempt_disable();
kvm_load_gs(vmx->host_state.gs_sel);
#ifdef CONFIG_X86_64
wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
#endif
-#ifdef XXX
- local_irq_restore(flags);
-#endif /*XXX*/
+ kpreempt_enable();
}
reload_tss();
+
#ifdef CONFIG_X86_64
if (is_long_mode(&vmx->vcpu)) {
rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
@@ -2445,9 +3434,16 @@ __vmx_load_host_state(struct vcpu_vmx *vmx)
#endif
}
+static void vmx_load_host_state(struct vcpu_vmx *vmx)
+{
+ kpreempt_disable();
+ __vmx_load_host_state(vmx);
+ kpreempt_enable();
+}
+
void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
- __vmx_load_host_state(to_vmx(vcpu));
+ __vmx_load_host_state((struct vcpu_vmx *)vcpu);
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -2488,9 +3484,75 @@ void vcpu_put(struct kvm_vcpu *vcpu)
mutex_exit(&vcpu->mutex);
}
+/* find an entry with matching function, matching index (if needed), and that
+ * should be read next (if it's stateful) */
+static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
+ uint32_t function, uint32_t index)
+{
+ if (e->function != function)
+ return 0;
+ if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
+ return 0;
+ if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
+ !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
+ return 0;
+ return 1;
+}
+
+struct kvm_pic *pic_irqchip(struct kvm *kvm);
+extern int irqchip_in_kernel(struct kvm *kvm);
+
+static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
+{
+ struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
+ int j, nent = vcpu->arch.cpuid_nent;
+
+ e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
+ /* when no next entry is found, the current entry[i] is reselected */
+ for (j = i + 1; ; j = (j + 1) % nent) {
+ struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
+ if (ej->function == e->function) {
+ ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
+ return j;
+ }
+ }
+ return 0; /* silence gcc, even though control never reaches here */
+}
+
+struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
+ uint32_t function, uint32_t index)
+{
+ int i;
+ struct kvm_cpuid_entry2 *best = NULL;
+
+ for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
+ struct kvm_cpuid_entry2 *e;
+
+ e = &vcpu->arch.cpuid_entries[i];
+ if (is_matching_cpuid_entry(e, function, index)) {
+ if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
+ move_to_next_stateful_cpuid_entry(vcpu, i);
+ best = e;
+ break;
+ }
+ /*
+ * Both basic or both extended?
+ */
+ if (((e->function ^ function) & 0x80000000) == 0)
+ if (!best || e->function > best->function)
+ best = e;
+ }
+ return best;
+}
+
+#define APIC_LVT_NUM 6
+/* 14 is the version for Xeon and Pentium 8.4.8*/
+#define APIC_VERSION (0x14UL | ((APIC_LVT_NUM - 1) << 16))
+
+extern void apic_set_reg(struct kvm_lapic *apic, int reg_off, uint32_t val);
+
void kvm_apic_set_version(struct kvm_vcpu *vcpu)
{
-#ifdef XXX
struct kvm_lapic *apic = vcpu->arch.apic;
struct kvm_cpuid_entry2 *feat;
uint32_t v = APIC_VERSION;
@@ -2502,7 +3564,6 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu)
if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31))))
v |= APIC_LVR_DIRECTED_EOI;
apic_set_reg(apic, APIC_LVR, v);
-#endif /*XXX*/
}
@@ -2552,48 +3613,5166 @@ out:
return r;
}
+static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+#ifdef XXX
+ if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail))
+ kvm_x86_ops->cache_reg(vcpu, reg);
+#endif /*XXX*/
+
+ return vcpu->arch.regs[reg];
+}
+
+void kvm_register_write(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg,
+ unsigned long val)
+{
+ vcpu->arch.regs[reg] = val;
+#ifdef XXX
+ __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
+ __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+#endif
+}
+
+unsigned long kvm_rip_read(struct kvm_vcpu *vcpu)
+{
+ return kvm_register_read(vcpu, VCPU_REGS_RIP);
+}
+
+void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ kvm_register_write(vcpu, VCPU_REGS_RIP, val);
+}
+
+unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
+{
+ unsigned long rflags;
+
+ rflags = kvm_x86_ops->get_rflags(vcpu);
+#ifdef XXX
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+ rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF);
+#endif /*XXX*/
+ return rflags;
+}
+
+int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+ vcpu_load(vcpu);
+
+ regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
+ regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
+ regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
+ regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
+ regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
+ regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
+#ifdef CONFIG_X86_64
+ regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
+ regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
+ regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
+ regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
+ regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
+ regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
+ regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
+ regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
+#endif
+
+ regs->rip = kvm_rip_read(vcpu);
+ regs->rflags = kvm_get_rflags(vcpu);
+
+ vcpu_put(vcpu);
+
+ return 0;
+}
+
+#define VMX_SEGMENT_FIELD(seg) \
+ [VCPU_SREG_##seg] = { \
+ .selector = GUEST_##seg##_SELECTOR, \
+ .base = GUEST_##seg##_BASE, \
+ .limit = GUEST_##seg##_LIMIT, \
+ .ar_bytes = GUEST_##seg##_AR_BYTES, \
+ }
+
+static struct kvm_vmx_segment_field {
+ unsigned selector;
+ unsigned base;
+ unsigned limit;
+ unsigned ar_bytes;
+} kvm_vmx_segment_fields[] = {
+ VMX_SEGMENT_FIELD(CS),
+ VMX_SEGMENT_FIELD(DS),
+ VMX_SEGMENT_FIELD(ES),
+ VMX_SEGMENT_FIELD(FS),
+ VMX_SEGMENT_FIELD(GS),
+ VMX_SEGMENT_FIELD(SS),
+ VMX_SEGMENT_FIELD(TR),
+ VMX_SEGMENT_FIELD(LDTR),
+};
+
+void vmx_get_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+ uint32_t ar;
+
+ var->base = vmcs_readl(sf->base);
+ var->limit = vmcs_read32(sf->limit);
+ var->selector = vmcs_read16(sf->selector);
+ ar = vmcs_read32(sf->ar_bytes);
+#ifdef XXX
+ if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state)
+ ar = 0;
+#endif /*XXX*/
+ var->type = ar & 15;
+ var->s = (ar >> 4) & 1;
+ var->dpl = (ar >> 5) & 3;
+ var->present = (ar >> 7) & 1;
+ var->avl = (ar >> 12) & 1;
+ var->l = (ar >> 13) & 1;
+ var->db = (ar >> 14) & 1;
+ var->g = (ar >> 15) & 1;
+ var->unusable = (ar >> 16) & 1;
+}
+
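+/*
+ * Pack a kvm_segment into the VMX access-rights encoding (the inverse of the
+ * unpacking in vmx_get_segment() above): bits 3:0 type, bit 4 S, bits 6:5
+ * DPL, bit 7 present, bit 12 AVL, bit 13 L, bit 14 D/B, bit 15 G,
+ * bit 16 unusable.
+ */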
+static uint32_t vmx_segment_access_rights(struct kvm_segment *var)
+{
+ uint32_t ar;
+
+ if (var->unusable)
+ ar = 1 << 16;
+ else {
+ ar = var->type & 15;
+ ar |= (var->s & 1) << 4;
+ ar |= (var->dpl & 3) << 5;
+ ar |= (var->present & 1) << 7;
+ ar |= (var->avl & 1) << 12;
+ ar |= (var->l & 1) << 13;
+ ar |= (var->db & 1) << 14;
+ ar |= (var->g & 1) << 15;
+ }
+ if (ar == 0) /* a 0 value means unusable */
+ ar = AR_UNUSABLE_MASK;
+
+ return ar;
+}
+
+static void vmx_set_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ struct vcpu_vmx *vmx = (struct vcpu_vmx *)vcpu;
+ struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+ uint32_t ar;
+
+ if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) {
+ vmx->rmode.tr.selector = var->selector;
+ vmx->rmode.tr.base = var->base;
+ vmx->rmode.tr.limit = var->limit;
+ vmx->rmode.tr.ar = vmx_segment_access_rights(var);
+ return;
+ }
+ vmcs_writel(sf->base, var->base);
+ vmcs_write32(sf->limit, var->limit);
+ vmcs_write16(sf->selector, var->selector);
+ if (vmx->rmode.vm86_active && var->s) {
+ /*
+ * Hack real-mode segments into vm86 compatibility.
+ */
+ if (var->base == 0xffff0000 && var->selector == 0xf000)
+ vmcs_writel(sf->base, 0xf0000);
+ ar = 0xf3;
+ } else
+ ar = vmx_segment_access_rights(var);
+
+ /*
+ * Fix the "Accessed" bit in AR field of segment registers for older
+ * qemu binaries.
+ * IA32 arch specifies that at the time of processor reset the
+ * "Accessed" bit in the AR field of segment registers is 1. And qemu
+	 * is setting it to 0 in the userland code. This causes invalid guest
+ * state vmexit when "unrestricted guest" mode is turned on.
+ * Fix for this setup issue in cpu_reset is being pushed in the qemu
+ * tree. Newer qemu binaries with that qemu fix would not need this
+ * kvm hack.
+ */
+#ifdef XXX
+ if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
+ ar |= 0x1; /* Accessed */
+#endif /*XXX*/
+
+ vmcs_write32(sf->ar_bytes, ar);
+}
+
+void kvm_get_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ kvm_x86_ops->get_segment(vcpu, var, seg);
+}
+
+static uint16_t get_segment_selector(struct kvm_vcpu *vcpu, int seg)
+{
+ struct kvm_segment kvm_seg;
+
+ kvm_get_segment(vcpu, &kvm_seg, seg);
+ return kvm_seg.selector;
+}
+
+void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+#ifdef XXX
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
+ vcpu->arch.singlestep_cs ==
+ get_segment_selector(vcpu, VCPU_SREG_CS) &&
+ vcpu->arch.singlestep_rip == kvm_rip_read(vcpu))
+ rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
+#endif /*XXX*/
+ kvm_x86_ops->set_rflags(vcpu, rflags);
+}
+
+int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+ vcpu_load(vcpu);
+
+ kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
+ kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
+ kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
+ kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
+ kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
+ kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
+ kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
+#ifdef CONFIG_X86_64
+ kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
+ kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
+ kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
+ kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
+ kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
+ kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
+ kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
+ kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
+#endif
+
+ kvm_rip_write(vcpu, regs->rip);
+ kvm_set_rflags(vcpu, regs->rflags);
+
+ vcpu->arch.exception.pending = 0;
+
+ vcpu_put(vcpu);
+
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+#ifdef XXX
+ struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
+#endif /*XXX*/
+
+ vcpu_load(vcpu);
+#ifdef XXX
+ memcpy(fpu->fpr, fxsave->st_space, 128);
+ fpu->fcw = fxsave->cwd;
+ fpu->fsw = fxsave->swd;
+ fpu->ftwx = fxsave->twd;
+ fpu->last_opcode = fxsave->fop;
+ fpu->last_ip = fxsave->rip;
+ fpu->last_dp = fxsave->rdp;
+ memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
+#endif /*XXX*/
+ vcpu_put(vcpu);
+
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+#ifdef XXX
+ struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
+#endif
+
+ vcpu_load(vcpu);
+#ifdef XXX
+ memcpy(fxsave->st_space, fpu->fpr, 128);
+ fxsave->cwd = fpu->fcw;
+ fxsave->swd = fpu->fsw;
+ fxsave->twd = fpu->ftwx;
+ fxsave->fop = fpu->last_opcode;
+ fxsave->rip = fpu->last_ip;
+ fxsave->rdp = fpu->last_dp;
+ memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
+#endif /*XXX*/
+ vcpu_put(vcpu);
+
+ return 0;
+}
+
+
+ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr4_bits(vcpu, ~0UL);
+}
+
+static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
+{
+ ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS;
+#ifdef XXX
+ if (tmask & vcpu->arch.cr0_guest_owned_bits)
+ kvm_x86_ops->decache_cr0_guest_bits(vcpu);
+#endif /*XXX*/
+ return vcpu->arch.cr0 & mask;
+}
+
+
+ulong kvm_read_cr0(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr0_bits(vcpu, ~0UL);
+}
+
+unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
+{
+#ifdef XXX
+ if (irqchip_in_kernel(vcpu->kvm))
+ return kvm_lapic_get_cr8(vcpu);
+ else
+#endif /*XXX*/
+ return vcpu->arch.cr8;
+}
+
+extern uint64_t kvm_get_apic_base(struct kvm_vcpu *vcpu);
+
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs)
+{
+ struct descriptor_table dt;
+
+ vcpu_load(vcpu);
+
+ kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
+ kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
+ kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
+ kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
+ kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
+ kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
+
+ kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
+ kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
+
+ kvm_x86_ops->get_idt(vcpu, &dt);
+ sregs->idt.limit = dt.limit;
+ sregs->idt.base = dt.base;
+ kvm_x86_ops->get_gdt(vcpu, &dt);
+ sregs->gdt.limit = dt.limit;
+ sregs->gdt.base = dt.base;
+
+ sregs->cr0 = kvm_read_cr0(vcpu);
+ sregs->cr2 = vcpu->arch.cr2;
+ sregs->cr3 = vcpu->arch.cr3;
+ sregs->cr4 = kvm_read_cr4(vcpu);
+ sregs->cr8 = kvm_get_cr8(vcpu);
+ sregs->efer = vcpu->arch.efer;
+ sregs->apic_base = kvm_get_apic_base(vcpu);
+
+ memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
+
+ if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft)
+ BT_SET((unsigned long *)sregs->interrupt_bitmap,
+ vcpu->arch.interrupt.nr);
+
+ vcpu_put(vcpu);
+
+ return 0;
+}
+
+static void kvm_set_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ kvm_x86_ops->set_segment(vcpu, var, seg);
+}
+
+#define VALID_PAGE(x) ((x) != INVALID_PAGE)
+
+static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
+{
+ ASSERT(vcpu);
+ if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
+ vcpu->arch.mmu.free(vcpu);
+ vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+ }
+}
+
+extern int init_kvm_mmu(struct kvm_vcpu *vcpu);
+
+int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
+{
+ destroy_kvm_mmu(vcpu);
+ return init_kvm_mmu(vcpu);
+}
+
+static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, uint8_t vector,
+ int soft)
+{
+ vcpu->arch.interrupt.pending = 1;
+ vcpu->arch.interrupt.soft = soft;
+ vcpu->arch.interrupt.nr = vector;
+}
+
+
+static inline int is_present_gpte(unsigned long pte)
+{
+ return pte & PT_PRESENT_MASK;
+}
+
+gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn)
+{
+ int i;
+ struct kvm_mem_alias *alias;
+ struct kvm_mem_aliases *aliases;
+#ifdef XXX
+ aliases = rcu_dereference(kvm->arch.aliases);
+
+ for (i = 0; i < aliases->naliases; ++i) {
+ alias = &aliases->aliases[i];
+ if (alias->flags & KVM_ALIAS_INVALID)
+ continue;
+ if (gfn >= alias->base_gfn
+ && gfn < alias->base_gfn + alias->npages)
+ return alias->target_gfn + gfn - alias->base_gfn;
+ }
+#endif /*XXX*/
+ return gfn;
+}
+
+struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn)
+{
+ int i;
+#ifdef XXX
+ struct kvm_memslots *slots = rcu_dereference(kvm->memslots);
+#else
+ struct kvm_memslots *slots = kvm->memslots;
+#endif /*XXX*/
+
+ for (i = 0; i < slots->nmemslots; ++i) {
+ struct kvm_memory_slot *memslot = &slots->memslots[i];
+
+ if (gfn >= memslot->base_gfn
+ && gfn < memslot->base_gfn + memslot->npages)
+ return memslot;
+ }
+ return NULL;
+}
+
+static inline unsigned long bad_hva(void)
+{
+ return PAGEOFFSET;
+}
+
+unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
+{
+ struct kvm_memory_slot *slot;
+
+ gfn = unalias_gfn_instantiation(kvm, gfn);
+ slot = gfn_to_memslot_unaliased(kvm, gfn);
+ if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
+ return bad_hva();
+ return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGESIZE);
+}
+
+
+int kvm_is_error_hva(unsigned long addr)
+{
+ return addr == bad_hva();
+}
+
+int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
+ int len)
+{
+ int r;
+ unsigned long addr;
+
+ addr = gfn_to_hva(kvm, gfn);
+ if (kvm_is_error_hva(addr))
+ return EFAULT;
+ r = copyin((caddr_t)(addr + offset), data, len);
+ if (r)
+ return EFAULT;
+ return 0;
+}
+
+
+/*
+ * Load the PAE pdptrs. Return true if they are all valid.
+ */
+int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
+{
+ gfn_t pdpt_gfn = cr3 >> PAGESHIFT;
+ unsigned offset = ((cr3 & (PAGESIZE-1)) >> 5) << 2;
+ int i;
+ int ret;
+ uint64_t pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
+
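+	/*
+	 * In PAE mode CR3 holds a 32-byte-aligned pointer to the four PDPTEs;
+	 * 'offset' above is that table's position within the page counted in
+	 * 64-bit entries, so the byte offset passed below is offset * 8.
+	 */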
+ ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
+ offset * sizeof(uint64_t), sizeof(pdpte));
+ if (ret < 0) {
+ ret = 0;
+ goto out;
+ }
+ for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
+ if (is_present_gpte(pdpte[i]) &&
+ (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
+ ret = 0;
+ goto out;
+ }
+ }
+ ret = 1;
+
+ memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
+ BT_SET((unsigned long *)&vcpu->arch.regs_avail,
+ VCPU_EXREG_PDPTR);
+ BT_SET((unsigned long *)&vcpu->arch.regs_dirty,
+ VCPU_EXREG_PDPTR);
+out:
+
+ return ret;
+}
+
+static void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
+{
+ if (irr == -1 || tpr < irr) {
+ vmcs_write32(TPR_THRESHOLD, 0);
+ return;
+ }
+
+ vmcs_write32(TPR_THRESHOLD, irr);
+}
+
+static void update_cr8_intercept(struct kvm_vcpu *vcpu)
+{
+ int max_irr, tpr;
+
+ if (!kvm_x86_ops->update_cr8_intercept)
+ return;
+
+ if (!vcpu->arch.apic)
+ return;
+#ifdef XXX
+ if (!vcpu->arch.apic->vapic_addr)
+ max_irr = kvm_lapic_find_highest_irr(vcpu);
+ else
+#endif /*XXX*/
+ max_irr = -1;
+
+ if (max_irr != -1)
+ max_irr >>= 4;
+#ifdef XXX
+ tpr = kvm_lapic_get_cr8(vcpu);
+
+ kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
+#endif /*XXX*/
+}
+
+static int __find_msr_index(struct vcpu_vmx *vmx, uint32_t msr)
+{
+ int i;
+
+ for (i = 0; i < vmx->nmsrs; ++i)
+ if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
+ return i;
+ return -1;
+}
+
+static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, uint32_t msr)
+{
+ int i;
+
+ i = __find_msr_index(vmx, msr);
+ if (i >= 0)
+ return &vmx->guest_msrs[i];
+ return NULL;
+}
+
+/*
+ * Swap MSR entry in host/guest MSR entry array.
+ */
+static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
+{
+ struct shared_msr_entry tmp;
+
+ tmp = vmx->guest_msrs[to];
+ vmx->guest_msrs[to] = vmx->guest_msrs[from];
+ vmx->guest_msrs[from] = tmp;
+}
+
+static int update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
+{
+ uint64_t guest_efer;
+ uint64_t ignore_bits;
+
+ guest_efer = vmx->vcpu.arch.efer;
+
+ /*
+	 * NX is emulated; LMA and LME handled by hardware; SCE is meaningless
+ * outside long mode
+ */
+ ignore_bits = EFER_NX | EFER_SCE;
+#ifdef CONFIG_X86_64
+ ignore_bits |= EFER_LMA | EFER_LME;
+ /* SCE is meaningful only in long mode on Intel */
+ if (guest_efer & EFER_LMA)
+ ignore_bits &= ~(uint64_t)EFER_SCE;
+#endif
+ guest_efer &= ~ignore_bits;
+ guest_efer |= host_efer & ignore_bits;
+ vmx->guest_msrs[efer_offset].data = guest_efer;
+ vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
+ return 1;
+}
+
+/*
+ * Set up the vmcs to automatically save and restore system
+ * msrs. Don't touch the 64-bit msrs if the guest is in legacy
+ * mode, as fiddling with msrs is very expensive.
+ */
+void setup_msrs(struct vcpu_vmx *vmx)
+{
+ int save_nmsrs, index;
+ unsigned long *msr_bitmap;
+
+ vmx_load_host_state(vmx);
+ save_nmsrs = 0;
+#ifdef CONFIG_X86_64
+ if (is_long_mode(&vmx->vcpu)) {
+ index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
+ if (index >= 0)
+ move_msr_up(vmx, index, save_nmsrs++);
+ index = __find_msr_index(vmx, MSR_LSTAR);
+ if (index >= 0)
+ move_msr_up(vmx, index, save_nmsrs++);
+ index = __find_msr_index(vmx, MSR_CSTAR);
+ if (index >= 0)
+ move_msr_up(vmx, index, save_nmsrs++);
+ index = __find_msr_index(vmx, MSR_TSC_AUX);
+ if (index >= 0 && vmx->rdtscp_enabled)
+ move_msr_up(vmx, index, save_nmsrs++);
+ /*
+ * MSR_K6_STAR is only needed on long mode guests, and only
+ * if efer.sce is enabled.
+ */
+ index = __find_msr_index(vmx, MSR_K6_STAR);
+ if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
+ move_msr_up(vmx, index, save_nmsrs++);
+ }
+#endif
+ index = __find_msr_index(vmx, MSR_EFER);
+ if (index >= 0 && update_transition_efer(vmx, index))
+ move_msr_up(vmx, index, save_nmsrs++);
+
+ vmx->save_nmsrs = save_nmsrs;
+
+ if (cpu_has_vmx_msr_bitmap()) {
+ if (is_long_mode(&vmx->vcpu))
+ msr_bitmap = vmx_msr_bitmap_longmode;
+ else
+ msr_bitmap = vmx_msr_bitmap_legacy;
+
+ vmcs_write64(MSR_BITMAP, kvm_va2pa((caddr_t)msr_bitmap));
+ }
+}
+
+void vmx_set_efer(struct kvm_vcpu *vcpu, uint64_t efer)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
+
+ if (!msr)
+ return;
+
+ /*
+ * Force kernel_gs_base reloading before EFER changes, as control
+ * of this msr depends on is_long_mode().
+ */
+ vmx_load_host_state(to_vmx(vcpu));
+ vcpu->arch.efer = efer;
+ if (efer & EFER_LMA) {
+ vmcs_write32(VM_ENTRY_CONTROLS,
+ vmcs_read32(VM_ENTRY_CONTROLS) |
+ VM_ENTRY_IA32E_MODE);
+ msr->data = efer;
+ } else {
+ vmcs_write32(VM_ENTRY_CONTROLS,
+ vmcs_read32(VM_ENTRY_CONTROLS) &
+ ~VM_ENTRY_IA32E_MODE);
+
+ msr->data = efer & ~EFER_LME;
+ }
+ setup_msrs(vmx);
+}
+
+static inline int is_protmode(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr0_bits(vcpu, X86_CR0_PE);
+}
+
+
+#ifdef CONFIG_KVM_APIC_ARCHITECTURE
+int kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
+{
+ return vcpu->kvm->bsp_vcpu_id == vcpu->vcpu_id;
+}
+#endif
+
+void kvm_pic_clear_isr_ack(struct kvm *kvm)
+{
+ struct kvm_pic *s = pic_irqchip(kvm);
+
+ mutex_enter(&s->lock);
+ s->pics[0].isr_ack = 0xff;
+ s->pics[1].isr_ack = 0xff;
+ mutex_exit(&s->lock);
+}
+
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs)
+{
+ int mmu_reset_needed = 0;
+ int pending_vec, max_bits;
+ struct descriptor_table dt;
+
+ vcpu_load(vcpu);
+
+ dt.limit = sregs->idt.limit;
+ dt.base = sregs->idt.base;
+ kvm_x86_ops->set_idt(vcpu, &dt);
+ dt.limit = sregs->gdt.limit;
+ dt.base = sregs->gdt.base;
+ kvm_x86_ops->set_gdt(vcpu, &dt);
+
+ vcpu->arch.cr2 = sregs->cr2;
+ mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
+ vcpu->arch.cr3 = sregs->cr3;
+
+ kvm_set_cr8(vcpu, sregs->cr8);
+
+ mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
+ kvm_x86_ops->set_efer(vcpu, sregs->efer);
+ kvm_set_apic_base(vcpu, sregs->apic_base);
+
+ mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
+ kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
+ vcpu->arch.cr0 = sregs->cr0;
+
+ mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
+ kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
+ if (!is_long_mode(vcpu) && is_pae(vcpu)) {
+ load_pdptrs(vcpu, vcpu->arch.cr3);
+ mmu_reset_needed = 1;
+ }
+
+ if (mmu_reset_needed)
+ kvm_mmu_reset_context(vcpu);
+
+ max_bits = (sizeof sregs->interrupt_bitmap) << 3;
+ pending_vec = bt_getlowbit(
+ (const unsigned long *)sregs->interrupt_bitmap, 0, max_bits);
+ if (pending_vec < max_bits) {
+ kvm_queue_interrupt(vcpu, pending_vec, 0);
+ cmn_err(CE_NOTE, "Set back pending irq %d\n", pending_vec);
+ if (irqchip_in_kernel(vcpu->kvm))
+ kvm_pic_clear_isr_ack(vcpu->kvm);
+ }
+
+ kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
+ kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
+ kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
+ kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
+ kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
+ kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
+
+ kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
+ kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
+
+ update_cr8_intercept(vcpu);
+
+#ifdef CONFIG_KVM_APIC_ARCHITECTURE
+ /* Older userspace won't unhalt the vcpu on reset. */
+ if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
+ sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
+ !is_protmode(vcpu))
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+#endif /*CONFIG_KVM_APIC_ARCHITECTURE*/
+
+ vcpu_put(vcpu);
+
+ return 0;
+}
+
+static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
+{
+ static int version;
+ struct pvclock_wall_clock wc;
+ struct timespec boot;
+
+#ifdef XXX
+ if (!wall_clock)
+ return;
+
+ version++;
+
+ kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
+
+ /*
+ * The guest calculates current wall clock time by adding
+ * system time (updated by kvm_write_guest_time below) to the
+ * wall clock specified here. guest system time equals host
+ * system time for us, thus we must fill in host boot time here.
+ */
+ getboottime(&boot);
+
+ wc.sec = boot.tv_sec;
+ wc.nsec = boot.tv_nsec;
+ wc.version = version;
+
+ kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
+
+ version++;
+ kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
+#endif /*XXX*/
+}
+
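+/*
+ * Return how many of 'len' bytes fit in the page starting at 'offset',
+ * so callers can split guest accesses on page boundaries.
+ */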
+static int next_segment(unsigned long len, int offset)
+{
+ if (len > PAGESIZE - offset)
+ return PAGESIZE - offset;
+ else
+ return len;
+}
+
+
+void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
+{
+ struct kvm_memory_slot *memslot;
+
+#ifdef XXX
+ gfn = unalias_gfn(kvm, gfn);
+ memslot = gfn_to_memslot_unaliased(kvm, gfn);
+ if (memslot && memslot->dirty_bitmap) {
+ unsigned long rel_gfn = gfn - memslot->base_gfn;
+ unsigned long *p = memslot->dirty_bitmap +
+ rel_gfn / BT_NBIPUL;
+ int offset = rel_gfn % BT_NBIPUL;
+
+ /* avoid RMW */
+ if (!generic_test_le_bit(offset, p))
+ generic___set_le_bit(offset, p);
+ }
+#endif /*XXX*/
+}
+
+int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
+ int offset, int len)
+{
+ int r;
+ unsigned long addr;
+
+ addr = gfn_to_hva(kvm, gfn);
+ if (kvm_is_error_hva(addr))
+ return -EFAULT;
+ r = copyout(data, (caddr_t)((uint64_t)addr + offset), len);
+ if (r)
+ return -EFAULT;
+ mark_page_dirty(kvm, gfn);
+ return 0;
+}
+
+int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
+ unsigned long len)
+{
+ gfn_t gfn = gpa >> PAGESHIFT;
+ int seg;
+ int offset = offset_in_page(gpa);
+ int ret;
+
+ while ((seg = next_segment(len, offset)) != 0) {
+ ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
+ if (ret < 0)
+ return ret;
+ offset = 0;
+ len -= seg;
+ data += seg;
+ ++gfn;
+ }
+ return 0;
+}
+
+static int xen_hvm_config(struct kvm_vcpu *vcpu, uint64_t data)
+{
+ struct kvm *kvm = vcpu->kvm;
+ int lm = is_long_mode(vcpu);
+ uint8_t *blob_addr = lm ? (uint8_t *)(long)kvm->arch.xen_hvm_config.blob_addr_64
+ : (uint8_t *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
+ uint8_t blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
+ : kvm->arch.xen_hvm_config.blob_size_32;
+ uint32_t page_num = data & ~PAGEMASK;
+ uint64_t page_addr = data & PAGEMASK;
+ uint8_t *page;
+ int r;
+
+ r = E2BIG;
+ if (page_num >= blob_size)
+ goto out;
+ r = ENOMEM;
+ page = kmem_alloc(PAGESIZE, KM_SLEEP);
+ if (!page)
+ goto out;
+ r = EFAULT;
+ if (copyin(blob_addr + (page_num * PAGESIZE), page, PAGESIZE))
+ goto out_free;
+ if (kvm_write_guest(kvm, page_addr, page, PAGESIZE))
+ goto out_free;
+ r = 0;
+out_free:
+ kmem_free(page, PAGESIZE);
+out:
+ return r;
+}
+
+int ignore_msrs = 0;
+extern int is_paging(struct kvm_vcpu *vcpu);
+
+static void set_efer(struct kvm_vcpu *vcpu, uint64_t efer)
+{
+ if (efer & efer_reserved_bits) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
+
+ if (is_paging(vcpu)
+ && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
+
+ if (efer & EFER_FFXSR) {
+ struct kvm_cpuid_entry2 *feat;
+
+ feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
+ if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
+ }
+
+ if (efer & EFER_SVME) {
+ struct kvm_cpuid_entry2 *feat;
+
+ feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
+ if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
+ }
+
+ kvm_x86_ops->set_efer(vcpu, efer);
+
+ efer &= ~EFER_LMA;
+ efer |= vcpu->arch.efer & EFER_LMA;
+
+ vcpu->arch.efer = efer;
+
+ vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
+ kvm_mmu_reset_context(vcpu);
+}
+
+static int msr_mtrr_valid(unsigned msr)
+{
+ switch (msr) {
+ case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
+ case MSR_MTRRfix64K_00000:
+ case MSR_MTRRfix16K_80000:
+ case MSR_MTRRfix16K_A0000:
+ case MSR_MTRRfix4K_C0000:
+ case MSR_MTRRfix4K_C8000:
+ case MSR_MTRRfix4K_D0000:
+ case MSR_MTRRfix4K_D8000:
+ case MSR_MTRRfix4K_E0000:
+ case MSR_MTRRfix4K_E8000:
+ case MSR_MTRRfix4K_F0000:
+ case MSR_MTRRfix4K_F8000:
+ case MSR_MTRRdefType:
+ case MSR_IA32_CR_PAT:
+ return 1;
+ case 0x2f8:
+ return 1;
+ }
+ return 0;
+}
+
+
+static int valid_pat_type(unsigned t)
+{
+ return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
+}
+
+static int valid_mtrr_type(unsigned t)
+{
+ return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
+}
+
+static int mtrr_valid(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data)
+{
+ int i;
+
+ if (!msr_mtrr_valid(msr))
+ return 0;
+
+ if (msr == MSR_IA32_CR_PAT) {
+ for (i = 0; i < 8; i++)
+ if (!valid_pat_type((data >> (i * 8)) & 0xff))
+ return 0;
+ return 1;
+ } else if (msr == MSR_MTRRdefType) {
+ if (data & ~0xcff)
+ return 0;
+ return valid_mtrr_type(data & 0xff);
+ } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
+ for (i = 0; i < 8 ; i++)
+ if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
+ return 0;
+ return 1;
+ }
+
+ /* variable MTRRs */
+ return valid_mtrr_type(data & 0xff);
+}
+
+
+static int set_msr_mtrr(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data)
+{
+ uint64_t *p = (uint64_t *)&vcpu->arch.mtrr_state.fixed_ranges;
+
+ if (!mtrr_valid(vcpu, msr, data))
+ return 1;
+
+ if (msr == MSR_MTRRdefType) {
+ vcpu->arch.mtrr_state.def_type = data;
+ vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
+ } else if (msr == MSR_MTRRfix64K_00000)
+ p[0] = data;
+ else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
+ p[1 + msr - MSR_MTRRfix16K_80000] = data;
+ else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
+ p[3 + msr - MSR_MTRRfix4K_C0000] = data;
+ else if (msr == MSR_IA32_CR_PAT)
+ vcpu->arch.pat = data;
+ else { /* Variable MTRRs */
+ int idx, is_mtrr_mask;
+ uint64_t *pt;
+
+ idx = (msr - 0x200) / 2;
+ is_mtrr_mask = msr - 0x200 - 2 * idx;
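+		/*
+		 * Variable-range MTRRs come in MTRRphysBasen/MTRRphysMaskn
+		 * pairs starting at MSR 0x200, so an even MSR offset selects
+		 * the base and an odd offset the mask of range 'idx'.
+		 */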
+ if (!is_mtrr_mask)
+ pt =
+ (uint64_t *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
+ else
+ pt =
+ (uint64_t *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
+ *pt = data;
+ }
+
+#ifdef XXX
+ kvm_mmu_reset_context(vcpu);
+#endif /*XXX*/
+ return 0;
+}
+
+static int set_msr_hyperv(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data)
+{
+ switch (msr) {
+#ifdef XXX
+ case HV_X64_MSR_APIC_ASSIST_PAGE: {
+ unsigned long addr;
+
+ if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) {
+ vcpu->arch.hv_vapic = data;
+ break;
+ }
+ addr = gfn_to_hva(vcpu->kvm, data >>
+ HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT);
+ if (kvm_is_error_hva(addr))
+ return 1;
+ if (clear_user((void __user *)addr, PAGESIZE))
+ return 1;
+ vcpu->arch.hv_vapic = data;
+ break;
+ }
+ case HV_X64_MSR_EOI:
+ return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data);
+ case HV_X64_MSR_ICR:
+ return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
+ case HV_X64_MSR_TPR:
+ return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
+#endif /*XXX*/
+ default:
+ cmn_err(CE_WARN, "HYPER-V unimplemented wrmsr: 0x%x "
+ "data 0x%llx\n", msr, data);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ switch (msr) {
+ case HV_X64_MSR_GUEST_OS_ID:
+ kvm->arch.hv_guest_os_id = data;
+ /* setting guest os id to zero disables hypercall page */
+ if (!kvm->arch.hv_guest_os_id)
+ kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE;
+ break;
+ case HV_X64_MSR_HYPERCALL: {
+ uint64_t gfn;
+ unsigned long addr;
+ uint8_t instructions[4];
+
+ /* if guest os id is not set hypercall should remain disabled */
+ if (!kvm->arch.hv_guest_os_id)
+ break;
+ if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) {
+ kvm->arch.hv_hypercall = data;
+ break;
+ }
+ gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT;
+ addr = gfn_to_hva(kvm, gfn);
+ if (kvm_is_error_hva(addr))
+ return 1;
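+		/*
+		 * The vendor patch_hypercall hook emits the hypercall
+		 * instruction (VMCALL/VMMCALL); a RET is appended so the
+		 * guest's hypercall page simply returns to its caller.
+		 */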
+ kvm_x86_ops->patch_hypercall(vcpu, instructions);
+ ((unsigned char *)instructions)[3] = 0xc3; /* ret */
+ if (copyout(instructions, (caddr_t)addr, 4))
+ return 1;
+ kvm->arch.hv_hypercall = data;
+ break;
+ }
+ default:
+ cmn_err(CE_WARN, "HYPER-V unimplemented wrmsr: 0x%x "
+ "data 0x%llx\n", msr, data);
+ return 1;
+ }
+ return 0;
+}
+
+static int set_msr_mce(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data)
+{
+ uint64_t mcg_cap = vcpu->arch.mcg_cap;
+ unsigned bank_num = mcg_cap & 0xff;
+
+ switch (msr) {
+ case MSR_IA32_MCG_STATUS:
+ vcpu->arch.mcg_status = data;
+ break;
+ case MSR_IA32_MCG_CTL:
+ if (!(mcg_cap & MCG_CTL_P))
+ return 1;
+ if (data != 0 && data != ~(uint64_t)0)
+ return -1;
+ vcpu->arch.mcg_ctl = data;
+ break;
+ default:
+ if (msr >= MSR_IA32_MC0_CTL &&
+ msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
+ uint32_t offset = msr - MSR_IA32_MC0_CTL;
+			/*
+			 * Only 0 or all 1s can be written to IA32_MCi_CTL.
+			 * Some Linux kernels, though, clear bit 10 in bank 4 to
+			 * work around a BIOS/GART TBL issue on AMD K8s; ignore
+			 * this to avoid an uncaught #GP in the guest.
+			 */
+ if ((offset & 0x3) == 0 &&
+ data != 0 && (data | (1 << 10)) != ~(uint64_t)0)
+ return -1;
+ vcpu->arch.mce_banks[offset] = data;
+ break;
+ }
+ return 1;
+ }
+ return 0;
+}
+
+static int kvm_hv_msr_partition_wide(uint32_t msr)
+{
+ int r = 0;
+ switch (msr) {
+ case HV_X64_MSR_GUEST_OS_ID:
+ case HV_X64_MSR_HYPERCALL:
+ r = 1;
+ break;
+ }
+
+ return r;
+}
+
+
+static inline void get_page(caddr_t page)
+{
+}
+
+struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
+{
+ pfn_t pfn;
+
+ pfn = gfn_to_pfn(kvm, gfn);
+#ifdef XXX
+ if (!kvm_is_mmio_pfn(pfn))
+ return pfn_to_page(pfn);
+#endif /*XXX*/
+
+ get_page(bad_page);
+ return (struct page *)bad_page;
+}
+
+
+int kvm_set_msr_common(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data)
+{
+ volatile int x;
+
+ switch (msr) {
+ case MSR_EFER:
+ set_efer(vcpu, data);
+ break;
+ case MSR_K7_HWCR:
+ data &= ~(uint64_t)0x40; /* ignore flush filter disable */
+ if (data != 0) {
+ cmn_err(CE_NOTE, "unimplemented HWCR wrmsr: 0x%llx\n",
+ data);
+ return 1;
+ }
+ break;
+ case MSR_FAM10H_MMIO_CONF_BASE:
+ if (data != 0) {
+ cmn_err(CE_NOTE, "unimplemented MMIO_CONF_BASE wrmsr: "
+ "0x%llx\n", data);
+ return 1;
+ }
+ break;
+ case MSR_AMD64_NB_CFG:
+ break;
+ case MSR_IA32_DEBUGCTLMSR:
+ if (!data) {
+ /* We support the non-activated case already */
+ break;
+ } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
+ /* Values other than LBR and BTF are vendor-specific,
+ thus reserved and should throw a #GP */
+ return 1;
+ }
+ cmn_err(CE_NOTE, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
+ __func__, data);
+ break;
+ case MSR_IA32_UCODE_REV:
+ case MSR_IA32_UCODE_WRITE:
+ case MSR_VM_HSAVE_PA:
+ case MSR_AMD64_PATCH_LOADER:
+ break;
+ case 0x200 ... 0x2ff:
+ return set_msr_mtrr(vcpu, msr, data);
+ case MSR_IA32_APICBASE:
+ kvm_set_apic_base(vcpu, data);
+ break;
+#ifdef XXX
+ case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
+ return kvm_x2apic_msr_write(vcpu, msr, data);
+#endif /*XXX*/
+ case MSR_IA32_MISC_ENABLE:
+ vcpu->arch.ia32_misc_enable_msr = data;
+ break;
+ case MSR_KVM_WALL_CLOCK:
+ vcpu->kvm->arch.wall_clock = data;
+ kvm_write_wall_clock(vcpu->kvm, data);
+ break;
+ case MSR_KVM_SYSTEM_TIME: {
+#ifdef XXX
+ if (vcpu->arch.time_page) {
+ kvm_release_page_dirty(vcpu->arch.time_page);
+ vcpu->arch.time_page = NULL;
+ }
+#endif /*XXX*/
+
+ vcpu->arch.time = data;
+
+ /* we verify if the enable bit is set... */
+ if (!(data & 1))
+ break;
+
+ /* ...but clean it before doing the actual write */
+ vcpu->arch.time_offset = data & ~(PAGEOFFSET | 1);
+#ifdef XXX
+ vcpu->arch.time_page =
+ gfn_to_page(vcpu->kvm, data >> PAGESHIFT);
+
+ if (is_error_page(vcpu->arch.time_page)) {
+ kvm_release_page_clean(vcpu->arch.time_page);
+ vcpu->arch.time_page = NULL;
+ }
+
+ kvm_request_guest_time_update(vcpu);
+#endif /*XXX*/
+ break;
+ }
+ case MSR_IA32_MCG_CTL:
+ case MSR_IA32_MCG_STATUS:
+ case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
+ return set_msr_mce(vcpu, msr, data);
+
+ /* Performance counters are not protected by a CPUID bit,
+ * so we should check all of them in the generic path for the sake of
+ * cross vendor migration.
+ * Writing a zero into the event select MSRs disables them,
+ * which we perfectly emulate ;-). Any other value should be at least
+ * reported, some guests depend on them.
+ */
+ case MSR_P6_EVNTSEL0:
+ case MSR_P6_EVNTSEL1:
+ case MSR_K7_EVNTSEL0:
+ case MSR_K7_EVNTSEL1:
+ case MSR_K7_EVNTSEL2:
+ case MSR_K7_EVNTSEL3:
+ if (data != 0)
+ cmn_err(CE_NOTE, "unimplemented perfctr wrmsr: "
+ "0x%x data 0x%llx\n", msr, data);
+ break;
+ /* at least RHEL 4 unconditionally writes to the perfctr registers,
+ * so we ignore writes to make it happy.
+ */
+ case MSR_P6_PERFCTR0:
+ case MSR_P6_PERFCTR1:
+ case MSR_K7_PERFCTR0:
+ case MSR_K7_PERFCTR1:
+ case MSR_K7_PERFCTR2:
+ case MSR_K7_PERFCTR3:
+ cmn_err(CE_NOTE, "unimplemented perfctr wrmsr: "
+ "0x%x data 0x%llx\n", msr, data);
+ break;
+ case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
+ if (kvm_hv_msr_partition_wide(msr)) {
+ int r;
+ mutex_enter(&vcpu->kvm->lock);
+ r = set_msr_hyperv_pw(vcpu, msr, data);
+ mutex_exit(&vcpu->kvm->lock);
+ return r;
+ } else
+ return set_msr_hyperv(vcpu, msr, data);
+ break;
+ default:
+ if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
+ return xen_hvm_config(vcpu, data);
+ if (!ignore_msrs) {
+ cmn_err(CE_NOTE, "unhandled wrmsr: 0x%x data %llx\n",
+ msr, data);
+ return 1;
+ } else {
+ cmn_err(CE_NOTE, "ignored wrmsr: 0x%x data %llx\n",
+ msr, data);
+ break;
+ }
+ }
+ x = 10; /*XXX*/
+ return 0;
+}
+
+
+
+static int get_msr_mtrr(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata)
+{
+ uint64_t *p = (uint64_t *)&vcpu->arch.mtrr_state.fixed_ranges;
+
+ if (!msr_mtrr_valid(msr))
+ return 1;
+
+ if (msr == MSR_MTRRdefType)
+ *pdata = vcpu->arch.mtrr_state.def_type +
+ (vcpu->arch.mtrr_state.enabled << 10);
+ else if (msr == MSR_MTRRfix64K_00000)
+ *pdata = p[0];
+ else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
+ *pdata = p[1 + msr - MSR_MTRRfix16K_80000];
+ else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
+ *pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
+ else if (msr == MSR_IA32_CR_PAT)
+ *pdata = vcpu->arch.pat;
+ else { /* Variable MTRRs */
+ int idx, is_mtrr_mask;
+ uint64_t *pt;
+
+ idx = (msr - 0x200) / 2;
+ is_mtrr_mask = msr - 0x200 - 2 * idx;
+ if (!is_mtrr_mask)
+ pt =
+ (uint64_t *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
+ else
+ pt =
+ (uint64_t *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
+ *pdata = *pt;
+ }
+
+ return 0;
+}
+
+
+
+static int get_msr_hyperv(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata)
+{
+ uint64_t data = 0;
+
+ switch (msr) {
+#ifdef XXX
+ case HV_X64_MSR_VP_INDEX: {
+ int r;
+ struct kvm_vcpu *v;
+ kvm_for_each_vcpu(r, v, vcpu->kvm)
+ if (v == vcpu)
+ data = r;
+ break;
+ }
+ case HV_X64_MSR_EOI:
+ return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata);
+ case HV_X64_MSR_ICR:
+ return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata);
+ case HV_X64_MSR_TPR:
+ return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
+#endif /*XXX*/
+ default:
+ cmn_err(CE_WARN, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
+ return 1;
+ }
+ *pdata = data;
+ return 0;
+}
+
+static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata)
+{
+ uint64_t data = 0;
+ struct kvm *kvm = vcpu->kvm;
+
+ switch (msr) {
+ case HV_X64_MSR_GUEST_OS_ID:
+ data = kvm->arch.hv_guest_os_id;
+ break;
+ case HV_X64_MSR_HYPERCALL:
+ data = kvm->arch.hv_hypercall;
+ break;
+ default:
+ cmn_err(CE_WARN, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
+ return 1;
+ }
+
+ *pdata = data;
+ return 0;
+}
+
+static int get_msr_mce(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata)
+{
+ uint64_t data;
+ uint64_t mcg_cap = vcpu->arch.mcg_cap;
+ unsigned bank_num = mcg_cap & 0xff;
+
+ switch (msr) {
+ case MSR_IA32_P5_MC_ADDR:
+ case MSR_IA32_P5_MC_TYPE:
+ data = 0;
+ break;
+ case MSR_IA32_MCG_CAP:
+ data = vcpu->arch.mcg_cap;
+ break;
+ case MSR_IA32_MCG_CTL:
+ if (!(mcg_cap & MCG_CTL_P))
+ return 1;
+ data = vcpu->arch.mcg_ctl;
+ break;
+ case MSR_IA32_MCG_STATUS:
+ data = vcpu->arch.mcg_status;
+ break;
+ default:
+ if (msr >= MSR_IA32_MC0_CTL &&
+ msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
+ uint32_t offset = msr - MSR_IA32_MC0_CTL;
+ data = vcpu->arch.mce_banks[offset];
+ break;
+ }
+ return 1;
+ }
+ *pdata = data;
+ return 0;
+}
+
+
+int kvm_get_msr_common(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata)
+{
+ uint64_t data;
+ volatile int x; /*XXX - dtrace return probe is not there... */
+
+ switch (msr) {
+ case MSR_IA32_PLATFORM_ID:
+ case MSR_IA32_UCODE_REV:
+ case MSR_IA32_EBL_CR_POWERON:
+ case MSR_IA32_DEBUGCTLMSR:
+ case MSR_IA32_LASTBRANCHFROMIP:
+ case MSR_IA32_LASTBRANCHTOIP:
+ case MSR_IA32_LASTINTFROMIP:
+ case MSR_IA32_LASTINTTOIP:
+ case MSR_K8_SYSCFG:
+ case MSR_K7_HWCR:
+ case MSR_VM_HSAVE_PA:
+ case MSR_P6_PERFCTR0:
+ case MSR_P6_PERFCTR1:
+ case MSR_P6_EVNTSEL0:
+ case MSR_P6_EVNTSEL1:
+ case MSR_K7_EVNTSEL0:
+ case MSR_K7_PERFCTR0:
+ case MSR_K8_INT_PENDING_MSG:
+ case MSR_AMD64_NB_CFG:
+ case MSR_FAM10H_MMIO_CONF_BASE:
+ data = 0;
+ break;
+ case MSR_MTRRcap:
+ data = 0x500 | KVM_NR_VAR_MTRR;
+ break;
+ case 0x200 ... 0x2ff:
+ return get_msr_mtrr(vcpu, msr, pdata);
+ case 0xcd: /* fsb frequency */
+ data = 3;
+ break;
+ case MSR_IA32_APICBASE:
+ data = kvm_get_apic_base(vcpu);
+ break;
+#ifdef XXX
+ case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
+ return kvm_x2apic_msr_read(vcpu, msr, pdata);
+ break;
+#endif /*XXX*/
+ case MSR_IA32_MISC_ENABLE:
+ data = vcpu->arch.ia32_misc_enable_msr;
+ break;
+ case MSR_IA32_PERF_STATUS:
+ /* TSC increment by tick */
+ data = 1000ULL;
+ /* CPU multiplier */
+ data |= (((uint64_t)4ULL) << 40);
+ break;
+ case MSR_EFER:
+ data = vcpu->arch.efer;
+ break;
+ case MSR_KVM_WALL_CLOCK:
+ data = vcpu->kvm->arch.wall_clock;
+ break;
+ case MSR_KVM_SYSTEM_TIME:
+ data = vcpu->arch.time;
+ break;
+ case MSR_IA32_P5_MC_ADDR:
+ case MSR_IA32_P5_MC_TYPE:
+ case MSR_IA32_MCG_CAP:
+ case MSR_IA32_MCG_CTL:
+ case MSR_IA32_MCG_STATUS:
+ case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
+ return get_msr_mce(vcpu, msr, pdata);
+ case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
+ if (kvm_hv_msr_partition_wide(msr)) {
+ int r;
+ mutex_enter(&vcpu->kvm->lock);
+ r = get_msr_hyperv_pw(vcpu, msr, pdata);
+ mutex_exit(&vcpu->kvm->lock);
+ return r;
+ } else
+ return get_msr_hyperv(vcpu, msr, pdata);
+ break;
+ default:
+ if (!ignore_msrs) {
+ cmn_err(CE_NOTE, "unhandled rdmsr: 0x%x\n", msr);
+ return 1;
+ } else {
+ cmn_err(CE_NOTE, "ignored rdmsr: 0x%x\n", msr);
+ data = 0;
+ }
+ break;
+ }
+ *pdata = data;
+ x = 10; /*XXX*/
+ return 0;
+}
+
+/*
+ * Read or write a bunch of msrs. All parameters are kernel addresses.
+ *
+ * @return number of msrs set successfully.
+ */
+static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
+ struct kvm_msr_entry *entries,
+ int (*do_msr)(struct kvm_vcpu *vcpu,
+ unsigned index, uint64_t *data))
+{
+ int i, idx;
+
+ vcpu_load(vcpu);
+
+#ifdef XXX
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+#endif
+ for (i = 0; i < msrs->nmsrs; ++i)
+ if (do_msr(vcpu, entries[i].index, &entries[i].data))
+ break;
+#ifdef XXX
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+#endif
+ vcpu_put(vcpu);
+
+ return i;
+}
+
+/*
+ * reads and returns guest's timestamp counter "register"
+ * guest_tsc = host_tsc + tsc_offset -- 21.3
+ */
+static uint64_t guest_read_tsc(void)
+{
+ uint64_t host_tsc, tsc_offset;
+
+ rdtscll(host_tsc);
+ tsc_offset = vmcs_read64(TSC_OFFSET);
+ return host_tsc + tsc_offset;
+}
+
+
+/*
+ * Reads an msr value (of 'msr_index') into 'pdata'.
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ */
+static int vmx_get_msr(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t *pdata)
+{
+ uint64_t data;
+ struct shared_msr_entry *msr;
+
+ if (!pdata) {
+ cmn_err(CE_WARN, "BUG: get_msr called with NULL pdata\n");
+ return EINVAL;
+ }
+
+ switch (msr_index) {
+#ifdef CONFIG_X86_64
+ case MSR_FS_BASE:
+ data = vmcs_readl(GUEST_FS_BASE);
+ break;
+ case MSR_GS_BASE:
+ data = vmcs_readl(GUEST_GS_BASE);
+ break;
+ case MSR_KERNEL_GS_BASE:
+ vmx_load_host_state((struct vcpu_vmx *)vcpu);
+ data = ((struct vcpu_vmx *)(vcpu))->msr_guest_kernel_gs_base;
+ break;
+#endif
+ case MSR_EFER:
+ return kvm_get_msr_common(vcpu, msr_index, pdata);
+ case MSR_IA32_TSC:
+ data = guest_read_tsc();
+ break;
+ case MSR_IA32_SYSENTER_CS:
+ data = vmcs_read32(GUEST_SYSENTER_CS);
+ break;
+ case MSR_IA32_SYSENTER_EIP:
+ data = vmcs_readl(GUEST_SYSENTER_EIP);
+ break;
+ case MSR_IA32_SYSENTER_ESP:
+ data = vmcs_readl(GUEST_SYSENTER_ESP);
+ break;
+ case MSR_TSC_AUX:
+ if (!((struct vcpu_vmx *)(vcpu))->rdtscp_enabled)
+ return 1;
+ /* Otherwise falls through */
+ default:
+ vmx_load_host_state((struct vcpu_vmx *)vcpu);
+ msr = find_msr_entry((struct vcpu_vmx *)vcpu, msr_index);
+ if (msr) {
+ vmx_load_host_state((struct vcpu_vmx *)vcpu);
+ data = msr->data;
+ break;
+ }
+ return kvm_get_msr_common(vcpu, msr_index, pdata);
+ }
+
+ *pdata = data;
+ return 0;
+}
+
+/*
+ * Reads an msr value (of 'msr_index') into 'pdata'.
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ */
+int kvm_get_msr(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t *pdata)
+{
+ return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
+}
+
+
+/*
+ * Writes msr value into the appropriate "register".
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ */
+static int vmx_set_msr(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t data)
+{
+ struct vcpu_vmx *vmx = (struct vcpu_vmx *)vcpu;
+ struct shared_msr_entry *msr;
+ uint64_t host_tsc;
+ int ret = 0;
+
+ switch (msr_index) {
+ case MSR_EFER:
+ vmx_load_host_state(vmx);
+ ret = kvm_set_msr_common(vcpu, msr_index, data);
+ break;
+#ifdef CONFIG_X86_64
+ case MSR_FS_BASE:
+ vmcs_writel(GUEST_FS_BASE, data);
+ break;
+ case MSR_GS_BASE:
+ vmcs_writel(GUEST_GS_BASE, data);
+ break;
+ case MSR_KERNEL_GS_BASE:
+ vmx_load_host_state(vmx);
+ vmx->msr_guest_kernel_gs_base = data;
+ break;
+#endif
+ case MSR_IA32_SYSENTER_CS:
+ vmcs_write32(GUEST_SYSENTER_CS, data);
+ break;
+ case MSR_IA32_SYSENTER_EIP:
+ vmcs_writel(GUEST_SYSENTER_EIP, data);
+ break;
+ case MSR_IA32_SYSENTER_ESP:
+ vmcs_writel(GUEST_SYSENTER_ESP, data);
+ break;
+ case MSR_IA32_TSC:
+ rdtscll(host_tsc);
+ guest_write_tsc(data, host_tsc);
+ break;
+ case MSR_IA32_CR_PAT:
+ if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
+ vmcs_write64(GUEST_IA32_PAT, data);
+ vcpu->arch.pat = data;
+ break;
+ }
+ ret = kvm_set_msr_common(vcpu, msr_index, data);
+ break;
+ case MSR_TSC_AUX:
+ if (!vmx->rdtscp_enabled)
+ return 1;
+ /* Check reserved bit, higher 32 bits should be zero */
+ if ((data >> 32) != 0)
+ return 1;
+ /* Otherwise falls through */
+ default:
+ msr = find_msr_entry(vmx, msr_index);
+ if (msr) {
+ vmx_load_host_state(vmx);
+ msr->data = data;
+ break;
+ }
+ ret = kvm_set_msr_common(vcpu, msr_index, data);
+ }
+
+ return ret;
+}
+
+/*
+ * Writes msr value into the appropriate "register".
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ */
+int kvm_set_msr(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t data)
+{
+ return kvm_x86_ops->set_msr(vcpu, msr_index, data);
+}
+
+/*
+ * Adapt set_msr() to msr_io()'s calling convention
+ */
+static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, uint64_t *data)
+{
+ return kvm_set_msr(vcpu, index, *data);
+}
+
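+/*
+ * A VM exit reports a machine check when the interruption information is
+ * valid, the type is a hardware exception, and the vector is #MC.
+ */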
+static inline int is_machine_check(uint32_t intr_info)
+{
+ return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
+ INTR_INFO_VALID_MASK)) ==
+ (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
+}
+
+/*
+ * Trigger machine check on the host. We assume all the MSRs are already set up
+ * by the CPU and that we still run on the same CPU as the MCE occurred on.
+ * We pass a fake environment to the machine check handler because we want
+ * the guest to be always treated like user space, no matter what context
+ * it used internally.
+ */
+static void kvm_machine_check(void)
+{
+#ifdef XXX
+#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
+ struct pt_regs regs = {
+ .cs = 3, /* Fake ring 3 no matter what the guest ran on */
+ .flags = X86_EFLAGS_IF,
+ };
+
+ do_machine_check(&regs, 0);
+#endif
+#endif /*XXX*/
+}
+
+static void vmcs_clear_bits(unsigned long field, uint32_t mask)
+{
+ vmcs_writel(field, vmcs_readl(field) & ~mask);
+}
+
+static void vmcs_set_bits(unsigned long field, uint32_t mask)
+{
+ vmcs_writel(field, vmcs_readl(field) | mask);
+}
+
+#define EXCPT_BENIGN 0
+#define EXCPT_CONTRIBUTORY 1
+#define EXCPT_PF 2
+
+static int exception_class(int vector)
+{
+ switch (vector) {
+ case PF_VECTOR:
+ return EXCPT_PF;
+ case DE_VECTOR:
+ case TS_VECTOR:
+ case NP_VECTOR:
+ case SS_VECTOR:
+ case GP_VECTOR:
+ return EXCPT_CONTRIBUTORY;
+ default:
+ break;
+ }
+ return EXCPT_BENIGN;
+}
+
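+/*
+ * Queue an exception, escalating when the SDM's exception class rules
+ * require it: two contributory exceptions, or a #PF followed by anything
+ * other than a benign exception, become a double fault, and a pending #DF
+ * becomes a triple-fault shutdown request.
+ */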
+static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
+ unsigned nr, int has_error, uint32_t error_code)
+{
+ uint32_t prev_nr;
+ int class1, class2;
+
+ if (!vcpu->arch.exception.pending) {
+ queue:
+ vcpu->arch.exception.pending = 1;
+ vcpu->arch.exception.has_error_code = has_error;
+ vcpu->arch.exception.nr = nr;
+ vcpu->arch.exception.error_code = error_code;
+ return;
+ }
+
+ /* to check exception */
+ prev_nr = vcpu->arch.exception.nr;
+ if (prev_nr == DF_VECTOR) {
+ /* triple fault -> shutdown */
+ BT_SET(&vcpu->requests, KVM_REQ_TRIPLE_FAULT);
+ return;
+ }
+ class1 = exception_class(prev_nr);
+ class2 = exception_class(nr);
+ if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
+ || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
+ /* generate double fault per SDM Table 5-5 */
+ vcpu->arch.exception.pending = 1;
+ vcpu->arch.exception.has_error_code = 1;
+ vcpu->arch.exception.nr = DF_VECTOR;
+ vcpu->arch.exception.error_code = 0;
+ } else
+		/* replace the previous exception with the new one in the hope
+		   that instruction re-execution will regenerate the lost
+		   exception */
+ goto queue;
+}
+
+void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
+{
+ kvm_multiple_exception(vcpu, nr, 0, 0);
+}
+
+void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, uint32_t error_code)
+{
+ kvm_multiple_exception(vcpu, nr, 1, error_code);
+}
+
+
+static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
+{
+ uint32_t exit_intr_info;
+ uint32_t idt_vectoring_info = vmx->idt_vectoring_info;
+ int unblock_nmi;
+ uint8_t vector;
+ int type;
+ int idtv_info_valid;
+
+ exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+ vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
+
+ /* Handle machine checks before interrupts are enabled */
+ if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
+ || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI
+ && is_machine_check(exit_intr_info)))
+ kvm_machine_check();
+
+ /* We need to handle NMIs before interrupts are enabled */
+ if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
+ (exit_intr_info & INTR_INFO_VALID_MASK))
+ asm("int $2");
+
+ idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
+
+#ifdef XXX
+ if (cpu_has_virtual_nmis()) {
+ unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
+ vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
+ /*
+ * SDM 3: 27.7.1.2 (September 2008)
+ * Re-set bit "block by NMI" before VM entry if vmexit caused by
+ * a guest IRET fault.
+ * SDM 3: 23.2.2 (September 2008)
+ * Bit 12 is undefined in any of the following cases:
+ * If the VM exit sets the valid bit in the IDT-vectoring
+ * information field.
+ * If the VM exit is due to a double fault.
+ */
+ if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
+ vector != DF_VECTOR && !idtv_info_valid)
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+ } else if (unlikely(vmx->soft_vnmi_blocked))
+ vmx->vnmi_blocked_time +=
+ ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
+#endif /*XXX*/
+ vmx->vcpu.arch.nmi_injected = 0;
+#ifdef XXX
+ kvm_clear_exception_queue(&vmx->vcpu);
+ kvm_clear_interrupt_queue(&vmx->vcpu);
+
+ if (!idtv_info_valid)
+ return;
+#endif /*XXX*/
+ vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
+ type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
+
+ switch (type) {
+ case INTR_TYPE_NMI_INTR:
+ vmx->vcpu.arch.nmi_injected = 1;
+ /*
+ * SDM 3: 27.7.1.2 (September 2008)
+		 * Clear bit "block by NMI" before VM entry if an NMI
+ * delivery faulted.
+ */
+ vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+ break;
+ case INTR_TYPE_SOFT_EXCEPTION:
+ vmx->vcpu.arch.event_exit_inst_len =
+ vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ /* fall through */
+ case INTR_TYPE_HARD_EXCEPTION:
+#ifdef XXX
+ if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
+ uint32_t err = vmcs_read32(IDT_VECTORING_ERROR_CODE);
+ kvm_queue_exception_e(&vmx->vcpu, vector, err);
+ } else
+ kvm_queue_exception(&vmx->vcpu, vector);
+#endif /*XXX*/
+ break;
+ case INTR_TYPE_SOFT_INTR:
+ vmx->vcpu.arch.event_exit_inst_len =
+ vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ /* fall through */
+ case INTR_TYPE_EXT_INTR:
+#ifdef XXX
+ kvm_queue_interrupt(&vmx->vcpu, vector,
+ type == INTR_TYPE_SOFT_INTR);
+#endif /*XXX*/
+ break;
+ default:
+ break;
+ }
+}
+
+#ifdef CONFIG_X86_64
+#define R "r"
+#define Q "q"
+#else
+#define R "e"
+#define Q "l"
+#endif
+
+/*
+ * Volatile isn't enough to prevent the compiler from reordering the
+ * read/write functions for the control registers and messing everything up.
+ * A memory clobber would solve the problem, but would prevent reordering of
+ * all loads and stores around it, which can hurt performance. The solution is to
+ * use a variable and mimic reads and writes to it to enforce serialization
+ */
+static unsigned long __force_order;
+
+static inline unsigned long native_read_cr0(void)
+{
+ unsigned long val;
+ asm volatile("mov %%cr0,%0\n\t" : "=r" (val), "=m" (__force_order));
+ return val;
+}
+
+#define read_cr0() (native_read_cr0())
+
+static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = (struct vcpu_vmx *)vcpu;
+
+ /* Record the guest's net vcpu time for enforced NMI injections. */
+#ifdef XXX
+ if (!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)
+ vmx->entry_time = ktime_get();
+
+ /* Don't enter VMX if guest state is invalid, let the exit handler
+ start emulation until we arrive back to a valid state */
+ if (vmx->emulation_required && emulate_invalid_guest_state)
+ return;
+
+ if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
+ vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
+ if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
+ vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+
+ /* When single-stepping over STI and MOV SS, we must clear the
+ * corresponding interruptibility bits in the guest state. Otherwise
+ * vmentry fails as it then expects bit 14 (BS) in pending debug
+ * exceptions being set, but that's not correct for the guest debugging
+ * case. */
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+ vmx_set_interrupt_shadow(vcpu, 0);
+#endif /*XXX*/
+
+ /*
+ * Loading guest fpu may have cleared host cr0.ts
+ */
+ vmcs_writel(HOST_CR0, read_cr0());
+
+ asm(
+ /* Store host registers */
+ "push %%"R"dx; push %%"R"bp;"
+ "push %%"R"cx \n\t"
+ "cmp %%"R"sp, %c[host_rsp](%0) \n\t"
+ "je 1f \n\t"
+ "mov %%"R"sp, %c[host_rsp](%0) \n\t"
+ __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
+ "1: \n\t"
+ /* Reload cr2 if changed */
+ "mov %c[cr2](%0), %%"R"ax \n\t"
+ "mov %%cr2, %%"R"dx \n\t"
+ "cmp %%"R"ax, %%"R"dx \n\t"
+ "je 2f \n\t"
+ "mov %%"R"ax, %%cr2 \n\t"
+ "2: \n\t"
+		/* Check if vmlaunch or vmresume is needed */
+ "cmpl $0, %c[launched](%0) \n\t"
+ /* Load guest registers. Don't clobber flags. */
+ "mov %c[rax](%0), %%"R"ax \n\t"
+ "mov %c[rbx](%0), %%"R"bx \n\t"
+ "mov %c[rdx](%0), %%"R"dx \n\t"
+ "mov %c[rsi](%0), %%"R"si \n\t"
+ "mov %c[rdi](%0), %%"R"di \n\t"
+ "mov %c[rbp](%0), %%"R"bp \n\t"
+#ifdef CONFIG_X86_64
+ "mov %c[r8](%0), %%r8 \n\t"
+ "mov %c[r9](%0), %%r9 \n\t"
+ "mov %c[r10](%0), %%r10 \n\t"
+ "mov %c[r11](%0), %%r11 \n\t"
+ "mov %c[r12](%0), %%r12 \n\t"
+ "mov %c[r13](%0), %%r13 \n\t"
+ "mov %c[r14](%0), %%r14 \n\t"
+ "mov %c[r15](%0), %%r15 \n\t"
+#endif
+ "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */
+
+ /* Enter guest mode */
+ "jne .Llaunched \n\t"
+ __ex(ASM_VMX_VMLAUNCH) "\n\t"
+ "jmp .Lkvm_vmx_return \n\t"
+ ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
+ ".Lkvm_vmx_return: "
+ /* Save guest registers, load host registers, keep flags */
+ "xchg %0, (%%"R"sp) \n\t"
+ "mov %%"R"ax, %c[rax](%0) \n\t"
+ "mov %%"R"bx, %c[rbx](%0) \n\t"
+ "push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t"
+ "mov %%"R"dx, %c[rdx](%0) \n\t"
+ "mov %%"R"si, %c[rsi](%0) \n\t"
+ "mov %%"R"di, %c[rdi](%0) \n\t"
+ "mov %%"R"bp, %c[rbp](%0) \n\t"
+#ifdef CONFIG_X86_64
+ "mov %%r8, %c[r8](%0) \n\t"
+ "mov %%r9, %c[r9](%0) \n\t"
+ "mov %%r10, %c[r10](%0) \n\t"
+ "mov %%r11, %c[r11](%0) \n\t"
+ "mov %%r12, %c[r12](%0) \n\t"
+ "mov %%r13, %c[r13](%0) \n\t"
+ "mov %%r14, %c[r14](%0) \n\t"
+ "mov %%r15, %c[r15](%0) \n\t"
+#endif
+ "mov %%cr2, %%"R"ax \n\t"
+ "mov %%"R"ax, %c[cr2](%0) \n\t"
+
+ "pop %%"R"bp; pop %%"R"bp; pop %%"R"dx \n\t"
+ "setbe %c[fail](%0) \n\t"
+ : : "c"(vmx), "d"((unsigned long)HOST_RSP),
+ [launched]"i"(offsetof(struct vcpu_vmx, launched)),
+ [fail]"i"(offsetof(struct vcpu_vmx, fail)),
+ [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
+ [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
+ [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
+ [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
+ [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
+ [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
+ [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
+ [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
+#ifdef CONFIG_X86_64
+ [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
+ [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
+ [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
+ [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
+ [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
+ [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
+ [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
+ [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
+#endif
+ [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
+ : "cc", "memory"
+ , R"bx", R"di", R"si"
+#ifdef CONFIG_X86_64
+ , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
+#endif
+ );
+
+ vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
+ | (1 << VCPU_EXREG_PDPTR));
+ vcpu->arch.regs_dirty = 0;
+
+ vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+
+#ifdef XXX
+ if (vmx->rmode.irq.pending)
+ fixup_rmode_irq(vmx);
+#endif /*XXX*/
+
+ asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
+ vmx->launched = 1;
+
+ vmx_complete_interrupts(vmx);
+}
+
+#undef R
+#undef Q
+
+void kvm_set_shared_msr(unsigned slot, uint64_t value, uint64_t mask)
+{
+#ifdef XXX
+ struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
+
+ if (((value ^ smsr->values[slot].curr) & mask) == 0)
+ return;
+ smsr->values[slot].curr = value;
+ wrmsrl(shared_msrs_global.msrs[slot], value);
+ if (!smsr->registered) {
+ smsr->urn.on_user_return = kvm_on_user_return;
+ user_return_notifier_register(&smsr->urn);
+ smsr->registered = 1;
+ }
+#endif /*XXX*/
+}
+static void vmx_save_host_state(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int i;
+
+ if (vmx->host_state.loaded)
+ return;
+
+ vmx->host_state.loaded = 1;
+ /*
+ * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
+ * allow segment selectors with cpl > 0 or ti == 1.
+ */
+ vmx->host_state.ldt_sel = kvm_read_ldt();
+ vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
+ vmx->host_state.fs_sel = kvm_read_fs();
+ if (!(vmx->host_state.fs_sel & 7)) {
+ vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
+ vmx->host_state.fs_reload_needed = 0;
+ } else {
+ vmcs_write16(HOST_FS_SELECTOR, 0);
+ vmx->host_state.fs_reload_needed = 1;
+ }
+ vmx->host_state.gs_sel = kvm_read_gs();
+ if (!(vmx->host_state.gs_sel & 7))
+ vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
+ else {
+ vmcs_write16(HOST_GS_SELECTOR, 0);
+ vmx->host_state.gs_ldt_reload_needed = 1;
+ }
+
+#ifdef CONFIG_X86_64
+ vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
+ vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
+#else
+ vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
+ vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
+#endif
+
+#ifdef CONFIG_X86_64
+ if (is_long_mode(&vmx->vcpu)) {
+ rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
+ wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+ }
+#endif
+ for (i = 0; i < vmx->save_nmsrs; ++i)
+ kvm_set_shared_msr(vmx->guest_msrs[i].index,
+ vmx->guest_msrs[i].data,
+ vmx->guest_msrs[i].mask);
+}
+
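+/*
+ * External interrupts may be injected only when the guest's RFLAGS.IF is
+ * set and it is not in an STI or MOV-SS interrupt shadow.
+ */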
+int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
+{
+ return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
+ !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+ (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
+}
+
+int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
+{
+ return kvm_x86_ops->interrupt_allowed(vcpu);
+}
+
+static int handle_machine_check(struct kvm_vcpu *vcpu)
+{
+ /* already handled by vcpu_run */
+ return 1;
+}
+
+
+static inline int is_page_fault(uint32_t intr_info)
+{
+ return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
+ INTR_INFO_VALID_MASK)) ==
+ (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
+}
+
+
+static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
+ struct kvm_vcpu *vcpu, uint32_t access,
+ uint32_t *error)
+{
+ void *data = val;
+ int r = /*X86EMUL_CONTINUE*/ 0;
+
+ while (bytes) {
+ gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error);
+ unsigned offset = addr & (PAGESIZE-1);
+ unsigned toread = min(bytes, (unsigned)PAGESIZE - offset);
+ int ret;
+
+ if (gpa == UNMAPPED_GVA) {
+ r = /*X86EMUL_PROPAGATE_FAULT*/1;
+ goto out;
+ }
+ ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
+ if (ret < 0) {
+ r = /*X86EMUL_UNHANDLEABLE*/ 1;
+ goto out;
+ }
+
+ bytes -= toread;
+ data += toread;
+ addr += toread;
+ }
+out:
+ return r;
+}
+
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
+ uint32_t error_code)
+{
+#ifdef XXX
+ ++vcpu->stat.pf_guest;
+#endif /*XXX*/
+ vcpu->arch.cr2 = addr;
+ kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
+}
+
+static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
+ struct kvm_vcpu *vcpu, uint32_t *error)
+{
+ return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error);
+}
+
+static int vmx_get_cpl(struct kvm_vcpu *vcpu)
+{
+ if (!is_protmode(vcpu))
+ return 0;
+
+ if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */
+ return 3;
+
+ return vmcs_read16(GUEST_CS_SELECTOR) & 3;
+}
+
+
+/* used for instruction fetching */
+static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes,
+ struct kvm_vcpu *vcpu, uint32_t *error)
+{
+ uint32_t access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
+ access | PFERR_FETCH_MASK, error);
+}
+
+static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
+ const void *v)
+{
+#ifdef XXX
+ if (vcpu->arch.apic &&
+ !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v))
+ return 0;
+
+ return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
+#else
+ return 0;
+#endif /*XXX*/
+}
+
+static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
+{
+#ifdef XXX
+ if (vcpu->arch.apic &&
+ !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v))
+ return 0;
+
+ return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
+#else
+ return 0;
+#endif /*XXX*/
+}
+
+gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, uint32_t *error)
+{
+#ifdef XXX
+ uint32_t access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
+#else
+ return UNMAPPED_GVA;
+#endif
+}
+
+static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
+ struct kvm_vcpu *vcpu, uint32_t *error)
+{
+ uint32_t access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
+ error);
+}
+
+static int emulator_read_emulated(unsigned long addr,
+ void *val,
+ unsigned int bytes,
+ struct kvm_vcpu *vcpu)
+{
+ gpa_t gpa;
+ uint32_t error_code;
+
+ if (vcpu->mmio_read_completed) {
+ memcpy(val, vcpu->mmio_data, bytes);
+#ifdef XXX
+ trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
+ vcpu->mmio_phys_addr, *(uint64_t *)val);
+#endif /*XXX*/
+ vcpu->mmio_read_completed = 0;
+ return X86EMUL_CONTINUE;
+ }
+
+ gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code);
+
+ if (gpa == UNMAPPED_GVA) {
+ kvm_inject_page_fault(vcpu, addr, error_code);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+
+ /* For APIC access vmexit */
+ if ((gpa & PAGEMASK) == APIC_DEFAULT_PHYS_BASE)
+ goto mmio;
+
+ if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL)
+ == X86EMUL_CONTINUE)
+ return X86EMUL_CONTINUE;
+
+mmio:
+ /*
+ * Is this MMIO handled locally?
+ */
+ if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) {
+#ifdef XXX
+ trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(uint64_t *)val);
+#endif /*XXX*/
+ return X86EMUL_CONTINUE;
+ }
+
+#ifdef XXX
+ trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
+#endif /*XXX*/
+
+ vcpu->mmio_needed = 1;
+ vcpu->mmio_phys_addr = gpa;
+ vcpu->mmio_size = bytes;
+ vcpu->mmio_is_write = 0;
+
+ return X86EMUL_UNHANDLEABLE;
+}
+
+int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
+ const void *val, int bytes)
+{
+ int ret;
+
+ ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
+ if (ret < 0)
+ return 0;
+#ifdef XXX
+ kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
+#endif /*XXX*/
+ return 1;
+}
+
+gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, uint32_t *error)
+{
+#ifdef XXX
+ uint32_t access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ access |= PFERR_WRITE_MASK;
+ return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
+#else
+ return UNMAPPED_GVA;
+#endif
+}
+
+static int emulator_write_emulated_onepage(unsigned long addr,
+ const void *val,
+ unsigned int bytes,
+ struct kvm_vcpu *vcpu)
+{
+ gpa_t gpa;
+ uint32_t error_code;
+
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code);
+
+ if (gpa == UNMAPPED_GVA) {
+ kvm_inject_page_fault(vcpu, addr, error_code);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+
+ /* For APIC access vmexit */
+ if ((gpa & PAGEMASK) == APIC_DEFAULT_PHYS_BASE)
+ goto mmio;
+
+ if (emulator_write_phys(vcpu, gpa, val, bytes))
+ return X86EMUL_CONTINUE;
+
+mmio:
+#ifdef XXX
+ trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(uint64_t *)val);
+#endif /*XXX*/
+ /*
+ * Is this MMIO handled locally?
+ */
+ if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
+ return X86EMUL_CONTINUE;
+
+ vcpu->mmio_needed = 1;
+ vcpu->mmio_phys_addr = gpa;
+ vcpu->mmio_size = bytes;
+ vcpu->mmio_is_write = 1;
+ memcpy(vcpu->mmio_data, val, bytes);
+
+ return X86EMUL_CONTINUE;
+}
+
+int emulator_write_emulated(unsigned long addr,
+ const void *val,
+ unsigned int bytes,
+ struct kvm_vcpu *vcpu)
+{
+ /* Crossing a page boundary? */
+ if (((addr + bytes - 1) ^ addr) & PAGEMASK) {
+ int rc, now;
+
+ now = -addr & ~PAGEMASK;
+ rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ addr += now;
+ val += now;
+ bytes -= now;
+ }
+ return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
+}
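/*
 * Illustrative user-space sketch (not part of the module): shows how the
 * "now = -addr & ~PAGEMASK" arithmetic in emulator_write_emulated() above
 * splits a write that crosses a page boundary.  PAGE_SZ and the sample
 * address are assumptions for the demo, not values from the port.
 */
#include <stdio.h>

#define PAGE_SZ   4096UL
#define PAGE_MASK (~(PAGE_SZ - 1))	/* same convention as PAGEMASK above */

int
main(void)
{
	unsigned long addr = 0x1ff8;	/* 8 bytes before a page boundary */
	unsigned int bytes = 16;	/* so the write crosses into the next page */

	if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
		unsigned int now = -addr & ~PAGE_MASK;	/* bytes left in this page */
		printf("first chunk:  %u bytes at 0x%lx\n", now, addr);
		printf("second chunk: %u bytes at 0x%lx\n",
		    bytes - now, addr + now);
	}
	return (0);
}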
+
+static int emulator_cmpxchg_emulated(unsigned long addr,
+ const void *old,
+ const void *new,
+ unsigned int bytes,
+ struct kvm_vcpu *vcpu)
+{
+ cmn_err(CE_WARN, "kvm: emulating exchange as write\n");
+#ifndef CONFIG_X86_64
+	/* a guest's cmpxchg8b has to be emulated atomically */
+ if (bytes == 8) {
+ gpa_t gpa;
+ struct page *page;
+ char *kaddr;
+ uint64_t val;
+
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
+
+ if (gpa == UNMAPPED_GVA ||
+ (gpa & PAGEMASK) == APIC_DEFAULT_PHYS_BASE)
+ goto emul_write;
+
+ if (((gpa + bytes - 1) & PAGEMASK) != (gpa & PAGEMASK))
+ goto emul_write;
+
+ val = *(uint64_t *)new;
+
+ page = gfn_to_page(vcpu->kvm, gpa >> PAGESHIFT);
+
+ kaddr = kmap_atomic(page, KM_USER0);
+ set_64bit((uint64_t *)(kaddr + offset_in_page(gpa)), val);
+ kunmap_atomic(kaddr, KM_USER0);
+ kvm_release_page_dirty(page);
+ }
+emul_write:
+#endif
+
+ return emulator_write_emulated(addr, new, bytes, vcpu);
+}
+
+static struct x86_emulate_ops emulate_ops = {
+ .read_std = kvm_read_guest_virt_system,
+ .fetch = kvm_fetch_guest_virt,
+ .read_emulated = emulator_read_emulated,
+ .write_emulated = emulator_write_emulated,
+ .cmpxchg_emulated = emulator_cmpxchg_emulated,
+};
+
+static void cache_all_regs(struct kvm_vcpu *vcpu)
+{
+ kvm_register_read(vcpu, VCPU_REGS_RAX);
+ kvm_register_read(vcpu, VCPU_REGS_RSP);
+ kvm_register_read(vcpu, VCPU_REGS_RIP);
+ vcpu->arch.regs_dirty = ~0;
+}
+
+int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ gpa_t gpa;
+ int r;
+#ifdef XXX
+ if (tdp_enabled)
+ return 0;
+
+ gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
+
+ mutex_enter(&vcpu->kvm->mmu_lock);
+ r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGESHIFT);
+ mutex_exit(&vcpu->kvm->mmu_lock);
+ return r;
+#else
+ return 0;
+#endif /*XXX*/
+}
+
+int emulate_instruction(struct kvm_vcpu *vcpu,
+ unsigned long cr2,
+ uint16_t error_code,
+ int emulation_type)
+{
+ int r, shadow_mask;
+ struct decode_cache *c;
+ struct kvm_run *run = vcpu->run;
+
+#ifdef XXX
+ kvm_clear_exception_queue(vcpu);
+#endif /*XXX*/
+ vcpu->arch.mmio_fault_cr2 = cr2;
+	/*
+	 * TODO: fix emulate.c to use guest_read/write_register
+	 * instead of direct ->regs accesses; that can save hundreds of
+	 * cycles on Intel for instructions that don't read or change
+	 * RSP, for example.
+	 */
+ cache_all_regs(vcpu);
+
+ vcpu->mmio_is_write = 0;
+ vcpu->arch.pio.string = 0;
+
+ if (!(emulation_type & EMULTYPE_NO_DECODE)) {
+ int cs_db, cs_l;
+ kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+
+ vcpu->arch.emulate_ctxt.vcpu = vcpu;
+ vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
+ vcpu->arch.emulate_ctxt.mode =
+ (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
+ (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
+ ? X86EMUL_MODE_VM86 : cs_l
+ ? X86EMUL_MODE_PROT64 : cs_db
+ ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
+
+ r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
+
+		/*
+		 * Only allow emulation of specific instructions on #UD
+		 * (namely VMMCALL, sysenter, sysexit, syscall).
+		 */
+ c = &vcpu->arch.emulate_ctxt.decode;
+ if (emulation_type & EMULTYPE_TRAP_UD) {
+ if (!c->twobyte)
+ return EMULATE_FAIL;
+ switch (c->b) {
+ case 0x01: /* VMMCALL */
+ if (c->modrm_mod != 3 || c->modrm_rm != 1)
+ return EMULATE_FAIL;
+ break;
+ case 0x34: /* sysenter */
+ case 0x35: /* sysexit */
+ if (c->modrm_mod != 0 || c->modrm_rm != 0)
+ return EMULATE_FAIL;
+ break;
+ case 0x05: /* syscall */
+ if (c->modrm_mod != 0 || c->modrm_rm != 0)
+ return EMULATE_FAIL;
+ break;
+ default:
+ return EMULATE_FAIL;
+ }
+
+ if (!(c->modrm_reg == 0 || c->modrm_reg == 3))
+ return EMULATE_FAIL;
+ }
+
+#ifdef XXX
+ ++vcpu->stat.insn_emulation;
+#endif /*XXX*/
+ if (r) {
+#ifdef XXX
+ ++vcpu->stat.insn_emulation_fail;
+#endif /*XXX*/
+ if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
+ return EMULATE_DONE;
+ return EMULATE_FAIL;
+ }
+ }
+
+ if (emulation_type & EMULTYPE_SKIP) {
+ kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip);
+ return EMULATE_DONE;
+ }
+
+ r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
+ shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
+
+ if (r == 0)
+ kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
+
+ if (vcpu->arch.pio.string)
+ return EMULATE_DO_MMIO;
+
+ if ((r || vcpu->mmio_is_write) && run) {
+ run->exit_reason = KVM_EXIT_MMIO;
+ run->mmio.phys_addr = vcpu->mmio_phys_addr;
+ memcpy(run->mmio.data, vcpu->mmio_data, 8);
+ run->mmio.len = vcpu->mmio_size;
+ run->mmio.is_write = vcpu->mmio_is_write;
+ }
+
+ if (r) {
+ if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
+ return EMULATE_DONE;
+ if (!vcpu->mmio_needed) {
+#ifdef XXX
+ kvm_report_emulation_failure(vcpu, "mmio");
+#endif /*XXX*/
+ return EMULATE_FAIL;
+ }
+ return EMULATE_DO_MMIO;
+ }
+
+ kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+
+ if (vcpu->mmio_is_write) {
+ vcpu->mmio_needed = 0;
+ return EMULATE_DO_MMIO;
+ }
+
+ return EMULATE_DONE;
+}
+
+/*
+ * Handle an exception or NMI vmexit.  Returns 1 if the exit was handled
+ * fully; otherwise sets up vcpu->run for userspace and returns 0.
+ */
+static int handle_exception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct kvm_run *kvm_run = vcpu->run;
+ uint32_t intr_info, ex_no, error_code;
+ unsigned long cr2, rip, dr6;
+ uint32_t vect_info;
+ enum emulation_result er;
+
+ vect_info = vmx->idt_vectoring_info;
+ intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+ if (is_machine_check(intr_info))
+ return handle_machine_check(vcpu);
+
+ if ((vect_info & VECTORING_INFO_VALID_MASK) &&
+ !is_page_fault(intr_info)) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
+ vcpu->run->internal.ndata = 2;
+ vcpu->run->internal.data[0] = vect_info;
+ vcpu->run->internal.data[1] = intr_info;
+ return 0;
+ }
+
+ if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
+ return 1; /* already handled by vmx_vcpu_run() */
+
+#ifdef XXX
+ if (is_no_device(intr_info)) {
+ vmx_fpu_activate(vcpu);
+ return 1;
+ }
+
+ if (is_invalid_opcode(intr_info)) {
+ er = emulate_instruction(vcpu, 0, 0, EMULTYPE_TRAP_UD);
+ if (er != EMULATE_DONE)
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+#endif /*XXX*/
+
+ error_code = 0;
+ rip = kvm_rip_read(vcpu);
+ if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
+ error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+ if (is_page_fault(intr_info)) {
+ /* EPT won't cause page fault directly */
+ if (enable_ept)
+ cmn_err(CE_PANIC, "page fault with ept enabled\n");
+ cr2 = vmcs_readl(EXIT_QUALIFICATION);
+#ifdef XXX
+ trace_kvm_page_fault(cr2, error_code);
+
+ if (kvm_event_needs_reinjection(vcpu))
+ kvm_mmu_unprotect_page_virt(vcpu, cr2);
+ return kvm_mmu_page_fault(vcpu, cr2, error_code);
+#else
+ return -1;
+#endif /*XXX*/
+ }
+
+#ifdef XXX
+ if (vmx->rmode.vm86_active &&
+ handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
+ error_code)) {
+ if (vcpu->arch.halt_request) {
+ vcpu->arch.halt_request = 0;
+ return kvm_emulate_halt(vcpu);
+ }
+ return 1;
+ }
+#endif /*XXX*/
+
+ ex_no = intr_info & INTR_INFO_VECTOR_MASK;
+ switch (ex_no) {
+ case DB_VECTOR:
+#ifdef XXX
+ dr6 = vmcs_readl(EXIT_QUALIFICATION);
+ if (!(vcpu->guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
+ vcpu->arch.dr6 = dr6 | DR6_FIXED_1;
+ kvm_queue_exception(vcpu, DB_VECTOR);
+ return 1;
+ }
+ kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
+ kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
+ /* fall through */
+#endif /*XXX*/
+ case BP_VECTOR:
+#ifdef XXX
+ /*
+ * Update instruction length as we may reinject #BP from
+ * user space while in guest debugging mode. Reading it for
+ * #DB as well causes no harm, it is not used in that case.
+ */
+ vmx->vcpu.arch.event_exit_inst_len =
+ vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
+ kvm_run->debug.arch.exception = ex_no;
+#endif /*XXX*/
+ break;
+ default:
+ kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
+ kvm_run->ex.exception = ex_no;
+ kvm_run->ex.error_code = error_code;
+ break;
+ }
+ return 0;
+}
+
+static int handle_external_interrupt(struct kvm_vcpu *vcpu)
+{
+#ifdef XXX
+ ++vcpu->stat.irq_exits;
+#endif /*XXX*/
+ return 1;
+}
+
+static int handle_triple_fault(struct kvm_vcpu *vcpu)
+{
+ vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
+ return 0;
+}
+
+static int handle_io(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification;
+ int size, in, string;
+ unsigned port;
+
+#ifdef XXX
+ ++vcpu->stat.io_exits;
+#endif /*XXX*/
+ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ string = (exit_qualification & 16) != 0;
+
+ if (string) {
+ if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO)
+ return 0;
+ return 1;
+ }
+
+ size = (exit_qualification & 7) + 1;
+ in = (exit_qualification & 8) != 0;
+ port = exit_qualification >> 16;
+#ifdef XXX
+	skip_emulated_instruction(vcpu);
+	return kvm_emulate_pio(vcpu, in, size, port);
+#else
+	return 0;	/* XXX: PIO emulation not wired up yet; exit to userspace */
+#endif /*XXX*/
+}
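/*
 * Illustrative user-space sketch (not part of the module): decodes the same
 * I/O-instruction exit qualification fields that handle_io() reads above
 * (size in bits 2:0 plus one, direction in bit 3, string flag in bit 4,
 * port number in bits 31:16).  The sample qualification value is made up.
 */
#include <stdio.h>

int
main(void)
{
	unsigned long exit_qualification = 0x03f60008;	/* a made-up 1-byte IN */
	int size = (exit_qualification & 7) + 1;	/* access size in bytes */
	int in = (exit_qualification & 8) != 0;		/* 1 = IN, 0 = OUT */
	int string = (exit_qualification & 16) != 0;	/* INS/OUTS */
	unsigned port = exit_qualification >> 16;	/* port number */

	printf("%s port 0x%x, %d byte(s), string=%d\n",
	    in ? "in" : "out", port, size, string);
	return (0);
}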
+
+static int handle_nmi_window(struct kvm_vcpu *vcpu)
+{
+ uint32_t cpu_based_vm_exec_control;
+
+ /* clear pending NMI */
+ cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+ cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
+ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+#ifdef XXX
+ ++vcpu->stat.nmi_window_exits;
+#endif /*XXX*/
+
+ return 1;
+}
+
+static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ enum emulation_result err = EMULATE_DONE;
+ int ret = 1;
+
+#ifdef XXX
+ while (!guest_state_valid(vcpu)) {
+ err = emulate_instruction(vcpu, 0, 0, 0);
+
+ if (err == EMULATE_DO_MMIO) {
+ ret = 0;
+ goto out;
+ }
+
+ if (err != EMULATE_DONE) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ ret = 0;
+ goto out;
+ }
+ if (signal_pending(current))
+ goto out;
+ if (need_resched())
+ schedule();
+ }
+#endif /*XXX*/
+
+ vmx->emulation_required = 0;
+out:
+ return ret;
+}
+
+void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+ cr0 |= X86_CR0_ET;
+
+#ifdef CONFIG_X86_64
+ if (cr0 & 0xffffffff00000000UL) {
+#ifdef XXX
+ kvm_inject_gp(vcpu, 0);
+#endif
+ return;
+ }
+#endif
+
+ cr0 &= ~CR0_RESERVED_BITS;
+
+ if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
+#ifdef XXX
+ kvm_inject_gp(vcpu, 0);
+#endif
+ return;
+ }
+
+ if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
+#ifdef XXX
+ kvm_inject_gp(vcpu, 0);
+#endif
+ return;
+ }
+
+ if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
+#ifdef CONFIG_X86_64
+#ifdef XXX
+ if ((vcpu->arch.efer & EFER_LME)) {
+ int cs_db, cs_l;
+
+ if (!is_pae(vcpu)) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
+
+ kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+ if (cs_l) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+
+ }
+ } else
+#endif /*XXX*/
+#endif
+#ifdef XXX
+ if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
+#endif /*XXX*/
+
+ }
+
+ kvm_x86_ops->set_cr0(vcpu, cr0);
+ vcpu->arch.cr0 = cr0;
+#ifdef XXX
+ kvm_mmu_reset_context(vcpu);
+#endif /*XXX*/
+ return;
+}
+
+static inline int constant_test_bit(int nr, const void *addr)
+{
+ const uint32_t *p = (const uint32_t *)addr;
+ return ((1UL << (nr & 31)) & (p[nr >> 5])) != 0;
+}
+static inline int variable_test_bit(int nr, const void *addr)
+{
+ uint8_t v;
+ const uint32_t *p = (const uint32_t *)addr;
+
+ asm("btl %2,%1; setc %0" : "=qm" (v) : "m" (*p), "Ir" (nr));
+ return v;
+}
+
+#define test_bit(nr,addr) \
+(__builtin_constant_p(nr) ? \
+ constant_test_bit((nr),(addr)) : \
+ variable_test_bit((nr),(addr)))
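/*
 * Illustrative user-space sketch (not part of the module): the test_bit()
 * macro above uses __builtin_constant_p() to pick constant_test_bit() when
 * the bit number is a compile-time constant and variable_test_bit() (the
 * btl/setc asm) otherwise.  This demo re-implements only the portable
 * constant path to show the word/bit indexing.
 */
#include <stdio.h>
#include <stdint.h>

static int
demo_test_bit(int nr, const void *addr)
{
	const uint32_t *p = (const uint32_t *)addr;

	/* word nr >> 5 holds bit nr; nr & 31 is the bit within that word */
	return (((1UL << (nr & 31)) & p[nr >> 5]) != 0);
}

int
main(void)
{
	uint32_t bitmap[2] = { 0 };

	bitmap[1] |= 1U << 3;	/* set bit 35 of the bitmap */
	printf("bit 35: %d\n", demo_test_bit(35, bitmap));
	printf("bit  3: %d\n", demo_test_bit(3, bitmap));
	return (0);
}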
+
+static int pdptrs_changed(struct kvm_vcpu *vcpu)
+{
+ uint64_t pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
+ int changed = 1;
+ int r;
+
+ if (is_long_mode(vcpu) || !is_pae(vcpu))
+ return 0;
+
+ if (!test_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_avail))
+ return 1;
+
+ r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
+ if (r < 0)
+ goto out;
+ changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
+out:
+
+ return changed;
+}
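/*
 * Illustrative user-space sketch (not part of the module): in PAE mode the
 * PDPT base is the CR3 value with its low five bits (flag/ignored bits)
 * masked off, which is what the "cr3 & ~31u" read in pdptrs_changed() above
 * relies on.  The sample CR3 value is made up for the demo.
 */
#include <stdio.h>

int
main(void)
{
	unsigned long cr3 = 0x1234567f;		/* made-up PAE-mode CR3 */
	unsigned long pdpt_base = cr3 & ~31u;	/* 32-byte-aligned PDPT address */

	printf("cr3 0x%lx -> pdpt base 0x%lx\n", cr3, pdpt_base);
	return (0);
}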
+
+void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+{
+ if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
+#ifdef XXX
+ kvm_mmu_sync_roots(vcpu);
+ kvm_mmu_flush_tlb(vcpu);
+#endif /*XXX*/
+ return;
+ }
+
+ if (is_long_mode(vcpu)) {
+ if (cr3 & CR3_L_MODE_RESERVED_BITS) {
+#ifdef XXX
+ kvm_inject_gp(vcpu, 0);
+#endif /*XXX*/
+ return;
+ }
+ } else {
+#ifdef XXX
+ if (is_pae(vcpu)) {
+ if (cr3 & CR3_PAE_RESERVED_BITS) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
+ if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
+ }
+#endif /*XXX*/
+ /*
+ * We don't check reserved bits in nonpae mode, because
+ * this isn't enforced, and VMware depends on this.
+ */
+ }
+
+ /*
+ * Does the new cr3 value map to physical memory? (Note, we
+ * catch an invalid cr3 even in real-mode, because it would
+ * cause trouble later on when we turn on paging anyway.)
+ *
+ * A real CPU would silently accept an invalid cr3 and would
+ * attempt to use it - with largely undefined (and often hard
+ * to debug) behavior on the guest side.
+ */
+#ifdef XXX
+ if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGESHIFT)))
+ kvm_inject_gp(vcpu, 0);
+ else {
+#endif /*XXX*/
+ vcpu->arch.cr3 = cr3;
+#ifdef XXX
+ vcpu->arch.mmu.new_cr3(vcpu);
+ }
+#endif /*XXX*/
+}
+
+void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ unsigned long old_cr4 = kvm_read_cr4(vcpu);
+ unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
+
+ if (cr4 & CR4_RESERVED_BITS) {
+#ifdef XXX
+ kvm_inject_gp(vcpu, 0);
+#endif /*XXX*/
+ return;
+ }
+
+ if (is_long_mode(vcpu)) {
+ if (!(cr4 & X86_CR4_PAE)) {
+#ifdef XXX
+ kvm_inject_gp(vcpu, 0);
+#endif /*XXX*/
+ return;
+ }
+#ifdef XXX
+ } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
+ && ((cr4 ^ old_cr4) & pdptr_bits)
+ && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+#endif /*XXX*/
+ }
+
+ if (cr4 & X86_CR4_VMXE) {
+#ifdef XXX
+ kvm_inject_gp(vcpu, 0);
+#endif /*XXX*/
+ return;
+ }
+ kvm_x86_ops->set_cr4(vcpu, cr4);
+ vcpu->arch.cr4 = cr4;
+ vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
+ kvm_mmu_reset_context(vcpu);
+}
+
+static int handle_cr(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification, val;
+ int cr;
+ int reg;
+
+ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ cr = exit_qualification & 15;
+ reg = (exit_qualification >> 8) & 15;
+ switch ((exit_qualification >> 4) & 3) {
+ case 0: /* mov to cr */
+ val = kvm_register_read(vcpu, reg);
+#ifdef XXX
+ trace_kvm_cr_write(cr, val);
+#endif /*XXX*/
+ switch (cr) {
+ case 0:
+ kvm_set_cr0(vcpu, val);
+#ifdef XXX
+ skip_emulated_instruction(vcpu);
+#endif /*XXX*/
+ return 1;
+ case 3:
+ kvm_set_cr3(vcpu, val);
+#ifdef XXX
+ skip_emulated_instruction(vcpu);
+#endif /*XXX*/
+ return 1;
+ case 4:
+ kvm_set_cr4(vcpu, val);
+#ifdef XXX
+ skip_emulated_instruction(vcpu);
+#endif /*XXX*/
+ return 1;
+ case 8: {
+ uint8_t cr8_prev = kvm_get_cr8(vcpu);
+ uint8_t cr8 = kvm_register_read(vcpu, reg);
+ kvm_set_cr8(vcpu, cr8);
+#ifdef XXX
+ skip_emulated_instruction(vcpu);
+#endif /*XXX*/
+ if (irqchip_in_kernel(vcpu->kvm))
+ return 1;
+ if (cr8_prev <= cr8)
+ return 1;
+ vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
+ return 0;
+ }
+ };
+ break;
+ case 2: /* clts */
+ vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
+#ifdef XXX
+ trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
+ skip_emulated_instruction(vcpu);
+ vmx_fpu_activate(vcpu);
+#endif /*XXX*/
+ return 1;
+ case 1: /*mov from cr*/
+ switch (cr) {
+ case 3:
+ kvm_register_write(vcpu, reg, vcpu->arch.cr3);
+#ifdef XXX
+ trace_kvm_cr_read(cr, vcpu->arch.cr3);
+ skip_emulated_instruction(vcpu);
+#endif /*XXX*/
+ return 1;
+ case 8:
+ val = kvm_get_cr8(vcpu);
+ kvm_register_write(vcpu, reg, val);
+#ifdef XXX
+ trace_kvm_cr_read(cr, val);
+ skip_emulated_instruction(vcpu);
+#endif /*XXX*/
+ return 1;
+ }
+ break;
+ case 3: /* lmsw */
+ val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
+#ifdef XXX
+ trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
+ kvm_lmsw(vcpu, val);
+
+ skip_emulated_instruction(vcpu);
+#endif /*XXX*/
+ return 1;
+ default:
+ break;
+ }
+ vcpu->run->exit_reason = 0;
+ cmn_err(CE_WARN, "unhandled control register: op %d cr %d\n",
+ (int)(exit_qualification >> 4) & 3, cr);
+ return 0;
+}
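/*
 * Illustrative user-space sketch (not part of the module): decodes a
 * CR-access exit qualification the same way handle_cr() does above
 * (bits 3:0 = CR number, bits 5:4 = access type, bits 11:8 = GP register
 * index).  The sample value is made up for the demo.
 */
#include <stdio.h>

int
main(void)
{
	unsigned long exit_qualification = 0x0103;	/* a made-up mov-to-CR3 */
	int cr = exit_qualification & 15;
	int type = (exit_qualification >> 4) & 3;
	int reg = (exit_qualification >> 8) & 15;

	printf("cr%d, access type %d (0=mov-to, 1=mov-from, 2=clts, 3=lmsw), "
	    "reg index %d\n", cr, type, reg);
	return (0);
}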
+
+static int handle_dr(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification;
+ unsigned long val;
+ int dr, reg;
+
+#ifdef XXX
+ /* Do not handle if the CPL > 0, will trigger GP on re-entry */
+ if (!kvm_require_cpl(vcpu, 0))
+ return 1;
+ dr = vmcs_readl(GUEST_DR7);
+
+ if (dr & DR7_GD) {
+ /*
+ * As the vm-exit takes precedence over the debug trap, we
+ * need to emulate the latter, either for the host or the
+ * guest debugging itself.
+ */
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
+ vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
+ vcpu->run->debug.arch.dr7 = dr;
+ vcpu->run->debug.arch.pc =
+ vmcs_readl(GUEST_CS_BASE) +
+ vmcs_readl(GUEST_RIP);
+ vcpu->run->debug.arch.exception = DB_VECTOR;
+ vcpu->run->exit_reason = KVM_EXIT_DEBUG;
+ return 0;
+ } else {
+ vcpu->arch.dr7 &= ~DR7_GD;
+ vcpu->arch.dr6 |= DR6_BD;
+ vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
+ kvm_queue_exception(vcpu, DB_VECTOR);
+ return 1;
+ }
+ }
+#endif /*XXX*/
+ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
+ reg = DEBUG_REG_ACCESS_REG(exit_qualification);
+ if (exit_qualification & TYPE_MOV_FROM_DR) {
+ switch (dr) {
+ case 0 ... 3:
+ val = vcpu->arch.db[dr];
+ break;
+ case 4:
+#ifdef XXX
+ if (check_dr_alias(vcpu) < 0)
+#endif /*XXX*/
+ return 1;
+ /* fall through */
+ case 6:
+ val = vcpu->arch.dr6;
+ break;
+ case 5:
+#ifdef XXX
+ if (check_dr_alias(vcpu) < 0)
+#endif /*XXX*/
+ return 1;
+ /* fall through */
+ default: /* 7 */
+ val = vcpu->arch.dr7;
+ break;
+ }
+ kvm_register_write(vcpu, reg, val);
+ } else {
+ val = vcpu->arch.regs[reg];
+ switch (dr) {
+ case 0 ... 3:
+ vcpu->arch.db[dr] = val;
+#ifdef XXX
+ if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
+#endif
+ vcpu->arch.eff_db[dr] = val;
+ break;
+ case 4:
+#ifdef XXX
+ if (check_dr_alias(vcpu) < 0)
+#endif /*XXX*/
+ return 1;
+ /* fall through */
+ case 6:
+ if (val & 0xffffffff00000000ULL) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+ vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
+ break;
+ case 5:
+#ifdef XXX
+ if (check_dr_alias(vcpu) < 0)
+#endif /*XXX*/
+ return 1;
+ /* fall through */
+ default: /* 7 */
+ if (val & 0xffffffff00000000ULL) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+ vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
+#ifdef XXX
+ if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
+#endif /*XXX*/
+ vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
+ vcpu->arch.switch_db_regs =
+ (val & DR7_BP_EN_MASK);
+#ifdef XXX
+ }
+#endif /*XXX*/
+ break;
+ }
+ }
+#ifdef XXX
+ skip_emulated_instruction(vcpu);
+#endif /*XXX*/
+ return 1;
+}
+
+static int handle_cpuid(struct kvm_vcpu *vcpu)
+{
+#ifdef XXX
+ kvm_emulate_cpuid(vcpu);
+#endif /*XXX*/
+ return 1;
+}
+
+static int handle_rdmsr(struct kvm_vcpu *vcpu)
+{
+ uint32_t ecx = vcpu->arch.regs[VCPU_REGS_RCX];
+ uint64_t data;
+
+ if (vmx_get_msr(vcpu, ecx, &data)) {
+#ifdef XXX
+ trace_kvm_msr_read_ex(ecx);
+#endif /*XXX*/
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+#ifdef XXX
+ trace_kvm_msr_read(ecx, data);
+#endif /*XXX*/
+
+ /* FIXME: handling of bits 32:63 of rax, rdx */
+ vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
+ vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
+#ifdef XXX
+ skip_emulated_instruction(vcpu);
+#endif /*XXX*/
+ return 1;
+}
+
+static int handle_wrmsr(struct kvm_vcpu *vcpu)
+{
+ uint32_t ecx = vcpu->arch.regs[VCPU_REGS_RCX];
+ uint64_t data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
+ | ((uint64_t)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
+
+ if (vmx_set_msr(vcpu, ecx, data) != 0) {
+#ifdef XXX
+ trace_kvm_msr_write_ex(ecx, data);
+#endif /*XXX*/
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+#ifdef XXX
+ trace_kvm_msr_write(ecx, data);
+ skip_emulated_instruction(vcpu);
+#endif /*XXX*/
+ return 1;
+}
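/*
 * Illustrative user-space sketch (not part of the module): shows the EDX:EAX
 * packing used by handle_rdmsr() and handle_wrmsr() above, where a 64-bit
 * MSR value travels split across the low 32 bits of RAX and RDX.  The MSR
 * contents are made up for the demo.
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t data = 0x123456789abcdef0ULL;	/* pretend MSR contents */

	/* rdmsr direction: split into the register pair */
	uint64_t rax = data & -1u;
	uint64_t rdx = (data >> 32) & -1u;

	/* wrmsr direction: reassemble from the register pair */
	uint64_t back = (rax & -1u) | ((uint64_t)(rdx & -1u) << 32);

	printf("eax=0x%08llx edx=0x%08llx reassembled=0x%016llx\n",
	    (unsigned long long)rax, (unsigned long long)rdx,
	    (unsigned long long)back);
	return (0);
}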
+
+static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
+{
+ return 1;
+}
+
+static int kvm_hv_hypercall_enabled(struct kvm *kvm)
+{
+ return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE;
+}
+
+int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
+{
+ uint64_t param, ingpa, outgpa, ret;
+ uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0;
+ int fast, longmode;
+ int cs_db, cs_l;
+
+	/*
+	 * A hypercall generates #UD from non-zero CPL and from real
+	 * mode, per the Hyper-V spec.
+	 */
+ if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 0;
+ }
+
+ kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+ longmode = is_long_mode(vcpu) && cs_l == 1;
+
+ if (!longmode) {
+ param = ((uint64_t)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) |
+ (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff);
+ ingpa = ((uint64_t)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) |
+ (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff);
+ outgpa = ((uint64_t)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) |
+ (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff);
+ }
+#ifdef CONFIG_X86_64
+ else {
+ param = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX);
+ outgpa = kvm_register_read(vcpu, VCPU_REGS_R8);
+ }
+#endif
+
+ code = param & 0xffff;
+ fast = (param >> 16) & 0x1;
+ rep_cnt = (param >> 32) & 0xfff;
+ rep_idx = (param >> 48) & 0xfff;
+
+#ifdef XXX
+ trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa);
+#endif /*XXX*/
+
+ switch (code) {
+ case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT:
+#ifdef XXX
+ kvm_vcpu_on_spin(vcpu);
+#endif /*XXX*/
+ break;
+ default:
+ res = HV_STATUS_INVALID_HYPERCALL_CODE;
+ break;
+ }
+
+ ret = res | (((uint64_t)rep_done & 0xfff) << 32);
+ if (longmode) {
+ kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
+ } else {
+ kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32);
+ kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff);
+ }
+
+ return 1;
+}
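/*
 * Illustrative user-space sketch (not part of the module): unpacks the
 * Hyper-V hypercall input value the same way kvm_hv_hypercall() does above
 * (call code in bits 15:0, fast flag in bit 16, rep count and rep start
 * index in the high half).  The sample value is made up for the demo.
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t param = 0x0004000800010008ULL;	/* made-up hypercall input */

	unsigned code = param & 0xffff;
	unsigned fast = (param >> 16) & 0x1;
	unsigned rep_cnt = (param >> 32) & 0xfff;
	unsigned rep_idx = (param >> 48) & 0xfff;

	printf("code=0x%x fast=%u rep_cnt=%u rep_idx=%u\n",
	    code, fast, rep_cnt, rep_idx);
	return (0);
}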
+
+
+/* Return values for hypercalls */
+#define KVM_ENOSYS 1000
+#define KVM_EFAULT EFAULT
+#define KVM_E2BIG E2BIG
+#define KVM_EPERM EPERM
+
+#define KVM_HC_VAPIC_POLL_IRQ 1
+#define KVM_HC_MMU_OP 2
+
+/*
+ * hypercalls use architecture-specific calling conventions
+ */
+
+#ifdef _KERNEL
+#ifdef CONFIG_KVM_GUEST
+void __init kvm_guest_init(void);
+#else
+#define kvm_guest_init() do { } while (0)
+#endif
+
+static inline int kvm_para_has_feature(unsigned int feature)
+{
+ if (kvm_arch_para_features() & (1UL << feature))
+ return 1;
+ return 0;
+}
+#endif /* _KERNEL */
+
+int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
+{
+ unsigned long nr, a0, a1, a2, a3, ret;
+ int r = 1;
+
+ if (kvm_hv_hypercall_enabled(vcpu->kvm))
+ return kvm_hv_hypercall(vcpu);
+
+ nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
+ a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
+ a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
+
+#ifdef XXX
+ trace_kvm_hypercall(nr, a0, a1, a2, a3);
+#endif /*XXX*/
+
+ if (!is_long_mode(vcpu)) {
+ nr &= 0xFFFFFFFF;
+ a0 &= 0xFFFFFFFF;
+ a1 &= 0xFFFFFFFF;
+ a2 &= 0xFFFFFFFF;
+ a3 &= 0xFFFFFFFF;
+ }
+
+ if (kvm_x86_ops->get_cpl(vcpu) != 0) {
+ ret = -EPERM;
+ goto out;
+ }
+
+ switch (nr) {
+ case KVM_HC_VAPIC_POLL_IRQ:
+ ret = 0;
+ break;
+ case KVM_HC_MMU_OP:
+#ifdef XXX
+ r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret);
+#endif /*XXX*/
+ break;
+ default:
+ ret = -ENOSYS;
+ break;
+ }
+out:
+ kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
+#ifdef XXX
+ ++vcpu->stat.hypercalls;
+#endif /*XXX*/
+ return r;
+}
+
+static int handle_halt(struct kvm_vcpu *vcpu)
+{
+#ifdef XXX
+ skip_emulated_instruction(vcpu);
+ return kvm_emulate_halt(vcpu);
+#else
+ return 0;
+#endif /*XXX*/
+}
+
+static int handle_vmcall(struct kvm_vcpu *vcpu)
+{
+#ifdef XXX
+ skip_emulated_instruction(vcpu);
+#endif /*XXX*/
+ kvm_emulate_hypercall(vcpu);
+ return 1;
+}
+
+static int handle_vmx_insn(struct kvm_vcpu *vcpu)
+{
+#ifdef XXX
+ kvm_queue_exception(vcpu, UD_VECTOR);
+#endif /*XXX*/
+ return 1;
+}
+
+static int handle_invlpg(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+#ifdef XXX
+ kvm_mmu_invlpg(vcpu, exit_qualification);
+ skip_emulated_instruction(vcpu);
+#endif /*XXX*/
+ return 1;
+}
+
+static int handle_wbinvd(struct kvm_vcpu *vcpu)
+{
+#ifdef XXX
+ skip_emulated_instruction(vcpu);
+#endif /*XXX*/
+ /* TODO: Add support for VT-d/pass-through device */
+ return 1;
+}
+
+static int handle_apic_access(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification;
+ enum emulation_result er;
+ unsigned long offset;
+
+ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ offset = exit_qualification & 0xffful;
+
+ er = emulate_instruction(vcpu, 0, 0, 0);
+
+ if (er != EMULATE_DONE) {
+ cmn_err(CE_PANIC,
+		    "Failed to handle apic access vmexit! Offset is 0x%lx\n",
+ offset);
+ }
+ return 1;
+}
+
+static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
+{
+ return (seg != VCPU_SREG_LDTR) &&
+ (seg != VCPU_SREG_TR) &&
+ (kvm_get_rflags(vcpu) & X86_EFLAGS_VM);
+}
+
+static inline unsigned long get_desc_limit(const struct desc_struct *desc)
+{
+ return desc->c.b.limit0 | (desc->c.b.limit << 16);
+}
+
+static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, uint16_t selector,
+ struct kvm_segment *kvm_desct)
+{
+ kvm_desct->base = get_desc_base(seg_desc);
+ kvm_desct->limit = get_desc_limit(seg_desc);
+ if (seg_desc->c.b.g) {
+ kvm_desct->limit <<= 12;
+ kvm_desct->limit |= 0xfff;
+ }
+ kvm_desct->selector = selector;
+ kvm_desct->type = seg_desc->c.b.type;
+ kvm_desct->present = seg_desc->c.b.p;
+ kvm_desct->dpl = seg_desc->c.b.dpl;
+ kvm_desct->db = seg_desc->c.b.d;
+ kvm_desct->s = seg_desc->c.b.s;
+ kvm_desct->l = seg_desc->c.b.l;
+ kvm_desct->g = seg_desc->c.b.g;
+ kvm_desct->avl = seg_desc->c.b.avl;
+ if (!selector)
+ kvm_desct->unusable = 1;
+ else
+ kvm_desct->unusable = 0;
+ kvm_desct->padding = 0;
+}
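/*
 * Illustrative user-space sketch (not part of the module): shows the limit
 * scaling that seg_desct_to_kvm_desct() performs above when the descriptor's
 * granularity (G) bit is set: the 20-bit limit field counts 4 KiB pages, so
 * it is shifted up and the low 12 bits are filled with ones.
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint32_t raw_limit = 0xfffff;	/* 20-bit limit field, G = 1 */
	uint32_t limit = raw_limit;

	limit <<= 12;
	limit |= 0xfff;
	printf("raw 0x%05x -> byte limit 0x%08x (4 GiB - 1)\n",
	    raw_limit, limit);
	return (0);
}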
+
+static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, uint16_t selector, int seg)
+{
+ struct kvm_segment segvar = {
+ .base = selector << 4,
+ .limit = 0xffff,
+ .selector = selector,
+ .type = 3,
+ .present = 1,
+ .dpl = 3,
+ .db = 0,
+ .s = 1,
+ .l = 0,
+ .g = 0,
+ .avl = 0,
+ .unusable = 0,
+ };
+ kvm_x86_ops->set_segment(vcpu, &segvar, seg);
+ return 0;
+}
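/*
 * Illustrative user-space sketch (not part of the module): in real mode a
 * segment's base is simply selector << 4, which is how
 * kvm_load_realmode_segment() above fills in .base.  The sample selector
 * and offset are made up for the demo.
 */
#include <stdio.h>

int
main(void)
{
	unsigned short selector = 0xb800;	/* classic text-mode segment */
	unsigned short offset = 0x0010;
	unsigned long base = (unsigned long)selector << 4;

	printf("selector 0x%04x -> base 0x%05lx, linear 0x%05lx\n",
	    selector, base, base + offset);
	return (0);
}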
+
+static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
+ uint16_t selector,
+ struct descriptor_table *dtable)
+{
+ if (selector & 1 << 2) {
+ struct kvm_segment kvm_seg;
+
+ kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
+
+ if (kvm_seg.unusable)
+ dtable->limit = 0;
+ else
+ dtable->limit = kvm_seg.limit;
+ dtable->base = kvm_seg.base;
+ }
+ else
+ kvm_x86_ops->get_gdt(vcpu, dtable);
+}
+
+/* allowed just for 8-byte segment descriptors */
+static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, uint16_t selector,
+ struct desc_struct *seg_desc)
+{
+ struct descriptor_table dtable;
+ uint16_t index = selector >> 3;
+ int ret;
+ uint32_t err;
+ gva_t addr;
+
+ get_segment_descriptor_dtable(vcpu, selector, &dtable);
+
+ if (dtable.limit < index * 8 + 7) {
+ kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
+ return 1;
+ }
+ addr = dtable.base + index * 8;
+ ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof(*seg_desc),
+ vcpu, &err);
+ if (ret == 1)
+ kvm_inject_page_fault(vcpu, addr, err);
+
+ return ret;
+}
+
+static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
+ struct kvm_vcpu *vcpu, uint32_t *error)
+{
+ void *data = val;
+ int r = 0;
+
+#ifdef XXX
+ while (bytes) {
+ gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error);
+ unsigned offset = addr & (PAGESIZE-1);
+ unsigned towrite = min(bytes, (unsigned)PAGESIZE - offset);
+ int ret;
+
+ if (gpa == UNMAPPED_GVA) {
+ r = X86EMUL_PROPAGATE_FAULT;
+ goto out;
+ }
+ ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
+ if (ret < 0) {
+ r = X86EMUL_UNHANDLEABLE;
+ goto out;
+ }
+
+ bytes -= towrite;
+ data += towrite;
+ addr += towrite;
+ }
+out:
+#endif /*XXX*/
+ return r;
+}
+
+/* allowed just for 8-byte segment descriptors */
+static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, uint16_t selector,
+ struct desc_struct *seg_desc)
+{
+ struct descriptor_table dtable;
+ uint16_t index = selector >> 3;
+
+ get_segment_descriptor_dtable(vcpu, selector, &dtable);
+
+ if (dtable.limit < index * 8 + 7)
+ return 1;
+ return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu, NULL);
+}
+
+int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, uint16_t selector, int seg)
+{
+ struct kvm_segment kvm_seg;
+ struct desc_struct seg_desc;
+ uint8_t dpl, rpl, cpl;
+ unsigned err_vec = GP_VECTOR;
+ uint32_t err_code = 0;
+ int null_selector = !(selector & ~0x3); /* 0000-0003 are null */
+ int ret;
+
+ if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu))
+ return kvm_load_realmode_segment(vcpu, selector, seg);
+
+ /* NULL selector is not valid for TR, CS and SS */
+ if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR)
+ && null_selector)
+ goto exception;
+
+ /* TR should be in GDT only */
+ if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
+ goto exception;
+
+ ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc);
+ if (ret)
+ return ret;
+
+ seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg);
+
+ if (null_selector) { /* for NULL selector skip all following checks */
+ kvm_seg.unusable = 1;
+ goto load;
+ }
+
+ err_code = selector & 0xfffc;
+ err_vec = GP_VECTOR;
+
+	/* can't load a system descriptor into a segment selector */
+ if (seg <= VCPU_SREG_GS && !kvm_seg.s)
+ goto exception;
+
+ if (!kvm_seg.present) {
+ err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
+ goto exception;
+ }
+
+ rpl = selector & 3;
+ dpl = kvm_seg.dpl;
+ cpl = kvm_x86_ops->get_cpl(vcpu);
+
+ switch (seg) {
+ case VCPU_SREG_SS:
+		/*
+		 * segment is not a writable data segment, or the segment
+		 * selector's RPL != CPL, or the descriptor's DPL != CPL
+		 */
+ if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl)
+ goto exception;
+ break;
+ case VCPU_SREG_CS:
+ if (!(kvm_seg.type & 8))
+ goto exception;
+
+ if (kvm_seg.type & 4) {
+ /* conforming */
+ if (dpl > cpl)
+ goto exception;
+ } else {
+ /* nonconforming */
+ if (rpl > cpl || dpl != cpl)
+ goto exception;
+ }
+ /* CS(RPL) <- CPL */
+ selector = (selector & 0xfffc) | cpl;
+ break;
+ case VCPU_SREG_TR:
+ if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9))
+ goto exception;
+ break;
+ case VCPU_SREG_LDTR:
+ if (kvm_seg.s || kvm_seg.type != 2)
+ goto exception;
+ break;
+ default: /* DS, ES, FS, or GS */
+ /*
+ * segment is not a data or readable code segment or
+ * ((segment is a data or nonconforming code segment)
+ * and (both RPL and CPL > DPL))
+ */
+ if ((kvm_seg.type & 0xa) == 0x8 ||
+ (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl)))
+ goto exception;
+ break;
+ }
+
+ if (!kvm_seg.unusable && kvm_seg.s) {
+ /* mark segment as accessed */
+ kvm_seg.type |= 1;
+ seg_desc.c.b.type |= 1;
+ save_guest_segment_descriptor(vcpu, selector, &seg_desc);
+ }
+load:
+ kvm_set_segment(vcpu, &kvm_seg, seg);
+ return 0;
+exception:
+#ifdef XXX
+ kvm_queue_exception_e(vcpu, err_vec, err_code);
+#endif /*XXX*/
+ return 1;
+}
+
+static void save_state_to_tss32(struct kvm_vcpu *vcpu,
+ struct tss_segment_32 *tss)
+{
+ tss->cr3 = vcpu->arch.cr3;
+ tss->eip = kvm_rip_read(vcpu);
+ tss->eflags = kvm_get_rflags(vcpu);
+ tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
+ tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
+ tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
+ tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
+ tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
+ tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
+ tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
+ tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
+ tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
+ tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
+ tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
+ tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
+ tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
+}
+
+static void kvm_load_segment_selector(struct kvm_vcpu *vcpu, uint16_t sel, int seg)
+{
+ struct kvm_segment kvm_seg;
+ kvm_get_segment(vcpu, &kvm_seg, seg);
+ kvm_seg.selector = sel;
+ kvm_set_segment(vcpu, &kvm_seg, seg);
+}
+
+static int load_state_from_tss32(struct kvm_vcpu *vcpu,
+ struct tss_segment_32 *tss)
+{
+ kvm_set_cr3(vcpu, tss->cr3);
+
+ kvm_rip_write(vcpu, tss->eip);
+ kvm_set_rflags(vcpu, tss->eflags | 2);
+
+ kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
+ kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
+ kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
+ kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
+ kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
+ kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
+ kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
+
+ /*
+ * SDM says that segment selectors are loaded before segment
+ * descriptors
+ */
+ kvm_load_segment_selector(vcpu, tss->ldt_selector, VCPU_SREG_LDTR);
+ kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
+ kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
+ kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
+ kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
+ kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS);
+ kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS);
+
+	/*
+	 * Now load segment descriptors. If a fault happens at this stage,
+	 * it is handled in the context of the new task.
+	 */
+ if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, VCPU_SREG_LDTR))
+ return 1;
+
+ if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
+ return 1;
+
+ if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
+ return 1;
+
+ if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
+ return 1;
+
+ if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
+ return 1;
+
+ if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS))
+ return 1;
+
+ if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS))
+ return 1;
+ return 0;
+}
+
+static void save_state_to_tss16(struct kvm_vcpu *vcpu,
+ struct tss_segment_16 *tss)
+{
+ tss->ip = kvm_rip_read(vcpu);
+ tss->flag = kvm_get_rflags(vcpu);
+ tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
+ tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
+ tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
+ tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
+ tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
+ tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
+
+ tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
+ tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
+ tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
+ tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
+ tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
+}
+
+static int load_state_from_tss16(struct kvm_vcpu *vcpu,
+ struct tss_segment_16 *tss)
+{
+ kvm_rip_write(vcpu, tss->ip);
+ kvm_set_rflags(vcpu, tss->flag | 2);
+ kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
+ kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
+ kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
+ kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
+ kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
+ kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
+ kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
+
+ /*
+ * SDM says that segment selectors are loaded before segment
+ * descriptors
+ */
+ kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR);
+ kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
+ kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
+ kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
+ kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
+
+	/*
+	 * Now load segment descriptors. If a fault happens at this stage,
+	 * it is handled in the context of the new task.
+	 */
+ if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR))
+ return 1;
+
+ if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
+ return 1;
+
+ if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
+ return 1;
+
+ if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
+ return 1;
+
+ if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
+ return 1;
+ return 0;
+}
+
+int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
+{
+ gfn_t gfn = gpa >> PAGESHIFT;
+ int seg;
+ int offset = offset_in_page(gpa);
+ int ret;
+
+ while ((seg = next_segment(len, offset)) != 0) {
+ ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
+ if (ret < 0)
+ return ret;
+ offset = 0;
+ len -= seg;
+ data += seg;
+ ++gfn;
+ }
+ return 0;
+}
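/*
 * Illustrative user-space sketch (not part of the module): mimics the
 * page-by-page chunking that kvm_read_guest() performs above.  next_segment()
 * is not shown in this hunk, so the "bytes left in this page" computation
 * here is an assumption about its behavior, and the addresses are made up.
 */
#include <stdio.h>

#define PAGE_SZ 4096UL

int
main(void)
{
	unsigned long gpa = 0x1f00;	/* made-up guest-physical address */
	unsigned long len = 0x300;	/* read spans two pages */
	unsigned long gfn = gpa / PAGE_SZ;
	unsigned long offset = gpa & (PAGE_SZ - 1);

	while (len != 0) {
		unsigned long seg = PAGE_SZ - offset;	/* room left in page */
		if (seg > len)
			seg = len;
		printf("read %#lx bytes from gfn %#lx, offset %#lx\n",
		    seg, gfn, offset);
		offset = 0;
		len -= seg;
		++gfn;
	}
	return (0);
}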
+
+static int kvm_task_switch_16(struct kvm_vcpu *vcpu, uint16_t tss_selector,
+ uint16_t old_tss_sel, uint32_t old_tss_base,
+ struct desc_struct *nseg_desc)
+{
+ struct tss_segment_16 tss_segment_16;
+ int ret = 0;
+
+ if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
+ sizeof tss_segment_16))
+ goto out;
+
+ save_state_to_tss16(vcpu, &tss_segment_16);
+
+ if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
+ sizeof tss_segment_16))
+ goto out;
+
+#ifdef XXX
+ if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
+ &tss_segment_16, sizeof tss_segment_16))
+ goto out;
+#endif /*XXX*/
+
+ if (old_tss_sel != 0xffff) {
+ tss_segment_16.prev_task_link = old_tss_sel;
+#ifdef XXX
+ if (kvm_write_guest(vcpu->kvm,
+ get_tss_base_addr_write(vcpu, nseg_desc),
+ &tss_segment_16.prev_task_link,
+ sizeof tss_segment_16.prev_task_link))
+ goto out;
+#endif /*XXX*/
+ }
+
+ if (load_state_from_tss16(vcpu, &tss_segment_16))
+ goto out;
+
+ ret = 1;
+out:
+ return ret;
+}
+
+static int kvm_task_switch_32(struct kvm_vcpu *vcpu, uint16_t tss_selector,
+ uint16_t old_tss_sel, uint32_t old_tss_base,
+ struct desc_struct *nseg_desc)
+{
+ struct tss_segment_32 tss_segment_32;
+ int ret = 0;
+
+ if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
+ sizeof tss_segment_32))
+ goto out;
+
+ save_state_to_tss32(vcpu, &tss_segment_32);
+
+ if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
+ sizeof tss_segment_32))
+ goto out;
+
+#ifdef XXX
+ if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
+ &tss_segment_32, sizeof tss_segment_32))
+ goto out;
+#endif /*XXX*/
+
+ if (old_tss_sel != 0xffff) {
+ tss_segment_32.prev_task_link = old_tss_sel;
+
+#ifdef XXX
+ if (kvm_write_guest(vcpu->kvm,
+ get_tss_base_addr_write(vcpu, nseg_desc),
+ &tss_segment_32.prev_task_link,
+ sizeof tss_segment_32.prev_task_link))
+ goto out;
+#endif /*XXX*/
+ }
+
+ if (load_state_from_tss32(vcpu, &tss_segment_32))
+ goto out;
+
+ ret = 1;
+out:
+ return ret;
+}
+
+static uint64_t vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
+{
+ struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+
+ return vmcs_readl(sf->base);
+}
+
+static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
+{
+ return kvm_x86_ops->get_segment_base(vcpu, seg);
+}
+
+int kvm_task_switch(struct kvm_vcpu *vcpu, uint16_t tss_selector, int reason)
+{
+ struct kvm_segment tr_seg;
+ struct desc_struct cseg_desc;
+ struct desc_struct nseg_desc;
+ int ret = 0;
+ uint32_t old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
+ uint16_t old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
+ uint32_t desc_limit;
+
+#ifdef XXX
+ old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL);
+#endif /*XXX*/
+
+	/*
+	 * FIXME: Handle errors. Failure to read either TSS or their
+	 * descriptors should generate a page fault.
+	 */
+ if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
+ goto out;
+
+ if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
+ goto out;
+
+ if (reason != TASK_SWITCH_IRET) {
+ int cpl;
+
+ cpl = kvm_x86_ops->get_cpl(vcpu);
+ if ((tss_selector & 3) > nseg_desc.c.b.dpl || cpl > nseg_desc.c.b.dpl) {
+#ifdef XXX
+ kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+#endif /*XXX*/
+ return 1;
+ }
+ }
+
+ desc_limit = get_desc_limit(&nseg_desc);
+ if (!nseg_desc.c.b.p ||
+ ((desc_limit < 0x67 && (nseg_desc.c.b.type & 8)) ||
+ desc_limit < 0x2b)) {
+#ifdef XXX
+ kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
+#endif /*XXX*/
+ return 1;
+ }
+
+ if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
+		cseg_desc.c.b.type &= ~(1 << 1);	/* clear the busy (B) flag */
+ save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
+ }
+
+ if (reason == TASK_SWITCH_IRET) {
+ uint32_t eflags = kvm_get_rflags(vcpu);
+ kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
+ }
+
+	/*
+	 * Set the back link to the previous task only if the NT bit is set
+	 * in eflags; note that old_tss_sel is not used after this point.
+	 */
+ if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
+ old_tss_sel = 0xffff;
+
+ if (nseg_desc.c.b.type & 8)
+ ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
+ old_tss_base, &nseg_desc);
+ else
+ ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
+ old_tss_base, &nseg_desc);
+
+ if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
+ uint32_t eflags = kvm_get_rflags(vcpu);
+ kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT);
+ }
+
+ if (reason != TASK_SWITCH_IRET) {
+ nseg_desc.c.b.type |= (1 << 1);
+ save_guest_segment_descriptor(vcpu, tss_selector,
+ &nseg_desc);
+ }
+
+ kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0(vcpu) | X86_CR0_TS);
+ seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
+ tr_seg.type = 11;
+ kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
+out:
+ return ret;
+}
+
+static int handle_task_switch(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long exit_qualification;
+ uint16_t tss_selector;
+ int reason, type, idt_v;
+
+ idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
+ type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
+
+ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+ reason = (uint32_t)exit_qualification >> 30;
+ if (reason == TASK_SWITCH_GATE && idt_v) {
+ switch (type) {
+ case INTR_TYPE_NMI_INTR:
+ vcpu->arch.nmi_injected = 0;
+#ifdef XXX
+ if (cpu_has_virtual_nmis())
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+#endif
+ break;
+ case INTR_TYPE_EXT_INTR:
+ case INTR_TYPE_SOFT_INTR:
+#ifdef XXX
+ kvm_clear_interrupt_queue(vcpu);
+#endif /*XXX*/
+ break;
+ case INTR_TYPE_HARD_EXCEPTION:
+ case INTR_TYPE_SOFT_EXCEPTION:
+#ifdef XXX
+ kvm_clear_exception_queue(vcpu);
+#endif /*XXX*/
+ break;
+ default:
+ break;
+ }
+ }
+ tss_selector = exit_qualification;
+#ifdef XXX
+ if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
+ type != INTR_TYPE_EXT_INTR &&
+ type != INTR_TYPE_NMI_INTR))
+ skip_emulated_instruction(vcpu);
+#endif /*XXX*/
+
+ if (!kvm_task_switch(vcpu, tss_selector, reason))
+ return 0;
+
+ /* clear all local breakpoint enable flags */
+ vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55);
+
+ /*
+ * TODO: What about debug traps on tss switch?
+ * Are we supposed to inject them and update dr6?
+ */
+
+ return 1;
+}
+
+static int handle_ept_violation(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification;
+ gpa_t gpa;
+ int gla_validity;
+
+ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+ if (exit_qualification & (1 << 6)) {
+ cmn_err(CE_PANIC, "EPT: GPA exceeds GAW!\n");
+ }
+
+ gla_validity = (exit_qualification >> 7) & 0x3;
+ if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) {
+ cmn_err(CE_WARN, "EPT: Handling EPT violation failed!\n");
+ cmn_err(CE_CONT, "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
+ (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
+ vmcs_readl(GUEST_LINEAR_ADDRESS));
+ cmn_err(CE_PANIC, "EPT: Exit qualification is 0x%lx\n",
+ (long unsigned int)exit_qualification);
+ vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
+ vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION;
+ return 0;
+ }
+
+ gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+#ifdef XXX
+ trace_kvm_page_fault(gpa, exit_qualification);
+ return kvm_mmu_page_fault(vcpu, gpa & PAGEMASK, 0);
+#else
+ return 0;
+#endif
+}
+
+static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
+{
+ uint64_t sptes[4];
+ int nr_sptes, i;
+ gpa_t gpa;
+
+ gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+
+ cmn_err(CE_WARN, "EPT: Misconfiguration.\n");
+ cmn_err(CE_CONT, "EPT: GPA: 0x%llx\n", gpa);
+#ifdef XXX
+ nr_sptes = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes);
+
+ for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i)
+ ept_misconfig_inspect_spte(vcpu, sptes[i-1], i);
+#endif /*XXX*/
+
+ vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
+ vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG;
+
+ return 0;
+}
+
+/*
+ * Indicates a vcpu busy-waiting on a spinlock. We do not enable plain PAUSE
+ * exiting, so we only get here on a CPU with PAUSE-loop exiting.
+ */
+static int handle_pause(struct kvm_vcpu *vcpu)
+{
+#ifdef XXX
+ skip_emulated_instruction(vcpu);
+ kvm_vcpu_on_spin(vcpu);
+#endif /*XXX*/
+
+ return 1;
+}
+
+static int handle_invalid_op(struct kvm_vcpu *vcpu)
+{
+#ifdef XXX
+ kvm_queue_exception(vcpu, UD_VECTOR);
+#endif /*XXX*/
+ return 1;
+}
+
+static int handle_interrupt_window(struct kvm_vcpu *vcpu)
+{
+ uint32_t cpu_based_vm_exec_control;
+
+ /* clear pending irq */
+ cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+ cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
+ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+
+#ifdef XXX
+ ++vcpu->stat.irq_window_exits;
+
+ /*
+	 * If userspace is waiting to inject interrupts, exit as soon as
+	 * possible.
+ */
+ if (!irqchip_in_kernel(vcpu->kvm) &&
+ vcpu->run->request_interrupt_window &&
+ !kvm_cpu_has_interrupt(vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
+ return 0;
+ }
+#endif /*XXX*/
+ return 1;
+}
+
+/*
+ * The exit handlers return 1 if the exit was handled fully and guest execution
+ * may resume. Otherwise they set the kvm_run parameter to indicate what needs
+ * to be done to userspace and return 0.
+ */
+static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
+ [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
+ [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
+ [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
+ [EXIT_REASON_NMI_WINDOW] = handle_nmi_window,
+ [EXIT_REASON_IO_INSTRUCTION] = handle_io,
+ [EXIT_REASON_CR_ACCESS] = handle_cr,
+ [EXIT_REASON_DR_ACCESS] = handle_dr,
+ [EXIT_REASON_CPUID] = handle_cpuid,
+ [EXIT_REASON_MSR_READ] = handle_rdmsr,
+ [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
+ [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
+ [EXIT_REASON_HLT] = handle_halt,
+ [EXIT_REASON_INVLPG] = handle_invlpg,
+ [EXIT_REASON_VMCALL] = handle_vmcall,
+ [EXIT_REASON_VMCLEAR] = handle_vmx_insn,
+ [EXIT_REASON_VMLAUNCH] = handle_vmx_insn,
+ [EXIT_REASON_VMPTRLD] = handle_vmx_insn,
+ [EXIT_REASON_VMPTRST] = handle_vmx_insn,
+ [EXIT_REASON_VMREAD] = handle_vmx_insn,
+ [EXIT_REASON_VMRESUME] = handle_vmx_insn,
+ [EXIT_REASON_VMWRITE] = handle_vmx_insn,
+ [EXIT_REASON_VMOFF] = handle_vmx_insn,
+ [EXIT_REASON_VMON] = handle_vmx_insn,
+ [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
+ [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
+ [EXIT_REASON_WBINVD] = handle_wbinvd,
+ [EXIT_REASON_TASK_SWITCH] = handle_task_switch,
+ [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
+ [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
+ [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
+ [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
+ [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op,
+ [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op,
+};
+
+static const int kvm_vmx_max_exit_handlers =
+ ARRAY_SIZE(kvm_vmx_exit_handlers);
+
+/*
+ * The guest has exited. See if we can fix it or if we need userspace
+ * assistance.
+ */
+
+static int vmx_handle_exit(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ uint32_t exit_reason = vmx->exit_reason;
+ uint32_t vectoring_info = vmx->idt_vectoring_info;
+
+ /* If guest state is invalid, start emulating */
+ if (vmx->emulation_required && emulate_invalid_guest_state)
+ return handle_invalid_guest_state(vcpu);
+
+	/*
+	 * CR3 accesses don't cause a vmexit in paging mode, so we need
+	 * to sync with the guest's real CR3.
+	 */
+ if (enable_ept && is_paging(vcpu))
+ vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+
+ if (vmx->fail) {
+ vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+ vcpu->run->fail_entry.hardware_entry_failure_reason
+ = vmcs_read32(VM_INSTRUCTION_ERROR);
+ return 0;
+ }
+
+ if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
+ (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
+ exit_reason != EXIT_REASON_EPT_VIOLATION &&
+ exit_reason != EXIT_REASON_TASK_SWITCH))
+ cmn_err(CE_WARN, "%s: unexpected, valid vectoring info "
+ "(0x%x) and exit reason is 0x%x\n",
+ __func__, vectoring_info, exit_reason);
+
+#ifdef XXX
+ if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
+ if (vmx_interrupt_allowed(vcpu)) {
+ vmx->soft_vnmi_blocked = 0;
+ } else if (vmx->vnmi_blocked_time > 1000000000LL &&
+ vcpu->arch.nmi_pending) {
+ /*
+			 * This CPU doesn't help us find the end of an
+ * NMI-blocked window if the guest runs with IRQs
+ * disabled. So we pull the trigger after 1 s of
+ * futile waiting, but inform the user about this.
+ */
+ cmn_err(CE_WARN, "%s: Breaking out of NMI-blocked "
+ "state on VCPU %d after 1 s timeout\n",
+ __func__, vcpu->vcpu_id);
+ vmx->soft_vnmi_blocked = 0;
+ }
+ }
+#endif /*XXX*/
+
+ if (exit_reason < kvm_vmx_max_exit_handlers
+ && kvm_vmx_exit_handlers[exit_reason])
+ return kvm_vmx_exit_handlers[exit_reason](vcpu);
+ else {
+ vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
+ vcpu->run->hw.hardware_exit_reason = exit_reason;
+ }
+ return 0;
+}
+
+static inline void kvm_guest_exit(void)
+{
+#ifdef XXX
+ account_system_vtime(current);
+ current->flags &= ~PF_VCPU;
+#endif /*XXX*/
+}
+
+static inline void kvm_guest_enter(void)
+{
+#ifdef XXX
+ account_system_vtime(current);
+ current->flags |= PF_VCPU;
+#endif /*XXX*/
+}
+
+int mmu_topup_memory_caches(struct kvm_vcpu *vcpu);
+
+int kvm_mmu_load(struct kvm_vcpu *vcpu)
+{
+ int r;
+
+ r = mmu_topup_memory_caches(vcpu);
+ if (r)
+ goto out;
+ mutex_enter(&vcpu->kvm->mmu_lock);
+ kvm_mmu_free_some_pages(vcpu);
+ r = mmu_alloc_roots(vcpu);
+ mmu_sync_roots(vcpu);
+ mutex_exit(&vcpu->kvm->mmu_lock);
+ if (r)
+ goto out;
+ /* set_cr3() should ensure TLB has been flushed */
+ kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
+out:
+ return r;
+}
+
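+/*
+ * Reload the MMU roots only if the current root is invalid (e.g. after a
+ * KVM_REQ_MMU_RELOAD unloaded them); otherwise this is a no-op.
+ */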
+static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.mmu.root_hpa != INVALID_PAGE)
+ return 0;
+
+ return kvm_mmu_load(vcpu);
+}
+
+static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+{
+ int r;
+
+ int req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
+ vcpu->run->request_interrupt_window;
+
+ if (vcpu->requests)
+ if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
+ kvm_mmu_unload(vcpu);
+
+ r = kvm_mmu_reload(vcpu);
+ if (r)
+ goto out;
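+	/*
+	 * Service deferred per-vcpu requests (timer migration, clock update,
+	 * MMU sync, TLB flush, FPU deactivation) before entering the guest.
+	 * TPR access reports and triple faults exit straight to userspace.
+	 */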
+ if (vcpu->requests) {
+ if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
+ __kvm_migrate_timers(vcpu);
+ if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests))
+ kvm_write_guest_time(vcpu);
+ if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
+ kvm_mmu_sync_roots(vcpu);
+ if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
+ kvm_x86_ops->tlb_flush(vcpu);
+ if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
+ &vcpu->requests)) {
+ vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
+ r = 0;
+ goto out;
+ }
+ if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
+ vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
+ r = 0;
+ goto out;
+ }
+ if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) {
+ vcpu->fpu_active = 0;
+ kvm_x86_ops->fpu_deactivate(vcpu);
+ }
+ }
+
+ kpreempt_disable();
+
+ kvm_x86_ops->prepare_guest_switch(vcpu);
+#ifdef XXX
+ if (vcpu->fpu_active)
+ kvm_load_guest_fpu(vcpu);
+#endif /*XXX*/
+	/* XXX Linux disables local interrupts (local_irq_disable()) here. */
+
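+	/*
+	 * Clearing KVM_REQ_KICK marks this vcpu as about to enter guest mode,
+	 * so a remote kick will send an IPI; re-check requests afterwards to
+	 * close the race with a request raised in the meantime.
+	 */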
+ BT_CLEAR(&vcpu->requests, KVM_REQ_KICK);
+#ifdef XXX
+ smp_mb__after_clear_bit();
+#endif /*XXX*/
+
+ if (vcpu->requests /*XXX || need_resched() || signal_pending(current)*/) {
+ BT_SET(&vcpu->requests, KVM_REQ_KICK);
+ kpreempt_enable();
+ r = 1;
+ goto out;
+ }
+#ifdef XXX
+ inject_pending_event(vcpu);
+
+ /* enable NMI/IRQ window open exits if needed */
+ if (vcpu->arch.nmi_pending)
+ kvm_x86_ops->enable_nmi_window(vcpu);
+ else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
+ kvm_x86_ops->enable_irq_window(vcpu);
+
+ if (kvm_lapic_enabled(vcpu)) {
+ update_cr8_intercept(vcpu);
+#ifdef XXX
+ kvm_lapic_sync_to_vapic(vcpu);
+#endif /*XXX*/
+ }
+
+ srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+#endif /*XXX*/
+ kvm_guest_enter();
+
+#ifdef XXX
+ if (unlikely(vcpu->arch.switch_db_regs)) {
+ set_debugreg(0, 7);
+ set_debugreg(vcpu->arch.eff_db[0], 0);
+ set_debugreg(vcpu->arch.eff_db[1], 1);
+ set_debugreg(vcpu->arch.eff_db[2], 2);
+ set_debugreg(vcpu->arch.eff_db[3], 3);
+ }
+
+ trace_kvm_entry(vcpu->vcpu_id);
+#endif /*XXX*/
+ kvm_x86_ops->run(vcpu);
+#ifdef XXX
+ /*
+ * If the guest has used debug registers, at least dr7
+ * will be disabled while returning to the host.
+ * If we don't have active breakpoints in the host, we don't
+ * care about the messed up debug address registers. But if
+ * we have some of them active, restore the old state.
+ */
+ if (hw_breakpoint_active())
+ hw_breakpoint_restore();
+#endif /*XXX*/
+ BT_SET(&vcpu->requests, KVM_REQ_KICK);
+
+#ifdef XXX
+ ++vcpu->stat.exits;
+#endif /*XXX*/
+ kvm_guest_exit();
+
+ kpreempt_enable();
+#ifdef XXX
+ vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+ /*
+ * Profile KVM exit RIPs:
+ */
+ if (unlikely(prof_on == KVM_PROFILING)) {
+ unsigned long rip = kvm_rip_read(vcpu);
+ profile_hit(KVM_PROFILING, (void *)rip);
+ }
+
+ kvm_lapic_sync_from_vapic(vcpu);
+#endif /*XXX*/
+ r = kvm_x86_ops->handle_exit(vcpu);
+out:
+ return r;
+}
+
+
+static void post_kvm_run_save(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *kvm_run = vcpu->run;
+
+ kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
+ kvm_run->cr8 = kvm_get_cr8(vcpu);
+ kvm_run->apic_base = kvm_get_apic_base(vcpu);
+ if (irqchip_in_kernel(vcpu->kvm))
+ kvm_run->ready_for_interrupt_injection = 1;
+#ifdef XXX
+ else
+ kvm_run->ready_for_interrupt_injection =
+ kvm_arch_interrupt_allowed(vcpu) &&
+ !kvm_cpu_has_interrupt(vcpu) &&
+ !kvm_event_needs_reinjection(vcpu);
+#endif /*XXX*/
+}
+
+/*
+ * The vCPU has executed a HLT instruction with in-kernel mode enabled.
+ */
+void kvm_vcpu_block(struct kvm_vcpu *vcpu)
+{
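+	/*
+	 * The Linux wait-queue based halt loop is stubbed out (XXX); a port
+	 * would presumably block on a condition variable until the vcpu
+	 * becomes runnable, a timer fires, or a signal is pending.
+	 */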
+#ifdef XXX
+ DEFINE_WAIT(wait);
+
+ for (;;) {
+ prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
+
+ if (kvm_arch_vcpu_runnable(vcpu)) {
+ set_bit(KVM_REQ_UNHALT, &vcpu->requests);
+ break;
+ }
+ if (kvm_cpu_has_pending_timer(vcpu))
+ break;
+ if (signal_pending(current))
+ break;
+
+ schedule();
+ }
+
+ finish_wait(&vcpu->wq, &wait);
+#endif /*XXX*/
+}
+
+static void vapic_enter(struct kvm_vcpu *vcpu)
+{
+#ifdef XXX
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ struct page *page;
+
+ if (!apic || !apic->vapic_addr)
+ return;
+
+ page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGESHIFT);
+
+ vcpu->arch.apic->vapic_page = page;
+#endif /*XXX*/
+}
+
+extern int kvm_apic_id(struct kvm_lapic *apic);
+
+static void vapic_exit(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ int idx;
+#ifdef XXX
+ if (!apic || !apic->vapic_addr)
+#endif /*XXX*/
+ return;
+#ifdef XXX
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ kvm_release_page_dirty(apic->vapic_page);
+ mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGESHIFT);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+#endif /*XXX*/
+}
+
+void kvm_lapic_reset(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic;
+ int i;
+
+ ASSERT(vcpu);
+ apic = vcpu->arch.apic;
+ ASSERT(apic != NULL);
+
+#ifdef XXX
+ /* Stop the timer in case it's a reset to an active apic */
+ hrtimer_cancel(&apic->lapic_timer.timer);
+#endif /*XXX*/
+
+ apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24);
+ kvm_apic_set_version(apic->vcpu);
+
+ for (i = 0; i < APIC_LVT_NUM; i++)
+ apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
+ apic_set_reg(apic, APIC_LVT0,
+ SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
+
+ apic_set_reg(apic, APIC_DFR, 0xffffffffU);
+ apic_set_reg(apic, APIC_SPIV, 0xff);
+ apic_set_reg(apic, APIC_TASKPRI, 0);
+ apic_set_reg(apic, APIC_LDR, 0);
+ apic_set_reg(apic, APIC_ESR, 0);
+ apic_set_reg(apic, APIC_ICR, 0);
+ apic_set_reg(apic, APIC_ICR2, 0);
+ apic_set_reg(apic, APIC_TDCR, 0);
+ apic_set_reg(apic, APIC_TMICT, 0);
+ for (i = 0; i < 8; i++) {
+ apic_set_reg(apic, APIC_IRR + 0x10 * i, 0);
+ apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
+ apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
+ }
+ apic->irr_pending = 0;
+#ifdef XXX
+ update_divide_count(apic);
+ atomic_set(&apic->lapic_timer.pending, 0);
+ if (kvm_vcpu_is_bsp(vcpu))
+ vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
+ apic_update_ppr(apic);
+#endif /*XXX*/
+
+ vcpu->arch.apic_arb_prio = 0;
+
+	cmn_err(CE_NOTE, "%s: vcpu=%p, id=%d, base_msr=0x%016" PRIx64
+	    " base_address=0x%lx\n", __func__, vcpu, kvm_apic_id(apic),
+	    vcpu->arch.apic_base, apic->base_address);
+}
+
+static int __vcpu_run(struct kvm_vcpu *vcpu)
+{
+ int r;
+ struct kvm *kvm = vcpu->kvm;
+
+ if (vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
+ cmn_err(CE_NOTE, "vcpu %d received sipi with vector # %x\n",
+ vcpu->vcpu_id, vcpu->arch.sipi_vector);
+ kvm_lapic_reset(vcpu);
+ r = kvm_arch_vcpu_reset(vcpu);
+ if (r)
+ return r;
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ }
+
+#ifdef XXX
+ vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+#endif /*XXX*/
+ vapic_enter(vcpu);
+
+ r = 1;
+ while (r > 0) {
+ if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
+ r = vcpu_enter_guest(vcpu);
+ else {
+#ifdef XXX
+ srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+#endif /*XXX*/
+ kvm_vcpu_block(vcpu);
+#ifdef XXX
+ vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+#endif /*XXX*/
+ /*
+ * XXX - the following should use a bitset_t
+ * and do bitset_atomic_test_and_del().
+ * but I am lazy, and will get to it later
+ */
+ if (BT_TEST(&vcpu->requests, KVM_REQ_UNHALT))
+ {
+ BT_CLEAR(&vcpu->requests, KVM_REQ_UNHALT);
+ switch(vcpu->arch.mp_state) {
+ case KVM_MP_STATE_HALTED:
+ vcpu->arch.mp_state =
+ KVM_MP_STATE_RUNNABLE;
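+					/* FALLTHROUGH */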
+ case KVM_MP_STATE_RUNNABLE:
+ break;
+ case KVM_MP_STATE_SIPI_RECEIVED:
+ default:
+ r = -EINTR;
+ break;
+ }
+ }
+ }
+
+ if (r <= 0)
+ break;
+
+#ifdef XXX
+ clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
+ if (kvm_cpu_has_pending_timer(vcpu))
+ kvm_inject_pending_timer_irqs(vcpu);
+ if (dm_request_for_irq_injection(vcpu)) {
+ r = -EINTR;
+ vcpu->run->exit_reason = KVM_EXIT_INTR;
+ ++vcpu->stat.request_irq_exits;
+ }
+
+ if (signal_pending(current)) {
+ r = -EINTR;
+ vcpu->run->exit_reason = KVM_EXIT_INTR;
+ ++vcpu->stat.signal_exits;
+ }
+ if (need_resched()) {
+ srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+ kvm_resched(vcpu);
+ vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+ }
+#endif /*XXX*/
+ }
+#ifdef XXX
+ srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+#endif /*XXX*/
+ post_kvm_run_save(vcpu);
+ vapic_exit(vcpu);
+ return r;
+}
+
+
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ int r;
+ sigset_t sigsaved;
+
+ vcpu_load(vcpu);
+
+ if (vcpu->sigset_active)
+ sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
+
+ if (vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED) {
+ kvm_vcpu_block(vcpu);
+ BT_CLEAR(&vcpu->requests, KVM_REQ_UNHALT);
+ r = -EAGAIN;
+ goto out;
+ }
+
+ /* re-sync apic's tpr */
+ if (!irqchip_in_kernel(vcpu->kvm))
+ kvm_set_cr8(vcpu, kvm_run->cr8);
+
+
+	if (vcpu->arch.pio.cur_count) {
+#ifdef XXX
+		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+		r = complete_pio(vcpu);
+		srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+		if (r)
+			goto out;
+#endif /*XXX*/
+	}
+ if (vcpu->mmio_needed) {
+ memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
+ vcpu->mmio_read_completed = 1;
+ vcpu->mmio_needed = 0;
+#ifdef XXX
+ vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0,
+ EMULTYPE_NO_DECODE);
+ srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+ if (r == EMULATE_DO_MMIO) {
+ /*
+ * Read-modify-write. Back to userspace.
+ */
+ r = 0;
+ goto out;
+ }
+#endif /*XXX*/
+ }
+ if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
+ kvm_register_write(vcpu, VCPU_REGS_RAX,
+ kvm_run->hypercall.ret);
+
+ r = __vcpu_run(vcpu);
+
+out:
+ if (vcpu->sigset_active)
+ sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+
+ vcpu_put(vcpu);
+ return r;
+}
+
static int
kvm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred_p, int *rval_p)
{
- int rval = EINVAL;
+ int rval = DDI_SUCCESS;
+	volatile int x; /* XXX - dtrace was not getting the fbt return probe */
switch(cmd) {
case KVM_GET_API_VERSION:
cmn_err(CE_NOTE, "kvm_ioctl: KVM_GET_API_VERSION");
- if (arg != NULL)
- return (rval);
+ if (arg != NULL) {
+ rval = EINVAL;
+ break;
+ }
*rval_p = KVM_API_VERSION;
- cmn_err(CE_NOTE, "kvm_ioctl: set rval_p to %d\n", *rval_p);
- rval = DDI_SUCCESS;
break;
case KVM_CREATE_VM:
- if (arg == NULL)
- return (rval);
+ if (arg == NULL) {
+ rval = EINVAL;
+ break;
+ }
rval = kvm_dev_ioctl_create_vm(arg, mode);
- return (rval);
+ break;
+ case KVM_RUN: {
+ struct kvm_run_ioc kvm_run_ioc;
+ struct kvm *kvmp;
+ struct kvm_vcpu *vcpu;
+
+ if (!arg) {
+ rval = EINVAL;
+ break;
+ }
+
+ if (ddi_copyin((caddr_t)arg, &kvm_run_ioc, sizeof kvm_run_ioc, mode)) {
+ rval = EFAULT;
+ break;
+ }
+
+ kvmp = find_kvm_id(kvm_run_ioc.kvm_kvmid);
+		if (kvmp == NULL ||
+		    kvm_run_ioc.kvm_cpu_index >= kvmp->online_vcpus) {
+			rval = EINVAL;
+			break;
+		}
+ vcpu = kvmp->vcpus[kvm_run_ioc.kvm_cpu_index];
+
+ rval = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
+ break;
+ }
case KVM_CHECK_EXTENSION:
rval = kvm_dev_ioctl_check_extension_generic(arg, rval_p);
- if (rval != DDI_SUCCESS)
- return (rval);
break;
+ case KVM_GET_MSRS: {
+ struct kvm_msrs_ioc kvm_msrs_ioc;
+ struct kvm_msrs kvm_msrs;
+ struct kvm *kvmp;
+ struct kvm_vcpu *vcpu;
+ struct kvm_msr_entry *entries;
+ unsigned size;
+ int n;
+
+ if (ddi_copyin((const void *)arg, &kvm_msrs_ioc,
+ sizeof(kvm_msrs_ioc), mode) != 0) {
+ rval = EFAULT;
+ break;
+ }
+		rval = EINVAL;
+		kvmp = find_kvm_id(kvm_msrs_ioc.kvm_kvmid);
+		if (kvmp == NULL ||
+		    kvm_msrs_ioc.kvm_cpu_index >= kvmp->online_vcpus)
+			break;
+
+ vcpu = kvmp->vcpus[kvm_msrs_ioc.kvm_cpu_index];
+
+ if (ddi_copyin(kvm_msrs_ioc.kvm_msrs, &kvm_msrs, sizeof(kvm_msrs), mode)) {
+ rval = EFAULT;
+ break;
+ }
+
+ if (kvm_msrs.nmsrs >= MAX_IO_MSRS) {
+ rval = E2BIG;
+ break;
+ }
+
+ size = sizeof(struct kvm_msr_entry) * kvm_msrs.nmsrs;
+ entries = (struct kvm_msr_entry *) kmem_alloc(size, KM_SLEEP);
+ if (!entries) {
+ rval = ENOMEM;
+ break;
+ }
+
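+		/*
+		 * The MSR entries live in user memory immediately after the
+		 * struct kvm_msrs header that kvm_msrs_ioc.kvm_msrs points to.
+		 */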
+ if (ddi_copyin((caddr_t)(((uint64_t)kvm_msrs_ioc.kvm_msrs)+(sizeof (struct kvm_msrs))), entries, size, mode)) {
+ kmem_free(entries, size);
+ rval = EFAULT;
+ break;
+ }
+
+ rval = n = __msr_io(vcpu, &kvm_msrs, entries, kvm_get_msr);
+
+ if (rval < 0) {
+ kmem_free(entries, size);
+ rval = EINVAL;
+ break;
+ }
+
+ rval = ddi_copyout(entries, (caddr_t)(((uint64_t)kvm_msrs_ioc.kvm_msrs)+(sizeof (struct kvm_msrs))), size, mode);
+ kmem_free(entries, size);
+
+ *rval_p = n;
+
+ break;
+ }
+
+ case KVM_SET_MSRS: {
+ struct kvm_msrs_ioc kvm_msrs_ioc;
+ struct kvm_msrs kvm_msrs;
+ struct kvm *kvmp;
+ struct kvm_vcpu *vcpu;
+ struct kvm_msr_entry *entries;
+ unsigned size;
+ int n;
+
+ if (ddi_copyin((const void *)arg, &kvm_msrs_ioc,
+ sizeof(kvm_msrs_ioc), mode) != 0) {
+ rval = EFAULT;
+ break;
+ }
+
+		rval = EINVAL;
+		kvmp = find_kvm_id(kvm_msrs_ioc.kvm_kvmid);
+		if (kvmp == NULL ||
+		    kvm_msrs_ioc.kvm_cpu_index >= kvmp->online_vcpus)
+			break;
+
+ vcpu = kvmp->vcpus[kvm_msrs_ioc.kvm_cpu_index];
+
+ if (ddi_copyin(kvm_msrs_ioc.kvm_msrs, &kvm_msrs, sizeof(kvm_msrs), mode)) {
+ rval = EFAULT;
+ break;
+ }
+
+ if (kvm_msrs.nmsrs >= MAX_IO_MSRS) {
+ rval = E2BIG;
+ break;
+ }
+
+ size = sizeof(struct kvm_msr_entry) * kvm_msrs.nmsrs;
+ entries = (struct kvm_msr_entry *)kmem_alloc(size, KM_SLEEP);
+ if (!entries) {
+ rval = ENOMEM;
+ break;
+ }
+
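+		/*
+		 * As with KVM_GET_MSRS, the entries follow the struct kvm_msrs
+		 * header in user memory.
+		 */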
+ if (ddi_copyin((caddr_t)(((uint64_t)kvm_msrs_ioc.kvm_msrs)+(sizeof (struct kvm_msrs))), entries, size, mode)) {
+ kmem_free(entries, size);
+ rval = EFAULT;
+ break;
+ }
+
+ rval = n = __msr_io(vcpu, &kvm_msrs, entries, do_set_msr);
+
+ if (rval < 0) {
+ kmem_free(entries, size);
+ rval = EINVAL;
+ break;
+ }
+ kmem_free(entries, size);
+ *rval_p = n;
+ break;
+ }
+
case KVM_CREATE_VCPU: {
struct kvm_vcpu_ioc kvm_vcpu;
struct kvm *kvmp;
if (ddi_copyin((const void *)arg, &kvm_vcpu,
- sizeof(kvm_vcpu), mode) != 0)
- return (EFAULT);
+ sizeof(kvm_vcpu), mode) != 0) {
+ rval = EFAULT;
+ break;
+ }
+ rval = EINVAL;
kvmp = find_kvm_id(kvm_vcpu.kvmid);
if (kvmp == NULL)
- return(EINVAL);
+ break;
rval = kvm_vm_ioctl_create_vcpu(kvmp, kvm_vcpu.id, &kvm_vcpu, rval_p);
- if (rval != 0)
- return (rval);
+ if (rval != 0) {
+ rval = EINVAL;
+ break;
+ }
+
if (ddi_copyout(&kvm_vcpu, (void *)arg,
sizeof(kvm_vcpu), mode) != 0)
- return EFAULT;
+ rval = EFAULT;
break;
}
@@ -2602,54 +8781,261 @@ kvm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred_p, int *rval_
struct kvm *kvmp;
if (ddi_copyin((const void *)arg, &kvmioc,
- sizeof(kvmioc), mode) != 0)
- return (EFAULT);
+ sizeof(kvmioc), mode) != 0) {
+ rval = EFAULT;
+ break;
+ }
kvmp = find_kvm_id(kvmioc.kvmid);
- if (kvmp == NULL)
- return(EINVAL);
+ if (kvmp == NULL) {
+ rval = EINVAL;
+ break;
+ }
rval = kvm_vm_ioctl_set_memory_region(kvmp, &kvmioc.kvm_userspace_map, 1);
- if (rval != 0)
- return (rval);
+ if (rval != 0) {
+ rval = EINVAL;
+ break;
+ }
break;
}
case KVM_GET_SUPPORTED_CPUID: {
struct kvm_cpuid2 *cpuid_arg = (struct kvm_cpuid2 *)arg;
struct kvm_cpuid2 cpuid;
- if (ddi_copyin(cpuid_arg, &cpuid, sizeof (cpuid), mode))
- return (EFAULT);
+ if (ddi_copyin(cpuid_arg, &cpuid, sizeof (cpuid), mode)) {
+ rval = EFAULT;
+ break;
+ }
rval = kvm_dev_ioctl_get_supported_cpuid(&cpuid,
cpuid_arg->entries, mode);
if (rval)
- return (rval);
+ break;
if (ddi_copyout(&cpuid, cpuid_arg, sizeof (cpuid), mode))
- return (EFAULT);
+ rval = EFAULT;
+ break;
+ }
+
+ case KVM_GET_MSR_INDEX_LIST: {
+ struct kvm_msr_list *user_msr_list = (struct kvm_msr_list *)arg;
+ struct kvm_msr_list msr_list;
+ unsigned n;
+
+ if (ddi_copyin(user_msr_list, &msr_list, sizeof msr_list, mode)) {
+ rval = EFAULT;
+ break;
+ }
+
+ n = msr_list.nmsrs;
+ msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
+ if (ddi_copyout(&msr_list, user_msr_list, sizeof msr_list, mode)) {
+ rval = EFAULT;
+ break;
+ }
+ if (n < msr_list.nmsrs) {
+ rval = E2BIG;
+ break;
+ }
+ rval = EFAULT;
+ if (ddi_copyout(&msrs_to_save, user_msr_list->indices,
+ num_msrs_to_save * sizeof(uint32_t), mode))
+ break;
+ if (ddi_copyout(&emulated_msrs,
+ user_msr_list->indices + num_msrs_to_save,
+ ARRAY_SIZE(emulated_msrs) * sizeof(uint32_t), mode))
+ break;
+ rval = 0;
+ *rval_p = 0;
+ break;
+ }
+ case KVM_GET_REGS: {
+ struct kvm_regs_ioc kvm_regs_ioc;
+ struct kvm *kvmp;
+ struct kvm_vcpu *vcpu;
+
+ if (ddi_copyin((caddr_t)arg, &kvm_regs_ioc, sizeof (kvm_regs_ioc), mode)) {
+ rval = EFAULT;
+ break;
+ }
+
+ kvmp = find_kvm_id(kvm_regs_ioc.kvm_kvmid);
+
+ if (!kvmp || kvm_regs_ioc.kvm_cpu_index >= kvmp->online_vcpus) {
+ rval = EINVAL;
+ break;
+ }
+
+ vcpu = kvmp->vcpus[kvm_regs_ioc.kvm_cpu_index];
+
+ rval = kvm_arch_vcpu_ioctl_get_regs(vcpu, &kvm_regs_ioc.kvm_regs);
+ if (rval) {
+ rval = EINVAL;
+ break;
+ }
+ if (ddi_copyout(&kvm_regs_ioc, (caddr_t)arg, sizeof(kvm_regs_ioc), mode))
+ rval = EFAULT;
+ *rval_p = 0;
+ break;
+ }
+ case KVM_SET_REGS: {
+ struct kvm_regs_ioc kvm_regs_ioc;
+ struct kvm *kvmp;
+ struct kvm_vcpu *vcpu;
+
+ if (ddi_copyin((caddr_t)arg, &kvm_regs_ioc, sizeof (kvm_regs_ioc), mode)) {
+ rval = EFAULT;
+ break;
+ }
+
+ kvmp = find_kvm_id(kvm_regs_ioc.kvm_kvmid);
+ if (!kvmp || kvm_regs_ioc.kvm_cpu_index >= kvmp->online_vcpus) {
+ rval = EINVAL;
+ break;
+ }
+
+ vcpu = kvmp->vcpus[kvm_regs_ioc.kvm_cpu_index];
+
+ cmn_err(CE_NOTE, "KVM_SET_REGS: rax = %lx, rbx = %lx, rcx = %lx, rdx = %lx\n",
+ kvm_regs_ioc.kvm_regs.rax, kvm_regs_ioc.kvm_regs.rbx, kvm_regs_ioc.kvm_regs.rcx, kvm_regs_ioc.kvm_regs.rdx);
+
+ rval = kvm_arch_vcpu_ioctl_set_regs(vcpu, &kvm_regs_ioc.kvm_regs);
+ if (rval)
+ rval = EINVAL;
+ *rval_p = 0;
+ break;
+ }
+ case KVM_GET_FPU: {
+ struct kvm_fpu_ioc kvm_fpu_ioc;
+ struct kvm *kvmp;
+ struct kvm_vcpu *vcpu;
+
+ if (ddi_copyin((caddr_t)arg, &kvm_fpu_ioc, sizeof(kvm_fpu_ioc), mode)) {
+ rval = EFAULT;
+ break;
+ }
+
+ kvmp = find_kvm_id(kvm_fpu_ioc.kvm_kvmid);
+ if (!kvmp || kvm_fpu_ioc.kvm_cpu_index >= kvmp->online_vcpus) {
+ rval = EINVAL;
+ break;
+ }
+
+ vcpu = kvmp->vcpus[kvm_fpu_ioc.kvm_cpu_index];
+
+ rval = kvm_arch_vcpu_ioctl_get_fpu(vcpu, &kvm_fpu_ioc.fpu);
+ if (rval) {
+ rval = EINVAL;
+ break;
+ }
+
+		if (ddi_copyout(&kvm_fpu_ioc, (caddr_t)arg, sizeof(kvm_fpu_ioc), mode))
+ rval = EFAULT;
+
+ *rval_p = 0;
+ break;
+ }
+ case KVM_SET_FPU: {
+ struct kvm_fpu_ioc kvm_fpu_ioc;
+ struct kvm *kvmp;
+ struct kvm_vcpu *vcpu;
+
+ if (ddi_copyin((caddr_t)arg, &kvm_fpu_ioc, sizeof(kvm_fpu_ioc), mode)) {
+ rval = EFAULT;
+ break;
+ }
+
+ kvmp = find_kvm_id(kvm_fpu_ioc.kvm_kvmid);
+ if (!kvmp || kvm_fpu_ioc.kvm_cpu_index >= kvmp->online_vcpus) {
+ rval = EINVAL;
+ break;
+ }
+
+ vcpu = kvmp->vcpus[kvm_fpu_ioc.kvm_cpu_index];
+
+ rval = kvm_arch_vcpu_ioctl_set_fpu(vcpu, &kvm_fpu_ioc.fpu);
+ if (rval)
+ rval = EINVAL;
+ *rval_p = 0;
break;
}
+ case KVM_GET_SREGS: {
+ struct kvm_sregs_ioc kvm_sregs_ioc;
+ struct kvm *kvmp;
+ struct kvm_vcpu *vcpu;
+ if (ddi_copyin((caddr_t)arg, &kvm_sregs_ioc, sizeof (kvm_sregs_ioc), mode)) {
+ rval = EFAULT;
+ break;
+ }
+
+ kvmp = find_kvm_id(kvm_sregs_ioc.kvm_kvmid);
+ if (!kvmp || kvm_sregs_ioc.kvm_cpu_index >= kvmp->online_vcpus) {
+ rval = EINVAL;
+ break;
+ }
+
+ vcpu = kvmp->vcpus[kvm_sregs_ioc.kvm_cpu_index];
+
+ rval = kvm_arch_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs_ioc.sregs);
+ if (rval) {
+ rval = EINVAL;
+ break;
+ }
+ if (ddi_copyout(&kvm_sregs_ioc, (caddr_t)arg, sizeof(kvm_sregs_ioc), mode))
+ rval = EFAULT;
+ *rval_p = 0;
+ break;
+ }
+ case KVM_SET_SREGS: {
+ struct kvm_sregs_ioc kvm_sregs_ioc;
+ struct kvm *kvmp;
+ struct kvm_vcpu *vcpu;
+
+ if (ddi_copyin((caddr_t)arg, &kvm_sregs_ioc, sizeof (kvm_sregs_ioc), mode)) {
+ rval = EFAULT;
+ break;
+ }
+
+ kvmp = find_kvm_id(kvm_sregs_ioc.kvm_kvmid);
+ if (!kvmp || kvm_sregs_ioc.kvm_cpu_index >= kvmp->online_vcpus) {
+ rval = EINVAL;
+ break;
+ }
+
+ vcpu = kvmp->vcpus[kvm_sregs_ioc.kvm_cpu_index];
+
+ rval = kvm_arch_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs_ioc.sregs);
+ if (rval)
+ rval = EINVAL;
+ *rval_p = 0;
+ break;
+ }
case KVM_SET_CPUID2: {
struct kvm_cpuid2_ioc cpuid_ioc;
struct kvm_cpuid2 cpuid_data;
struct kvm_vcpu *vcpu;
- rval = EFAULT;
- if (ddi_copyin((const char *)arg, &cpuid_ioc, sizeof cpuid_ioc, mode))
- return (EFAULT);
- if (cpuid_ioc.kvm_vcpu_addr == NULL)
- return (EINVAL);
+ if (ddi_copyin((const char *)arg, &cpuid_ioc, sizeof cpuid_ioc, mode)) {
+ rval = EFAULT;
+ break;
+ }
+ if (cpuid_ioc.kvm_vcpu_addr == NULL) {
+ rval = EINVAL;
+ break;
+ }
vcpu = (struct kvm_vcpu *)(cpuid_ioc.kvm_vcpu_addr);
if (ddi_copyin((const char *)(cpuid_ioc.cpuid_data), (char *)&cpuid_data,
- sizeof(cpuid_data), mode))
- return (EFAULT);
+ sizeof(cpuid_data), mode)) {
+ rval = EFAULT;
+ break;
+ }
rval = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid_data,
cpuid_data.entries, mode);
if (rval)
- return (rval);
+ rval = EINVAL;
break;
}
@@ -2658,33 +9044,41 @@ kvm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred_p, int *rval_
struct kvm_cpuid2 cpuid_data;
struct kvm_vcpu *vcpu;
- rval = EFAULT;
- if (ddi_copyin((const char *)arg, &cpuid_ioc, sizeof cpuid_ioc, mode))
- return (EFAULT);
+ if (ddi_copyin((const char *)arg, &cpuid_ioc, sizeof cpuid_ioc, mode)) {
+ rval = EFAULT;
+ break;
+ }
- if (cpuid_ioc.kvm_vcpu_addr == NULL)
- return (EINVAL);
+ if (cpuid_ioc.kvm_vcpu_addr == NULL) {
+ rval = EINVAL;
+ break;
+ }
vcpu = (struct kvm_vcpu *)cpuid_ioc.kvm_vcpu_addr;
if (ddi_copyin((const char *)(cpuid_ioc.cpuid_data), (char *)&cpuid_data,
- sizeof(cpuid_data), mode))
- return (EFAULT);
+ sizeof(cpuid_data), mode)) {
+ rval = EFAULT;
+ break;
+ }
rval = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid_data,
cpuid_data.entries, mode);
- if (rval)
- return (rval);
- rval = EFAULT;
+ if (rval) {
+ rval = EINVAL;
+ break;
+ }
+
if (ddi_copyout(&cpuid_ioc, (char *)arg, sizeof cpuid_ioc, mode))
- return (EFAULT);
- rval = 0;
+ rval = EFAULT;
break;
}
case KVM_GET_VCPU_MMAP_SIZE:
- if (arg != NULL)
- return (rval);
+ if (arg != NULL) {
+ rval = EINVAL;
+ break;
+ }
*rval_p = ptob(1);
break;
case KVM_SET_TSS_ADDR:
@@ -2692,22 +9086,27 @@ kvm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred_p, int *rval_
struct kvm_tss kvm_tss;
struct kvm *kvmp;
if (ddi_copyin((const void *)arg, &kvm_tss,
- sizeof(kvm_tss), mode) != 0)
- return (EFAULT);
+ sizeof(kvm_tss), mode) != 0) {
+ rval = EFAULT;
+ break;
+ }
kvmp = find_kvm_id(kvm_tss.kvmid);
- if (kvmp == NULL)
- return(EINVAL);
+ if (kvmp == NULL) {
+ rval = EINVAL;
+ break;
+ }
rval = kvm_vm_ioctl_set_tss_addr(kvmp, kvm_tss.addr);
- if (rval != DDI_SUCCESS)
- return (rval);
+ break;
}
default:
- return (rval); /* x64, others may do other things... */
+ rval = EINVAL; /* x64, others may do other things... */
}
+
+ x = 10; /*XXX do something...*/
if (*rval_p == -1)
return (EINVAL);
- return (DDI_SUCCESS);
+ return (rval);
}
static int
diff --git a/kvm.h b/kvm.h
index c9f81f2..c18812e 100644
--- a/kvm.h
+++ b/kvm.h
@@ -5,6 +5,7 @@
#include <sys/ddi.h>
#include <sys/sunddi.h>
+#include "kvm_types.h"
#include <sys/bitmap.h>
#define KVM_API_VERSION 12 /* same as linux (for qemu compatability...) */
@@ -13,6 +14,8 @@
#define offsetof(s, m) ((size_t)(&((s *)0)->m))
#endif
+#define offset_in_page(p) ((unsigned long)(p) & ~PAGEMASK)
+
/* borrowed liberally from linux... */
#define MAX_IO_MSRS 256
@@ -30,6 +33,8 @@
#define KVM_MAX_VCPUS 64
+#define MCG_CTL_P (1ULL<<8) /* MCG_CTL register available */
+
#define KVM_MAX_MCE_BANKS 32
#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P
#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
@@ -154,12 +159,6 @@
#define KVM_NR_PAGE_SIZES 3 /* XXX assumes x86 */
-enum kvm_bus {
- KVM_MMIO_BUS,
- KVM_PIO_BUS,
- KVM_NR_BUSES
-};
-
struct kvm_vcpu_data {
char vcpu_vhpt[VHPT_SIZE];
char vcpu_vtlb[VTLB_SIZE];
@@ -175,105 +174,32 @@ struct kvm_vm_data {
};
/*
- * Address types:
- *
- * gva - guest virtual address
- * gpa - guest physical address
- * gfn - guest frame number
- * hva - host virtual address
- * hpa - host physical address
- * hfn - host frame number
- */
-
-typedef unsigned long gva_t;
-typedef uint64_t gpa_t;
-typedef unsigned long gfn_t;
-
-typedef unsigned long hva_t;
-typedef uint64_t hpa_t;
-typedef unsigned long hfn_t;
-
-/*
- * kvm_mmu_page_role, below, is defined as:
- *
- * bits 0:3 - total guest paging levels (2-4, or zero for real mode)
- * bits 4:7 - page table level for this shadow (1-4)
- * bits 8:9 - page table quadrant for 2-level guests
- * bit 16 - direct mapping of virtual to physical mapping at gfn
- * used for real mode and two-dimensional paging
- * bits 17:19 - common access permissions for all ptes in this shadow page
- */
-union kvm_mmu_page_role {
- unsigned word;
- struct {
- unsigned glevels:4;
- unsigned level:4;
- unsigned quadrant:2;
- unsigned pad_for_nice_hex_output:6;
- unsigned direct:1;
- unsigned access:3;
- unsigned invalid:1;
- unsigned cr4_pge:1;
- unsigned nxe:1;
- }w;
-};
-
-
-/*
- * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
- * 32-bit). The kvm_mmu structure abstracts the details of the current mmu
- * mode.
- */
-
-struct kvm_vcpu;
-struct kvm_mmu_page;
-
-struct kvm_mmu {
- void (*new_cr3)(struct kvm_vcpu *vcpu);
- int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, uint32_t err);
- void (*free)(struct kvm_vcpu *vcpu);
- gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, uint32_t access,
- uint32_t *error);
- void (*prefetch_page)(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *page);
- int (*sync_page)(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp);
- void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
- hpa_t root_hpa;
- int root_level;
- int shadow_root_level;
- union kvm_mmu_page_role base_role;
-
- uint64_t *pae_root;
- uint64_t rsvd_bits_mask[2][4];
-};
-
-struct kvm_pv_mmu_op_buffer {
- void *ptr;
- unsigned len;
- unsigned processed;
- char pad[2];
- char buf[512]; /* XXX aligned */
-};
-
-/*
* We don't want allocation failures within the mmu code, so we preallocate
* enough memory for a single page fault in a cache.
*/
-#define KVM_NR_MEM_OBJS 40
-#define KVM_NR_DB_REGS 4
-struct kvm_mmu_memory_cache {
- int nobjs;
- void *objects[KVM_NR_MEM_OBJS];
-};
+#define KVM_NR_DB_REGS 4
-/* Type, address-of, and value of an instruction's operand. */
-struct operand {
- enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
- unsigned int bytes;
- unsigned long val, orig_val, *ptr;
+/*
+ * fxsave fpu state. Taken from x86_64/processor.h. To be killed when
+ * we have asm/x86/processor.h
+ */
+struct fxsave {
+ uint16_t cwd;
+ uint16_t swd;
+ uint16_t twd;
+ uint16_t fop;
+ uint64_t rip;
+ uint64_t rdp;
+ uint32_t mxcsr;
+ uint32_t mxcsr_mask;
+ uint32_t st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
+#ifdef CONFIG_X86_64
+ uint32_t xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
+#else
+ uint32_t xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
+#endif
};
struct i387_fxsave_struct {
@@ -311,109 +237,11 @@ struct i387_fxsave_struct {
} __attribute__((aligned(16)));
-struct kvm_pio_request {
- unsigned long count;
- int cur_count;
- gva_t guest_gva;
- int in;
- int port;
- int size;
- int string;
- int down;
- int rep;
-};
-
#define KVM_MAX_CPUID_ENTRIES 40
-enum kvm_reg {
- VCPU_REGS_RAX = 0,
- VCPU_REGS_RCX = 1,
- VCPU_REGS_RDX = 2,
- VCPU_REGS_RBX = 3,
- VCPU_REGS_RSP = 4,
- VCPU_REGS_RBP = 5,
- VCPU_REGS_RSI = 6,
- VCPU_REGS_RDI = 7,
-#ifdef CONFIG_X86_64
- VCPU_REGS_R8 = 8,
- VCPU_REGS_R9 = 9,
- VCPU_REGS_R10 = 10,
- VCPU_REGS_R11 = 11,
- VCPU_REGS_R12 = 12,
- VCPU_REGS_R13 = 13,
- VCPU_REGS_R14 = 14,
- VCPU_REGS_R15 = 15,
-#endif
- VCPU_REGS_RIP,
- NR_VCPU_REGS
-};
-enum kvm_reg_ex {
- VCPU_EXREG_PDPTR = NR_VCPU_REGS,
-};
-
-struct kvm_cpuid_entry2 {
- uint32_t function;
- uint32_t index;
- uint32_t flags;
- uint32_t eax;
- uint32_t ebx;
- uint32_t ecx;
- uint32_t edx;
- uint32_t padding[3];
-};
-
-struct fetch_cache {
- unsigned char data[15];
- unsigned long start;
- unsigned long end;
-};
-
-struct decode_cache {
- unsigned char twobyte;
- unsigned char b;
- unsigned char lock_prefix;
- unsigned char rep_prefix;
- unsigned char op_bytes;
- unsigned char ad_bytes;
- unsigned char rex_prefix;
- struct operand src;
- struct operand src2;
- struct operand dst;
- unsigned char has_seg_override;
- unsigned char seg_override;
- unsigned int d;
- unsigned long regs[NR_VCPU_REGS];
- unsigned long eip, eip_orig;
- /* modrm */
- unsigned char modrm;
- unsigned char modrm_mod;
- unsigned char modrm_reg;
- unsigned char modrm_rm;
- unsigned char use_modrm_ea;
- unsigned char rip_relative;
- unsigned long modrm_ea;
- void *modrm_ptr;
- unsigned long modrm_val;
- struct fetch_cache fetch;
-};
+#include "kvm_emulate.h"
-struct x86_emulate_ctxt {
- /* Register state before/after emulation. */
- struct kvm_vcpu *vcpu;
-
- unsigned long eflags;
- /* Emulated execution mode, represented by an X86EMUL_MODE value. */
- int mode;
- uint32_t cs_base;
-
- /* interruptibility state, as a result of execution of STI or MOV SS */
- int interruptibility;
-
- /* decode cache */
- struct decode_cache decode;
-};
-
/*
* These structs MUST NOT be changed.
* They are the ABI between hypervisor and guest OS.
@@ -444,34 +272,12 @@ struct pvclock_vcpu_time_info {
unsigned char pad[3];
} __attribute__((__packed__)); /* 32 bytes */
-/* In the Intel processor's MTRR interface, the MTRR type is always held in
- an 8 bit field: */
-typedef unsigned char mtrr_type;
-
-#define MTRR_NUM_FIXED_RANGES 88
-#define MTRR_MAX_VAR_RANGES 256
-
-struct mtrr_var_range {
- uint32_t base_lo;
- uint32_t base_hi;
- uint32_t mask_lo;
- uint32_t mask_hi;
-};
-
-struct mtrr_state_type {
- struct mtrr_var_range var_ranges[MTRR_MAX_VAR_RANGES];
- mtrr_type fixed_ranges[MTRR_NUM_FIXED_RANGES];
- unsigned char enabled;
- unsigned char have_fixed;
- mtrr_type def_type;
-};
-
#define APIC_LDR 0xD0
-
+#ifdef _KERNEL
struct kvm_lapic {
unsigned long base_address;
-#ifdef XXX
struct kvm_io_device dev;
+#ifdef XXX
struct kvm_timer lapic_timer;
#endif /*XXX*/
uint32_t divide_count;
@@ -484,114 +290,7 @@ struct kvm_lapic {
struct page *vapic_page;
};
-struct kvm_vcpu_arch {
- uint64_t host_tsc;
- /*
- * rip and regs accesses must go through
- * kvm_{register,rip}_{read,write} functions.
- */
- unsigned long regs[NR_VCPU_REGS];
- uint32_t regs_avail;
- uint32_t regs_dirty;
-
- unsigned long cr0;
- unsigned long cr0_guest_owned_bits;
- unsigned long cr2;
- unsigned long cr3;
- unsigned long cr4;
- unsigned long cr4_guest_owned_bits;
- unsigned long cr8;
- uint32_t hflags;
- uint64_t pdptrs[4]; /* pae */
- uint64_t efer;
- uint64_t apic_base;
- struct kvm_lapic *apic; /* kernel irqchip context */
- int32_t apic_arb_prio;
- int mp_state;
- int sipi_vector;
- uint64_t ia32_misc_enable_msr;
- char tpr_access_reporting;
-
- struct kvm_mmu mmu;
- /* only needed in kvm_pv_mmu_op() path, but it's hot so
- * put it here to avoid allocation */
- struct kvm_pv_mmu_op_buffer mmu_op_buffer;
-
- struct kvm_mmu_memory_cache mmu_pte_chain_cache;
- struct kvm_mmu_memory_cache mmu_rmap_desc_cache;
- struct kvm_mmu_memory_cache mmu_page_cache;
- struct kvm_mmu_memory_cache mmu_page_header_cache;
-
- gfn_t last_pt_write_gfn;
- int last_pt_write_count;
- uint64_t *last_pte_updated;
- gfn_t last_pte_gfn;
-
- struct {
- gfn_t gfn; /* presumed gfn during guest pte update */
- pfn_t pfn; /* pfn corresponding to that gfn */
- unsigned long mmu_seq;
- } update_pte;
-
- struct i387_fxsave_struct host_fx_image;
- struct i387_fxsave_struct guest_fx_image;
-
- gva_t mmio_fault_cr2;
- struct kvm_pio_request pio;
- void *pio_data;
-
- unsigned char event_exit_inst_len;
-
- struct kvm_queued_exception {
- char pending;
- char has_error_code;
- unsigned char nr;
- uint32_t error_code;
- } exception;
-
- struct kvm_queued_interrupt {
- char pending;
- char soft;
- unsigned char nr;
- } interrupt;
-
- int halt_request; /* real mode on Intel only */
-
- int cpuid_nent;
- struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES];
- /* emulate context */
-
- struct x86_emulate_ctxt emulate_ctxt;
-
- gpa_t time;
- struct pvclock_vcpu_time_info hv_clock;
- unsigned int hv_clock_tsc_khz;
- unsigned int time_offset;
- struct page *time_page;
-
- char nmi_pending;
- char nmi_injected;
-
- struct mtrr_state_type mtrr_state;
- uint32_t pat;
-
- int switch_db_regs;
- unsigned long db[KVM_NR_DB_REGS];
- unsigned long dr6;
- unsigned long dr7;
- unsigned long eff_db[KVM_NR_DB_REGS];
-
- uint64_t mcg_cap;
- uint64_t mcg_status;
- uint64_t mcg_ctl;
- uint64_t *mce_banks;
-
- /* used for guest single stepping over the given code position */
- unsigned short singlestep_cs;
- unsigned long singlestep_rip;
- /* fields used by HYPER-V emulation */
- uint64_t hv_vapic;
-};
+struct vcpu_vmx;
struct kvm_vcpu {
struct kvm *kvm;
@@ -615,18 +314,37 @@ struct kvm_vcpu {
sigset_t sigset;
struct kstat stat;
-#ifdef CONFIG_HAS_IOMEM
+ /*#ifdef CONFIG_HAS_IOMEM*/
int mmio_needed;
int mmio_read_completed;
int mmio_is_write;
int mmio_size;
unsigned char mmio_data[8];
gpa_t mmio_phys_addr;
-#endif
+ /*#endif*/
struct kvm_vcpu_arch arch;
};
+
+#define KVM_NR_SHARED_MSRS 16
+
+struct kvm_shared_msrs_global {
+ int nr;
+ uint32_t msrs[KVM_NR_SHARED_MSRS];
+};
+
+struct kvm_shared_msrs {
+#ifdef XXX
+ struct user_return_notifier urn;
+#endif /*XXX*/
+ int registered;
+ struct kvm_shared_msr_values {
+ uint64_t host;
+ uint64_t curr;
+ } values[KVM_NR_SHARED_MSRS];
+};
+
struct kvm_memory_slot {
gfn_t base_gfn;
unsigned long npages;
@@ -641,12 +359,6 @@ struct kvm_memory_slot {
int user_alloc;
};
-#define KVM_MEMORY_SLOTS 32 /* XXX assumes x86 */
-#define KVM_PRIVATE_MEM_SLOTS 4 /* XXX assumes x86 */
-#define TSS_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 0)
-#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 1)
-#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 2)
-
struct kvm_memslots {
int nmemslots;
@@ -654,6 +366,7 @@ struct kvm_memslots {
KVM_PRIVATE_MEM_SLOTS];
};
+#endif /*_KERNEL*/
#ifdef x86
@@ -784,21 +497,18 @@ struct kvm_regs {
uint64_t rip, rflags;
};
+struct kvm_regs_ioc {
+ struct kvm_regs kvm_regs;
+ int kvm_cpu_index;
+ int kvm_kvmid;
+};
+
/* for KVM_GET_LAPIC and KVM_SET_LAPIC */
#define KVM_APIC_REG_SIZE 0x400
struct kvm_lapic_state {
char regs[KVM_APIC_REG_SIZE];
};
-struct kvm_segment {
- uint64_t base;
- uint32_t limit;
- unsigned short selector;
- unsigned char type;
- unsigned char present, dpl, db, s, l, g, avl;
- unsigned char unusable;
- unsigned char padding;
-};
struct kvm_dtable {
uint64_t base;
@@ -822,6 +532,12 @@ struct kvm_sregs {
uint64_t interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
};
+struct kvm_sregs_ioc {
+ struct kvm_sregs sregs;
+ int kvm_cpu_index;
+ int kvm_kvmid;
+};
+
/* When set in flags, include corresponding fields on KVM_SET_VCPU_EVENTS */
#define KVM_VCPUEVENT_VALID_NMI_PENDING 0x00000001
#define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002
@@ -930,13 +646,6 @@ struct kvm_assigned_dev_kernel {
kmutex_t assigned_dev_lock;
};
-#define NR_PTE_CHAIN_ENTRIES 5
-
-struct kvm_pte_chain {
- uint64_t *parent_ptes[NR_PTE_CHAIN_ENTRIES];
- list_t link;
-};
-
/**
* container_of - cast a member of a structure out to the containing structure
* @ptr: the pointer to the member.
@@ -948,39 +657,6 @@ struct kvm_pte_chain {
const typeof( ((type *)0)->member ) *__mptr = (ptr); \
(type *)( (char *)__mptr - offsetof(type,member) );})
-
-struct kvm_mmu_page {
- struct list_node link;
- struct list_node hash_link;
-
- struct list_node oos_link;
-
- /*
- * The following two entries are used to key the shadow page in the
- * hash table.
- */
- gfn_t gfn;
- union kvm_mmu_page_role role;
-
- uint64_t *spt;
- /* hold the gfn of each spte inside spt */
- gfn_t *gfns;
- /*
- * One bit set per slot which has memory
- * in this shadow page.
- */
- unsigned long slot_bitmap[BT_BITOUL(KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)];
- int multimapped; /* More than one parent_pte? */
- int root_count; /* Currently serving as active root */
- char unsync;
- unsigned int unsync_children;
- union {
- uint64_t *parent_pte; /* !multimapped */
- list_t parent_ptes; /* hash list, multimapped, kvm_pte_chain */
- }v;
- unsigned long unsync_child_bitmap[BT_BITOUL(512)];
-};
-
#define PT64_ROOT_LEVEL 4
#define PT32_ROOT_LEVEL 2
#define PT32E_ROOT_LEVEL 3
@@ -1069,6 +745,11 @@ struct kvm_fpu {
uint32_t pad2;
};
+struct kvm_fpu_ioc {
+ struct kvm_fpu fpu;
+ int kvm_cpu_index;
+ int kvm_kvmid;
+};
struct kvm_msr_entry {
uint32_t index;
@@ -1084,6 +765,12 @@ struct kvm_msrs {
struct kvm_msr_entry entries[1];
};
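+/*
+ * kvm_msrs points to a user buffer holding a struct kvm_msrs header followed
+ * immediately by nmsrs struct kvm_msr_entry records (see KVM_GET_MSRS and
+ * KVM_SET_MSRS in kvm_ioctl()).
+ */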
+struct kvm_msrs_ioc {
+ struct kvm_msrs *kvm_msrs;
+ int kvm_cpu_index;
+ int kvm_kvmid;
+};
+
/* for KVM_GET_MSR_INDEX_LIST */
struct kvm_msr_list {
uint32_t nmsrs; /* number of msrs in entries */
@@ -1122,73 +809,10 @@ struct pvclock_wall_clock {
#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg))
#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1)
-
-struct kvm_mem_alias {
- gfn_t base_gfn;
- unsigned long npages;
- gfn_t target_gfn;
-#define KVM_ALIAS_INVALID 1UL
- unsigned long flags;
-};
-
-#define KVM_ARCH_HAS_UNALIAS_INSTANTIATION
-
-struct kvm_mem_aliases {
- struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
- int naliases;
-};
-
-struct kvm_xen_hvm_config {
- uint32_t flags;
- uint32_t msr;
- uint64_t blob_addr_32;
- uint64_t blob_addr_64;
- unsigned char blob_size_32;
- unsigned char blob_size_64;
- unsigned char pad2[30];
-};
-
-struct kvm_arch {
- struct kvm_mem_aliases *aliases;
-
- unsigned int n_free_mmu_pages;
- unsigned int n_requested_mmu_pages;
- unsigned int n_alloc_mmu_pages;
- list_t mmu_page_hash[KVM_NUM_MMU_PAGES];
- /*
- * Hash table of struct kvm_mmu_page.
- */
- list_t active_mmu_pages;
- list_t assigned_dev_head;
- struct iommu_domain *iommu_domain;
- int iommu_flags;
- struct kvm_pic *vpic;
- struct kvm_ioapic *vioapic;
- struct kvm_pit *vpit;
- int vapics_in_nmi_mode;
-
- unsigned int tss_addr;
- struct page *apic_access_page;
-
- gpa_t wall_clock;
-
- struct page *ept_identity_pagetable;
- char ept_identity_pagetable_done;
- gpa_t ept_identity_map_addr;
-
- unsigned long irq_sources_bitmap;
- uint64_t vm_init_tsc;
- int64_t kvmclock_offset;
-
- struct kvm_xen_hvm_config xen_hvm_config;
-
- /* fields used by HYPER-V emulation */
- uint64_t hv_guest_os_id;
- uint64_t hv_hypercall;
-};
-
#endif /*x86*/
+#ifdef _KERNEL
+
struct kvm {
kmutex_t mmu_lock;
kmutex_t requests_lock;
@@ -1217,10 +841,10 @@ struct kvm {
struct kstat kvm_kstat;
struct kvm_arch arch;
volatile int users_count;
-#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
+ /*#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET*/
struct kvm_coalesced_mmio_dev *coalesced_mmio_dev;
struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
-#endif
+ /*#endif*/
kmutex_t irq_lock;
#ifdef CONFIG_HAVE_KVM_IRQCHIP
@@ -1238,6 +862,7 @@ struct kvm {
#endif /*XXX*/
int kvmid; /* unique identifier for this kvm */
};
+#endif /*_KERNEL*/
#define KVM_EXIT_UNKNOWN 0
#define KVM_EXIT_EXCEPTION 1
@@ -1486,19 +1111,32 @@ static inline void native_load_tr_desc(void)
#define _IO(x, y) ((x<<8)|y) /* original is in /usr/include/sys/ioccom.h */
#define KVMIO 0xAE
+/* for KVM_SET_CPUID2/KVM_GET_CPUID2 */
+struct kvm_cpuid2_ioc {
+ struct cpuid_data *cpuid_data;
+ uint64_t kvm_vcpu_addr;
+ int kvm_cpu_index;
+};
+
+/* for KVM_RUN */
+struct kvm_run_ioc {
+ int kvm_kvmid;
+ int kvm_cpu_index;
+};
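+
+/*
+ * Illustrative userspace usage (hypothetical names, for documentation only):
+ * vcpus are addressed through the driver fd by kvm id and cpu index rather
+ * than by per-vcpu file descriptors, e.g.
+ *
+ *	struct kvm_run_ioc ioc;
+ *
+ *	ioc.kvm_kvmid = kvmid;		(id returned by KVM_CREATE_VM)
+ *	ioc.kvm_cpu_index = 0;
+ *	if (ioctl(kvm_fd, KVM_RUN, &ioc) != 0)
+ *		err(1, "KVM_RUN");
+ */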
+
/*
* ioctls for vcpu fds
*/
#define KVM_RUN _IO(KVMIO, 0x80)
-#define KVM_GET_REGS _IOR(KVMIO, 0x81, struct kvm_regs)
-#define KVM_SET_REGS _IOW(KVMIO, 0x82, struct kvm_regs)
-#define KVM_GET_SREGS _IOR(KVMIO, 0x83, struct kvm_sregs)
-#define KVM_SET_SREGS _IOW(KVMIO, 0x84, struct kvm_sregs)
+#define KVM_GET_REGS _IOR(KVMIO, 0x81, struct kvm_regs_ioc)
+#define KVM_SET_REGS _IOW(KVMIO, 0x82, struct kvm_regs_ioc)
+#define KVM_GET_SREGS _IOR(KVMIO, 0x83, struct kvm_sregs_ioc)
+#define KVM_SET_SREGS _IOW(KVMIO, 0x84, struct kvm_sregs_ioc)
#define KVM_INTERRUPT _IOW(KVMIO, 0x86, struct kvm_interrupt)
-#define KVM_GET_FPU _IOR(KVMIO, 0x8c, struct kvm_fpu)
-#define KVM_SET_FPU _IOW(KVMIO, 0x8d, struct kvm_fpu)
-#define KVM_GET_MSRS _IOWR(KVMIO, 0x88, struct kvm_msrs)
-#define KVM_SET_MSRS _IOW(KVMIO, 0x89, struct kvm_msrs)
+#define KVM_GET_FPU _IOR(KVMIO, 0x8c, struct kvm_fpu_ioc)
+#define KVM_SET_FPU _IOW(KVMIO, 0x8d, struct kvm_fpu_ioc)
+#define KVM_GET_MSRS _IOWR(KVMIO, 0x88, struct kvm_msrs_ioc)
+#define KVM_SET_MSRS _IOW(KVMIO, 0x89, struct kvm_msrs_ioc)
#define KVM_GET_MP_STATE _IOR(KVMIO, 0x98, struct kvm_mp_state)
#define KVM_SET_MP_STATE _IOW(KVMIO, 0x99, struct kvm_mp_state)
/* Available with KVM_CAP_VCPU_EVENTS */
@@ -1554,6 +1192,20 @@ struct vmcs_config {
uint32_t vmexit_ctrl;
uint32_t vmentry_ctrl;
};
+
+#define RMAP_EXT 4
+
+struct kvm_rmap_desc {
+ uint64_t *sptes[RMAP_EXT];
+ struct kvm_rmap_desc *more;
+};
+
+
+static struct vmx_capability {
+ uint32_t ept;
+ uint32_t vpid;
+} vmx_capability;
+
struct vmcs {
uint32_t revision_id;
uint32_t abort;
@@ -1576,13 +1228,6 @@ struct kvm_dirty_log {
}v;
};
-/* for KVM_REGISTER_COALESCED_MMIO / KVM_UNREGISTER_COALESCED_MMIO */
-
-struct kvm_coalesced_mmio_zone {
- uint64_t addr;
- uint32_t size;
- uint32_t pad;
-};
struct kvm_coalesced_mmio {
uint64_t phys_addr;
@@ -1613,8 +1258,8 @@ struct kvm_mp_state {
uint32_t mp_state;
};
-#define KVM_SET_CPUID2 _IOW(KVMIO, 0x90, struct kvm_cpuid2)
-#define KVM_GET_CPUID2 _IOWR(KVMIO, 0x91, struct kvm_cpuid2)
+#define KVM_SET_CPUID2 _IOW(KVMIO, 0x90, struct kvm_cpuid2_ioc)
+#define KVM_GET_CPUID2 _IOWR(KVMIO, 0x91, struct kvm_cpuid2_ioc)
/* for kvm_memory_region::flags */
#define KVM_MEM_LOG_DIRTY_PAGES 1UL
@@ -1668,11 +1313,6 @@ struct kvm_vcpu_ioc {
};
-/* for KVM_SET_CPUID2/KVM_GET_CPUID2 */
-struct kvm_cpuid2_ioc {
- struct cpuid_data *cpuid_data;
- uint64_t kvm_vcpu_addr;
-};
/* LDT or TSS descriptor in the GDT. 16 bytes. */
struct ldttss_desc64 {
@@ -1684,6 +1324,13 @@ struct ldttss_desc64 {
uint32_t zero1;
} __attribute__((packed));
+struct shared_msr_entry {
+ unsigned index;
+ uint64_t data;
+ uint64_t mask;
+};
+
+#ifdef _KERNEL
struct vcpu_vmx {
struct kvm_vcpu vcpu;
list_t local_vcpus_link;
@@ -1732,8 +1379,6 @@ struct vcpu_vmx {
char rdtscp_enabled;
};
-#ifdef _KERNEL
-
/*
* vcpu->requests bit members
*/
@@ -1826,87 +1471,5 @@ static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memsl
#define INVALID_PAGE (~(hpa_t)0)
-struct kvm_x86_ops {
- int (*cpu_has_kvm_support)(void); /* __init */
- int (*disabled_by_bios)(void); /* __init */
- int (*hardware_enable)(void *dummy);
- void (*hardware_disable)(void *dummy);
- void (*check_processor_compatibility)(void *rtn);
- int (*hardware_setup)(void); /* __init */
- void (*hardware_unsetup)(void); /* __exit */
- int (*cpu_has_accelerated_tpr)(void);
- void (*cpuid_update)(struct kvm_vcpu *vcpu);
-
- /* Create, but do not attach this VCPU */
- struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
- void (*vcpu_free)(struct kvm_vcpu *vcpu);
- int (*vcpu_reset)(struct kvm_vcpu *vcpu);
-
- void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
- void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
- void (*vcpu_put)(struct kvm_vcpu *vcpu);
- void (*set_guest_debug)(struct kvm_vcpu *vcpu,
-#ifdef XXX
- struct kvm_guest_debug *dbg);
-#else
- void *dbg);
-#endif
-
- int (*get_msr)(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t *pdata);
- int (*set_msr)(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t data);
- uint64_t (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
- void (*get_segment)(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg);
- int (*get_cpl)(struct kvm_vcpu *vcpu);
- void (*set_segment)(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg);
- void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
- void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu);
- void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
- void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
- void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
- void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
- void (*set_efer)(struct kvm_vcpu *vcpu, uint64_t efer);
- void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
- void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
- void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
- void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
- int (*get_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long *dest);
- int (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value);
- void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
- unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
- void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
- void (*fpu_activate)(struct kvm_vcpu *vcpu);
- void (*fpu_deactivate)(struct kvm_vcpu *vcpu);
-
- void (*tlb_flush)(struct kvm_vcpu *vcpu);
-
- void (*run)(struct kvm_vcpu *vcpu);
- int (*handle_exit)(struct kvm_vcpu *vcpu);
- void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
- void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
- uint32_t (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
- void (*patch_hypercall)(struct kvm_vcpu *vcpu,
- unsigned char *hypercall_addr);
- void (*set_irq)(struct kvm_vcpu *vcpu);
- void (*set_nmi)(struct kvm_vcpu *vcpu);
- void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
- int has_error_code, uint32_t error_code);
- int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
- int (*nmi_allowed)(struct kvm_vcpu *vcpu);
- int (*get_nmi_mask)(struct kvm_vcpu *vcpu);
- void (*set_nmi_mask)(struct kvm_vcpu *vcpu, int masked);
- void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
- void (*enable_irq_window)(struct kvm_vcpu *vcpu);
- void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
- int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
- int (*get_tdp_level)(void);
- uint64_t (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, int is_mmio);
- int (*get_lpage_level)(void);
- int (*rdtscp_supported)(void);
-
- const struct trace_print_flags *exit_reasons_str;
-};
-
#endif
diff --git a/kvm_host.h b/kvm_host.h
index 33b2f2f..0d8737a 100644
--- a/kvm_host.h
+++ b/kvm_host.h
@@ -1,849 +1,570 @@
+#ifndef __KVM_HOST_H
+#define __KVM_HOST_H
+
/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * This header defines architecture specific interfaces, x86 version
- *
* This work is licensed under the terms of the GNU GPL, version 2. See
* the COPYING file in the top-level directory.
- *
*/
-#ifndef _ASM_X86_KVM_HOST_H
-#define _ASM_X86_KVM_HOST_H
-
#ifdef XXX
#include <linux/types.h>
+#include <linux/hardirq.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
#include <linux/mm.h>
-#include <linux/mmu_notifier.h>
-#include <linux/tracepoint.h>
+#include <linux/preempt.h>
+#include <linux/msi.h>
+#include <asm/signal.h>
#include <linux/kvm.h>
#include <linux/kvm_para.h>
-#include <linux/kvm_types.h>
-
-#include <asm/pvclock-abi.h>
-#include <asm/desc.h>
-#include <asm/mtrr.h>
-#include <asm/msr-index.h>
-
-#endif
-
-#define KVM_PIO_PAGE_OFFSET 1
-#define KVM_COALESCED_MMIO_PAGE_OFFSET 2
-
-#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
-#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
-#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \
- 0xFFFFFF0000000000ULL)
-
-#define INVALID_PAGE (~(hpa_t)0)
-#define UNMAPPED_GVA (~(gpa_t)0)
-
-/* KVM Hugepage definitions for x86 */
-#define KVM_NR_PAGE_SIZES 3
-#define KVM_HPAGE_SHIFT(x) (PAGESHIFT + (((x) - 1) * 9))
-#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x))
-#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1))
-#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGESIZE)
-
-#define DE_VECTOR 0
-#define DB_VECTOR 1
-#define BP_VECTOR 3
-#define OF_VECTOR 4
-#define BR_VECTOR 5
-#define UD_VECTOR 6
-#define NM_VECTOR 7
-#define DF_VECTOR 8
-#define TS_VECTOR 10
-#define NP_VECTOR 11
-#define SS_VECTOR 12
-#define GP_VECTOR 13
-#define PF_VECTOR 14
-#define MF_VECTOR 16
-#define MC_VECTOR 18
-
-#define SELECTOR_TI_MASK (1 << 2)
-#define SELECTOR_RPL_MASK 0x03
-
-#define IOPL_SHIFT 12
-
-#define KVM_ALIAS_SLOTS 4
-
-#define KVM_PERMILLE_MMU_PAGES 20
-#define KVM_MIN_ALLOC_MMU_PAGES 64
-#define KVM_MMU_HASH_SHIFT 10
-#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT)
-#define KVM_MIN_FREE_MMU_PAGES 5
-#define KVM_REFILL_PAGES 25
-#define KVM_MAX_CPUID_ENTRIES 40
-#define KVM_NR_FIXED_MTRR_REGION 88
-#define KVM_NR_VAR_MTRR 8
-
-extern kmutex_t kvm_lock;
-extern list_t vm_list;
-
-struct kvm_vcpu;
-struct kvm;
-
-enum {
- VCPU_SREG_ES,
- VCPU_SREG_CS,
- VCPU_SREG_SS,
- VCPU_SREG_DS,
- VCPU_SREG_FS,
- VCPU_SREG_GS,
- VCPU_SREG_TR,
- VCPU_SREG_LDTR,
-};
-
-#ifdef XXX
-#include <asm/kvm_emulate.h>
#endif /*XXX*/
-#define KVM_NR_MEM_OBJS 40
+#include "kvm_types.h"
-#define KVM_NR_DB_REGS 4
+#define KVM_MEMORY_SLOTS 32 /* XXX assumes x86 */
+#define KVM_PRIVATE_MEM_SLOTS 4 /* XXX assumes x86 */
+#define TSS_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 0)
+#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 1)
+#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 2)
-#define DR6_BD (1 << 13)
-#define DR6_BS (1 << 14)
-#define DR6_FIXED_1 0xffff0ff0
-#define DR6_VOLATILE 0x0000e00f
+#include "kvm_x86host.h"
-#define DR7_BP_EN_MASK 0x000000ff
-#define DR7_GE (1 << 9)
-#define DR7_GD (1 << 13)
-#define DR7_FIXED_1 0x00000400
-#define DR7_VOLATILE 0xffff23ff
-
-#ifdef XXX
/*
- * We don't want allocation failures within the mmu code, so we preallocate
- * enough memory for a single page fault in a cache.
+ * vcpu->requests bit members
*/
-struct kvm_mmu_memory_cache {
- int nobjs;
- void *objects[KVM_NR_MEM_OBJS];
-};
-
-#define NR_PTE_CHAIN_ENTRIES 5
+#define KVM_REQ_TLB_FLUSH 0
+#define KVM_REQ_MIGRATE_TIMER 1
+#define KVM_REQ_REPORT_TPR_ACCESS 2
+#define KVM_REQ_MMU_RELOAD 3
+#define KVM_REQ_TRIPLE_FAULT 4
+#define KVM_REQ_PENDING_TIMER 5
+#define KVM_REQ_UNHALT 6
+#define KVM_REQ_MMU_SYNC 7
+#define KVM_REQ_KVMCLOCK_UPDATE 8
+#define KVM_REQ_KICK 9
+#define KVM_REQ_DEACTIVATE_FPU 10
+
+#define KVM_USERSPACE_IRQ_SOURCE_ID 0
-struct kvm_pte_chain {
- u64 *parent_ptes[NR_PTE_CHAIN_ENTRIES];
- struct hlist_node link;
-};
+struct kvm;
+struct kvm_vcpu;
+extern struct kmem_cache *kvm_vcpu_cache;
/*
- * kvm_mmu_page_role, below, is defined as:
- *
- * bits 0:3 - total guest paging levels (2-4, or zero for real mode)
- * bits 4:7 - page table level for this shadow (1-4)
- * bits 8:9 - page table quadrant for 2-level guests
- * bit 16 - direct mapping of virtual to physical mapping at gfn
- * used for real mode and two-dimensional paging
- * bits 17:19 - common access permissions for all ptes in this shadow page
+ * It would be nice to use something smarter than a linear search, TBD...
+ * Thankfully we don't expect many devices to register (famous last words :),
+ * so until then it will suffice.  At least it's abstracted so we can change
+ * it in one place.
*/
-union kvm_mmu_page_role {
- unsigned word;
- struct {
- unsigned glevels:4;
- unsigned level:4;
- unsigned quadrant:2;
- unsigned pad_for_nice_hex_output:6;
- unsigned direct:1;
- unsigned access:3;
- unsigned invalid:1;
- unsigned cr4_pge:1;
- unsigned nxe:1;
- };
-};
-
-struct kvm_mmu_page {
- struct list_head link;
- struct hlist_node hash_link;
-
- struct list_head oos_link;
-
- /*
- * The following two entries are used to key the shadow page in the
- * hash table.
- */
- gfn_t gfn;
- union kvm_mmu_page_role role;
-
- u64 *spt;
- /* hold the gfn of each spte inside spt */
- gfn_t *gfns;
- /*
- * One bit set per slot which has memory
- * in this shadow page.
- */
- DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
- int multimapped; /* More than one parent_pte? */
- int root_count; /* Currently serving as active root */
- bool unsync;
- unsigned int unsync_children;
- union {
- u64 *parent_pte; /* !multimapped */
- struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
- };
- DECLARE_BITMAP(unsync_child_bitmap, 512);
+struct kvm_io_bus {
+ int dev_count;
+#define NR_IOBUS_DEVS 200
+ struct kvm_io_device *devs[NR_IOBUS_DEVS];
};
-struct kvm_pv_mmu_op_buffer {
- void *ptr;
- unsigned len;
- unsigned processed;
- char buf[512] __aligned(sizeof(long));
+enum kvm_bus {
+ KVM_MMIO_BUS,
+ KVM_PIO_BUS,
+ KVM_NR_BUSES
};
-struct kvm_pio_request {
- unsigned long count;
- int cur_count;
- gva_t guest_gva;
- int in;
- int port;
- int size;
- int string;
- int down;
- int rep;
-};
-
-/*
- * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
- * 32-bit). The kvm_mmu structure abstracts the details of the current mmu
- * mode.
- */
-struct kvm_mmu {
- void (*new_cr3)(struct kvm_vcpu *vcpu);
- int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
- void (*free)(struct kvm_vcpu *vcpu);
- gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
- u32 *error);
- void (*prefetch_page)(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *page);
- int (*sync_page)(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp);
- void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
- hpa_t root_hpa;
- int root_level;
- int shadow_root_level;
- union kvm_mmu_page_role base_role;
-
- u64 *pae_root;
- u64 rsvd_bits_mask[2][4];
-};
-
-struct kvm_vcpu_arch {
- u64 host_tsc;
- /*
- * rip and regs accesses must go through
- * kvm_{register,rip}_{read,write} functions.
- */
- unsigned long regs[NR_VCPU_REGS];
- u32 regs_avail;
- u32 regs_dirty;
-
- unsigned long cr0;
- unsigned long cr0_guest_owned_bits;
- unsigned long cr2;
- unsigned long cr3;
- unsigned long cr4;
- unsigned long cr4_guest_owned_bits;
- unsigned long cr8;
- u32 hflags;
- u64 pdptrs[4]; /* pae */
- u64 efer;
- u64 apic_base;
- struct kvm_lapic *apic; /* kernel irqchip context */
- int32_t apic_arb_prio;
- int mp_state;
- int sipi_vector;
- u64 ia32_misc_enable_msr;
- bool tpr_access_reporting;
-
- struct kvm_mmu mmu;
- /* only needed in kvm_pv_mmu_op() path, but it's hot so
- * put it here to avoid allocation */
- struct kvm_pv_mmu_op_buffer mmu_op_buffer;
-
- struct kvm_mmu_memory_cache mmu_pte_chain_cache;
- struct kvm_mmu_memory_cache mmu_rmap_desc_cache;
- struct kvm_mmu_memory_cache mmu_page_cache;
- struct kvm_mmu_memory_cache mmu_page_header_cache;
-
- gfn_t last_pt_write_gfn;
- int last_pt_write_count;
- u64 *last_pte_updated;
- gfn_t last_pte_gfn;
+int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+ int len, const void *val);
+int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len,
+ void *val);
+int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+ struct kvm_io_device *dev);
+int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+ struct kvm_io_device *dev);
+#ifdef XXX
+struct kvm_vcpu {
+ struct kvm *kvm;
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+ struct preempt_notifier preempt_notifier;
+#endif
+ int vcpu_id;
+ struct mutex mutex;
+ int cpu;
+ struct kvm_run *run;
+ unsigned long requests;
+ unsigned long guest_debug;
+ int srcu_idx;
+
+ int fpu_active;
+ int guest_fpu_loaded;
+ wait_queue_head_t wq;
+ int sigset_active;
+ sigset_t sigset;
+ struct kvm_vcpu_stat stat;
+
+#ifdef CONFIG_HAS_IOMEM
+ int mmio_needed;
+ int mmio_read_completed;
+ int mmio_is_write;
+ int mmio_size;
+ unsigned char mmio_data[8];
+ gpa_t mmio_phys_addr;
+#endif
- struct {
- gfn_t gfn; /* presumed gfn during guest pte update */
- pfn_t pfn; /* pfn corresponding to that gfn */
- unsigned long mmu_seq;
- } update_pte;
-
- struct i387_fxsave_struct host_fx_image;
- struct i387_fxsave_struct guest_fx_image;
-
- gva_t mmio_fault_cr2;
- struct kvm_pio_request pio;
- void *pio_data;
-
- u8 event_exit_inst_len;
-
- struct kvm_queued_exception {
- bool pending;
- bool has_error_code;
- u8 nr;
- u32 error_code;
- } exception;
-
- struct kvm_queued_interrupt {
- bool pending;
- bool soft;
- u8 nr;
- } interrupt;
-
- int halt_request; /* real mode on Intel only */
-
- int cpuid_nent;
- struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES];
- /* emulate context */
-
- struct x86_emulate_ctxt emulate_ctxt;
-
- gpa_t time;
- struct pvclock_vcpu_time_info hv_clock;
- unsigned int hv_clock_tsc_khz;
- unsigned int time_offset;
- struct page *time_page;
-
- bool nmi_pending;
- bool nmi_injected;
-
- struct mtrr_state_type mtrr_state;
- u32 pat;
-
- int switch_db_regs;
- unsigned long db[KVM_NR_DB_REGS];
- unsigned long dr6;
- unsigned long dr7;
- unsigned long eff_db[KVM_NR_DB_REGS];
-
- u64 mcg_cap;
- u64 mcg_status;
- u64 mcg_ctl;
- u64 *mce_banks;
-
- /* used for guest single stepping over the given code position */
- u16 singlestep_cs;
- unsigned long singlestep_rip;
- /* fields used by HYPER-V emulation */
- u64 hv_vapic;
+ struct kvm_vcpu_arch arch;
};
-struct kvm_mem_alias {
+struct kvm_memory_slot {
gfn_t base_gfn;
unsigned long npages;
- gfn_t target_gfn;
-#define KVM_ALIAS_INVALID 1UL
unsigned long flags;
+ unsigned long *rmap;
+ unsigned long *dirty_bitmap;
+ struct {
+ unsigned long rmap_pde;
+ int write_count;
+ } *lpage_info[KVM_NR_PAGE_SIZES - 1];
+ unsigned long userspace_addr;
+ int user_alloc;
};
-#define KVM_ARCH_HAS_UNALIAS_INSTANTIATION
+static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memslot)
+{
+ return ALIGN(memslot->npages, BITS_PER_LONG) / 8;
+}
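A quick sanity check of the sizing above (not part of this commit): the dirty bitmap carries one bit per page, rounded up to a whole number of longs, so a slot of, say, 100 pages yields ALIGN(100, 64) / 8 = 16 bytes on a 64-bit build.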
-struct kvm_mem_aliases {
- struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
- int naliases;
+struct kvm_kernel_irq_routing_entry {
+ uint32_t gsi;
+ uint32_t type;
+ int (*set)(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level);
+ union {
+ struct {
+ unsigned irqchip;
+ unsigned pin;
+ } irqchip;
+ struct msi_msg msi;
+ };
+ struct hlist_node link;
};
-struct kvm_arch {
- struct kvm_mem_aliases *aliases;
+#ifdef __KVM_HAVE_IOAPIC
- unsigned int n_free_mmu_pages;
- unsigned int n_requested_mmu_pages;
- unsigned int n_alloc_mmu_pages;
- struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
+struct kvm_irq_routing_table {
+ int chip[KVM_NR_IRQCHIPS][KVM_IOAPIC_NUM_PINS];
+ struct kvm_kernel_irq_routing_entry *rt_entries;
+ uint32_t nr_rt_entries;
/*
- * Hash table of struct kvm_mmu_page.
+ * Array indexed by gsi. Each entry contains list of irq chips
+ * the gsi is connected to.
*/
- struct list_head active_mmu_pages;
- struct list_head assigned_dev_head;
- struct iommu_domain *iommu_domain;
- int iommu_flags;
- struct kvm_pic *vpic;
- struct kvm_ioapic *vioapic;
- struct kvm_pit *vpit;
- int vapics_in_nmi_mode;
-
- unsigned int tss_addr;
- struct page *apic_access_page;
-
- gpa_t wall_clock;
+ struct hlist_head map[0];
+};
- struct page *ept_identity_pagetable;
- bool ept_identity_pagetable_done;
- gpa_t ept_identity_map_addr;
+#else
- unsigned long irq_sources_bitmap;
- u64 vm_init_tsc;
- s64 kvmclock_offset;
+struct kvm_irq_routing_table {};
- struct kvm_xen_hvm_config xen_hvm_config;
+#endif
- /* fields used by HYPER-V emulation */
- u64 hv_guest_os_id;
- u64 hv_hypercall;
-};
+static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i)
+{
+ smp_rmb();
+ return kvm->vcpus[i];
+}
-struct kvm_vm_stat {
- u32 mmu_shadow_zapped;
- u32 mmu_pte_write;
- u32 mmu_pte_updated;
- u32 mmu_pde_zapped;
- u32 mmu_flooded;
- u32 mmu_recycled;
- u32 mmu_cache_miss;
- u32 mmu_unsync;
- u32 remote_tlb_flush;
- u32 lpages;
+#define kvm_for_each_vcpu(idx, vcpup, kvm) \
+ for (idx = 0, vcpup = kvm_get_vcpu(kvm, idx); \
+ idx < atomic_read(&kvm->online_vcpus) && vcpup; \
+ vcpup = kvm_get_vcpu(kvm, ++idx))
+
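For illustration only (a sketch, not part of this diff), a typical caller walks every online vcpu with the macro above, using kvm_vcpu_kick() which is declared further down in this header:

	int i;
	struct kvm_vcpu *vcpu;

	kvm_for_each_vcpu(i, vcpu, kvm)
		kvm_vcpu_kick(vcpu);	/* nudge each online vcpu out of guest mode */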
+int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
+void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
+
+void vcpu_load(struct kvm_vcpu *vcpu);
+void vcpu_put(struct kvm_vcpu *vcpu);
+
+int kvm_init(void *opaque, unsigned int vcpu_size,
+ struct module *module);
+void kvm_exit(void);
+
+void kvm_get_kvm(struct kvm *kvm);
+void kvm_put_kvm(struct kvm *kvm);
+
+#define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
+#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
+static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
+struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva);
+
+extern struct page *bad_page;
+extern pfn_t bad_pfn;
+
+int is_error_page(struct page *page);
+int is_error_pfn(pfn_t pfn);
+int kvm_is_error_hva(unsigned long addr);
+int kvm_set_memory_region(struct kvm *kvm,
+ struct kvm_userspace_memory_region *mem,
+ int user_alloc);
+int __kvm_set_memory_region(struct kvm *kvm,
+ struct kvm_userspace_memory_region *mem,
+ int user_alloc);
+int kvm_arch_prepare_memory_region(struct kvm *kvm,
+ struct kvm_memory_slot *memslot,
+ struct kvm_memory_slot old,
+ struct kvm_userspace_memory_region *mem,
+ int user_alloc);
+void kvm_arch_commit_memory_region(struct kvm *kvm,
+ struct kvm_userspace_memory_region *mem,
+ struct kvm_memory_slot old,
+ int user_alloc);
+void kvm_disable_largepages(void);
+void kvm_arch_flush_shadow(struct kvm *kvm);
+gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn);
+gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn);
+
+struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
+unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn);
+void kvm_release_page_clean(struct page *page);
+void kvm_release_page_dirty(struct page *page);
+void kvm_set_page_dirty(struct page *page);
+void kvm_set_page_accessed(struct page *page);
+
+pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
+pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot, gfn_t gfn);
+int memslot_id(struct kvm *kvm, gfn_t gfn);
+void kvm_release_pfn_dirty(pfn_t);
+void kvm_release_pfn_clean(pfn_t pfn);
+void kvm_set_pfn_dirty(pfn_t pfn);
+void kvm_set_pfn_accessed(pfn_t pfn);
+void kvm_get_pfn(pfn_t pfn);
+
+int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
+ int len);
+int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
+ unsigned long len);
+int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len);
+int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
+ int offset, int len);
+int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
+ unsigned long len);
+int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len);
+int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len);
+struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
+int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
+unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn);
+void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
+
+void kvm_vcpu_block(struct kvm_vcpu *vcpu);
+void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu);
+void kvm_resched(struct kvm_vcpu *vcpu);
+void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
+void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
+void kvm_flush_remote_tlbs(struct kvm *kvm);
+void kvm_reload_remote_mmus(struct kvm *kvm);
+
+long kvm_arch_dev_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg);
+long kvm_arch_vcpu_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg);
+
+int kvm_dev_ioctl_check_extension(long ext);
+
+int kvm_get_dirty_log(struct kvm *kvm,
+ struct kvm_dirty_log *log, int *is_dirty);
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
+ struct kvm_dirty_log *log);
+
+int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
+			struct kvm_userspace_memory_region *mem,
+ int user_alloc);
+long kvm_arch_vm_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg);
+
+int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu);
+int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu);
+
+int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
+ struct kvm_translation *tr);
+
+int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs);
+int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs);
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs);
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs);
+int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
+ struct kvm_mp_state *mp_state);
+int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
+ struct kvm_mp_state *mp_state);
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+ struct kvm_guest_debug *dbg);
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run);
+
+int kvm_arch_init(void *opaque);
+void kvm_arch_exit(void);
+
+int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu);
+void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu);
+
+void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu);
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu);
+struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id);
+int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu);
+void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu);
+
+int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu);
+int kvm_arch_hardware_enable(void *garbage);
+void kvm_arch_hardware_disable(void *garbage);
+int kvm_arch_hardware_setup(void);
+void kvm_arch_hardware_unsetup(void);
+void kvm_arch_check_processor_compat(void *rtn);
+int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
+
+void kvm_free_physmem(struct kvm *kvm);
+
+struct kvm *kvm_arch_create_vm(void);
+void kvm_arch_destroy_vm(struct kvm *kvm);
+void kvm_free_all_assigned_devices(struct kvm *kvm);
+void kvm_arch_sync_events(struct kvm *kvm);
+
+int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
+void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
+
+int kvm_is_mmio_pfn(pfn_t pfn);
+
+struct kvm_irq_ack_notifier {
+ struct hlist_node link;
+ unsigned gsi;
+ void (*irq_acked)(struct kvm_irq_ack_notifier *kian);
};
-struct kvm_vcpu_stat {
- u32 pf_fixed;
- u32 pf_guest;
- u32 tlb_flush;
- u32 invlpg;
-
- u32 exits;
- u32 io_exits;
- u32 mmio_exits;
- u32 signal_exits;
- u32 irq_window_exits;
- u32 nmi_window_exits;
- u32 halt_exits;
- u32 halt_wakeup;
- u32 request_irq_exits;
- u32 irq_exits;
- u32 host_state_reload;
- u32 efer_reload;
- u32 fpu_reload;
- u32 insn_emulation;
- u32 insn_emulation_fail;
- u32 hypercalls;
- u32 irq_injections;
- u32 nmi_injections;
+#define KVM_ASSIGNED_MSIX_PENDING 0x1
+struct kvm_guest_msix_entry {
+ u32 vector;
+ u16 entry;
+ u16 flags;
};
-
-struct kvm_x86_ops {
- int (*cpu_has_kvm_support)(void); /* __init */
- int (*disabled_by_bios)(void); /* __init */
- int (*hardware_enable)(void *dummy);
- void (*hardware_disable)(void *dummy);
- void (*check_processor_compatibility)(void *rtn);
- int (*hardware_setup)(void); /* __init */
- void (*hardware_unsetup)(void); /* __exit */
- bool (*cpu_has_accelerated_tpr)(void);
- void (*cpuid_update)(struct kvm_vcpu *vcpu);
-
- /* Create, but do not attach this VCPU */
- struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
- void (*vcpu_free)(struct kvm_vcpu *vcpu);
- int (*vcpu_reset)(struct kvm_vcpu *vcpu);
-
- void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
- void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
- void (*vcpu_put)(struct kvm_vcpu *vcpu);
-
- void (*set_guest_debug)(struct kvm_vcpu *vcpu,
- struct kvm_guest_debug *dbg);
- int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
- int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
- u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
- void (*get_segment)(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg);
- int (*get_cpl)(struct kvm_vcpu *vcpu);
- void (*set_segment)(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg);
- void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
- void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu);
- void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
- void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
- void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
- void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
- void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
- void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
- void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
- void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
- void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
- int (*get_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long *dest);
- int (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value);
- void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
- unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
- void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
- void (*fpu_activate)(struct kvm_vcpu *vcpu);
- void (*fpu_deactivate)(struct kvm_vcpu *vcpu);
-
- void (*tlb_flush)(struct kvm_vcpu *vcpu);
-
- void (*run)(struct kvm_vcpu *vcpu);
- int (*handle_exit)(struct kvm_vcpu *vcpu);
- void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
- void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
- u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
- void (*patch_hypercall)(struct kvm_vcpu *vcpu,
- unsigned char *hypercall_addr);
- void (*set_irq)(struct kvm_vcpu *vcpu);
- void (*set_nmi)(struct kvm_vcpu *vcpu);
- void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
- bool has_error_code, u32 error_code);
- int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
- int (*nmi_allowed)(struct kvm_vcpu *vcpu);
- bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
- void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked);
- void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
- void (*enable_irq_window)(struct kvm_vcpu *vcpu);
- void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
- int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
- int (*get_tdp_level)(void);
- u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
- int (*get_lpage_level)(void);
- bool (*rdtscp_supported)(void);
-
- const struct trace_print_flags *exit_reasons_str;
+struct kvm_assigned_dev_kernel {
+ struct kvm_irq_ack_notifier ack_notifier;
+ struct work_struct interrupt_work;
+ struct list_head list;
+ int assigned_dev_id;
+ int host_segnr;
+ int host_busnr;
+ int host_devfn;
+ unsigned int entries_nr;
+ int host_irq;
+ bool host_irq_disabled;
+ struct msix_entry *host_msix_entries;
+ int guest_irq;
+ struct kvm_guest_msix_entry *guest_msix_entries;
+ unsigned long irq_requested_type;
+ int irq_source_id;
+ int flags;
+ struct pci_dev *dev;
+ struct kvm *kvm;
+ spinlock_t assigned_dev_lock;
};
-extern struct kvm_x86_ops *kvm_x86_ops;
-
-int kvm_mmu_module_init(void);
-void kvm_mmu_module_exit(void);
-
-void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
-int kvm_mmu_create(struct kvm_vcpu *vcpu);
-int kvm_mmu_setup(struct kvm_vcpu *vcpu);
-void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
-void kvm_mmu_set_base_ptes(u64 base_pte);
-void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
- u64 dirty_mask, u64 nx_mask, u64 x_mask);
-
-int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
-void kvm_mmu_zap_all(struct kvm *kvm);
-unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
-void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
-
-int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
-
-int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
- const void *val, int bytes);
-int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
- gpa_t addr, unsigned long *ret);
-u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
-
-extern bool tdp_enabled;
-
-enum emulation_result {
- EMULATE_DONE, /* no further processing */
- EMULATE_DO_MMIO, /* kvm_run filled with mmio request */
- EMULATE_FAIL, /* can't emulate this instruction */
+struct kvm_irq_mask_notifier {
+ void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked);
+ int irq;
+ struct hlist_node link;
};
-#define EMULTYPE_NO_DECODE (1 << 0)
-#define EMULTYPE_TRAP_UD (1 << 1)
-#define EMULTYPE_SKIP (1 << 2)
-int emulate_instruction(struct kvm_vcpu *vcpu,
- unsigned long cr2, u16 error_code, int emulation_type);
-void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
-void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
-void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
-void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
- unsigned long *rflags);
-
-unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr);
-void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value,
- unsigned long *rflags);
-void kvm_enable_efer_bits(u64);
-int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
-int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
-
-struct x86_emulate_ctxt;
-
-int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in,
- int size, unsigned port);
-int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
- int size, unsigned long count, int down,
- gva_t address, int rep, unsigned port);
-void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
-int kvm_emulate_halt(struct kvm_vcpu *vcpu);
-int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
-int emulate_clts(struct kvm_vcpu *vcpu);
-int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
- unsigned long *dest);
-int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
- unsigned long value);
-
-void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
-int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
-
-int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason);
-
-void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
-void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
-void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
-void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
-unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
-void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
-void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
-
-int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
-int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
-
-unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
-void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
-
-#ifdef XXX
-void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
-void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
-#endif /*XXX*/
-void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
- u32 error_code);
-bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
-
-int kvm_pic_set_irq(void *opaque, int irq, int level);
-
-void kvm_inject_nmi(struct kvm_vcpu *vcpu);
-
-void fx_init(struct kvm_vcpu *vcpu);
-
-int emulator_write_emulated(unsigned long addr,
- const void *val,
- unsigned int bytes,
- struct kvm_vcpu *vcpu);
-
-
-void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
-void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
- const u8 *new, int bytes,
- bool guest_initiated);
-int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
-void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
-int kvm_mmu_load(struct kvm_vcpu *vcpu);
-void kvm_mmu_unload(struct kvm_vcpu *vcpu);
-void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
-gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
-gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
-gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
-gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
-
-int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
-
-int kvm_fix_hypercall(struct kvm_vcpu *vcpu);
-
-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code);
-void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
-
-void kvm_enable_tdp(void);
-void kvm_disable_tdp(void);
-
-int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
-int complete_pio(struct kvm_vcpu *vcpu);
-bool kvm_check_iopl(struct kvm_vcpu *vcpu);
-
-struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn);
+void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn);
+void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn);
+void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask);
-static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
+#ifdef __KVM_HAVE_IOAPIC
+void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
+ union kvm_ioapic_redirect_entry *entry,
+ unsigned long *deliver_bitmask);
+#endif
+int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level);
+void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
+void kvm_register_irq_ack_notifier(struct kvm *kvm,
+ struct kvm_irq_ack_notifier *kian);
+void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
+ struct kvm_irq_ack_notifier *kian);
+int kvm_request_irq_source_id(struct kvm *kvm);
+void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
+
+/* For vcpu->arch.iommu_flags */
+#define KVM_IOMMU_CACHE_COHERENCY 0x1
+
+#ifdef CONFIG_IOMMU_API
+int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot);
+int kvm_iommu_map_guest(struct kvm *kvm);
+int kvm_iommu_unmap_guest(struct kvm *kvm);
+int kvm_assign_device(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *assigned_dev);
+int kvm_deassign_device(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *assigned_dev);
+#else /* CONFIG_IOMMU_API */
+static inline int kvm_iommu_map_pages(struct kvm *kvm,
+ gfn_t base_gfn,
+ unsigned long npages)
{
- struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
-
- return (struct kvm_mmu_page *)page_private(page);
+ return 0;
}
-#endif /*XXX*/
-
-static inline unsigned short kvm_read_fs(void)
+static inline int kvm_iommu_map_guest(struct kvm *kvm)
{
- unsigned short seg;
- asm("mov %%fs, %0" : "=g"(seg));
- return seg;
+ return -ENODEV;
}
-static inline unsigned short kvm_read_gs(void)
+static inline int kvm_iommu_unmap_guest(struct kvm *kvm)
{
- unsigned short seg;
- asm("mov %%gs, %0" : "=g"(seg));
- return seg;
+ return 0;
}
-static inline unsigned short kvm_read_ldt(void)
+static inline int kvm_assign_device(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *assigned_dev)
{
- unsigned short ldt;
- asm("sldt %0" : "=g"(ldt));
- return ldt;
+ return 0;
}
-static inline void kvm_load_fs(unsigned short sel)
+static inline int kvm_deassign_device(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *assigned_dev)
{
- asm("mov %0, %%fs" : : "rm"(sel));
+ return 0;
}
+#endif /* CONFIG_IOMMU_API */
-static inline void kvm_load_gs(unsigned short sel)
+static inline void kvm_guest_enter(void)
{
- asm("mov %0, %%gs" : : "rm"(sel));
+ account_system_vtime(current);
+ current->flags |= PF_VCPU;
}
-static inline void kvm_load_ldt(unsigned short sel)
+static inline void kvm_guest_exit(void)
{
- asm("lldt %0" : : "rm"(sel));
+ account_system_vtime(current);
+ current->flags &= ~PF_VCPU;
}
-struct descriptor_table {
- unsigned short limit;
- unsigned long base;
-} __attribute__((packed));
-
-static inline void kvm_get_idt(struct descriptor_table *table)
+static inline gpa_t gfn_to_gpa(gfn_t gfn)
{
- asm("sidt %0" : "=m"(*table));
+ return (gpa_t)gfn << PAGE_SHIFT;
}
-static inline void kvm_get_gdt(struct descriptor_table *table)
+static inline hpa_t pfn_to_hpa(pfn_t pfn)
{
- asm("sgdt %0" : "=m"(*table));
+ return (hpa_t)pfn << PAGE_SHIFT;
}
-/*
- * FIXME: Accessing the desc_struct through its fields is more elegant,
- * and should be the one valid thing to do. However, a lot of open code
- * still touches the a and b accessors, and doing this allow us to do it
- * incrementally. We keep the signature as a struct, rather than an union,
- * so we can get rid of it transparently in the future -- glommer
- */
-/* 8 byte segment descriptor */
-struct desc_struct {
- union {
- struct {
- unsigned int a;
- unsigned int b;
- }a;
- struct {
- unsigned short limit0;
- unsigned short base0;
- unsigned base1: 8, type: 4, s: 1, dpl: 2, p: 1;
- unsigned limit: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8;
- }b;
- }c;
-} __attribute__((packed));
-
-static inline unsigned long get_desc_base(const struct desc_struct *desc)
+static inline void kvm_migrate_timers(struct kvm_vcpu *vcpu)
{
- return (unsigned)(desc->c.b.base0 | ((desc->c.b.base1) << 16) | ((desc->c.b.base2) << 24));
+ set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests);
}
-extern unsigned long segment_base(uint16_t selector);
+enum kvm_stat_kind {
+ KVM_STAT_VM,
+ KVM_STAT_VCPU,
+};
+
+struct kvm_stats_debugfs_item {
+ const char *name;
+ int offset;
+ enum kvm_stat_kind kind;
+ struct dentry *dentry;
+};
+extern struct kvm_stats_debugfs_item debugfs_entries[];
+extern struct dentry *kvm_debugfs_dir;
-static inline unsigned long kvm_read_tr_base(void)
+#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
+static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_seq)
{
- unsigned short tr;
- asm("str %0" : "=g"(tr));
- return segment_base(tr);
+ if (unlikely(vcpu->kvm->mmu_notifier_count))
+ return 1;
+ /*
+ * Both reads happen under the mmu_lock and both values are
+ * modified under mmu_lock, so there's no need for smp_rmb()
+ * here in between, otherwise mmu_notifier_count should be
+ * read before mmu_notifier_seq, see
+ * mmu_notifier_invalidate_range_end write side.
+ */
+ if (vcpu->kvm->mmu_notifier_seq != mmu_seq)
+ return 1;
+ return 0;
}
+#endif
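A minimal sketch of the intended calling pattern for mmu_notifier_retry(), mirroring the (currently #ifdef XXX'd) tdp_page_fault() path further down in this diff; the names are the ones already used there:

	unsigned long mmu_seq = vcpu->kvm->mmu_notifier_seq;
	smp_rmb();				/* order the seq snapshot before the pfn lookup */
	pfn = gfn_to_pfn(vcpu->kvm, gfn);	/* may block while the host mapping changes */
	mutex_enter(&vcpu->kvm->mmu_lock);
	if (mmu_notifier_retry(vcpu, mmu_seq))	/* an invalidation raced with us: drop the pfn and retry */
		goto out_unlock;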
-#ifdef CONFIG_X86_64
-static inline unsigned long read_msr(unsigned long msr)
-{
- uint64_t value;
+#ifndef KVM_ARCH_HAS_UNALIAS_INSTANTIATION
+#define unalias_gfn_instantiation unalias_gfn
+#endif
+
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
+
+#define KVM_MAX_IRQ_ROUTES 1024
+
+int kvm_setup_default_irq_routing(struct kvm *kvm);
+int kvm_set_irq_routing(struct kvm *kvm,
+ const struct kvm_irq_routing_entry *entries,
+ unsigned nr,
+ unsigned flags);
+void kvm_free_irq_routing(struct kvm *kvm);
+
+#else
+
+static inline void kvm_free_irq_routing(struct kvm *kvm) {}
- rdmsrl(msr, value);
- return value;
-}
#endif
-#ifdef XXX
-static inline void kvm_fx_save(struct i387_fxsave_struct *image)
-{
- asm("fxsave (%0)":: "r" (image));
-}
+#undef CONFIG_HAVE_KVM_EVENTFD
-static inline void kvm_fx_restore(struct i387_fxsave_struct *image)
-{
- asm("fxrstor (%0)":: "r" (image));
-}
+#ifdef CONFIG_HAVE_KVM_EVENTFD
+
+void kvm_eventfd_init(struct kvm *kvm);
+int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags);
+void kvm_irqfd_release(struct kvm *kvm);
+int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args);
-static inline void kvm_fx_finit(void)
+#else
+
+static inline void kvm_eventfd_init(struct kvm *kvm) {}
+static inline int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
{
- asm("finit");
+ return -EINVAL;
}
-#endif /*XXX*/
-static inline uint32_t get_rdx_init_val(void)
+
+static inline void kvm_irqfd_release(struct kvm *kvm) {}
+static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
- return 0x600; /* P6 family */
+ return -ENOSYS;
}
-static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, uint32_t error_code)
+#endif /* CONFIG_HAVE_KVM_EVENTFD */
+
+#ifdef CONFIG_KVM_APIC_ARCHITECTURE
+static inline bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
{
-#ifdef XXX
- kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
-#endif /*XXX*/
+ return vcpu->kvm->bsp_vcpu_id == vcpu->vcpu_id;
}
+#endif
-#define TSS_IOPB_BASE_OFFSET 0x66
-#define TSS_BASE_SIZE 0x68
-#define TSS_IOPB_SIZE (65536 / 8)
-#define TSS_REDIRECTION_SIZE (256 / 8)
-#define RMODE_TSS_SIZE \
- (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)
-
-enum {
- TASK_SWITCH_CALL = 0,
- TASK_SWITCH_IRET = 1,
- TASK_SWITCH_JMP = 2,
- TASK_SWITCH_GATE = 3,
-};
-
-#define HF_GIF_MASK (1 << 0)
-#define HF_HIF_MASK (1 << 1)
-#define HF_VINTR_MASK (1 << 2)
-#define HF_NMI_MASK (1 << 3)
-#define HF_IRET_MASK (1 << 4)
+#ifdef __KVM_HAVE_DEVICE_ASSIGNMENT
-/*
- * Hardware virtualization extension instructions may fault if a
- * reboot turns off virtualization while processes are running.
- * Trap the fault and ignore the instruction if that happens.
- */
+long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
+ unsigned long arg);
-#ifdef XXX
-#include "linkage.h"
+#else
-asmlinkage void kvm_handle_fault_on_reboot(void);
+static inline long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
+ unsigned long arg)
+{
+ return -ENOTTY;
+}
#endif
-
-#define __kvm_handle_fault_on_reboot(insn) \
- "666: " insn "\n\t" \
- ".pushsection .fixup, \"ax\" \n" \
- "667: \n\t" \
- __ASM_SIZE(push) " $666b \n\t" \
- ".popsection \n\t" \
- ".pushsection __ex_table, \"a\" \n\t" \
- _ASM_PTR " 666b, 667b \n\t" \
- ".popsection \n\t"
-
-#define KVM_ARCH_WANT_MMU_NOTIFIER
-
-#ifdef XXX
-int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
-int kvm_age_hva(struct kvm *kvm, unsigned long hva);
-void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
-int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
-int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
-int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
-int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
-
-void kvm_define_shared_msr(unsigned index, uint32_t msr);
-void kvm_set_shared_msr(unsigned index, uint64_t val, uint64_t mask);
#endif /*XXX*/
-#endif /* _ASM_X86_KVM_HOST_H */
+#endif
+
diff --git a/kvm_x86.c b/kvm_x86.c
index 0885d84..29b1d6b 100644
--- a/kvm_x86.c
+++ b/kvm_x86.c
@@ -18,6 +18,7 @@
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <vm/hat_i86.h>
+#include <sys/segments.h>
#include "msr-index.h"
#include "msr.h"
@@ -25,16 +26,24 @@
#include "processor-flags.h"
#include "apicdef.h"
#include "kvm_host.h"
+#include "kvm_x86host.h"
+#include "iodev.h"
#define PER_CPU_ATTRIBUTES
#define PER_CPU_DEF_ATTRIBUTES
#define PER_CPU_BASE_SECTION ".data"
#include "percpu-defs.h"
+#include "coalesced_mmio.h"
#include "kvm.h"
+#include "irq.h"
extern struct vmcs **vmxarea;
static int vcpuid;
+extern uint64_t native_read_msr_safe(unsigned int msr,
+ int *err);
+extern int native_write_msr_safe(unsigned int msr,
+ unsigned low, unsigned high);
unsigned long segment_base(uint16_t selector)
{
@@ -124,7 +133,7 @@ static void kvm_iommu_put_pages(struct kvm *kvm,
gfn++;
}
- iommu_unmap_range(domain, gfn_to_gpa(base_gfn), PAGE_SIZE * npages);
+ iommu_unmap_range(domain, gfn_to_gpa(base_gfn), PAGESIZE * npages);
}
static int
@@ -219,6 +228,7 @@ vmx_hardware_enable(void *garbage)
uint64_t phys_addr = kvtop(per_cpu(vmxarea, cpu));
#else
uint64_t phys_addr;
+ volatile int x; /* XXX - dtrace return probe missing */
pfn = hat_getpfnum(kas.a_hat, (caddr_t)vmxarea[cpu]);
phys_addr = ((uint64_t)pfn << PAGESHIFT)|((uint64_t)vmxarea[cpu] & PAGEOFFSET);
#endif
@@ -249,9 +259,35 @@ vmx_hardware_enable(void *garbage)
ept_sync_global();
#endif /*XXX*/
+ x = 10; /*XXX*/
return 0;
}
+extern struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu);
+extern void vmcs_writel(unsigned long field, unsigned long value);
+extern unsigned long vmcs_readl(unsigned long field);
+
+unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
+{
+ unsigned long rflags, save_rflags;
+
+ rflags = vmcs_readl(GUEST_RFLAGS);
+ if (to_vmx(vcpu)->rmode.vm86_active) {
+ rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
+ save_rflags = to_vmx(vcpu)->rmode.save_rflags;
+ rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
+ }
+ return rflags;
+}
+void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+ if (to_vmx(vcpu)->rmode.vm86_active) {
+ to_vmx(vcpu)->rmode.save_rflags = rflags;
+ rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
+ }
+ vmcs_writel(GUEST_RFLAGS, rflags);
+}
+
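In other words, while rmode.vm86_active is set the guest's own IOPL/VM values live only in rmode.save_rflags: vmx_set_rflags() records the full value there and forces X86_EFLAGS_IOPL | X86_EFLAGS_VM into GUEST_RFLAGS so vm86 emulation keeps running, and vmx_get_rflags() stitches the saved bits back in so callers never see the forced ones (assuming RMODE_GUEST_OWNED_EFLAGS_BITS covers everything except IOPL and VM, as in the upstream code this is ported from).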
int kvm_arch_hardware_enable(void *garbage)
{
#ifdef LATER
@@ -362,19 +398,14 @@ kvm_dev_ioctl_check_extension(long ext, int *rval_p)
return r;
}
-static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
-{
- return kvm->arch.vpic;
-}
-
-static inline int irqchip_in_kernel(struct kvm *kvm)
+int irqchip_in_kernel(struct kvm *kvm)
{
int ret;
ret = (pic_irqchip(kvm) != NULL);
#ifdef XXX
smp_rmb();
-#endif /*XXX*/
+#endif
return ret;
}
@@ -390,12 +421,16 @@ kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
#ifdef XXX
if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
-#endif
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
-#ifdef XXX
+ else
+ vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
+#else
+ if (!irqchip_in_kernel(kvm) /* || kvm_vcpu_is_bsp(vcpu) */)
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
else
vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
#endif
+
page = kmem_zalloc(PAGESIZE, KM_SLEEP);
if (!page) {
r = ENOMEM;
@@ -414,6 +449,7 @@ kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
goto fail_mmu_destroy;
}
#endif /*XXX*/
+
vcpu->arch.mce_banks = kmem_zalloc(KVM_MAX_MCE_BANKS * sizeof(uint64_t) * 4,
KM_SLEEP);
if (!vcpu->arch.mce_banks) {
@@ -487,8 +523,7 @@ fail:
* 32-bit hardware).
*/
-uint64_t
-kvm_va2pa(caddr_t va)
+uint64_t kvm_va2pa(caddr_t va)
{
uint64_t pa;
@@ -496,12 +531,21 @@ kvm_va2pa(caddr_t va)
return (pa);
}
+#ifdef XXX
unsigned long *vmx_io_bitmap_a;
unsigned long *vmx_io_bitmap_b;
unsigned long *vmx_msr_bitmap_legacy;
unsigned long *vmx_msr_bitmap_longmode;
+#else
+/* Make these static arrays to try to force them into low 4GB memory... */
+/* They also need to be page-aligned. */
+__attribute__((__aligned__(PAGESIZE)))unsigned long vmx_io_bitmap_a[PAGESIZE/sizeof(unsigned long)];
+__attribute__((__aligned__(PAGESIZE)))unsigned long vmx_io_bitmap_b[PAGESIZE/sizeof(unsigned long)];
+__attribute__((__aligned__(PAGESIZE)))unsigned long vmx_msr_bitmap_legacy[PAGESIZE/sizeof(unsigned long)];
+__attribute__((__aligned__(PAGESIZE)))unsigned long vmx_msr_bitmap_longmode[PAGESIZE/sizeof(unsigned long)];
+#endif /*XXX*/
+
-extern void vmcs_writel(unsigned long field, unsigned long value);
static void vmcs_write16(unsigned long field, uint16_t value)
{
vmcs_writel(field, value);
@@ -521,169 +565,64 @@ static void vmcs_write64(unsigned long field, uint64_t value)
#endif
}
-extern unsigned long vmcs_readl(unsigned long field);
-
-
-/*
- * Sets up the vmcs for emulated real mode.
- */
-static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
-{
- uint32_t host_sysenter_cs, msr_low, msr_high;
- uint32_t junk;
- uint64_t host_pat, tsc_this, tsc_base;
- unsigned long a;
- struct descriptor_table dt;
- int i;
- unsigned long kvm_vmx_return;
- uint32_t exec_control;
-
-#ifdef XXX
- /* I/O */
- vmcs_write64(IO_BITMAP_A, kvm_va2pa((caddr_t)vmx_io_bitmap_a));
- vmcs_write64(IO_BITMAP_B, kvm_va2pa((caddr_t)vmx_io_bitmap_b));
-
- if (cpu_has_vmx_msr_bitmap())
- vmcs_write64(MSR_BITMAP, kvm_pa2va(vmx_msr_bitmap_legacy));
-
- vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
-
- /* Control */
- vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
- vmcs_config.pin_based_exec_ctrl);
-
- exec_control = vmcs_config.cpu_based_exec_ctrl;
- if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
- exec_control &= ~CPU_BASED_TPR_SHADOW;
-#ifdef CONFIG_X86_64
- exec_control |= CPU_BASED_CR8_STORE_EXITING |
- CPU_BASED_CR8_LOAD_EXITING;
-#endif
- }
- if (!enable_ept)
- exec_control |= CPU_BASED_CR3_STORE_EXITING |
- CPU_BASED_CR3_LOAD_EXITING |
- CPU_BASED_INVLPG_EXITING;
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
-
- if (cpu_has_secondary_exec_ctrls()) {
- exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
- if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
- exec_control &=
- ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
- if (vmx->vpid == 0)
- exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
- if (!enable_ept) {
- exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
- enable_unrestricted_guest = 0;
- }
- if (!enable_unrestricted_guest)
- exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
- if (!ple_gap)
- exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
- }
-
- if (ple_gap) {
- vmcs_write32(PLE_GAP, ple_gap);
- vmcs_write32(PLE_WINDOW, ple_window);
- }
-
- vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
- vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
- vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
-
- vmcs_writel(HOST_CR0, getcr0()); /* 22.2.3 */
- vmcs_writel(HOST_CR4, getcr4()); /* 22.2.3, 22.2.5 */
- vmcs_writel(HOST_CR3, getcr3()); /* 22.2.3 FIXME: shadow tables */
-
- vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
- vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
- vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
- vmcs_write16(HOST_FS_SELECTOR, kvm_read_fs()); /* 22.2.4 */
- vmcs_write16(HOST_GS_SELECTOR, kvm_read_gs()); /* 22.2.4 */
- vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
-#ifdef CONFIG_X86_64
- rdmsrl(MSR_FS_BASE, a);
- vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
- rdmsrl(MSR_GS_BASE, a);
- vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
-#else
- vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
- vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
-#endif
+extern int enable_ept;
+extern int enable_unrestricted_guest;
+extern int emulate_invalid_guest_state;
- vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
+static int bypass_guest_pf = 1;
- kvm_get_idt(&dt);
- vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */
+extern void vmcs_clear(struct vmcs *vmcs);
+extern void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+extern void vmx_vcpu_put(struct kvm_vcpu *vcpu);
- asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
- vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
- vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
- vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
- vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
+extern int vmx_vcpu_setup(struct vcpu_vmx *vmx);
+extern int enable_vpid;
- rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
- vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
- rdmsrl(MSR_IA32_SYSENTER_ESP, a);
- vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */
- rdmsrl(MSR_IA32_SYSENTER_EIP, a);
- vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */
+extern ulong_t *vmx_vpid_bitmap;
+extern kmutex_t vmx_vpid_lock;
- if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
- rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
- host_pat = msr_low | ((uint64_t) msr_high << 32);
- vmcs_write64(HOST_IA32_PAT, host_pat);
- }
- if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
- rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
- host_pat = msr_low | ((uint64_t) msr_high << 32);
- /* Write the default value follow host pat */
- vmcs_write64(GUEST_IA32_PAT, host_pat);
- /* Keep arch.pat sync with GUEST_IA32_PAT */
- vmx->vcpu.arch.pat = host_pat;
- }
+static void allocate_vpid(struct vcpu_vmx *vmx)
+{
+ int vpid;
- for (i = 0; i < NR_VMX_MSR; ++i) {
- uint32_t index = vmx_msr_index[i];
- uint32_t data_low, data_high;
- int j = vmx->nmsrs;
-
- if (rdmsr_safe(index, &data_low, &data_high) < 0)
- continue;
- if (wrmsr_safe(index, data_low, data_high) < 0)
- continue;
- vmx->guest_msrs[j].index = i;
- vmx->guest_msrs[j].data = 0;
- vmx->guest_msrs[j].mask = -1ull;
- ++vmx->nmsrs;
+ vmx->vpid = 0;
+ if (!enable_vpid)
+ return;
+ mutex_enter(&vmx_vpid_lock);
+ vpid = bt_availbit(vmx_vpid_bitmap, VMX_NR_VPIDS);
+ if (vpid < VMX_NR_VPIDS) {
+ vmx->vpid = vpid;
+ BT_SET(vmx_vpid_bitmap, vpid);
}
+ mutex_exit(&vmx_vpid_lock);
+}
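allocate_vpid() only claims a bit in vmx_vpid_bitmap; the matching release path is not part of this hunk, but a plausible counterpart (an assumption, shown only as a sketch) would clear the bit under the same lock:

	static void free_vpid(struct vcpu_vmx *vmx)	/* hypothetical helper, not in this diff */
	{
		if (!enable_vpid || vmx->vpid == 0)
			return;
		mutex_enter(&vmx_vpid_lock);
		BT_CLEAR(vmx_vpid_bitmap, vmx->vpid);	/* return the vpid to the pool */
		mutex_exit(&vmx_vpid_lock);
	}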
- vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
-
- /* 22.2.1, 20.8.1 */
- vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
-
- vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
- vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
- if (enable_ept)
- vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
- vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
+#ifdef XXX
+static int alloc_identity_pagetable(struct kvm *kvm)
+{
+ struct kvm_userspace_memory_region kvm_userspace_mem;
+ int r = 0;
- tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc;
- rdtscll(tsc_this);
- if (tsc_this < vmx->vcpu.kvm->arch.vm_init_tsc)
- tsc_base = tsc_this;
+ mutex_enter(&kvm->slots_lock);
+ if (kvm->arch.ept_identity_pagetable)
+ goto out;
+ kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
+ kvm_userspace_mem.flags = 0;
+ kvm_userspace_mem.guest_phys_addr =
+ kvm->arch.ept_identity_map_addr;
+ kvm_userspace_mem.memory_size = PAGESIZE;
+ r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
+ if (r)
+ goto out;
- guest_write_tsc(0, tsc_base);
-#endif /*XXX*/
- return 0;
+ kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
+ kvm->arch.ept_identity_map_addr >> PAGESHIFT);
+out:
+ mutex_exit(&kvm->slots_lock);
+ return r;
}
-extern void vmcs_clear(struct vmcs *vmcs);
-extern void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
-extern void vmx_vcpu_put(struct kvm_vcpu *vcpu);
+#endif /*XXX*/
struct kvm_vcpu *
vmx_create_vcpu(struct kvm *kvm, struct kvm_vcpu_ioc *arg, unsigned int id)
@@ -694,9 +633,8 @@ vmx_create_vcpu(struct kvm *kvm, struct kvm_vcpu_ioc *arg, unsigned int id)
if (!vmx)
return NULL;
-#ifdef NOTNOW
+
allocate_vpid(vmx);
-#endif /*NOTNOW*/
err = kvm_vcpu_init(&vmx->vcpu, kvm, arg, id);
if (err) {
#ifdef NOTNOW
@@ -713,8 +651,11 @@ vmx_create_vcpu(struct kvm *kvm, struct kvm_vcpu_ioc *arg, unsigned int id)
}
vmx->vmcs = kmem_zalloc(PAGESIZE, KM_SLEEP);
- if (!vmx->vmcs)
+ if (!vmx->vmcs) {
+ kmem_free(vmx, sizeof(struct vcpu_vmx));
+ vmx = NULL;
return NULL;
+ }
kpreempt_disable();
@@ -734,11 +675,16 @@ vmx_create_vcpu(struct kvm *kvm, struct kvm_vcpu_ioc *arg, unsigned int id)
kpreempt_enable();
if (err)
vmx->vmcs = NULL;
-#ifdef NOTNOW
if (vm_need_virtualize_apic_accesses(kvm))
+#ifdef XXX
if (alloc_apic_access_page(kvm) != 0)
+#endif /*XXX*/
goto free_vmcs;
+#ifdef XXX
+ /*
+ * XXX For now, we don't implement EPT.
+ */
if (enable_ept) {
if (!kvm->arch.ept_identity_map_addr)
kvm->arch.ept_identity_map_addr =
@@ -746,13 +692,14 @@ vmx_create_vcpu(struct kvm *kvm, struct kvm_vcpu_ioc *arg, unsigned int id)
if (alloc_identity_pagetable(kvm) != 0)
goto free_vmcs;
}
+#endif /*XXX*/
-#endif /*NOTNOW*/
return &vmx->vcpu;
-#ifdef XXX
free_vmcs:
- free_vmcs(vmx->vmcs);
+ kmem_free(vmx->vmcs, PAGESIZE);
+ vmx->vmcs = 0;
+#ifdef XXX
free_msrs:
kfree(vmx->guest_msrs);
uninit_vcpu:
@@ -772,10 +719,8 @@ kvm_arch_vcpu_create(struct kvm *kvm, struct kvm_vcpu_ioc *arg, unsigned int id)
return vmx_create_vcpu(kvm, arg, id);
}
-extern struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu);
-
+extern int enable_ept;
-static int enable_ept = 1;
static void update_exception_bitmap(struct kvm_vcpu *vcpu)
{
uint32_t eb;
@@ -788,7 +733,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
eb |= 1u << BP_VECTOR;
#endif /*XXX*/
- if (to_vmx(vcpu)->rmode.vm86_active)
+ if (((struct vcpu_vmx *)vcpu)->rmode.vm86_active)
eb = ~0;
if (enable_ept)
eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
@@ -801,12 +746,12 @@ static inline uint32_t apic_get_reg(struct kvm_lapic *apic, int reg_off)
{
return *((uint32_t *) (apic->regs + reg_off));
}
-static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, uint32_t val)
+void apic_set_reg(struct kvm_lapic *apic, int reg_off, uint32_t val)
{
*((uint32_t *) (apic->regs + reg_off)) = val;
}
-static inline int kvm_apic_id(struct kvm_lapic *apic)
+int kvm_apic_id(struct kvm_lapic *apic)
{
return (apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
}
@@ -874,7 +819,7 @@ void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
vcpu->arch.cr8 = cr8;
}
-static int is_paging(struct kvm_vcpu *vcpu)
+int is_paging(struct kvm_vcpu *vcpu)
{
#ifdef XXX
return kvm_getcr0_bits(vcpu, X86_CR0_PG);
@@ -885,8 +830,8 @@ static int is_paging(struct kvm_vcpu *vcpu)
void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
- unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ?
- KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
+ unsigned long hw_cr4 = cr4 | (((struct vcpu_vmx *)vcpu)->rmode.vm86_active ?
+ KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
vcpu->arch.cr4 = cr4;
if (enable_ept) {
@@ -904,7 +849,7 @@ void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vcpu_vmx *vmx = (struct vcpu_vmx *)vcpu;
unsigned long hw_cr0;
#ifdef XXX
if (enable_unrestricted_guest)
@@ -913,7 +858,6 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
else
#endif /*XXX*/
hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON;
-
#ifdef XXX
if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
enter_pmode(vcpu);
@@ -933,10 +877,8 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
}
#endif
-#ifdef XXX
if (enable_ept)
ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
-#endif /*XXX*/
if (!vcpu->fpu_active)
hw_cr0 |= X86_CR0_TS | X86_CR0_MP;
@@ -978,33 +920,145 @@ static void seg_setup(int seg)
vmcs_write16(sf->selector, 0);
vmcs_writel(sf->base, 0);
vmcs_write32(sf->limit, 0xffff);
-#ifdef XXX
+
if (enable_unrestricted_guest) {
ar = 0x93;
if (seg == VCPU_SREG_CS)
ar |= 0x08; /* code segment */
} else
-#endif /*XXX*/
ar = 0xf3;
vmcs_write32(sf->ar_bytes, ar);
}
+static gva_t rmode_tss_base(struct kvm *kvm)
+{
+ if (!kvm->arch.tss_addr) {
+ struct kvm_memslots *slots;
+ gfn_t base_gfn;
+
+#ifdef XXX
+ slots = rcu_dereference(kvm->memslots);
+#else
+ slots = kvm->memslots;
+#endif /*XXX*/
+ base_gfn = slots->memslots[0].base_gfn +
+ slots->memslots[0].npages - 3;
+ return base_gfn << PAGESHIFT;
+ }
+ return kvm->arch.tss_addr;
+}
+
+extern int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
+ int offset, int len);
+
+unsigned long empty_zero_page[PAGESIZE / sizeof(unsigned long)];
+
+int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
+{
+ return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len);
+}
+
+static int init_rmode_tss(struct kvm *kvm)
+{
+ gfn_t fn = rmode_tss_base(kvm) >> PAGESHIFT;
+ uint16_t data = 0;
+ int ret = 0;
+ int r;
+
+ r = kvm_clear_guest_page(kvm, fn, 0, PAGESIZE);
+ if (r < 0)
+ goto out;
+ data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
+ r = kvm_write_guest_page(kvm, fn++, &data,
+ TSS_IOPB_BASE_OFFSET, sizeof(uint16_t));
+ if (r < 0)
+ goto out;
+ r = kvm_clear_guest_page(kvm, fn++, 0, PAGESIZE);
+ if (r < 0)
+ goto out;
+ r = kvm_clear_guest_page(kvm, fn, 0, PAGESIZE);
+ if (r < 0)
+ goto out;
+ data = ~0;
+ r = kvm_write_guest_page(kvm, fn, &data,
+ RMODE_TSS_SIZE - 2 * PAGESIZE - 1,
+ sizeof(uint8_t));
+ if (r < 0)
+ goto out;
+
+ ret = 1;
+out:
+ return ret;
+}
+
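For reference, the offsets used above follow from the TSS constants removed from kvm_host.h earlier in this diff: RMODE_TSS_SIZE = TSS_BASE_SIZE (0x68) + TSS_REDIRECTION_SIZE (256/8 = 32) + TSS_IOPB_SIZE (65536/8 = 8192) + 1 = 8329 bytes, which spans three 4 KB pages (PAGESIZE here). That is why three pages are cleared, the I/O bitmap base pointer is written at TSS_IOPB_BASE_OFFSET in the first page, and the terminating 0xff byte lands at offset RMODE_TSS_SIZE - 2 * PAGESIZE - 1 = 136 of the third page.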
+static int init_rmode_identity_map(struct kvm *kvm)
+{
+ int i, r, ret;
+ pfn_t identity_map_pfn;
+ uint32_t tmp;
+
+ if (!enable_ept)
+ return 1;
+ if (!kvm->arch.ept_identity_pagetable) {
+ cmn_err(CE_WARN, "EPT: identity-mapping pagetable hasn't been allocated!\n");
+ return 0;
+ }
+ if (kvm->arch.ept_identity_pagetable_done)
+ return 1;
+ ret = 0;
+ identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGESHIFT;
+ r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGESIZE);
+ if (r < 0)
+ goto out;
+#ifdef XXX
+ /* Set up identity-mapping pagetable for EPT in real mode */
+ for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
+ tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
+ _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
+ r = kvm_write_guest_page(kvm, identity_map_pfn,
+ &tmp, i * sizeof(tmp), sizeof(tmp));
+ if (r < 0)
+ goto out;
+ }
+#endif /*XXX*/
+ kvm->arch.ept_identity_pagetable_done = 1;
+ ret = 1;
+out:
+ return ret;
+}
+
+static int init_rmode(struct kvm *kvm)
+{
+ if (!init_rmode_tss(kvm))
+ return 0;
+ if (!init_rmode_identity_map(kvm))
+ return 0;
+ return 1;
+}
+
+extern void vmx_set_efer(struct kvm_vcpu *vcpu, uint64_t efer);
+extern void kvm_register_write(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg,
+ unsigned long val);
+extern ulong kvm_read_cr0(struct kvm_vcpu *vcpu);
+extern void setup_msrs(struct vcpu_vmx *vmx);
+
int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
{
- struct vcpu_vmx *vmx = (struct vcpu_vmx *)to_vmx(vcpu);
+ struct vcpu_vmx *vmx = (struct vcpu_vmx *)vcpu;
uint64_t msr;
int ret, idx;
vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
#ifdef XXX
idx = srcu_read_lock(&vcpu->kvm->srcu);
+#endif /*XXX*/
if (!init_rmode(vmx->vcpu.kvm)) {
ret = -ENOMEM;
goto out;
}
-#endif
vmx->rmode.vm86_active = 0;
vmx->soft_vnmi_blocked = 0;
@@ -1027,12 +1081,12 @@ int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
* GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
* insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
*/
-#ifdef XXX
+#ifdef CONFIG_KVM_APIC_ARCHITECTURE
if (kvm_vcpu_is_bsp(&vmx->vcpu)) {
vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
vmcs_writel(GUEST_CS_BASE, 0x000f0000);
} else {
-#endif /*XXX*/
+#endif /*CONFIG_KVM_APIC_ARCHITECTURE*/
vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
#ifdef XXX
@@ -1064,9 +1118,9 @@ int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
kvm_rip_write(vcpu, 0xfff0);
else
kvm_rip_write(vcpu, 0);
-
- kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
#endif /*XXX*/
+ kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
+
vmcs_writel(GUEST_DR7, 0x400);
vmcs_writel(GUEST_GDTR_BASE, 0);
@@ -1082,9 +1136,7 @@ int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
/* Special registers */
vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
-#ifdef XXX
setup_msrs(vmx);
-#endif /*XXX*/
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
@@ -1101,20 +1153,18 @@ int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmcs_write64(APIC_ACCESS_ADDR,
page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
#endif /*XXX*/
+
if (vmx->vpid != 0)
vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
-#ifdef XXX
vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */
-#endif /*XXX*/
vmx_set_cr4(&vmx->vcpu, 0);
-#ifdef XXX
vmx_set_efer(&vmx->vcpu, 0);
+#ifdef XXX
vmx_fpu_activate(&vmx->vcpu);
#endif /*XXX*/
update_exception_bitmap(&vmx->vcpu);
-
#ifdef XXX
vpid_sync_vcpu_all(vmx);
#endif /*XXX*/
@@ -1148,18 +1198,424 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
extern void vcpu_load(struct kvm_vcpu *vcpu);
-static int init_kvm_mmu(struct kvm_vcpu *vcpu)
+static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
+{
+}
+
+
+static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
+ struct kmem_cache *base_cache, int min)
+{
+ caddr_t obj;
+
+ if (cache->nobjs >= min)
+ return 0;
+ while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
+ obj = kmem_cache_alloc(base_cache, KM_SLEEP);
+ if (!obj)
+ return -ENOMEM;
+ cache->objects[cache->nobjs++] = obj;
+ }
+ return 0;
+}
+
+extern struct kmem_cache *pte_chain_cache;
+extern struct kmem_cache *rmap_desc_cache;
+extern struct kmem_cache *mmu_page_header_cache;
+
+/* XXX: the following is called for tdp (two-dimensional hardware paging); */
+/* we don't support this right now. */
+int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
+{
+ int r = 0;
+
+ r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
+ pte_chain_cache, 4);
+ if (r)
+ goto out;
+ r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
+ rmap_desc_cache, 4);
+ if (r)
+ goto out;
+ r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
+ if (r)
+ goto out;
+ r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
+ mmu_page_header_cache, 4);
+out:
+ return r;
+}
+
+
+static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
+{
+ struct kvm_memory_slot *slot;
+ int host_level, level, max_level;
+#ifdef XXX
+ slot = gfn_to_memslot(vcpu->kvm, large_gfn);
+ if (slot && slot->dirty_bitmap)
+ return PT_PAGE_TABLE_LEVEL;
+
+ host_level = host_mapping_level(vcpu->kvm, large_gfn);
+
+ if (host_level == PT_PAGE_TABLE_LEVEL)
+ return host_level;
+
+ max_level = kvm_x86_ops->get_lpage_level() < host_level ?
+ kvm_x86_ops->get_lpage_level() : host_level;
+
+ for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
+ if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
+ break;
+
+ return level - 1;
+#else
+ return 0;
+#endif /*XXX*/
+}
+
+extern struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn);
+unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
+{
+ struct kvm_memory_slot *slot;
+
+ gfn = unalias_gfn_instantiation(kvm, gfn);
+ slot = gfn_to_memslot_unaliased(kvm, gfn);
+ if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
+ return bad_hva();
+ return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGESIZE);
+}
+
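As a worked example of the mapping above (illustrative numbers only): for a slot with base_gfn = 0x100, npages = 0x20 and userspace_addr = 0xfe000000, a lookup of gfn 0x105 returns 0xfe000000 + 5 * PAGESIZE = 0xfe005000, while any gfn that falls outside a valid slot yields bad_hva().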
+int kvm_is_error_hva(unsigned long addr)
+{
+ return addr == bad_hva();
+}
+
+extern caddr_t bad_page;
+
+pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
+{
+ unsigned long addr;
+
+ addr = gfn_to_hva(kvm, gfn);
+ if (kvm_is_error_hva(addr)) {
+ get_page(bad_page);
+ return page_to_pfn(bad_page);
+ }
+
+ return hva_to_pfn(kvm, addr);
+}
+
+static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
+ uint32_t error_code)
+{
+#ifdef XXX
+ pfn_t pfn;
+ int r;
+ int level;
+ gfn_t gfn = gpa >> PAGESHIFT;
+ unsigned long mmu_seq;
+
+ ASSERT(vcpu);
+ ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+ r = mmu_topup_memory_caches(vcpu);
+ if (r)
+ return r;
+
+ level = mapping_level(vcpu, gfn);
+
+ gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+
+ mmu_seq = vcpu->kvm->mmu_notifier_seq;
+ smp_rmb();
+
+ pfn = gfn_to_pfn(vcpu->kvm, gfn);
+ if (is_error_pfn(pfn)) {
+ kvm_release_pfn_clean(pfn);
+ return 1;
+ }
+ mutex_enter(&vcpu->kvm->mmu_lock);
+ if (mmu_notifier_retry(vcpu, mmu_seq))
+ goto out_unlock;
+ kvm_mmu_free_some_pages(vcpu);
+ r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
+ level, gfn, pfn);
+ mutex_exit(&vcpu->kvm->mmu_lock);
+
+ return r;
+
+out_unlock:
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ kvm_release_pfn_clean(pfn);
+#endif /*XXX*/
+ return 0;
+}
+
+static void mmu_free_roots(struct kvm_vcpu *vcpu)
+{
+ int i;
+ struct kvm_mmu_page *sp;
+
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return;
+ mutex_enter(&vcpu->kvm->mmu_lock);
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+ hpa_t root = vcpu->arch.mmu.root_hpa;
+
+ sp = page_header(root);
+ --sp->root_count;
+ if (!sp->root_count && sp->role.invalid)
+ kvm_mmu_zap_page(vcpu->kvm, sp);
+ vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ return;
+ }
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+ if (root) {
+ root &= PT64_BASE_ADDR_MASK;
+ sp = page_header(root);
+ --sp->root_count;
+ if (!sp->root_count && sp->role.invalid)
+ kvm_mmu_zap_page(vcpu->kvm, sp);
+ }
+ vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
+ }
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+}
+
+static void nonpaging_free(struct kvm_vcpu *vcpu)
+{
+ mmu_free_roots(vcpu);
+}
+
+static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp)
+{
+ int i;
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
+ sp->spt[i] = shadow_trap_nonpresent_pte;
+}
+
+static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp)
+{
+ return 1;
+}
+
+static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
+{
+}
+
+int get_ept_level(void)
+{
+ return VMX_EPT_DEFAULT_GAW + 1;
+}
+
+static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
+ u32 access, u32 *error)
+{
+ if (error)
+ *error = 0;
+ return vaddr;
+}
+
+static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
+{
+ struct kvm_mmu *context = &vcpu->arch.mmu;
+ int maxphyaddr = cpuid_maxphyaddr(vcpu);
+ u64 exb_bit_rsvd = 0;
+
+ if (!is_nx(vcpu))
+ exb_bit_rsvd = rsvd_bits(63, 63);
+ switch (level) {
+ case PT32_ROOT_LEVEL:
+ /* no rsvd bits for 2 level 4K page table entries */
+ context->rsvd_bits_mask[0][1] = 0;
+ context->rsvd_bits_mask[0][0] = 0;
+ if (is_cpuid_PSE36())
+ /* 36bits PSE 4MB page */
+ context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
+ else
+ /* 32 bits PSE 4MB page */
+ context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
+ context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
+ break;
+ case PT32E_ROOT_LEVEL:
+ context->rsvd_bits_mask[0][2] =
+ rsvd_bits(maxphyaddr, 63) |
+ rsvd_bits(7, 8) | rsvd_bits(1, 2); /* PDPTE */
+ context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 62); /* PDE */
+ context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 62); /* PTE */
+ context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 62) |
+ rsvd_bits(13, 20); /* large page */
+ context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
+ break;
+ case PT64_ROOT_LEVEL:
+ context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
+ context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
+ context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 51);
+ context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 51);
+ context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
+ context->rsvd_bits_mask[1][2] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 51) |
+ rsvd_bits(13, 29);
+ context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 51) |
+ rsvd_bits(13, 20); /* large page */
+ context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
+ break;
+ }
+}
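The masks above are built with rsvd_bits(s, e), which sets bits s through e inclusive. A minimal standalone sketch, assuming the usual KVM definition of that helper, makes the PAE-mode masks concrete for an example MAXPHYADDR of 40.

#include <stdint.h>
#include <stdio.h>

/* Assumed helper, matching the usual KVM definition of rsvd_bits(). */
static uint64_t
rsvd_bits(int s, int e)
{
	return (((1ULL << (e - s + 1)) - 1) << s);
}

int
main(void)
{
	int maxphyaddr = 40;	/* example physical address width */

	/* PAE PDE mask as built above: bits [maxphyaddr, 62] are reserved. */
	uint64_t pde = rsvd_bits(maxphyaddr, 62);

	/* A 2MB large-page PDE additionally reserves bits [13, 20]. */
	uint64_t large = pde | rsvd_bits(13, 20);

	printf("PDE rsvd mask   = 0x%016llx\n", (unsigned long long)pde);
	printf("large-page mask = 0x%016llx\n", (unsigned long long)large);
	return (0);
}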
+
+static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *context = &vcpu->arch.mmu;
+
+ context->new_cr3 = nonpaging_new_cr3;
+ context->page_fault = tdp_page_fault;
+ context->free = nonpaging_free;
+ context->prefetch_page = nonpaging_prefetch_page;
+ context->sync_page = nonpaging_sync_page;
+ context->invlpg = nonpaging_invlpg;
+ context->shadow_root_level = kvm_x86_ops->get_tdp_level();
+ context->root_hpa = INVALID_PAGE;
+
+ if (!is_paging(vcpu)) {
+ context->gva_to_gpa = nonpaging_gva_to_gpa;
+ context->root_level = 0;
+ } else if (is_long_mode(vcpu)) {
+ reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
+ context->gva_to_gpa = paging64_gva_to_gpa;
+ context->root_level = PT64_ROOT_LEVEL;
+ } else if (is_pae(vcpu)) {
+ reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
+ context->gva_to_gpa = paging64_gva_to_gpa;
+ context->root_level = PT32E_ROOT_LEVEL;
+ } else {
+ reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
+ context->gva_to_gpa = paging32_gva_to_gpa;
+ context->root_level = PT32_ROOT_LEVEL;
+ }
+
+ return 0;
+}
+
+static int nonpaging_init_context(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *context = &vcpu->arch.mmu;
+
+ context->new_cr3 = nonpaging_new_cr3;
+ context->page_fault = nonpaging_page_fault;
+ context->gva_to_gpa = nonpaging_gva_to_gpa;
+ context->free = nonpaging_free;
+ context->prefetch_page = nonpaging_prefetch_page;
+ context->sync_page = nonpaging_sync_page;
+ context->invlpg = nonpaging_invlpg;
+ context->root_level = 0;
+ context->shadow_root_level = PT32E_ROOT_LEVEL;
+ context->root_hpa = INVALID_PAGE;
+ return 0;
+}
+
+static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
+{
+ struct kvm_mmu *context = &vcpu->arch.mmu;
+
+ ASSERT(is_pae(vcpu));
+ context->new_cr3 = paging_new_cr3;
+ context->page_fault = paging64_page_fault;
+ context->gva_to_gpa = paging64_gva_to_gpa;
+ context->prefetch_page = paging64_prefetch_page;
+ context->sync_page = paging64_sync_page;
+ context->invlpg = paging64_invlpg;
+ context->free = paging_free;
+ context->root_level = level;
+ context->shadow_root_level = level;
+ context->root_hpa = INVALID_PAGE;
+ return 0;
+}
+
+static int paging64_init_context(struct kvm_vcpu *vcpu)
+{
+ reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
+ return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
+}
+
+static int paging32_init_context(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *context = &vcpu->arch.mmu;
+
+ reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
+ context->new_cr3 = paging_new_cr3;
+ context->page_fault = paging32_page_fault;
+ context->gva_to_gpa = paging32_gva_to_gpa;
+ context->free = paging_free;
+ context->prefetch_page = paging32_prefetch_page;
+ context->sync_page = paging32_sync_page;
+ context->invlpg = paging32_invlpg;
+ context->root_level = PT32_ROOT_LEVEL;
+ context->shadow_root_level = PT32E_ROOT_LEVEL;
+ context->root_hpa = INVALID_PAGE;
+ return 0;
+}
+
+static int paging32E_init_context(struct kvm_vcpu *vcpu)
+{
+ reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
+ return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
+}
+
+static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
+{
+ int r;
+
+ ASSERT(vcpu);
+ ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+ if (!is_paging(vcpu))
+ r = nonpaging_init_context(vcpu);
+ else if (is_long_mode(vcpu))
+ r = paging64_init_context(vcpu);
+ else if (is_pae(vcpu))
+ r = paging32E_init_context(vcpu);
+ else
+ r = paging32_init_context(vcpu);
+
+ vcpu->arch.mmu.base_role.glevels = vcpu->arch.mmu.root_level;
+
+ return r;
+}
+
+int init_kvm_mmu(struct kvm_vcpu *vcpu)
{
vcpu->arch.update_pte.pfn = -1; /* bad_pfn */
#ifdef XXX
+ /*
+ * XXX We don't currently support two-dimensional paging, so the
+ * hardware will not translate guest-virtual to guest-physical and
+ * guest-physical to host-physical for us; we need to implement
+ * "shadow" paging instead.
+ */
+
if (tdp_enabled)
return init_kvm_tdp_mmu(vcpu);
else
+#endif
return init_kvm_softmmu(vcpu);
-#else
return 0;
-#endif /*XXX*/
}
int kvm_mmu_setup(struct kvm_vcpu *vcpu)
@@ -1196,6 +1652,11 @@ free_vcpu:
return r;
}
+void kvm_get_kvm(struct kvm *kvm)
+{
+ atomic_inc_32(&kvm->users_count);
+}
+
/*
* Creates some virtual cpus. Good luck creating more than one.
*/
@@ -1217,9 +1678,8 @@ kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int32_t id, struct kvm_vcpu_ioc *arg,
if (r)
return r;
-#ifdef NOTNOW
-
- mutex_lock(&kvm->lock);
+ mutex_enter(&kvm->lock);
+#ifdef XXX
if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
r = -EINVAL;
goto vcpu_destroy;
@@ -1233,30 +1693,33 @@ kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int32_t id, struct kvm_vcpu_ioc *arg,
BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]);
+#endif /*XXX*/
+
/* Now it's all set up, let userspace reach it */
kvm_get_kvm(kvm);
-#endif /*NOTNOW*/
- *rval_p = vcpuid++; /* guarantee unique id */
+
+ *rval_p = kvm->online_vcpus; /* guarantee unique id */
+ vcpu->vcpu_id = *rval_p;
/* XXX need to protect online_vcpus */
- kvm->vcpus[kvm->online_vcpus] = vcpu;
+ kvm->vcpus[kvm->online_vcpus] = vcpu;
-#ifdef NOTNOW
+#ifdef XXX
smp_wmb();
- atomic_inc(&kvm->online_vcpus);
+#endif /*XXX*/
+ atomic_inc_32(&kvm->online_vcpus);
#ifdef CONFIG_KVM_APIC_ARCHITECTURE
if (kvm->bsp_vcpu_id == id)
kvm->bsp_vcpu = vcpu;
#endif
- mutex_unlock(&kvm->lock);
-#endif /*NOTNOW*/
+ mutex_exit(&kvm->lock);
return r;
vcpu_destroy:
#ifdef NOTNOW
- mutex_unlock(&kvm->lock);
+ mutex_exit(&kvm->lock);
kvm_arch_vcpu_destroy(vcpu);
#endif /*NOTNOW*/
return r;
@@ -1281,7 +1744,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
unsigned long userspace_addr;
down_write(&current->mm->mmap_sem);
userspace_addr = do_mmap(NULL, 0,
- npages * PAGE_SIZE,
+ npages * PAGESIZE,
PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS,
0);
@@ -1330,6 +1793,240 @@ int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
return kvm_set_memory_region(kvm, mem, user_alloc);
}
+static inline struct kvm_coalesced_mmio_dev *to_mmio(struct kvm_io_device *dev)
+{
+ return container_of(dev, struct kvm_coalesced_mmio_dev, dev);
+}
+
+static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev,
+ gpa_t addr, int len)
+{
+ struct kvm_coalesced_mmio_zone *zone;
+ struct kvm_coalesced_mmio_ring *ring;
+ unsigned avail;
+ int i;
+
+ /* Can we batch it? */
+
+ /*
+ * last is the first free entry;
+ * check that we don't run into the first used entry.
+ * There is always one unused entry in the buffer.
+ */
+ ring = dev->kvm->coalesced_mmio_ring;
+ avail = (ring->first - ring->last - 1) % KVM_COALESCED_MMIO_MAX;
+ if (avail < KVM_MAX_VCPUS) {
+ /* full */
+ return 0;
+ }
+
+ /* Is it in a batchable area? */
+
+ for (i = 0; i < dev->nb_zones; i++) {
+ zone = &dev->zone[i];
+
+ /*
+ * (addr, len) must be fully contained in
+ * (zone->addr, zone->size).
+ */
+
+ if (zone->addr <= addr &&
+ addr + len <= zone->addr + zone->size)
+ return 1;
+ }
+ return 0;
+}
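The free-space test above leaves one slot permanently unused, so that first == last can mean "empty", and it refuses to batch once fewer than KVM_MAX_VCPUS slots remain, since each vcpu may be about to publish an entry. The standalone sketch below repeats the same ring arithmetic with assumed example sizes (the real values come from the KVM headers); RING_MAX is added before the modulo to keep the toy arithmetic well-defined for small unsigned values.

#include <stdio.h>

/*
 * Assumed example sizes; in the driver they come from the KVM headers
 * (the ring is sized to fill one page, KVM_MAX_VCPUS caps the vcpus).
 */
#define RING_MAX	170
#define MAX_VCPUS	64

/*
 * Free slots between the consumer cursor (first) and the producer
 * cursor (last); one slot always stays unused.
 */
static unsigned
ring_space(unsigned first, unsigned last)
{
	return ((first + RING_MAX - last - 1) % RING_MAX);
}

int
main(void)
{
	printf("empty ring: %u free slots\n", ring_space(0, 0));
	printf("one entry : %u free slots\n", ring_space(0, 1));
	/* Mirror the check above: stop batching below MAX_VCPUS free slots. */
	printf("still batchable? %s\n",
	    ring_space(0, 1) < MAX_VCPUS ? "no" : "yes");
	return (0);
}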
+
+/* Caller must hold slots_lock. */
+int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+ struct kvm_io_device *dev)
+{
+ struct kvm_io_bus *new_bus, *bus;
+
+ bus = kvm->buses[bus_idx];
+ if (bus->dev_count > NR_IOBUS_DEVS-1)
+ return -ENOSPC;
+
+ new_bus = kmem_zalloc(sizeof(struct kvm_io_bus), KM_SLEEP);
+ if (!new_bus)
+ return -ENOMEM;
+ memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
+ new_bus->devs[new_bus->dev_count++] = dev;
+#ifdef XXX
+ rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
+ synchronize_srcu_expedited(&kvm->srcu);
+#endif /*XXX*/
+ kmem_free(bus, sizeof(struct kvm_io_bus));
+
+ return 0;
+}
+
+/* Caller must hold slots_lock. */
+int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+ struct kvm_io_device *dev)
+{
+ int i, r;
+ struct kvm_io_bus *new_bus, *bus;
+
+ new_bus = kmem_zalloc(sizeof(struct kvm_io_bus), KM_SLEEP);
+ if (!new_bus)
+ return -ENOMEM;
+
+ bus = kvm->buses[bus_idx];
+ memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
+
+ r = -ENOENT;
+ for (i = 0; i < new_bus->dev_count; i++)
+ if (new_bus->devs[i] == dev) {
+ r = 0;
+ new_bus->devs[i] = new_bus->devs[--new_bus->dev_count];
+ break;
+ }
+
+ if (r) {
+ kmem_free(new_bus, sizeof(struct kvm_io_bus));
+ return r;
+ }
+
+#ifdef XXX
+ rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
+ synchronize_srcu_expedited(&kvm->srcu);
+#endif
+ kmem_free(bus, sizeof(struct kvm_io_bus));
+ return r;
+}
+
+static int coalesced_mmio_write(struct kvm_io_device *this,
+ gpa_t addr, int len, const void *val)
+{
+ struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
+ struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring;
+ if (!coalesced_mmio_in_range(dev, addr, len))
+ return -EOPNOTSUPP;
+
+ mutex_enter(&dev->lock);
+
+ /* copy the data into the first free entry of the ring */
+
+ ring->coalesced_mmio[ring->last].phys_addr = addr;
+ ring->coalesced_mmio[ring->last].len = len;
+ memcpy(ring->coalesced_mmio[ring->last].data, val, len);
+#ifdef XXX
+ smp_wmb();
+#endif /*XXX*/
+ ring->last = (ring->last + 1) % KVM_COALESCED_MMIO_MAX;
+ mutex_exit(&dev->lock);
+ return 0;
+}
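coalesced_mmio_write() is the producer side: under dev->lock it fills the entry at ring->last and advances last. The consumer (the userspace VMM) drains entries from ring->first up to ring->last and advances first. Below is a hedged, standalone sketch of that consumer loop using a mocked-up ring layout rather than the real structures from the KVM headers.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Mocked-up ring layout for illustration only. */
#define RING_MAX 8

struct mmio_entry {
	uint64_t phys_addr;
	uint32_t len;
	uint8_t  data[8];
};

struct mmio_ring {
	uint32_t first;			/* consumer cursor (userspace) */
	uint32_t last;			/* producer cursor (the driver) */
	struct mmio_entry ent[RING_MAX];
};

/* Drain everything the producer has published, oldest entry first. */
static void
drain_ring(struct mmio_ring *ring)
{
	while (ring->first != ring->last) {
		struct mmio_entry *e = &ring->ent[ring->first];

		printf("flush mmio write: addr=0x%llx len=%u\n",
		    (unsigned long long)e->phys_addr, (unsigned)e->len);
		ring->first = (ring->first + 1) % RING_MAX;
	}
}

int
main(void)
{
	struct mmio_ring ring;
	uint32_t val = 0xdeadbeef;

	memset(&ring, 0, sizeof (ring));

	/* Pretend the driver coalesced one 4-byte write, as in the code above. */
	ring.ent[ring.last].phys_addr = 0xfee00000ULL;
	ring.ent[ring.last].len = sizeof (val);
	memcpy(ring.ent[ring.last].data, &val, sizeof (val));
	ring.last = (ring.last + 1) % RING_MAX;

	drain_ring(&ring);
	return (0);
}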
+
+static void coalesced_mmio_destructor(struct kvm_io_device *this)
+{
+ struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
+
+ kmem_free(dev, sizeof(struct kvm_coalesced_mmio_dev));
+}
+
+static const struct kvm_io_device_ops coalesced_mmio_ops = {
+ .write = coalesced_mmio_write,
+ .destructor = coalesced_mmio_destructor,
+};
+
+int kvm_coalesced_mmio_init(struct kvm *kvm)
+{
+ struct kvm_coalesced_mmio_dev *dev;
+ caddr_t page;
+ int ret;
+
+ ret = -ENOMEM;
+ page = kmem_zalloc(PAGESIZE, KM_SLEEP);
+ if (!page)
+ goto out_err;
+ kvm->coalesced_mmio_ring = (struct kvm_coalesced_mmio_ring *)page;
+
+ ret = -ENOMEM;
+ dev = kmem_zalloc(sizeof(struct kvm_coalesced_mmio_dev), KM_SLEEP);
+ if (!dev)
+ goto out_free_page;
+ mutex_init(&dev->lock, NULL, MUTEX_DRIVER, 0);
+ kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops);
+ dev->kvm = kvm;
+ kvm->coalesced_mmio_dev = dev;
+
+ mutex_enter(&kvm->slots_lock);
+ ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &dev->dev);
+ mutex_exit(&kvm->slots_lock);
+ if (ret < 0)
+ goto out_free_dev;
+
+ return ret;
+
+out_free_dev:
+ kmem_free(dev, sizeof(struct kvm_coalesced_mmio_dev));
+out_free_page:
+ kmem_free(page, PAGESIZE);
+out_err:
+ return ret;
+}
+
+void kvm_coalesced_mmio_free(struct kvm *kvm)
+{
+ if (kvm->coalesced_mmio_ring)
+ kmem_free(kvm->coalesced_mmio_ring, PAGESIZE);
+}
+
+int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm,
+ struct kvm_coalesced_mmio_zone *zone)
+{
+ struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev;
+
+ if (dev == NULL)
+ return -EINVAL;
+
+ mutex_enter(&kvm->slots_lock);
+ if (dev->nb_zones >= KVM_COALESCED_MMIO_ZONE_MAX) {
+ mutex_exit(&kvm->slots_lock);
+ return -ENOBUFS;
+ }
+
+ dev->zone[dev->nb_zones] = *zone;
+ dev->nb_zones++;
+
+ mutex_exit(&kvm->slots_lock);
+ return 0;
+}
+
+int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,
+ struct kvm_coalesced_mmio_zone *zone)
+{
+ int i;
+ struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev;
+ struct kvm_coalesced_mmio_zone *z;
+
+ if (dev == NULL)
+ return -EINVAL;
+
+ mutex_enter(&kvm->slots_lock);
+
+ i = dev->nb_zones;
+ while (i) {
+ z = &dev->zone[i - 1];
+
+ /*
+ * Unregister every zone that is fully contained
+ * in (zone->addr, zone->size).
+ */
+
+ if (zone->addr <= z->addr &&
+ z->addr + z->size <= zone->addr + zone->size) {
+ dev->nb_zones--;
+ *z = dev->zone[dev->nb_zones];
+ }
+ i--;
+ }
+
+ mutex_exit(&kvm->slots_lock);
+
+ return 0;
+}
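Userspace drives these two handlers through the KVM_REGISTER_COALESCED_MMIO and KVM_UNREGISTER_COALESCED_MMIO vm ioctls. The sketch below shows a registration call against the standard KVM ABI; the header providing the zone struct and ioctl numbers is an assumption (shown here as the Linux <linux/kvm.h>), since this port's userspace headers are not part of this patch.

#include <sys/ioctl.h>
#include <stdio.h>
#include <linux/kvm.h>	/* assumed source of the zone struct and ioctl numbers */

/*
 * Ask the kernel to coalesce MMIO writes into [addr, addr + size);
 * the legacy VGA window is used purely as an example zone.
 */
static int
coalesce_vga_window(int vm_fd)
{
	struct kvm_coalesced_mmio_zone zone = {
		.addr = 0xa0000,
		.size = 0x20000,
	};

	if (ioctl(vm_fd, KVM_REGISTER_COALESCED_MMIO, &zone) < 0) {
		perror("KVM_REGISTER_COALESCED_MMIO");
		return (-1);
	}
	return (0);
}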
+
long
kvm_vm_ioctl(struct kvm *kvmp, unsigned int ioctl, unsigned long arg, int mode)
{
@@ -1358,6 +2055,7 @@ kvm_vm_ioctl(struct kvm *kvmp, unsigned int ioctl, unsigned long arg, int mode)
goto out;
break;
}
+#endif /*NOTNOW*/
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
case KVM_REGISTER_COALESCED_MMIO: {
@@ -1385,13 +2083,12 @@ kvm_vm_ioctl(struct kvm *kvmp, unsigned int ioctl, unsigned long arg, int mode)
break;
}
#endif
-
+#ifdef XXX
case KVM_IRQFD: {
struct kvm_irqfd data;
- r = -EFAULT;
- if (copy_from_user(&data, argp, sizeof data))
- goto out;
+ if (ddi_copyin(argp, &data, sizeof data, mode))
+ return (EFAULT);
r = kvm_irqfd(kvmp, data.fd, data.gsi, data.flags);
break;
}
@@ -1404,24 +2101,21 @@ kvm_vm_ioctl(struct kvm *kvmp, unsigned int ioctl, unsigned long arg, int mode)
r = kvm_ioeventfd(kvmp, &data);
break;
}
-#endif /*NOTNOW*/
+
#ifdef CONFIG_KVM_APIC_ARCHITECTURE
case KVM_SET_BOOT_CPU_ID:
r = 0;
- mutex_lock(&kvmp->lock);
+ mutex_enter(&kvmp->lock);
if (atomic_read(&kvmp->online_vcpus) != 0)
r = -EBUSY;
else
kvmp->bsp_vcpu_id = arg;
- mutex_unlock(&kvmp->lock);
+ mutex_exit(&kvmp->lock);
break;
#endif
-#ifdef NOTNOW
+#endif /*XXX*/
default:
- r = kvm_arch_vm_ioctl(filp, ioctl, arg);
- if (r == -ENOTTY)
- r = kvm_vm_ioctl_assigned_device(kvmp, ioctl, arg);
-#endif /*NOTNOW*/
+ return EINVAL;
}
out:
diff --git a/msr.h b/msr.h
index df3f675..22c0b07 100644
--- a/msr.h
+++ b/msr.h
@@ -12,7 +12,7 @@
#ifdef _KERNEL
#include "asm.h"
-
+#include <sys/ontrap.h>
#include <sys/errno.h>
#ifdef XXX
@@ -76,21 +76,11 @@ static inline unsigned long long native_read_msr(unsigned int msr)
}
-static inline unsigned long long native_read_msr_safe(unsigned int msr,
- int *err)
-{
- DECLARE_ARGS(val, low, high);
+extern uint64_t native_read_msr_safe(unsigned int msr,
+ int *err);
+extern int native_write_msr_safe(unsigned int msr,
+ unsigned low, unsigned high);
- asm volatile("2: rdmsr ; xor %[err],%[err]\n"
- "1:\n\t"
- ".section .fixup,\"ax\"\n\t"
- "3: mov %[fault],%[err] ; jmp 1b\n\t"
- ".previous\n\t"
- _ASM_EXTABLE(2b, 3b)
- : [err] "=r" (*err), EAX_EDX_RET(val, low, high)
- : "c" (msr), [fault] "i" (-EIO));
- return EAX_EDX_VAL(val, low, high);
-}
static inline void native_write_msr(unsigned int msr,
unsigned low, unsigned high)
@@ -98,23 +88,6 @@ static inline void native_write_msr(unsigned int msr,
asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory");
}
-/* Can be uninlined because referenced by paravirt */
-static inline int native_write_msr_safe(unsigned int msr,
- unsigned low, unsigned high)
-{
- int err;
- asm volatile("2: wrmsr ; xor %[err],%[err]\n"
- "1:\n\t"
- ".section .fixup,\"ax\"\n\t"
- "3: mov %[fault],%[err] ; jmp 1b\n\t"
- ".previous\n\t"
- _ASM_EXTABLE(2b, 3b)
- : [err] "=a" (err)
- : "c" (msr), "0" (low), "d" (high),
- [fault] "i" (-EIO)
- : "memory");
- return err;
-}
extern unsigned long long native_read_tsc(void);
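With the inline asm removed, the safe MSR accessors must be supplied out of line (hence the new <sys/ontrap.h> include above). Below is a hedged sketch of what such an implementation could look like on Solaris; it assumes on_trap(9F) with OT_DATA_ACCESS absorbs the #GP raised by reading an unimplemented MSR and that rdmsr() is available from the platform headers. None of this is taken from the patch itself.

#include <sys/types.h>
#include <sys/ontrap.h>
#include <sys/x86_archext.h>	/* assumed to declare rdmsr() */
#include <sys/errno.h>

uint64_t
native_read_msr_safe(unsigned int msr, int *err)
{
	on_trap_data_t otd;
	uint64_t val = 0;

	/* Catch the trap a bad MSR would otherwise panic the kernel with. */
	if (on_trap(&otd, OT_DATA_ACCESS) == 0) {
		val = rdmsr(msr);
		*err = 0;
	} else {
		*err = -EIO;	/* matches what the old inline asm returned */
	}
	no_trap();

	return (val);
}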