author     max <max@maxpad.(none)>    2010-12-03 19:19:31 +0100
committer  max <max@maxpad.(none)>    2010-12-03 19:19:31 +0100
commit     aaf4078a2967dbd67bf0efad9c3f4b81ab35e665 (patch)
tree       5bfa0a8d72f2fa2b5c8f3b38880e0d3eb5ce01d4
parent     00233f503e3241dd6361421e306acbba7454c99f (diff)
download   illumos-kvm-aaf4078a2967dbd67bf0efad9c3f4b81ab35e665.tar.gz
Lots of new code, and lots of code turned on. Doesn't compile...
-rw-r--r--  Makefile       13
-rw-r--r--  kvm.c        6799
-rw-r--r--  kvm.h         683
-rw-r--r--  kvm_host.h   1201
-rw-r--r--  kvm_x86.c    1144
-rw-r--r--  msr.h          37

6 files changed, 8115 insertions, 1762 deletions
@@ -9,16 +9,19 @@ LD=/usr/bin/ld CTFCONVERT=$(KERNEL_SOURCE)/usr/src/tools/proto/opt/onbld/bin/i386/ctfconvert CTFMERGE=$(KERNEL_SOURCE)/usr/src/tools/proto/opt/onbld/bin/i386/ctfmerge -CFLAGS += -D_KERNEL -D_MACHDEP -Dx86 -DCONFIG_X86_64 -DDEBUG -c -O -g -INCLUDEDIR= -I $(KERNEL_SOURCE)/usr/src/uts/intel -I $(KERNEL_SOURCE)/usr/src/uts/i86pc +CFLAGS += -D_KERNEL -D_MACHDEP -Dx86 -DCONFIG_X86_64 -DDEBUG -c -g -DCONFIG_SOLARIS -DCONFIG_KVM_MMIO -kvm: kvm.c kvm_x86.c kvm.h +INCLUDEDIR= -I $(KERNEL_SOURCE)/usr/src/uts/intel -I $(KERNEL_SOURCE)/usr/src/uts/i86pc -I $(KERNEL_SOURCE)/usr/src/uts/common + +kvm: kvm.c kvm_x86.c emulate.c kvm.h kvm_x86host.h $(CC) $(CFLAGS) $(INCLUDEDIR) kvm.c $(CC) $(CFLAGS) $(INCLUDEDIR) kvm_x86.c + $(CC) $(CFLAGS) $(INCLUDEDIR) emulate.c $(CTFCONVERT) -i -L VERSION kvm.o $(CTFCONVERT) -i -L VERSION kvm_x86.o - $(LD) -r -o kvm kvm.o kvm_x86.o - $(CTFMERGE) -L VERSION -o kvm kvm.o kvm_x86.o + $(CTFCONVERT) -i -L VERSION emulate.o + $(LD) -r -o kvm kvm.o kvm_x86.o emulate.o + $(CTFMERGE) -L VERSION -o kvm kvm.o kvm_x86.o emulate.o install: kvm @echo "==> Installing kvm module" @@ -24,7 +24,15 @@ #include "msr.h" #include "irqflags.h" #include "kvm_host.h" +#include "kvm_x86host.h" +#include "processor-flags.h" +#include "hyperv.h" +#include "apicdef.h" +#include "segment.h" +#include "iodev.h" #include "kvm.h" +#include "irq.h" +#include "tss.h" int kvmid; /* monotonically increasing, unique per vm */ int largepages_enabled = 1; @@ -126,6 +134,69 @@ extern void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); static int vmx_set_tss_addr(struct kvm *kvmp, uintptr_t addr); static int vmx_hardware_setup(void); extern int vmx_hardware_enable(void *garbage); +extern unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu); +void vmcs_writel(unsigned long field, unsigned long value); +unsigned long vmcs_readl(unsigned long field); +extern void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); +void vmx_get_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg); +static void vmx_set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg); +static void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr); +static int vmx_get_msr(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t *pdata); +static int vmx_set_msr(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t data); +static void vmx_vcpu_run(struct kvm_vcpu *vcpu); +static void vmx_save_host_state(struct kvm_vcpu *vcpu); + +struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) +{ + return container_of(vcpu, struct vcpu_vmx, vcpu); +} + +static int vmx_handle_exit(struct kvm_vcpu *vcpu); +int vmx_interrupt_allowed(struct kvm_vcpu *vcpu); +static int vmx_get_lpage_level(void); +static int vmx_rdtscp_supported(void); +void vmx_set_efer(struct kvm_vcpu *vcpu, uint64_t efer); +static uint64_t vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg); +static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt); +static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt); +static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt); +static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt); +static int vmx_get_cpl(struct kvm_vcpu *vcpu); +int get_ept_level(void); + +static void vmx_flush_tlb(struct kvm_vcpu *vcpu) +{ +#ifdef XXX + vpid_sync_vcpu_all(to_vmx(vcpu)); + if (enable_ept) + ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); +#endif +} + +static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +{ 
+ unsigned long guest_cr3; + uint64_t eptp; + + guest_cr3 = cr3; +#ifdef XXX + if (enable_ept) { + /* + * ept not implemented right now... + */ + eptp = construct_eptp(cr3); + vmcs_write64(EPT_POINTER, eptp); + guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 : + vcpu->kvm->arch.ept_identity_map_addr; + ept_load_pdptrs(vcpu); + } +#endif /*XXX*/ + + vmx_flush_tlb(vcpu); + vmcs_writel(GUEST_CR3, guest_cr3); +} static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = nulldev/*cpu_has_kvm_support*/, @@ -141,38 +212,38 @@ static struct kvm_x86_ops vmx_x86_ops = { .vcpu_free = nulldev /*vmx_free_vcpu*/, .vcpu_reset = vmx_vcpu_reset, - .prepare_guest_switch = nulldev /*vmx_save_host_state*/, + .prepare_guest_switch = vmx_save_host_state /*vmx_save_host_state*/, .vcpu_load = vmx_vcpu_load, .vcpu_put = vmx_vcpu_put, .set_guest_debug = nulldev /*set_guest_debug*/, - .get_msr = nulldev /*vmx_get_msr*/, - .set_msr = nulldev /*vmx_set_msr*/, - .get_segment_base = nulldev /*vmx_get_segment_base*/, - .get_segment = nulldev /*vmx_get_segment*/, - .set_segment = nulldev /*vmx_set_segment*/, - .get_cpl = nulldev /*vmx_get_cpl*/, + .get_msr = vmx_get_msr /*vmx_get_msr*/, + .set_msr = vmx_set_msr /*vmx_set_msr*/, + .get_segment_base = vmx_get_segment_base /*vmx_get_segment_base*/, + .get_segment = vmx_get_segment /*vmx_get_segment*/, + .set_segment = vmx_set_segment /*vmx_set_segment*/, + .get_cpl = vmx_get_cpl /*vmx_get_cpl*/, .get_cs_db_l_bits = nulldev /*vmx_get_cs_db_l_bits*/, .decache_cr0_guest_bits = nulldev /*vmx_decache_cr0_guest_bits*/, .decache_cr4_guest_bits = nulldev /*vmx_decache_cr4_guest_bits*/, .set_cr0 = vmx_set_cr0, - .set_cr3 = nulldev /*vmx_set_cr3*/, + .set_cr3 = vmx_set_cr3 /*vmx_set_cr3*/, .set_cr4 = vmx_set_cr4, - .set_efer = nulldev /*vmx_set_efer*/, - .get_idt = nulldev /*vmx_get_idt*/, - .set_idt = nulldev /*vmx_set_idt*/, - .get_gdt = nulldev /*vmx_get_gdt*/, - .set_gdt = nulldev /*vmx_set_gdt*/, + .set_efer = vmx_set_efer /*vmx_set_efer*/, + .get_idt = vmx_get_idt /*vmx_get_idt*/, + .set_idt = vmx_set_idt /*vmx_set_idt*/, + .get_gdt = vmx_get_gdt /*vmx_get_gdt*/, + .set_gdt = vmx_set_gdt /*vmx_set_gdt*/, .cache_reg = nulldev /*vmx_cache_reg*/, - .get_rflags = nulldev /*vmx_get_rflags*/, - .set_rflags = nulldev /*vmx_set_rflags*/, + .get_rflags = vmx_get_rflags /*vmx_get_rflags*/, + .set_rflags = vmx_set_rflags /*vmx_set_rflags*/, .fpu_activate = nulldev /*vmx_fpu_activate*/, .fpu_deactivate = nulldev /*vmx_fpu_deactivate*/, .tlb_flush = nulldev /*vmx_flush_tlb*/, - .run = nulldev /*vmx_vcpu_run*/, - .handle_exit = nulldev /*vmx_handle_exit*/, + .run = vmx_vcpu_run /*vmx_vcpu_run*/, + .handle_exit = vmx_handle_exit /*vmx_handle_exit*/, .skip_emulated_instruction = nulldev /*skip_emulated_instruction*/, .set_interrupt_shadow = nulldev /*vmx_set_interrupt_shadow*/, .get_interrupt_shadow = nulldev /*vmx_get_interrupt_shadow*/, @@ -180,28 +251,62 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_irq = nulldev /*vmx_inject_irq*/, .set_nmi = nulldev /*vmx_inject_nmi*/, .queue_exception = nulldev /*vmx_queue_exception*/, - .interrupt_allowed = nulldev /*vmx_interrupt_allowed*/, + .interrupt_allowed = vmx_interrupt_allowed /*vmx_interrupt_allowed*/, .nmi_allowed = nulldev /*vmx_nmi_allowed*/, .get_nmi_mask = nulldev /*vmx_get_nmi_mask*/, .set_nmi_mask = nulldev /*vmx_set_nmi_mask*/, .enable_nmi_window = nulldev /*enable_nmi_window*/, .enable_irq_window = nulldev /*enable_irq_window*/, - .update_cr8_intercept = nulldev /*update_cr8_intercept*/, + .update_cr8_intercept = 
vmx_update_cr8_intercept /*update_cr8_intercept*/, .set_tss_addr = vmx_set_tss_addr, - .get_tdp_level = nulldev /*get_ept_level*/, + .get_tdp_level = get_ept_level /*get_ept_level*/, .get_mt_mask = nulldev /*vmx_get_mt_mask*/, .exit_reasons_str = nulldev /*vmx_exit_reasons_str*/, - .get_lpage_level = nulldev /*vmx_get_lpage_level*/, + .get_lpage_level = vmx_get_lpage_level /*vmx_get_lpage_level*/, .cpuid_update = nulldev /*vmx_cpuid_update*/, - .rdtscp_supported = nulldev /*vmx_rdtscp_supported*/, + .rdtscp_supported = vmx_rdtscp_supported /*vmx_rdtscp_supported*/, }; struct kvm_x86_ops *kvm_x86_ops; +uint32_t vmcs_read32(unsigned long field) +{ + return vmcs_readl(field); +} + +void vmcs_write32(unsigned long field, uint32_t value) +{ + vmcs_writel(field, value); +} + +static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +{ + dt->limit = vmcs_read32(GUEST_IDTR_LIMIT); + dt->base = vmcs_readl(GUEST_IDTR_BASE); +} + +static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +{ + vmcs_write32(GUEST_IDTR_LIMIT, dt->limit); + vmcs_writel(GUEST_IDTR_BASE, dt->base); +} + +static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +{ + dt->limit = vmcs_read32(GUEST_GDTR_LIMIT); + dt->base = vmcs_readl(GUEST_GDTR_BASE); +} + +static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +{ + vmcs_write32(GUEST_GDTR_LIMIT, dt->limit); + vmcs_writel(GUEST_GDTR_BASE, dt->base); +} + /* * In linux, there is a separate vmx kernel module from the kvm driver. * That may be a good idea, but we're going to do everything in @@ -212,8 +317,8 @@ struct kvm_x86_ops *kvm_x86_ops; struct vmcs **vmxarea; /* 1 per cpu */ -static int alloc_kvm_area(void) -{ +static int alloc_kvm_area(void){ + int i, j; /* @@ -244,6 +349,39 @@ static int alloc_kvm_area(void) extern struct vmcs_config vmcs_config; +static int adjust_vmx_controls(uint32_t ctl_min, uint32_t ctl_opt, + uint32_t msr, uint32_t *result) +{ + uint32_t vmx_msr_low, vmx_msr_high; + uint32_t ctl = ctl_min | ctl_opt; + + rdmsr(msr, vmx_msr_low, vmx_msr_high); + + ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ + ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ + + /* Ensure minimum (required) set of control bits are supported. 
*/ + if (ctl_min & ~ctl) + return EIO; + + *result = ctl; + return DDI_SUCCESS; +} + +/* Pure 2^n version of get_order */ +static inline int get_order(unsigned long size) +{ + int order; + + size = (size - 1) >> (PAGESHIFT - 1); + order = -1; + do { + size >>= 1; + order++; + } while (size); + return order; +} + static int setup_vmcs_config(struct vmcs_config *vmcs_conf) { uint32_t vmx_msr_low, vmx_msr_high; @@ -254,12 +392,11 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf) uint32_t _vmexit_control = 0; uint32_t _vmentry_control = 0; -#ifdef XXX min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; opt = PIN_BASED_VIRTUAL_NMIS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, - &_pin_based_exec_control) < 0) - return -EIO; + &_pin_based_exec_control) != DDI_SUCCESS) + return EIO; min = CPU_BASED_HLT_EXITING | #ifdef CONFIG_X86_64 @@ -278,8 +415,8 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf) CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, - &_cpu_based_exec_control) < 0) - return -EIO; + &_cpu_based_exec_control) != DDI_SUCCESS) + return EIO; #ifdef CONFIG_X86_64 if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING & @@ -296,8 +433,8 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_RDTSCP; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, - &_cpu_based_2nd_exec_control) < 0) - return -EIO; + &_cpu_based_2nd_exec_control) != DDI_SUCCESS) + return EIO; } #ifndef CONFIG_X86_64 if (!(_cpu_based_2nd_exec_control & @@ -320,15 +457,14 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf) #endif opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, - &_vmexit_control) < 0) - return -EIO; + &_vmexit_control) != DDI_SUCCESS) + return EIO; min = 0; opt = VM_ENTRY_LOAD_IA32_PAT; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, - &_vmentry_control) < 0) - return -EIO; -#endif /*XXX*/ + &_vmentry_control) != DDI_SUCCESS) + return EIO; rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); @@ -347,29 +483,100 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf) return EIO; vmcs_conf->size = vmx_msr_high & 0x1fff; -#ifdef XXX vmcs_conf->order = get_order(vmcs_config.size); -#endif vmcs_conf->revision_id = vmx_msr_low; -#ifdef XXX vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; vmcs_conf->vmexit_ctrl = _vmexit_control; vmcs_conf->vmentry_ctrl = _vmentry_control; -#endif + return 0; } +/* EFER defaults: + * - enable syscall per default because its emulated by KVM + * - enable LME and LMA per default on 64 bit KVM + */ +#ifdef CONFIG_X86_64 +static uint64_t efer_reserved_bits = 0xfffffffffffffafeULL; +#else +static uint64_t efer_reserved_bits = 0xfffffffffffffffeULL; +#endif + +static int bypass_guest_pf = 1; +int enable_vpid = 1; +static int flexpriority_enabled = 1; +int enable_ept = 0; +int enable_unrestricted_guest = 1; +int emulate_invalid_guest_state = 0; + +void kvm_enable_efer_bits(uint64_t mask) +{ + efer_reserved_bits &= ~mask; +} + +static inline int cpu_has_vmx_vpid(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENABLE_VPID; +} + +static inline int cpu_has_vmx_ept(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + 
SECONDARY_EXEC_ENABLE_EPT; +} +static inline int cpu_has_vmx_unrestricted_guest(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_UNRESTRICTED_GUEST; +} + +static inline int cpu_has_vmx_tpr_shadow(void) +{ + return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW; +} + +static inline int cpu_has_vmx_virtualize_apic_accesses(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; +} + +static inline int cpu_has_vmx_flexpriority(void) +{ + return cpu_has_vmx_tpr_shadow() && + cpu_has_vmx_virtualize_apic_accesses(); +} + +static inline int cpu_has_vmx_ept_2m_page(void) +{ + return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT); +} + +void kvm_disable_largepages(void) +{ + largepages_enabled = 0; +} + +static inline int cpu_has_vmx_ple(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_PAUSE_LOOP_EXITING; +} + static int vmx_hardware_setup(void) { - if (setup_vmcs_config(&vmcs_config) < 0) + if (setup_vmcs_config(&vmcs_config) != DDI_SUCCESS) return EIO; #ifdef XXX if (boot_cpu_has(X86_FEATURE_NX)) kvm_enable_efer_bits(EFER_NX); +#endif /*XXX*/ + if (!cpu_has_vmx_vpid()) enable_vpid = 0; @@ -390,10 +597,11 @@ static int vmx_hardware_setup(void) if (enable_ept && !cpu_has_vmx_ept_2m_page()) kvm_disable_largepages(); - +#ifdef XXX if (!cpu_has_vmx_ple()) ple_gap = 0; -#endif /*XXX*/ +#endif + return alloc_kvm_area(); } @@ -402,42 +610,520 @@ int kvm_arch_hardware_setup(void) return kvm_x86_ops->hardware_setup(); } -int kvm_mmu_module_init(void) +struct kmem_cache *pte_chain_cache; +struct kmem_cache *rmap_desc_cache; +struct kmem_cache *mmu_page_header_cache; + +int tdp_enabled = 0; + +#define PT_WRITABLE_SHIFT 1 +#define PT_PRESENT_MASK (1ULL << 0) +#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(uint64_t)(PAGESIZE-1)) +#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT) +#define PT_USER_MASK (1ULL << 2) +#define ACC_EXEC_MASK 1 +#define ACC_WRITE_MASK PT_WRITABLE_MASK +#define ACC_USER_MASK PT_USER_MASK +#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) + +static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, + size_t size) { + void *p; + + p = mc->objects[--mc->nobjs]; + return p; +} + +static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, + uint64_t *parent_pte) +{ + struct kvm_mmu_page *sp; + + sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp); + sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGESIZE); + sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGESIZE); + set_page_private(virt_to_page(sp->spt), (unsigned long)sp); + list_insert_head(&vcpu->kvm->arch.active_mmu_pages, sp); #ifdef XXX + /* XXX don't see this used anywhere */ + INIT_LIST_HEAD(&sp->oos_link); +#endif /*XXX*/ + bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); + sp->multimapped = 0; + sp->parent_pte = parent_pte; + --vcpu->kvm->arch.n_free_mmu_pages; + return sp; +} + +typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp); + +struct kvm_mmu_page * +shadow_hpa_to_kvmpage(hpa_t shadow_page) +{ + /* + * XXX - We'll probably need a faster way to do this... 
+ * For right now, search all kvm_mmu_page for matching hpa + */ + +} + +struct kvm_mmu_page * +page_header(hpa_t shadow_page) +{ + return (struct kvm_mmu_page *)shadow_hpa_to_kvmpage(shadow_page); +} + +static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + mmu_parent_walk_fn fn) +{ + struct kvm_pte_chain *pte_chain; + struct hlist_node *node; + struct kvm_mmu_page *parent_sp; + int i; + + if (!sp->multimapped && sp->parent_pte) { + parent_sp = page_header(__pa(sp->parent_pte)); + fn(vcpu, parent_sp); + mmu_parent_walk(vcpu, parent_sp, fn); + return; + } + for(pte_chain = list_head(sp->parent_ptes); pte_chain; + pte_chain = list_next(sp->parent_ptes, pte_chain)) { + for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { + if (!pte_chain->parent_ptes[i]) + break; + parent_sp = page_header(__pa(pte_chain->parent_ptes[i])); + fn(vcpu, parent_sp); + mmu_parent_walk(vcpu, parent_sp, fn); + } + } +} + +static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp) +{ + mmu_parent_walk(vcpu, sp, unsync_walk_fn); + kvm_mmu_update_parents_unsync(sp); +} + +static unsigned kvm_page_table_hashfn(gfn_t gfn) +{ + return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1); +} + +static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, + gfn_t gfn, + gva_t gaddr, + unsigned level, + int direct, + unsigned access, + uint64_t *parent_pte) +{ + union kvm_mmu_page_role role; + unsigned index; + unsigned quadrant; + struct hlist_head *bucket; + struct kvm_mmu_page *sp; + struct hlist_node *node, *tmp; + + role = vcpu->arch.mmu.base_role; + role.level = level; + role.direct = direct; + role.access = access; + if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { + quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); + quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; + role.quadrant = quadrant; + } + index = kvm_page_table_hashfn(gfn); + bucket = &vcpu->kvm->arch.mmu_page_hash[index]; + for (sp = list_head(&vcpu->kvm->arch.mmu_page_hash[index]); sp; + sp = list_next(&vcpu->kvm->arch.mmu_page_hash[index], sp)) { + if (sp->gfn == gfn) { + if (sp->unsync) + if (kvm_sync_page(vcpu, sp)) + continue; + + if (sp->role.word != role.word) + continue; + + mmu_page_add_parent_pte(vcpu, sp, parent_pte); + if (sp->unsync_children) { + BT_SET(&vcpu->requests, KVM_REQ_MMU_SYNC); + kvm_mmu_mark_parents_unsync(vcpu, sp); + } + return sp; + } + } +#ifdef XXX + ++vcpu->kvm->stat.mmu_cache_miss; +#endif + sp = kvm_mmu_alloc_page(vcpu, parent_pte); + if (!sp) + return sp; + sp->gfn = gfn; + sp->role = role; + list_insert_head(bucket, &sp); + if (!direct) { + if (rmap_write_protect(vcpu->kvm, gfn)) + kvm_flush_remote_tlbs(vcpu->kvm); +#ifdef XXX + account_shadowed(vcpu->kvm, gfn); +#endif /*XXX*/ + } + if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) + vcpu->arch.mmu.prefetch_page(vcpu, sp); + else + nonpaging_prefetch_page(vcpu, sp); +#ifdef XXX + trace_kvm_mmu_get_page(sp, true); +#endif /*XXX*/ + return sp; +} + +static int mmu_alloc_roots(struct kvm_vcpu *vcpu) +{ + int i; + gfn_t root_gfn; + struct kvm_mmu_page *sp; + int direct = 0; + uint64_t pdptr; + + root_gfn = vcpu->arch.cr3 >> PAGESHIFT; + + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + hpa_t root = vcpu->arch.mmu.root_hpa; + + ASSERT(!VALID_PAGE(root)); + if (tdp_enabled) + direct = 1; + if (mmu_check_root(vcpu, root_gfn)) + return 1; + sp = kvm_mmu_get_page(vcpu, root_gfn, 0, + PT64_ROOT_LEVEL, direct, + ACC_ALL, NULL); + root = kvm_va2pa(sp->spt); + ++sp->root_count; + 
vcpu->arch.mmu.root_hpa = root; + return 0; + } + direct = !is_paging(vcpu); + if (tdp_enabled) + direct = 1; + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->arch.mmu.pae_root[i]; + + ASSERT(!VALID_PAGE(root)); + if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { + pdptr = kvm_pdptr_read(vcpu, i); + if (!is_present_gpte(pdptr)) { + vcpu->arch.mmu.pae_root[i] = 0; + continue; + } + root_gfn = pdptr >> PAGESHIFT; + } else if (vcpu->arch.mmu.root_level == 0) + root_gfn = 0; + if (mmu_check_root(vcpu, root_gfn)) + return 1; + sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, + PT32_ROOT_LEVEL, direct, + ACC_ALL, NULL); + root = __pa(sp->spt); + ++sp->root_count; + vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; + } + vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); + return 0; +} + +static void mmu_sync_roots(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_mmu_page *sp; + + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return; + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + hpa_t root = vcpu->arch.mmu.root_hpa; + sp = page_header(root); + mmu_sync_children(vcpu, sp); + return; + } + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->arch.mmu.pae_root[i]; + + if (root && VALID_PAGE(root)) { + root &= PT64_BASE_ADDR_MASK; + sp = page_header(root); + mmu_sync_children(vcpu, sp); + } + } +} + +void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) +{ + spin_lock(&vcpu->kvm->mmu_lock); + mmu_sync_roots(vcpu); + spin_unlock(&vcpu->kvm->mmu_lock); +} + +static void mmu_destroy_caches(void) +{ + if (pte_chain_cache) + kmem_cache_destroy(pte_chain_cache); + if (rmap_desc_cache) + kmem_cache_destroy(rmap_desc_cache); + if (mmu_page_header_cache) + kmem_cache_destroy(mmu_page_header_cache); +} + +int +zero_constructor(void *buf, void *arg, int tags) +{ + bzero(buf, (size_t)arg); +} + +int kvm_mmu_module_init(void) +{ pte_chain_cache = kmem_cache_create("kvm_pte_chain", - sizeof(struct kvm_pte_chain), - 0, 0, NULL); + sizeof(struct kvm_pte_chain), 0, + zero_constructor, NULL, NULL, + sizeof(struct kvm_pte_chain), NULL, 0); if (!pte_chain_cache) goto nomem; rmap_desc_cache = kmem_cache_create("kvm_rmap_desc", - sizeof(struct kvm_rmap_desc), - 0, 0, NULL); + sizeof(struct kvm_rmap_desc), 0, + zero_constructor, NULL, NULL, + sizeof(struct kvm_rmap_desc), NULL, 0); if (!rmap_desc_cache) goto nomem; mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", - sizeof(struct kvm_mmu_page), - 0, 0, NULL); + sizeof(struct kvm_mmu_page), 0, + zero_constructor, NULL, NULL, + sizeof(struct kvm_mmu_page), NULL, 0); if (!mmu_page_header_cache) goto nomem; +#ifdef XXX + /* this looks like a garbage collector/reaper. Implement later if needed */ register_shrinker(&mmu_shrinker); +#endif /*XXX*/ return 0; nomem: mmu_destroy_caches(); - return -ENOMEM; + return ENOMEM; +} + +/* + * List of msr numbers which we expose to userspace through KVM_GET_MSRS + * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. + * + * This list is modified at module load time to reflect the + * capabilities of the host cpu. This capabilities test skips MSRs that are + * kvm-specific. Those are put in the beginning of the list. 
+ */ + +#define MSR_KVM_WALL_CLOCK 0x11 +#define MSR_KVM_SYSTEM_TIME 0x12 + +#define KVM_SAVE_MSRS_BEGIN 5 +static uint32_t msrs_to_save[] = { + MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, + HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, + HV_X64_MSR_APIC_ASSIST_PAGE, + MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, + MSR_K6_STAR, +#ifdef CONFIG_X86_64 + MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, +#endif + MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA +}; + +static unsigned num_msrs_to_save; + +static uint32_t emulated_msrs[] = { + MSR_IA32_MISC_ENABLE, +}; + +#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0])) + +uint64_t native_read_msr_safe(unsigned int msr, + int *err) +{ + DECLARE_ARGS(val, low, high); + +#ifdef CONFIG_SOLARIS + { + on_trap_data_t otd; + + if (on_trap(&otd, OT_DATA_ACCESS) == 0) { + native_read_msr(msr); + } else { + *err = EINVAL; /* XXX probably not right... */ + } + no_trap(); + } #else - return DDI_SUCCESS; -#endif /*XXX*/ + asm volatile("2: rdmsr ; xor %[err],%[err]\n" + "1:\n\t" + ".section .fixup,\"ax\"\n\t" + "3: mov %[fault],%[err] ; jmp 1b\n\t" + ".previous\n\t" + _ASM_EXTABLE(2b, 3b) + : [err] "=r" (*err), EAX_EDX_RET(val, low, high) + : "c" (msr), [fault] "i" (-EIO)); +#endif /*CONFIG_SOLARIS*/ + return EAX_EDX_VAL(val, low, high); +} + +/* Can be uninlined because referenced by paravirt */ +int native_write_msr_safe(unsigned int msr, + unsigned low, unsigned high) +{ + int err; +#ifdef CONFIG_SOLARIS + { + on_trap_data_t otd; + + if (on_trap(&otd, OT_DATA_ACCESS) == 0) { + native_write_msr(msr, low, high); + } else { + err = EINVAL; /* XXX probably not right... */ + } + no_trap(); + } +#else + asm volatile("2: wrmsr ; xor %[err],%[err]\n" + "1:\n\t" + ".section .fixup,\"ax\"\n\t" + "3: mov %[fault],%[err] ; jmp 1b\n\t" + ".previous\n\t" + _ASM_EXTABLE(2b, 3b) + : [err] "=a" (err) + : "c" (msr), "0" (low), "d" (high), + [fault] "i" (-EIO) + : "memory"); +#endif /*CONFIG_SOLARIS*/ + return err; +} + +static void kvm_init_msr_list(void) +{ + uint32_t dummy[2]; + unsigned i, j; + + /* skip the first msrs in the list. 
KVM-specific */ + for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) { + if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) + continue; + if (j < i) + msrs_to_save[j] = msrs_to_save[i]; + j++; + } + num_msrs_to_save = j; +} + +static uint64_t shadow_trap_nonpresent_pte; +static uint64_t shadow_notrap_nonpresent_pte; +static uint64_t shadow_base_present_pte; +static uint64_t shadow_nx_mask; +static uint64_t shadow_x_mask; /* mutual exclusive with nx_mask */ +static uint64_t shadow_user_mask; +static uint64_t shadow_accessed_mask; +static uint64_t shadow_dirty_mask; + +void kvm_mmu_set_nonpresent_ptes(uint64_t trap_pte, uint64_t notrap_pte) +{ + shadow_trap_nonpresent_pte = trap_pte; + shadow_notrap_nonpresent_pte = notrap_pte; +} + +void kvm_mmu_set_base_ptes(uint64_t base_pte) +{ + shadow_base_present_pte = base_pte; +} + +void kvm_mmu_set_mask_ptes(uint64_t user_mask, uint64_t accessed_mask, + uint64_t dirty_mask, uint64_t nx_mask, uint64_t x_mask) +{ + shadow_user_mask = user_mask; + shadow_accessed_mask = accessed_mask; + shadow_dirty_mask = dirty_mask; + shadow_nx_mask = nx_mask; + shadow_x_mask = x_mask; +} + +#define PT64_PT_BITS 9 +#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) +#define PT32_PT_BITS 10 +#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS) + +#define PT_WRITABLE_SHIFT 1 + +#define PT_PRESENT_MASK (1ULL << 0) +#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT) +#define PT_USER_MASK (1ULL << 2) +#define PT_PWT_MASK (1ULL << 3) +#define PT_PCD_MASK (1ULL << 4) +#define PT_ACCESSED_SHIFT 5 +#define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT) +#define PT_DIRTY_MASK (1ULL << 6) +#define PT_PAGE_SIZE_MASK (1ULL << 7) +#define PT_PAT_MASK (1ULL << 7) +#define PT_GLOBAL_MASK (1ULL << 8) +#define PT64_NX_SHIFT 63 +#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT) + +#define PT_PAT_SHIFT 7 +#define PT_DIR_PAT_SHIFT 12 +#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT) + +#define PT32_DIR_PSE36_SIZE 4 +#define PT32_DIR_PSE36_SHIFT 13 +#define PT32_DIR_PSE36_MASK \ + (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) + +#define PT64_ROOT_LEVEL 4 +#define PT32_ROOT_LEVEL 2 +#define PT32E_ROOT_LEVEL 3 + +#define PT_PDPE_LEVEL 3 +#define PT_DIRECTORY_LEVEL 2 +#define PT_PAGE_TABLE_LEVEL 1 + +#define PFERR_PRESENT_MASK (1U << 0) +#define PFERR_WRITE_MASK (1U << 1) +#define PFERR_USER_MASK (1U << 2) +#define PFERR_RSVD_MASK (1U << 3) +#define PFERR_FETCH_MASK (1U << 4) + +static void kvm_timer_init(void) +{ + int cpu; + + /* + * XXX We assume that any machine running solaris kvm + * has constant time stamp counter increment rate. + * This will be true for all but older machines. + */ +#ifndef CONFIG_SOLARIS + for_each_possible_cpu(cpu) + per_cpu(cpu_tsc_khz, cpu) = tsc_khz; +#else + /* assume pi_clock in mhz */ + /* cpu_tsc_khz = (CPU)->cpu_type_info.pi_clock * 1000;*/ +#endif /*CONFIG_SOLARIS*/ } int kvm_arch_init(void *opaque) { int r; struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; + volatile int x; /* XXX - dtrace return probe missing */ if (ops->cpu_has_kvm_support()) { cmn_err(CE_WARN, "kvm: no hardware support\n"); @@ -454,7 +1140,6 @@ int kvm_arch_init(void *opaque) if (r) goto out; -#ifdef XXX kvm_init_msr_list(); kvm_x86_ops = ops; @@ -463,16 +1148,20 @@ int kvm_arch_init(void *opaque) kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0); - kvm_timer_init(); -#endif + x = 10; /*XXX*/ return 0; out: + x = 20; /*XXX*/ return r; } +caddr_t bad_page; /* XXX page_t on linux... 
*/ +pfn_t bad_pfn; +kmem_cache_t *kvm_vcpu_cache; + int kvm_init(void *opaque, unsigned int vcpu_size) { int r; @@ -482,33 +1171,28 @@ int kvm_init(void *opaque, unsigned int vcpu_size) if (r != DDI_SUCCESS) return (r); -#ifdef XXX - if (r) - goto out_fail; - bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO); + bad_page = kmem_zalloc(PAGESIZE, KM_SLEEP); if (bad_page == NULL) { - r = -ENOMEM; + r = ENOMEM; goto out; } - bad_pfn = page_to_pfn(bad_page); + bad_pfn = hat_getpfnum(kas.a_hat, bad_page); +#ifdef XXX if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { r = -ENOMEM; goto out_free_0; } - #endif /*XXX*/ - r = kvm_arch_hardware_setup(); - return (r); -#ifdef XXX - if (r < 0) + if (r != DDI_SUCCESS) goto out_free_0a; +#ifdef XXX for_each_online_cpu(cpu) { smp_call_function_single(cpu, kvm_arch_check_processor_compat, @@ -516,7 +1200,10 @@ int kvm_init(void *opaque, unsigned int vcpu_size) if (r < 0) goto out_free_1; } +#endif /*XXX*/ + +#ifdef XXX r = register_cpu_notifier(&kvm_cpu_notifier); if (r) goto out_free_2; @@ -529,64 +1216,84 @@ int kvm_init(void *opaque, unsigned int vcpu_size) r = sysdev_register(&kvm_sysdev); if (r) goto out_free_4; - +#endif /*XXX*/ /* A kmem cache lets us meet the alignment requirements of fx_save. */ kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, __alignof__(struct kvm_vcpu), - 0, NULL); + NULL, NULL, NULL, NULL, NULL, 0); if (!kvm_vcpu_cache) { - r = -ENOMEM; + r = ENOMEM; goto out_free_5; } +#ifdef XXX kvm_chardev_ops.owner = module; kvm_vm_fops.owner = module; kvm_vcpu_fops.owner = module; r = misc_register(&kvm_dev); if (r) { - printk(KERN_ERR "kvm: misc device register failed\n"); + cmn_err(CE_WARN, "kvm: misc device register failed\n"); goto out_free; } + /* + * XXX - if kernel preemption occurs, we probably need + * to implement these, and add hooks to the preemption code. 
+ * For right now, we'll make the totally unreasonable + * assumption that we won't be preempted while in the + * kernel, i.e., no realtime threads are running + */ kvm_preempt_ops.sched_in = kvm_sched_in; kvm_preempt_ops.sched_out = kvm_sched_out; kvm_init_debug(); +#endif /*XXX*/ return 0; out_free: kmem_cache_destroy(kvm_vcpu_cache); out_free_5: +#ifdef XXX sysdev_unregister(&kvm_sysdev); out_free_4: sysdev_class_unregister(&kvm_sysdev_class); out_free_3: unregister_reboot_notifier(&kvm_reboot_notifier); unregister_cpu_notifier(&kvm_cpu_notifier); +#endif /*XXX*/ out_free_2: out_free_1: +#ifdef XXX kvm_arch_hardware_unsetup(); +#endif /*XXX*/ out_free_0a: +#ifdef XXX free_cpumask_var(cpus_hardware_enabled); +#endif /*XXX*/ out_free_0: - __free_page(bad_page); + kmem_free(bad_page, PAGESIZE); out: +#ifdef XXX kvm_arch_exit(); +#endif out_fail: return r; -#endif /*XXX*/ } -extern unsigned long *vmx_io_bitmap_a; -extern unsigned long *vmx_io_bitmap_b; -extern unsigned long *vmx_msr_bitmap_legacy; -extern unsigned long *vmx_msr_bitmap_longmode; +extern unsigned long vmx_io_bitmap_a[]; +extern unsigned long vmx_io_bitmap_b[]; +extern unsigned long vmx_msr_bitmap_legacy[]; +extern unsigned long vmx_msr_bitmap_longmode[]; + +static inline int cpu_has_vmx_msr_bitmap(void) +{ + return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS; +} static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, uint32_t msr) { -#ifdef XXX int f = sizeof(unsigned long); if (!cpu_has_vmx_msr_bitmap()) @@ -598,14 +1305,13 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, uint32_t * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. */ if (msr <= 0x1fff) { - __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */ - __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */ + BT_CLEAR(msr_bitmap + 0x000 / f, msr); /* read-low */ + BT_CLEAR(msr_bitmap + 0x800 / f, msr); /* write-low */ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { msr &= 0x1fff; - __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */ - __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */ + BT_CLEAR(msr_bitmap + 0x400 / f, msr); /* read-high */ + BT_CLEAR(msr_bitmap + 0xc00 / f, msr); /* write-high */ } -#endif /*XXX*/ } static void vmx_disable_intercept_for_msr(uint32_t msr, int longmode_only) @@ -615,16 +1321,52 @@ static void vmx_disable_intercept_for_msr(uint32_t msr, int longmode_only) __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr); } +static struct kvm_shared_msrs_global shared_msrs_global; + +void kvm_define_shared_msr(unsigned slot, uint32_t msr) +{ + if (slot >= shared_msrs_global.nr) + shared_msrs_global.nr = slot + 1; + shared_msrs_global.msrs[slot] = msr; +#ifdef XXX + /* we need ensured the shared_msr_global have been updated */ + smp_wmb(); +#endif /*XXX*/ +} + +static uint64_t host_efer; + +/* + * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it + * away by decrementing the array size. 
+ */ +static const uint32_t vmx_msr_index[] = { +#ifdef CONFIG_X86_64 + MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, +#endif + MSR_EFER, MSR_TSC_AUX, MSR_K6_STAR, +}; +#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) +#define VMX_NR_VPIDS (1 << 16) +ulong_t *vmx_vpid_bitmap; +size_t vpid_bitmap_words; +kmutex_t vmx_vpid_lock; + +void kvm_disable_tdp(void) +{ + tdp_enabled = 0; +} + static int vmx_init(void) { int r, i; -#ifdef XXX + rdmsrl_safe(MSR_EFER, &host_efer); for (i = 0; i < NR_VMX_MSR; ++i) kvm_define_shared_msr(i, vmx_msr_index[i]); -#endif /*XXX*/ +#ifdef XXX vmx_io_bitmap_a = (unsigned long *)kmem_zalloc(PAGESIZE, KM_SLEEP); if (!vmx_io_bitmap_a) return ENOMEM; @@ -643,10 +1385,9 @@ static int vmx_init(void) vmx_msr_bitmap_longmode = (unsigned long *)kmem_zalloc(PAGESIZE, KM_SLEEP); if (!vmx_msr_bitmap_longmode) { - r = ENOMEM; - goto out2; + r = ENOMEM; goto out2; } - +#endif /* * Allow direct access to the PC debug port (it is often used for I/O * delays, but the vmexits simply slow things down). @@ -659,9 +1400,7 @@ static int vmx_init(void) memset(vmx_msr_bitmap_legacy, 0xff, PAGESIZE); memset(vmx_msr_bitmap_longmode, 0xff, PAGESIZE); -#ifdef XXX - set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ -#endif /*XXX*/ + BT_SET(vmx_vpid_bitmap, 0); /* 0 is reserved for host */ r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx)); @@ -675,7 +1414,6 @@ static int vmx_init(void) vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, 0); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, 0); -#ifdef XXX if (enable_ept) { bypass_guest_pf = 0; kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | @@ -686,10 +1424,10 @@ static int vmx_init(void) } else kvm_disable_tdp(); +#ifdef XXX if (bypass_guest_pf) kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); #endif /*XXX*/ - return 0; out3: @@ -718,10 +1456,20 @@ _init(void) ddi_soft_state_fini(&kvm_state); } + if (enable_vpid) { + vpid_bitmap_words = howmany(VMX_NR_VPIDS, BT_NBIPUL); + vmx_vpid_bitmap = kmem_zalloc(sizeof(ulong_t)*vpid_bitmap_words, KM_SLEEP); + mutex_init(&vmx_vpid_lock, NULL, MUTEX_DRIVER, NULL); + } + mutex_init(&kvm_lock, NULL, MUTEX_DRIVER, 0); /* XXX */ kvm_x86_ops = &vmx_x86_ops; if ((r = vmx_init()) != DDI_SUCCESS) { mutex_destroy(&kvm_lock); + if (vmx_vpid_bitmap) { + kmem_free(vmx_vpid_bitmap, sizeof(ulong_t)*vpid_bitmap_words); + mutex_destroy(&vmx_vpid_lock); + } mod_remove(&modlinkage); ddi_soft_state_fini(&kvm_state); return (r); @@ -953,7 +1701,6 @@ static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) { return container_of(mn, struct kvm, mmu_notifier); } -#endif static void kvm_mmu_pages_init(struct kvm_mmu_page *parent, @@ -981,21 +1728,8 @@ mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, return (pvec->nr == KVM_PAGE_ARRAY_NR); } -static uint64_t shadow_trap_nonpresent_pte; -static uint64_t shadow_notrap_nonpresent_pte; - extern pfn_t hat_getpfnum(struct hat *hat, caddr_t); -#ifdef XXX - -static inline struct kvm_mmu_page * -page_header(hpa_t shadow_page) -{ - struct page *page = pfn_to_page(shadow_page >> PAGESHIFT); - - return (struct kvm_mmu_page *)page_private(page); -} - static int is_large_pte(uint64_t pte) { @@ -1009,6 +1743,7 @@ is_shadow_present_pte(uint64_t pte) && pte != shadow_notrap_nonpresent_pte; } + static int __mmu_unsync_walk(struct kvm_mmu_page *sp, struct kvm_mmu_pages *pvec) { @@ -1042,7 +1777,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, } } - if (find_first_bit(sp->unsync_child_bitmap, 512) == 512) + if (bt_getlowbit(sp->unsync_child_bitmap, 0, 512) 
== 512) sp->unsync_children = 0; return nr_unsync_leaf; @@ -1228,7 +1963,7 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, kvm->mmu_notifier_count--; spin_unlock(&kvm->mmu_lock); - BUG_ON(kvm->mmu_notifier_count < 0); + assert(kvm->mmu_notifier_count >= 0); } static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, @@ -1257,18 +1992,14 @@ kvm_arch_flush_shadow(struct kvm *kvm) kvm_reload_remote_mmus(kvm); } -#ENDIF /*XXX*/ - static void kvm_mmu_notifier_release(struct mmu_notifier *mn, struct mm_struct *mm) { struct kvm *kvm = mmu_notifier_to_kvm(mn); int idx; -#ifdef XXX idx = srcu_read_lock(&kvm->srcu); kvm_arch_flush_shadow(kvm); srcu_read_unlock(&kvm->srcu, idx); -#endif /*XXX*/ } static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { @@ -1285,7 +2016,6 @@ static int kvm_init_mmu_notifier(struct kvm *kvm) kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; return mmu_notifier_register(&kvm->mmu_notifier, current->mm); } - #else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */ static int kvm_init_mmu_notifier(struct kvm *kvm) @@ -1345,7 +2075,8 @@ kvm_create_vm(void) (void *)ipltospl(DISP_LEVEL)); #ifdef XXX kvm_eventfd_init(kvmp); -#endif +#endif /*XXX*/ + mutex_init(&kvmp->lock, NULL, MUTEX_DRIVER, NULL); mutex_init(&kvmp->irq_lock, NULL, MUTEX_DRIVER, NULL); mutex_init(&kvmp->slots_lock, NULL, MUTEX_DRIVER, NULL); @@ -1436,10 +2167,9 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; used_pages = max(0, used_pages); -#ifdef XXX /* for the time being, assume that address space will only grow */ /* larger. The following code will be added later. */ - +#ifdef XXX /* * If we set the number of mmu pages to be smaller be than the * number of actived pages , we must to free some mmu pages before we @@ -1448,7 +2178,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) if (used_pages > kvm_nr_mmu_pages) { while (used_pages > kvm_nr_mmu_pages && - !list_empty(&kvm->arch.active_mmu_pages)) { + !list_is_empty(&kvm->arch.active_mmu_pages)) { struct kvm_mmu_page *page; page = container_of(kvm->arch.active_mmu_pages.prev, @@ -1483,7 +2213,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, old.npages * PAGESIZE); up_write(¤t->mm->mmap_sem); if (ret < 0) - printk(KERN_WARNING + cmn_err(CE_WARN, "kvm_vm_ioctl_set_memory_region: " "failed to munmap memory\n"); } @@ -2004,6 +2734,31 @@ static inline uint32_t bit(int bitno) return 1 << (bitno & 31); } +static inline int cpu_has_vmx_ept_1g_page(void) +{ + return !!(vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT); +} + +static int vmx_get_lpage_level(void) +{ + if (enable_ept && !cpu_has_vmx_ept_1g_page()) + return PT_DIRECTORY_LEVEL; + else + /* For shadow and EPT supported 1GB page */ + return PT_PDPE_LEVEL; +} + +static inline int cpu_has_vmx_rdtscp(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_RDTSCP; +} + +static int vmx_rdtscp_supported(void) +{ + return cpu_has_vmx_rdtscp(); +} + #define F(x) bit(X86_FEATURE_##x) @@ -2058,10 +2813,12 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, uint32_t function, F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) | 0 /* SKINIT */ | 0 /* WDT */; + volatile int x; /* XXX - dtrace return probe missing */ + /* all calls to cpuid_count() should be made on the same cpu */ /* XXX - right now, system panics at ddi_exit_critical() */ /* XXX - to run everything on same cpu, bind qemu at startup 
*/ - /*ddic = ddi_enter_critical(); */ + kpreempt_disable(); do_cpuid_1_ent(entry, function, index); ++*nent; @@ -2135,6 +2892,8 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, uint32_t function, } /*XXX - see comment above for ddi_enter_critical() */ /*ddi_exit_critical(ddic);*/ + kpreempt_enable(); + x = 10; /*XXX*/ } #undef F @@ -2146,13 +2905,15 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 *cpuid_entries; int limit, nent = 0, r = E2BIG; uint32_t func; + int allocsize = 0; if (cpuid->nent < 1) goto out; if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) cpuid->nent = KVM_MAX_CPUID_ENTRIES; r = ENOMEM; - cpuid_entries = kmem_alloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent, KM_SLEEP); + allocsize = sizeof(struct kvm_cpuid_entry2)*cpuid->nent; + cpuid_entries = kmem_alloc(allocsize, KM_SLEEP); if (!cpuid_entries) goto out; @@ -2182,16 +2943,11 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, r = 0; out_free: - kmem_free(cpuid_entries, sizeof(struct kvm_cpuid_entry2) * cpuid->nent); + kmem_free(cpuid_entries, allocsize); out: return r; } -struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) -{ - return container_of(vcpu, struct vcpu_vmx, vcpu); -} - #define __ex(x) __kvm_handle_fault_on_reboot(x) @@ -2199,6 +2955,7 @@ void vmcs_clear(struct vmcs *vmcs) { unsigned char error; uint64_t phys_addr = (hat_getpfnum(kas.a_hat, (char *)vmcs)<<PAGESHIFT)|((uint64_t)vmcs&PAGEOFFSET); + volatile int x; /*XXX - dtrace return probe missing */ asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "\n\tsetna %0\n" : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) @@ -2206,6 +2963,7 @@ void vmcs_clear(struct vmcs *vmcs) if (error) cmn_err(CE_PANIC, "kvm: vmclear fail: %p/%llx\n", vmcs, phys_addr); + x = 10; /*XXX*/ } static void __vcpu_clear(void *arg) @@ -2236,16 +2994,21 @@ static void vcpu_clear(struct vcpu_vmx *vmx) } + +static void vmwrite_error(unsigned long field, unsigned long value) +{ + cmn_err(CE_WARN, "vmwrite error: reg %lx value %lx (err %d)\n", + field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); +} + void vmcs_writel(unsigned long field, unsigned long value) { unsigned char error; asm volatile (ASM_VMX_VMWRITE_RAX_RDX "\n\tsetna %0" : "=q"(error) : "a"(value), "d"(field) : "cc"); -#ifdef XXX - if (unlikely(error)) + if ((error)) vmwrite_error(field, value); -#endif } unsigned long vmcs_readl(unsigned long field) @@ -2257,7 +3020,6 @@ unsigned long vmcs_readl(unsigned long field) return value; } - uint64_t vmcs_read64(unsigned long field) { #ifdef CONFIG_X86_64 @@ -2267,6 +3029,11 @@ uint64_t vmcs_read64(unsigned long field) #endif } +uint16_t vmcs_read16(unsigned long field) +{ + return vmcs_readl(field); +} + void vmcs_write64(unsigned long field, uint64_t value) { vmcs_writel(field, value); @@ -2276,15 +3043,208 @@ void vmcs_write64(unsigned long field, uint64_t value) #endif } + +void vmcs_write16(unsigned long field, uint16_t value) +{ + vmcs_writel(field, value); +} + +/* + * writes 'guest_tsc' into guest's timestamp counter "register" + * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc + */ +static void guest_write_tsc(uint64_t guest_tsc, uint64_t host_tsc) +{ + vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc); +} + +static inline int cpu_has_secondary_exec_ctrls(void) +{ + return vmcs_config.cpu_based_exec_ctrl & + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; +} + +int vm_need_virtualize_apic_accesses(struct kvm *kvm) +{ + return flexpriority_enabled && irqchip_in_kernel(kvm); +} + +extern uint64_t 
kvm_va2pa(caddr_t va); +/* + * Sets up the vmcs for emulated real mode. + */ +int vmx_vcpu_setup(struct vcpu_vmx *vmx) +{ + uint32_t host_sysenter_cs, msr_low, msr_high; + uint32_t junk; + uint64_t host_pat, tsc_this, tsc_base; + unsigned long a; + struct descriptor_table dt; + int i; + unsigned long kvm_vmx_return; + uint32_t exec_control; + + /* I/O */ + vmcs_write64(IO_BITMAP_A, kvm_va2pa((caddr_t)vmx_io_bitmap_a)); + vmcs_write64(IO_BITMAP_B, kvm_va2pa((caddr_t)vmx_io_bitmap_b)); + + if (cpu_has_vmx_msr_bitmap()) + vmcs_write64(MSR_BITMAP, kvm_va2pa((caddr_t)vmx_msr_bitmap_legacy)); + + vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ + + /* Control */ + vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, + vmcs_config.pin_based_exec_ctrl); + + exec_control = vmcs_config.cpu_based_exec_ctrl; +#ifdef XXX + if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) { + exec_control &= ~CPU_BASED_TPR_SHADOW; +#ifdef CONFIG_X86_64 + exec_control |= CPU_BASED_CR8_STORE_EXITING | + CPU_BASED_CR8_LOAD_EXITING; +#endif + } +#endif /*XXX*/ + + if (!enable_ept) + exec_control |= CPU_BASED_CR3_STORE_EXITING | + CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_INVLPG_EXITING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); + + if (cpu_has_secondary_exec_ctrls()) { + exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; + if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) + exec_control &= + ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + if (vmx->vpid == 0) + exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; + if (!enable_ept) { + exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; + enable_unrestricted_guest = 0; + } + if (!enable_unrestricted_guest) + exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; +#ifdef XXX + if (!ple_gap) +#endif /*XXX*/ + exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + } + +#ifdef XXX + if (ple_gap) { + vmcs_write32(PLE_GAP, ple_gap); + vmcs_write32(PLE_WINDOW, ple_window); + } +#endif /*XXX*/ + + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); + vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ + + vmcs_writel(HOST_CR0, getcr0()); /* 22.2.3 */ + vmcs_writel(HOST_CR4, getcr4()); /* 22.2.3, 22.2.5 */ + vmcs_writel(HOST_CR3, getcr3()); /* 22.2.3 FIXME: shadow tables */ + + vmcs_write16(HOST_CS_SELECTOR, GDT_KCODE); /* 22.2.4 */ + vmcs_write16(HOST_DS_SELECTOR, GDT_KDATA); /* 22.2.4 */ + vmcs_write16(HOST_ES_SELECTOR, GDT_KDATA); /* 22.2.4 */ + vmcs_write16(HOST_FS_SELECTOR, kvm_read_fs()); /* 22.2.4 */ + vmcs_write16(HOST_GS_SELECTOR, kvm_read_gs()); /* 22.2.4 */ + vmcs_write16(HOST_SS_SELECTOR, GDT_KDATA); /* 22.2.4 */ +#ifdef CONFIG_X86_64 + rdmsrl(MSR_FS_BASE, a); + vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ + rdmsrl(MSR_GS_BASE, a); + vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */ +#else + vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ + vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ +#endif + + vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ + + kvm_get_idt(&dt); + vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ + + asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); + vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */ + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); + + rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk); + vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs); + rdmsrl(MSR_IA32_SYSENTER_ESP, a); + vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */ + 
rdmsrl(MSR_IA32_SYSENTER_EIP, a); + vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */ + + if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { + rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); + host_pat = msr_low | ((uint64_t) msr_high << 32); + vmcs_write64(HOST_IA32_PAT, host_pat); + } + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { + rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); + host_pat = msr_low | ((uint64_t) msr_high << 32); + /* Write the default value follow host pat */ + vmcs_write64(GUEST_IA32_PAT, host_pat); + /* Keep arch.pat sync with GUEST_IA32_PAT */ + vmx->vcpu.arch.pat = host_pat; + } + + for (i = 0; i < NR_VMX_MSR; ++i) { + uint32_t index = vmx_msr_index[i]; + uint32_t data_low, data_high; + int j = vmx->nmsrs; + + if (rdmsr_safe(index, &data_low, &data_high) < 0) + continue; + if (wrmsr_safe(index, data_low, data_high) < 0) + continue; + vmx->guest_msrs[j].index = i; + vmx->guest_msrs[j].data = 0; + vmx->guest_msrs[j].mask = -1ull; + ++vmx->nmsrs; + } + + vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); + + /* 22.2.1, 20.8.1 */ + vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); + + vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); + vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; + if (enable_ept) + vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; + vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); + + tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; + rdtscll(tsc_this); + if (tsc_this < vmx->vcpu.kvm->arch.vm_init_tsc) + tsc_base = tsc_this; + + guest_write_tsc(0, tsc_base); + return 0; +} + /* * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken. */ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); + /* XXX - the following assignment assumes vmx contains vcpu */ + /* at the beginning of the structure */ + + struct vcpu_vmx *vmx = (struct vcpu_vmx *)vcpu; uint64_t phys_addr = (hat_getpfnum(kas.a_hat, (char *)vmx->vmcs)<<PAGESHIFT)|((uint64_t)(vmx->vmcs)&0xfff); uint64_t tsc_this, delta, new_offset; + volatile int x; /* XXX - dtrace return probe missing */ if (vcpu->cpu != cpu) { vcpu_clear(vmx); @@ -2293,10 +3253,10 @@ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) #endif /*XXX*/ BT_SET(&vcpu->requests, KVM_REQ_TLB_FLUSH); #ifdef XXX - local_irq_disable(); + kpreempt_disable(); list_add(&vmx->local_vcpus_link, &per_cpu(vcpus_on_cpu, cpu)); - local_irq_enable(); + kpreempt_enable(); #endif /*XXX*/ } @@ -2351,6 +3311,8 @@ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vmcs_write64(TSC_OFFSET, new_offset); } } + x = 10; + return; } void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) @@ -2382,6 +3344,15 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) #endif /*XXX*/ } +/* straight from xen code... 
*/ +void +ldt_load(void) +{ + *((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = curproc->p_ldt_desc; + wr_ldtr(ULDT_SEL); +} + + static void reload_tss(void) { /* @@ -2396,7 +3367,7 @@ static void reload_tss(void) load_TR_desc(); } -static inline int is_long_mode(struct kvm_vcpu *vcpu) +int is_long_mode(struct kvm_vcpu *vcpu) { #ifdef CONFIG_X86_64 return vcpu->arch.efer & EFER_LMA; @@ -2405,6 +3376,27 @@ static inline int is_long_mode(struct kvm_vcpu *vcpu) #endif } +#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS +#define KVM_POSSIBLE_CR4_GUEST_BITS \ + (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_OSXMMEXCPT | X86_CR4_PGE) + +ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) +{ + uint64_t tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS; +#ifdef XXX + if (tmask & vcpu->arch.cr4_guest_owned_bits) + kvm_x86_ops->decache_cr4_guest_bits(vcpu); +#endif /*XXX*/ + return vcpu->arch.cr4 & mask; +} + +static inline int is_pae(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); +} + + static void __vmx_load_host_state(struct vcpu_vmx *vmx) { @@ -2425,18 +3417,15 @@ __vmx_load_host_state(struct vcpu_vmx *vmx) * If we have to reload gs, we must take care to * preserve our gs base. */ -#ifdef XXX - local_irq_save(flags); -#endif /*XXX*/ + kpreempt_disable(); kvm_load_gs(vmx->host_state.gs_sel); #ifdef CONFIG_X86_64 wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); #endif -#ifdef XXX - local_irq_restore(flags); -#endif /*XXX*/ + kpreempt_enable(); } reload_tss(); + #ifdef CONFIG_X86_64 if (is_long_mode(&vmx->vcpu)) { rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); @@ -2445,9 +3434,16 @@ __vmx_load_host_state(struct vcpu_vmx *vmx) #endif } +static void vmx_load_host_state(struct vcpu_vmx *vmx) +{ + kpreempt_disable(); + __vmx_load_host_state(vmx); + kpreempt_enable(); +} + void vmx_vcpu_put(struct kvm_vcpu *vcpu) { - __vmx_load_host_state(to_vmx(vcpu)); + __vmx_load_host_state((struct vcpu_vmx *)vcpu); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) @@ -2488,9 +3484,75 @@ void vcpu_put(struct kvm_vcpu *vcpu) mutex_exit(&vcpu->mutex); } +/* find an entry with matching function, matching index (if needed), and that + * should be read next (if it's stateful) */ +static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, + uint32_t function, uint32_t index) +{ + if (e->function != function) + return 0; + if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) + return 0; + if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && + !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) + return 0; + return 1; +} + +struct kvm_pic *pic_irqchip(struct kvm *kvm); +extern int irqchip_in_kernel(struct kvm *kvm); + +static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) +{ + struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; + int j, nent = vcpu->arch.cpuid_nent; + + e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; + /* when no next entry is found, the current entry[i] is reselected */ + for (j = i + 1; ; j = (j + 1) % nent) { + struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; + if (ej->function == e->function) { + ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; + return j; + } + } + return 0; /* silence gcc, even though control never reaches here */ +} + +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, + uint32_t function, uint32_t index) +{ + int i; + struct kvm_cpuid_entry2 *best = NULL; + + for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { + struct kvm_cpuid_entry2 *e; + + e = &vcpu->arch.cpuid_entries[i]; + 
if (is_matching_cpuid_entry(e, function, index)) { + if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) + move_to_next_stateful_cpuid_entry(vcpu, i); + best = e; + break; + } + /* + * Both basic or both extended? + */ + if (((e->function ^ function) & 0x80000000) == 0) + if (!best || e->function > best->function) + best = e; + } + return best; +} + +#define APIC_LVT_NUM 6 +/* 14 is the version for Xeon and Pentium 8.4.8*/ +#define APIC_VERSION (0x14UL | ((APIC_LVT_NUM - 1) << 16)) + +extern void apic_set_reg(struct kvm_lapic *apic, int reg_off, uint32_t val); + void kvm_apic_set_version(struct kvm_vcpu *vcpu) { -#ifdef XXX struct kvm_lapic *apic = vcpu->arch.apic; struct kvm_cpuid_entry2 *feat; uint32_t v = APIC_VERSION; @@ -2502,7 +3564,6 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu) if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31)))) v |= APIC_LVR_DIRECTED_EOI; apic_set_reg(apic, APIC_LVR, v); -#endif /*XXX*/ } @@ -2552,48 +3613,5166 @@ out: return r; } +static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, + enum kvm_reg reg) +{ +#ifdef XXX + if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail)) + kvm_x86_ops->cache_reg(vcpu, reg); +#endif /*XXX*/ + + return vcpu->arch.regs[reg]; +} + +void kvm_register_write(struct kvm_vcpu *vcpu, + enum kvm_reg reg, + unsigned long val) +{ + vcpu->arch.regs[reg] = val; +#ifdef XXX + __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); + __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); +#endif +} + +unsigned long kvm_rip_read(struct kvm_vcpu *vcpu) +{ + return kvm_register_read(vcpu, VCPU_REGS_RIP); +} + +void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val) +{ + kvm_register_write(vcpu, VCPU_REGS_RIP, val); +} + +unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu) +{ + unsigned long rflags; + + rflags = kvm_x86_ops->get_rflags(vcpu); +#ifdef XXX + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF); +#endif /*XXX*/ + return rflags; +} + +int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + vcpu_load(vcpu); + + regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); + regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); + regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); + regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX); + regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI); + regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI); + regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); + regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP); +#ifdef CONFIG_X86_64 + regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8); + regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9); + regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10); + regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11); + regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12); + regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13); + regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14); + regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15); +#endif + + regs->rip = kvm_rip_read(vcpu); + regs->rflags = kvm_get_rflags(vcpu); + + vcpu_put(vcpu); + + return 0; +} + +#define VMX_SEGMENT_FIELD(seg) \ + [VCPU_SREG_##seg] = { \ + .selector = GUEST_##seg##_SELECTOR, \ + .base = GUEST_##seg##_BASE, \ + .limit = GUEST_##seg##_LIMIT, \ + .ar_bytes = GUEST_##seg##_AR_BYTES, \ + } + +static struct kvm_vmx_segment_field { + unsigned selector; + unsigned base; + unsigned limit; + unsigned ar_bytes; +} kvm_vmx_segment_fields[] = { + VMX_SEGMENT_FIELD(CS), + VMX_SEGMENT_FIELD(DS), + 
VMX_SEGMENT_FIELD(ES), + VMX_SEGMENT_FIELD(FS), + VMX_SEGMENT_FIELD(GS), + VMX_SEGMENT_FIELD(SS), + VMX_SEGMENT_FIELD(TR), + VMX_SEGMENT_FIELD(LDTR), +}; + +void vmx_get_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + uint32_t ar; + + var->base = vmcs_readl(sf->base); + var->limit = vmcs_read32(sf->limit); + var->selector = vmcs_read16(sf->selector); + ar = vmcs_read32(sf->ar_bytes); +#ifdef XXX + if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state) + ar = 0; +#endif /*XXX*/ + var->type = ar & 15; + var->s = (ar >> 4) & 1; + var->dpl = (ar >> 5) & 3; + var->present = (ar >> 7) & 1; + var->avl = (ar >> 12) & 1; + var->l = (ar >> 13) & 1; + var->db = (ar >> 14) & 1; + var->g = (ar >> 15) & 1; + var->unusable = (ar >> 16) & 1; +} + +static uint32_t vmx_segment_access_rights(struct kvm_segment *var) +{ + uint32_t ar; + + if (var->unusable) + ar = 1 << 16; + else { + ar = var->type & 15; + ar |= (var->s & 1) << 4; + ar |= (var->dpl & 3) << 5; + ar |= (var->present & 1) << 7; + ar |= (var->avl & 1) << 12; + ar |= (var->l & 1) << 13; + ar |= (var->db & 1) << 14; + ar |= (var->g & 1) << 15; + } + if (ar == 0) /* a 0 value means unusable */ + ar = AR_UNUSABLE_MASK; + + return ar; +} + +static void vmx_set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + struct vcpu_vmx *vmx = (struct vcpu_vmx *)vcpu; + struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + uint32_t ar; + + if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { + vmx->rmode.tr.selector = var->selector; + vmx->rmode.tr.base = var->base; + vmx->rmode.tr.limit = var->limit; + vmx->rmode.tr.ar = vmx_segment_access_rights(var); + return; + } + vmcs_writel(sf->base, var->base); + vmcs_write32(sf->limit, var->limit); + vmcs_write16(sf->selector, var->selector); + if (vmx->rmode.vm86_active && var->s) { + /* + * Hack real-mode segments into vm86 compatibility. + */ + if (var->base == 0xffff0000 && var->selector == 0xf000) + vmcs_writel(sf->base, 0xf0000); + ar = 0xf3; + } else + ar = vmx_segment_access_rights(var); + + /* + * Fix the "Accessed" bit in AR field of segment registers for older + * qemu binaries. + * IA32 arch specifies that at the time of processor reset the + * "Accessed" bit in the AR field of segment registers is 1. And qemu + * is setting it to 0 in the usedland code. This causes invalid guest + * state vmexit when "unrestricted guest" mode is turned on. + * Fix for this setup issue in cpu_reset is being pushed in the qemu + * tree. Newer qemu binaries with that qemu fix would not need this + * kvm hack. 
+ */ +#ifdef XXX + if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR)) + ar |= 0x1; /* Accessed */ +#endif /*XXX*/ + + vmcs_write32(sf->ar_bytes, ar); +} + +void kvm_get_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + kvm_x86_ops->get_segment(vcpu, var, seg); +} + +static uint16_t get_segment_selector(struct kvm_vcpu *vcpu, int seg) +{ + struct kvm_segment kvm_seg; + + kvm_get_segment(vcpu, &kvm_seg, seg); + return kvm_seg.selector; +} + +void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +{ +#ifdef XXX + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && + vcpu->arch.singlestep_cs == + get_segment_selector(vcpu, VCPU_SREG_CS) && + vcpu->arch.singlestep_rip == kvm_rip_read(vcpu)) + rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF; +#endif /*XXX*/ + kvm_x86_ops->set_rflags(vcpu, rflags); +} + +int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + vcpu_load(vcpu); + + kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); + kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); + kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); + kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx); + kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi); + kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi); + kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp); + kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp); +#ifdef CONFIG_X86_64 + kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8); + kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9); + kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10); + kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11); + kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12); + kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); + kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); + kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); +#endif + + kvm_rip_write(vcpu, regs->rip); + kvm_set_rflags(vcpu, regs->rflags); + + vcpu->arch.exception.pending = 0; + + vcpu_put(vcpu); + + return 0; +} + +int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ +#ifdef XXX + struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; +#endif /*XXX*/ + + vcpu_load(vcpu); +#ifdef XXX + memcpy(fpu->fpr, fxsave->st_space, 128); + fpu->fcw = fxsave->cwd; + fpu->fsw = fxsave->swd; + fpu->ftwx = fxsave->twd; + fpu->last_opcode = fxsave->fop; + fpu->last_ip = fxsave->rip; + fpu->last_dp = fxsave->rdp; + memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); +#endif /*XXX*/ + vcpu_put(vcpu); + + return 0; +} + +int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ +#ifdef XXX + struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; +#endif + + vcpu_load(vcpu); +#ifdef XXX + memcpy(fxsave->st_space, fpu->fpr, 128); + fxsave->cwd = fpu->fcw; + fxsave->swd = fpu->fsw; + fxsave->twd = fpu->ftwx; + fxsave->fop = fpu->last_opcode; + fxsave->rip = fpu->last_ip; + fxsave->rdp = fpu->last_dp; + memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); +#endif /*XXX*/ + vcpu_put(vcpu); + + return 0; +} + + +ulong kvm_read_cr4(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr4_bits(vcpu, ~0UL); +} + +static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) +{ + ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS; +#ifdef XXX + if (tmask & vcpu->arch.cr0_guest_owned_bits) + kvm_x86_ops->decache_cr0_guest_bits(vcpu); +#endif /*XXX*/ + return vcpu->arch.cr0 & mask; +} + + +ulong kvm_read_cr0(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr0_bits(vcpu, ~0UL); +} + +unsigned long 
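/*
 * Editor's note (illustrative sketch, not part of this patch): the
 * kvm_read_cr0_bits()/kvm_read_cr4_bits() helpers above only need to ask
 * the VMX layer to refresh ("decache") the shadowed value when the caller
 * cares about a bit the guest may own directly (KVM_POSSIBLE_CR*_GUEST_BITS);
 * for everything else the cached vcpu->arch.cr0/cr4 copy is authoritative.
 * Typical callers pass a single bit:
 *
 *	if (kvm_read_cr0_bits(vcpu, X86_CR0_TS))	// lazy-FPU check
 *		...
 *	if (kvm_read_cr4_bits(vcpu, X86_CR4_PAE))	// is_pae(), above
 *		...
 */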
kvm_get_cr8(struct kvm_vcpu *vcpu) +{ +#ifdef XXX + if (irqchip_in_kernel(vcpu->kvm)) + return kvm_lapic_get_cr8(vcpu); + else +#endif /*XXX*/ + return vcpu->arch.cr8; +} + +extern uint64_t kvm_get_apic_base(struct kvm_vcpu *vcpu); + +int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + struct descriptor_table dt; + + vcpu_load(vcpu); + + kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); + kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); + kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); + kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); + kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); + kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); + + kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); + kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); + + kvm_x86_ops->get_idt(vcpu, &dt); + sregs->idt.limit = dt.limit; + sregs->idt.base = dt.base; + kvm_x86_ops->get_gdt(vcpu, &dt); + sregs->gdt.limit = dt.limit; + sregs->gdt.base = dt.base; + + sregs->cr0 = kvm_read_cr0(vcpu); + sregs->cr2 = vcpu->arch.cr2; + sregs->cr3 = vcpu->arch.cr3; + sregs->cr4 = kvm_read_cr4(vcpu); + sregs->cr8 = kvm_get_cr8(vcpu); + sregs->efer = vcpu->arch.efer; + sregs->apic_base = kvm_get_apic_base(vcpu); + + memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap); + + if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft) + BT_SET((unsigned long *)sregs->interrupt_bitmap, + vcpu->arch.interrupt.nr); + + vcpu_put(vcpu); + + return 0; +} + +static void kvm_set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + kvm_x86_ops->set_segment(vcpu, var, seg); +} + +#define VALID_PAGE(x) ((x) != INVALID_PAGE) + +static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) +{ + ASSERT(vcpu); + if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) { + vcpu->arch.mmu.free(vcpu); + vcpu->arch.mmu.root_hpa = INVALID_PAGE; + } +} + +extern int init_kvm_mmu(struct kvm_vcpu *vcpu); + +int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) +{ + destroy_kvm_mmu(vcpu); + return init_kvm_mmu(vcpu); +} + +static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, uint8_t vector, + int soft) +{ + vcpu->arch.interrupt.pending = 1; + vcpu->arch.interrupt.soft = soft; + vcpu->arch.interrupt.nr = vector; +} + + +static inline int is_present_gpte(unsigned long pte) +{ + return pte & PT_PRESENT_MASK; +} + +gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn) +{ + int i; + struct kvm_mem_alias *alias; + struct kvm_mem_aliases *aliases; +#ifdef XXX + aliases = rcu_dereference(kvm->arch.aliases); + + for (i = 0; i < aliases->naliases; ++i) { + alias = &aliases->aliases[i]; + if (alias->flags & KVM_ALIAS_INVALID) + continue; + if (gfn >= alias->base_gfn + && gfn < alias->base_gfn + alias->npages) + return alias->target_gfn + gfn - alias->base_gfn; + } +#endif /*XXX*/ + return gfn; +} + +struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn) +{ + int i; +#ifdef XXX + struct kvm_memslots *slots = rcu_dereference(kvm->memslots); +#else + struct kvm_memslots *slots = kvm->memslots; +#endif /*XXX*/ + + for (i = 0; i < slots->nmemslots; ++i) { + struct kvm_memory_slot *memslot = &slots->memslots[i]; + + if (gfn >= memslot->base_gfn + && gfn < memslot->base_gfn + memslot->npages) + return memslot; + } + return NULL; +} + +static inline unsigned long bad_hva(void) +{ + return PAGEOFFSET; +} + +unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) +{ + struct kvm_memory_slot *slot; + + gfn = unalias_gfn_instantiation(kvm, gfn); + slot = gfn_to_memslot_unaliased(kvm, gfn); + if (!slot || 
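/*
 * Editor's note (illustrative sketch, not part of this patch): a memslot
 * maps a contiguous range of guest frame numbers onto a contiguous range of
 * user addresses, so the gfn-to-hva translation here is pure arithmetic.
 * For a slot with base_gfn = 0x100, npages = 0x80 and
 * userspace_addr = 0x2000000, gfn 0x110 resolves to
 *
 *	hva = 0x2000000 + (0x110 - 0x100) * PAGESIZE = 0x2010000
 *
 * while any gfn that falls outside every valid slot yields bad_hva().
 */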
slot->flags & KVM_MEMSLOT_INVALID) + return bad_hva(); + return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGESIZE); +} + + +int kvm_is_error_hva(unsigned long addr) +{ + return addr == bad_hva(); +} + +int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, + int len) +{ + int r; + unsigned long addr; + + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return EFAULT; + r = copyin((caddr_t)(addr + offset), data, len); + if (r) + return EFAULT; + return 0; +} + + +/* + * Load the pae pdptrs. Return true is they are all valid. + */ +int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) +{ + gfn_t pdpt_gfn = cr3 >> PAGESHIFT; + unsigned offset = ((cr3 & (PAGESIZE-1)) >> 5) << 2; + int i; + int ret; + uint64_t pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; + + ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, + offset * sizeof(uint64_t), sizeof(pdpte)); + if (ret < 0) { + ret = 0; + goto out; + } + for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { + if (is_present_gpte(pdpte[i]) && + (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) { + ret = 0; + goto out; + } + } + ret = 1; + + memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); + BT_SET((unsigned long *)&vcpu->arch.regs_avail, + VCPU_EXREG_PDPTR); + BT_SET((unsigned long *)&vcpu->arch.regs_dirty, + VCPU_EXREG_PDPTR); +out: + + return ret; +} + +static void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) +{ + if (irr == -1 || tpr < irr) { + vmcs_write32(TPR_THRESHOLD, 0); + return; + } + + vmcs_write32(TPR_THRESHOLD, irr); +} + +static void update_cr8_intercept(struct kvm_vcpu *vcpu) +{ + int max_irr, tpr; + + if (!kvm_x86_ops->update_cr8_intercept) + return; + + if (!vcpu->arch.apic) + return; +#ifdef XXX + if (!vcpu->arch.apic->vapic_addr) + max_irr = kvm_lapic_find_highest_irr(vcpu); + else +#endif /*XXX*/ + max_irr = -1; + + if (max_irr != -1) + max_irr >>= 4; +#ifdef XXX + tpr = kvm_lapic_get_cr8(vcpu); + + kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); +#endif /*XXX*/ +} + +static int __find_msr_index(struct vcpu_vmx *vmx, uint32_t msr) +{ + int i; + + for (i = 0; i < vmx->nmsrs; ++i) + if (vmx_msr_index[vmx->guest_msrs[i].index] == msr) + return i; + return -1; +} + +static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, uint32_t msr) +{ + int i; + + i = __find_msr_index(vmx, msr); + if (i >= 0) + return &vmx->guest_msrs[i]; + return NULL; +} + +/* + * Swap MSR entry in host/guest MSR entry array. + */ +static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) +{ + struct shared_msr_entry tmp; + + tmp = vmx->guest_msrs[to]; + vmx->guest_msrs[to] = vmx->guest_msrs[from]; + vmx->guest_msrs[from] = tmp; +} + +static int update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) +{ + uint64_t guest_efer; + uint64_t ignore_bits; + + guest_efer = vmx->vcpu.arch.efer; + + /* + * NX is emulated; LMA and LME handled by hardware; SCE meaninless + * outside long mode + */ + ignore_bits = EFER_NX | EFER_SCE; +#ifdef CONFIG_X86_64 + ignore_bits |= EFER_LMA | EFER_LME; + /* SCE is meaningful only in long mode on Intel */ + if (guest_efer & EFER_LMA) + ignore_bits &= ~(uint64_t)EFER_SCE; +#endif + guest_efer &= ~ignore_bits; + guest_efer |= host_efer & ignore_bits; + vmx->guest_msrs[efer_offset].data = guest_efer; + vmx->guest_msrs[efer_offset].mask = ~ignore_bits; + return 1; +} + +/* + * Set up the vmcs to automatically save and restore system + * msrs. Don't touch the 64-bit msrs if the guest is in legacy + * mode, as fiddling with msrs is very expensive. 
+ */ +void setup_msrs(struct vcpu_vmx *vmx) +{ + int save_nmsrs, index; + unsigned long *msr_bitmap; + + vmx_load_host_state(vmx); + save_nmsrs = 0; +#ifdef CONFIG_X86_64 + if (is_long_mode(&vmx->vcpu)) { + index = __find_msr_index(vmx, MSR_SYSCALL_MASK); + if (index >= 0) + move_msr_up(vmx, index, save_nmsrs++); + index = __find_msr_index(vmx, MSR_LSTAR); + if (index >= 0) + move_msr_up(vmx, index, save_nmsrs++); + index = __find_msr_index(vmx, MSR_CSTAR); + if (index >= 0) + move_msr_up(vmx, index, save_nmsrs++); + index = __find_msr_index(vmx, MSR_TSC_AUX); + if (index >= 0 && vmx->rdtscp_enabled) + move_msr_up(vmx, index, save_nmsrs++); + /* + * MSR_K6_STAR is only needed on long mode guests, and only + * if efer.sce is enabled. + */ + index = __find_msr_index(vmx, MSR_K6_STAR); + if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE)) + move_msr_up(vmx, index, save_nmsrs++); + } +#endif + index = __find_msr_index(vmx, MSR_EFER); + if (index >= 0 && update_transition_efer(vmx, index)) + move_msr_up(vmx, index, save_nmsrs++); + + vmx->save_nmsrs = save_nmsrs; + + if (cpu_has_vmx_msr_bitmap()) { + if (is_long_mode(&vmx->vcpu)) + msr_bitmap = vmx_msr_bitmap_longmode; + else + msr_bitmap = vmx_msr_bitmap_legacy; + + vmcs_write64(MSR_BITMAP, kvm_va2pa((caddr_t)msr_bitmap)); + } +} + +void vmx_set_efer(struct kvm_vcpu *vcpu, uint64_t efer) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); + + if (!msr) + return; + + /* + * Force kernel_gs_base reloading before EFER changes, as control + * of this msr depends on is_long_mode(). + */ + vmx_load_host_state(to_vmx(vcpu)); + vcpu->arch.efer = efer; + if (efer & EFER_LMA) { + vmcs_write32(VM_ENTRY_CONTROLS, + vmcs_read32(VM_ENTRY_CONTROLS) | + VM_ENTRY_IA32E_MODE); + msr->data = efer; + } else { + vmcs_write32(VM_ENTRY_CONTROLS, + vmcs_read32(VM_ENTRY_CONTROLS) & + ~VM_ENTRY_IA32E_MODE); + + msr->data = efer & ~EFER_LME; + } + setup_msrs(vmx); +} + +static inline int is_protmode(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr0_bits(vcpu, X86_CR0_PE); +} + + +#ifdef CONFIG_KVM_APIC_ARCHITECTURE +int kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) +{ + return vcpu->kvm->bsp_vcpu_id == vcpu->vcpu_id; +} +#endif + +void kvm_pic_clear_isr_ack(struct kvm *kvm) +{ + struct kvm_pic *s = pic_irqchip(kvm); + + mutex_enter(&s->lock); + s->pics[0].isr_ack = 0xff; + s->pics[1].isr_ack = 0xff; + mutex_exit(&s->lock); +} + +int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + int mmu_reset_needed = 0; + int pending_vec, max_bits; + struct descriptor_table dt; + + vcpu_load(vcpu); + + dt.limit = sregs->idt.limit; + dt.base = sregs->idt.base; + kvm_x86_ops->set_idt(vcpu, &dt); + dt.limit = sregs->gdt.limit; + dt.base = sregs->gdt.base; + kvm_x86_ops->set_gdt(vcpu, &dt); + + vcpu->arch.cr2 = sregs->cr2; + mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; + vcpu->arch.cr3 = sregs->cr3; + + kvm_set_cr8(vcpu, sregs->cr8); + + mmu_reset_needed |= vcpu->arch.efer != sregs->efer; + kvm_x86_ops->set_efer(vcpu, sregs->efer); + kvm_set_apic_base(vcpu, sregs->apic_base); + + mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; + kvm_x86_ops->set_cr0(vcpu, sregs->cr0); + vcpu->arch.cr0 = sregs->cr0; + + mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; + kvm_x86_ops->set_cr4(vcpu, sregs->cr4); + if (!is_long_mode(vcpu) && is_pae(vcpu)) { + load_pdptrs(vcpu, vcpu->arch.cr3); + mmu_reset_needed = 1; + } + + if (mmu_reset_needed) + kvm_mmu_reset_context(vcpu); + + max_bits = (sizeof 
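/*
 * Editor's note (illustrative sketch, not part of this patch): the
 * interrupt_bitmap handed in by userspace has one bit per interrupt vector
 * (256 bits for the usual KVM layout), so max_bits is just
 * sizeof (interrupt_bitmap) * 8.  bt_getlowbit() then locates the
 * lowest-numbered pending vector; if, say, only bit 0x30 is set, the code
 * below re-queues vector 0x30 as a hard (non-soft) interrupt via
 * kvm_queue_interrupt() and clears the in-kernel PIC's ISR-ack state.
 */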
sregs->interrupt_bitmap) << 3; + pending_vec = bt_getlowbit( + (const unsigned long *)sregs->interrupt_bitmap, 0, max_bits); + if (pending_vec < max_bits) { + kvm_queue_interrupt(vcpu, pending_vec, 0); + cmn_err(CE_NOTE, "Set back pending irq %d\n", pending_vec); + if (irqchip_in_kernel(vcpu->kvm)) + kvm_pic_clear_isr_ack(vcpu->kvm); + } + + kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); + kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); + kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES); + kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); + kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); + kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); + + kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); + kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); + + update_cr8_intercept(vcpu); + +#ifdef CONFIG_KVM_APIC_ARCHITECTURE + /* Older userspace won't unhalt the vcpu on reset. */ + if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && + sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && + !is_protmode(vcpu)) + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; +#endif /*CONFIG_KVM_APIC_ARCHITECTURE*/ + + vcpu_put(vcpu); + + return 0; +} + +static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) +{ + static int version; + struct pvclock_wall_clock wc; + struct timespec boot; + +#ifdef XXX + if (!wall_clock) + return; + + version++; + + kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); + + /* + * The guest calculates current wall clock time by adding + * system time (updated by kvm_write_guest_time below) to the + * wall clock specified here. guest system time equals host + * system time for us, thus we must fill in host boot time here. + */ + getboottime(&boot); + + wc.sec = boot.tv_sec; + wc.nsec = boot.tv_nsec; + wc.version = version; + + kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); + + version++; + kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); +#endif /*XXX*/ +} + +static int next_segment(unsigned long len, int offset) +{ + if (len > PAGESIZE - offset) + return PAGESIZE - offset; + else + return len; +} + + +void mark_page_dirty(struct kvm *kvm, gfn_t gfn) +{ + struct kvm_memory_slot *memslot; + +#ifdef XXX + gfn = unalias_gfn(kvm, gfn); + memslot = gfn_to_memslot_unaliased(kvm, gfn); + if (memslot && memslot->dirty_bitmap) { + unsigned long rel_gfn = gfn - memslot->base_gfn; + unsigned long *p = memslot->dirty_bitmap + + rel_gfn / BT_NBIPUL; + int offset = rel_gfn % BT_NBIPUL; + + /* avoid RMW */ + if (!generic_test_le_bit(offset, p)) + generic___set_le_bit(offset, p); + } +#endif /*XXX*/ +} + +int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, + int offset, int len) +{ + int r; + unsigned long addr; + + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return -EFAULT; + r = copyout(data, (caddr_t)((uint64_t)addr + offset), len); + if (r) + return -EFAULT; + mark_page_dirty(kvm, gfn); + return 0; +} + +int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, + unsigned long len) +{ + gfn_t gfn = gpa >> PAGESHIFT; + int seg; + int offset = offset_in_page(gpa); + int ret; + + while ((seg = next_segment(len, offset)) != 0) { + ret = kvm_write_guest_page(kvm, gfn, data, offset, seg); + if (ret < 0) + return ret; + offset = 0; + len -= seg; + data += seg; + ++gfn; + } + return 0; +} + +static int xen_hvm_config(struct kvm_vcpu *vcpu, uint64_t data) +{ + struct kvm *kvm = vcpu->kvm; + int lm = is_long_mode(vcpu); + uint8_t *blob_addr = lm ? 
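/*
 * Editor's note (illustrative sketch, not part of this patch):
 * kvm_write_guest() above copies an arbitrarily sized, arbitrarily aligned
 * buffer by clamping each chunk to the end of the current guest page with
 * next_segment().  For a 6000-byte write starting at gpa 0x1f00
 * (PAGESIZE = 0x1000):
 *
 *	chunk 1: gfn 0x1, offset 0xf00, len 0x100   (up to the page boundary)
 *	chunk 2: gfn 0x2, offset 0,     len 0x1000  (a full page)
 *	chunk 3: gfn 0x3, offset 0,     len 0x670   (the remainder)
 *
 * Each chunk goes through gfn_to_hva() + copyout() in kvm_write_guest_page()
 * and marks its page dirty.  xen_hvm_config() below relies on this to copy
 * a whole blob page into the guest.
 */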
(uint8_t *)(long)kvm->arch.xen_hvm_config.blob_addr_64 + : (uint8_t *)(long)kvm->arch.xen_hvm_config.blob_addr_32; + uint8_t blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64 + : kvm->arch.xen_hvm_config.blob_size_32; + uint32_t page_num = data & ~PAGEMASK; + uint64_t page_addr = data & PAGEMASK; + uint8_t *page; + int r; + + r = E2BIG; + if (page_num >= blob_size) + goto out; + r = ENOMEM; + page = kmem_alloc(PAGESIZE, KM_SLEEP); + if (!page) + goto out; + r = EFAULT; + if (copyin(blob_addr + (page_num * PAGESIZE), page, PAGESIZE)) + goto out_free; + if (kvm_write_guest(kvm, page_addr, page, PAGESIZE)) + goto out_free; + r = 0; +out_free: + kmem_free(page, PAGESIZE); +out: + return r; +} + +int ignore_msrs = 0; +extern int is_paging(struct kvm_vcpu *vcpu); + +static void set_efer(struct kvm_vcpu *vcpu, uint64_t efer) +{ + if (efer & efer_reserved_bits) { + kvm_inject_gp(vcpu, 0); + return; + } + + if (is_paging(vcpu) + && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) { + kvm_inject_gp(vcpu, 0); + return; + } + + if (efer & EFER_FFXSR) { + struct kvm_cpuid_entry2 *feat; + + feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) { + kvm_inject_gp(vcpu, 0); + return; + } + } + + if (efer & EFER_SVME) { + struct kvm_cpuid_entry2 *feat; + + feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) { + kvm_inject_gp(vcpu, 0); + return; + } + } + + kvm_x86_ops->set_efer(vcpu, efer); + + efer &= ~EFER_LMA; + efer |= vcpu->arch.efer & EFER_LMA; + + vcpu->arch.efer = efer; + + vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; + kvm_mmu_reset_context(vcpu); +} + +static int msr_mtrr_valid(unsigned msr) +{ + switch (msr) { + case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1: + case MSR_MTRRfix64K_00000: + case MSR_MTRRfix16K_80000: + case MSR_MTRRfix16K_A0000: + case MSR_MTRRfix4K_C0000: + case MSR_MTRRfix4K_C8000: + case MSR_MTRRfix4K_D0000: + case MSR_MTRRfix4K_D8000: + case MSR_MTRRfix4K_E0000: + case MSR_MTRRfix4K_E8000: + case MSR_MTRRfix4K_F0000: + case MSR_MTRRfix4K_F8000: + case MSR_MTRRdefType: + case MSR_IA32_CR_PAT: + return 1; + case 0x2f8: + return 1; + } + return 0; +} + + +static int valid_pat_type(unsigned t) +{ + return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */ +} + +static int valid_mtrr_type(unsigned t) +{ + return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */ +} + +static int mtrr_valid(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data) +{ + int i; + + if (!msr_mtrr_valid(msr)) + return 0; + + if (msr == MSR_IA32_CR_PAT) { + for (i = 0; i < 8; i++) + if (!valid_pat_type((data >> (i * 8)) & 0xff)) + return 0; + return 1; + } else if (msr == MSR_MTRRdefType) { + if (data & ~0xcff) + return 0; + return valid_mtrr_type(data & 0xff); + } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) { + for (i = 0; i < 8 ; i++) + if (!valid_mtrr_type((data >> (i * 8)) & 0xff)) + return 0; + return 1; + } + + /* variable MTRRs */ + return valid_mtrr_type(data & 0xff); +} + + +static int set_msr_mtrr(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data) +{ + uint64_t *p = (uint64_t *)&vcpu->arch.mtrr_state.fixed_ranges; + + if (!mtrr_valid(vcpu, msr, data)) + return 1; + + if (msr == MSR_MTRRdefType) { + vcpu->arch.mtrr_state.def_type = data; + vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10; + } else if (msr == MSR_MTRRfix64K_00000) + p[0] = data; + else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) + p[1 + msr - MSR_MTRRfix16K_80000] = 
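/*
 * Editor's note (illustrative sketch, not part of this patch): the
 * variable-range MTRRs come in base/mask pairs starting at MSR 0x200
 * (MTRRphysBase0, MTRRphysMask0, MTRRphysBase1, ...), which is what the
 * idx/is_mtrr_mask arithmetic further below decodes:
 *
 *	msr 0x202 (MTRRphysBase1): idx = (0x202 - 0x200) / 2 = 1, is_mtrr_mask = 0
 *	msr 0x203 (MTRRphysMask1): idx = 1, is_mtrr_mask = 1
 *
 * The fixed-range registers are packed into mtrr_state.fixed_ranges:
 * slot 0 for MTRRfix64K_00000, slots 1-2 for the two 16K registers, and
 * slots 3-10 for the eight 4K registers.
 */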
data; + else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) + p[3 + msr - MSR_MTRRfix4K_C0000] = data; + else if (msr == MSR_IA32_CR_PAT) + vcpu->arch.pat = data; + else { /* Variable MTRRs */ + int idx, is_mtrr_mask; + uint64_t *pt; + + idx = (msr - 0x200) / 2; + is_mtrr_mask = msr - 0x200 - 2 * idx; + if (!is_mtrr_mask) + pt = + (uint64_t *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; + else + pt = + (uint64_t *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; + *pt = data; + } + +#ifdef XXX + kvm_mmu_reset_context(vcpu); +#endif /*XXX*/ + return 0; +} + +static int set_msr_hyperv(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data) +{ + switch (msr) { +#ifdef XXX + case HV_X64_MSR_APIC_ASSIST_PAGE: { + unsigned long addr; + + if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) { + vcpu->arch.hv_vapic = data; + break; + } + addr = gfn_to_hva(vcpu->kvm, data >> + HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT); + if (kvm_is_error_hva(addr)) + return 1; + if (clear_user((void __user *)addr, PAGESIZE)) + return 1; + vcpu->arch.hv_vapic = data; + break; + } + case HV_X64_MSR_EOI: + return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data); + case HV_X64_MSR_ICR: + return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data); + case HV_X64_MSR_TPR: + return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); +#endif /*XXX*/ + default: + cmn_err(CE_WARN, "HYPER-V unimplemented wrmsr: 0x%x " + "data 0x%llx\n", msr, data); + return 1; + } + + return 0; +} + +static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data) +{ + struct kvm *kvm = vcpu->kvm; + + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + kvm->arch.hv_guest_os_id = data; + /* setting guest os id to zero disables hypercall page */ + if (!kvm->arch.hv_guest_os_id) + kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE; + break; + case HV_X64_MSR_HYPERCALL: { + uint64_t gfn; + unsigned long addr; + uint8_t instructions[4]; + + /* if guest os id is not set hypercall should remain disabled */ + if (!kvm->arch.hv_guest_os_id) + break; + if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) { + kvm->arch.hv_hypercall = data; + break; + } + gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT; + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return 1; + kvm_x86_ops->patch_hypercall(vcpu, instructions); + ((unsigned char *)instructions)[3] = 0xc3; /* ret */ + if (copyout(instructions, (caddr_t)addr, 4)) + return 1; + kvm->arch.hv_hypercall = data; + break; + } + default: + cmn_err(CE_WARN, "HYPER-V unimplemented wrmsr: 0x%x " + "data 0x%llx\n", msr, data); + return 1; + } + return 0; +} + +static int set_msr_mce(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data) +{ + uint64_t mcg_cap = vcpu->arch.mcg_cap; + unsigned bank_num = mcg_cap & 0xff; + + switch (msr) { + case MSR_IA32_MCG_STATUS: + vcpu->arch.mcg_status = data; + break; + case MSR_IA32_MCG_CTL: + if (!(mcg_cap & MCG_CTL_P)) + return 1; + if (data != 0 && data != ~(uint64_t)0) + return -1; + vcpu->arch.mcg_ctl = data; + break; + default: + if (msr >= MSR_IA32_MC0_CTL && + msr < MSR_IA32_MC0_CTL + 4 * bank_num) { + uint32_t offset = msr - MSR_IA32_MC0_CTL; + /* only 0 or all 1s can be written to IA32_MCi_CTL + * some Linux kernels though clear bit 10 in bank 4 to + * workaround a BIOS/GART TBL issue on AMD K8s, ignore + * this to avoid an uncatched #GP in the guest + */ + if ((offset & 0x3) == 0 && + data != 0 && (data | (1 << 10)) != ~(uint64_t)0) + return -1; + vcpu->arch.mce_banks[offset] = data; + break; + } + return 1; + } + return 0; +} + +static int 
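/*
 * Editor's note (illustrative sketch, not part of this patch): Hyper-V
 * enlightenment MSRs split into partition-wide registers (GUEST_OS_ID,
 * HYPERCALL), which live in struct kvm and therefore need kvm->lock, and
 * per-vcpu registers (EOI, ICR, TPR, VP index), which only touch the vcpu.
 * The predicate defined just below lets the common MSR paths pick the
 * right handler and locking, as kvm_set_msr_common() does later:
 *
 *	if (kvm_hv_msr_partition_wide(msr)) {
 *		mutex_enter(&vcpu->kvm->lock);
 *		r = set_msr_hyperv_pw(vcpu, msr, data);
 *		mutex_exit(&vcpu->kvm->lock);
 *	} else
 *		r = set_msr_hyperv(vcpu, msr, data);
 */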
kvm_hv_msr_partition_wide(uint32_t msr) +{ + int r = 0; + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + case HV_X64_MSR_HYPERCALL: + r = 1; + break; + } + + return r; +} + + +static inline void get_page(caddr_t page) +{ +} + +struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) +{ + pfn_t pfn; + + pfn = gfn_to_pfn(kvm, gfn); +#ifdef XXX + if (!kvm_is_mmio_pfn(pfn)) + return pfn_to_page(pfn); +#endif /*XXX*/ + + get_page(bad_page); + return (struct page *)bad_page; +} + + +int kvm_set_msr_common(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data) +{ + volatile int x; + + switch (msr) { + case MSR_EFER: + set_efer(vcpu, data); + break; + case MSR_K7_HWCR: + data &= ~(uint64_t)0x40; /* ignore flush filter disable */ + if (data != 0) { + cmn_err(CE_NOTE, "unimplemented HWCR wrmsr: 0x%llx\n", + data); + return 1; + } + break; + case MSR_FAM10H_MMIO_CONF_BASE: + if (data != 0) { + cmn_err(CE_NOTE, "unimplemented MMIO_CONF_BASE wrmsr: " + "0x%llx\n", data); + return 1; + } + break; + case MSR_AMD64_NB_CFG: + break; + case MSR_IA32_DEBUGCTLMSR: + if (!data) { + /* We support the non-activated case already */ + break; + } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) { + /* Values other than LBR and BTF are vendor-specific, + thus reserved and should throw a #GP */ + return 1; + } + cmn_err(CE_NOTE, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", + __func__, data); + break; + case MSR_IA32_UCODE_REV: + case MSR_IA32_UCODE_WRITE: + case MSR_VM_HSAVE_PA: + case MSR_AMD64_PATCH_LOADER: + break; + case 0x200 ... 0x2ff: + return set_msr_mtrr(vcpu, msr, data); + case MSR_IA32_APICBASE: + kvm_set_apic_base(vcpu, data); + break; +#ifdef XXX + case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: + return kvm_x2apic_msr_write(vcpu, msr, data); +#endif /*XXX*/ + case MSR_IA32_MISC_ENABLE: + vcpu->arch.ia32_misc_enable_msr = data; + break; + case MSR_KVM_WALL_CLOCK: + vcpu->kvm->arch.wall_clock = data; + kvm_write_wall_clock(vcpu->kvm, data); + break; + case MSR_KVM_SYSTEM_TIME: { +#ifdef XXX + if (vcpu->arch.time_page) { + kvm_release_page_dirty(vcpu->arch.time_page); + vcpu->arch.time_page = NULL; + } +#endif /*XXX*/ + + vcpu->arch.time = data; + + /* we verify if the enable bit is set... */ + if (!(data & 1)) + break; + + /* ...but clean it before doing the actual write */ + vcpu->arch.time_offset = data & ~(PAGEOFFSET | 1); +#ifdef XXX + vcpu->arch.time_page = + gfn_to_page(vcpu->kvm, data >> PAGESHIFT); + + if (is_error_page(vcpu->arch.time_page)) { + kvm_release_page_clean(vcpu->arch.time_page); + vcpu->arch.time_page = NULL; + } + + kvm_request_guest_time_update(vcpu); +#endif /*XXX*/ + break; + } + case MSR_IA32_MCG_CTL: + case MSR_IA32_MCG_STATUS: + case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: + return set_msr_mce(vcpu, msr, data); + + /* Performance counters are not protected by a CPUID bit, + * so we should check all of them in the generic path for the sake of + * cross vendor migration. + * Writing a zero into the event select MSRs disables them, + * which we perfectly emulate ;-). Any other value should be at least + * reported, some guests depend on them. + */ + case MSR_P6_EVNTSEL0: + case MSR_P6_EVNTSEL1: + case MSR_K7_EVNTSEL0: + case MSR_K7_EVNTSEL1: + case MSR_K7_EVNTSEL2: + case MSR_K7_EVNTSEL3: + if (data != 0) + cmn_err(CE_NOTE, "unimplemented perfctr wrmsr: " + "0x%x data 0x%llx\n", msr, data); + break; + /* at least RHEL 4 unconditionally writes to the perfctr registers, + * so we ignore writes to make it happy. 
+ */ + case MSR_P6_PERFCTR0: + case MSR_P6_PERFCTR1: + case MSR_K7_PERFCTR0: + case MSR_K7_PERFCTR1: + case MSR_K7_PERFCTR2: + case MSR_K7_PERFCTR3: + cmn_err(CE_NOTE, "unimplemented perfctr wrmsr: " + "0x%x data 0x%llx\n", msr, data); + break; + case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + if (kvm_hv_msr_partition_wide(msr)) { + int r; + mutex_enter(&vcpu->kvm->lock); + r = set_msr_hyperv_pw(vcpu, msr, data); + mutex_exit(&vcpu->kvm->lock); + return r; + } else + return set_msr_hyperv(vcpu, msr, data); + break; + default: + if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) + return xen_hvm_config(vcpu, data); + if (!ignore_msrs) { + cmn_err(CE_NOTE, "unhandled wrmsr: 0x%x data %llx\n", + msr, data); + return 1; + } else { + cmn_err(CE_NOTE, "ignored wrmsr: 0x%x data %llx\n", + msr, data); + break; + } + } + x = 10; /*XXX*/ + return 0; +} + + + +static int get_msr_mtrr(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata) +{ + uint64_t *p = (uint64_t *)&vcpu->arch.mtrr_state.fixed_ranges; + + if (!msr_mtrr_valid(msr)) + return 1; + + if (msr == MSR_MTRRdefType) + *pdata = vcpu->arch.mtrr_state.def_type + + (vcpu->arch.mtrr_state.enabled << 10); + else if (msr == MSR_MTRRfix64K_00000) + *pdata = p[0]; + else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) + *pdata = p[1 + msr - MSR_MTRRfix16K_80000]; + else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) + *pdata = p[3 + msr - MSR_MTRRfix4K_C0000]; + else if (msr == MSR_IA32_CR_PAT) + *pdata = vcpu->arch.pat; + else { /* Variable MTRRs */ + int idx, is_mtrr_mask; + uint64_t *pt; + + idx = (msr - 0x200) / 2; + is_mtrr_mask = msr - 0x200 - 2 * idx; + if (!is_mtrr_mask) + pt = + (uint64_t *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; + else + pt = + (uint64_t *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; + *pdata = *pt; + } + + return 0; +} + + + +static int get_msr_hyperv(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata) +{ + uint64_t data = 0; + + switch (msr) { +#ifdef XXX + case HV_X64_MSR_VP_INDEX: { + int r; + struct kvm_vcpu *v; + kvm_for_each_vcpu(r, v, vcpu->kvm) + if (v == vcpu) + data = r; + break; + } + case HV_X64_MSR_EOI: + return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata); + case HV_X64_MSR_ICR: + return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata); + case HV_X64_MSR_TPR: + return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); +#endif /*XXX*/ + default: + cmn_err(CE_WARN, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + return 1; + } + *pdata = data; + return 0; +} + +static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata) +{ + uint64_t data = 0; + struct kvm *kvm = vcpu->kvm; + + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + data = kvm->arch.hv_guest_os_id; + break; + case HV_X64_MSR_HYPERCALL: + data = kvm->arch.hv_hypercall; + break; + default: + cmn_err(CE_WARN, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + return 1; + } + + *pdata = data; + return 0; +} + +static int get_msr_mce(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata) +{ + uint64_t data; + uint64_t mcg_cap = vcpu->arch.mcg_cap; + unsigned bank_num = mcg_cap & 0xff; + + switch (msr) { + case MSR_IA32_P5_MC_ADDR: + case MSR_IA32_P5_MC_TYPE: + data = 0; + break; + case MSR_IA32_MCG_CAP: + data = vcpu->arch.mcg_cap; + break; + case MSR_IA32_MCG_CTL: + if (!(mcg_cap & MCG_CTL_P)) + return 1; + data = vcpu->arch.mcg_ctl; + break; + case MSR_IA32_MCG_STATUS: + data = vcpu->arch.mcg_status; + break; + default: + if (msr >= MSR_IA32_MC0_CTL && + msr < MSR_IA32_MC0_CTL + 4 * bank_num) { + 
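/*
 * Editor's note (illustrative sketch, not part of this patch): the
 * machine-check bank MSRs are laid out as four consecutive registers per
 * bank starting at MSR_IA32_MC0_CTL, so "offset" below indexes directly
 * into vcpu->arch.mce_banks[]:
 *
 *	offset = msr - MSR_IA32_MC0_CTL;
 *	bank   = offset / 4;
 *	reg    = offset % 4;	// 0 = CTL, 1 = STATUS, 2 = ADDR, 3 = MISC
 *
 * e.g. MC2_STATUS is offset 9: bank 2, register 1.  This is also why the
 * write side only accepts 0 or all-ones for the (offset & 0x3) == 0 CTL
 * registers.
 */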
uint32_t offset = msr - MSR_IA32_MC0_CTL; + data = vcpu->arch.mce_banks[offset]; + break; + } + return 1; + } + *pdata = data; + return 0; +} + + +int kvm_get_msr_common(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata) +{ + uint64_t data; + volatile int x; /*XXX - dtrace return probe is not there... */ + + switch (msr) { + case MSR_IA32_PLATFORM_ID: + case MSR_IA32_UCODE_REV: + case MSR_IA32_EBL_CR_POWERON: + case MSR_IA32_DEBUGCTLMSR: + case MSR_IA32_LASTBRANCHFROMIP: + case MSR_IA32_LASTBRANCHTOIP: + case MSR_IA32_LASTINTFROMIP: + case MSR_IA32_LASTINTTOIP: + case MSR_K8_SYSCFG: + case MSR_K7_HWCR: + case MSR_VM_HSAVE_PA: + case MSR_P6_PERFCTR0: + case MSR_P6_PERFCTR1: + case MSR_P6_EVNTSEL0: + case MSR_P6_EVNTSEL1: + case MSR_K7_EVNTSEL0: + case MSR_K7_PERFCTR0: + case MSR_K8_INT_PENDING_MSG: + case MSR_AMD64_NB_CFG: + case MSR_FAM10H_MMIO_CONF_BASE: + data = 0; + break; + case MSR_MTRRcap: + data = 0x500 | KVM_NR_VAR_MTRR; + break; + case 0x200 ... 0x2ff: + return get_msr_mtrr(vcpu, msr, pdata); + case 0xcd: /* fsb frequency */ + data = 3; + break; + case MSR_IA32_APICBASE: + data = kvm_get_apic_base(vcpu); + break; +#ifdef XXX + case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: + return kvm_x2apic_msr_read(vcpu, msr, pdata); + break; +#endif /*XXX*/ + case MSR_IA32_MISC_ENABLE: + data = vcpu->arch.ia32_misc_enable_msr; + break; + case MSR_IA32_PERF_STATUS: + /* TSC increment by tick */ + data = 1000ULL; + /* CPU multiplier */ + data |= (((uint64_t)4ULL) << 40); + break; + case MSR_EFER: + data = vcpu->arch.efer; + break; + case MSR_KVM_WALL_CLOCK: + data = vcpu->kvm->arch.wall_clock; + break; + case MSR_KVM_SYSTEM_TIME: + data = vcpu->arch.time; + break; + case MSR_IA32_P5_MC_ADDR: + case MSR_IA32_P5_MC_TYPE: + case MSR_IA32_MCG_CAP: + case MSR_IA32_MCG_CTL: + case MSR_IA32_MCG_STATUS: + case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: + return get_msr_mce(vcpu, msr, pdata); + case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + if (kvm_hv_msr_partition_wide(msr)) { + int r; + mutex_enter(&vcpu->kvm->lock); + r = get_msr_hyperv_pw(vcpu, msr, pdata); + mutex_exit(&vcpu->kvm->lock); + return r; + } else + return get_msr_hyperv(vcpu, msr, pdata); + break; + default: + if (!ignore_msrs) { + cmn_err(CE_NOTE, "unhandled rdmsr: 0x%x\n", msr); + return 1; + } else { + cmn_err(CE_NOTE, "ignored rdmsr: 0x%x\n", msr); + data = 0; + } + break; + } + *pdata = data; + x = 10; /*XXX*/ + return 0; +} + +/* + * Read or write a bunch of msrs. All parameters are kernel addresses. + * + * @return number of msrs set successfully. + */ +static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, + struct kvm_msr_entry *entries, + int (*do_msr)(struct kvm_vcpu *vcpu, + unsigned index, uint64_t *data)) +{ + int i, idx; + + vcpu_load(vcpu); + +#ifdef XXX + idx = srcu_read_lock(&vcpu->kvm->srcu); +#endif + for (i = 0; i < msrs->nmsrs; ++i) + if (do_msr(vcpu, entries[i].index, &entries[i].data)) + break; +#ifdef XXX + srcu_read_unlock(&vcpu->kvm->srcu, idx); +#endif + vcpu_put(vcpu); + + return i; +} + +/* + * reads and returns guest's timestamp counter "register" + * guest_tsc = host_tsc + tsc_offset -- 21.3 + */ +static uint64_t guest_read_tsc(void) +{ + uint64_t host_tsc, tsc_offset; + + rdtscll(host_tsc); + tsc_offset = vmcs_read64(TSC_OFFSET); + return host_tsc + tsc_offset; +} + + +/* + * Reads an msr value (of 'msr_index') into 'pdata'. + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. 
+ */ +static int vmx_get_msr(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t *pdata) +{ + uint64_t data; + struct shared_msr_entry *msr; + + if (!pdata) { + cmn_err(CE_WARN, "BUG: get_msr called with NULL pdata\n"); + return EINVAL; + } + + switch (msr_index) { +#ifdef CONFIG_X86_64 + case MSR_FS_BASE: + data = vmcs_readl(GUEST_FS_BASE); + break; + case MSR_GS_BASE: + data = vmcs_readl(GUEST_GS_BASE); + break; + case MSR_KERNEL_GS_BASE: + vmx_load_host_state((struct vcpu_vmx *)vcpu); + data = ((struct vcpu_vmx *)(vcpu))->msr_guest_kernel_gs_base; + break; +#endif + case MSR_EFER: + return kvm_get_msr_common(vcpu, msr_index, pdata); + case MSR_IA32_TSC: + data = guest_read_tsc(); + break; + case MSR_IA32_SYSENTER_CS: + data = vmcs_read32(GUEST_SYSENTER_CS); + break; + case MSR_IA32_SYSENTER_EIP: + data = vmcs_readl(GUEST_SYSENTER_EIP); + break; + case MSR_IA32_SYSENTER_ESP: + data = vmcs_readl(GUEST_SYSENTER_ESP); + break; + case MSR_TSC_AUX: + if (!((struct vcpu_vmx *)(vcpu))->rdtscp_enabled) + return 1; + /* Otherwise falls through */ + default: + vmx_load_host_state((struct vcpu_vmx *)vcpu); + msr = find_msr_entry((struct vcpu_vmx *)vcpu, msr_index); + if (msr) { + vmx_load_host_state((struct vcpu_vmx *)vcpu); + data = msr->data; + break; + } + return kvm_get_msr_common(vcpu, msr_index, pdata); + } + + *pdata = data; + return 0; +} + +/* + * Reads an msr value (of 'msr_index') into 'pdata'. + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. + */ +int kvm_get_msr(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t *pdata) +{ + return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); +} + + +/* + * Writes msr value into into the appropriate "register". + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. + */ +static int vmx_set_msr(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t data) +{ + struct vcpu_vmx *vmx = (struct vcpu_vmx *)vcpu; + struct shared_msr_entry *msr; + uint64_t host_tsc; + int ret = 0; + + switch (msr_index) { + case MSR_EFER: + vmx_load_host_state(vmx); + ret = kvm_set_msr_common(vcpu, msr_index, data); + break; +#ifdef CONFIG_X86_64 + case MSR_FS_BASE: + vmcs_writel(GUEST_FS_BASE, data); + break; + case MSR_GS_BASE: + vmcs_writel(GUEST_GS_BASE, data); + break; + case MSR_KERNEL_GS_BASE: + vmx_load_host_state(vmx); + vmx->msr_guest_kernel_gs_base = data; + break; +#endif + case MSR_IA32_SYSENTER_CS: + vmcs_write32(GUEST_SYSENTER_CS, data); + break; + case MSR_IA32_SYSENTER_EIP: + vmcs_writel(GUEST_SYSENTER_EIP, data); + break; + case MSR_IA32_SYSENTER_ESP: + vmcs_writel(GUEST_SYSENTER_ESP, data); + break; + case MSR_IA32_TSC: + rdtscll(host_tsc); + guest_write_tsc(data, host_tsc); + break; + case MSR_IA32_CR_PAT: + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { + vmcs_write64(GUEST_IA32_PAT, data); + vcpu->arch.pat = data; + break; + } + ret = kvm_set_msr_common(vcpu, msr_index, data); + break; + case MSR_TSC_AUX: + if (!vmx->rdtscp_enabled) + return 1; + /* Check reserved bit, higher 32 bits should be zero */ + if ((data >> 32) != 0) + return 1; + /* Otherwise falls through */ + default: + msr = find_msr_entry(vmx, msr_index); + if (msr) { + vmx_load_host_state(vmx); + msr->data = data; + break; + } + ret = kvm_set_msr_common(vcpu, msr_index, data); + } + + return ret; +} + +/* + * Writes msr value into into the appropriate "register". + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. 
+ */ +int kvm_set_msr(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t data) +{ + return kvm_x86_ops->set_msr(vcpu, msr_index, data); +} + +/* + * Adapt set_msr() to msr_io()'s calling convention + */ +static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, uint64_t *data) +{ + return kvm_set_msr(vcpu, index, *data); +} + +static inline int is_machine_check(uint32_t intr_info) +{ + return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | + INTR_INFO_VALID_MASK)) == + (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK); +} + +/* + * Trigger machine check on the host. We assume all the MSRs are already set up + * by the CPU and that we still run on the same CPU as the MCE occurred on. + * We pass a fake environment to the machine check handler because we want + * the guest to be always treated like user space, no matter what context + * it used internally. + */ +static void kvm_machine_check(void) +{ +#ifdef XXX +#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64) + struct pt_regs regs = { + .cs = 3, /* Fake ring 3 no matter what the guest ran on */ + .flags = X86_EFLAGS_IF, + }; + + do_machine_check(®s, 0); +#endif +#endif /*XXX*/ +} + +static void vmcs_clear_bits(unsigned long field, uint32_t mask) +{ + vmcs_writel(field, vmcs_readl(field) & ~mask); +} + +static void vmcs_set_bits(unsigned long field, uint32_t mask) +{ + vmcs_writel(field, vmcs_readl(field) | mask); +} + +#define EXCPT_BENIGN 0 +#define EXCPT_CONTRIBUTORY 1 +#define EXCPT_PF 2 + +static int exception_class(int vector) +{ + switch (vector) { + case PF_VECTOR: + return EXCPT_PF; + case DE_VECTOR: + case TS_VECTOR: + case NP_VECTOR: + case SS_VECTOR: + case GP_VECTOR: + return EXCPT_CONTRIBUTORY; + default: + break; + } + return EXCPT_BENIGN; +} + +static void kvm_multiple_exception(struct kvm_vcpu *vcpu, + unsigned nr, int has_error, uint32_t error_code) +{ + uint32_t prev_nr; + int class1, class2; + + if (!vcpu->arch.exception.pending) { + queue: + vcpu->arch.exception.pending = 1; + vcpu->arch.exception.has_error_code = has_error; + vcpu->arch.exception.nr = nr; + vcpu->arch.exception.error_code = error_code; + return; + } + + /* to check exception */ + prev_nr = vcpu->arch.exception.nr; + if (prev_nr == DF_VECTOR) { + /* triple fault -> shutdown */ + BT_SET(&vcpu->requests, KVM_REQ_TRIPLE_FAULT); + return; + } + class1 = exception_class(prev_nr); + class2 = exception_class(nr); + if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) + || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) { + /* generate double fault per SDM Table 5-5 */ + vcpu->arch.exception.pending = 1; + vcpu->arch.exception.has_error_code = 1; + vcpu->arch.exception.nr = DF_VECTOR; + vcpu->arch.exception.error_code = 0; + } else + /* replace previous exception with a new one in a hope + that instruction re-execution will regenerate lost + exception */ + goto queue; +} + +void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) +{ + kvm_multiple_exception(vcpu, nr, 0, 0); +} + +void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, uint32_t error_code) +{ + kvm_multiple_exception(vcpu, nr, 1, error_code); +} + + +static void vmx_complete_interrupts(struct vcpu_vmx *vmx) +{ + uint32_t exit_intr_info; + uint32_t idt_vectoring_info = vmx->idt_vectoring_info; + int unblock_nmi; + uint8_t vector; + int type; + int idtv_info_valid; + + exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + + vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); + + /* Handle machine checks before interrupts are enabled */ + if 
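/*
 * Editor's note (illustrative sketch, not part of this patch):
 * kvm_multiple_exception() above encodes the SDM Table 5-5 rules for
 * merging a newly raised exception with one already pending.  Worked
 * examples with this code:
 *
 *	pending #PF, new #GP          -> queue #DF with error code 0
 *	pending #GP, new #TS          -> queue #DF (both contributory)
 *	pending #DF, new anything     -> request KVM_REQ_TRIPLE_FAULT (shutdown)
 *	pending #DB (benign), new #PF -> the #PF simply replaces the #DB,
 *	                                 relying on re-execution to regenerate it
 */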
((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY) + || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI + && is_machine_check(exit_intr_info))) + kvm_machine_check(); + + /* We need to handle NMIs before interrupts are enabled */ + if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && + (exit_intr_info & INTR_INFO_VALID_MASK)) + asm("int $2"); + + idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; + +#ifdef XXX + if (cpu_has_virtual_nmis()) { + unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; + vector = exit_intr_info & INTR_INFO_VECTOR_MASK; + /* + * SDM 3: 27.7.1.2 (September 2008) + * Re-set bit "block by NMI" before VM entry if vmexit caused by + * a guest IRET fault. + * SDM 3: 23.2.2 (September 2008) + * Bit 12 is undefined in any of the following cases: + * If the VM exit sets the valid bit in the IDT-vectoring + * information field. + * If the VM exit is due to a double fault. + */ + if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && + vector != DF_VECTOR && !idtv_info_valid) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + } else if (unlikely(vmx->soft_vnmi_blocked)) + vmx->vnmi_blocked_time += + ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); +#endif /*XXX*/ + vmx->vcpu.arch.nmi_injected = 0; +#ifdef XXX + kvm_clear_exception_queue(&vmx->vcpu); + kvm_clear_interrupt_queue(&vmx->vcpu); + + if (!idtv_info_valid) + return; +#endif /*XXX*/ + vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; + type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; + + switch (type) { + case INTR_TYPE_NMI_INTR: + vmx->vcpu.arch.nmi_injected = 1; + /* + * SDM 3: 27.7.1.2 (September 2008) + * Clear bit "block by NMI" before VM entry if a NMI + * delivery faulted. + */ + vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + break; + case INTR_TYPE_SOFT_EXCEPTION: + vmx->vcpu.arch.event_exit_inst_len = + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + /* fall through */ + case INTR_TYPE_HARD_EXCEPTION: +#ifdef XXX + if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { + uint32_t err = vmcs_read32(IDT_VECTORING_ERROR_CODE); + kvm_queue_exception_e(&vmx->vcpu, vector, err); + } else + kvm_queue_exception(&vmx->vcpu, vector); +#endif /*XXX*/ + break; + case INTR_TYPE_SOFT_INTR: + vmx->vcpu.arch.event_exit_inst_len = + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + /* fall through */ + case INTR_TYPE_EXT_INTR: +#ifdef XXX + kvm_queue_interrupt(&vmx->vcpu, vector, + type == INTR_TYPE_SOFT_INTR); +#endif /*XXX*/ + break; + default: + break; + } +} + +#ifdef CONFIG_X86_64 +#define R "r" +#define Q "q" +#else +#define R "e" +#define Q "l" +#endif + +/* + * Volatile isn't enough to prevent the compiler from reordering the + * read/write functions for the control registers and messing everything up. + * A memory clobber would solve the problem, but would prevent reordering of + * all loads stores around it, which can hurt performance. Solution is to + * use a variable and mimic reads and writes to it to enforce serialization + */ +static unsigned long __force_order; + +static inline unsigned long native_read_cr0(void) +{ + unsigned long val; + asm volatile("mov %%cr0,%0\n\t" : "=r" (val), "=m" (__force_order)); + return val; +} + +#define read_cr0() (native_read_cr0()) + +static void vmx_vcpu_run(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = (struct vcpu_vmx *)vcpu; + + /* Record the guest's net vcpu time for enforced NMI injections. 
*/ +#ifdef XXX + if (!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked) + vmx->entry_time = ktime_get(); + + /* Don't enter VMX if guest state is invalid, let the exit handler + start emulation until we arrive back to a valid state */ + if (vmx->emulation_required && emulate_invalid_guest_state) + return; + + if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) + vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); + if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) + vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); + + /* When single-stepping over STI and MOV SS, we must clear the + * corresponding interruptibility bits in the guest state. Otherwise + * vmentry fails as it then expects bit 14 (BS) in pending debug + * exceptions being set, but that's not correct for the guest debugging + * case. */ + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + vmx_set_interrupt_shadow(vcpu, 0); +#endif /*XXX*/ + + /* + * Loading guest fpu may have cleared host cr0.ts + */ + vmcs_writel(HOST_CR0, read_cr0()); + + asm( + /* Store host registers */ + "push %%"R"dx; push %%"R"bp;" + "push %%"R"cx \n\t" + "cmp %%"R"sp, %c[host_rsp](%0) \n\t" + "je 1f \n\t" + "mov %%"R"sp, %c[host_rsp](%0) \n\t" + __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" + "1: \n\t" + /* Reload cr2 if changed */ + "mov %c[cr2](%0), %%"R"ax \n\t" + "mov %%cr2, %%"R"dx \n\t" + "cmp %%"R"ax, %%"R"dx \n\t" + "je 2f \n\t" + "mov %%"R"ax, %%cr2 \n\t" + "2: \n\t" + /* Check if vmlaunch of vmresume is needed */ + "cmpl $0, %c[launched](%0) \n\t" + /* Load guest registers. Don't clobber flags. */ + "mov %c[rax](%0), %%"R"ax \n\t" + "mov %c[rbx](%0), %%"R"bx \n\t" + "mov %c[rdx](%0), %%"R"dx \n\t" + "mov %c[rsi](%0), %%"R"si \n\t" + "mov %c[rdi](%0), %%"R"di \n\t" + "mov %c[rbp](%0), %%"R"bp \n\t" +#ifdef CONFIG_X86_64 + "mov %c[r8](%0), %%r8 \n\t" + "mov %c[r9](%0), %%r9 \n\t" + "mov %c[r10](%0), %%r10 \n\t" + "mov %c[r11](%0), %%r11 \n\t" + "mov %c[r12](%0), %%r12 \n\t" + "mov %c[r13](%0), %%r13 \n\t" + "mov %c[r14](%0), %%r14 \n\t" + "mov %c[r15](%0), %%r15 \n\t" +#endif + "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */ + + /* Enter guest mode */ + "jne .Llaunched \n\t" + __ex(ASM_VMX_VMLAUNCH) "\n\t" + "jmp .Lkvm_vmx_return \n\t" + ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" + ".Lkvm_vmx_return: " + /* Save guest registers, load host registers, keep flags */ + "xchg %0, (%%"R"sp) \n\t" + "mov %%"R"ax, %c[rax](%0) \n\t" + "mov %%"R"bx, %c[rbx](%0) \n\t" + "push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t" + "mov %%"R"dx, %c[rdx](%0) \n\t" + "mov %%"R"si, %c[rsi](%0) \n\t" + "mov %%"R"di, %c[rdi](%0) \n\t" + "mov %%"R"bp, %c[rbp](%0) \n\t" +#ifdef CONFIG_X86_64 + "mov %%r8, %c[r8](%0) \n\t" + "mov %%r9, %c[r9](%0) \n\t" + "mov %%r10, %c[r10](%0) \n\t" + "mov %%r11, %c[r11](%0) \n\t" + "mov %%r12, %c[r12](%0) \n\t" + "mov %%r13, %c[r13](%0) \n\t" + "mov %%r14, %c[r14](%0) \n\t" + "mov %%r15, %c[r15](%0) \n\t" +#endif + "mov %%cr2, %%"R"ax \n\t" + "mov %%"R"ax, %c[cr2](%0) \n\t" + + "pop %%"R"bp; pop %%"R"bp; pop %%"R"dx \n\t" + "setbe %c[fail](%0) \n\t" + : : "c"(vmx), "d"((unsigned long)HOST_RSP), + [launched]"i"(offsetof(struct vcpu_vmx, launched)), + [fail]"i"(offsetof(struct vcpu_vmx, fail)), + [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), + [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), + [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), + [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), + [rdx]"i"(offsetof(struct vcpu_vmx, 
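/*
 * Editor's note (illustrative sketch, not part of this patch): the asm
 * block above keys off two vcpu_vmx fields.  host_rsp caches the stack
 * pointer last written to the VMCS HOST_RSP field, so the cmp/VMWRITE pair
 * only rewrites it when the stack has moved; launched selects between the
 * first and subsequent entries on this VMCS:
 *
 *	if (!vmx->launched)
 *		VMLAUNCH;	// first entry after VMCLEAR/VMPTRLD
 *	else
 *		VMRESUME;	// every later entry
 *
 * On the way out, "setbe" records a failed VMLAUNCH/VMRESUME (CF or ZF set)
 * in vmx->fail, and vmx->launched is set to 1 after the asm completes.
 */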
vcpu.arch.regs[VCPU_REGS_RDX])), + [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])), + [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])), + [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])), +#ifdef CONFIG_X86_64 + [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])), + [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])), + [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])), + [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])), + [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])), + [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])), + [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), + [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), +#endif + [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) + : "cc", "memory" + , R"bx", R"di", R"si" +#ifdef CONFIG_X86_64 + , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" +#endif + ); + + vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) + | (1 << VCPU_EXREG_PDPTR)); + vcpu->arch.regs_dirty = 0; + + vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); + +#ifdef XXX + if (vmx->rmode.irq.pending) + fixup_rmode_irq(vmx); +#endif /*XXX*/ + + asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); + vmx->launched = 1; + + vmx_complete_interrupts(vmx); +} + +#undef R +#undef Q + +void kvm_set_shared_msr(unsigned slot, uint64_t value, uint64_t mask) +{ +#ifdef XXX + struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs); + + if (((value ^ smsr->values[slot].curr) & mask) == 0) + return; + smsr->values[slot].curr = value; + wrmsrl(shared_msrs_global.msrs[slot], value); + if (!smsr->registered) { + smsr->urn.on_user_return = kvm_on_user_return; + user_return_notifier_register(&smsr->urn); + smsr->registered = 1; + } +#endif /*XXX*/ +} +static void vmx_save_host_state(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int i; + + if (vmx->host_state.loaded) + return; + + vmx->host_state.loaded = 1; + /* + * Set host fs and gs selectors. Unfortunately, 22.2.3 does not + * allow segment selectors with cpl > 0 or ti == 1. 
+ */ + vmx->host_state.ldt_sel = kvm_read_ldt(); + vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; + vmx->host_state.fs_sel = kvm_read_fs(); + if (!(vmx->host_state.fs_sel & 7)) { + vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); + vmx->host_state.fs_reload_needed = 0; + } else { + vmcs_write16(HOST_FS_SELECTOR, 0); + vmx->host_state.fs_reload_needed = 1; + } + vmx->host_state.gs_sel = kvm_read_gs(); + if (!(vmx->host_state.gs_sel & 7)) + vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); + else { + vmcs_write16(HOST_GS_SELECTOR, 0); + vmx->host_state.gs_ldt_reload_needed = 1; + } + +#ifdef CONFIG_X86_64 + vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE)); + vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE)); +#else + vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel)); + vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel)); +#endif + +#ifdef CONFIG_X86_64 + if (is_long_mode(&vmx->vcpu)) { + rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); + wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + } +#endif + for (i = 0; i < vmx->save_nmsrs; ++i) + kvm_set_shared_msr(vmx->guest_msrs[i].index, + vmx->guest_msrs[i].data, + vmx->guest_msrs[i].mask); +} + +int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) +{ + return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && + !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); +} + +int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) +{ + return kvm_x86_ops->interrupt_allowed(vcpu); +} + +static int handle_machine_check(struct kvm_vcpu *vcpu) +{ + /* already handled by vcpu_run */ + return 1; +} + + +static inline int is_page_fault(uint32_t intr_info) +{ + return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | + INTR_INFO_VALID_MASK)) == + (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK); +} + + +static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, uint32_t access, + uint32_t *error) +{ + void *data = val; + int r = /*X86EMUL_CONTINUE*/ 0; + + while (bytes) { + gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error); + unsigned offset = addr & (PAGESIZE-1); + unsigned toread = min(bytes, (unsigned)PAGESIZE - offset); + int ret; + + if (gpa == UNMAPPED_GVA) { + r = /*X86EMUL_PROPAGATE_FAULT*/1; + goto out; + } + ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); + if (ret < 0) { + r = /*X86EMUL_UNHANDLEABLE*/ 1; + goto out; + } + + bytes -= toread; + data += toread; + addr += toread; + } +out: + return r; +} + +void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, + uint32_t error_code) +{ +#ifdef XXX + ++vcpu->stat.pf_guest; +#endif /*XXX*/ + vcpu->arch.cr2 = addr; + kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); +} + +static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, uint32_t *error) +{ + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error); +} + +static int vmx_get_cpl(struct kvm_vcpu *vcpu) +{ + if (!is_protmode(vcpu)) + return 0; + + if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ + return 3; + + return vmcs_read16(GUEST_CS_SELECTOR) & 3; +} + + +/* used for instruction fetching */ +static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, uint32_t *error) +{ + uint32_t access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? 
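/*
 * Editor's note (illustrative sketch, not part of this patch): the "access"
 * word handed to mmu.gva_to_gpa() is built from page-fault error-code bits
 * so the software walker applies the same permission checks the hardware
 * would.  With the helpers in this file:
 *
 *	instruction fetch at CPL 3: PFERR_USER_MASK | PFERR_FETCH_MASK
 *	ordinary read at CPL 0:     0
 *	kvm_read_guest_virt_system: always 0 (kernel-privileged read,
 *	                            wired up as emulate_ops.read_std below)
 */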
PFERR_USER_MASK : 0; + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, + access | PFERR_FETCH_MASK, error); +} + +static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, + const void *v) +{ +#ifdef XXX + if (vcpu->arch.apic && + !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) + return 0; + + return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); +#else + return 0; +#endif /*XXX*/ +} + +static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) +{ +#ifdef XXX + if (vcpu->arch.apic && + !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) + return 0; + + return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); +#else + return 0; +#endif /*XXX*/ +} + +gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, uint32_t *error) +{ +#ifdef XXX + uint32_t access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); +#else + return UNMAPPED_GVA; +#endif +} + +static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, uint32_t *error) +{ + uint32_t access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, + error); +} + +static int emulator_read_emulated(unsigned long addr, + void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu) +{ + gpa_t gpa; + uint32_t error_code; + + if (vcpu->mmio_read_completed) { + memcpy(val, vcpu->mmio_data, bytes); +#ifdef XXX + trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, + vcpu->mmio_phys_addr, *(uint64_t *)val); +#endif /*XXX*/ + vcpu->mmio_read_completed = 0; + return X86EMUL_CONTINUE; + } + + gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code); + + if (gpa == UNMAPPED_GVA) { + kvm_inject_page_fault(vcpu, addr, error_code); + return X86EMUL_PROPAGATE_FAULT; + } + + /* For APIC access vmexit */ + if ((gpa & PAGEMASK) == APIC_DEFAULT_PHYS_BASE) + goto mmio; + + if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL) + == X86EMUL_CONTINUE) + return X86EMUL_CONTINUE; + +mmio: + /* + * Is this MMIO handled locally? + */ + if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) { +#ifdef XXX + trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(uint64_t *)val); +#endif /*XXX*/ + return X86EMUL_CONTINUE; + } + +#ifdef XXX + trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); +#endif /*XXX*/ + + vcpu->mmio_needed = 1; + vcpu->mmio_phys_addr = gpa; + vcpu->mmio_size = bytes; + vcpu->mmio_is_write = 0; + + return X86EMUL_UNHANDLEABLE; +} + +int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, + const void *val, int bytes) +{ + int ret; + + ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); + if (ret < 0) + return 0; +#ifdef XXX + kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1); +#endif /*XXX*/ + return 1; +} + +gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, uint32_t *error) +{ +#ifdef XXX + uint32_t access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? 
PFERR_USER_MASK : 0; + access |= PFERR_WRITE_MASK; + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); +#else + return UNMAPPED_GVA; +#endif +} + +static int emulator_write_emulated_onepage(unsigned long addr, + const void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu) +{ + gpa_t gpa; + uint32_t error_code; + + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code); + + if (gpa == UNMAPPED_GVA) { + kvm_inject_page_fault(vcpu, addr, error_code); + return X86EMUL_PROPAGATE_FAULT; + } + + /* For APIC access vmexit */ + if ((gpa & PAGEMASK) == APIC_DEFAULT_PHYS_BASE) + goto mmio; + + if (emulator_write_phys(vcpu, gpa, val, bytes)) + return X86EMUL_CONTINUE; + +mmio: +#ifdef XXX + trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(uint64_t *)val); +#endif /*XXX*/ + /* + * Is this MMIO handled locally? + */ + if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) + return X86EMUL_CONTINUE; + + vcpu->mmio_needed = 1; + vcpu->mmio_phys_addr = gpa; + vcpu->mmio_size = bytes; + vcpu->mmio_is_write = 1; + memcpy(vcpu->mmio_data, val, bytes); + + return X86EMUL_CONTINUE; +} + +int emulator_write_emulated(unsigned long addr, + const void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu) +{ + /* Crossing a page boundary? */ + if (((addr + bytes - 1) ^ addr) & PAGEMASK) { + int rc, now; + + now = -addr & ~PAGEMASK; + rc = emulator_write_emulated_onepage(addr, val, now, vcpu); + if (rc != X86EMUL_CONTINUE) + return rc; + addr += now; + val += now; + bytes -= now; + } + return emulator_write_emulated_onepage(addr, val, bytes, vcpu); +} + +static int emulator_cmpxchg_emulated(unsigned long addr, + const void *old, + const void *new, + unsigned int bytes, + struct kvm_vcpu *vcpu) +{ + cmn_err(CE_WARN, "kvm: emulating exchange as write\n"); +#ifndef CONFIG_X86_64 + /* guests cmpxchg8b have to be emulated atomically */ + if (bytes == 8) { + gpa_t gpa; + struct page *page; + char *kaddr; + uint64_t val; + + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); + + if (gpa == UNMAPPED_GVA || + (gpa & PAGEMASK) == APIC_DEFAULT_PHYS_BASE) + goto emul_write; + + if (((gpa + bytes - 1) & PAGEMASK) != (gpa & PAGEMASK)) + goto emul_write; + + val = *(uint64_t *)new; + + page = gfn_to_page(vcpu->kvm, gpa >> PAGESHIFT); + + kaddr = kmap_atomic(page, KM_USER0); + set_64bit((uint64_t *)(kaddr + offset_in_page(gpa)), val); + kunmap_atomic(kaddr, KM_USER0); + kvm_release_page_dirty(page); + } +emul_write: +#endif + + return emulator_write_emulated(addr, new, bytes, vcpu); +} + +static struct x86_emulate_ops emulate_ops = { + .read_std = kvm_read_guest_virt_system, + .fetch = kvm_fetch_guest_virt, + .read_emulated = emulator_read_emulated, + .write_emulated = emulator_write_emulated, + .cmpxchg_emulated = emulator_cmpxchg_emulated, +}; + +static void cache_all_regs(struct kvm_vcpu *vcpu) +{ + kvm_register_read(vcpu, VCPU_REGS_RAX); + kvm_register_read(vcpu, VCPU_REGS_RSP); + kvm_register_read(vcpu, VCPU_REGS_RIP); + vcpu->arch.regs_dirty = ~0; +} + +int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) +{ + gpa_t gpa; + int r; +#ifdef XXX + if (tdp_enabled) + return 0; + + gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); + + mutex_enter(&vcpu->kvm->mmu_lock); + r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGESHIFT); + mutex_exit(&vcpu->kvm->mmu_lock); + return r; +#else + return 0; +#endif /*XXX*/ +} + +int emulate_instruction(struct kvm_vcpu *vcpu, + unsigned long cr2, + uint16_t error_code, + int emulation_type) +{ + int r, shadow_mask; + struct decode_cache *c; + struct kvm_run *run = vcpu->run; + 
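+	/*
+	 * Emulation runs in two steps: x86_decode_insn() fills the decode
+	 * cache, then x86_emulate_insn() executes it against emulate_ops.
+	 * For EMULTYPE_TRAP_UD only VMMCALL, SYSENTER, SYSEXIT and SYSCALL
+	 * are allowed through; anything else returns EMULATE_FAIL.  MMIO
+	 * that cannot be completed in the kernel is reported to userspace
+	 * as KVM_EXIT_MMIO via EMULATE_DO_MMIO.
+	 */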
+#ifdef XXX + kvm_clear_exception_queue(vcpu); +#endif /*XXX*/ + vcpu->arch.mmio_fault_cr2 = cr2; + /* + * TODO: fix emulate.c to use guest_read/write_register + * instead of direct ->regs accesses, can save hundred cycles + * on Intel for instructions that don't read/change RSP, for + * for example. + */ + cache_all_regs(vcpu); + + vcpu->mmio_is_write = 0; + vcpu->arch.pio.string = 0; + + if (!(emulation_type & EMULTYPE_NO_DECODE)) { + int cs_db, cs_l; + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + + vcpu->arch.emulate_ctxt.vcpu = vcpu; + vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu); + vcpu->arch.emulate_ctxt.mode = + (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : + (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) + ? X86EMUL_MODE_VM86 : cs_l + ? X86EMUL_MODE_PROT64 : cs_db + ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; + + r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); + + /* Only allow emulation of specific instructions on #UD + * (namely VMMCALL, sysenter, sysexit, syscall)*/ + c = &vcpu->arch.emulate_ctxt.decode; + if (emulation_type & EMULTYPE_TRAP_UD) { + if (!c->twobyte) + return EMULATE_FAIL; + switch (c->b) { + case 0x01: /* VMMCALL */ + if (c->modrm_mod != 3 || c->modrm_rm != 1) + return EMULATE_FAIL; + break; + case 0x34: /* sysenter */ + case 0x35: /* sysexit */ + if (c->modrm_mod != 0 || c->modrm_rm != 0) + return EMULATE_FAIL; + break; + case 0x05: /* syscall */ + if (c->modrm_mod != 0 || c->modrm_rm != 0) + return EMULATE_FAIL; + break; + default: + return EMULATE_FAIL; + } + + if (!(c->modrm_reg == 0 || c->modrm_reg == 3)) + return EMULATE_FAIL; + } + +#ifdef XXX + ++vcpu->stat.insn_emulation; +#endif /*XXX*/ + if (r) { +#ifdef XXX + ++vcpu->stat.insn_emulation_fail; +#endif /*XXX*/ + if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) + return EMULATE_DONE; + return EMULATE_FAIL; + } + } + + if (emulation_type & EMULTYPE_SKIP) { + kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip); + return EMULATE_DONE; + } + + r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); + shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; + + if (r == 0) + kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); + + if (vcpu->arch.pio.string) + return EMULATE_DO_MMIO; + + if ((r || vcpu->mmio_is_write) && run) { + run->exit_reason = KVM_EXIT_MMIO; + run->mmio.phys_addr = vcpu->mmio_phys_addr; + memcpy(run->mmio.data, vcpu->mmio_data, 8); + run->mmio.len = vcpu->mmio_size; + run->mmio.is_write = vcpu->mmio_is_write; + } + + if (r) { + if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) + return EMULATE_DONE; + if (!vcpu->mmio_needed) { +#ifdef XXX + kvm_report_emulation_failure(vcpu, "mmio"); +#endif /*XXX*/ + return EMULATE_FAIL; + } + return EMULATE_DO_MMIO; + } + + kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + + if (vcpu->mmio_is_write) { + vcpu->mmio_needed = 0; + return EMULATE_DO_MMIO; + } + + return EMULATE_DONE; +} + +/* + * The guest has exited. See if we can fix it or if we need userspace + * assistance. 
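+ *
+ * handle_exception() covers exits caused by exceptions and NMIs:
+ * machine checks and NMIs are treated as already handled, page faults
+ * and debug exceptions are classified by vector, and any unrecognized
+ * vector is reported to userspace as KVM_EXIT_EXCEPTION.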
+ */ +static int handle_exception(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_run *kvm_run = vcpu->run; + uint32_t intr_info, ex_no, error_code; + unsigned long cr2, rip, dr6; + uint32_t vect_info; + enum emulation_result er; + + vect_info = vmx->idt_vectoring_info; + intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + + if (is_machine_check(intr_info)) + return handle_machine_check(vcpu); + + if ((vect_info & VECTORING_INFO_VALID_MASK) && + !is_page_fault(intr_info)) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; + vcpu->run->internal.ndata = 2; + vcpu->run->internal.data[0] = vect_info; + vcpu->run->internal.data[1] = intr_info; + return 0; + } + + if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) + return 1; /* already handled by vmx_vcpu_run() */ + +#ifdef XXX + if (is_no_device(intr_info)) { + vmx_fpu_activate(vcpu); + return 1; + } + + if (is_invalid_opcode(intr_info)) { + er = emulate_instruction(vcpu, 0, 0, EMULTYPE_TRAP_UD); + if (er != EMULATE_DONE) + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } +#endif /*XXX*/ + + error_code = 0; + rip = kvm_rip_read(vcpu); + if (intr_info & INTR_INFO_DELIVER_CODE_MASK) + error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + if (is_page_fault(intr_info)) { + /* EPT won't cause page fault directly */ + if (enable_ept) + cmn_err(CE_PANIC, "page fault with ept enabled\n"); + cr2 = vmcs_readl(EXIT_QUALIFICATION); +#ifdef XXX + trace_kvm_page_fault(cr2, error_code); + + if (kvm_event_needs_reinjection(vcpu)) + kvm_mmu_unprotect_page_virt(vcpu, cr2); + return kvm_mmu_page_fault(vcpu, cr2, error_code); +#else + return -1; +#endif /*XXX*/ + } + +#ifdef XXX + if (vmx->rmode.vm86_active && + handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, + error_code)) { + if (vcpu->arch.halt_request) { + vcpu->arch.halt_request = 0; + return kvm_emulate_halt(vcpu); + } + return 1; + } +#endif /*XXX*/ + + ex_no = intr_info & INTR_INFO_VECTOR_MASK; + switch (ex_no) { + case DB_VECTOR: +#ifdef XXX + dr6 = vmcs_readl(EXIT_QUALIFICATION); + if (!(vcpu->guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { + vcpu->arch.dr6 = dr6 | DR6_FIXED_1; + kvm_queue_exception(vcpu, DB_VECTOR); + return 1; + } + kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1; + kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); + /* fall through */ +#endif /*XXX*/ + case BP_VECTOR: +#ifdef XXX + /* + * Update instruction length as we may reinject #BP from + * user space while in guest debugging mode. Reading it for + * #DB as well causes no harm, it is not used in that case. 
+ */ + vmx->vcpu.arch.event_exit_inst_len = + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + kvm_run->exit_reason = KVM_EXIT_DEBUG; + kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; + kvm_run->debug.arch.exception = ex_no; +#endif /*XXX*/ + break; + default: + kvm_run->exit_reason = KVM_EXIT_EXCEPTION; + kvm_run->ex.exception = ex_no; + kvm_run->ex.error_code = error_code; + break; + } + return 0; +} + +static int handle_external_interrupt(struct kvm_vcpu *vcpu) +{ +#ifdef XXX + ++vcpu->stat.irq_exits; +#endif /*XXX*/ + return 1; +} + +static int handle_triple_fault(struct kvm_vcpu *vcpu) +{ + vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; + return 0; +} + +static int handle_io(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification; + int size, in, string; + unsigned port; + +#ifdef XXX + ++vcpu->stat.io_exits; +#endif /*XXX*/ + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + string = (exit_qualification & 16) != 0; + + if (string) { + if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO) + return 0; + return 1; + } + + size = (exit_qualification & 7) + 1; + in = (exit_qualification & 8) != 0; + port = exit_qualification >> 16; +#ifdef XXX + skip_emulated_instruction(vcpu); + return kvm_emulate_pio(vcpu, in, size, port); +#endif /*XXX*/ +} + +static int handle_nmi_window(struct kvm_vcpu *vcpu) +{ + uint32_t cpu_based_vm_exec_control; + + /* clear pending NMI */ + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); +#ifdef XXX + ++vcpu->stat.nmi_window_exits; +#endif /*XXX*/ + + return 1; +} + +static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + enum emulation_result err = EMULATE_DONE; + int ret = 1; + +#ifdef XXX + while (!guest_state_valid(vcpu)) { + err = emulate_instruction(vcpu, 0, 0, 0); + + if (err == EMULATE_DO_MMIO) { + ret = 0; + goto out; + } + + if (err != EMULATE_DONE) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + ret = 0; + goto out; + } + if (signal_pending(current)) + goto out; + if (need_resched()) + schedule(); + } +#endif /*XXX*/ + + vmx->emulation_required = 0; +out: + return ret; +} + +void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + cr0 |= X86_CR0_ET; + +#ifdef CONFIG_X86_64 + if (cr0 & 0xffffffff00000000UL) { +#ifdef XXX + kvm_inject_gp(vcpu, 0); +#endif + return; + } +#endif + + cr0 &= ~CR0_RESERVED_BITS; + + if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { +#ifdef XXX + kvm_inject_gp(vcpu, 0); +#endif + return; + } + + if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { +#ifdef XXX + kvm_inject_gp(vcpu, 0); +#endif + return; + } + + if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { +#ifdef CONFIG_X86_64 +#ifdef XXX + if ((vcpu->arch.efer & EFER_LME)) { + int cs_db, cs_l; + + if (!is_pae(vcpu)) { + kvm_inject_gp(vcpu, 0); + return; + } + + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + if (cs_l) { + kvm_inject_gp(vcpu, 0); + return; + + } + } else +#endif /*XXX*/ +#endif +#ifdef XXX + if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { + kvm_inject_gp(vcpu, 0); + return; + } +#endif /*XXX*/ + + } + + kvm_x86_ops->set_cr0(vcpu, cr0); + vcpu->arch.cr0 = cr0; +#ifdef XXX + kvm_mmu_reset_context(vcpu); +#endif /*XXX*/ + return; +} + +static inline int constant_test_bit(int nr, const void *addr) +{ + const uint32_t *p = (const 
uint32_t *)addr; + return ((1UL << (nr & 31)) & (p[nr >> 5])) != 0; +} +static inline int variable_test_bit(int nr, const void *addr) +{ + uint8_t v; + const uint32_t *p = (const uint32_t *)addr; + + asm("btl %2,%1; setc %0" : "=qm" (v) : "m" (*p), "Ir" (nr)); + return v; +} + +#define test_bit(nr,addr) \ +(__builtin_constant_p(nr) ? \ + constant_test_bit((nr),(addr)) : \ + variable_test_bit((nr),(addr))) + +static int pdptrs_changed(struct kvm_vcpu *vcpu) +{ + uint64_t pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; + int changed = 1; + int r; + + if (is_long_mode(vcpu) || !is_pae(vcpu)) + return 0; + + if (!test_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_avail)) + return 1; + + r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); + if (r < 0) + goto out; + changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; +out: + + return changed; +} + +void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +{ + if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { +#ifdef XXX + kvm_mmu_sync_roots(vcpu); + kvm_mmu_flush_tlb(vcpu); +#endif /*XXX*/ + return; + } + + if (is_long_mode(vcpu)) { + if (cr3 & CR3_L_MODE_RESERVED_BITS) { +#ifdef XXX + kvm_inject_gp(vcpu, 0); +#endif /*XXX*/ + return; + } + } else { +#ifdef XXX + if (is_pae(vcpu)) { + if (cr3 & CR3_PAE_RESERVED_BITS) { + kvm_inject_gp(vcpu, 0); + return; + } + if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { + kvm_inject_gp(vcpu, 0); + return; + } + } +#endif /*XXX*/ + /* + * We don't check reserved bits in nonpae mode, because + * this isn't enforced, and VMware depends on this. + */ + } + + /* + * Does the new cr3 value map to physical memory? (Note, we + * catch an invalid cr3 even in real-mode, because it would + * cause trouble later on when we turn on paging anyway.) + * + * A real CPU would silently accept an invalid cr3 and would + * attempt to use it - with largely undefined (and often hard + * to debug) behavior on the guest side. 
+ */ +#ifdef XXX + if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGESHIFT))) + kvm_inject_gp(vcpu, 0); + else { +#endif /*XXX*/ + vcpu->arch.cr3 = cr3; +#ifdef XXX + vcpu->arch.mmu.new_cr3(vcpu); + } +#endif /*XXX*/ +} + +void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + unsigned long old_cr4 = kvm_read_cr4(vcpu); + unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; + + if (cr4 & CR4_RESERVED_BITS) { +#ifdef XXX + kvm_inject_gp(vcpu, 0); +#endif /*XXX*/ + return; + } + + if (is_long_mode(vcpu)) { + if (!(cr4 & X86_CR4_PAE)) { +#ifdef XXX + kvm_inject_gp(vcpu, 0); +#endif /*XXX*/ + return; + } +#ifdef XXX + } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) + && ((cr4 ^ old_cr4) & pdptr_bits) + && !load_pdptrs(vcpu, vcpu->arch.cr3)) { + kvm_inject_gp(vcpu, 0); + return; +#endif /*XXX*/ + } + + if (cr4 & X86_CR4_VMXE) { +#ifdef XXX + kvm_inject_gp(vcpu, 0); +#endif /*XXX*/ + return; + } + kvm_x86_ops->set_cr4(vcpu, cr4); + vcpu->arch.cr4 = cr4; + vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled; + kvm_mmu_reset_context(vcpu); +} + +static int handle_cr(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification, val; + int cr; + int reg; + + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + cr = exit_qualification & 15; + reg = (exit_qualification >> 8) & 15; + switch ((exit_qualification >> 4) & 3) { + case 0: /* mov to cr */ + val = kvm_register_read(vcpu, reg); +#ifdef XXX + trace_kvm_cr_write(cr, val); +#endif /*XXX*/ + switch (cr) { + case 0: + kvm_set_cr0(vcpu, val); +#ifdef XXX + skip_emulated_instruction(vcpu); +#endif /*XXX*/ + return 1; + case 3: + kvm_set_cr3(vcpu, val); +#ifdef XXX + skip_emulated_instruction(vcpu); +#endif /*XXX*/ + return 1; + case 4: + kvm_set_cr4(vcpu, val); +#ifdef XXX + skip_emulated_instruction(vcpu); +#endif /*XXX*/ + return 1; + case 8: { + uint8_t cr8_prev = kvm_get_cr8(vcpu); + uint8_t cr8 = kvm_register_read(vcpu, reg); + kvm_set_cr8(vcpu, cr8); +#ifdef XXX + skip_emulated_instruction(vcpu); +#endif /*XXX*/ + if (irqchip_in_kernel(vcpu->kvm)) + return 1; + if (cr8_prev <= cr8) + return 1; + vcpu->run->exit_reason = KVM_EXIT_SET_TPR; + return 0; + } + }; + break; + case 2: /* clts */ + vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); +#ifdef XXX + trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); + skip_emulated_instruction(vcpu); + vmx_fpu_activate(vcpu); +#endif /*XXX*/ + return 1; + case 1: /*mov from cr*/ + switch (cr) { + case 3: + kvm_register_write(vcpu, reg, vcpu->arch.cr3); +#ifdef XXX + trace_kvm_cr_read(cr, vcpu->arch.cr3); + skip_emulated_instruction(vcpu); +#endif /*XXX*/ + return 1; + case 8: + val = kvm_get_cr8(vcpu); + kvm_register_write(vcpu, reg, val); +#ifdef XXX + trace_kvm_cr_read(cr, val); + skip_emulated_instruction(vcpu); +#endif /*XXX*/ + return 1; + } + break; + case 3: /* lmsw */ + val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; +#ifdef XXX + trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val); + kvm_lmsw(vcpu, val); + + skip_emulated_instruction(vcpu); +#endif /*XXX*/ + return 1; + default: + break; + } + vcpu->run->exit_reason = 0; + cmn_err(CE_WARN, "unhandled control register: op %d cr %d\n", + (int)(exit_qualification >> 4) & 3, cr); + return 0; +} + +static int handle_dr(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification; + unsigned long val; + int dr, reg; + +#ifdef XXX + /* Do not handle if the CPL > 0, will trigger GP on re-entry */ + if (!kvm_require_cpl(vcpu, 0)) + return 1; + dr = vmcs_readl(GUEST_DR7); + + if (dr & DR7_GD) 
{ + /* + * As the vm-exit takes precedence over the debug trap, we + * need to emulate the latter, either for the host or the + * guest debugging itself. + */ + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { + vcpu->run->debug.arch.dr6 = vcpu->arch.dr6; + vcpu->run->debug.arch.dr7 = dr; + vcpu->run->debug.arch.pc = + vmcs_readl(GUEST_CS_BASE) + + vmcs_readl(GUEST_RIP); + vcpu->run->debug.arch.exception = DB_VECTOR; + vcpu->run->exit_reason = KVM_EXIT_DEBUG; + return 0; + } else { + vcpu->arch.dr7 &= ~DR7_GD; + vcpu->arch.dr6 |= DR6_BD; + vmcs_writel(GUEST_DR7, vcpu->arch.dr7); + kvm_queue_exception(vcpu, DB_VECTOR); + return 1; + } + } +#endif /*XXX*/ + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + dr = exit_qualification & DEBUG_REG_ACCESS_NUM; + reg = DEBUG_REG_ACCESS_REG(exit_qualification); + if (exit_qualification & TYPE_MOV_FROM_DR) { + switch (dr) { + case 0 ... 3: + val = vcpu->arch.db[dr]; + break; + case 4: +#ifdef XXX + if (check_dr_alias(vcpu) < 0) +#endif /*XXX*/ + return 1; + /* fall through */ + case 6: + val = vcpu->arch.dr6; + break; + case 5: +#ifdef XXX + if (check_dr_alias(vcpu) < 0) +#endif /*XXX*/ + return 1; + /* fall through */ + default: /* 7 */ + val = vcpu->arch.dr7; + break; + } + kvm_register_write(vcpu, reg, val); + } else { + val = vcpu->arch.regs[reg]; + switch (dr) { + case 0 ... 3: + vcpu->arch.db[dr] = val; +#ifdef XXX + if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) +#endif + vcpu->arch.eff_db[dr] = val; + break; + case 4: +#ifdef XXX + if (check_dr_alias(vcpu) < 0) +#endif /*XXX*/ + return 1; + /* fall through */ + case 6: + if (val & 0xffffffff00000000ULL) { + kvm_inject_gp(vcpu, 0); + return 1; + } + vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; + break; + case 5: +#ifdef XXX + if (check_dr_alias(vcpu) < 0) +#endif /*XXX*/ + return 1; + /* fall through */ + default: /* 7 */ + if (val & 0xffffffff00000000ULL) { + kvm_inject_gp(vcpu, 0); + return 1; + } + vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; +#ifdef XXX + if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { +#endif /*XXX*/ + vmcs_writel(GUEST_DR7, vcpu->arch.dr7); + vcpu->arch.switch_db_regs = + (val & DR7_BP_EN_MASK); +#ifdef XXX + } +#endif /*XXX*/ + break; + } + } +#ifdef XXX + skip_emulated_instruction(vcpu); +#endif /*XXX*/ + return 1; +} + +static int handle_cpuid(struct kvm_vcpu *vcpu) +{ +#ifdef XXX + kvm_emulate_cpuid(vcpu); +#endif /*XXX*/ + return 1; +} + +static int handle_rdmsr(struct kvm_vcpu *vcpu) +{ + uint32_t ecx = vcpu->arch.regs[VCPU_REGS_RCX]; + uint64_t data; + + if (vmx_get_msr(vcpu, ecx, &data)) { +#ifdef XXX + trace_kvm_msr_read_ex(ecx); +#endif /*XXX*/ + kvm_inject_gp(vcpu, 0); + return 1; + } + +#ifdef XXX + trace_kvm_msr_read(ecx, data); +#endif /*XXX*/ + + /* FIXME: handling of bits 32:63 of rax, rdx */ + vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u; + vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u; +#ifdef XXX + skip_emulated_instruction(vcpu); +#endif /*XXX*/ + return 1; +} + +static int handle_wrmsr(struct kvm_vcpu *vcpu) +{ + uint32_t ecx = vcpu->arch.regs[VCPU_REGS_RCX]; + uint64_t data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) + | ((uint64_t)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); + + if (vmx_set_msr(vcpu, ecx, data) != 0) { +#ifdef XXX + trace_kvm_msr_write_ex(ecx, data); +#endif /*XXX*/ + kvm_inject_gp(vcpu, 0); + return 1; + } + +#ifdef XXX + trace_kvm_msr_write(ecx, data); + skip_emulated_instruction(vcpu); +#endif /*XXX*/ + return 1; +} + +static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) +{ + return 1; 
+} + +static int kvm_hv_hypercall_enabled(struct kvm *kvm) +{ + return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE; +} + +int kvm_hv_hypercall(struct kvm_vcpu *vcpu) +{ + uint64_t param, ingpa, outgpa, ret; + uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0; + int fast, longmode; + int cs_db, cs_l; + + /* + * hypercall generates UD from non zero cpl and real mode + * per HYPER-V spec + */ + if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 0; + } + + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + longmode = is_long_mode(vcpu) && cs_l == 1; + + if (!longmode) { + param = ((uint64_t)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff); + ingpa = ((uint64_t)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff); + outgpa = ((uint64_t)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff); + } +#ifdef CONFIG_X86_64 + else { + param = kvm_register_read(vcpu, VCPU_REGS_RCX); + ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX); + outgpa = kvm_register_read(vcpu, VCPU_REGS_R8); + } +#endif + + code = param & 0xffff; + fast = (param >> 16) & 0x1; + rep_cnt = (param >> 32) & 0xfff; + rep_idx = (param >> 48) & 0xfff; + +#ifdef XXX + trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa); +#endif /*XXX*/ + + switch (code) { + case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT: +#ifdef XXX + kvm_vcpu_on_spin(vcpu); +#endif /*XXX*/ + break; + default: + res = HV_STATUS_INVALID_HYPERCALL_CODE; + break; + } + + ret = res | (((uint64_t)rep_done & 0xfff) << 32); + if (longmode) { + kvm_register_write(vcpu, VCPU_REGS_RAX, ret); + } else { + kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32); + kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff); + } + + return 1; +} + + +/* Return values for hypercalls */ +#define KVM_ENOSYS 1000 +#define KVM_EFAULT EFAULT +#define KVM_E2BIG E2BIG +#define KVM_EPERM EPERM + +#define KVM_HC_VAPIC_POLL_IRQ 1 +#define KVM_HC_MMU_OP 2 + +/* + * hypercalls use architecture specific + */ + +#ifdef _KERNEL +#ifdef CONFIG_KVM_GUEST +void __init kvm_guest_init(void); +#else +#define kvm_guest_init() do { } while (0) +#endif + +static inline int kvm_para_has_feature(unsigned int feature) +{ + if (kvm_arch_para_features() & (1UL << feature)) + return 1; + return 0; +} +#endif /* _KERNEL */ + +int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) +{ + unsigned long nr, a0, a1, a2, a3, ret; + int r = 1; + + if (kvm_hv_hypercall_enabled(vcpu->kvm)) + return kvm_hv_hypercall(vcpu); + + nr = kvm_register_read(vcpu, VCPU_REGS_RAX); + a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); + a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); + a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); + a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); + +#ifdef XXX + trace_kvm_hypercall(nr, a0, a1, a2, a3); +#endif /*XXX*/ + + if (!is_long_mode(vcpu)) { + nr &= 0xFFFFFFFF; + a0 &= 0xFFFFFFFF; + a1 &= 0xFFFFFFFF; + a2 &= 0xFFFFFFFF; + a3 &= 0xFFFFFFFF; + } + + if (kvm_x86_ops->get_cpl(vcpu) != 0) { + ret = -EPERM; + goto out; + } + + switch (nr) { + case KVM_HC_VAPIC_POLL_IRQ: + ret = 0; + break; + case KVM_HC_MMU_OP: +#ifdef XXX + r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret); +#endif /*XXX*/ + break; + default: + ret = -ENOSYS; + break; + } +out: + kvm_register_write(vcpu, VCPU_REGS_RAX, ret); +#ifdef XXX + ++vcpu->stat.hypercalls; +#endif /*XXX*/ + return r; +} + +static int 
handle_halt(struct kvm_vcpu *vcpu) +{ +#ifdef XXX + skip_emulated_instruction(vcpu); + return kvm_emulate_halt(vcpu); +#else + return 0; +#endif /*XXX*/ +} + +static int handle_vmcall(struct kvm_vcpu *vcpu) +{ +#ifdef XXX + skip_emulated_instruction(vcpu); +#endif /*XXX*/ + kvm_emulate_hypercall(vcpu); + return 1; +} + +static int handle_vmx_insn(struct kvm_vcpu *vcpu) +{ +#ifdef XXX + kvm_queue_exception(vcpu, UD_VECTOR); +#endif /*XXX*/ + return 1; +} + +static int handle_invlpg(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + +#ifdef XXX + kvm_mmu_invlpg(vcpu, exit_qualification); + skip_emulated_instruction(vcpu); +#endif /*XXX*/ + return 1; +} + +static int handle_wbinvd(struct kvm_vcpu *vcpu) +{ +#ifdef XXX + skip_emulated_instruction(vcpu); +#endif /*XXX*/ + /* TODO: Add support for VT-d/pass-through device */ + return 1; +} + +static int handle_apic_access(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification; + enum emulation_result er; + unsigned long offset; + + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + offset = exit_qualification & 0xffful; + + er = emulate_instruction(vcpu, 0, 0, 0); + + if (er != EMULATE_DONE) { + cmn_err(CE_PANIC, + "Fail to handle apic access vmexit! Offset is 0x%lx\n", + offset); + } + return 1; +} + +static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg) +{ + return (seg != VCPU_SREG_LDTR) && + (seg != VCPU_SREG_TR) && + (kvm_get_rflags(vcpu) & X86_EFLAGS_VM); +} + +static inline unsigned long get_desc_limit(const struct desc_struct *desc) +{ + return desc->c.b.limit0 | (desc->c.b.limit << 16); +} + +static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, uint16_t selector, + struct kvm_segment *kvm_desct) +{ + kvm_desct->base = get_desc_base(seg_desc); + kvm_desct->limit = get_desc_limit(seg_desc); + if (seg_desc->c.b.g) { + kvm_desct->limit <<= 12; + kvm_desct->limit |= 0xfff; + } + kvm_desct->selector = selector; + kvm_desct->type = seg_desc->c.b.type; + kvm_desct->present = seg_desc->c.b.p; + kvm_desct->dpl = seg_desc->c.b.dpl; + kvm_desct->db = seg_desc->c.b.d; + kvm_desct->s = seg_desc->c.b.s; + kvm_desct->l = seg_desc->c.b.l; + kvm_desct->g = seg_desc->c.b.g; + kvm_desct->avl = seg_desc->c.b.avl; + if (!selector) + kvm_desct->unusable = 1; + else + kvm_desct->unusable = 0; + kvm_desct->padding = 0; +} + +static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, uint16_t selector, int seg) +{ + struct kvm_segment segvar = { + .base = selector << 4, + .limit = 0xffff, + .selector = selector, + .type = 3, + .present = 1, + .dpl = 3, + .db = 0, + .s = 1, + .l = 0, + .g = 0, + .avl = 0, + .unusable = 0, + }; + kvm_x86_ops->set_segment(vcpu, &segvar, seg); + return 0; +} + +static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu, + uint16_t selector, + struct descriptor_table *dtable) +{ + if (selector & 1 << 2) { + struct kvm_segment kvm_seg; + + kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR); + + if (kvm_seg.unusable) + dtable->limit = 0; + else + dtable->limit = kvm_seg.limit; + dtable->base = kvm_seg.base; + } + else + kvm_x86_ops->get_gdt(vcpu, dtable); +} + +/* allowed just for 8 bytes segments */ +static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, uint16_t selector, + struct desc_struct *seg_desc) +{ + struct descriptor_table dtable; + uint16_t index = selector >> 3; + int ret; + uint32_t err; + gva_t addr; + + get_segment_descriptor_dtable(vcpu, selector, &dtable); + + if (dtable.limit < index * 8 + 7) { + kvm_queue_exception_e(vcpu, 
GP_VECTOR, selector & 0xfffc); + return 1; + } + addr = dtable.base + index * 8; + ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof(*seg_desc), + vcpu, &err); + if (ret == 1) + kvm_inject_page_fault(vcpu, addr, err); + + return ret; +} + +static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, uint32_t *error) +{ + void *data = val; + int r = 0; + +#ifdef XXX + while (bytes) { + gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error); + unsigned offset = addr & (PAGESIZE-1); + unsigned towrite = min(bytes, (unsigned)PAGESIZE - offset); + int ret; + + if (gpa == UNMAPPED_GVA) { + r = X86EMUL_PROPAGATE_FAULT; + goto out; + } + ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); + if (ret < 0) { + r = X86EMUL_UNHANDLEABLE; + goto out; + } + + bytes -= towrite; + data += towrite; + addr += towrite; + } +out: +#endif /*XXX*/ + return r; +} + +/* allowed just for 8 bytes segments */ +static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, uint16_t selector, + struct desc_struct *seg_desc) +{ + struct descriptor_table dtable; + uint16_t index = selector >> 3; + + get_segment_descriptor_dtable(vcpu, selector, &dtable); + + if (dtable.limit < index * 8 + 7) + return 1; + return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu, NULL); +} + +int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, uint16_t selector, int seg) +{ + struct kvm_segment kvm_seg; + struct desc_struct seg_desc; + uint8_t dpl, rpl, cpl; + unsigned err_vec = GP_VECTOR; + uint32_t err_code = 0; + int null_selector = !(selector & ~0x3); /* 0000-0003 are null */ + int ret; + + if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu)) + return kvm_load_realmode_segment(vcpu, selector, seg); + + /* NULL selector is not valid for TR, CS and SS */ + if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR) + && null_selector) + goto exception; + + /* TR should be in GDT only */ + if (seg == VCPU_SREG_TR && (selector & (1 << 2))) + goto exception; + + ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc); + if (ret) + return ret; + + seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg); + + if (null_selector) { /* for NULL selector skip all following checks */ + kvm_seg.unusable = 1; + goto load; + } + + err_code = selector & 0xfffc; + err_vec = GP_VECTOR; + + /* can't load system descriptor into segment selecor */ + if (seg <= VCPU_SREG_GS && !kvm_seg.s) + goto exception; + + if (!kvm_seg.present) { + err_vec = (seg == VCPU_SREG_SS) ? 
SS_VECTOR : NP_VECTOR; + goto exception; + } + + rpl = selector & 3; + dpl = kvm_seg.dpl; + cpl = kvm_x86_ops->get_cpl(vcpu); + + switch (seg) { + case VCPU_SREG_SS: + /* + * segment is not a writable data segment or segment + * selector's RPL != CPL or segment selector's RPL != CPL + */ + if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl) + goto exception; + break; + case VCPU_SREG_CS: + if (!(kvm_seg.type & 8)) + goto exception; + + if (kvm_seg.type & 4) { + /* conforming */ + if (dpl > cpl) + goto exception; + } else { + /* nonconforming */ + if (rpl > cpl || dpl != cpl) + goto exception; + } + /* CS(RPL) <- CPL */ + selector = (selector & 0xfffc) | cpl; + break; + case VCPU_SREG_TR: + if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9)) + goto exception; + break; + case VCPU_SREG_LDTR: + if (kvm_seg.s || kvm_seg.type != 2) + goto exception; + break; + default: /* DS, ES, FS, or GS */ + /* + * segment is not a data or readable code segment or + * ((segment is a data or nonconforming code segment) + * and (both RPL and CPL > DPL)) + */ + if ((kvm_seg.type & 0xa) == 0x8 || + (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl))) + goto exception; + break; + } + + if (!kvm_seg.unusable && kvm_seg.s) { + /* mark segment as accessed */ + kvm_seg.type |= 1; + seg_desc.c.b.type |= 1; + save_guest_segment_descriptor(vcpu, selector, &seg_desc); + } +load: + kvm_set_segment(vcpu, &kvm_seg, seg); + return 0; +exception: +#ifdef XXX + kvm_queue_exception_e(vcpu, err_vec, err_code); +#endif /*XXX*/ + return 1; +} + +static void save_state_to_tss32(struct kvm_vcpu *vcpu, + struct tss_segment_32 *tss) +{ + tss->cr3 = vcpu->arch.cr3; + tss->eip = kvm_rip_read(vcpu); + tss->eflags = kvm_get_rflags(vcpu); + tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX); + tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); + tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX); + tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX); + tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP); + tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP); + tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI); + tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI); + tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); + tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); + tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); + tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); + tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS); + tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS); + tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); +} + +static void kvm_load_segment_selector(struct kvm_vcpu *vcpu, uint16_t sel, int seg) +{ + struct kvm_segment kvm_seg; + kvm_get_segment(vcpu, &kvm_seg, seg); + kvm_seg.selector = sel; + kvm_set_segment(vcpu, &kvm_seg, seg); +} + +static int load_state_from_tss32(struct kvm_vcpu *vcpu, + struct tss_segment_32 *tss) +{ + kvm_set_cr3(vcpu, tss->cr3); + + kvm_rip_write(vcpu, tss->eip); + kvm_set_rflags(vcpu, tss->eflags | 2); + + kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax); + kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx); + kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx); + kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx); + kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp); + kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp); + kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi); + kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi); + + /* + * SDM says that segment selectors are loaded before segment + * descriptors + */ + kvm_load_segment_selector(vcpu, 
tss->ldt_selector, VCPU_SREG_LDTR); + kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES); + kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS); + kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS); + kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS); + kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS); + kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS); + + /* + * Now load segment descriptors. If fault happenes at this stage + * it is handled in a context of new task + */ + if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, VCPU_SREG_LDTR)) + return 1; + + if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES)) + return 1; + + if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS)) + return 1; + + if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS)) + return 1; + + if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS)) + return 1; + + if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS)) + return 1; + + if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS)) + return 1; + return 0; +} + +static void save_state_to_tss16(struct kvm_vcpu *vcpu, + struct tss_segment_16 *tss) +{ + tss->ip = kvm_rip_read(vcpu); + tss->flag = kvm_get_rflags(vcpu); + tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX); + tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX); + tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX); + tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX); + tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP); + tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP); + tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI); + tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI); + + tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); + tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); + tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); + tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); + tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR); +} + +static int load_state_from_tss16(struct kvm_vcpu *vcpu, + struct tss_segment_16 *tss) +{ + kvm_rip_write(vcpu, tss->ip); + kvm_set_rflags(vcpu, tss->flag | 2); + kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax); + kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx); + kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx); + kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx); + kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp); + kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp); + kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si); + kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di); + + /* + * SDM says that segment selectors are loaded before segment + * descriptors + */ + kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR); + kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES); + kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS); + kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS); + kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS); + + /* + * Now load segment descriptors. 
If fault happenes at this stage + * it is handled in a context of new task + */ + if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR)) + return 1; + + if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES)) + return 1; + + if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS)) + return 1; + + if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS)) + return 1; + + if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS)) + return 1; + return 0; +} + +int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) +{ + gfn_t gfn = gpa >> PAGESHIFT; + int seg; + int offset = offset_in_page(gpa); + int ret; + + while ((seg = next_segment(len, offset)) != 0) { + ret = kvm_read_guest_page(kvm, gfn, data, offset, seg); + if (ret < 0) + return ret; + offset = 0; + len -= seg; + data += seg; + ++gfn; + } + return 0; +} + +static int kvm_task_switch_16(struct kvm_vcpu *vcpu, uint16_t tss_selector, + uint16_t old_tss_sel, uint32_t old_tss_base, + struct desc_struct *nseg_desc) +{ + struct tss_segment_16 tss_segment_16; + int ret = 0; + + if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16, + sizeof tss_segment_16)) + goto out; + + save_state_to_tss16(vcpu, &tss_segment_16); + + if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16, + sizeof tss_segment_16)) + goto out; + +#ifdef XXX + if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc), + &tss_segment_16, sizeof tss_segment_16)) + goto out; +#endif /*XXX*/ + + if (old_tss_sel != 0xffff) { + tss_segment_16.prev_task_link = old_tss_sel; +#ifdef XXX + if (kvm_write_guest(vcpu->kvm, + get_tss_base_addr_write(vcpu, nseg_desc), + &tss_segment_16.prev_task_link, + sizeof tss_segment_16.prev_task_link)) + goto out; +#endif /*XXX*/ + } + + if (load_state_from_tss16(vcpu, &tss_segment_16)) + goto out; + + ret = 1; +out: + return ret; +} + +static int kvm_task_switch_32(struct kvm_vcpu *vcpu, uint16_t tss_selector, + uint16_t old_tss_sel, uint32_t old_tss_base, + struct desc_struct *nseg_desc) +{ + struct tss_segment_32 tss_segment_32; + int ret = 0; + + if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32, + sizeof tss_segment_32)) + goto out; + + save_state_to_tss32(vcpu, &tss_segment_32); + + if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32, + sizeof tss_segment_32)) + goto out; + +#ifdef XXX + if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc), + &tss_segment_32, sizeof tss_segment_32)) + goto out; +#endif /*XXX*/ + + if (old_tss_sel != 0xffff) { + tss_segment_32.prev_task_link = old_tss_sel; + +#ifdef XXX + if (kvm_write_guest(vcpu->kvm, + get_tss_base_addr_write(vcpu, nseg_desc), + &tss_segment_32.prev_task_link, + sizeof tss_segment_32.prev_task_link)) + goto out; +#endif /*XXX*/ + } + + if (load_state_from_tss32(vcpu, &tss_segment_32)) + goto out; + + ret = 1; +out: + return ret; +} + +static uint64_t vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) +{ + struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + + return vmcs_readl(sf->base); +} + +static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) +{ + return kvm_x86_ops->get_segment_base(vcpu, seg); +} + +int kvm_task_switch(struct kvm_vcpu *vcpu, uint16_t tss_selector, int reason) +{ + struct kvm_segment tr_seg; + struct desc_struct cseg_desc; + struct desc_struct nseg_desc; + int ret = 0; + uint32_t old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR); + uint16_t old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR); + uint32_t desc_limit; + +#ifdef 
XXX + old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL); +#endif /*XXX*/ + + /* FIXME: Handle errors. Failure to read either TSS or their + * descriptors should generate a pagefault. + */ + if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc)) + goto out; + + if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc)) + goto out; + + if (reason != TASK_SWITCH_IRET) { + int cpl; + + cpl = kvm_x86_ops->get_cpl(vcpu); + if ((tss_selector & 3) > nseg_desc.c.b.dpl || cpl > nseg_desc.c.b.dpl) { +#ifdef XXX + kvm_queue_exception_e(vcpu, GP_VECTOR, 0); +#endif /*XXX*/ + return 1; + } + } + + desc_limit = get_desc_limit(&nseg_desc); + if (!nseg_desc.c.b.p || + ((desc_limit < 0x67 && (nseg_desc.c.b.type & 8)) || + desc_limit < 0x2b)) { +#ifdef XXX + kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc); +#endif /*XXX*/ + return 1; + } + + if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { + cseg_desc.c.b.type &= ~(1 << 1); //clear the B flag + save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc); + } + + if (reason == TASK_SWITCH_IRET) { + uint32_t eflags = kvm_get_rflags(vcpu); + kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT); + } + + /* set back link to prev task only if NT bit is set in eflags + note that old_tss_sel is not used afetr this point */ + if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) + old_tss_sel = 0xffff; + + if (nseg_desc.c.b.type & 8) + ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel, + old_tss_base, &nseg_desc); + else + ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel, + old_tss_base, &nseg_desc); + + if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { + uint32_t eflags = kvm_get_rflags(vcpu); + kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT); + } + + if (reason != TASK_SWITCH_IRET) { + nseg_desc.c.b.type |= (1 << 1); + save_guest_segment_descriptor(vcpu, tss_selector, + &nseg_desc); + } + + kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0(vcpu) | X86_CR0_TS); + seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg); + tr_seg.type = 11; + kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); +out: + return ret; +} + +static int handle_task_switch(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long exit_qualification; + uint16_t tss_selector; + int reason, type, idt_v; + + idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); + type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); + + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + + reason = (uint32_t)exit_qualification >> 30; + if (reason == TASK_SWITCH_GATE && idt_v) { + switch (type) { + case INTR_TYPE_NMI_INTR: + vcpu->arch.nmi_injected = 0; +#ifdef XXX + if (cpu_has_virtual_nmis()) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); +#endif + break; + case INTR_TYPE_EXT_INTR: + case INTR_TYPE_SOFT_INTR: +#ifdef XXX + kvm_clear_interrupt_queue(vcpu); +#endif /*XXX*/ + break; + case INTR_TYPE_HARD_EXCEPTION: + case INTR_TYPE_SOFT_EXCEPTION: +#ifdef XXX + kvm_clear_exception_queue(vcpu); +#endif /*XXX*/ + break; + default: + break; + } + } + tss_selector = exit_qualification; +#ifdef XXX + if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION && + type != INTR_TYPE_EXT_INTR && + type != INTR_TYPE_NMI_INTR)) + skip_emulated_instruction(vcpu); +#endif /*XXX*/ + + if (!kvm_task_switch(vcpu, tss_selector, reason)) + return 0; + + /* clear all local breakpoint enable flags */ + vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55); + + /* + * TODO: What about debug traps on tss 
switch? + * Are we supposed to inject them and update dr6? + */ + + return 1; +} + +static int handle_ept_violation(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification; + gpa_t gpa; + int gla_validity; + + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + + if (exit_qualification & (1 << 6)) { + cmn_err(CE_PANIC, "EPT: GPA exceeds GAW!\n"); + } + + gla_validity = (exit_qualification >> 7) & 0x3; + if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) { + cmn_err(CE_WARN, "EPT: Handling EPT violation failed!\n"); + cmn_err(CE_CONT, "EPT: GPA: 0x%lx, GVA: 0x%lx\n", + (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS), + vmcs_readl(GUEST_LINEAR_ADDRESS)); + cmn_err(CE_PANIC, "EPT: Exit qualification is 0x%lx\n", + (long unsigned int)exit_qualification); + vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; + vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION; + return 0; + } + + gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); +#ifdef XXX + trace_kvm_page_fault(gpa, exit_qualification); + return kvm_mmu_page_fault(vcpu, gpa & PAGEMASK, 0); +#else + return 0; +#endif +} + +static int handle_ept_misconfig(struct kvm_vcpu *vcpu) +{ + uint64_t sptes[4]; + int nr_sptes, i; + gpa_t gpa; + + gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); + + cmn_err(CE_WARN, "EPT: Misconfiguration.\n"); + cmn_err(CE_CONT, "EPT: GPA: 0x%llx\n", gpa); +#ifdef XXX + nr_sptes = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes); + + for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i) + ept_misconfig_inspect_spte(vcpu, sptes[i-1], i); +#endif /*XXX*/ + + vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; + vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG; + + return 0; +} + +/* + * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE + * exiting, so only get here on cpu with PAUSE-Loop-Exiting. + */ +static int handle_pause(struct kvm_vcpu *vcpu) +{ +#ifdef XXX + skip_emulated_instruction(vcpu); + kvm_vcpu_on_spin(vcpu); +#endif /*XXX*/ + + return 1; +} + +static int handle_invalid_op(struct kvm_vcpu *vcpu) +{ +#ifdef XXX + kvm_queue_exception(vcpu, UD_VECTOR); +#endif /*XXX*/ + return 1; +} + +static int handle_interrupt_window(struct kvm_vcpu *vcpu) +{ + uint32_t cpu_based_vm_exec_control; + + /* clear pending irq */ + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + +#ifdef XXX + ++vcpu->stat.irq_window_exits; + + /* + * If the user space waits to inject interrupts, exit as soon as + * possible + */ + if (!irqchip_in_kernel(vcpu->kvm) && + vcpu->run->request_interrupt_window && + !kvm_cpu_has_interrupt(vcpu)) { + vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; + return 0; + } +#endif /*XXX*/ + return 1; +} + +/* + * The exit handlers return 1 if the exit was handled fully and guest execution + * may resume. Otherwise they set the kvm_run parameter to indicate what needs + * to be done to userspace and return 0. 
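+ *
+ * The table below is indexed by the basic exit reason read from the
+ * VMCS; vmx_handle_exit() dispatches through it and reports
+ * KVM_EXIT_UNKNOWN for any reason that has no handler.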
+ */ +static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { + [EXIT_REASON_EXCEPTION_NMI] = handle_exception, + [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, + [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, + [EXIT_REASON_NMI_WINDOW] = handle_nmi_window, + [EXIT_REASON_IO_INSTRUCTION] = handle_io, + [EXIT_REASON_CR_ACCESS] = handle_cr, + [EXIT_REASON_DR_ACCESS] = handle_dr, + [EXIT_REASON_CPUID] = handle_cpuid, + [EXIT_REASON_MSR_READ] = handle_rdmsr, + [EXIT_REASON_MSR_WRITE] = handle_wrmsr, + [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, + [EXIT_REASON_HLT] = handle_halt, + [EXIT_REASON_INVLPG] = handle_invlpg, + [EXIT_REASON_VMCALL] = handle_vmcall, + [EXIT_REASON_VMCLEAR] = handle_vmx_insn, + [EXIT_REASON_VMLAUNCH] = handle_vmx_insn, + [EXIT_REASON_VMPTRLD] = handle_vmx_insn, + [EXIT_REASON_VMPTRST] = handle_vmx_insn, + [EXIT_REASON_VMREAD] = handle_vmx_insn, + [EXIT_REASON_VMRESUME] = handle_vmx_insn, + [EXIT_REASON_VMWRITE] = handle_vmx_insn, + [EXIT_REASON_VMOFF] = handle_vmx_insn, + [EXIT_REASON_VMON] = handle_vmx_insn, + [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, + [EXIT_REASON_APIC_ACCESS] = handle_apic_access, + [EXIT_REASON_WBINVD] = handle_wbinvd, + [EXIT_REASON_TASK_SWITCH] = handle_task_switch, + [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, + [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, + [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, + [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, + [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op, + [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op, +}; + +static const int kvm_vmx_max_exit_handlers = + ARRAY_SIZE(kvm_vmx_exit_handlers); + +/* + * The guest has exited. See if we can fix it or if we need userspace + * assistance. + */ + +static int vmx_handle_exit(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + uint32_t exit_reason = vmx->exit_reason; + uint32_t vectoring_info = vmx->idt_vectoring_info; + + /* If guest state is invalid, start emulating */ + if (vmx->emulation_required && emulate_invalid_guest_state) + return handle_invalid_guest_state(vcpu); + + /* Access CR3 don't cause VMExit in paging mode, so we need + * to sync with guest real CR3. */ + if (enable_ept && is_paging(vcpu)) + vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); + + if (vmx->fail) { + vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; + vcpu->run->fail_entry.hardware_entry_failure_reason + = vmcs_read32(VM_INSTRUCTION_ERROR); + return 0; + } + + if ((vectoring_info & VECTORING_INFO_VALID_MASK) && + (exit_reason != EXIT_REASON_EXCEPTION_NMI && + exit_reason != EXIT_REASON_EPT_VIOLATION && + exit_reason != EXIT_REASON_TASK_SWITCH)) + cmn_err(CE_WARN, "%s: unexpected, valid vectoring info " + "(0x%x) and exit reason is 0x%x\n", + __func__, vectoring_info, exit_reason); + +#ifdef XXX + if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) { + if (vmx_interrupt_allowed(vcpu)) { + vmx->soft_vnmi_blocked = 0; + } else if (vmx->vnmi_blocked_time > 1000000000LL && + vcpu->arch.nmi_pending) { + /* + * This CPU don't support us in finding the end of an + * NMI-blocked window if the guest runs with IRQs + * disabled. So we pull the trigger after 1 s of + * futile waiting, but inform the user about this. 
+ */ + cmn_err(CE_WARN, "%s: Breaking out of NMI-blocked " + "state on VCPU %d after 1 s timeout\n", + __func__, vcpu->vcpu_id); + vmx->soft_vnmi_blocked = 0; + } + } +#endif /*XXX*/ + + if (exit_reason < kvm_vmx_max_exit_handlers + && kvm_vmx_exit_handlers[exit_reason]) + return kvm_vmx_exit_handlers[exit_reason](vcpu); + else { + vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; + vcpu->run->hw.hardware_exit_reason = exit_reason; + } + return 0; +} + +static inline void kvm_guest_exit(void) +{ +#ifdef XXX + account_system_vtime(current); + current->flags &= ~PF_VCPU; +#endif /*XXX*/ +} + +static inline void kvm_guest_enter(void) +{ +#ifdef XXX + account_system_vtime(current); + current->flags |= PF_VCPU; +#endif /*XXX*/ +} + +int mmu_topup_memory_caches(struct kvm_vcpu *vcpu); + +int kvm_mmu_load(struct kvm_vcpu *vcpu) +{ + int r; + + r = mmu_topup_memory_caches(vcpu); + if (r) + goto out; + mutex_enter(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu); + r = mmu_alloc_roots(vcpu); + mmu_sync_roots(vcpu); + mutex_exit(&vcpu->kvm->mmu_lock); + if (r) + goto out; + /* set_cr3() should ensure TLB has been flushed */ + kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); +out: + return r; +} + +static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.mmu.root_hpa != INVALID_PAGE) + return 0; + + return kvm_mmu_load(vcpu); +} + +static int vcpu_enter_guest(struct kvm_vcpu *vcpu) +{ + int r; + + int req_int_win = !irqchip_in_kernel(vcpu->kvm) && + vcpu->run->request_interrupt_window; + + if (vcpu->requests) + if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) + kvm_mmu_unload(vcpu); + + r = kvm_mmu_reload(vcpu); + if (r) + goto out; + if (vcpu->requests) { + if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) + __kvm_migrate_timers(vcpu); + if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests)) + kvm_write_guest_time(vcpu); + if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) + kvm_mmu_sync_roots(vcpu); + if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) + kvm_x86_ops->tlb_flush(vcpu); + if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, + &vcpu->requests)) { + vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; + r = 0; + goto out; + } + if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { + vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; + r = 0; + goto out; + } + if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) { + vcpu->fpu_active = 0; + kvm_x86_ops->fpu_deactivate(vcpu); + } + } + + kpreempt_disable(); + + kvm_x86_ops->prepare_guest_switch(vcpu); +#ifdef XXX + if (vcpu->fpu_active) + kvm_load_guest_fpu(vcpu); +#endif /*XXX*/ + kpreempt_disable(); + + BT_CLEAR(&vcpu->requests, KVM_REQ_KICK); +#ifdef XXX + smp_mb__after_clear_bit(); +#endif /*XXX*/ + + if (vcpu->requests /*XXX || need_resched() || signal_pending(current)*/) { + BT_SET(&vcpu->requests, KVM_REQ_KICK); + kpreempt_enable(); + r = 1; + goto out; + } +#ifdef XXX + inject_pending_event(vcpu); + + /* enable NMI/IRQ window open exits if needed */ + if (vcpu->arch.nmi_pending) + kvm_x86_ops->enable_nmi_window(vcpu); + else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) + kvm_x86_ops->enable_irq_window(vcpu); + + if (kvm_lapic_enabled(vcpu)) { + update_cr8_intercept(vcpu); +#ifdef XXX + kvm_lapic_sync_to_vapic(vcpu); +#endif /*XXX*/ + } + + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); +#endif /*XXX*/ + kvm_guest_enter(); + +#ifdef XXX + if (unlikely(vcpu->arch.switch_db_regs)) { + set_debugreg(0, 7); + set_debugreg(vcpu->arch.eff_db[0], 0); + 
set_debugreg(vcpu->arch.eff_db[1], 1); + set_debugreg(vcpu->arch.eff_db[2], 2); + set_debugreg(vcpu->arch.eff_db[3], 3); + } + + trace_kvm_entry(vcpu->vcpu_id); +#endif /*XXX*/ + kvm_x86_ops->run(vcpu); +#ifdef XXX + /* + * If the guest has used debug registers, at least dr7 + * will be disabled while returning to the host. + * If we don't have active breakpoints in the host, we don't + * care about the messed up debug address registers. But if + * we have some of them active, restore the old state. + */ + if (hw_breakpoint_active()) + hw_breakpoint_restore(); +#endif /*XXX*/ + BT_SET(&vcpu->requests, KVM_REQ_KICK); + +#ifdef XXX + ++vcpu->stat.exits; +#endif /*XXX*/ + kvm_guest_exit(); + + kpreempt_enable(); +#ifdef XXX + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + + /* + * Profile KVM exit RIPs: + */ + if (unlikely(prof_on == KVM_PROFILING)) { + unsigned long rip = kvm_rip_read(vcpu); + profile_hit(KVM_PROFILING, (void *)rip); + } + + kvm_lapic_sync_from_vapic(vcpu); +#endif /*XXX*/ + r = kvm_x86_ops->handle_exit(vcpu); +out: + return r; +} + + +static void post_kvm_run_save(struct kvm_vcpu *vcpu) +{ + struct kvm_run *kvm_run = vcpu->run; + + kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0; + kvm_run->cr8 = kvm_get_cr8(vcpu); + kvm_run->apic_base = kvm_get_apic_base(vcpu); + if (irqchip_in_kernel(vcpu->kvm)) + kvm_run->ready_for_interrupt_injection = 1; +#ifdef XXX + else + kvm_run->ready_for_interrupt_injection = + kvm_arch_interrupt_allowed(vcpu) && + !kvm_cpu_has_interrupt(vcpu) && + !kvm_event_needs_reinjection(vcpu); +#endif /*XXX*/ +} + +/* + * The vCPU has executed a HLT instruction with in-kernel mode enabled. + */ +void kvm_vcpu_block(struct kvm_vcpu *vcpu) +{ +#ifdef XXX + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); + + if (kvm_arch_vcpu_runnable(vcpu)) { + set_bit(KVM_REQ_UNHALT, &vcpu->requests); + break; + } + if (kvm_cpu_has_pending_timer(vcpu)) + break; + if (signal_pending(current)) + break; + + schedule(); + } + + finish_wait(&vcpu->wq, &wait); +#endif /*XXX*/ +} + +static void vapic_enter(struct kvm_vcpu *vcpu) +{ +#ifdef XXX + struct kvm_lapic *apic = vcpu->arch.apic; + struct page *page; + + if (!apic || !apic->vapic_addr) + return; + + page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGESHIFT); + + vcpu->arch.apic->vapic_page = page; +#endif /*XXX*/ +} + +extern int kvm_apic_id(struct kvm_lapic *apic); + +static void vapic_exit(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + int idx; +#ifdef XXX + if (!apic || !apic->vapic_addr) +#endif /*XXX*/ + return; +#ifdef XXX + idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_release_page_dirty(apic->vapic_page); + mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGESHIFT); + srcu_read_unlock(&vcpu->kvm->srcu, idx); +#endif /*XXX*/ +} + +void kvm_lapic_reset(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic; + int i; + + ASSERT(vcpu); + apic = vcpu->arch.apic; + ASSERT(apic != NULL); + +#ifdef XXX + /* Stop the timer in case it's a reset to an active apic */ + hrtimer_cancel(&apic->lapic_timer.timer); +#endif /*XXX*/ + + apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24); + kvm_apic_set_version(apic->vcpu); + + for (i = 0; i < APIC_LVT_NUM; i++) + apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); + apic_set_reg(apic, APIC_LVT0, + SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); + + apic_set_reg(apic, APIC_DFR, 0xffffffffU); + apic_set_reg(apic, APIC_SPIV, 0xff); + apic_set_reg(apic, APIC_TASKPRI, 0); + apic_set_reg(apic, APIC_LDR, 
0); + apic_set_reg(apic, APIC_ESR, 0); + apic_set_reg(apic, APIC_ICR, 0); + apic_set_reg(apic, APIC_ICR2, 0); + apic_set_reg(apic, APIC_TDCR, 0); + apic_set_reg(apic, APIC_TMICT, 0); + for (i = 0; i < 8; i++) { + apic_set_reg(apic, APIC_IRR + 0x10 * i, 0); + apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); + apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); + } + apic->irr_pending = 0; +#ifdef XXX + update_divide_count(apic); + atomic_set(&apic->lapic_timer.pending, 0); + if (kvm_vcpu_is_bsp(vcpu)) + vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; + apic_update_ppr(apic); +#endif /*XXX*/ + + vcpu->arch.apic_arb_prio = 0; + + cmn_err(CE_NOTE, "%s: vcpu=%p, id=%d, base_msr= 0x%016 PRIx64 base_address=0x%0lx.\n", + __func__, vcpu, kvm_apic_id(apic), vcpu->arch.apic_base, apic->base_address); +} + +static int __vcpu_run(struct kvm_vcpu *vcpu) +{ + int r; + struct kvm *kvm = vcpu->kvm; + + if (vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED) { + cmn_err(CE_NOTE, "vcpu %d received sipi with vector # %x\n", + vcpu->vcpu_id, vcpu->arch.sipi_vector); + kvm_lapic_reset(vcpu); + r = kvm_arch_vcpu_reset(vcpu); + if (r) + return r; + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + } + +#ifdef XXX + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); +#endif /*XXX*/ + vapic_enter(vcpu); + + r = 1; + while (r > 0) { + if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) + r = vcpu_enter_guest(vcpu); + else { +#ifdef XXX + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); +#endif /*XXX*/ + kvm_vcpu_block(vcpu); +#ifdef XXX + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); +#endif /*XXX*/ + /* + * XXX - the following should use a bitset_t + * and do bitset_atomic_test_and_del(). + * but I am lazy, and will get to it later + */ + if (BT_TEST(&vcpu->requests, KVM_REQ_UNHALT)) + { + BT_CLEAR(&vcpu->requests, KVM_REQ_UNHALT); + switch(vcpu->arch.mp_state) { + case KVM_MP_STATE_HALTED: + vcpu->arch.mp_state = + KVM_MP_STATE_RUNNABLE; + case KVM_MP_STATE_RUNNABLE: + break; + case KVM_MP_STATE_SIPI_RECEIVED: + default: + r = -EINTR; + break; + } + } + } + + if (r <= 0) + break; + +#ifdef XXX + clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); + if (kvm_cpu_has_pending_timer(vcpu)) + kvm_inject_pending_timer_irqs(vcpu); + if (dm_request_for_irq_injection(vcpu)) { + r = -EINTR; + vcpu->run->exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.request_irq_exits; + } + + if (signal_pending(current)) { + r = -EINTR; + vcpu->run->exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.signal_exits; + } + if (need_resched()) { + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); + kvm_resched(vcpu); + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); + } +#endif /*XXX*/ + } +#ifdef XXX + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); +#endif /*XXX*/ + post_kvm_run_save(vcpu); + vapic_exit(vcpu); + return r; +} + + +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + int r; + sigset_t sigsaved; + + vcpu_load(vcpu); + + if (vcpu->sigset_active) + sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); + + if (vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED) { + kvm_vcpu_block(vcpu); + BT_CLEAR(&vcpu->requests, KVM_REQ_UNHALT); + r = -EAGAIN; + goto out; + } + + /* re-sync apic's tpr */ + if (!irqchip_in_kernel(vcpu->kvm)) + kvm_set_cr8(vcpu, kvm_run->cr8); + + + if (vcpu->arch.pio.cur_count) { +#ifdef XXX + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + r = complete_pio(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); +#endif /*XXX*/ + if (r) + goto out; + } + if (vcpu->mmio_needed) { + memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); + 
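+		/*
+		 * Editorial note: userspace has serviced the MMIO read exit;
+		 * the eight data bytes it returned in kvm_run->mmio.data were
+		 * just staged into vcpu->mmio_data so that the
+		 * emulate_instruction() call below (currently compiled out
+		 * under XXX) can complete the faulting instruction once
+		 * emulation is re-enabled.
+		 */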
vcpu->mmio_read_completed = 1; + vcpu->mmio_needed = 0; +#ifdef XXX + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0, + EMULTYPE_NO_DECODE); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + if (r == EMULATE_DO_MMIO) { + /* + * Read-modify-write. Back to userspace. + */ + r = 0; + goto out; + } +#endif /*XXX*/ + } + if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) + kvm_register_write(vcpu, VCPU_REGS_RAX, + kvm_run->hypercall.ret); + + r = __vcpu_run(vcpu); + +out: + if (vcpu->sigset_active) + sigprocmask(SIG_SETMASK, &sigsaved, NULL); + + vcpu_put(vcpu); + return r; +} + static int kvm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred_p, int *rval_p) { - int rval = EINVAL; + int rval = DDI_SUCCESS; + volatile int x; /* XXX - dtrace was not getting fbt return probe */ switch(cmd) { case KVM_GET_API_VERSION: cmn_err(CE_NOTE, "kvm_ioctl: KVM_GET_API_VERSION"); - if (arg != NULL) - return (rval); + if (arg != NULL) { + rval = EINVAL; + break; + } *rval_p = KVM_API_VERSION; - cmn_err(CE_NOTE, "kvm_ioctl: set rval_p to %d\n", *rval_p); - rval = DDI_SUCCESS; break; case KVM_CREATE_VM: - if (arg == NULL) - return (rval); + if (arg == NULL) { + rval = EINVAL; + break; + } rval = kvm_dev_ioctl_create_vm(arg, mode); - return (rval); + break; + case KVM_RUN: { + struct kvm_run_ioc kvm_run_ioc; + struct kvm *kvmp; + struct kvm_vcpu *vcpu; + + if (!arg) { + rval = EINVAL; + break; + } + + if (ddi_copyin((caddr_t)arg, &kvm_run_ioc, sizeof kvm_run_ioc, mode)) { + rval = EFAULT; + break; + } + + kvmp = find_kvm_id(kvm_run_ioc.kvm_kvmid); + if (kvmp == NULL) { + rval = EINVAL; + break; + } + if (!kvmp || kvm_run_ioc.kvm_cpu_index >= kvmp->online_vcpus) { + rval = EINVAL; + break; + } + vcpu = kvmp->vcpus[kvm_run_ioc.kvm_cpu_index]; + + rval = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); + break; + } case KVM_CHECK_EXTENSION: rval = kvm_dev_ioctl_check_extension_generic(arg, rval_p); - if (rval != DDI_SUCCESS) - return (rval); break; + case KVM_GET_MSRS: { + struct kvm_msrs_ioc kvm_msrs_ioc; + struct kvm_msrs kvm_msrs; + struct kvm *kvmp; + struct kvm_vcpu *vcpu; + struct kvm_msr_entry *entries; + unsigned size; + int n; + + if (ddi_copyin((const void *)arg, &kvm_msrs_ioc, + sizeof(kvm_msrs_ioc), mode) != 0) { + rval = EFAULT; + break; + } + kvmp = find_kvm_id(kvm_msrs_ioc.kvm_kvmid); + rval = EINVAL; + if (kvmp == NULL) + break; + if (!kvmp || kvm_msrs_ioc.kvm_cpu_index >= kvmp->online_vcpus) + break; + + vcpu = kvmp->vcpus[kvm_msrs_ioc.kvm_cpu_index]; + + if (ddi_copyin(kvm_msrs_ioc.kvm_msrs, &kvm_msrs, sizeof(kvm_msrs), mode)) { + rval = EFAULT; + break; + } + + if (kvm_msrs.nmsrs >= MAX_IO_MSRS) { + rval = E2BIG; + break; + } + + size = sizeof(struct kvm_msr_entry) * kvm_msrs.nmsrs; + entries = (struct kvm_msr_entry *) kmem_alloc(size, KM_SLEEP); + if (!entries) { + rval = ENOMEM; + break; + } + + if (ddi_copyin((caddr_t)(((uint64_t)kvm_msrs_ioc.kvm_msrs)+(sizeof (struct kvm_msrs))), entries, size, mode)) { + kmem_free(entries, size); + rval = EFAULT; + break; + } + + rval = n = __msr_io(vcpu, &kvm_msrs, entries, kvm_get_msr); + + if (rval < 0) { + kmem_free(entries, size); + rval = EINVAL; + break; + } + + rval = ddi_copyout(entries, (caddr_t)(((uint64_t)kvm_msrs_ioc.kvm_msrs)+(sizeof (struct kvm_msrs))), size, mode); + kmem_free(entries, size); + + *rval_p = n; + + break; + } + + case KVM_SET_MSRS: { + struct kvm_msrs_ioc kvm_msrs_ioc; + struct kvm_msrs kvm_msrs; + struct kvm *kvmp; + struct kvm_vcpu *vcpu; + struct 
kvm_msr_entry *entries; + unsigned size; + int n; + + if (ddi_copyin((const void *)arg, &kvm_msrs_ioc, + sizeof(kvm_msrs_ioc), mode) != 0) { + rval = EFAULT; + break; + } + + rval = EINVAL; + kvmp = find_kvm_id(kvm_msrs_ioc.kvm_kvmid); + if (kvmp == NULL) + break; + if (!kvmp || kvm_msrs_ioc.kvm_cpu_index >= kvmp->online_vcpus) + break; + + vcpu = kvmp->vcpus[kvm_msrs_ioc.kvm_cpu_index]; + + if (ddi_copyin(kvm_msrs_ioc.kvm_msrs, &kvm_msrs, sizeof(kvm_msrs), mode)) { + rval = EFAULT; + break; + } + + if (kvm_msrs.nmsrs >= MAX_IO_MSRS) { + rval = E2BIG; + break; + } + + size = sizeof(struct kvm_msr_entry) * kvm_msrs.nmsrs; + entries = (struct kvm_msr_entry *)kmem_alloc(size, KM_SLEEP); + if (!entries) { + rval = ENOMEM; + break; + } + + if (ddi_copyin((caddr_t)(((uint64_t)kvm_msrs_ioc.kvm_msrs)+(sizeof (struct kvm_msrs))), entries, size, mode)) { + kmem_free(entries, size); + rval = EFAULT; + break; + } + + rval = n = __msr_io(vcpu, &kvm_msrs, entries, do_set_msr); + + if (rval < 0) { + kmem_free(entries, size); + rval = EINVAL; + break; + } + kmem_free(entries, size); + *rval_p = n; + break; + } + case KVM_CREATE_VCPU: { struct kvm_vcpu_ioc kvm_vcpu; struct kvm *kvmp; if (ddi_copyin((const void *)arg, &kvm_vcpu, - sizeof(kvm_vcpu), mode) != 0) - return (EFAULT); + sizeof(kvm_vcpu), mode) != 0) { + rval = EFAULT; + break; + } + rval = EINVAL; kvmp = find_kvm_id(kvm_vcpu.kvmid); if (kvmp == NULL) - return(EINVAL); + break; rval = kvm_vm_ioctl_create_vcpu(kvmp, kvm_vcpu.id, &kvm_vcpu, rval_p); - if (rval != 0) - return (rval); + if (rval != 0) { + rval = EINVAL; + break; + } + if (ddi_copyout(&kvm_vcpu, (void *)arg, sizeof(kvm_vcpu), mode) != 0) - return EFAULT; + rval = EFAULT; break; } @@ -2602,54 +8781,261 @@ kvm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred_p, int *rval_ struct kvm *kvmp; if (ddi_copyin((const void *)arg, &kvmioc, - sizeof(kvmioc), mode) != 0) - return (EFAULT); + sizeof(kvmioc), mode) != 0) { + rval = EFAULT; + break; + } kvmp = find_kvm_id(kvmioc.kvmid); - if (kvmp == NULL) - return(EINVAL); + if (kvmp == NULL) { + rval = EINVAL; + break; + } rval = kvm_vm_ioctl_set_memory_region(kvmp, &kvmioc.kvm_userspace_map, 1); - if (rval != 0) - return (rval); + if (rval != 0) { + rval = EINVAL; + break; + } break; } case KVM_GET_SUPPORTED_CPUID: { struct kvm_cpuid2 *cpuid_arg = (struct kvm_cpuid2 *)arg; struct kvm_cpuid2 cpuid; - if (ddi_copyin(cpuid_arg, &cpuid, sizeof (cpuid), mode)) - return (EFAULT); + if (ddi_copyin(cpuid_arg, &cpuid, sizeof (cpuid), mode)) { + rval = EFAULT; + break; + } rval = kvm_dev_ioctl_get_supported_cpuid(&cpuid, cpuid_arg->entries, mode); if (rval) - return (rval); + break; if (ddi_copyout(&cpuid, cpuid_arg, sizeof (cpuid), mode)) - return (EFAULT); + rval = EFAULT; + break; + } + + case KVM_GET_MSR_INDEX_LIST: { + struct kvm_msr_list *user_msr_list = (struct kvm_msr_list *)arg; + struct kvm_msr_list msr_list; + unsigned n; + + if (ddi_copyin(user_msr_list, &msr_list, sizeof msr_list, mode)) { + rval = EFAULT; + break; + } + + n = msr_list.nmsrs; + msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); + if (ddi_copyout(&msr_list, user_msr_list, sizeof msr_list, mode)) { + rval = EFAULT; + break; + } + if (n < msr_list.nmsrs) { + rval = E2BIG; + break; + } + rval = EFAULT; + if (ddi_copyout(&msrs_to_save, user_msr_list->indices, + num_msrs_to_save * sizeof(uint32_t), mode)) + break; + if (ddi_copyout(&emulated_msrs, + user_msr_list->indices + num_msrs_to_save, + ARRAY_SIZE(emulated_msrs) * sizeof(uint32_t), mode)) + break; + 
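+		/*
+		 * Editorial note: both the saved and emulated MSR index
+		 * arrays were copied out to user_msr_list->indices above,
+		 * so the EFAULT default set earlier is replaced with success.
+		 */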
rval = 0; + *rval_p = 0; + break; + } + case KVM_GET_REGS: { + struct kvm_regs_ioc kvm_regs_ioc; + struct kvm *kvmp; + struct kvm_vcpu *vcpu; + + if (ddi_copyin((caddr_t)arg, &kvm_regs_ioc, sizeof (kvm_regs_ioc), mode)) { + rval = EFAULT; + break; + } + + kvmp = find_kvm_id(kvm_regs_ioc.kvm_kvmid); + + if (!kvmp || kvm_regs_ioc.kvm_cpu_index >= kvmp->online_vcpus) { + rval = EINVAL; + break; + } + + vcpu = kvmp->vcpus[kvm_regs_ioc.kvm_cpu_index]; + + rval = kvm_arch_vcpu_ioctl_get_regs(vcpu, &kvm_regs_ioc.kvm_regs); + if (rval) { + rval = EINVAL; + break; + } + if (ddi_copyout(&kvm_regs_ioc, (caddr_t)arg, sizeof(kvm_regs_ioc), mode)) + rval = EFAULT; + *rval_p = 0; + break; + } + case KVM_SET_REGS: { + struct kvm_regs_ioc kvm_regs_ioc; + struct kvm *kvmp; + struct kvm_vcpu *vcpu; + + if (ddi_copyin((caddr_t)arg, &kvm_regs_ioc, sizeof (kvm_regs_ioc), mode)) { + rval = EFAULT; + break; + } + + kvmp = find_kvm_id(kvm_regs_ioc.kvm_kvmid); + if (!kvmp || kvm_regs_ioc.kvm_cpu_index >= kvmp->online_vcpus) { + rval = EINVAL; + break; + } + + vcpu = kvmp->vcpus[kvm_regs_ioc.kvm_cpu_index]; + + cmn_err(CE_NOTE, "KVM_SET_REGS: rax = %lx, rbx = %lx, rcx = %lx, rdx = %lx\n", + kvm_regs_ioc.kvm_regs.rax, kvm_regs_ioc.kvm_regs.rbx, kvm_regs_ioc.kvm_regs.rcx, kvm_regs_ioc.kvm_regs.rdx); + + rval = kvm_arch_vcpu_ioctl_set_regs(vcpu, &kvm_regs_ioc.kvm_regs); + if (rval) + rval = EINVAL; + *rval_p = 0; + break; + } + case KVM_GET_FPU: { + struct kvm_fpu_ioc kvm_fpu_ioc; + struct kvm *kvmp; + struct kvm_vcpu *vcpu; + + if (ddi_copyin((caddr_t)arg, &kvm_fpu_ioc, sizeof(kvm_fpu_ioc), mode)) { + rval = EFAULT; + break; + } + + kvmp = find_kvm_id(kvm_fpu_ioc.kvm_kvmid); + if (!kvmp || kvm_fpu_ioc.kvm_cpu_index >= kvmp->online_vcpus) { + rval = EINVAL; + break; + } + + vcpu = kvmp->vcpus[kvm_fpu_ioc.kvm_cpu_index]; + + rval = kvm_arch_vcpu_ioctl_get_fpu(vcpu, &kvm_fpu_ioc.fpu); + if (rval) { + rval = EINVAL; + break; + } + + if (ddi_copyout(&kvm_fpu_ioc, (caddr_t)arg, sizeof(struct kvm_fpu), mode)) + rval = EFAULT; + + *rval_p = 0; + break; + } + case KVM_SET_FPU: { + struct kvm_fpu_ioc kvm_fpu_ioc; + struct kvm *kvmp; + struct kvm_vcpu *vcpu; + + if (ddi_copyin((caddr_t)arg, &kvm_fpu_ioc, sizeof(kvm_fpu_ioc), mode)) { + rval = EFAULT; + break; + } + + kvmp = find_kvm_id(kvm_fpu_ioc.kvm_kvmid); + if (!kvmp || kvm_fpu_ioc.kvm_cpu_index >= kvmp->online_vcpus) { + rval = EINVAL; + break; + } + + vcpu = kvmp->vcpus[kvm_fpu_ioc.kvm_cpu_index]; + + rval = kvm_arch_vcpu_ioctl_set_fpu(vcpu, &kvm_fpu_ioc.fpu); + if (rval) + rval = EINVAL; + *rval_p = 0; break; } + case KVM_GET_SREGS: { + struct kvm_sregs_ioc kvm_sregs_ioc; + struct kvm *kvmp; + struct kvm_vcpu *vcpu; + if (ddi_copyin((caddr_t)arg, &kvm_sregs_ioc, sizeof (kvm_sregs_ioc), mode)) { + rval = EFAULT; + break; + } + + kvmp = find_kvm_id(kvm_sregs_ioc.kvm_kvmid); + if (!kvmp || kvm_sregs_ioc.kvm_cpu_index >= kvmp->online_vcpus) { + rval = EINVAL; + break; + } + + vcpu = kvmp->vcpus[kvm_sregs_ioc.kvm_cpu_index]; + + rval = kvm_arch_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs_ioc.sregs); + if (rval) { + rval = EINVAL; + break; + } + if (ddi_copyout(&kvm_sregs_ioc, (caddr_t)arg, sizeof(kvm_sregs_ioc), mode)) + rval = EFAULT; + *rval_p = 0; + break; + } + case KVM_SET_SREGS: { + struct kvm_sregs_ioc kvm_sregs_ioc; + struct kvm *kvmp; + struct kvm_vcpu *vcpu; + + if (ddi_copyin((caddr_t)arg, &kvm_sregs_ioc, sizeof (kvm_sregs_ioc), mode)) { + rval = EFAULT; + break; + } + + kvmp = find_kvm_id(kvm_sregs_ioc.kvm_kvmid); + if (!kvmp || kvm_sregs_ioc.kvm_cpu_index >= 
kvmp->online_vcpus) { + rval = EINVAL; + break; + } + + vcpu = kvmp->vcpus[kvm_sregs_ioc.kvm_cpu_index]; + + rval = kvm_arch_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs_ioc.sregs); + if (rval) + rval = EINVAL; + *rval_p = 0; + break; + } case KVM_SET_CPUID2: { struct kvm_cpuid2_ioc cpuid_ioc; struct kvm_cpuid2 cpuid_data; struct kvm_vcpu *vcpu; - rval = EFAULT; - if (ddi_copyin((const char *)arg, &cpuid_ioc, sizeof cpuid_ioc, mode)) - return (EFAULT); - if (cpuid_ioc.kvm_vcpu_addr == NULL) - return (EINVAL); + if (ddi_copyin((const char *)arg, &cpuid_ioc, sizeof cpuid_ioc, mode)) { + rval = EFAULT; + break; + } + if (cpuid_ioc.kvm_vcpu_addr == NULL) { + rval = EINVAL; + break; + } vcpu = (struct kvm_vcpu *)(cpuid_ioc.kvm_vcpu_addr); if (ddi_copyin((const char *)(cpuid_ioc.cpuid_data), (char *)&cpuid_data, - sizeof(cpuid_data), mode)) - return (EFAULT); + sizeof(cpuid_data), mode)) { + rval = EFAULT; + break; + } rval = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid_data, cpuid_data.entries, mode); if (rval) - return (rval); + rval = EINVAL; break; } @@ -2658,33 +9044,41 @@ kvm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred_p, int *rval_ struct kvm_cpuid2 cpuid_data; struct kvm_vcpu *vcpu; - rval = EFAULT; - if (ddi_copyin((const char *)arg, &cpuid_ioc, sizeof cpuid_ioc, mode)) - return (EFAULT); + if (ddi_copyin((const char *)arg, &cpuid_ioc, sizeof cpuid_ioc, mode)) { + rval = EFAULT; + break; + } - if (cpuid_ioc.kvm_vcpu_addr == NULL) - return (EINVAL); + if (cpuid_ioc.kvm_vcpu_addr == NULL) { + rval = EINVAL; + break; + } vcpu = (struct kvm_vcpu *)cpuid_ioc.kvm_vcpu_addr; if (ddi_copyin((const char *)(cpuid_ioc.cpuid_data), (char *)&cpuid_data, - sizeof(cpuid_data), mode)) - return (EFAULT); + sizeof(cpuid_data), mode)) { + rval = EFAULT; + break; + } rval = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid_data, cpuid_data.entries, mode); - if (rval) - return (rval); - rval = EFAULT; + if (rval) { + rval = EINVAL; + break; + } + if (ddi_copyout(&cpuid_ioc, (char *)arg, sizeof cpuid_ioc, mode)) - return (EFAULT); - rval = 0; + rval = EFAULT; break; } case KVM_GET_VCPU_MMAP_SIZE: - if (arg != NULL) - return (rval); + if (arg != NULL) { + rval = EINVAL; + break; + } *rval_p = ptob(1); break; case KVM_SET_TSS_ADDR: @@ -2692,22 +9086,27 @@ kvm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred_p, int *rval_ struct kvm_tss kvm_tss; struct kvm *kvmp; if (ddi_copyin((const void *)arg, &kvm_tss, - sizeof(kvm_tss), mode) != 0) - return (EFAULT); + sizeof(kvm_tss), mode) != 0) { + rval = EFAULT; + break; + } kvmp = find_kvm_id(kvm_tss.kvmid); - if (kvmp == NULL) - return(EINVAL); + if (kvmp == NULL) { + rval = EINVAL; + break; + } rval = kvm_vm_ioctl_set_tss_addr(kvmp, kvm_tss.addr); - if (rval != DDI_SUCCESS) - return (rval); + break; } default: - return (rval); /* x64, others may do other things... */ + rval = EINVAL; /* x64, others may do other things... */ } + + x = 10; /*XXX do something...*/ if (*rval_p == -1) return (EINVAL); - return (DDI_SUCCESS); + return (rval); } static int @@ -5,6 +5,7 @@ #include <sys/ddi.h> #include <sys/sunddi.h> +#include "kvm_types.h" #include <sys/bitmap.h> #define KVM_API_VERSION 12 /* same as linux (for qemu compatability...) */ @@ -13,6 +14,8 @@ #define offsetof(s, m) ((size_t)(&((s *)0)->m)) #endif +#define offset_in_page(p) ((unsigned long)(p) & ~PAGEMASK) + /* borrowed liberally from linux... 
*/ #define MAX_IO_MSRS 256 @@ -30,6 +33,8 @@ #define KVM_MAX_VCPUS 64 +#define MCG_CTL_P (1ULL<<8) /* MCG_CTL register available */ + #define KVM_MAX_MCE_BANKS 32 #define KVM_MCE_CAP_SUPPORTED MCG_CTL_P #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ @@ -154,12 +159,6 @@ #define KVM_NR_PAGE_SIZES 3 /* XXX assumes x86 */ -enum kvm_bus { - KVM_MMIO_BUS, - KVM_PIO_BUS, - KVM_NR_BUSES -}; - struct kvm_vcpu_data { char vcpu_vhpt[VHPT_SIZE]; char vcpu_vtlb[VTLB_SIZE]; @@ -175,105 +174,32 @@ struct kvm_vm_data { }; /* - * Address types: - * - * gva - guest virtual address - * gpa - guest physical address - * gfn - guest frame number - * hva - host virtual address - * hpa - host physical address - * hfn - host frame number - */ - -typedef unsigned long gva_t; -typedef uint64_t gpa_t; -typedef unsigned long gfn_t; - -typedef unsigned long hva_t; -typedef uint64_t hpa_t; -typedef unsigned long hfn_t; - -/* - * kvm_mmu_page_role, below, is defined as: - * - * bits 0:3 - total guest paging levels (2-4, or zero for real mode) - * bits 4:7 - page table level for this shadow (1-4) - * bits 8:9 - page table quadrant for 2-level guests - * bit 16 - direct mapping of virtual to physical mapping at gfn - * used for real mode and two-dimensional paging - * bits 17:19 - common access permissions for all ptes in this shadow page - */ -union kvm_mmu_page_role { - unsigned word; - struct { - unsigned glevels:4; - unsigned level:4; - unsigned quadrant:2; - unsigned pad_for_nice_hex_output:6; - unsigned direct:1; - unsigned access:3; - unsigned invalid:1; - unsigned cr4_pge:1; - unsigned nxe:1; - }w; -}; - - -/* - * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level - * 32-bit). The kvm_mmu structure abstracts the details of the current mmu - * mode. - */ - -struct kvm_vcpu; -struct kvm_mmu_page; - -struct kvm_mmu { - void (*new_cr3)(struct kvm_vcpu *vcpu); - int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, uint32_t err); - void (*free)(struct kvm_vcpu *vcpu); - gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, uint32_t access, - uint32_t *error); - void (*prefetch_page)(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *page); - int (*sync_page)(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp); - void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); - hpa_t root_hpa; - int root_level; - int shadow_root_level; - union kvm_mmu_page_role base_role; - - uint64_t *pae_root; - uint64_t rsvd_bits_mask[2][4]; -}; - -struct kvm_pv_mmu_op_buffer { - void *ptr; - unsigned len; - unsigned processed; - char pad[2]; - char buf[512]; /* XXX aligned */ -}; - -/* * We don't want allocation failures within the mmu code, so we preallocate * enough memory for a single page fault in a cache. */ -#define KVM_NR_MEM_OBJS 40 -#define KVM_NR_DB_REGS 4 -struct kvm_mmu_memory_cache { - int nobjs; - void *objects[KVM_NR_MEM_OBJS]; -}; +#define KVM_NR_DB_REGS 4 -/* Type, address-of, and value of an instruction's operand. */ -struct operand { - enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type; - unsigned int bytes; - unsigned long val, orig_val, *ptr; +/* + * fxsave fpu state. Taken from x86_64/processor.h. 
To be killed when + * we have asm/x86/processor.h + */ +struct fxsave { + uint16_t cwd; + uint16_t swd; + uint16_t twd; + uint16_t fop; + uint64_t rip; + uint64_t rdp; + uint32_t mxcsr; + uint32_t mxcsr_mask; + uint32_t st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ +#ifdef CONFIG_X86_64 + uint32_t xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ +#else + uint32_t xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ +#endif }; struct i387_fxsave_struct { @@ -311,109 +237,11 @@ struct i387_fxsave_struct { } __attribute__((aligned(16))); -struct kvm_pio_request { - unsigned long count; - int cur_count; - gva_t guest_gva; - int in; - int port; - int size; - int string; - int down; - int rep; -}; - #define KVM_MAX_CPUID_ENTRIES 40 -enum kvm_reg { - VCPU_REGS_RAX = 0, - VCPU_REGS_RCX = 1, - VCPU_REGS_RDX = 2, - VCPU_REGS_RBX = 3, - VCPU_REGS_RSP = 4, - VCPU_REGS_RBP = 5, - VCPU_REGS_RSI = 6, - VCPU_REGS_RDI = 7, -#ifdef CONFIG_X86_64 - VCPU_REGS_R8 = 8, - VCPU_REGS_R9 = 9, - VCPU_REGS_R10 = 10, - VCPU_REGS_R11 = 11, - VCPU_REGS_R12 = 12, - VCPU_REGS_R13 = 13, - VCPU_REGS_R14 = 14, - VCPU_REGS_R15 = 15, -#endif - VCPU_REGS_RIP, - NR_VCPU_REGS -}; -enum kvm_reg_ex { - VCPU_EXREG_PDPTR = NR_VCPU_REGS, -}; - -struct kvm_cpuid_entry2 { - uint32_t function; - uint32_t index; - uint32_t flags; - uint32_t eax; - uint32_t ebx; - uint32_t ecx; - uint32_t edx; - uint32_t padding[3]; -}; - -struct fetch_cache { - unsigned char data[15]; - unsigned long start; - unsigned long end; -}; - -struct decode_cache { - unsigned char twobyte; - unsigned char b; - unsigned char lock_prefix; - unsigned char rep_prefix; - unsigned char op_bytes; - unsigned char ad_bytes; - unsigned char rex_prefix; - struct operand src; - struct operand src2; - struct operand dst; - unsigned char has_seg_override; - unsigned char seg_override; - unsigned int d; - unsigned long regs[NR_VCPU_REGS]; - unsigned long eip, eip_orig; - /* modrm */ - unsigned char modrm; - unsigned char modrm_mod; - unsigned char modrm_reg; - unsigned char modrm_rm; - unsigned char use_modrm_ea; - unsigned char rip_relative; - unsigned long modrm_ea; - void *modrm_ptr; - unsigned long modrm_val; - struct fetch_cache fetch; -}; +#include "kvm_emulate.h" -struct x86_emulate_ctxt { - /* Register state before/after emulation. */ - struct kvm_vcpu *vcpu; - - unsigned long eflags; - /* Emulated execution mode, represented by an X86EMUL_MODE value. */ - int mode; - uint32_t cs_base; - - /* interruptibility state, as a result of execution of STI or MOV SS */ - int interruptibility; - - /* decode cache */ - struct decode_cache decode; -}; - /* * These structs MUST NOT be changed. * They are the ABI between hypervisor and guest OS. 
@@ -444,34 +272,12 @@ struct pvclock_vcpu_time_info { unsigned char pad[3]; } __attribute__((__packed__)); /* 32 bytes */ -/* In the Intel processor's MTRR interface, the MTRR type is always held in - an 8 bit field: */ -typedef unsigned char mtrr_type; - -#define MTRR_NUM_FIXED_RANGES 88 -#define MTRR_MAX_VAR_RANGES 256 - -struct mtrr_var_range { - uint32_t base_lo; - uint32_t base_hi; - uint32_t mask_lo; - uint32_t mask_hi; -}; - -struct mtrr_state_type { - struct mtrr_var_range var_ranges[MTRR_MAX_VAR_RANGES]; - mtrr_type fixed_ranges[MTRR_NUM_FIXED_RANGES]; - unsigned char enabled; - unsigned char have_fixed; - mtrr_type def_type; -}; - #define APIC_LDR 0xD0 - +#ifdef _KERNEL struct kvm_lapic { unsigned long base_address; -#ifdef XXX struct kvm_io_device dev; +#ifdef XXX struct kvm_timer lapic_timer; #endif /*XXX*/ uint32_t divide_count; @@ -484,114 +290,7 @@ struct kvm_lapic { struct page *vapic_page; }; -struct kvm_vcpu_arch { - uint64_t host_tsc; - /* - * rip and regs accesses must go through - * kvm_{register,rip}_{read,write} functions. - */ - unsigned long regs[NR_VCPU_REGS]; - uint32_t regs_avail; - uint32_t regs_dirty; - - unsigned long cr0; - unsigned long cr0_guest_owned_bits; - unsigned long cr2; - unsigned long cr3; - unsigned long cr4; - unsigned long cr4_guest_owned_bits; - unsigned long cr8; - uint32_t hflags; - uint64_t pdptrs[4]; /* pae */ - uint64_t efer; - uint64_t apic_base; - struct kvm_lapic *apic; /* kernel irqchip context */ - int32_t apic_arb_prio; - int mp_state; - int sipi_vector; - uint64_t ia32_misc_enable_msr; - char tpr_access_reporting; - - struct kvm_mmu mmu; - /* only needed in kvm_pv_mmu_op() path, but it's hot so - * put it here to avoid allocation */ - struct kvm_pv_mmu_op_buffer mmu_op_buffer; - - struct kvm_mmu_memory_cache mmu_pte_chain_cache; - struct kvm_mmu_memory_cache mmu_rmap_desc_cache; - struct kvm_mmu_memory_cache mmu_page_cache; - struct kvm_mmu_memory_cache mmu_page_header_cache; - - gfn_t last_pt_write_gfn; - int last_pt_write_count; - uint64_t *last_pte_updated; - gfn_t last_pte_gfn; - - struct { - gfn_t gfn; /* presumed gfn during guest pte update */ - pfn_t pfn; /* pfn corresponding to that gfn */ - unsigned long mmu_seq; - } update_pte; - - struct i387_fxsave_struct host_fx_image; - struct i387_fxsave_struct guest_fx_image; - - gva_t mmio_fault_cr2; - struct kvm_pio_request pio; - void *pio_data; - - unsigned char event_exit_inst_len; - - struct kvm_queued_exception { - char pending; - char has_error_code; - unsigned char nr; - uint32_t error_code; - } exception; - - struct kvm_queued_interrupt { - char pending; - char soft; - unsigned char nr; - } interrupt; - - int halt_request; /* real mode on Intel only */ - - int cpuid_nent; - struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES]; - /* emulate context */ - - struct x86_emulate_ctxt emulate_ctxt; - - gpa_t time; - struct pvclock_vcpu_time_info hv_clock; - unsigned int hv_clock_tsc_khz; - unsigned int time_offset; - struct page *time_page; - - char nmi_pending; - char nmi_injected; - - struct mtrr_state_type mtrr_state; - uint32_t pat; - - int switch_db_regs; - unsigned long db[KVM_NR_DB_REGS]; - unsigned long dr6; - unsigned long dr7; - unsigned long eff_db[KVM_NR_DB_REGS]; - - uint64_t mcg_cap; - uint64_t mcg_status; - uint64_t mcg_ctl; - uint64_t *mce_banks; - - /* used for guest single stepping over the given code position */ - unsigned short singlestep_cs; - unsigned long singlestep_rip; - /* fields used by HYPER-V emulation */ - uint64_t hv_vapic; -}; +struct 
vcpu_vmx; struct kvm_vcpu { struct kvm *kvm; @@ -615,18 +314,37 @@ struct kvm_vcpu { sigset_t sigset; struct kstat stat; -#ifdef CONFIG_HAS_IOMEM + /*#ifdef CONFIG_HAS_IOMEM*/ int mmio_needed; int mmio_read_completed; int mmio_is_write; int mmio_size; unsigned char mmio_data[8]; gpa_t mmio_phys_addr; -#endif + /*#endif*/ struct kvm_vcpu_arch arch; }; + +#define KVM_NR_SHARED_MSRS 16 + +struct kvm_shared_msrs_global { + int nr; + uint32_t msrs[KVM_NR_SHARED_MSRS]; +}; + +struct kvm_shared_msrs { +#ifdef XXX + struct user_return_notifier urn; +#endif /*XXX*/ + int registered; + struct kvm_shared_msr_values { + uint64_t host; + uint64_t curr; + } values[KVM_NR_SHARED_MSRS]; +}; + struct kvm_memory_slot { gfn_t base_gfn; unsigned long npages; @@ -641,12 +359,6 @@ struct kvm_memory_slot { int user_alloc; }; -#define KVM_MEMORY_SLOTS 32 /* XXX assumes x86 */ -#define KVM_PRIVATE_MEM_SLOTS 4 /* XXX assumes x86 */ -#define TSS_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 0) -#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 1) -#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 2) - struct kvm_memslots { int nmemslots; @@ -654,6 +366,7 @@ struct kvm_memslots { KVM_PRIVATE_MEM_SLOTS]; }; +#endif /*_KERNEL*/ #ifdef x86 @@ -784,21 +497,18 @@ struct kvm_regs { uint64_t rip, rflags; }; +struct kvm_regs_ioc { + struct kvm_regs kvm_regs; + int kvm_cpu_index; + int kvm_kvmid; +}; + /* for KVM_GET_LAPIC and KVM_SET_LAPIC */ #define KVM_APIC_REG_SIZE 0x400 struct kvm_lapic_state { char regs[KVM_APIC_REG_SIZE]; }; -struct kvm_segment { - uint64_t base; - uint32_t limit; - unsigned short selector; - unsigned char type; - unsigned char present, dpl, db, s, l, g, avl; - unsigned char unusable; - unsigned char padding; -}; struct kvm_dtable { uint64_t base; @@ -822,6 +532,12 @@ struct kvm_sregs { uint64_t interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64]; }; +struct kvm_sregs_ioc { + struct kvm_sregs sregs; + int kvm_cpu_index; + int kvm_kvmid; +}; + /* When set in flags, include corresponding fields on KVM_SET_VCPU_EVENTS */ #define KVM_VCPUEVENT_VALID_NMI_PENDING 0x00000001 #define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002 @@ -930,13 +646,6 @@ struct kvm_assigned_dev_kernel { kmutex_t assigned_dev_lock; }; -#define NR_PTE_CHAIN_ENTRIES 5 - -struct kvm_pte_chain { - uint64_t *parent_ptes[NR_PTE_CHAIN_ENTRIES]; - list_t link; -}; - /** * container_of - cast a member of a structure out to the containing structure * @ptr: the pointer to the member. @@ -948,39 +657,6 @@ struct kvm_pte_chain { const typeof( ((type *)0)->member ) *__mptr = (ptr); \ (type *)( (char *)__mptr - offsetof(type,member) );}) - -struct kvm_mmu_page { - struct list_node link; - struct list_node hash_link; - - struct list_node oos_link; - - /* - * The following two entries are used to key the shadow page in the - * hash table. - */ - gfn_t gfn; - union kvm_mmu_page_role role; - - uint64_t *spt; - /* hold the gfn of each spte inside spt */ - gfn_t *gfns; - /* - * One bit set per slot which has memory - * in this shadow page. - */ - unsigned long slot_bitmap[BT_BITOUL(KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)]; - int multimapped; /* More than one parent_pte? 
*/ - int root_count; /* Currently serving as active root */ - char unsync; - unsigned int unsync_children; - union { - uint64_t *parent_pte; /* !multimapped */ - list_t parent_ptes; /* hash list, multimapped, kvm_pte_chain */ - }v; - unsigned long unsync_child_bitmap[BT_BITOUL(512)]; -}; - #define PT64_ROOT_LEVEL 4 #define PT32_ROOT_LEVEL 2 #define PT32E_ROOT_LEVEL 3 @@ -1069,6 +745,11 @@ struct kvm_fpu { uint32_t pad2; }; +struct kvm_fpu_ioc { + struct kvm_fpu fpu; + int kvm_cpu_index; + int kvm_kvmid; +}; struct kvm_msr_entry { uint32_t index; @@ -1084,6 +765,12 @@ struct kvm_msrs { struct kvm_msr_entry entries[1]; }; +struct kvm_msrs_ioc { + struct kvm_msrs *kvm_msrs; + int kvm_cpu_index; + int kvm_kvmid; +}; + /* for KVM_GET_MSR_INDEX_LIST */ struct kvm_msr_list { uint32_t nmsrs; /* number of msrs in entries */ @@ -1122,73 +809,10 @@ struct pvclock_wall_clock { #define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg)) #define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1) - -struct kvm_mem_alias { - gfn_t base_gfn; - unsigned long npages; - gfn_t target_gfn; -#define KVM_ALIAS_INVALID 1UL - unsigned long flags; -}; - -#define KVM_ARCH_HAS_UNALIAS_INSTANTIATION - -struct kvm_mem_aliases { - struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; - int naliases; -}; - -struct kvm_xen_hvm_config { - uint32_t flags; - uint32_t msr; - uint64_t blob_addr_32; - uint64_t blob_addr_64; - unsigned char blob_size_32; - unsigned char blob_size_64; - unsigned char pad2[30]; -}; - -struct kvm_arch { - struct kvm_mem_aliases *aliases; - - unsigned int n_free_mmu_pages; - unsigned int n_requested_mmu_pages; - unsigned int n_alloc_mmu_pages; - list_t mmu_page_hash[KVM_NUM_MMU_PAGES]; - /* - * Hash table of struct kvm_mmu_page. - */ - list_t active_mmu_pages; - list_t assigned_dev_head; - struct iommu_domain *iommu_domain; - int iommu_flags; - struct kvm_pic *vpic; - struct kvm_ioapic *vioapic; - struct kvm_pit *vpit; - int vapics_in_nmi_mode; - - unsigned int tss_addr; - struct page *apic_access_page; - - gpa_t wall_clock; - - struct page *ept_identity_pagetable; - char ept_identity_pagetable_done; - gpa_t ept_identity_map_addr; - - unsigned long irq_sources_bitmap; - uint64_t vm_init_tsc; - int64_t kvmclock_offset; - - struct kvm_xen_hvm_config xen_hvm_config; - - /* fields used by HYPER-V emulation */ - uint64_t hv_guest_os_id; - uint64_t hv_hypercall; -}; - #endif /*x86*/ +#ifdef _KERNEL + struct kvm { kmutex_t mmu_lock; kmutex_t requests_lock; @@ -1217,10 +841,10 @@ struct kvm { struct kstat kvm_kstat; struct kvm_arch arch; volatile int users_count; -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET + /*#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET*/ struct kvm_coalesced_mmio_dev *coalesced_mmio_dev; struct kvm_coalesced_mmio_ring *coalesced_mmio_ring; -#endif + /*#endif*/ kmutex_t irq_lock; #ifdef CONFIG_HAVE_KVM_IRQCHIP @@ -1238,6 +862,7 @@ struct kvm { #endif /*XXX*/ int kvmid; /* unique identifier for this kvm */ }; +#endif /*_KERNEL*/ #define KVM_EXIT_UNKNOWN 0 #define KVM_EXIT_EXCEPTION 1 @@ -1486,19 +1111,32 @@ static inline void native_load_tr_desc(void) #define _IO(x, y) ((x<<8)|y) /* original is in /usr/include/sys/ioccom.h */ #define KVMIO 0xAE +/* for KVM_SET_CPUID2/KVM_GET_CPUID2 */ +struct kvm_cpuid2_ioc { + struct cpuid_data *cpuid_data; + uint64_t kvm_vcpu_addr; + int kvm_cpu_index; +}; + +/* for KVM_RUN */ +struct kvm_run_ioc { + int kvm_kvmid; + int kvm_cpu_index; +}; + /* * ioctls for vcpu fds */ #define KVM_RUN _IO(KVMIO, 0x80) -#define KVM_GET_REGS _IOR(KVMIO, 0x81, struct kvm_regs) -#define KVM_SET_REGS 
_IOW(KVMIO, 0x82, struct kvm_regs) -#define KVM_GET_SREGS _IOR(KVMIO, 0x83, struct kvm_sregs) -#define KVM_SET_SREGS _IOW(KVMIO, 0x84, struct kvm_sregs) +#define KVM_GET_REGS _IOR(KVMIO, 0x81, struct kvm_regs_ioc) +#define KVM_SET_REGS _IOW(KVMIO, 0x82, struct kvm_regs_ioc) +#define KVM_GET_SREGS _IOR(KVMIO, 0x83, struct kvm_sregs_ioc) +#define KVM_SET_SREGS _IOW(KVMIO, 0x84, struct kvm_sregs_ioc) #define KVM_INTERRUPT _IOW(KVMIO, 0x86, struct kvm_interrupt) -#define KVM_GET_FPU _IOR(KVMIO, 0x8c, struct kvm_fpu) -#define KVM_SET_FPU _IOW(KVMIO, 0x8d, struct kvm_fpu) -#define KVM_GET_MSRS _IOWR(KVMIO, 0x88, struct kvm_msrs) -#define KVM_SET_MSRS _IOW(KVMIO, 0x89, struct kvm_msrs) +#define KVM_GET_FPU _IOR(KVMIO, 0x8c, struct kvm_fpu_ioc) +#define KVM_SET_FPU _IOW(KVMIO, 0x8d, struct kvm_fpu_ioc) +#define KVM_GET_MSRS _IOWR(KVMIO, 0x88, struct kvm_msrs_ioc) +#define KVM_SET_MSRS _IOW(KVMIO, 0x89, struct kvm_msrs_ioc) #define KVM_GET_MP_STATE _IOR(KVMIO, 0x98, struct kvm_mp_state) #define KVM_SET_MP_STATE _IOW(KVMIO, 0x99, struct kvm_mp_state) /* Available with KVM_CAP_VCPU_EVENTS */ @@ -1554,6 +1192,20 @@ struct vmcs_config { uint32_t vmexit_ctrl; uint32_t vmentry_ctrl; }; + +#define RMAP_EXT 4 + +struct kvm_rmap_desc { + uint64_t *sptes[RMAP_EXT]; + struct kvm_rmap_desc *more; +}; + + +static struct vmx_capability { + uint32_t ept; + uint32_t vpid; +} vmx_capability; + struct vmcs { uint32_t revision_id; uint32_t abort; @@ -1576,13 +1228,6 @@ struct kvm_dirty_log { }v; }; -/* for KVM_REGISTER_COALESCED_MMIO / KVM_UNREGISTER_COALESCED_MMIO */ - -struct kvm_coalesced_mmio_zone { - uint64_t addr; - uint32_t size; - uint32_t pad; -}; struct kvm_coalesced_mmio { uint64_t phys_addr; @@ -1613,8 +1258,8 @@ struct kvm_mp_state { uint32_t mp_state; }; -#define KVM_SET_CPUID2 _IOW(KVMIO, 0x90, struct kvm_cpuid2) -#define KVM_GET_CPUID2 _IOWR(KVMIO, 0x91, struct kvm_cpuid2) +#define KVM_SET_CPUID2 _IOW(KVMIO, 0x90, struct kvm_cpuid2_ioc) +#define KVM_GET_CPUID2 _IOWR(KVMIO, 0x91, struct kvm_cpuid2_ioc) /* for kvm_memory_region::flags */ #define KVM_MEM_LOG_DIRTY_PAGES 1UL @@ -1668,11 +1313,6 @@ struct kvm_vcpu_ioc { }; -/* for KVM_SET_CPUID2/KVM_GET_CPUID2 */ -struct kvm_cpuid2_ioc { - struct cpuid_data *cpuid_data; - uint64_t kvm_vcpu_addr; -}; /* LDT or TSS descriptor in the GDT. 16 bytes. 
*/ struct ldttss_desc64 { @@ -1684,6 +1324,13 @@ struct ldttss_desc64 { uint32_t zero1; } __attribute__((packed)); +struct shared_msr_entry { + unsigned index; + uint64_t data; + uint64_t mask; +}; + +#ifdef _KERNEL struct vcpu_vmx { struct kvm_vcpu vcpu; list_t local_vcpus_link; @@ -1732,8 +1379,6 @@ struct vcpu_vmx { char rdtscp_enabled; }; -#ifdef _KERNEL - /* * vcpu->requests bit members */ @@ -1826,87 +1471,5 @@ static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memsl #define INVALID_PAGE (~(hpa_t)0) -struct kvm_x86_ops { - int (*cpu_has_kvm_support)(void); /* __init */ - int (*disabled_by_bios)(void); /* __init */ - int (*hardware_enable)(void *dummy); - void (*hardware_disable)(void *dummy); - void (*check_processor_compatibility)(void *rtn); - int (*hardware_setup)(void); /* __init */ - void (*hardware_unsetup)(void); /* __exit */ - int (*cpu_has_accelerated_tpr)(void); - void (*cpuid_update)(struct kvm_vcpu *vcpu); - - /* Create, but do not attach this VCPU */ - struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); - void (*vcpu_free)(struct kvm_vcpu *vcpu); - int (*vcpu_reset)(struct kvm_vcpu *vcpu); - - void (*prepare_guest_switch)(struct kvm_vcpu *vcpu); - void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); - void (*vcpu_put)(struct kvm_vcpu *vcpu); - void (*set_guest_debug)(struct kvm_vcpu *vcpu, -#ifdef XXX - struct kvm_guest_debug *dbg); -#else - void *dbg); -#endif - - int (*get_msr)(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t *pdata); - int (*set_msr)(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t data); - uint64_t (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); - void (*get_segment)(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg); - int (*get_cpl)(struct kvm_vcpu *vcpu); - void (*set_segment)(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg); - void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); - void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu); - void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); - void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); - void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); - void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); - void (*set_efer)(struct kvm_vcpu *vcpu, uint64_t efer); - void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); - void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); - void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); - void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); - int (*get_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long *dest); - int (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value); - void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); - unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); - void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); - void (*fpu_activate)(struct kvm_vcpu *vcpu); - void (*fpu_deactivate)(struct kvm_vcpu *vcpu); - - void (*tlb_flush)(struct kvm_vcpu *vcpu); - - void (*run)(struct kvm_vcpu *vcpu); - int (*handle_exit)(struct kvm_vcpu *vcpu); - void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); - void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); - uint32_t (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); - void (*patch_hypercall)(struct kvm_vcpu *vcpu, - unsigned char *hypercall_addr); - void (*set_irq)(struct kvm_vcpu *vcpu); - void (*set_nmi)(struct kvm_vcpu *vcpu); - void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr, - int 
has_error_code, uint32_t error_code); - int (*interrupt_allowed)(struct kvm_vcpu *vcpu); - int (*nmi_allowed)(struct kvm_vcpu *vcpu); - int (*get_nmi_mask)(struct kvm_vcpu *vcpu); - void (*set_nmi_mask)(struct kvm_vcpu *vcpu, int masked); - void (*enable_nmi_window)(struct kvm_vcpu *vcpu); - void (*enable_irq_window)(struct kvm_vcpu *vcpu); - void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); - int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); - int (*get_tdp_level)(void); - uint64_t (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, int is_mmio); - int (*get_lpage_level)(void); - int (*rdtscp_supported)(void); - - const struct trace_print_flags *exit_reasons_str; -}; - #endif @@ -1,849 +1,570 @@ +#ifndef __KVM_HOST_H +#define __KVM_HOST_H + /* - * Kernel-based Virtual Machine driver for Linux - * - * This header defines architecture specific interfaces, x86 version - * * This work is licensed under the terms of the GNU GPL, version 2. See * the COPYING file in the top-level directory. - * */ -#ifndef _ASM_X86_KVM_HOST_H -#define _ASM_X86_KVM_HOST_H - #ifdef XXX #include <linux/types.h> +#include <linux/hardirq.h> +#include <linux/list.h> +#include <linux/mutex.h> +#include <linux/spinlock.h> +#include <linux/signal.h> +#include <linux/sched.h> #include <linux/mm.h> -#include <linux/mmu_notifier.h> -#include <linux/tracepoint.h> +#include <linux/preempt.h> +#include <linux/msi.h> +#include <asm/signal.h> #include <linux/kvm.h> #include <linux/kvm_para.h> -#include <linux/kvm_types.h> - -#include <asm/pvclock-abi.h> -#include <asm/desc.h> -#include <asm/mtrr.h> -#include <asm/msr-index.h> - -#endif - -#define KVM_PIO_PAGE_OFFSET 1 -#define KVM_COALESCED_MMIO_PAGE_OFFSET 2 - -#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) -#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) -#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ - 0xFFFFFF0000000000ULL) - -#define INVALID_PAGE (~(hpa_t)0) -#define UNMAPPED_GVA (~(gpa_t)0) - -/* KVM Hugepage definitions for x86 */ -#define KVM_NR_PAGE_SIZES 3 -#define KVM_HPAGE_SHIFT(x) (PAGESHIFT + (((x) - 1) * 9)) -#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) -#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) -#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGESIZE) - -#define DE_VECTOR 0 -#define DB_VECTOR 1 -#define BP_VECTOR 3 -#define OF_VECTOR 4 -#define BR_VECTOR 5 -#define UD_VECTOR 6 -#define NM_VECTOR 7 -#define DF_VECTOR 8 -#define TS_VECTOR 10 -#define NP_VECTOR 11 -#define SS_VECTOR 12 -#define GP_VECTOR 13 -#define PF_VECTOR 14 -#define MF_VECTOR 16 -#define MC_VECTOR 18 - -#define SELECTOR_TI_MASK (1 << 2) -#define SELECTOR_RPL_MASK 0x03 - -#define IOPL_SHIFT 12 - -#define KVM_ALIAS_SLOTS 4 - -#define KVM_PERMILLE_MMU_PAGES 20 -#define KVM_MIN_ALLOC_MMU_PAGES 64 -#define KVM_MMU_HASH_SHIFT 10 -#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT) -#define KVM_MIN_FREE_MMU_PAGES 5 -#define KVM_REFILL_PAGES 25 -#define KVM_MAX_CPUID_ENTRIES 40 -#define KVM_NR_FIXED_MTRR_REGION 88 -#define KVM_NR_VAR_MTRR 8 - -extern kmutex_t kvm_lock; -extern list_t vm_list; - -struct kvm_vcpu; -struct kvm; - -enum { - VCPU_SREG_ES, - VCPU_SREG_CS, - VCPU_SREG_SS, - VCPU_SREG_DS, - VCPU_SREG_FS, - VCPU_SREG_GS, - VCPU_SREG_TR, - VCPU_SREG_LDTR, -}; - -#ifdef XXX -#include <asm/kvm_emulate.h> #endif /*XXX*/ -#define KVM_NR_MEM_OBJS 40 +#include "kvm_types.h" -#define KVM_NR_DB_REGS 4 +#define KVM_MEMORY_SLOTS 32 /* XXX assumes x86 */ +#define KVM_PRIVATE_MEM_SLOTS 4 /* 
XXX assumes x86 */ +#define TSS_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 0) +#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 1) +#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 2) -#define DR6_BD (1 << 13) -#define DR6_BS (1 << 14) -#define DR6_FIXED_1 0xffff0ff0 -#define DR6_VOLATILE 0x0000e00f +#include "kvm_x86host.h" -#define DR7_BP_EN_MASK 0x000000ff -#define DR7_GE (1 << 9) -#define DR7_GD (1 << 13) -#define DR7_FIXED_1 0x00000400 -#define DR7_VOLATILE 0xffff23ff - -#ifdef XXX /* - * We don't want allocation failures within the mmu code, so we preallocate - * enough memory for a single page fault in a cache. + * vcpu->requests bit members */ -struct kvm_mmu_memory_cache { - int nobjs; - void *objects[KVM_NR_MEM_OBJS]; -}; - -#define NR_PTE_CHAIN_ENTRIES 5 +#define KVM_REQ_TLB_FLUSH 0 +#define KVM_REQ_MIGRATE_TIMER 1 +#define KVM_REQ_REPORT_TPR_ACCESS 2 +#define KVM_REQ_MMU_RELOAD 3 +#define KVM_REQ_TRIPLE_FAULT 4 +#define KVM_REQ_PENDING_TIMER 5 +#define KVM_REQ_UNHALT 6 +#define KVM_REQ_MMU_SYNC 7 +#define KVM_REQ_KVMCLOCK_UPDATE 8 +#define KVM_REQ_KICK 9 +#define KVM_REQ_DEACTIVATE_FPU 10 + +#define KVM_USERSPACE_IRQ_SOURCE_ID 0 -struct kvm_pte_chain { - u64 *parent_ptes[NR_PTE_CHAIN_ENTRIES]; - struct hlist_node link; -}; +struct kvm; +struct kvm_vcpu; +extern struct kmem_cache *kvm_vcpu_cache; /* - * kvm_mmu_page_role, below, is defined as: - * - * bits 0:3 - total guest paging levels (2-4, or zero for real mode) - * bits 4:7 - page table level for this shadow (1-4) - * bits 8:9 - page table quadrant for 2-level guests - * bit 16 - direct mapping of virtual to physical mapping at gfn - * used for real mode and two-dimensional paging - * bits 17:19 - common access permissions for all ptes in this shadow page + * It would be nice to use something smarter than a linear search, TBD... + * Thankfully we dont expect many devices to register (famous last words :), + * so until then it will suffice. At least its abstracted so we can change + * in one place. */ -union kvm_mmu_page_role { - unsigned word; - struct { - unsigned glevels:4; - unsigned level:4; - unsigned quadrant:2; - unsigned pad_for_nice_hex_output:6; - unsigned direct:1; - unsigned access:3; - unsigned invalid:1; - unsigned cr4_pge:1; - unsigned nxe:1; - }; -}; - -struct kvm_mmu_page { - struct list_head link; - struct hlist_node hash_link; - - struct list_head oos_link; - - /* - * The following two entries are used to key the shadow page in the - * hash table. - */ - gfn_t gfn; - union kvm_mmu_page_role role; - - u64 *spt; - /* hold the gfn of each spte inside spt */ - gfn_t *gfns; - /* - * One bit set per slot which has memory - * in this shadow page. - */ - DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); - int multimapped; /* More than one parent_pte? 
*/ - int root_count; /* Currently serving as active root */ - bool unsync; - unsigned int unsync_children; - union { - u64 *parent_pte; /* !multimapped */ - struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */ - }; - DECLARE_BITMAP(unsync_child_bitmap, 512); +struct kvm_io_bus { + int dev_count; +#define NR_IOBUS_DEVS 200 + struct kvm_io_device *devs[NR_IOBUS_DEVS]; }; -struct kvm_pv_mmu_op_buffer { - void *ptr; - unsigned len; - unsigned processed; - char buf[512] __aligned(sizeof(long)); +enum kvm_bus { + KVM_MMIO_BUS, + KVM_PIO_BUS, + KVM_NR_BUSES }; -struct kvm_pio_request { - unsigned long count; - int cur_count; - gva_t guest_gva; - int in; - int port; - int size; - int string; - int down; - int rep; -}; - -/* - * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level - * 32-bit). The kvm_mmu structure abstracts the details of the current mmu - * mode. - */ -struct kvm_mmu { - void (*new_cr3)(struct kvm_vcpu *vcpu); - int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); - void (*free)(struct kvm_vcpu *vcpu); - gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, - u32 *error); - void (*prefetch_page)(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *page); - int (*sync_page)(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp); - void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); - hpa_t root_hpa; - int root_level; - int shadow_root_level; - union kvm_mmu_page_role base_role; - - u64 *pae_root; - u64 rsvd_bits_mask[2][4]; -}; - -struct kvm_vcpu_arch { - u64 host_tsc; - /* - * rip and regs accesses must go through - * kvm_{register,rip}_{read,write} functions. - */ - unsigned long regs[NR_VCPU_REGS]; - u32 regs_avail; - u32 regs_dirty; - - unsigned long cr0; - unsigned long cr0_guest_owned_bits; - unsigned long cr2; - unsigned long cr3; - unsigned long cr4; - unsigned long cr4_guest_owned_bits; - unsigned long cr8; - u32 hflags; - u64 pdptrs[4]; /* pae */ - u64 efer; - u64 apic_base; - struct kvm_lapic *apic; /* kernel irqchip context */ - int32_t apic_arb_prio; - int mp_state; - int sipi_vector; - u64 ia32_misc_enable_msr; - bool tpr_access_reporting; - - struct kvm_mmu mmu; - /* only needed in kvm_pv_mmu_op() path, but it's hot so - * put it here to avoid allocation */ - struct kvm_pv_mmu_op_buffer mmu_op_buffer; - - struct kvm_mmu_memory_cache mmu_pte_chain_cache; - struct kvm_mmu_memory_cache mmu_rmap_desc_cache; - struct kvm_mmu_memory_cache mmu_page_cache; - struct kvm_mmu_memory_cache mmu_page_header_cache; - - gfn_t last_pt_write_gfn; - int last_pt_write_count; - u64 *last_pte_updated; - gfn_t last_pte_gfn; +int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, + int len, const void *val); +int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, + void *val); +int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, + struct kvm_io_device *dev); +int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, + struct kvm_io_device *dev); +#ifdef XXX +struct kvm_vcpu { + struct kvm *kvm; +#ifdef CONFIG_PREEMPT_NOTIFIERS + struct preempt_notifier preempt_notifier; +#endif + int vcpu_id; + struct mutex mutex; + int cpu; + struct kvm_run *run; + unsigned long requests; + unsigned long guest_debug; + int srcu_idx; + + int fpu_active; + int guest_fpu_loaded; + wait_queue_head_t wq; + int sigset_active; + sigset_t sigset; + struct kvm_vcpu_stat stat; + +#ifdef CONFIG_HAS_IOMEM + int mmio_needed; + int mmio_read_completed; + int mmio_is_write; + int mmio_size; + unsigned char 
mmio_data[8]; + gpa_t mmio_phys_addr; +#endif - struct { - gfn_t gfn; /* presumed gfn during guest pte update */ - pfn_t pfn; /* pfn corresponding to that gfn */ - unsigned long mmu_seq; - } update_pte; - - struct i387_fxsave_struct host_fx_image; - struct i387_fxsave_struct guest_fx_image; - - gva_t mmio_fault_cr2; - struct kvm_pio_request pio; - void *pio_data; - - u8 event_exit_inst_len; - - struct kvm_queued_exception { - bool pending; - bool has_error_code; - u8 nr; - u32 error_code; - } exception; - - struct kvm_queued_interrupt { - bool pending; - bool soft; - u8 nr; - } interrupt; - - int halt_request; /* real mode on Intel only */ - - int cpuid_nent; - struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES]; - /* emulate context */ - - struct x86_emulate_ctxt emulate_ctxt; - - gpa_t time; - struct pvclock_vcpu_time_info hv_clock; - unsigned int hv_clock_tsc_khz; - unsigned int time_offset; - struct page *time_page; - - bool nmi_pending; - bool nmi_injected; - - struct mtrr_state_type mtrr_state; - u32 pat; - - int switch_db_regs; - unsigned long db[KVM_NR_DB_REGS]; - unsigned long dr6; - unsigned long dr7; - unsigned long eff_db[KVM_NR_DB_REGS]; - - u64 mcg_cap; - u64 mcg_status; - u64 mcg_ctl; - u64 *mce_banks; - - /* used for guest single stepping over the given code position */ - u16 singlestep_cs; - unsigned long singlestep_rip; - /* fields used by HYPER-V emulation */ - u64 hv_vapic; + struct kvm_vcpu_arch arch; }; -struct kvm_mem_alias { +struct kvm_memory_slot { gfn_t base_gfn; unsigned long npages; - gfn_t target_gfn; -#define KVM_ALIAS_INVALID 1UL unsigned long flags; + unsigned long *rmap; + unsigned long *dirty_bitmap; + struct { + unsigned long rmap_pde; + int write_count; + } *lpage_info[KVM_NR_PAGE_SIZES - 1]; + unsigned long userspace_addr; + int user_alloc; }; -#define KVM_ARCH_HAS_UNALIAS_INSTANTIATION +static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memslot) +{ + return ALIGN(memslot->npages, BITS_PER_LONG) / 8; +} -struct kvm_mem_aliases { - struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; - int naliases; +struct kvm_kernel_irq_routing_entry { + uint32_t gsi; + uint32_t type; + int (*set)(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level); + union { + struct { + unsigned irqchip; + unsigned pin; + } irqchip; + struct msi_msg msi; + }; + struct hlist_node link; }; -struct kvm_arch { - struct kvm_mem_aliases *aliases; +#ifdef __KVM_HAVE_IOAPIC - unsigned int n_free_mmu_pages; - unsigned int n_requested_mmu_pages; - unsigned int n_alloc_mmu_pages; - struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; +struct kvm_irq_routing_table { + int chip[KVM_NR_IRQCHIPS][KVM_IOAPIC_NUM_PINS]; + struct kvm_kernel_irq_routing_entry *rt_entries; + uint32_t nr_rt_entries; /* - * Hash table of struct kvm_mmu_page. + * Array indexed by gsi. Each entry contains list of irq chips + * the gsi is connected to. 
*/ - struct list_head active_mmu_pages; - struct list_head assigned_dev_head; - struct iommu_domain *iommu_domain; - int iommu_flags; - struct kvm_pic *vpic; - struct kvm_ioapic *vioapic; - struct kvm_pit *vpit; - int vapics_in_nmi_mode; - - unsigned int tss_addr; - struct page *apic_access_page; - - gpa_t wall_clock; + struct hlist_head map[0]; +}; - struct page *ept_identity_pagetable; - bool ept_identity_pagetable_done; - gpa_t ept_identity_map_addr; +#else - unsigned long irq_sources_bitmap; - u64 vm_init_tsc; - s64 kvmclock_offset; +struct kvm_irq_routing_table {}; - struct kvm_xen_hvm_config xen_hvm_config; +#endif - /* fields used by HYPER-V emulation */ - u64 hv_guest_os_id; - u64 hv_hypercall; -}; +static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) +{ + smp_rmb(); + return kvm->vcpus[i]; +} -struct kvm_vm_stat { - u32 mmu_shadow_zapped; - u32 mmu_pte_write; - u32 mmu_pte_updated; - u32 mmu_pde_zapped; - u32 mmu_flooded; - u32 mmu_recycled; - u32 mmu_cache_miss; - u32 mmu_unsync; - u32 remote_tlb_flush; - u32 lpages; +#define kvm_for_each_vcpu(idx, vcpup, kvm) \ + for (idx = 0, vcpup = kvm_get_vcpu(kvm, idx); \ + idx < atomic_read(&kvm->online_vcpus) && vcpup; \ + vcpup = kvm_get_vcpu(kvm, ++idx)) + +int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id); +void kvm_vcpu_uninit(struct kvm_vcpu *vcpu); + +void vcpu_load(struct kvm_vcpu *vcpu); +void vcpu_put(struct kvm_vcpu *vcpu); + +int kvm_init(void *opaque, unsigned int vcpu_size, + struct module *module); +void kvm_exit(void); + +void kvm_get_kvm(struct kvm *kvm); +void kvm_put_kvm(struct kvm *kvm); + +#define HPA_MSB ((sizeof(hpa_t) * 8) - 1) +#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB) +static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; } +struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva); + +extern struct page *bad_page; +extern pfn_t bad_pfn; + +int is_error_page(struct page *page); +int is_error_pfn(pfn_t pfn); +int kvm_is_error_hva(unsigned long addr); +int kvm_set_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + int user_alloc); +int __kvm_set_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + int user_alloc); +int kvm_arch_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, + struct kvm_memory_slot old, + struct kvm_userspace_memory_region *mem, + int user_alloc); +void kvm_arch_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot old, + int user_alloc); +void kvm_disable_largepages(void); +void kvm_arch_flush_shadow(struct kvm *kvm); +gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn); +gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn); + +struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); +unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn); +void kvm_release_page_clean(struct page *page); +void kvm_release_page_dirty(struct page *page); +void kvm_set_page_dirty(struct page *page); +void kvm_set_page_accessed(struct page *page); + +pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); +pfn_t gfn_to_pfn_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn); +int memslot_id(struct kvm *kvm, gfn_t gfn); +void kvm_release_pfn_dirty(pfn_t); +void kvm_release_pfn_clean(pfn_t pfn); +void kvm_set_pfn_dirty(pfn_t pfn); +void kvm_set_pfn_accessed(pfn_t pfn); +void kvm_get_pfn(pfn_t pfn); + +int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, + int len); +int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, 
void *data, + unsigned long len); +int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); +int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, + int offset, int len); +int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, + unsigned long len); +int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); +int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); +struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); +int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn); +unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn); +void mark_page_dirty(struct kvm *kvm, gfn_t gfn); + +void kvm_vcpu_block(struct kvm_vcpu *vcpu); +void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu); +void kvm_resched(struct kvm_vcpu *vcpu); +void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); +void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); +void kvm_flush_remote_tlbs(struct kvm *kvm); +void kvm_reload_remote_mmus(struct kvm *kvm); + +long kvm_arch_dev_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg); +long kvm_arch_vcpu_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg); + +int kvm_dev_ioctl_check_extension(long ext); + +int kvm_get_dirty_log(struct kvm *kvm, + struct kvm_dirty_log *log, int *is_dirty); +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, + struct kvm_dirty_log *log); + +int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, + struct + kvm_userspace_memory_region *mem, + int user_alloc); +long kvm_arch_vm_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg); + +int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); +int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); + +int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, + struct kvm_translation *tr); + +int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs); +int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs); +int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs); +int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs); +int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state); +int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state); +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg); +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run); + +int kvm_arch_init(void *opaque); +void kvm_arch_exit(void); + +int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu); +void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu); + +void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu); +void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); +void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); +struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id); +int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu); +void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu); + +int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu); +int kvm_arch_hardware_enable(void *garbage); +void kvm_arch_hardware_disable(void *garbage); +int kvm_arch_hardware_setup(void); +void kvm_arch_hardware_unsetup(void); +void kvm_arch_check_processor_compat(void *rtn); +int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); + +void kvm_free_physmem(struct kvm *kvm); + +struct kvm *kvm_arch_create_vm(void); +void kvm_arch_destroy_vm(struct kvm *kvm); +void kvm_free_all_assigned_devices(struct kvm *kvm); +void 
kvm_arch_sync_events(struct kvm *kvm); + +int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); +void kvm_vcpu_kick(struct kvm_vcpu *vcpu); + +int kvm_is_mmio_pfn(pfn_t pfn); + +struct kvm_irq_ack_notifier { + struct hlist_node link; + unsigned gsi; + void (*irq_acked)(struct kvm_irq_ack_notifier *kian); }; -struct kvm_vcpu_stat { - u32 pf_fixed; - u32 pf_guest; - u32 tlb_flush; - u32 invlpg; - - u32 exits; - u32 io_exits; - u32 mmio_exits; - u32 signal_exits; - u32 irq_window_exits; - u32 nmi_window_exits; - u32 halt_exits; - u32 halt_wakeup; - u32 request_irq_exits; - u32 irq_exits; - u32 host_state_reload; - u32 efer_reload; - u32 fpu_reload; - u32 insn_emulation; - u32 insn_emulation_fail; - u32 hypercalls; - u32 irq_injections; - u32 nmi_injections; +#define KVM_ASSIGNED_MSIX_PENDING 0x1 +struct kvm_guest_msix_entry { + u32 vector; + u16 entry; + u16 flags; }; - -struct kvm_x86_ops { - int (*cpu_has_kvm_support)(void); /* __init */ - int (*disabled_by_bios)(void); /* __init */ - int (*hardware_enable)(void *dummy); - void (*hardware_disable)(void *dummy); - void (*check_processor_compatibility)(void *rtn); - int (*hardware_setup)(void); /* __init */ - void (*hardware_unsetup)(void); /* __exit */ - bool (*cpu_has_accelerated_tpr)(void); - void (*cpuid_update)(struct kvm_vcpu *vcpu); - - /* Create, but do not attach this VCPU */ - struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); - void (*vcpu_free)(struct kvm_vcpu *vcpu); - int (*vcpu_reset)(struct kvm_vcpu *vcpu); - - void (*prepare_guest_switch)(struct kvm_vcpu *vcpu); - void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); - void (*vcpu_put)(struct kvm_vcpu *vcpu); - - void (*set_guest_debug)(struct kvm_vcpu *vcpu, - struct kvm_guest_debug *dbg); - int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); - int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); - u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); - void (*get_segment)(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg); - int (*get_cpl)(struct kvm_vcpu *vcpu); - void (*set_segment)(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg); - void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); - void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu); - void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); - void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); - void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); - void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); - void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); - void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); - void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); - void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); - void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); - int (*get_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long *dest); - int (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value); - void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); - unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); - void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); - void (*fpu_activate)(struct kvm_vcpu *vcpu); - void (*fpu_deactivate)(struct kvm_vcpu *vcpu); - - void (*tlb_flush)(struct kvm_vcpu *vcpu); - - void (*run)(struct kvm_vcpu *vcpu); - int (*handle_exit)(struct kvm_vcpu *vcpu); - void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); - void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); - u32 (*get_interrupt_shadow)(struct 
kvm_vcpu *vcpu, int mask); - void (*patch_hypercall)(struct kvm_vcpu *vcpu, - unsigned char *hypercall_addr); - void (*set_irq)(struct kvm_vcpu *vcpu); - void (*set_nmi)(struct kvm_vcpu *vcpu); - void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr, - bool has_error_code, u32 error_code); - int (*interrupt_allowed)(struct kvm_vcpu *vcpu); - int (*nmi_allowed)(struct kvm_vcpu *vcpu); - bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); - void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked); - void (*enable_nmi_window)(struct kvm_vcpu *vcpu); - void (*enable_irq_window)(struct kvm_vcpu *vcpu); - void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); - int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); - int (*get_tdp_level)(void); - u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); - int (*get_lpage_level)(void); - bool (*rdtscp_supported)(void); - - const struct trace_print_flags *exit_reasons_str; +struct kvm_assigned_dev_kernel { + struct kvm_irq_ack_notifier ack_notifier; + struct work_struct interrupt_work; + struct list_head list; + int assigned_dev_id; + int host_segnr; + int host_busnr; + int host_devfn; + unsigned int entries_nr; + int host_irq; + bool host_irq_disabled; + struct msix_entry *host_msix_entries; + int guest_irq; + struct kvm_guest_msix_entry *guest_msix_entries; + unsigned long irq_requested_type; + int irq_source_id; + int flags; + struct pci_dev *dev; + struct kvm *kvm; + spinlock_t assigned_dev_lock; }; -extern struct kvm_x86_ops *kvm_x86_ops; - -int kvm_mmu_module_init(void); -void kvm_mmu_module_exit(void); - -void kvm_mmu_destroy(struct kvm_vcpu *vcpu); -int kvm_mmu_create(struct kvm_vcpu *vcpu); -int kvm_mmu_setup(struct kvm_vcpu *vcpu); -void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); -void kvm_mmu_set_base_ptes(u64 base_pte); -void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask); - -int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); -void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); -void kvm_mmu_zap_all(struct kvm *kvm); -unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); -void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); - -int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); - -int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, - const void *val, int bytes); -int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, - gpa_t addr, unsigned long *ret); -u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn); - -extern bool tdp_enabled; - -enum emulation_result { - EMULATE_DONE, /* no further processing */ - EMULATE_DO_MMIO, /* kvm_run filled with mmio request */ - EMULATE_FAIL, /* can't emulate this instruction */ +struct kvm_irq_mask_notifier { + void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked); + int irq; + struct hlist_node link; }; -#define EMULTYPE_NO_DECODE (1 << 0) -#define EMULTYPE_TRAP_UD (1 << 1) -#define EMULTYPE_SKIP (1 << 2) -int emulate_instruction(struct kvm_vcpu *vcpu, - unsigned long cr2, u16 error_code, int emulation_type); -void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context); -void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); -void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); -void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, - unsigned long *rflags); - -unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr); -void realmode_set_cr(struct kvm_vcpu 
*vcpu, int cr, unsigned long value, - unsigned long *rflags); -void kvm_enable_efer_bits(u64); -int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data); -int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); - -struct x86_emulate_ctxt; - -int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, - int size, unsigned port); -int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in, - int size, unsigned long count, int down, - gva_t address, int rep, unsigned port); -void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); -int kvm_emulate_halt(struct kvm_vcpu *vcpu); -int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); -int emulate_clts(struct kvm_vcpu *vcpu); -int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, - unsigned long *dest); -int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, - unsigned long value); - -void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); -int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); - -int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason); - -void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); -void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); -void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); -void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); -unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); -void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); -void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); - -int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); -int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); - -unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu); -void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); - -#ifdef XXX -void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); -void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); -#endif /*XXX*/ -void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, - u32 error_code); -bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); - -int kvm_pic_set_irq(void *opaque, int irq, int level); - -void kvm_inject_nmi(struct kvm_vcpu *vcpu); - -void fx_init(struct kvm_vcpu *vcpu); - -int emulator_write_emulated(unsigned long addr, - const void *val, - unsigned int bytes, - struct kvm_vcpu *vcpu); - - -void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); -void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *new, int bytes, - bool guest_initiated); -int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); -void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); -int kvm_mmu_load(struct kvm_vcpu *vcpu); -void kvm_mmu_unload(struct kvm_vcpu *vcpu); -void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); -gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); -gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); -gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); -gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); - -int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); - -int kvm_fix_hypercall(struct kvm_vcpu *vcpu); - -int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code); -void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); - -void kvm_enable_tdp(void); -void kvm_disable_tdp(void); - -int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); -int complete_pio(struct kvm_vcpu *vcpu); -bool kvm_check_iopl(struct kvm_vcpu *vcpu); - -struct 
kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn); +void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, + struct kvm_irq_mask_notifier *kimn); +void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, + struct kvm_irq_mask_notifier *kimn); +void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask); -static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) +#ifdef __KVM_HAVE_IOAPIC +void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic, + union kvm_ioapic_redirect_entry *entry, + unsigned long *deliver_bitmask); +#endif +int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level); +void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); +void kvm_register_irq_ack_notifier(struct kvm *kvm, + struct kvm_irq_ack_notifier *kian); +void kvm_unregister_irq_ack_notifier(struct kvm *kvm, + struct kvm_irq_ack_notifier *kian); +int kvm_request_irq_source_id(struct kvm *kvm); +void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); + +/* For vcpu->arch.iommu_flags */ +#define KVM_IOMMU_CACHE_COHERENCY 0x1 + +#ifdef CONFIG_IOMMU_API +int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot); +int kvm_iommu_map_guest(struct kvm *kvm); +int kvm_iommu_unmap_guest(struct kvm *kvm); +int kvm_assign_device(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev); +int kvm_deassign_device(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev); +#else /* CONFIG_IOMMU_API */ +static inline int kvm_iommu_map_pages(struct kvm *kvm, + gfn_t base_gfn, + unsigned long npages) { - struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); - - return (struct kvm_mmu_page *)page_private(page); + return 0; } -#endif /*XXX*/ - -static inline unsigned short kvm_read_fs(void) +static inline int kvm_iommu_map_guest(struct kvm *kvm) { - unsigned short seg; - asm("mov %%fs, %0" : "=g"(seg)); - return seg; + return -ENODEV; } -static inline unsigned short kvm_read_gs(void) +static inline int kvm_iommu_unmap_guest(struct kvm *kvm) { - unsigned short seg; - asm("mov %%gs, %0" : "=g"(seg)); - return seg; + return 0; } -static inline unsigned short kvm_read_ldt(void) +static inline int kvm_assign_device(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev) { - unsigned short ldt; - asm("sldt %0" : "=g"(ldt)); - return ldt; + return 0; } -static inline void kvm_load_fs(unsigned short sel) +static inline int kvm_deassign_device(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev) { - asm("mov %0, %%fs" : : "rm"(sel)); + return 0; } +#endif /* CONFIG_IOMMU_API */ -static inline void kvm_load_gs(unsigned short sel) +static inline void kvm_guest_enter(void) { - asm("mov %0, %%gs" : : "rm"(sel)); + account_system_vtime(current); + current->flags |= PF_VCPU; } -static inline void kvm_load_ldt(unsigned short sel) +static inline void kvm_guest_exit(void) { - asm("lldt %0" : : "rm"(sel)); + account_system_vtime(current); + current->flags &= ~PF_VCPU; } -struct descriptor_table { - unsigned short limit; - unsigned long base; -} __attribute__((packed)); - -static inline void kvm_get_idt(struct descriptor_table *table) +static inline gpa_t gfn_to_gpa(gfn_t gfn) { - asm("sidt %0" : "=m"(*table)); + return (gpa_t)gfn << PAGE_SHIFT; } -static inline void kvm_get_gdt(struct descriptor_table *table) +static inline hpa_t pfn_to_hpa(pfn_t pfn) { - asm("sgdt %0" : "=m"(*table)); + return (hpa_t)pfn << PAGE_SHIFT; } -/* - * FIXME: Accessing the desc_struct through its fields is more elegant, - * 
and should be the one valid thing to do. However, a lot of open code - * still touches the a and b accessors, and doing this allow us to do it - * incrementally. We keep the signature as a struct, rather than an union, - * so we can get rid of it transparently in the future -- glommer - */ -/* 8 byte segment descriptor */ -struct desc_struct { - union { - struct { - unsigned int a; - unsigned int b; - }a; - struct { - unsigned short limit0; - unsigned short base0; - unsigned base1: 8, type: 4, s: 1, dpl: 2, p: 1; - unsigned limit: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8; - }b; - }c; -} __attribute__((packed)); - -static inline unsigned long get_desc_base(const struct desc_struct *desc) +static inline void kvm_migrate_timers(struct kvm_vcpu *vcpu) { - return (unsigned)(desc->c.b.base0 | ((desc->c.b.base1) << 16) | ((desc->c.b.base2) << 24)); + set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests); } -extern unsigned long segment_base(uint16_t selector); +enum kvm_stat_kind { + KVM_STAT_VM, + KVM_STAT_VCPU, +}; + +struct kvm_stats_debugfs_item { + const char *name; + int offset; + enum kvm_stat_kind kind; + struct dentry *dentry; +}; +extern struct kvm_stats_debugfs_item debugfs_entries[]; +extern struct dentry *kvm_debugfs_dir; -static inline unsigned long kvm_read_tr_base(void) +#ifdef KVM_ARCH_WANT_MMU_NOTIFIER +static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_seq) { - unsigned short tr; - asm("str %0" : "=g"(tr)); - return segment_base(tr); + if (unlikely(vcpu->kvm->mmu_notifier_count)) + return 1; + /* + * Both reads happen under the mmu_lock and both values are + * modified under mmu_lock, so there's no need of smb_rmb() + * here in between, otherwise mmu_notifier_count should be + * read before mmu_notifier_seq, see + * mmu_notifier_invalidate_range_end write side. 
+ */ + if (vcpu->kvm->mmu_notifier_seq != mmu_seq) + return 1; + return 0; } +#endif -#ifdef CONFIG_X86_64 -static inline unsigned long read_msr(unsigned long msr) -{ - uint64_t value; +#ifndef KVM_ARCH_HAS_UNALIAS_INSTANTIATION +#define unalias_gfn_instantiation unalias_gfn +#endif + +#ifdef CONFIG_HAVE_KVM_IRQCHIP + +#define KVM_MAX_IRQ_ROUTES 1024 + +int kvm_setup_default_irq_routing(struct kvm *kvm); +int kvm_set_irq_routing(struct kvm *kvm, + const struct kvm_irq_routing_entry *entries, + unsigned nr, + unsigned flags); +void kvm_free_irq_routing(struct kvm *kvm); + +#else + +static inline void kvm_free_irq_routing(struct kvm *kvm) {} - rdmsrl(msr, value); - return value; -} #endif -#ifdef XXX -static inline void kvm_fx_save(struct i387_fxsave_struct *image) -{ - asm("fxsave (%0)":: "r" (image)); -} +#undef CONFIG_HAVE_KVM_EVENTFD -static inline void kvm_fx_restore(struct i387_fxsave_struct *image) -{ - asm("fxrstor (%0)":: "r" (image)); -} +#ifdef CONFIG_HAVE_KVM_EVENTFD + +void kvm_eventfd_init(struct kvm *kvm); +int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags); +void kvm_irqfd_release(struct kvm *kvm); +int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args); -static inline void kvm_fx_finit(void) +#else + +static inline void kvm_eventfd_init(struct kvm *kvm) {} +static inline int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags) { - asm("finit"); + return -EINVAL; } -#endif /*XXX*/ -static inline uint32_t get_rdx_init_val(void) + +static inline void kvm_irqfd_release(struct kvm *kvm) {} +static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) { - return 0x600; /* P6 family */ + return -ENOSYS; } -static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, uint32_t error_code) +#endif /* CONFIG_HAVE_KVM_EVENTFD */ + +#ifdef CONFIG_KVM_APIC_ARCHITECTURE +static inline bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) { -#ifdef XXX - kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); -#endif /*XXX*/ + return vcpu->kvm->bsp_vcpu_id == vcpu->vcpu_id; } +#endif -#define TSS_IOPB_BASE_OFFSET 0x66 -#define TSS_BASE_SIZE 0x68 -#define TSS_IOPB_SIZE (65536 / 8) -#define TSS_REDIRECTION_SIZE (256 / 8) -#define RMODE_TSS_SIZE \ - (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1) - -enum { - TASK_SWITCH_CALL = 0, - TASK_SWITCH_IRET = 1, - TASK_SWITCH_JMP = 2, - TASK_SWITCH_GATE = 3, -}; - -#define HF_GIF_MASK (1 << 0) -#define HF_HIF_MASK (1 << 1) -#define HF_VINTR_MASK (1 << 2) -#define HF_NMI_MASK (1 << 3) -#define HF_IRET_MASK (1 << 4) +#ifdef __KVM_HAVE_DEVICE_ASSIGNMENT -/* - * Hardware virtualization extension instructions may fault if a - * reboot turns off virtualization while processes are running. - * Trap the fault and ignore the instruction if that happens. 
- */ +long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, + unsigned long arg); -#ifdef XXX -#include "linkage.h" +#else -asmlinkage void kvm_handle_fault_on_reboot(void); +static inline long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, + unsigned long arg) +{ + return -ENOTTY; +} #endif - -#define __kvm_handle_fault_on_reboot(insn) \ - "666: " insn "\n\t" \ - ".pushsection .fixup, \"ax\" \n" \ - "667: \n\t" \ - __ASM_SIZE(push) " $666b \n\t" \ - ".popsection \n\t" \ - ".pushsection __ex_table, \"a\" \n\t" \ - _ASM_PTR " 666b, 667b \n\t" \ - ".popsection \n\t" - -#define KVM_ARCH_WANT_MMU_NOTIFIER - -#ifdef XXX -int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); -int kvm_age_hva(struct kvm *kvm, unsigned long hva); -void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); -int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); -int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); -int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); -int kvm_cpu_get_interrupt(struct kvm_vcpu *v); - -void kvm_define_shared_msr(unsigned index, uint32_t msr); -void kvm_set_shared_msr(unsigned index, uint64_t val, uint64_t mask); #endif /*XXX*/ -#endif /* _ASM_X86_KVM_HOST_H */ +#endif + @@ -18,6 +18,7 @@ #include <sys/thread.h> #include <sys/cpuvar.h> #include <vm/hat_i86.h> +#include <sys/segments.h> #include "msr-index.h" #include "msr.h" @@ -25,16 +26,24 @@ #include "processor-flags.h" #include "apicdef.h" #include "kvm_host.h" +#include "kvm_x86host.h" +#include "iodev.h" #define PER_CPU_ATTRIBUTES #define PER_CPU_DEF_ATTRIBUTES #define PER_CPU_BASE_SECTION ".data" #include "percpu-defs.h" +#include "coalesced_mmio.h" #include "kvm.h" +#include "irq.h" extern struct vmcs **vmxarea; static int vcpuid; +extern uint64_t native_read_msr_safe(unsigned int msr, + int *err); +extern int native_write_msr_safe(unsigned int msr, + unsigned low, unsigned high); unsigned long segment_base(uint16_t selector) { @@ -124,7 +133,7 @@ static void kvm_iommu_put_pages(struct kvm *kvm, gfn++; } - iommu_unmap_range(domain, gfn_to_gpa(base_gfn), PAGE_SIZE * npages); + iommu_unmap_range(domain, gfn_to_gpa(base_gfn), PAGESIZE * npages); } static int @@ -219,6 +228,7 @@ vmx_hardware_enable(void *garbage) uint64_t phys_addr = kvtop(per_cpu(vmxarea, cpu)); #else uint64_t phys_addr; + volatile int x; /* XXX - dtrace return probe missing */ pfn = hat_getpfnum(kas.a_hat, (caddr_t)vmxarea[cpu]); phys_addr = ((uint64_t)pfn << PAGESHIFT)|((uint64_t)vmxarea[cpu] & PAGEOFFSET); #endif @@ -249,9 +259,35 @@ vmx_hardware_enable(void *garbage) ept_sync_global(); #endif /*XXX*/ + x = 10; /*XXX*/ return 0; } +extern struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu); +extern void vmcs_writel(unsigned long field, unsigned long value); +extern unsigned long vmcs_readl(unsigned long field); + +unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) +{ + unsigned long rflags, save_rflags; + + rflags = vmcs_readl(GUEST_RFLAGS); + if (to_vmx(vcpu)->rmode.vm86_active) { + rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; + save_rflags = to_vmx(vcpu)->rmode.save_rflags; + rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; + } + return rflags; +} +void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +{ + if (to_vmx(vcpu)->rmode.vm86_active) { + to_vmx(vcpu)->rmode.save_rflags = rflags; + rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; + } + vmcs_writel(GUEST_RFLAGS, rflags); +} + int kvm_arch_hardware_enable(void *garbage) { #ifdef LATER @@ -362,19 +398,14 @@ kvm_dev_ioctl_check_extension(long ext, int *rval_p) return r; } 
-static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) -{ - return kvm->arch.vpic; -} - -static inline int irqchip_in_kernel(struct kvm *kvm) +int irqchip_in_kernel(struct kvm *kvm) { int ret; ret = (pic_irqchip(kvm) != NULL); #ifdef XXX smp_rmb(); -#endif /*XXX*/ +#endif return ret; } @@ -390,12 +421,16 @@ kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) vcpu->arch.mmu.root_hpa = INVALID_PAGE; #ifdef XXX if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) -#endif vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; -#ifdef XXX + else + vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; +#else + if (!irqchip_in_kernel(kvm) /* || kvm_vcpu_is_bsp(vcpu) */) + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; else vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; #endif + page = kmem_zalloc(PAGESIZE, KM_SLEEP); if (!page) { r = ENOMEM; @@ -414,6 +449,7 @@ kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) goto fail_mmu_destroy; } #endif /*XXX*/ + vcpu->arch.mce_banks = kmem_zalloc(KVM_MAX_MCE_BANKS * sizeof(uint64_t) * 4, KM_SLEEP); if (!vcpu->arch.mce_banks) { @@ -487,8 +523,7 @@ fail: * 32-bit hardware). */ -uint64_t -kvm_va2pa(caddr_t va) +uint64_t kvm_va2pa(caddr_t va) { uint64_t pa; @@ -496,12 +531,21 @@ kvm_va2pa(caddr_t va) return (pa); } +#ifdef XXX unsigned long *vmx_io_bitmap_a; unsigned long *vmx_io_bitmap_b; unsigned long *vmx_msr_bitmap_legacy; unsigned long *vmx_msr_bitmap_longmode; +#else +/* make these arrays to try to force into low 4GB memory...*/ +/* also need to be aligned... */ +__attribute__((__aligned__(PAGESIZE)))unsigned long vmx_io_bitmap_a[PAGESIZE/sizeof(unsigned long)]; +__attribute__((__aligned__(PAGESIZE)))unsigned long vmx_io_bitmap_b[PAGESIZE/sizeof(unsigned long)]; +__attribute__((__aligned__(PAGESIZE)))unsigned long vmx_msr_bitmap_legacy[PAGESIZE/sizeof(unsigned long)]; +__attribute__((__aligned__(PAGESIZE)))unsigned long vmx_msr_bitmap_longmode[PAGESIZE/sizeof(unsigned long)]; +#endif /*XXX*/ + -extern void vmcs_writel(unsigned long field, unsigned long value); static void vmcs_write16(unsigned long field, uint16_t value) { vmcs_writel(field, value); @@ -521,169 +565,64 @@ static void vmcs_write64(unsigned long field, uint64_t value) #endif } -extern unsigned long vmcs_readl(unsigned long field); - - -/* - * Sets up the vmcs for emulated real mode. 
- */ -static int vmx_vcpu_setup(struct vcpu_vmx *vmx) -{ - uint32_t host_sysenter_cs, msr_low, msr_high; - uint32_t junk; - uint64_t host_pat, tsc_this, tsc_base; - unsigned long a; - struct descriptor_table dt; - int i; - unsigned long kvm_vmx_return; - uint32_t exec_control; - -#ifdef XXX - /* I/O */ - vmcs_write64(IO_BITMAP_A, kvm_va2pa((caddr_t)vmx_io_bitmap_a)); - vmcs_write64(IO_BITMAP_B, kvm_va2pa((caddr_t)vmx_io_bitmap_b)); - - if (cpu_has_vmx_msr_bitmap()) - vmcs_write64(MSR_BITMAP, kvm_pa2va(vmx_msr_bitmap_legacy)); - - vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ - - /* Control */ - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, - vmcs_config.pin_based_exec_ctrl); - - exec_control = vmcs_config.cpu_based_exec_ctrl; - if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) { - exec_control &= ~CPU_BASED_TPR_SHADOW; -#ifdef CONFIG_X86_64 - exec_control |= CPU_BASED_CR8_STORE_EXITING | - CPU_BASED_CR8_LOAD_EXITING; -#endif - } - if (!enable_ept) - exec_control |= CPU_BASED_CR3_STORE_EXITING | - CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_INVLPG_EXITING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); - - if (cpu_has_secondary_exec_ctrls()) { - exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; - if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) - exec_control &= - ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; - if (vmx->vpid == 0) - exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; - if (!enable_ept) { - exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; - enable_unrestricted_guest = 0; - } - if (!enable_unrestricted_guest) - exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; - if (!ple_gap) - exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); - } - - if (ple_gap) { - vmcs_write32(PLE_GAP, ple_gap); - vmcs_write32(PLE_WINDOW, ple_window); - } - - vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf); - vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); - vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ - - vmcs_writel(HOST_CR0, getcr0()); /* 22.2.3 */ - vmcs_writel(HOST_CR4, getcr4()); /* 22.2.3, 22.2.5 */ - vmcs_writel(HOST_CR3, getcr3()); /* 22.2.3 FIXME: shadow tables */ - - vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ - vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ - vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ - vmcs_write16(HOST_FS_SELECTOR, kvm_read_fs()); /* 22.2.4 */ - vmcs_write16(HOST_GS_SELECTOR, kvm_read_gs()); /* 22.2.4 */ - vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ -#ifdef CONFIG_X86_64 - rdmsrl(MSR_FS_BASE, a); - vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ - rdmsrl(MSR_GS_BASE, a); - vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */ -#else - vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ - vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ -#endif +extern int enable_ept; +extern int enable_unrestricted_guest; +extern int emulate_invalid_guest_state; - vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ +static int bypass_guest_pf = 1; - kvm_get_idt(&dt); - vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ +extern void vmcs_clear(struct vmcs *vmcs); +extern void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu); +extern void vmx_vcpu_put(struct kvm_vcpu *vcpu); - asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); - vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */ - vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); +extern int vmx_vcpu_setup(struct vcpu_vmx *vmx); +extern int 
enable_vpid; - rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk); - vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs); - rdmsrl(MSR_IA32_SYSENTER_ESP, a); - vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */ - rdmsrl(MSR_IA32_SYSENTER_EIP, a); - vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */ +extern ulong_t *vmx_vpid_bitmap; +extern kmutex_t vmx_vpid_lock; - if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { - rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); - host_pat = msr_low | ((uint64_t) msr_high << 32); - vmcs_write64(HOST_IA32_PAT, host_pat); - } - if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { - rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); - host_pat = msr_low | ((uint64_t) msr_high << 32); - /* Write the default value follow host pat */ - vmcs_write64(GUEST_IA32_PAT, host_pat); - /* Keep arch.pat sync with GUEST_IA32_PAT */ - vmx->vcpu.arch.pat = host_pat; - } +static void allocate_vpid(struct vcpu_vmx *vmx) +{ + int vpid; - for (i = 0; i < NR_VMX_MSR; ++i) { - uint32_t index = vmx_msr_index[i]; - uint32_t data_low, data_high; - int j = vmx->nmsrs; - - if (rdmsr_safe(index, &data_low, &data_high) < 0) - continue; - if (wrmsr_safe(index, data_low, data_high) < 0) - continue; - vmx->guest_msrs[j].index = i; - vmx->guest_msrs[j].data = 0; - vmx->guest_msrs[j].mask = -1ull; - ++vmx->nmsrs; + vmx->vpid = 0; + if (!enable_vpid) + return; + mutex_enter(&vmx_vpid_lock); + vpid = bt_availbit(vmx_vpid_bitmap, VMX_NR_VPIDS); + if (vpid < VMX_NR_VPIDS) { + vmx->vpid = vpid; + BT_SET(vmx_vpid_bitmap, vpid); } + mutex_exit(&vmx_vpid_lock); +} - vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); - - /* 22.2.1, 20.8.1 */ - vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); - - vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); - vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; - if (enable_ept) - vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; - vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); +#ifdef XXX +static int alloc_identity_pagetable(struct kvm *kvm) +{ + struct kvm_userspace_memory_region kvm_userspace_mem; + int r = 0; - tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; - rdtscll(tsc_this); - if (tsc_this < vmx->vcpu.kvm->arch.vm_init_tsc) - tsc_base = tsc_this; + mutex_enter(&kvm->slots_lock); + if (kvm->arch.ept_identity_pagetable) + goto out; + kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; + kvm_userspace_mem.flags = 0; + kvm_userspace_mem.guest_phys_addr = + kvm->arch.ept_identity_map_addr; + kvm_userspace_mem.memory_size = PAGESIZE; + r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); + if (r) + goto out; - guest_write_tsc(0, tsc_base); -#endif /*XXX*/ - return 0; + kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, + kvm->arch.ept_identity_map_addr >> PAGESHIFT); +out: + mutex_exit(&kvm->slots_lock); + return r; } -extern void vmcs_clear(struct vmcs *vmcs); -extern void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu); -extern void vmx_vcpu_put(struct kvm_vcpu *vcpu); +#endif /*XXX*/ struct kvm_vcpu * vmx_create_vcpu(struct kvm *kvm, struct kvm_vcpu_ioc *arg, unsigned int id) @@ -694,9 +633,8 @@ vmx_create_vcpu(struct kvm *kvm, struct kvm_vcpu_ioc *arg, unsigned int id) if (!vmx) return NULL; -#ifdef NOTNOW + allocate_vpid(vmx); -#endif /*NOTNOW*/ err = kvm_vcpu_init(&vmx->vcpu, kvm, arg, id); if (err) { #ifdef NOTNOW @@ -713,8 +651,11 @@ vmx_create_vcpu(struct kvm *kvm, struct kvm_vcpu_ioc *arg, unsigned int id) } vmx->vmcs = kmem_zalloc(PAGESIZE, KM_SLEEP); - if (!vmx->vmcs) + if (!vmx->vmcs) { + 
kmem_free(vmx, sizeof(struct vcpu_vmx)); + vmx = NULL; return NULL; + } kpreempt_disable(); @@ -734,11 +675,16 @@ vmx_create_vcpu(struct kvm *kvm, struct kvm_vcpu_ioc *arg, unsigned int id) kpreempt_enable(); if (err) vmx->vmcs = NULL; -#ifdef NOTNOW if (vm_need_virtualize_apic_accesses(kvm)) +#ifdef XXX if (alloc_apic_access_page(kvm) != 0) +#endif /*XXX*/ goto free_vmcs; +#ifdef XXX + /* + * XXX For right now, we don't implement ept + */ if (enable_ept) { if (!kvm->arch.ept_identity_map_addr) kvm->arch.ept_identity_map_addr = @@ -746,13 +692,14 @@ vmx_create_vcpu(struct kvm *kvm, struct kvm_vcpu_ioc *arg, unsigned int id) if (alloc_identity_pagetable(kvm) != 0) goto free_vmcs; } +#endif /*XXX*/ -#endif /*NOTNOW*/ return &vmx->vcpu; -#ifdef XXX free_vmcs: - free_vmcs(vmx->vmcs); + kmem_free(vmx->vmcs, PAGESIZE); + vmx->vmcs = 0; +#ifdef XXX free_msrs: kfree(vmx->guest_msrs); uninit_vcpu: @@ -772,10 +719,8 @@ kvm_arch_vcpu_create(struct kvm *kvm, struct kvm_vcpu_ioc *arg, unsigned int id) return vmx_create_vcpu(kvm, arg, id); } -extern struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu); - +extern int enable_ept; -static int enable_ept = 1; static void update_exception_bitmap(struct kvm_vcpu *vcpu) { uint32_t eb; @@ -788,7 +733,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) eb |= 1u << BP_VECTOR; #endif /*XXX*/ - if (to_vmx(vcpu)->rmode.vm86_active) + if (((struct vcpu_vmx *)vcpu)->rmode.vm86_active) eb = ~0; if (enable_ept) eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ @@ -801,12 +746,12 @@ static inline uint32_t apic_get_reg(struct kvm_lapic *apic, int reg_off) { return *((uint32_t *) (apic->regs + reg_off)); } -static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, uint32_t val) +void apic_set_reg(struct kvm_lapic *apic, int reg_off, uint32_t val) { *((uint32_t *) (apic->regs + reg_off)) = val; } -static inline int kvm_apic_id(struct kvm_lapic *apic) +int kvm_apic_id(struct kvm_lapic *apic) { return (apic_get_reg(apic, APIC_ID) >> 24) & 0xff; } @@ -874,7 +819,7 @@ void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) vcpu->arch.cr8 = cr8; } -static int is_paging(struct kvm_vcpu *vcpu) +int is_paging(struct kvm_vcpu *vcpu) { #ifdef XXX return kvm_getcr0_bits(vcpu, X86_CR0_PG); @@ -885,8 +830,8 @@ static int is_paging(struct kvm_vcpu *vcpu) void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ? - KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); + unsigned long hw_cr4 = cr4 | (((struct vcpu_vmx *)vcpu)->rmode.vm86_active ? 
+ KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); vcpu->arch.cr4 = cr4; if (enable_ept) { @@ -904,7 +849,7 @@ void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { - struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vcpu_vmx *vmx = (struct vcpu_vmx *)vcpu; unsigned long hw_cr0; #ifdef XXX if (enable_unrestricted_guest) @@ -913,7 +858,6 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) else #endif /*XXX*/ hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON; - #ifdef XXX if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) enter_pmode(vcpu); @@ -933,10 +877,8 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } #endif -#ifdef XXX if (enable_ept) ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); -#endif /*XXX*/ if (!vcpu->fpu_active) hw_cr0 |= X86_CR0_TS | X86_CR0_MP; @@ -978,33 +920,145 @@ static void seg_setup(int seg) vmcs_write16(sf->selector, 0); vmcs_writel(sf->base, 0); vmcs_write32(sf->limit, 0xffff); -#ifdef XXX + if (enable_unrestricted_guest) { ar = 0x93; if (seg == VCPU_SREG_CS) ar |= 0x08; /* code segment */ } else -#endif /*XXX*/ ar = 0xf3; vmcs_write32(sf->ar_bytes, ar); } +static gva_t rmode_tss_base(struct kvm *kvm) +{ + if (!kvm->arch.tss_addr) { + struct kvm_memslots *slots; + gfn_t base_gfn; + +#ifdef XXX + slots = rcu_dereference(kvm->memslots); +#else + slots = kvm->memslots; +#endif /*XXX*/ + base_gfn = kvm->memslots->memslots[0].base_gfn + + kvm->memslots->memslots[0].npages - 3; + return base_gfn << PAGESHIFT; + } + return kvm->arch.tss_addr; +} + +extern int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, + int offset, int len); + +unsigned long empty_zero_page[PAGESIZE / sizeof(unsigned long)]; + +int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) +{ + return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len); +} + +static int init_rmode_tss(struct kvm *kvm) +{ + gfn_t fn = rmode_tss_base(kvm) >> PAGESHIFT; + uint16_t data = 0; + int ret = 0; + int r; + + r = kvm_clear_guest_page(kvm, fn, 0, PAGESIZE); + if (r < 0) + goto out; + data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; + r = kvm_write_guest_page(kvm, fn++, &data, + TSS_IOPB_BASE_OFFSET, sizeof(uint16_t)); + if (r < 0) + goto out; + r = kvm_clear_guest_page(kvm, fn++, 0, PAGESIZE); + if (r < 0) + goto out; + r = kvm_clear_guest_page(kvm, fn, 0, PAGESIZE); + if (r < 0) + goto out; + data = ~0; + r = kvm_write_guest_page(kvm, fn, &data, + RMODE_TSS_SIZE - 2 * PAGESIZE - 1, + sizeof(uint8_t)); + if (r < 0) + goto out; + + ret = 1; +out: + return ret; +} + +static int init_rmode_identity_map(struct kvm *kvm) +{ + int i, r, ret; + pfn_t identity_map_pfn; + uint32_t tmp; + + if (!enable_ept) + return 1; + if ((!kvm->arch.ept_identity_pagetable)) { + cmn_err(CE_WARN, "EPT: identity-mapping pagetable haven't been allocated!\n"); + return 0; + } + if ((kvm->arch.ept_identity_pagetable_done)) + return 1; + ret = 0; + identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGESHIFT; + r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGESIZE); + if (r < 0) + goto out; +#ifdef XXX + /* Set up identity-mapping pagetable for EPT in real mode */ + for (i = 0; i < PT32_ENT_PER_PAGE; i++) { + tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | + _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); + r = kvm_write_guest_page(kvm, identity_map_pfn, + &tmp, i * sizeof(tmp), sizeof(tmp)); + if (r < 0) + goto out; + } +#endif /*XXX*/ + kvm->arch.ept_identity_pagetable_done = 1; + ret = 1; 
+out: + return ret; +} + +static int init_rmode(struct kvm *kvm) +{ + if (!init_rmode_tss(kvm)) + return 0; + if (!init_rmode_identity_map(kvm)) + return 0; + return 1; +} + +extern void vmx_set_efer(struct kvm_vcpu *vcpu, uint64_t efer); +extern void kvm_register_write(struct kvm_vcpu *vcpu, + enum kvm_reg reg, + unsigned long val); +extern ulong kvm_read_cr0(struct kvm_vcpu *vcpu); +extern void setup_msrs(struct vcpu_vmx *vmx); + int vmx_vcpu_reset(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = (struct vcpu_vmx *)to_vmx(vcpu); + struct vcpu_vmx *vmx = (struct vcpu_vmx *)vcpu; uint64_t msr; int ret, idx; vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); #ifdef XXX idx = srcu_read_lock(&vcpu->kvm->srcu); +#endif /*XXX*/ if (!init_rmode(vmx->vcpu.kvm)) { ret = -ENOMEM; goto out; } -#endif vmx->rmode.vm86_active = 0; vmx->soft_vnmi_blocked = 0; @@ -1027,12 +1081,12 @@ int vmx_vcpu_reset(struct kvm_vcpu *vcpu) * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. */ -#ifdef XXX +#ifdef CONFIG_KVM_APIC_ARCHITECTURE if (kvm_vcpu_is_bsp(&vmx->vcpu)) { vmcs_write16(GUEST_CS_SELECTOR, 0xf000); vmcs_writel(GUEST_CS_BASE, 0x000f0000); } else { -#endif /*XXX*/ +#endif /*CONFIG_KVM_APIC_ARCHITECTURE*/ vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8); vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12); #ifdef XXX @@ -1064,9 +1118,9 @@ int vmx_vcpu_reset(struct kvm_vcpu *vcpu) kvm_rip_write(vcpu, 0xfff0); else kvm_rip_write(vcpu, 0); - - kvm_register_write(vcpu, VCPU_REGS_RSP, 0); #endif /*XXX*/ + kvm_register_write(vcpu, VCPU_REGS_RSP, 0); + vmcs_writel(GUEST_DR7, 0x400); vmcs_writel(GUEST_GDTR_BASE, 0); @@ -1082,9 +1136,7 @@ int vmx_vcpu_reset(struct kvm_vcpu *vcpu) /* Special registers */ vmcs_write64(GUEST_IA32_DEBUGCTL, 0); -#ifdef XXX setup_msrs(vmx); -#endif /*XXX*/ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ @@ -1101,20 +1153,18 @@ int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(vmx->vcpu.kvm->arch.apic_access_page)); #endif /*XXX*/ + if (vmx->vpid != 0) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; -#ifdef XXX vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */ -#endif /*XXX*/ vmx_set_cr4(&vmx->vcpu, 0); -#ifdef XXX vmx_set_efer(&vmx->vcpu, 0); +#ifdef XXX vmx_fpu_activate(&vmx->vcpu); #endif /*XXX*/ update_exception_bitmap(&vmx->vcpu); - #ifdef XXX vpid_sync_vcpu_all(vmx); #endif /*XXX*/ @@ -1148,18 +1198,424 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) extern void vcpu_load(struct kvm_vcpu *vcpu); -static int init_kvm_mmu(struct kvm_vcpu *vcpu) +static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) +{ +} + + +static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, + struct kmem_cache *base_cache, int min) +{ + caddr_t obj; + + if (cache->nobjs >= min) + return 0; + while (cache->nobjs < ARRAY_SIZE(cache->objects)) { + obj = kmem_cache_alloc(base_cache, KM_SLEEP); + if (!obj) + return -ENOMEM; + cache->objects[cache->nobjs++] = obj; + } + return 0; +} + +extern struct kmem_cache *pte_chain_cache; +extern struct kmem_cache *rmap_desc_cache; +extern struct kmem_cache *mmu_page_header_cache; + +/*XXX the following is called for tdp (two dimensional hardware paging */ +/* we dont support this right now */ +int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) +{ + int r = 0; + + r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache, + 
pte_chain_cache, 4); + if (r) + goto out; + r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, + rmap_desc_cache, 4); + if (r) + goto out; + r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); + if (r) + goto out; + r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, + mmu_page_header_cache, 4); +out: + return r; +} + + +static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) +{ + struct kvm_memory_slot *slot; + int host_level, level, max_level; +#ifdef XXX + slot = gfn_to_memslot(vcpu->kvm, large_gfn); + if (slot && slot->dirty_bitmap) + return PT_PAGE_TABLE_LEVEL; + + host_level = host_mapping_level(vcpu->kvm, large_gfn); + + if (host_level == PT_PAGE_TABLE_LEVEL) + return host_level; + + max_level = kvm_x86_ops->get_lpage_level() < host_level ? + kvm_x86_ops->get_lpage_level() : host_level; + + for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) + if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) + break; + + return level - 1; +#else + return 0; +#endif /*XXX*/ +} + +extern struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn); +unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) +{ + struct kvm_memory_slot *slot; + + gfn = unalias_gfn_instantiation(kvm, gfn); + slot = gfn_to_memslot_unaliased(kvm, gfn); + if (!slot || slot->flags & KVM_MEMSLOT_INVALID) + return bad_hva(); + return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGESIZE); +} + +int kvm_is_error_hva(unsigned long addr) +{ + return addr == bad_hva(); +} + +extern caddr_t bad_page; + +pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) +{ + unsigned long addr; + + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) { + get_page(bad_page); + return page_to_pfn(bad_page); + } + + return hva_to_pfn(kvm, addr); +} + +static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, + uint32_t error_code) +{ +#ifdef XXX + pfn_t pfn; + int r; + int level; + gfn_t gfn = gpa >> PAGESHIFT; + unsigned long mmu_seq; + + ASSERT(vcpu); + ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); + + r = mmu_topup_memory_caches(vcpu); + if (r) + return r; + + level = mapping_level(vcpu, gfn); + + gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); + + mmu_seq = vcpu->kvm->mmu_notifier_seq; + smp_rmb(); + + pfn = gfn_to_pfn(vcpu->kvm, gfn); + if (is_error_pfn(pfn)) { + kvm_release_pfn_clean(pfn); + return 1; + } + mutex_enter(&vcpu->kvm->mmu_lock); + if (mmu_notifier_retry(vcpu, mmu_seq)) + goto out_unlock; + kvm_mmu_free_some_pages(vcpu); + r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, + level, gfn, pfn); + mutex_exit(&vcpu->kvm->mmu_lock); + + return r; + +out_unlock: + spin_unlock(&vcpu->kvm->mmu_lock); + kvm_release_pfn_clean(pfn); +#endif /*XXX*/ + return 0; +} + +static void mmu_free_roots(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_mmu_page *sp; + + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return; + mutex_enter(&vcpu->kvm->mmu_lock); + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + hpa_t root = vcpu->arch.mmu.root_hpa; + + sp = page_header(root); + --sp->root_count; + if (!sp->root_count && sp->role.invalid) + kvm_mmu_zap_page(vcpu->kvm, sp); + vcpu->arch.mmu.root_hpa = INVALID_PAGE; + spin_unlock(&vcpu->kvm->mmu_lock); + return; + } + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->arch.mmu.pae_root[i]; + + if (root) { + root &= PT64_BASE_ADDR_MASK; + sp = page_header(root); + --sp->root_count; + if (!sp->root_count && sp->role.invalid) + kvm_mmu_zap_page(vcpu->kvm, sp); + } + vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; + } + 
spin_unlock(&vcpu->kvm->mmu_lock); + vcpu->arch.mmu.root_hpa = INVALID_PAGE; +} + +static void nonpaging_free(struct kvm_vcpu *vcpu) +{ + mmu_free_roots(vcpu); +} + +static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp) +{ + int i; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) + sp->spt[i] = shadow_trap_nonpresent_pte; +} + +static int nonpaging_sync_page(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp) +{ + return 1; +} + +static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) +{ +} + +int get_ept_level(void) +{ + return VMX_EPT_DEFAULT_GAW + 1; +} + +static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, + u32 access, u32 *error) +{ + if (error) + *error = 0; + return vaddr; +} + +static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) +{ + struct kvm_mmu *context = &vcpu->arch.mmu; + int maxphyaddr = cpuid_maxphyaddr(vcpu); + u64 exb_bit_rsvd = 0; + + if (!is_nx(vcpu)) + exb_bit_rsvd = rsvd_bits(63, 63); + switch (level) { + case PT32_ROOT_LEVEL: + /* no rsvd bits for 2 level 4K page table entries */ + context->rsvd_bits_mask[0][1] = 0; + context->rsvd_bits_mask[0][0] = 0; + if (is_cpuid_PSE36()) + /* 36bits PSE 4MB page */ + context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); + else + /* 32 bits PSE 4MB page */ + context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); + context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0]; + break; + case PT32E_ROOT_LEVEL: + context->rsvd_bits_mask[0][2] = + rsvd_bits(maxphyaddr, 63) | + rsvd_bits(7, 8) | rsvd_bits(1, 2); /* PDPTE */ + context->rsvd_bits_mask[0][1] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 62); /* PDE */ + context->rsvd_bits_mask[0][0] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 62); /* PTE */ + context->rsvd_bits_mask[1][1] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 62) | + rsvd_bits(13, 20); /* large page */ + context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0]; + break; + case PT64_ROOT_LEVEL: + context->rsvd_bits_mask[0][3] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8); + context->rsvd_bits_mask[0][2] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8); + context->rsvd_bits_mask[0][1] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51); + context->rsvd_bits_mask[0][0] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51); + context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; + context->rsvd_bits_mask[1][2] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51) | + rsvd_bits(13, 29); + context->rsvd_bits_mask[1][1] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51) | + rsvd_bits(13, 20); /* large page */ + context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0]; + break; + } +} + +static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *context = &vcpu->arch.mmu; + + context->new_cr3 = nonpaging_new_cr3; + context->page_fault = tdp_page_fault; + context->free = nonpaging_free; + context->prefetch_page = nonpaging_prefetch_page; + context->sync_page = nonpaging_sync_page; + context->invlpg = nonpaging_invlpg; + context->shadow_root_level = kvm_x86_ops->get_tdp_level(); + context->root_hpa = INVALID_PAGE; + + if (!is_paging(vcpu)) { + context->gva_to_gpa = nonpaging_gva_to_gpa; + context->root_level = 0; + } else if (is_long_mode(vcpu)) { + reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); + context->gva_to_gpa = paging64_gva_to_gpa; + context->root_level = PT64_ROOT_LEVEL; + } else if (is_pae(vcpu)) { + reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); + context->gva_to_gpa = paging64_gva_to_gpa; + context->root_level = PT32E_ROOT_LEVEL; + } 
else { + reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); + context->gva_to_gpa = paging32_gva_to_gpa; + context->root_level = PT32_ROOT_LEVEL; + } + + return 0; +} + +static int nonpaging_init_context(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *context = &vcpu->arch.mmu; + + context->new_cr3 = nonpaging_new_cr3; + context->page_fault = nonpaging_page_fault; + context->gva_to_gpa = nonpaging_gva_to_gpa; + context->free = nonpaging_free; + context->prefetch_page = nonpaging_prefetch_page; + context->sync_page = nonpaging_sync_page; + context->invlpg = nonpaging_invlpg; + context->root_level = 0; + context->shadow_root_level = PT32E_ROOT_LEVEL; + context->root_hpa = INVALID_PAGE; + return 0; +} + +static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) +{ + struct kvm_mmu *context = &vcpu->arch.mmu; + + ASSERT(is_pae(vcpu)); + context->new_cr3 = paging_new_cr3; + context->page_fault = paging64_page_fault; + context->gva_to_gpa = paging64_gva_to_gpa; + context->prefetch_page = paging64_prefetch_page; + context->sync_page = paging64_sync_page; + context->invlpg = paging64_invlpg; + context->free = paging_free; + context->root_level = level; + context->shadow_root_level = level; + context->root_hpa = INVALID_PAGE; + return 0; +} + +static int paging64_init_context(struct kvm_vcpu *vcpu) +{ + reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); + return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL); +} + +static int paging32_init_context(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *context = &vcpu->arch.mmu; + + reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); + context->new_cr3 = paging_new_cr3; + context->page_fault = paging32_page_fault; + context->gva_to_gpa = paging32_gva_to_gpa; + context->free = paging_free; + context->prefetch_page = paging32_prefetch_page; + context->sync_page = paging32_sync_page; + context->invlpg = paging32_invlpg; + context->root_level = PT32_ROOT_LEVEL; + context->shadow_root_level = PT32E_ROOT_LEVEL; + context->root_hpa = INVALID_PAGE; + return 0; +} + +static int paging32E_init_context(struct kvm_vcpu *vcpu) +{ + reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); + return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); +} + +static int init_kvm_softmmu(struct kvm_vcpu *vcpu) +{ + int r; + + ASSERT(vcpu); + ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + + if (!is_paging(vcpu)) + r = nonpaging_init_context(vcpu); + else if (is_long_mode(vcpu)) + r = paging64_init_context(vcpu); + else if (is_pae(vcpu)) + r = paging32E_init_context(vcpu); + else + r = paging32_init_context(vcpu); + + vcpu->arch.mmu.base_role.glevels = vcpu->arch.mmu.root_level; + + return r; +} + +int init_kvm_mmu(struct kvm_vcpu *vcpu) { vcpu->arch.update_pte.pfn = -1; /* bad_pfn */ #ifdef XXX + /* + * XXX currently, we won't support 2 dimensional paging. + * So the hardware will not do guest-virtual to guest-physical + * and guest-physical to host physical. So we'll need to + * implement "shadow" paging... + */ + if (tdp_enabled) return init_kvm_tdp_mmu(vcpu); else +#endif return init_kvm_softmmu(vcpu); -#else return 0; -#endif /*XXX*/ } int kvm_mmu_setup(struct kvm_vcpu *vcpu) @@ -1196,6 +1652,11 @@ free_vcpu: return r; } +void kvm_get_kvm(struct kvm *kvm) +{ + atomic_inc_32(&kvm->users_count); +} + /* * Creates some virtual cpus. Good luck creating more than one. 
*/ @@ -1217,9 +1678,8 @@ kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int32_t id, struct kvm_vcpu_ioc *arg, if (r) return r; -#ifdef NOTNOW - - mutex_lock(&kvm->lock); + mutex_enter(&kvm->lock); +#ifdef XXX if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) { r = -EINVAL; goto vcpu_destroy; @@ -1233,30 +1693,33 @@ kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int32_t id, struct kvm_vcpu_ioc *arg, BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]); +#endif /*XXX*/ + /* Now it's all set up, let userspace reach it */ kvm_get_kvm(kvm); -#endif /*NOTNOW*/ - *rval_p = vcpuid++; /* guarantee unique id */ + + *rval_p = kvm->online_vcpus; /* guarantee unique id */ + vcpu->vcpu_id = *rval_p; /* XXX need to protect online_vcpus */ - kvm->vcpus[kvm->online_vcpus] = vcpu; + kvm->vcpus[kvm->online_vcpus++] = vcpu; -#ifdef NOTNOW +#ifdef XXX smp_wmb(); - atomic_inc(&kvm->online_vcpus); +#endif /*XXX*/ + atomic_inc_32(&kvm->online_vcpus); #ifdef CONFIG_KVM_APIC_ARCHITECTURE if (kvm->bsp_vcpu_id == id) kvm->bsp_vcpu = vcpu; #endif - mutex_unlock(&kvm->lock); -#endif /*NOTNOW*/ + mutex_exit(&kvm->lock); return r; vcpu_destroy: #ifdef NOTNOW - mutex_unlock(&kvm->lock); + mutex_exit(&kvm->lock); kvm_arch_vcpu_destroy(vcpu); #endif /*NOTNOW*/ return r; @@ -1281,7 +1744,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, unsigned long userspace_addr; down_write(¤t->mm->mmap_sem); userspace_addr = do_mmap(NULL, 0, - npages * PAGE_SIZE, + npages * PAGESIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0); @@ -1330,6 +1793,240 @@ int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, return kvm_set_memory_region(kvm, mem, user_alloc); } +static inline struct kvm_coalesced_mmio_dev *to_mmio(struct kvm_io_device *dev) +{ + return container_of(dev, struct kvm_coalesced_mmio_dev, dev); +} + +static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev, + gpa_t addr, int len) +{ + struct kvm_coalesced_mmio_zone *zone; + struct kvm_coalesced_mmio_ring *ring; + unsigned avail; + int i; + + /* Are we able to batch it ? */ + + /* last is the first free entry + * check if we don't meet the first used entry + * there is always one unused entry in the buffer + */ + ring = dev->kvm->coalesced_mmio_ring; + avail = (ring->first - ring->last - 1) % KVM_COALESCED_MMIO_MAX; + if (avail < KVM_MAX_VCPUS) { + /* full */ + return 0; + } + + /* is it in a batchable area ? */ + + for (i = 0; i < dev->nb_zones; i++) { + zone = &dev->zone[i]; + + /* (addr,len) is fully included in + * (zone->addr, zone->size) + */ + + if (zone->addr <= addr && + addr + len <= zone->addr + zone->size) + return 1; + } + return 0; +} + +/* Caller must hold slots_lock. */ +int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, + struct kvm_io_device *dev) +{ + struct kvm_io_bus *new_bus, *bus; + + bus = kvm->buses[bus_idx]; + if (bus->dev_count > NR_IOBUS_DEVS-1) + return -ENOSPC; + + new_bus = kmem_zalloc(sizeof(struct kvm_io_bus), KM_SLEEP); + if (!new_bus) + return -ENOMEM; + memcpy(new_bus, bus, sizeof(struct kvm_io_bus)); + new_bus->devs[new_bus->dev_count++] = dev; +#ifdef XXX + rcu_assign_pointer(kvm->buses[bus_idx], new_bus); + synchronize_srcu_expedited(&kvm->srcu); +#endif /*XXX*/ + kmem_free(bus, sizeof(struct kvm_io_bus)); + + return 0; +} + +/* Caller must hold slots_lock. 
+/* Caller must hold slots_lock. */
+int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+			    struct kvm_io_device *dev)
+{
+	struct kvm_io_bus *new_bus, *bus;
+
+	bus = kvm->buses[bus_idx];
+	if (bus->dev_count > NR_IOBUS_DEVS-1)
+		return -ENOSPC;
+
+	new_bus = kmem_zalloc(sizeof(struct kvm_io_bus), KM_SLEEP);
+	if (!new_bus)
+		return -ENOMEM;
+	memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
+	new_bus->devs[new_bus->dev_count++] = dev;
+#ifdef XXX
+	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
+	synchronize_srcu_expedited(&kvm->srcu);
+#endif /*XXX*/
+	kmem_free(bus, sizeof(struct kvm_io_bus));
+
+	return 0;
+}
+
+/* Caller must hold slots_lock. */
+int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+			      struct kvm_io_device *dev)
+{
+	int i, r;
+	struct kvm_io_bus *new_bus, *bus;
+
+	new_bus = kmem_zalloc(sizeof(struct kvm_io_bus), KM_SLEEP);
+	if (!new_bus)
+		return -ENOMEM;
+
+	bus = kvm->buses[bus_idx];
+	memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
+
+	r = -ENOENT;
+	for (i = 0; i < new_bus->dev_count; i++)
+		if (new_bus->devs[i] == dev) {
+			r = 0;
+			new_bus->devs[i] = new_bus->devs[--new_bus->dev_count];
+			break;
+		}
+
+	if (r) {
+		kmem_free(new_bus, sizeof(struct kvm_io_bus));
+		return r;
+	}
+
+#ifdef XXX
+	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
+	synchronize_srcu_expedited(&kvm->srcu);
+#endif
+	kmem_free(bus, sizeof(struct kvm_io_bus));
+	return r;
+}
+
+static int coalesced_mmio_write(struct kvm_io_device *this,
+				gpa_t addr, int len, const void *val)
+{
+	struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
+	struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring;
+	if (!coalesced_mmio_in_range(dev, addr, len))
+		return -EOPNOTSUPP;
+
+	mutex_enter(&dev->lock);
+
+	/* copy data in first free entry of the ring */
+
+	ring->coalesced_mmio[ring->last].phys_addr = addr;
+	ring->coalesced_mmio[ring->last].len = len;
+	memcpy(ring->coalesced_mmio[ring->last].data, val, len);
+#ifdef XXX
+	smp_wmb();
+#endif /*XXX*/
+	ring->last = (ring->last + 1) % KVM_COALESCED_MMIO_MAX;
+	mutex_exit(&dev->lock);
+	return 0;
+}
+
+static void coalesced_mmio_destructor(struct kvm_io_device *this)
+{
+	struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
+
+	kmem_free(dev, sizeof(struct kvm_coalesced_mmio_dev));
+}
+
+static const struct kvm_io_device_ops coalesced_mmio_ops = {
+	.write = coalesced_mmio_write,
+	.destructor = coalesced_mmio_destructor,
+};
+
+
+int kvm_coalesced_mmio_init(struct kvm *kvm)
+{
+	struct kvm_coalesced_mmio_dev *dev;
+	caddr_t *page;
+	int ret;
+
+	ret = -ENOMEM;
+	page = kmem_zalloc(PAGESIZE, KM_SLEEP);
+	if (!page)
+		goto out_err;
+	kvm->coalesced_mmio_ring = (struct kvm_coalesced_mmio_ring *)page;
+
+	ret = -ENOMEM;
+	dev = kmem_alloc(sizeof(struct kvm_coalesced_mmio_dev), KM_SLEEP);
+	if (!dev)
+		goto out_free_page;
+	mutex_init(&dev->lock, NULL, MUTEX_DRIVER, 0);
+	kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops);
+	dev->kvm = kvm;
+	kvm->coalesced_mmio_dev = dev;
+
+	mutex_enter(&kvm->slots_lock);
+	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &dev->dev);
+	mutex_exit(&kvm->slots_lock);
+	if (ret < 0)
+		goto out_free_dev;
+
+	return ret;
+
+out_free_dev:
+	kmem_free(dev, sizeof(struct kvm_coalesced_mmio_dev));
+out_free_page:
+	kmem_free(page, PAGESIZE);
+out_err:
+	return ret;
+}
+
+void kvm_coalesced_mmio_free(struct kvm *kvm)
+{
+	if (kvm->coalesced_mmio_ring)
+		kmem_free(kvm->coalesced_mmio_ring, PAGESIZE);
+}
+
+int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm,
+					 struct kvm_coalesced_mmio_zone *zone)
+{
+	struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev;
+
+	if (dev == NULL)
+		return -EINVAL;
+
+	mutex_enter(&kvm->slots_lock);
+	if (dev->nb_zones >= KVM_COALESCED_MMIO_ZONE_MAX) {
+		mutex_exit(&kvm->slots_lock);
+		return -ENOBUFS;
+	}
+
+	dev->zone[dev->nb_zones] = *zone;
+	dev->nb_zones++;
+
+	mutex_exit(&kvm->slots_lock);
+	return 0;
+}
+
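For orientation (not part of this commit): in the Linux KVM ABI, the ring filled by coalesced_mmio_write() is mmap'd by userspace at KVM_COALESCED_MMIO_PAGE_OFFSET and drained by advancing ring->first. Assuming this port keeps the same ring layout, the consumer side is roughly the loop below; flush_coalesced_mmio() and handle_mmio_write() are made-up names standing in for the VMM's own dispatch.

	/* Sketch of a userspace drain loop over the shared ring. */
	void
	flush_coalesced_mmio(struct kvm_coalesced_mmio_ring *ring)
	{
		while (ring->first != ring->last) {
			struct kvm_coalesced_mmio *ent =
			    &ring->coalesced_mmio[ring->first];

			handle_mmio_write(ent->phys_addr, ent->data, ent->len);
			/* release the slot only after the entry has been consumed */
			ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
		}
	}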
+	i = dev->nb_zones;
+	while (i) {
+		z = &dev->zone[i - 1];
+
+		/* unregister all zones
+		 * included in (zone->addr, zone->size)
+		 */
+
+		if (zone->addr <= z->addr &&
+		    z->addr + z->size <= zone->addr + zone->size) {
+			dev->nb_zones--;
+			*z = dev->zone[dev->nb_zones];
+		}
+		i--;
+	}
+
+	mutex_exit(&kvm->slots_lock);
+
+	return 0;
+}
+
long
kvm_vm_ioctl(struct kvm *kvmp, unsigned int ioctl, unsigned long arg, int mode)
{
@@ -1358,6 +2055,7 @@ kvm_vm_ioctl(struct kvm *kvmp, unsigned int ioctl, unsigned long arg, int mode)
			goto out;
		break;
	}
+#endif /*NOTNOW*/

#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	case KVM_REGISTER_COALESCED_MMIO: {
@@ -1385,13 +2083,12 @@ kvm_vm_ioctl(struct kvm *kvmp, unsigned int ioctl, unsigned long arg, int mode)
		break;
	}
#endif
-
+#ifdef XXX
	case KVM_IRQFD: {
		struct kvm_irqfd data;

-		r = -EFAULT;
-		if (copy_from_user(&data, argp, sizeof data))
-			goto out;
+		if (ddi_copyin(argp, &data, sizeof data, mode))
+			return (EFAULT);
		r = kvm_irqfd(kvmp, data.fd, data.gsi, data.flags);
		break;
	}
@@ -1404,24 +2101,21 @@ kvm_vm_ioctl(struct kvm *kvmp, unsigned int ioctl, unsigned long arg, int mode)
		r = kvm_ioeventfd(kvmp, &data);
		break;
	}
-#endif /*NOTNOW*/
+
#ifdef CONFIG_KVM_APIC_ARCHITECTURE
	case KVM_SET_BOOT_CPU_ID:
		r = 0;
-		mutex_lock(&kvmp->lock);
+		mutex_enter(&kvmp->lock);
		if (atomic_read(&kvmp->online_vcpus) != 0)
			r = -EBUSY;
		else
			kvmp->bsp_vcpu_id = arg;
-		mutex_unlock(&kvmp->lock);
+		mutex_exit(&kvmp->lock);
		break;
#endif
-#ifdef NOTNOW
+#endif /*XXX*/
	default:
-		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
-		if (r == -ENOTTY)
-			r = kvm_vm_ioctl_assigned_device(kvmp, ioctl, arg);
-#endif /*NOTNOW*/
+		return EINVAL;
	}
out:

diff --git a/msr.h b/msr.h
@@ -12,7 +12,7 @@
#ifdef _KERNEL

#include "asm.h"
-
+#include <sys/ontrap.h>
#include <sys/errno.h>

#ifdef XXX
@@ -76,21 +76,11 @@ static inline unsigned long long native_read_msr(unsigned int msr)
}

-static inline unsigned long long native_read_msr_safe(unsigned int msr,
-						       int *err)
-{
-	DECLARE_ARGS(val, low, high);
+extern uint64_t native_read_msr_safe(unsigned int msr,
+				     int *err);
+extern int native_write_msr_safe(unsigned int msr,
+				 unsigned low, unsigned high);

-	asm volatile("2: rdmsr ; xor %[err],%[err]\n"
-		     "1:\n\t"
-		     ".section .fixup,\"ax\"\n\t"
-		     "3: mov %[fault],%[err] ; jmp 1b\n\t"
-		     ".previous\n\t"
-		     _ASM_EXTABLE(2b, 3b)
-		     : [err] "=r" (*err), EAX_EDX_RET(val, low, high)
-		     : "c" (msr), [fault] "i" (-EIO));
-	return EAX_EDX_VAL(val, low, high);
-}

static inline void native_write_msr(unsigned int msr,
				    unsigned low, unsigned high)
@@ -98,23 +88,6 @@ static inline void native_write_msr(unsigned int msr,
	asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory");
}

-/* Can be uninlined because referenced by paravirt */
-static inline int native_write_msr_safe(unsigned int msr,
-					unsigned low, unsigned high)
-{
-	int err;
-	asm volatile("2: wrmsr ; xor %[err],%[err]\n"
-		     "1:\n\t"
-		     ".section .fixup,\"ax\"\n\t"
-		     "3: mov %[fault],%[err] ; jmp 1b\n\t"
-		     ".previous\n\t"
-		     _ASM_EXTABLE(2b, 3b)
-		     : [err] "=a" (err)
-		     : "c" (msr), "0" (low), "d" (high),
-		       [fault] "i" (-EIO)
-		     : "memory");
-	return err;
-}

extern unsigned long long native_read_tsc(void);
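The msr.h hunks above drop the Linux exception-table asm for the "safe" MSR accessors, declare them extern instead, and pull in <sys/ontrap.h>, which suggests the out-of-line versions are meant to trap-protect the raw instructions. Their definitions are not part of this hunk; the following is only a sketch of what an on_trap()-based version could look like, assuming illumos's rdmsr(uint_t)/wrmsr(uint_t, uint64_t) kernel routines and OT_DATA_ACCESS protection as used elsewhere in the kernel for probing MSRs.

	/* Sketch only -- not the commit's implementation. */
	uint64_t
	native_read_msr_safe(unsigned int msr, int *err)
	{
		on_trap_data_t otd;
		uint64_t val = 0;

		if (on_trap(&otd, OT_DATA_ACCESS) == 0) {
			val = rdmsr(msr);	/* #GP on a bad MSR unwinds to on_trap() */
			*err = 0;
		} else {
			*err = -EIO;
		}
		no_trap();
		return (val);
	}

	int
	native_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
	{
		on_trap_data_t otd;
		int err = 0;

		if (on_trap(&otd, OT_DATA_ACCESS) == 0)
			wrmsr(msr, ((uint64_t)high << 32) | low);
		else
			err = -EIO;
		no_trap();
		return (err);
	}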