Diffstat (limited to 'linux/x86/x86.c')
-rw-r--r--	linux/x86/x86.c	3733
1 file changed, 1609 insertions(+), 2124 deletions(-)
diff --git a/linux/x86/x86.c b/linux/x86/x86.c
index a6233d2..1071014 100644
--- a/linux/x86/x86.c
+++ b/linux/x86/x86.c
@@ -46,7 +46,6 @@
  * Copyright (C) 2006 Qumranet, Inc.
  * Copyright (C) 2008 Qumranet, Inc.
  * Copyright IBM Corporation, 2008
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  * Authors:
  *   Avi Kivity   <avi@qumranet.com>
@@ -81,23 +80,17 @@
 #include <linux/user-return-notifier.h>
 #include <linux/srcu.h>
 #include <linux/slab.h>
-#include <linux/perf_event.h>
-#include <linux/uaccess.h>
-#include <linux/hash.h>
 #include <trace/events/kvm.h>
-
+#undef TRACE_INCLUDE_FILE
 #define CREATE_TRACE_POINTS
 #include "trace.h"
 
 #include <asm/debugreg.h>
+#include <asm/uaccess.h>
 #include <asm/msr.h>
 #include <asm/desc.h>
 #include <asm/mtrr.h>
 #include <asm/mce.h>
-#include <asm/i387.h>
-#include <asm/xcr.h>
-#include <asm/pvclock.h>
-#include <asm/div64.h>
 
 #define MAX_IO_MSRS 256
 
@@ -108,13 +101,12 @@
 	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
 			  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE	\
 			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR	\
-			  | X86_CR4_OSXSAVE \
 			  | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
 
 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
 
 #define KVM_MAX_MCE_BANKS 32
-#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
+#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P
 
 /* EFER defaults:
  * - enable syscall per default because its emulated by KVM
@@ -172,7 +164,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "halt_exits", VCPU_STAT(halt_exits) },
 	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
 	{ "hypercalls", VCPU_STAT(hypercalls) },
-	{ "request_irq", VCPU_STAT(request_irq_exits) },
+	{ "kvm_request_irq", VCPU_STAT(request_irq_exits) },
 	{ "irq_exits", VCPU_STAT(irq_exits) },
 	{ "host_state_reload", VCPU_STAT(host_state_reload) },
 	{ "efer_reload", VCPU_STAT(efer_reload) },
@@ -194,15 +186,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ NULL }
 };
 
-u64 __read_mostly host_xcr0;
-
-static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
-{
-	int i;
-	for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
-		vcpu->arch.apf.gfns[i] = ~0;
-}
-
 static void kvm_on_user_return(struct kvm_user_return_notifier *urn)
 {
 	unsigned slot;
@@ -280,6 +263,34 @@ static void drop_user_return_notifiers(void *ignore)
 		kvm_on_user_return(&smsr->urn);
 }
 
+unsigned long segment_base(u16 selector)
+{
+	struct descriptor_table gdt;
+	struct kvm_desc_struct *d;
+	unsigned long table_base;
+	unsigned long v;
+
+	if (selector == 0)
+		return 0;
+
+	kvm_get_gdt(&gdt);
+	table_base = gdt.base;
+
+	if (selector & 4) {           /* from ldt */
+		u16 ldt_selector = kvm_read_ldt();
+
+		table_base = segment_base(ldt_selector);
+	}
+	d = (struct kvm_desc_struct *)(table_base + (selector & ~7));
+	v = kvm_get_desc_base(d);
+#ifdef CONFIG_X86_64
+	if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
+		v |= ((unsigned long)((struct kvm_ldttss_desc64 *)d)->base3) << 32;
+#endif
+	return v;
+}
+EXPORT_SYMBOL_GPL(segment_base);
+
 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
 {
 	if (irqchip_in_kernel(vcpu->kvm))
@@ -321,21 +332,17 @@ static int exception_class(int vector)
 }
 
 static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
-		unsigned nr, bool has_error, u32 error_code,
-		bool reinject)
+		unsigned nr, bool has_error, u32 error_code)
 {
 	u32 prev_nr;
 	int class1, class2;
 
-	kvm_make_request(KVM_REQ_EVENT, vcpu);
-
 	if (!vcpu->arch.exception.pending) {
 	queue:
 		vcpu->arch.exception.pending = true;
 		vcpu->arch.exception.has_error_code = has_error;
 		vcpu->arch.exception.nr = nr;
 		vcpu->arch.exception.error_code = error_code;
-		vcpu->arch.exception.reinject = reinject;
 		return;
 	}
@@ -343,7 +350,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 	prev_nr = vcpu->arch.exception.nr;
 	if (prev_nr == DF_VECTOR) {
 		/* triple fault -> shutdown */
-		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+		set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
 		return;
 	}
 	class1 = exception_class(prev_nr);
@@ -364,59 +371,30 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 
 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 {
-	kvm_multiple_exception(vcpu, nr, false, 0, false);
+	kvm_multiple_exception(vcpu, nr, false, 0);
 }
 EXPORT_SYMBOL_GPL(kvm_queue_exception);
 
-void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
-{
-	kvm_multiple_exception(vcpu, nr, false, 0, true);
-}
-EXPORT_SYMBOL_GPL(kvm_requeue_exception);
-
-void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
-{
-	if (err)
-		kvm_inject_gp(vcpu, 0);
-	else
-		kvm_x86_ops->skip_emulated_instruction(vcpu);
-}
-EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
-
-void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
+			   u32 error_code)
 {
 	++vcpu->stat.pf_guest;
-	vcpu->arch.cr2 = fault->address;
-	kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
-}
-
-void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
-{
-	if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
-		vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
-	else
-		vcpu->arch.mmu.inject_page_fault(vcpu, fault);
+	vcpu->arch.cr2 = addr;
+	kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
 }
 
 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
 {
-	kvm_make_request(KVM_REQ_EVENT, vcpu);
 	vcpu->arch.nmi_pending = 1;
 }
 EXPORT_SYMBOL_GPL(kvm_inject_nmi);
 
 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
 {
-	kvm_multiple_exception(vcpu, nr, true, error_code, false);
+	kvm_multiple_exception(vcpu, nr, true, error_code);
 }
 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
 
-void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
-{
-	kvm_multiple_exception(vcpu, nr, true, error_code, true);
-}
-EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
-
 /*
  * Checks if cpl <= required_cpl; if true, return true. Otherwise queue
  * a #GP and return false.
@@ -431,49 +409,18 @@ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
 EXPORT_SYMBOL_GPL(kvm_require_cpl);
 
 /*
- * This function will be used to read from the physical memory of the currently
- * running guest. The difference to kvm_read_guest_page is that this function
- * can read from guest physical or from the guest's guest physical memory.
- */
-int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
-			    gfn_t ngfn, void *data, int offset, int len,
-			    u32 access)
-{
-	gfn_t real_gfn;
-	gpa_t ngpa;
-
-	ngpa     = gfn_to_gpa(ngfn);
-	real_gfn = mmu->translate_gpa(vcpu, ngpa, access);
-	if (real_gfn == UNMAPPED_GVA)
-		return -EFAULT;
-
-	real_gfn = gpa_to_gfn(real_gfn);
-
-	return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len);
-}
-EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
-
-int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
-			       void *data, int offset, int len, u32 access)
-{
-	return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
-				       data, offset, len, access);
-}
-
-/*
  * Load the pae pdptrs.  Return true is they are all valid.
  */
-int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
+int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
 	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
 	unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
 	int i;
 	int ret;
-	u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
+	u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
 
-	ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
-				      offset * sizeof(u64), sizeof(pdpte),
-				      PFERR_USER_MASK|PFERR_WRITE_MASK);
+	ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
+				  offset * sizeof(u64), sizeof(pdpte));
 	if (ret < 0) {
 		ret = 0;
 		goto out;
@@ -487,7 +434,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
 	}
 	ret = 1;
 
-	memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
+	memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
 	__set_bit(VCPU_EXREG_PDPTR,
 		  (unsigned long *)&vcpu->arch.regs_avail);
 	__set_bit(VCPU_EXREG_PDPTR,
@@ -500,10 +447,8 @@ EXPORT_SYMBOL_GPL(load_pdptrs);
 
 static bool pdptrs_changed(struct kvm_vcpu *vcpu)
 {
-	u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
+	u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
 	bool changed = true;
-	int offset;
-	gfn_t gfn;
 	int r;
 
 	if (is_long_mode(vcpu) || !is_pae(vcpu))
@@ -513,181 +458,132 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
 		      (unsigned long *)&vcpu->arch.regs_avail))
 		return true;
 
-	gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT;
-	offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1);
-	r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
-				       PFERR_USER_MASK | PFERR_WRITE_MASK);
+	r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
 	if (r < 0)
 		goto out;
-	changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
+	changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
 out:
 
 	return changed;
 }
 
-int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
-	unsigned long old_cr0 = kvm_read_cr0(vcpu);
-	unsigned long update_bits = X86_CR0_PG | X86_CR0_WP |
-				    X86_CR0_CD | X86_CR0_NW;
-
 	cr0 |= X86_CR0_ET;
 
 #ifdef CONFIG_X86_64
-	if (cr0 & 0xffffffff00000000UL)
-		return 1;
+	if (cr0 & 0xffffffff00000000UL) {
+		kvm_inject_gp(vcpu, 0);
+		return;
+	}
 #endif
 
 	cr0 &= ~CR0_RESERVED_BITS;
 
-	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
-		return 1;
+	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
+		kvm_inject_gp(vcpu, 0);
+		return;
+	}
 
-	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
-		return 1;
+	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
+		kvm_inject_gp(vcpu, 0);
+		return;
+	}
 
 	if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
 #ifdef CONFIG_X86_64
 		if ((vcpu->arch.efer & EFER_LME)) {
 			int cs_db, cs_l;
 
-			if (!is_pae(vcpu))
-				return 1;
+			if (!is_pae(vcpu)) {
+				kvm_inject_gp(vcpu, 0);
+				return;
+			}
 			kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
-			if (cs_l)
-				return 1;
+			if (cs_l) {
+				kvm_inject_gp(vcpu, 0);
+				return;
+
+			}
 		} else
 #endif
-		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
-						 kvm_read_cr3(vcpu)))
-			return 1;
+		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
+			kvm_inject_gp(vcpu, 0);
+			return;
+		}
+
 	}
 
 	kvm_x86_ops->set_cr0(vcpu, cr0);
+	vcpu->arch.cr0 = cr0;
 
-	if ((cr0 ^ old_cr0) & X86_CR0_PG)
-		kvm_clear_async_pf_completion_queue(vcpu);
-
-	if ((cr0 ^ old_cr0) & update_bits)
-		kvm_mmu_reset_context(vcpu);
-	return 0;
+	kvm_mmu_reset_context(vcpu);
+	return;
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr0);
 
 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 {
-	(void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
+	kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0ful) | (msw & 0x0f));
 }
 EXPORT_SYMBOL_GPL(kvm_lmsw);
 
-int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
+void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
-	u64 xcr0;
-
-	/* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now  */
-	if (index != XCR_XFEATURE_ENABLED_MASK)
-		return 1;
-	xcr0 = xcr;
-	if (kvm_x86_ops->get_cpl(vcpu) != 0)
-		return 1;
-	if (!(xcr0 & XSTATE_FP))
-		return 1;
-	if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
-		return 1;
-	if (xcr0 & ~host_xcr0)
-		return 1;
-	vcpu->arch.xcr0 = xcr0;
-	vcpu->guest_xcr0_loaded = 0;
-	return 0;
-}
+	unsigned long old_cr4 = kvm_read_cr4(vcpu);
+	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
 
-int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
-{
-	if (__kvm_set_xcr(vcpu, index, xcr)) {
+	if (cr4 & CR4_RESERVED_BITS) {
 		kvm_inject_gp(vcpu, 0);
-		return 1;
-	}
-	return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_set_xcr);
-
-static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
-{
-	struct kvm_cpuid_entry2 *best;
-
-	best = kvm_find_cpuid_entry(vcpu, 1, 0);
-	return best && (best->ecx & bit(X86_FEATURE_XSAVE));
-}
-
-static void update_cpuid(struct kvm_vcpu *vcpu)
-{
-	struct kvm_cpuid_entry2 *best;
-
-	best = kvm_find_cpuid_entry(vcpu, 1, 0);
-	if (!best)
 		return;
-
-	/* Update OSXSAVE bit */
-	if (kvm_cpu_has_xsave && best->function == 0x1) {
-		best->ecx &= ~(bit(X86_FEATURE_OSXSAVE));
-		if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
-			best->ecx |= bit(X86_FEATURE_OSXSAVE);
 	}
-}
-
-int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
-{
-	unsigned long old_cr4 = kvm_read_cr4(vcpu);
-	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
-
-	if (cr4 & CR4_RESERVED_BITS)
-		return 1;
 
-	if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE))
-		return 1;
 
 	if (is_long_mode(vcpu)) {
-		if (!(cr4 & X86_CR4_PAE))
-			return 1;
+		if (!(cr4 & X86_CR4_PAE)) {
+			kvm_inject_gp(vcpu, 0);
+			return;
+		}
 	} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
 		   && ((cr4 ^ old_cr4) & pdptr_bits)
-		   && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
-				   kvm_read_cr3(vcpu)))
-		return 1;
-
-	if (cr4 & X86_CR4_VMXE)
-		return 1;
+		   && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
+		kvm_inject_gp(vcpu, 0);
+		return;
+	}
 
+	if (cr4 & X86_CR4_VMXE) {
+		kvm_inject_gp(vcpu, 0);
+		return;
+	}
 	kvm_x86_ops->set_cr4(vcpu, cr4);
-
-	if ((cr4 ^ old_cr4) & pdptr_bits)
-		kvm_mmu_reset_context(vcpu);
-
-	if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
-		update_cpuid(vcpu);
-
-	return 0;
+	vcpu->arch.cr4 = cr4;
+	vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
+	kvm_mmu_reset_context(vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr4);
 
-int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
-	if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
+	if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
 		kvm_mmu_sync_roots(vcpu);
 		kvm_mmu_flush_tlb(vcpu);
-		return 0;
+		return;
 	}
 
 	if (is_long_mode(vcpu)) {
-		if (cr3 & CR3_L_MODE_RESERVED_BITS)
-			return 1;
+		if (cr3 & CR3_L_MODE_RESERVED_BITS) {
+			kvm_inject_gp(vcpu, 0);
+			return;
+		}
 	} else {
 		if (is_pae(vcpu)) {
-			if (cr3 & CR3_PAE_RESERVED_BITS)
-				return 1;
-			if (is_paging(vcpu) &&
-			    !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
-				return 1;
+			if (cr3 & CR3_PAE_RESERVED_BITS) {
+				kvm_inject_gp(vcpu, 0);
+				return;
+			}
+			if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
+				kvm_inject_gp(vcpu, 0);
+				return;
+			}
 		}
 		/*
 		 * We don't check reserved bits in nonpae mode, because
@@ -705,23 +601,24 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 	 * to debug) behavior on the guest side.
 	 */
 	if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
-		return 1;
-	vcpu->arch.cr3 = cr3;
-	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
-	vcpu->arch.mmu.new_cr3(vcpu);
-	return 0;
+		kvm_inject_gp(vcpu, 0);
+	else {
+		vcpu->arch.cr3 = cr3;
+		vcpu->arch.mmu.new_cr3(vcpu);
+	}
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr3);
 
-int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
+void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
 {
-	if (cr8 & CR8_RESERVED_BITS)
-		return 1;
+	if (cr8 & CR8_RESERVED_BITS) {
+		kvm_inject_gp(vcpu, 0);
+		return;
+	}
 	if (irqchip_in_kernel(vcpu->kvm))
 		kvm_lapic_set_tpr(vcpu, cr8);
 	else
 		vcpu->arch.cr8 = cr8;
-	return 0;
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr8);
 
@@ -734,90 +631,11 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_get_cr8);
 
-static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
+static inline u32 bit(int bitno)
 {
-	switch (dr) {
-	case 0 ... 3:
-		vcpu->arch.db[dr] = val;
-		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
-			vcpu->arch.eff_db[dr] = val;
-		break;
-	case 4:
-		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
-			return 1; /* #UD */
-		/* fall through */
-	case 6:
-		if (val & 0xffffffff00000000ULL)
-			return -1; /* #GP */
-		vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
-		break;
-	case 5:
-		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
-			return 1; /* #UD */
-		/* fall through */
-	default: /* 7 */
-		if (val & 0xffffffff00000000ULL)
-			return -1; /* #GP */
-		vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
-		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
-			kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
-			vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK);
-		}
-		break;
-	}
-
-	return 0;
+	return 1 << (bitno & 31);
 }
 
-int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
-{
-	int res;
-
-	res = __kvm_set_dr(vcpu, dr, val);
-	if (res > 0)
-		kvm_queue_exception(vcpu, UD_VECTOR);
-	else if (res < 0)
-		kvm_inject_gp(vcpu, 0);
-
-	return res;
-}
-EXPORT_SYMBOL_GPL(kvm_set_dr);
-
-static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
-{
-	switch (dr) {
-	case 0 ... 3:
-		*val = vcpu->arch.db[dr];
-		break;
-	case 4:
-		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
-			return 1;
-		/* fall through */
-	case 6:
-		*val = vcpu->arch.dr6;
-		break;
-	case 5:
-		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
-			return 1;
-		/* fall through */
-	default: /* 7 */
-		*val = vcpu->arch.dr7;
-		break;
-	}
-
-	return 0;
-}
-
-int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
-{
-	if (_kvm_get_dr(vcpu, dr, val)) {
-		kvm_queue_exception(vcpu, UD_VECTOR);
-		return 1;
-	}
-	return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_get_dr);
-
 /*
  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
  * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
@@ -827,67 +645,67 @@ EXPORT_SYMBOL_GPL(kvm_get_dr);
 * kvm-specific. Those are put in the beginning of the list.
 */
-#define KVM_SAVE_MSRS_BEGIN	8
+#define KVM_SAVE_MSRS_BEGIN	5
 static u32 msrs_to_save[] = {
 	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
-	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
 	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
-	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN,
+	HV_X64_MSR_APIC_ASSIST_PAGE,
 	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
-	MSR_STAR,
+	MSR_K6_STAR,
 #ifdef CONFIG_X86_64
 	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 #endif
-	MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
+	MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
 };
 
 static unsigned num_msrs_to_save;
 
 static u32 emulated_msrs[] = {
 	MSR_IA32_MISC_ENABLE,
-	MSR_IA32_MCG_STATUS,
-	MSR_IA32_MCG_CTL,
 };
 
-static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
+static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
 {
-	u64 old_efer = vcpu->arch.efer;
-
-	if (efer & efer_reserved_bits)
-		return 1;
+	if (efer & efer_reserved_bits) {
+		kvm_inject_gp(vcpu, 0);
+		return;
+	}
 
 	if (is_paging(vcpu)
-	    && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
-		return 1;
+	    && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) {
+		kvm_inject_gp(vcpu, 0);
+		return;
+	}
 
 	if (efer & EFER_FFXSR) {
 		struct kvm_cpuid_entry2 *feat;
 
 		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
-		if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT)))
-			return 1;
+		if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
+			kvm_inject_gp(vcpu, 0);
+			return;
+		}
 	}
 
 	if (efer & EFER_SVME) {
 		struct kvm_cpuid_entry2 *feat;
 
 		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
-		if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM)))
-			return 1;
+		if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
+			kvm_inject_gp(vcpu, 0);
+			return;
+		}
 	}
 
+	kvm_x86_ops->set_efer(vcpu, efer);
+
 	efer &= ~EFER_LMA;
 	efer |= vcpu->arch.efer & EFER_LMA;
 
-	kvm_x86_ops->set_efer(vcpu, efer);
+	vcpu->arch.efer = efer;
 
 	vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
-
-	/* Update reserved bits */
-	if ((efer ^ old_efer) & EFER_NX)
-		kvm_mmu_reset_context(vcpu);
-
-	return 0;
+	kvm_mmu_reset_context(vcpu);
 }
 
 void kvm_enable_efer_bits(u64 mask)
@@ -917,28 +735,20 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
 
 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 {
-	int version;
-	int r;
+	static int version;
 	struct pvclock_wall_clock wc;
 	struct timespec boot;
 
 	if (!wall_clock)
 		return;
 
-	r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
-	if (r)
-		return;
-
-	if (version & 1)
-		++version;  /* first time write, random junk */
-
-	++version;
+	version++;
 
 	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
 
 	/*
 	 * The guest calculates current wall clock time by adding
-	 * system time (updated by kvm_guest_time_update below) to the
+	 * system time (updated by kvm_write_guest_time below) to the
 	 * wall clock specified here.  guest system time equals host
 	 * system time for us, thus we must fill in host boot time here.
	 */
@@ -966,230 +776,64 @@ static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
 	return quotient;
 }
 
-static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
-			       s8 *pshift, u32 *pmultiplier)
+static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
 {
-	uint64_t scaled64;
+	uint64_t nsecs = 1000000000LL;
 	int32_t  shift = 0;
 	uint64_t tps64;
 	uint32_t tps32;
 
-	tps64 = base_khz * 1000LL;
-	scaled64 = scaled_khz * 1000LL;
-	while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
+	tps64 = tsc_khz * 1000LL;
+	while (tps64 > nsecs*2) {
 		tps64 >>= 1;
 		shift--;
 	}
 
 	tps32 = (uint32_t)tps64;
-	while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
-		if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
-			scaled64 >>= 1;
-		else
-			tps32 <<= 1;
+	while (tps32 <= (uint32_t)nsecs) {
+		tps32 <<= 1;
 		shift++;
 	}
 
-	*pshift = shift;
-	*pmultiplier = div_frac(scaled64, tps32);
+	hv_clock->tsc_shift = shift;
+	hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
 
-	pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n",
-		 __func__, base_khz, scaled_khz, shift, *pmultiplier);
-}
-
-static inline u64 get_kernel_ns(void)
-{
-	struct timespec ts;
-
-	WARN_ON(preemptible());
-	ktime_get_ts(&ts);
-	kvm_monotonic_to_bootbased(&ts);
-	return timespec_to_ns(&ts);
+	pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
+		 __func__, tsc_khz, hv_clock->tsc_shift,
+		 hv_clock->tsc_to_system_mul);
 }
 
 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
 
-unsigned long max_tsc_khz;
-
-static inline int kvm_tsc_changes_freq(void)
-{
-	int cpu = get_cpu();
-	int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
-		  cpufreq_quick_get(cpu) != 0;
-	put_cpu();
-	return ret;
-}
-
-static inline u64 nsec_to_cycles(u64 nsec)
-{
-	u64 ret;
-
-	WARN_ON(preemptible());
-	if (kvm_tsc_changes_freq())
-		printk_once(KERN_WARNING
-		 "kvm: unreliable cycle conversion on adjustable rate TSC\n");
-	ret = nsec * kvm___this_cpu_read(cpu_tsc_khz);
-	do_div(ret, USEC_PER_SEC);
-	return ret;
-}
-
-static void kvm_arch_set_tsc_khz(struct kvm *kvm, u32 this_tsc_khz)
-{
-	/* Compute a scale to convert nanoseconds in TSC cycles */
-	kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
-			   &kvm->arch.virtual_tsc_shift,
-			   &kvm->arch.virtual_tsc_mult);
-	kvm->arch.virtual_tsc_khz = this_tsc_khz;
-}
-
-static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
-{
-	u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec,
-				      vcpu->kvm->arch.virtual_tsc_mult,
-				      vcpu->kvm->arch.virtual_tsc_shift);
-	tsc += vcpu->arch.last_tsc_write;
-	return tsc;
-}
-
-void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
-{
-	struct kvm *kvm = vcpu->kvm;
-	u64 offset, ns, elapsed;
-	unsigned long flags;
-	s64 sdiff;
-
-	spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
-	offset = data - kvm_native_read_tsc();
-	ns = get_kernel_ns();
-	elapsed = ns - kvm->arch.last_tsc_nsec;
-	sdiff = data - kvm->arch.last_tsc_write;
-	if (sdiff < 0)
-		sdiff = -sdiff;
-
-	/*
-	 * Special case: close write to TSC within 5 seconds of
-	 * another CPU is interpreted as an attempt to synchronize
-	 * The 5 seconds is to accomodate host load / swapping as
-	 * well as any reset of TSC during the boot process.
-	 *
-	 * In that case, for a reliable TSC, we can match TSC offsets,
-	 * or make a best guest using elapsed value.
-	 */
-	if (sdiff < nsec_to_cycles(5ULL * NSEC_PER_SEC) &&
-	    elapsed < 5ULL * NSEC_PER_SEC) {
-		if (!kvm_check_tsc_unstable()) {
-			offset = kvm->arch.last_tsc_offset;
-			pr_debug("kvm: matched tsc offset for %llu\n", data);
-		} else {
-			u64 delta = nsec_to_cycles(elapsed);
-			offset += delta;
-			pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
-		}
-		ns = kvm->arch.last_tsc_nsec;
-	}
-	kvm->arch.last_tsc_nsec = ns;
-	kvm->arch.last_tsc_write = data;
-	kvm->arch.last_tsc_offset = offset;
-	kvm_x86_ops->write_tsc_offset(vcpu, offset);
-	spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
-
-	/* Reset of TSC must disable overshoot protection below */
-	vcpu->arch.hv_clock.tsc_timestamp = 0;
-	vcpu->arch.last_tsc_write = data;
-	vcpu->arch.last_tsc_nsec = ns;
-}
-EXPORT_SYMBOL_GPL(kvm_write_tsc);
-
-static int kvm_guest_time_update(struct kvm_vcpu *v)
+static void kvm_write_guest_time(struct kvm_vcpu *v)
 {
+	struct timespec ts;
 	unsigned long flags;
 	struct kvm_vcpu_arch *vcpu = &v->arch;
 	void *shared_kaddr;
 	unsigned long this_tsc_khz;
-	s64 kernel_ns, max_kernel_ns;
-	u64 tsc_timestamp;
 
-	/* Keep irq disabled to prevent changes to the clock */
-	local_irq_save(flags);
-	kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp);
-	kernel_ns = get_kernel_ns();
-	this_tsc_khz = kvm___this_cpu_read(cpu_tsc_khz);
+	if ((!vcpu->time_page))
+		return;
 
-	if (unlikely(this_tsc_khz == 0)) {
-		local_irq_restore(flags);
-		kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
-		return 1;
-	}
-
-	/*
-	 * We may have to catch up the TSC to match elapsed wall clock
-	 * time for two reasons, even if kvmclock is used.
-	 * 1) CPU could have been running below the maximum TSC rate
-	 * 2) Broken TSC compensation resets the base at each VCPU
-	 *    entry to avoid unknown leaps of TSC even when running
-	 *    again on the same CPU.  This may cause apparent elapsed
-	 *    time to disappear, and the guest to stand still or run
-	 *    very slowly.
-	 */
-	if (vcpu->tsc_catchup) {
-		u64 tsc = compute_guest_tsc(v, kernel_ns);
-		if (tsc > tsc_timestamp) {
-			kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp);
-			tsc_timestamp = tsc;
-		}
+	this_tsc_khz = get_cpu_var(cpu_tsc_khz);
+	if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) {
+		kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
+		vcpu->hv_clock_tsc_khz = this_tsc_khz;
 	}
+	put_cpu_var(cpu_tsc_khz);
 
+	/* Keep irq disabled to prevent changes to the clock */
+	local_irq_save(flags);
+	kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
+	ktime_get_ts(&ts);
+	kvm_monotonic_to_bootbased(&ts);
 	local_irq_restore(flags);
 
-	if (!vcpu->time_page)
-		return 0;
-
-	/*
-	 * Time as measured by the TSC may go backwards when resetting the base
-	 * tsc_timestamp.  The reason for this is that the TSC resolution is
-	 * higher than the resolution of the other clock scales.  Thus, many
-	 * possible measurments of the TSC correspond to one measurement of any
-	 * other clock, and so a spread of values is possible.  This is not a
-	 * problem for the computation of the nanosecond clock; with TSC rates
-	 * around 1GHZ, there can only be a few cycles which correspond to one
-	 * nanosecond value, and any path through this code will inevitably
-	 * take longer than that.  However, with the kernel_ns value itself,
-	 * the precision may be much lower, down to HZ granularity.  If the
-	 * first sampling of TSC against kernel_ns ends in the low part of the
-	 * range, and the second in the high end of the range, we can get:
-	 *
-	 * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new
-	 *
-	 * As the sampling errors potentially range in the thousands of cycles,
-	 * it is possible such a time value has already been observed by the
-	 * guest.  To protect against this, we must compute the system time as
-	 * observed by the guest and ensure the new system time is greater.
-	 */
-	max_kernel_ns = 0;
-	if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) {
-		max_kernel_ns = vcpu->last_guest_tsc -
-				vcpu->hv_clock.tsc_timestamp;
-		max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
-				    vcpu->hv_clock.tsc_to_system_mul,
-				    vcpu->hv_clock.tsc_shift);
-		max_kernel_ns += vcpu->last_kernel_ns;
-	}
-
-	if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
-		kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz,
-				   &vcpu->hv_clock.tsc_shift,
-				   &vcpu->hv_clock.tsc_to_system_mul);
-		vcpu->hw_tsc_khz = this_tsc_khz;
-	}
-
-	if (max_kernel_ns > kernel_ns)
-		kernel_ns = max_kernel_ns;
-
 	/* With all the info we got, fill in the values */
-	vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
-	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
-	vcpu->last_kernel_ns = kernel_ns;
-	vcpu->last_guest_tsc = tsc_timestamp;
-	vcpu->hv_clock.flags = 0;
+
+	vcpu->hv_clock.system_time = ts.tv_nsec +
+				     (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
 
 	/*
 	 * The interface expects us to write an even number signaling that the
@@ -1206,7 +850,16 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	kunmap_atomic(shared_kaddr, KM_USER0);
 
 	mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
-	return 0;
+}
+
+static int kvm_request_guest_time_update(struct kvm_vcpu *v)
+{
+	struct kvm_vcpu_arch *vcpu = &v->arch;
+
+	if (!vcpu->time_page)
+		return 0;
+	set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests);
+	return 1;
 }
 
 static bool msr_mtrr_valid(unsigned msr)
@@ -1469,38 +1122,14 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 	return 0;
 }
 
-static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
-{
-	gpa_t gpa = data & ~0x3f;
-
-	/* Bits 2:5 are resrved, Should be zero */
-	if (data & 0x3c)
-		return 1;
-
-	vcpu->arch.apf.msr_val = data;
-
-	if (!(data & KVM_ASYNC_PF_ENABLED)) {
-		kvm_clear_async_pf_completion_queue(vcpu);
-		kvm_async_pf_hash_reset(vcpu);
-		return 0;
-	}
-
-	if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa))
-		return 1;
-
-	vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
-	kvm_async_pf_wakeup_all(vcpu);
-	return 0;
-}
-
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
 	switch (msr) {
 	case MSR_EFER:
-		return set_efer(vcpu, data);
+		set_efer(vcpu, data);
+		break;
 	case MSR_K7_HWCR:
 		data &= ~(u64)0x40;	/* ignore flush filter disable */
-		data &= ~(u64)0x100;	/* ignore ignne emulation enable */
 		if (data != 0) {
 			pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
 				data);
@@ -1543,12 +1172,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 	case MSR_IA32_MISC_ENABLE:
 		vcpu->arch.ia32_misc_enable_msr = data;
 		break;
-	case MSR_KVM_WALL_CLOCK_NEW:
 	case MSR_KVM_WALL_CLOCK:
 		vcpu->kvm->arch.wall_clock = data;
 		kvm_write_wall_clock(vcpu->kvm, data);
 		break;
-	case MSR_KVM_SYSTEM_TIME_NEW:
 	case MSR_KVM_SYSTEM_TIME: {
 		if (vcpu->arch.time_page) {
 			kvm_release_page_dirty(vcpu->arch.time_page);
@@ -1556,7 +1183,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		}
 
 		vcpu->arch.time = data;
-		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 
 		/* we verify if the enable bit is set... */
 		if (!(data & 1))
@@ -1572,12 +1198,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 			kvm_release_page_clean(vcpu->arch.time_page);
 			vcpu->arch.time_page = NULL;
 		}
+
+		kvm_request_guest_time_update(vcpu);
 		break;
 	}
-	case MSR_KVM_ASYNC_PF_EN:
-		if (kvm_pv_enable_async_pf(vcpu, data))
-			return 1;
-		break;
 	case MSR_IA32_MCG_CTL:
 	case MSR_IA32_MCG_STATUS:
 	case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
@@ -1612,16 +1236,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
 			"0x%x data 0x%llx\n", msr, data);
 		break;
-	case MSR_K7_CLK_CTL:
-		/*
-		 * Ignore all writes to this no longer documented MSR.
-		 * Writes are only relevant for old K7 processors,
-		 * all pre-dating SVM, but a recommended workaround from
-		 * AMD for these chips. It is possible to speicify the
-		 * affected processor models on the command line, hence
-		 * the need to ignore the workaround.
-		 */
-		break;
 	case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
 		if (kvm_hv_msr_partition_wide(msr)) {
 			int r;
@@ -1814,20 +1428,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case 0xcd: /* fsb frequency */
 		data = 3;
 		break;
-		/*
-		 * MSR_EBC_FREQUENCY_ID
-		 * Conservative value valid for even the basic CPU models.
-		 * Models 0,1: 000 in bits 23:21 indicating a bus speed of
-		 * 100MHz, model 2 000 in bits 18:16 indicating 100MHz,
-		 * and 266MHz for model 3, or 4. Set Core Clock
-		 * Frequency to System Bus Frequency Ratio to 1 (bits
-		 * 31:24) even though these are only valid for CPU
-		 * models > 2, however guests may end up dividing or
-		 * multiplying by zero otherwise.
-		 */
-	case MSR_EBC_FREQUENCY_ID:
-		data = 1 << 24;
-		break;
 	case MSR_IA32_APICBASE:
 		data = kvm_get_apic_base(vcpu);
 		break;
@@ -1847,16 +1447,11 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 		data = vcpu->arch.efer;
 		break;
 	case MSR_KVM_WALL_CLOCK:
-	case MSR_KVM_WALL_CLOCK_NEW:
 		data = vcpu->kvm->arch.wall_clock;
 		break;
 	case MSR_KVM_SYSTEM_TIME:
-	case MSR_KVM_SYSTEM_TIME_NEW:
 		data = vcpu->arch.time;
 		break;
-	case MSR_KVM_ASYNC_PF_EN:
-		data = vcpu->arch.apf.msr_val;
-		break;
 	case MSR_IA32_P5_MC_ADDR:
 	case MSR_IA32_P5_MC_TYPE:
 	case MSR_IA32_MCG_CAP:
@@ -1864,18 +1459,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case MSR_IA32_MCG_STATUS:
 	case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
 		return get_msr_mce(vcpu, msr, pdata);
-	case MSR_K7_CLK_CTL:
-		/*
-		 * Provide expected ramp-up count for K7. All other
-		 * are set to zero, indicating minimum divisors for
-		 * every field.
-		 *
-		 * This prevents guest kernels on AMD host with CPU
-		 * type 6, model 8 and higher from exploding due to
-		 * the rdmsr failing.
-		 */
-		data = 0x20000000;
-		break;
 	case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
 		if (kvm_hv_msr_partition_wide(msr)) {
 			int r;
@@ -1913,11 +1496,15 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
 {
 	int i, idx;
 
-	idx = srcu_read_lock(&vcpu->kvm->srcu);
+	vcpu_load(vcpu);
+
+	idx = kvm_srcu_read_lock(&vcpu->kvm->srcu);
 	for (i = 0; i < msrs->nmsrs; ++i)
 		if (do_msr(vcpu, entries[i].index, &entries[i].data))
 			break;
-	srcu_read_unlock(&vcpu->kvm->srcu, idx);
+	kvm_srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+	vcpu_put(vcpu);
 
 	return i;
 }
@@ -1947,7 +1534,7 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *user_msrs,
 	r = -ENOMEM;
 	size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
-	entries = kmalloc(size, GFP_KERNEL);
+	entries = vmalloc(size);
 	if (!entries)
 		goto out;
 
@@ -1966,7 +1553,7 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *user_msrs,
 	r = n;
 
 out_free:
-	kfree(entries);
+	vfree(entries);
 out:
 	return r;
 }
@@ -1988,7 +1575,6 @@ int kvm_dev_ioctl_check_extension(long ext)
 #ifdef CONFIG_MMU_NOTIFIER
 	case KVM_CAP_SYNC_MMU:
 #endif
-	case KVM_CAP_USER_NMI:
 	case KVM_CAP_REINJECT_CONTROL:
 	case KVM_CAP_IRQ_INJECT_STATUS:
 	case KVM_CAP_ASSIGN_DEV_IRQ:
@@ -2006,12 +1592,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_HYPERV_VAPIC:
 	case KVM_CAP_HYPERV_SPIN:
 	case KVM_CAP_PCI_SEGMENT:
-	case KVM_CAP_DEBUGREGS:
 	case KVM_CAP_X86_ROBUST_SINGLESTEP:
-	case KVM_CAP_XSAVE:
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,38)
-	case KVM_CAP_ASYNC_PF:
-#endif
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
@@ -2035,9 +1616,6 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_MCE:
 		r = KVM_MAX_MCE_BANKS;
 		break;
-	case KVM_CAP_XCRS:
-		r = kvm_cpu_has_xsave;
-		break;
 	default:
 		r = 0;
 		break;
@@ -2114,51 +1692,22 @@ out:
 	return r;
 }
 
-static void wbinvd_ipi(void *garbage)
-{
-	wbinvd();
-}
-
-static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
-{
-	return vcpu->kvm->arch.iommu_domain &&
-		!(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY);
-}
-
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
-	/* Address WBINVD may be executed by guest */
-	if (need_emulate_wbinvd(vcpu)) {
-		if (kvm_x86_ops->has_wbinvd_exit())
-			cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
-		else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
-			smp_call_function_single(vcpu->cpu,
-					wbinvd_ipi, NULL, 1);
-	}
-
 	kvm_x86_ops->vcpu_load(vcpu, cpu);
-	if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) {
-		/* Make sure TSC doesn't go backwards */
-		s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
-				kvm_native_read_tsc() - vcpu->arch.last_host_tsc;
-		if (tsc_delta < 0)
-			mark_tsc_unstable("KVM discovered backwards TSC");
-		if (kvm_check_tsc_unstable()) {
-			kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta);
-			vcpu->arch.tsc_catchup = 1;
-			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
-		}
-		if (vcpu->cpu != cpu)
-			kvm_migrate_timers(vcpu);
-		vcpu->cpu = cpu;
+	if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
+		unsigned long khz = cpufreq_quick_get(cpu);
+		if (!khz)
+			khz = tsc_khz;
+		per_cpu(cpu_tsc_khz, cpu) = khz;
 	}
+	kvm_request_guest_time_update(vcpu);
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
-	kvm_x86_ops->vcpu_put(vcpu);
 	kvm_put_guest_fpu(vcpu);
-	vcpu->arch.last_host_tsc = kvm_native_read_tsc();
+	kvm_x86_ops->vcpu_put(vcpu);
 }
 
 static int is_efer_nx(void)
@@ -2207,6 +1756,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
 	if (copy_from_user(cpuid_entries, entries,
 			   cpuid->nent * sizeof(struct kvm_cpuid_entry)))
 		goto out_free;
+	vcpu_load(vcpu);
 	for (i = 0; i < cpuid->nent; i++) {
 		vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
 		vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
@@ -2224,7 +1774,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
 	r = 0;
 	kvm_apic_set_version(vcpu);
 	kvm_x86_ops->cpuid_update(vcpu);
-	update_cpuid(vcpu);
+	vcpu_put(vcpu);
 
 out_free:
 	vfree(cpuid_entries);
@@ -2245,10 +1795,11 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
 	if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
 			   cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
 		goto out;
+	vcpu_load(vcpu);
 	vcpu->arch.cpuid_nent = cpuid->nent;
 	kvm_apic_set_version(vcpu);
 	kvm_x86_ops->cpuid_update(vcpu);
-	update_cpuid(vcpu);
+	vcpu_put(vcpu);
 	return 0;
 
 out:
@@ -2275,11 +1826,6 @@ out:
 	return r;
 }
 
-static void cpuid_mask(u32 *word, int wordnum)
-{
-	*word &= boot_cpu_data.x86_capability[wordnum];
-}
-
 static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 			   u32 index)
 {
@@ -2328,20 +1874,19 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
 	/* cpuid 1.ecx */
 	const u32 kvm_supported_word4_x86_features =
-		F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
+		F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ |
 		0 /* DS-CPL, VMX, SMX, EST */ |
 		0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
 		0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
 		0 /* Reserved, DCA */ | F(XMM4_1) |
 		F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
-		0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
-		F(F16C);
+		0 /* Reserved, XSAVE, OSXSAVE */;
 	/* cpuid 0x80000001.ecx */
 	const u32 kvm_supported_word6_x86_features =
-		F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
+		F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
 		F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
-		F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) |
-		0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
+		F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) |
+		0 /* SKINIT */ | 0 /* WDT */;
 
 	/* all calls to cpuid_count() should be made on the same cpu */
 	get_cpu();
@@ -2350,13 +1895,11 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
 	switch (function) {
 	case 0:
-		entry->eax = min(entry->eax, (u32)0xd);
+		entry->eax = min(entry->eax, (u32)0xb);
 		break;
 	case 1:
 		entry->edx &= kvm_supported_word0_x86_features;
-		cpuid_mask(&entry->edx, 0);
 		entry->ecx &= kvm_supported_word4_x86_features;
-		cpuid_mask(&entry->ecx, 4);
 		/* we support x2apic emulation even if host does not support
 		 * it since we emulate x2apic in software */
 		entry->ecx |= F(X2APIC);
@@ -2410,51 +1953,14 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		}
 		break;
 	}
-	case 0xd: {
-		int i;
-
-		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-		for (i = 1; *nent < maxnent; ++i) {
-			if (entry[i - 1].eax == 0 && i != 2)
-				break;
-			do_cpuid_1_ent(&entry[i], function, i);
-			entry[i].flags |=
-			       KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-			++*nent;
-		}
-		break;
-	}
-	case KVM_CPUID_SIGNATURE: {
-		char signature[12] = "KVMKVMKVM\0\0";
-		u32 *sigptr = (u32 *)signature;
-		entry->eax = 0;
-		entry->ebx = sigptr[0];
-		entry->ecx = sigptr[1];
-		entry->edx = sigptr[2];
-		break;
-	}
-	case KVM_CPUID_FEATURES:
-		entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
-			     (1 << KVM_FEATURE_NOP_IO_DELAY) |
-			     (1 << KVM_FEATURE_CLOCKSOURCE2) |
-			     (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
-		entry->ebx = 0;
-		entry->ecx = 0;
-		entry->edx = 0;
-		break;
 	case 0x80000000:
		entry->eax = min(entry->eax, 0x8000001a);
 		break;
 	case 0x80000001:
 		entry->edx &= kvm_supported_word1_x86_features;
-		cpuid_mask(&entry->edx, 1);
 		entry->ecx &= kvm_supported_word6_x86_features;
-		cpuid_mask(&entry->ecx, 6);
 		break;
 	}
-
-	kvm_x86_ops->set_supported_cpuid(function, entry);
-
 	put_cpu();
 }
@@ -2490,23 +1996,6 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
 	for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
 		do_cpuid_ent(&cpuid_entries[nent], func, 0,
 			     &nent, cpuid->nent);
-
-
-
-	r = -E2BIG;
-	if (nent >= cpuid->nent)
-		goto out_free;
-
-	do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent,
-		     cpuid->nent);
-
-	r = -E2BIG;
-	if (nent >= cpuid->nent)
-		goto out_free;
-
-	do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_FEATURES, 0, &nent,
-		     cpuid->nent);
-
 	r = -E2BIG;
 	if (nent >= cpuid->nent)
 		goto out_free;
@@ -2527,7 +2016,9 @@ out:
 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
 				    struct kvm_lapic_state *s)
 {
+	vcpu_load(vcpu);
 	memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
+	vcpu_put(vcpu);
 
 	return 0;
 }
@@ -2535,9 +2026,11 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
 				    struct kvm_lapic_state *s)
 {
+	vcpu_load(vcpu);
 	memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
 	kvm_apic_post_state_restore(vcpu);
 	update_cr8_intercept(vcpu);
+	vcpu_put(vcpu);
 
 	return 0;
 }
@@ -2549,16 +2042,20 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
 		return -EINVAL;
 	if (irqchip_in_kernel(vcpu->kvm))
 		return -ENXIO;
+	vcpu_load(vcpu);
 
 	kvm_queue_interrupt(vcpu, irq->irq, false);
-	kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+	vcpu_put(vcpu);
 
 	return 0;
 }
 
 static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
 {
+	vcpu_load(vcpu);
 	kvm_inject_nmi(vcpu);
+	vcpu_put(vcpu);
 
 	return 0;
 }
@@ -2624,7 +2121,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
 			printk(KERN_DEBUG "kvm: set_mce: "
 			       "injects mce exception while "
 			       "previous one is in progress!\n");
-			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+			set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
 			return 0;
 		}
 		if (banks[1] & MCI_STATUS_VAL)
@@ -2649,43 +2146,38 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
 static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
 					       struct kvm_vcpu_events *events)
 {
-	events->exception.injected =
-		vcpu->arch.exception.pending &&
-		!kvm_exception_is_soft(vcpu->arch.exception.nr);
+	vcpu_load(vcpu);
+
+	events->exception.injected = vcpu->arch.exception.pending;
 	events->exception.nr = vcpu->arch.exception.nr;
 	events->exception.has_error_code = vcpu->arch.exception.has_error_code;
-	events->exception.pad = 0;
 	events->exception.error_code = vcpu->arch.exception.error_code;
 
-	events->interrupt.injected =
-		vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft;
+	events->interrupt.injected = vcpu->arch.interrupt.pending;
 	events->interrupt.nr = vcpu->arch.interrupt.nr;
-	events->interrupt.soft = 0;
-	events->interrupt.shadow =
-		kvm_x86_ops->get_interrupt_shadow(vcpu,
-			KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI);
+	events->interrupt.soft = vcpu->arch.interrupt.soft;
 
 	events->nmi.injected = vcpu->arch.nmi_injected;
 	events->nmi.pending = vcpu->arch.nmi_pending;
 	events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
-	events->nmi.pad = 0;
 
 	events->sipi_vector = vcpu->arch.sipi_vector;
 
 	events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
-			 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
-			 | KVM_VCPUEVENT_VALID_SHADOW);
-	memset(&events->reserved, 0, sizeof(events->reserved));
+			 | KVM_VCPUEVENT_VALID_SIPI_VECTOR);
+
+	vcpu_put(vcpu);
 }
 
 static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 					      struct kvm_vcpu_events *events)
 {
 	if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
-			      | KVM_VCPUEVENT_VALID_SIPI_VECTOR
-			      | KVM_VCPUEVENT_VALID_SHADOW))
+			      | KVM_VCPUEVENT_VALID_SIPI_VECTOR))
 		return -EINVAL;
 
+	vcpu_load(vcpu);
+
 	vcpu->arch.exception.pending = events->exception.injected;
 	vcpu->arch.exception.nr = events->exception.nr;
 	vcpu->arch.exception.has_error_code = events->exception.has_error_code;
@@ -2696,9 +2188,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 	vcpu->arch.interrupt.soft = events->interrupt.soft;
 	if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm))
 		kvm_pic_clear_isr_ack(vcpu->kvm);
-	if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
-		kvm_x86_ops->set_interrupt_shadow(vcpu,
-						  events->interrupt.shadow);
 
 	vcpu->arch.nmi_injected = events->nmi.injected;
 	if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
@@ -2708,134 +2197,34 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 	if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
 		vcpu->arch.sipi_vector = events->sipi_vector;
 
-	kvm_make_request(KVM_REQ_EVENT, vcpu);
-
-	return 0;
-}
-
-static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
-					     struct kvm_debugregs *dbgregs)
-{
-	memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
-	dbgregs->dr6 = vcpu->arch.dr6;
-	dbgregs->dr7 = vcpu->arch.dr7;
-	dbgregs->flags = 0;
-	memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
-}
-
-static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
-					    struct kvm_debugregs *dbgregs)
-{
-	if (dbgregs->flags)
-		return -EINVAL;
-
-	memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
-	vcpu->arch.dr6 = dbgregs->dr6;
-	vcpu->arch.dr7 = dbgregs->dr7;
+	vcpu_put(vcpu);
 
 	return 0;
 }
 
-static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
-					 struct kvm_xsave *guest_xsave)
-{
-	if (kvm_cpu_has_xsave)
-		memcpy(guest_xsave->region,
-			&vcpu->arch.guest_fpu.state->xsave,
-			kvm_xstate_size);
-	else {
-		memcpy(guest_xsave->region,
-			&vcpu->arch.guest_fpu.state->fxsave,
-			sizeof(struct kvm_i387_fxsave_struct));
-		*(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
-			XSTATE_FPSSE;
-	}
-}
-
-static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
-					struct kvm_xsave *guest_xsave)
-{
-	u64 xstate_bv =
-		*(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
-
-	if (kvm_cpu_has_xsave)
-		memcpy(&vcpu->arch.guest_fpu.state->xsave,
-			guest_xsave->region, kvm_xstate_size);
-	else {
-		if (xstate_bv & ~XSTATE_FPSSE)
-			return -EINVAL;
-		memcpy(&vcpu->arch.guest_fpu.state->fxsave,
-			guest_xsave->region, sizeof(struct kvm_i387_fxsave_struct));
-	}
-	return 0;
-}
-
-static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
-					struct kvm_xcrs *guest_xcrs)
-{
-	if (!kvm_cpu_has_xsave) {
-		guest_xcrs->nr_xcrs = 0;
-		return;
-	}
-
-	guest_xcrs->nr_xcrs = 1;
-	guest_xcrs->flags = 0;
-	guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
-	guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
-}
-
-static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
-				       struct kvm_xcrs *guest_xcrs)
-{
-	int i, r = 0;
-
-	if (!kvm_cpu_has_xsave)
-		return -EINVAL;
-
-	if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
-		return -EINVAL;
-
-	for (i = 0; i < guest_xcrs->nr_xcrs; i++)
-		/* Only support XCR0 currently */
-		if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) {
-			r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
-				guest_xcrs->xcrs[0].value);
-			break;
-		}
-	if (r)
-		r = -EINVAL;
-	return r;
-}
-
 long kvm_arch_vcpu_ioctl(struct file *filp,
 			 unsigned int ioctl, unsigned long arg)
 {
 	struct kvm_vcpu *vcpu = filp->private_data;
 	void *argp = (void *)arg;
 	int r;
-	union {
-		struct kvm_lapic_state *lapic;
-		struct kvm_xsave *xsave;
-		struct kvm_xcrs *xcrs;
-		void *buffer;
-	} u;
+	struct kvm_lapic_state *lapic = NULL;
 
-	u.buffer = NULL;
 	switch (ioctl) {
 	case KVM_GET_LAPIC: {
 		r = -EINVAL;
 		if (!vcpu->arch.apic)
 			goto out;
-		u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
+		lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
 
 		r = -ENOMEM;
-		if (!u.lapic)
+		if (!lapic)
 			goto out;
-		r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
+		r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic);
 		if (r)
 			goto out;
 		r = -EFAULT;
-		if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
+		if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state)))
 			goto out;
 		r = 0;
 		break;
@@ -2844,14 +2233,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = -EINVAL;
 		if (!vcpu->arch.apic)
 			goto out;
-		u.lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
+		lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
 		r = -ENOMEM;
-		if (!u.lapic)
+		if (!lapic)
 			goto out;
 		r = -EFAULT;
-		if (copy_from_user(u.lapic, argp, sizeof(struct kvm_lapic_state)))
+		if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state)))
 			goto out;
-		r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
+		r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic);
 		if (r)
 			goto out;
 		r = 0;
@@ -2991,90 +2380,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
 		break;
 	}
-	case KVM_GET_DEBUGREGS: {
-		struct kvm_debugregs dbgregs;
-
-		kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
-
-		r = -EFAULT;
-		if (copy_to_user(argp, &dbgregs,
-				 sizeof(struct kvm_debugregs)))
-			break;
-		r = 0;
-		break;
-	}
-	case KVM_SET_DEBUGREGS: {
-		struct kvm_debugregs dbgregs;
-
-		r = -EFAULT;
-		if (copy_from_user(&dbgregs, argp,
-				   sizeof(struct kvm_debugregs)))
-			break;
-
-		r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
-		break;
-	}
-	case KVM_GET_XSAVE: {
-		u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
-		r = -ENOMEM;
-		if (!u.xsave)
-			break;
-
-		kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
-
-		r = -EFAULT;
-		if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
-			break;
-		r = 0;
-		break;
-	}
-	case KVM_SET_XSAVE: {
-		u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
-		r = -ENOMEM;
-		if (!u.xsave)
-			break;
-
-		r = -EFAULT;
-		if (copy_from_user(u.xsave, argp, sizeof(struct kvm_xsave)))
-			break;
-
-		r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
-		break;
-	}
-	case KVM_GET_XCRS: {
-		u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
-		r = -ENOMEM;
-		if (!u.xcrs)
-			break;
-
-		kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
-
-		r = -EFAULT;
-		if (copy_to_user(argp, u.xcrs,
-				 sizeof(struct kvm_xcrs)))
-			break;
-		r = 0;
-		break;
-	}
-	case KVM_SET_XCRS: {
-		u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
-		r = -ENOMEM;
-		if (!u.xcrs)
-			break;
-
-		r = -EFAULT;
-		if (copy_from_user(u.xcrs, argp,
-				   sizeof(struct kvm_xcrs)))
-			break;
-
-		r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
-		break;
-	}
 	default:
 		r = -EINVAL;
 	}
 out:
-	kfree(u.buffer);
+	kfree(lapic);
 	return r;
 }
@@ -3114,7 +2424,116 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
 
 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
 {
-	return kvm->arch.n_max_mmu_pages;
+	return kvm->arch.n_alloc_mmu_pages;
+}
+
+gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn)
+{
+	int i;
+	struct kvm_mem_alias *alias;
+	struct kvm_mem_aliases *aliases;
+
+	aliases = rcu_dereference(kvm->arch.aliases);
+
+	for (i = 0; i < aliases->naliases; ++i) {
+		alias = &aliases->aliases[i];
+		if (alias->flags & KVM_ALIAS_INVALID)
+			continue;
+		if (gfn >= alias->base_gfn
+		    && gfn < alias->base_gfn + alias->npages)
+			return alias->target_gfn + gfn - alias->base_gfn;
+	}
+	return gfn;
+}
+
+gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
+{
+	int i;
+	struct kvm_mem_alias *alias;
+	struct kvm_mem_aliases *aliases;
+
+	aliases = rcu_dereference(kvm->arch.aliases);
+
+	for (i = 0; i < aliases->naliases; ++i) {
+		alias = &aliases->aliases[i];
+		if (gfn >= alias->base_gfn
+		    && gfn < alias->base_gfn + alias->npages)
+			return alias->target_gfn + gfn - alias->base_gfn;
+	}
+	return gfn;
+}
+
+/*
+ * Set a new alias region.  Aliases map a portion of physical memory into
+ * another portion.  This is useful for memory windows, for example the PC
+ * VGA region.
+ */ +static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, + struct kvm_memory_alias *alias) +{ + int r, n; + struct kvm_mem_alias *p; + struct kvm_mem_aliases *aliases, *old_aliases; + + r = -EINVAL; + /* General sanity checks */ + if (alias->memory_size & (PAGE_SIZE - 1)) + goto out; + if (alias->guest_phys_addr & (PAGE_SIZE - 1)) + goto out; + if (alias->slot >= KVM_ALIAS_SLOTS) + goto out; + if (alias->guest_phys_addr + alias->memory_size + < alias->guest_phys_addr) + goto out; + if (alias->target_phys_addr + alias->memory_size + < alias->target_phys_addr) + goto out; + + r = -ENOMEM; + aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); + if (!aliases) + goto out; + + mutex_lock(&kvm->slots_lock); + + /* invalidate any gfn reference in case of deletion/shrinking */ + memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases)); + aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID; + old_aliases = kvm->arch.aliases; + rcu_assign_pointer(kvm->arch.aliases, aliases); + kvm_synchronize_srcu_expedited(&kvm->srcu); + kvm_mmu_zap_all(kvm); + kfree(old_aliases); + + r = -ENOMEM; + aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); + if (!aliases) + goto out_unlock; + + memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases)); + + p = &aliases->aliases[alias->slot]; + p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; + p->npages = alias->memory_size >> PAGE_SHIFT; + p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; + p->flags &= ~(KVM_ALIAS_INVALID); + + for (n = KVM_ALIAS_SLOTS; n > 0; --n) + if (aliases->aliases[n - 1].npages) + break; + aliases->naliases = n; + + old_aliases = kvm->arch.aliases; + rcu_assign_pointer(kvm->arch.aliases, aliases); + kvm_synchronize_srcu_expedited(&kvm->srcu); + kfree(old_aliases); + r = 0; + +out_unlock: + mutex_unlock(&kvm->slots_lock); +out: + return r; } static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) @@ -3150,18 +2569,18 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) r = 0; switch (chip->chip_id) { case KVM_IRQCHIP_PIC_MASTER: - spin_lock(&pic_irqchip(kvm)->lock); + raw_spin_lock(&pic_irqchip(kvm)->lock); memcpy(&pic_irqchip(kvm)->pics[0], &chip->chip.pic, sizeof(struct kvm_pic_state)); - spin_unlock(&pic_irqchip(kvm)->lock); + raw_spin_unlock(&pic_irqchip(kvm)->lock); break; case KVM_IRQCHIP_PIC_SLAVE: - spin_lock(&pic_irqchip(kvm)->lock); + raw_spin_lock(&pic_irqchip(kvm)->lock); memcpy(&pic_irqchip(kvm)->pics[1], &chip->chip.pic, sizeof(struct kvm_pic_state)); - spin_unlock(&pic_irqchip(kvm)->lock); + raw_spin_unlock(&pic_irqchip(kvm)->lock); break; case KVM_IRQCHIP_IOAPIC: r = kvm_set_ioapic(kvm, &chip->chip.ioapic); @@ -3204,7 +2623,6 @@ static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) sizeof(ps->channels)); ps->flags = kvm->arch.vpit->pit_state.flags; mutex_unlock(&kvm->arch.vpit->pit_state.lock); - memset(&ps->reserved, 0, sizeof(ps->reserved)); return r; } @@ -3246,6 +2664,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot; unsigned long n; unsigned long is_dirty = 0; + unsigned long *dirty_bitmap = NULL; mutex_lock(&kvm->slots_lock); @@ -3260,47 +2679,42 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, n = kvm_dirty_bitmap_bytes(memslot); + r = -ENOMEM; + dirty_bitmap = vmalloc(n); + if (!dirty_bitmap) + goto out; + memset(dirty_bitmap, 0, n); + for (i = 0; !is_dirty && i < n/sizeof(long); i++) is_dirty = memslot->dirty_bitmap[i]; /* If nothing is dirty, don't 
bother messing with page tables. */ if (is_dirty) { struct kvm_memslots *slots, *old_slots; - unsigned long *dirty_bitmap; - dirty_bitmap = memslot->dirty_bitmap_head; - if (memslot->dirty_bitmap == dirty_bitmap) - dirty_bitmap += n / sizeof(long); - memset(dirty_bitmap, 0, n); + spin_lock(&kvm->mmu_lock); + kvm_mmu_slot_remove_write_access(kvm, log->slot); + spin_unlock(&kvm->mmu_lock); - r = -ENOMEM; slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); if (!slots) - goto out; + goto out_free; + memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; - slots->generation++; old_slots = kvm->memslots; rcu_assign_pointer(kvm->memslots, slots); kvm_synchronize_srcu_expedited(&kvm->srcu); dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; kfree(old_slots); - - spin_lock(&kvm->mmu_lock); - kvm_mmu_slot_remove_write_access(kvm, log->slot); - spin_unlock(&kvm->mmu_lock); - - r = -EFAULT; - if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) - goto out; - } else { - r = -EFAULT; - if (clear_user(log->dirty_bitmap, n)) - goto out; } r = 0; + if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) + r = -EFAULT; +out_free: + vfree(dirty_bitmap); out: mutex_unlock(&kvm->slots_lock); return r; @@ -3320,6 +2734,7 @@ long kvm_arch_vm_ioctl(struct file *filp, union { struct kvm_pit_state ps; struct kvm_pit_state2 ps2; + struct kvm_memory_alias alias; struct kvm_pit_config pit_config; } u; @@ -3340,6 +2755,22 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; break; } + case KVM_SET_MEMORY_REGION: { + struct kvm_memory_region kvm_mem; + struct kvm_userspace_memory_region kvm_userspace_mem; + + r = -EFAULT; + if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) + goto out; + kvm_userspace_mem.slot = kvm_mem.slot; + kvm_userspace_mem.flags = kvm_mem.flags; + kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr; + kvm_userspace_mem.memory_size = kvm_mem.memory_size; + r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0); + if (r) + goto out; + break; + } case KVM_SET_NR_MMU_PAGES: r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); if (r) @@ -3348,6 +2779,14 @@ long kvm_arch_vm_ioctl(struct file *filp, case KVM_GET_NR_MMU_PAGES: r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); break; + case KVM_SET_MEMORY_ALIAS: + r = -EFAULT; + if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias))) + goto out; + r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias); + if (r) + goto out; + break; case KVM_CREATE_IRQCHIP: { struct kvm_pic *vpic; @@ -3360,10 +2799,8 @@ long kvm_arch_vm_ioctl(struct file *filp, if (vpic) { r = kvm_ioapic_init(kvm); if (r) { - mutex_lock(&kvm->slots_lock); kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev); - mutex_unlock(&kvm->slots_lock); kfree(vpic); goto create_irqchip_unlock; } @@ -3374,12 +2811,10 @@ long kvm_arch_vm_ioctl(struct file *filp, smp_wmb(); r = kvm_setup_default_irq_routing(kvm); if (r) { - mutex_lock(&kvm->slots_lock); mutex_lock(&kvm->irq_lock); kvm_ioapic_destroy(kvm); kvm_destroy_pic(kvm); mutex_unlock(&kvm->irq_lock); - mutex_unlock(&kvm->slots_lock); } create_irqchip_unlock: mutex_unlock(&kvm->lock); @@ -3412,13 +2847,11 @@ long kvm_arch_vm_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&irq_event, argp, sizeof irq_event)) goto out; - r = -ENXIO; if (irqchip_in_kernel(kvm)) { __s32 status; status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irq_event.irq, irq_event.level); if (ioctl == KVM_IRQ_LINE_STATUS) { - r = -EFAULT; irq_event.status = status; if 
(copy_to_user(argp, &irq_event, sizeof irq_event)) @@ -3555,6 +2988,7 @@ long kvm_arch_vm_ioctl(struct file *filp, break; } case KVM_SET_CLOCK: { + struct timespec now; struct kvm_clock_data user_ns; u64 now_ns; s64 delta; @@ -3568,23 +3002,21 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; r = 0; - local_irq_disable(); - now_ns = get_kernel_ns(); + ktime_get_ts(&now); + now_ns = timespec_to_ns(&now); delta = user_ns.clock - now_ns; - local_irq_enable(); kvm->arch.kvmclock_offset = delta; break; } case KVM_GET_CLOCK: { + struct timespec now; struct kvm_clock_data user_ns; u64 now_ns; - local_irq_disable(); - now_ns = get_kernel_ns(); + ktime_get_ts(&now); + now_ns = timespec_to_ns(&now); user_ns.clock = kvm->arch.kvmclock_offset + now_ns; - local_irq_enable(); user_ns.flags = 0; - memset(&user_ns.pad, 0, sizeof(user_ns.pad)); r = -EFAULT; if (copy_to_user(argp, &user_ns, sizeof(user_ns))) @@ -3635,86 +3067,52 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); } -static void kvm_set_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg) -{ - kvm_x86_ops->set_segment(vcpu, var, seg); -} - -void kvm_get_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg) -{ - kvm_x86_ops->get_segment(vcpu, var, seg); -} - -static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) -{ - return gpa; -} - -static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) -{ - gpa_t t_gpa; - struct x86_exception exception; - - BUG_ON(!mmu_is_nested(vcpu)); - - /* NPT walks are always user-walks */ - access |= PFERR_USER_MASK; - t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &exception); - - return t_gpa; -} - -gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, - struct x86_exception *exception) +gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) { u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; - return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); } - gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, - struct x86_exception *exception) + gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) { u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; access |= PFERR_FETCH_MASK; - return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); } -gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, - struct x86_exception *exception) +gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) { u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? 
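/*
 * [Editor's sketch] The KVM_SET_CLOCK/KVM_GET_CLOCK handlers above store
 * only a signed delta between the clock userspace requested and the host
 * clock, and add it back on read. A self-contained demonstration of the
 * arithmetic (values are made up):
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        int64_t host_ns  = 5000000000;  /* pretend ktime_get_ts() result */
        int64_t guest_ns = 1000000000;  /* clock value userspace wants   */
        int64_t kvmclock_offset = guest_ns - host_ns;  /* KVM_SET_CLOCK  */

        /* later, at KVM_GET_CLOCK time, the host clock has advanced */
        host_ns += 250000000;
        printf("guest clock now: %lld ns\n",
               (long long)(kvmclock_offset + host_ns)); /* 1250000000 */
        return 0;
}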
PFERR_USER_MASK : 0; access |= PFERR_WRITE_MASK; - return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); } /* uses this to access any guest's mapped memory without checking CPL */ -gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, - struct x86_exception *exception) +gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) { - return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception); + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error); } static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, struct kvm_vcpu *vcpu, u32 access, - struct x86_exception *exception) + u32 *error) { void *data = val; int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access, - exception); + gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error); unsigned offset = addr & (PAGE_SIZE-1); unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; - if (gpa == UNMAPPED_GVA) - return X86EMUL_PROPAGATE_FAULT; + if (gpa == UNMAPPED_GVA) { + r = X86EMUL_PROPAGATE_FAULT; + goto out; + } ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); if (ret < 0) { - r = X86EMUL_IO_NEEDED; + r = X86EMUL_UNHANDLEABLE; goto out; } @@ -3728,52 +3126,46 @@ out: /* used for instruction fetching */ static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, - struct x86_exception *exception) + struct kvm_vcpu *vcpu, u32 *error) { u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, - access | PFERR_FETCH_MASK, - exception); + access | PFERR_FETCH_MASK, error); } static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, - struct x86_exception *exception) + struct kvm_vcpu *vcpu, u32 *error) { u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? 
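/*
 * [Editor's sketch] The gva_to_gpa helpers above differ only in the access
 * mask handed to the MMU walker: the user bit when CPL == 3, plus a fetch
 * or write bit. The values below mirror the architectural x86 page-fault
 * error-code bits (stated from the architecture, not from this patch):
 */
#include <stdio.h>
#include <stdint.h>

#define PFERR_WRITE_MASK (1U << 1)
#define PFERR_USER_MASK  (1U << 2)
#define PFERR_FETCH_MASK (1U << 4)

static uint32_t access_mask(int cpl, int is_write, int is_fetch)
{
        uint32_t access = (cpl == 3) ? PFERR_USER_MASK : 0;

        if (is_write)
                access |= PFERR_WRITE_MASK;
        if (is_fetch)
                access |= PFERR_FETCH_MASK;
        return access;
}

int main(void)
{
        printf("user write:   %#x\n", access_mask(3, 1, 0)); /* 0x6  */
        printf("kernel fetch: %#x\n", access_mask(0, 0, 1)); /* 0x10 */
        return 0;
}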
PFERR_USER_MASK : 0; return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, - exception); + error); } static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, - struct x86_exception *exception) + struct kvm_vcpu *vcpu, u32 *error) { - return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception); + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error); } -static int kvm_write_guest_virt_system(gva_t addr, void *val, - unsigned int bytes, - struct kvm_vcpu *vcpu, - struct x86_exception *exception) +static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, u32 *error) { void *data = val; int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, - PFERR_WRITE_MASK, - exception); + gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error); unsigned offset = addr & (PAGE_SIZE-1); unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; - if (gpa == UNMAPPED_GVA) - return X86EMUL_PROPAGATE_FAULT; + if (gpa == UNMAPPED_GVA) { + r = X86EMUL_PROPAGATE_FAULT; + goto out; + } ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); if (ret < 0) { - r = X86EMUL_IO_NEEDED; + r = X86EMUL_UNHANDLEABLE; goto out; } @@ -3785,13 +3177,14 @@ out: return r; } + static int emulator_read_emulated(unsigned long addr, void *val, unsigned int bytes, - struct x86_exception *exception, struct kvm_vcpu *vcpu) { gpa_t gpa; + u32 error_code; if (vcpu->mmio_read_completed) { memcpy(val, vcpu->mmio_data, bytes); @@ -3801,17 +3194,19 @@ static int emulator_read_emulated(unsigned long addr, return X86EMUL_CONTINUE; } - gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, exception); + gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code); - if (gpa == UNMAPPED_GVA) + if (gpa == UNMAPPED_GVA) { + kvm_inject_page_fault(vcpu, addr, error_code); return X86EMUL_PROPAGATE_FAULT; + } /* For APIC access vmexit */ if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) goto mmio; - if (kvm_read_guest_virt(addr, val, bytes, vcpu, exception) - == X86EMUL_CONTINUE) + if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL) + == X86EMUL_CONTINUE) return X86EMUL_CONTINUE; mmio: @@ -3826,16 +3221,15 @@ mmio: trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); vcpu->mmio_needed = 1; - vcpu->run->exit_reason = KVM_EXIT_MMIO; - vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; - vcpu->run->mmio.len = vcpu->mmio_size = bytes; - vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0; + vcpu->mmio_phys_addr = gpa; + vcpu->mmio_size = bytes; + vcpu->mmio_is_write = 0; - return X86EMUL_IO_NEEDED; + return X86EMUL_UNHANDLEABLE; } int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, - const void *val, int bytes) + const void *val, int bytes) { int ret; @@ -3849,15 +3243,17 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, static int emulator_write_emulated_onepage(unsigned long addr, const void *val, unsigned int bytes, - struct x86_exception *exception, struct kvm_vcpu *vcpu) { gpa_t gpa; + u32 error_code; - gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code); - if (gpa == UNMAPPED_GVA) + if (gpa == UNMAPPED_GVA) { + kvm_inject_page_fault(vcpu, addr, error_code); return X86EMUL_PROPAGATE_FAULT; + } /* For APIC access vmexit */ if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) @@ -3875,185 +3271,72 @@ mmio: return X86EMUL_CONTINUE; vcpu->mmio_needed = 1; - vcpu->run->exit_reason = KVM_EXIT_MMIO; - 
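/*
 * [Editor's sketch] kvm_read_guest_virt_helper() and kvm_write_guest_virt()
 * split a virtual range at page boundaries, since each guest page may map
 * to an unrelated gpa. The chunking arithmetic in isolation:
 */
#include <stdio.h>

#define PAGE_SIZE 4096UL

int main(void)
{
        unsigned long addr = 0x1ff0;    /* 16 bytes before a page boundary */
        unsigned bytes = 40;

        while (bytes) {
                unsigned offset = addr & (PAGE_SIZE - 1);
                unsigned chunk = bytes < PAGE_SIZE - offset
                               ? bytes : (unsigned)(PAGE_SIZE - offset);

                printf("copy %u bytes at %#lx\n", chunk, addr);
                bytes -= chunk;
                addr += chunk;
        }
        return 0;   /* prints: 16 bytes at 0x1ff0, then 24 bytes at 0x2000 */
}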
vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; - vcpu->run->mmio.len = vcpu->mmio_size = bytes; - vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; - memcpy(vcpu->run->mmio.data, val, bytes); + vcpu->mmio_phys_addr = gpa; + vcpu->mmio_size = bytes; + vcpu->mmio_is_write = 1; + memcpy(vcpu->mmio_data, val, bytes); return X86EMUL_CONTINUE; } int emulator_write_emulated(unsigned long addr, - const void *val, - unsigned int bytes, - struct x86_exception *exception, - struct kvm_vcpu *vcpu) + const void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu) { /* Crossing a page boundary? */ if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { int rc, now; now = -addr & ~PAGE_MASK; - rc = emulator_write_emulated_onepage(addr, val, now, exception, - vcpu); + rc = emulator_write_emulated_onepage(addr, val, now, vcpu); if (rc != X86EMUL_CONTINUE) return rc; addr += now; val += now; bytes -= now; } - return emulator_write_emulated_onepage(addr, val, bytes, exception, - vcpu); + return emulator_write_emulated_onepage(addr, val, bytes, vcpu); } - -#define CMPXCHG_TYPE(t, ptr, old, new) \ - (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old)) - -#ifdef CONFIG_X86_64 -# define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new) -#else -# define CMPXCHG64(ptr, old, new) \ - (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old)) -#endif +EXPORT_SYMBOL_GPL(emulator_write_emulated); static int emulator_cmpxchg_emulated(unsigned long addr, const void *old, const void *new, unsigned int bytes, - struct x86_exception *exception, struct kvm_vcpu *vcpu) { - gpa_t gpa; - struct page *page; - char *kaddr; - bool exchanged; - - /* guests cmpxchg8b have to be emulated atomically */ - if (bytes > 8 || (bytes & (bytes - 1))) - goto emul_write; - - gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); - - if (gpa == UNMAPPED_GVA || - (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) - goto emul_write; - - if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) - goto emul_write; - - page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); - if (is_error_page(page)) { - kvm_release_page_clean(page); - goto emul_write; - } - - kaddr = kmap_atomic(page, KM_USER0); - kaddr += offset_in_page(gpa); - switch (bytes) { - case 1: - exchanged = CMPXCHG_TYPE(u8, kaddr, old, new); - break; - case 2: - exchanged = CMPXCHG_TYPE(u16, kaddr, old, new); - break; - case 4: - exchanged = CMPXCHG_TYPE(u32, kaddr, old, new); - break; - case 8: - exchanged = CMPXCHG64(kaddr, old, new); - break; - default: - BUG(); - } - kunmap_atomic(kaddr, KM_USER0); - kvm_release_page_dirty(page); - - if (!exchanged) - return X86EMUL_CMPXCHG_FAILED; - - kvm_mmu_pte_write(vcpu, gpa, new, bytes, 1); - - return X86EMUL_CONTINUE; - -emul_write: printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); +#ifndef CONFIG_X86_64 + /* guests cmpxchg8b have to be emulated atomically */ + if (bytes == 8) { + gpa_t gpa; + struct page *page; + char *kaddr; + u64 val; - return emulator_write_emulated(addr, new, bytes, exception, vcpu); -} - -static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) -{ - /* TODO: String I/O for in kernel device */ - int r; - - if (vcpu->arch.pio.in) - r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port, - vcpu->arch.pio.size, pd); - else - r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, - vcpu->arch.pio.port, vcpu->arch.pio.size, - pd); - return r; -} - - -static int emulator_pio_in_emulated(int size, unsigned short port, void *val, - unsigned int count, struct kvm_vcpu *vcpu) -{ - if 
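/*
 * [Editor's sketch] In emulator_write_emulated() below, "-addr & ~PAGE_MASK"
 * is the classic idiom for "bytes left until the next page boundary" (for
 * an address that is not already aligned), and the XOR test detects a
 * page-crossing access. Both in isolation:
 */
#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

int main(void)
{
        unsigned long addr = 0x2ffe;            /* 2 bytes below a boundary */
        unsigned long now = -addr & ~PAGE_MASK;

        printf("%lu bytes until %#lx\n", now, addr + now); /* 2 until 0x3000 */
        printf("8-byte write crosses: %d\n",
               (int)!!(((addr + 8 - 1) ^ addr) & PAGE_MASK)); /* 1 */
        return 0;
}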
(vcpu->arch.pio.count) - goto data_avail; - - trace_kvm_pio(0, port, size, count); - - vcpu->arch.pio.port = port; - vcpu->arch.pio.in = 1; - vcpu->arch.pio.count = count; - vcpu->arch.pio.size = size; - - if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { - data_avail: - memcpy(val, vcpu->arch.pio_data, size * count); - vcpu->arch.pio.count = 0; - return 1; - } - - vcpu->run->exit_reason = KVM_EXIT_IO; - vcpu->run->io.direction = KVM_EXIT_IO_IN; - vcpu->run->io.size = size; - vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; - vcpu->run->io.count = count; - vcpu->run->io.port = port; + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); - return 0; -} + if (gpa == UNMAPPED_GVA || + (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) + goto emul_write; -static int emulator_pio_out_emulated(int size, unsigned short port, - const void *val, unsigned int count, - struct kvm_vcpu *vcpu) -{ - trace_kvm_pio(1, port, size, count); + if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) + goto emul_write; - vcpu->arch.pio.port = port; - vcpu->arch.pio.in = 0; - vcpu->arch.pio.count = count; - vcpu->arch.pio.size = size; + val = *(u64 *)new; - memcpy(vcpu->arch.pio_data, val, size * count); + page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); - if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { - vcpu->arch.pio.count = 0; - return 1; + kaddr = kmap_atomic(page, KM_USER0); + set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); + kunmap_atomic(kaddr, KM_USER0); + kvm_release_page_dirty(page); } +emul_write: +#endif - vcpu->run->exit_reason = KVM_EXIT_IO; - vcpu->run->io.direction = KVM_EXIT_IO_OUT; - vcpu->run->io.size = size; - vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; - vcpu->run->io.count = count; - vcpu->run->io.port = port; - - return 0; + return emulator_write_emulated(addr, new, bytes, vcpu); } static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) @@ -4067,25 +3350,6 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) return X86EMUL_CONTINUE; } -int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) -{ - if (!need_emulate_wbinvd(vcpu)) - return X86EMUL_CONTINUE; - - if (kvm_x86_ops->has_wbinvd_exit()) { - int cpu = get_cpu(); - - cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); - smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, - wbinvd_ipi, NULL, 1); - put_cpu(); - cpumask_clear(vcpu->arch.wbinvd_dirty_mask); - } else - wbinvd(); - return X86EMUL_CONTINUE; -} -EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); - int emulate_clts(struct kvm_vcpu *vcpu) { kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); @@ -4093,194 +3357,42 @@ int emulate_clts(struct kvm_vcpu *vcpu) return X86EMUL_CONTINUE; } -int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu) -{ - return _kvm_get_dr(vcpu, dr, dest); -} - -int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu) -{ - - return __kvm_set_dr(vcpu, dr, value); -} - -static u64 mk_cr_64(u64 curr_cr, u32 new_val) -{ - return (curr_cr & ~((1ULL << 32) - 1)) | new_val; -} - -static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu) -{ - unsigned long value; - - switch (cr) { - case 0: - value = kvm_read_cr0(vcpu); - break; - case 2: - value = vcpu->arch.cr2; - break; - case 3: - value = kvm_read_cr3(vcpu); - break; - case 4: - value = kvm_read_cr4(vcpu); - break; - case 8: - value = kvm_get_cr8(vcpu); - break; - default: - vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); - return 0; - } - - return value; -} - -static int emulator_set_cr(int cr, unsigned long val, struct 
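/*
 * [Editor's sketch] The reinstated emulator_cmpxchg_emulated() warns
 * "emulating exchange as write" and, for cmpxchg8b, performs an atomic
 * 64-bit store via set_64bit() rather than a true compare-and-exchange.
 * For contrast, a faithful compare-and-exchange can be expressed with the
 * GCC __sync builtin; a hypothetical userspace sketch, not the patch's code:
 */
#include <stdio.h>
#include <stdint.h>

static int emulate_cmpxchg64(uint64_t *word, uint64_t old, uint64_t new)
{
        /* nonzero when the exchange happened atomically */
        return __sync_bool_compare_and_swap(word, old, new);
}

int main(void)
{
        uint64_t word = 42;

        printf("swap ok: %d, word=%llu\n",
               emulate_cmpxchg64(&word, 42, 99), (unsigned long long)word);
        printf("swap ok: %d, word=%llu\n",
               emulate_cmpxchg64(&word, 42, 7), (unsigned long long)word);
        return 0;   /* "1, 99" then "0, 99": second compare fails */
}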
kvm_vcpu *vcpu) -{ - int res = 0; - - switch (cr) { - case 0: - res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); - break; - case 2: - vcpu->arch.cr2 = val; - break; - case 3: - res = kvm_set_cr3(vcpu, val); - break; - case 4: - res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); - break; - case 8: - res = kvm_set_cr8(vcpu, val); - break; - default: - vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); - res = -1; - } - - return res; -} - -static int emulator_get_cpl(struct kvm_vcpu *vcpu) -{ - return kvm_x86_ops->get_cpl(vcpu); -} - -static void emulator_get_gdt(struct kvm_desc_ptr *dt, struct kvm_vcpu *vcpu) -{ - kvm_x86_ops->get_gdt(vcpu, dt); -} - -static void emulator_get_idt(struct kvm_desc_ptr *dt, struct kvm_vcpu *vcpu) -{ - kvm_x86_ops->get_idt(vcpu, dt); -} - -static unsigned long emulator_get_cached_segment_base(int seg, - struct kvm_vcpu *vcpu) +int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) { - return get_segment_base(vcpu, seg); + return kvm_x86_ops->get_dr(ctxt->vcpu, dr, dest); } -static bool emulator_get_cached_descriptor(struct kvm_desc_struct *desc, int seg, - struct kvm_vcpu *vcpu) +int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) { - struct kvm_segment var; - - kvm_get_segment(vcpu, &var, seg); - - if (var.unusable) - return false; + unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; - if (var.g) - var.limit >>= 12; - kvm_set_desc_limit(desc, var.limit); - kvm_set_desc_base(desc, (unsigned long)var.base); - desc->type = var.type; - desc->s = var.s; - desc->dpl = var.dpl; - desc->p = var.present; - desc->avl = var.avl; - desc->l = var.l; - desc->d = var.db; - desc->g = var.g; - - return true; + return kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask); } -static void emulator_set_cached_descriptor(struct kvm_desc_struct *desc, int seg, - struct kvm_vcpu *vcpu) +void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) { - struct kvm_segment var; - - /* needed to preserve selector */ - kvm_get_segment(vcpu, &var, seg); - - var.base = kvm_get_desc_base(desc); - var.limit = kvm_get_desc_limit(desc); - if (desc->g) - var.limit = (var.limit << 12) | 0xfff; - var.type = desc->type; - var.present = desc->p; - var.dpl = desc->dpl; - var.db = desc->d; - var.s = desc->s; - var.l = desc->l; - var.g = desc->g; - var.avl = desc->avl; - var.present = desc->p; - var.unusable = !var.present; - var.padding = 0; - - kvm_set_segment(vcpu, &var, seg); - return; -} + u8 opcodes[4]; + unsigned long rip = kvm_rip_read(vcpu); + unsigned long rip_linear; -static u16 emulator_get_segment_selector(int seg, struct kvm_vcpu *vcpu) -{ - struct kvm_segment kvm_seg; + if (!printk_ratelimit()) + return; - kvm_get_segment(vcpu, &kvm_seg, seg); - return kvm_seg.selector; -} + rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); -static void emulator_set_segment_selector(u16 sel, int seg, - struct kvm_vcpu *vcpu) -{ - struct kvm_segment kvm_seg; + kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL); - kvm_get_segment(vcpu, &kvm_seg, seg); - kvm_seg.selector = sel; - kvm_set_segment(vcpu, &kvm_seg, seg); + printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", + context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); } +EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); static struct x86_emulate_ops emulate_ops = { .read_std = kvm_read_guest_virt_system, - .write_std = kvm_write_guest_virt_system, .fetch = kvm_fetch_guest_virt, .read_emulated 
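/*
 * [Editor's sketch] emulator_set_dr() above masks the value to 32 bits
 * outside long mode, and mk_cr_64() (re-added in a later hunk) splices a
 * 32-bit value into a 64-bit control register without touching its upper
 * half. The masking arithmetic, runnable standalone:
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t mk_cr_64(uint64_t curr_cr, uint32_t new_val)
{
        return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
}

int main(void)
{
        printf("%#llx\n",
               (unsigned long long)mk_cr_64(0xdead0000beefULL, 0x80050033));
        /* -> 0xdead80050033: bits 63..32 kept, low 32 replaced */
        return 0;
}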
= emulator_read_emulated, .write_emulated = emulator_write_emulated, .cmpxchg_emulated = emulator_cmpxchg_emulated, - .pio_in_emulated = emulator_pio_in_emulated, - .pio_out_emulated = emulator_pio_out_emulated, - .get_cached_descriptor = emulator_get_cached_descriptor, - .set_cached_descriptor = emulator_set_cached_descriptor, - .get_segment_selector = emulator_get_segment_selector, - .set_segment_selector = emulator_set_segment_selector, - .get_cached_segment_base = emulator_get_cached_segment_base, - .get_gdt = emulator_get_gdt, - .get_idt = emulator_get_idt, - .get_cr = emulator_get_cr, - .set_cr = emulator_set_cr, - .cpl = emulator_get_cpl, - .get_dr = emulator_get_dr, - .set_dr = emulator_set_dr, - .set_msr = kvm_set_msr, - .get_msr = kvm_get_msr, }; static void cache_all_regs(struct kvm_vcpu *vcpu) @@ -4291,134 +3403,14 @@ static void cache_all_regs(struct kvm_vcpu *vcpu) vcpu->arch.regs_dirty = ~0; } -static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) +int emulate_instruction(struct kvm_vcpu *vcpu, + unsigned long cr2, + u16 error_code, + int emulation_type) { - u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask); - /* - * an sti; sti; sequence only disable interrupts for the first - * instruction. So, if the last instruction, be it emulated or - * not, left the system with the INT_STI flag enabled, it - * means that the last instruction is an sti. We should not - * leave the flag on in this case. The same goes for mov ss - */ - if (!(int_shadow & mask)) - kvm_x86_ops->set_interrupt_shadow(vcpu, mask); -} - -static void inject_emulated_exception(struct kvm_vcpu *vcpu) -{ - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; - if (ctxt->exception.vector == PF_VECTOR) - kvm_propagate_fault(vcpu, &ctxt->exception); - else if (ctxt->exception.error_code_valid) - kvm_queue_exception_e(vcpu, ctxt->exception.vector, - ctxt->exception.error_code); - else - kvm_queue_exception(vcpu, ctxt->exception.vector); -} - -static void init_emulate_ctxt(struct kvm_vcpu *vcpu) -{ - struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; - int cs_db, cs_l; - - cache_all_regs(vcpu); - - kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - - vcpu->arch.emulate_ctxt.vcpu = vcpu; - vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); - vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); - vcpu->arch.emulate_ctxt.mode = - (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : - (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) - ? X86EMUL_MODE_VM86 : cs_l - ? X86EMUL_MODE_PROT64 : cs_db - ? 
X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - memset(c, 0, sizeof(struct decode_cache)); - memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); -} - -int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq) -{ - struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; - int ret; - - init_emulate_ctxt(vcpu); - - vcpu->arch.emulate_ctxt.decode.op_bytes = 2; - vcpu->arch.emulate_ctxt.decode.ad_bytes = 2; - vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip; - ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq); - - if (ret != X86EMUL_CONTINUE) - return EMULATE_FAIL; - - vcpu->arch.emulate_ctxt.eip = c->eip; - memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); - kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); - kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); - - if (irq == NMI_VECTOR) - vcpu->arch.nmi_pending = false; - else - vcpu->arch.interrupt.pending = false; - - return EMULATE_DONE; -} -EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt); - -static int handle_emulation_failure(struct kvm_vcpu *vcpu) -{ - int r = EMULATE_DONE; - - ++vcpu->stat.insn_emulation_fail; - trace_kvm_emulate_insn_failed(vcpu); - if (!is_guest_mode(vcpu)) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; - r = EMULATE_FAIL; - } - kvm_queue_exception(vcpu, UD_VECTOR); - - return r; -} - -static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) -{ - gpa_t gpa; - - if (tdp_enabled) - return false; - - /* - * if emulation was due to access to shadowed page table - * and it failed try to unshadow page and re-entetr the - * guest to let CPU execute the instruction. - */ - if (kvm_mmu_unprotect_page_virt(vcpu, gva)) - return true; - - gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL); - - if (gpa == UNMAPPED_GVA) - return true; /* let cpu generate fault */ - - if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT))) - return true; - - return false; -} - -int x86_emulate_instruction(struct kvm_vcpu *vcpu, - unsigned long cr2, - int emulation_type, - void *insn, - int insn_len) -{ - int r; - struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; + int r, shadow_mask; + struct decode_cache *c; + struct kvm_run *run = vcpu->run; kvm_clear_exception_queue(vcpu); vcpu->arch.mmio_fault_cr2 = cr2; @@ -4430,20 +3422,27 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, */ cache_all_regs(vcpu); + vcpu->mmio_is_write = 0; + vcpu->arch.pio.string = 0; + if (!(emulation_type & EMULTYPE_NO_DECODE)) { - init_emulate_ctxt(vcpu); - vcpu->arch.emulate_ctxt.interruptibility = 0; - vcpu->arch.emulate_ctxt.have_exception = false; - vcpu->arch.emulate_ctxt.perm_ok = false; + int cs_db, cs_l; + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len); - if (r == X86EMUL_PROPAGATE_FAULT) - goto done; + vcpu->arch.emulate_ctxt.vcpu = vcpu; + vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu); + vcpu->arch.emulate_ctxt.mode = + (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : + (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) + ? X86EMUL_MODE_VM86 : cs_l + ? X86EMUL_MODE_PROT64 : cs_db + ? 
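/*
 * [Editor's sketch] The nested ?: chain above (removed here and reinstated
 * inline just below) picks the emulator mode in priority order. Written out
 * as a plain function it reads:
 */
#include <stdio.h>

enum mode { REAL, VM86, PROT16, PROT32, PROT64 };

static enum mode emul_mode(int protmode, int eflags_vm, int cs_l, int cs_db)
{
        if (!protmode)
                return REAL;    /* CR0.PE clear */
        if (eflags_vm)
                return VM86;    /* EFLAGS.VM set */
        if (cs_l)
                return PROT64;  /* CS.L: 64-bit code segment */
        return cs_db ? PROT32 : PROT16; /* CS.D selects default size */
}

int main(void)
{
        printf("%d\n", emul_mode(1, 0, 0, 1));  /* PROT32 */
        return 0;
}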
X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - trace_kvm_emulate_insn_start(vcpu); + r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); /* Only allow emulation of specific instructions on #UD * (namely VMMCALL, sysenter, sysexit, syscall)*/ + c = &vcpu->arch.emulate_ctxt.decode; if (emulation_type & EMULTYPE_TRAP_UD) { if (!c->twobyte) return EMULATE_FAIL; @@ -4471,11 +3470,10 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, ++vcpu->stat.insn_emulation; if (r) { - if (reexecute_instruction(vcpu, cr2)) + ++vcpu->stat.insn_emulation_fail; + if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) return EMULATE_DONE; - if (emulation_type & EMULTYPE_SKIP) - return EMULATE_FAIL; - return handle_emulation_failure(vcpu); + return EMULATE_FAIL; } } @@ -4484,74 +3482,245 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, return EMULATE_DONE; } - /* this is needed for vmware backdor interface to work since it - changes registers values during IO operation */ - memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); + r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); + shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; + + if (r == 0) + kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); + + if (vcpu->arch.pio.string) + return EMULATE_DO_MMIO; -restart: - r = x86_emulate_insn(&vcpu->arch.emulate_ctxt); + if ((r || vcpu->mmio_is_write) && run) { + run->exit_reason = KVM_EXIT_MMIO; + run->mmio.phys_addr = vcpu->mmio_phys_addr; + memcpy(run->mmio.data, vcpu->mmio_data, 8); + run->mmio.len = vcpu->mmio_size; + run->mmio.is_write = vcpu->mmio_is_write; + } - if (r == EMULATION_FAILED) { - if (reexecute_instruction(vcpu, cr2)) + if (r) { + if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) return EMULATE_DONE; + if (!vcpu->mmio_needed) { + kvm_report_emulation_failure(vcpu, "mmio"); + return EMULATE_FAIL; + } + return EMULATE_DO_MMIO; + } + + kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + + if (vcpu->mmio_is_write) { + vcpu->mmio_needed = 0; + return EMULATE_DO_MMIO; + } + + return EMULATE_DONE; +} +EXPORT_SYMBOL_GPL(emulate_instruction); + +static int pio_copy_data(struct kvm_vcpu *vcpu) +{ + void *p = vcpu->arch.pio_data; + gva_t q = vcpu->arch.pio.guest_gva; + unsigned bytes; + int ret; + u32 error_code; - return handle_emulation_failure(vcpu); - } - -done: - if (vcpu->arch.emulate_ctxt.have_exception) { - inject_emulated_exception(vcpu); - r = EMULATE_DONE; - } else if (vcpu->arch.pio.count) { - if (!vcpu->arch.pio.in) - vcpu->arch.pio.count = 0; - r = EMULATE_DO_MMIO; - } else if (vcpu->mmio_needed) { - if (vcpu->mmio_is_write) - vcpu->mmio_needed = 0; - r = EMULATE_DO_MMIO; - } else if (r == EMULATION_RESTART) - goto restart; + bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; + if (vcpu->arch.pio.in) + ret = kvm_write_guest_virt(q, p, bytes, vcpu, &error_code); else - r = EMULATE_DONE; + ret = kvm_read_guest_virt(q, p, bytes, vcpu, &error_code); + + if (ret == X86EMUL_PROPAGATE_FAULT) + kvm_inject_page_fault(vcpu, q, error_code); + + return ret; +} + +int complete_pio(struct kvm_vcpu *vcpu) +{ + struct kvm_pio_request *io = &vcpu->arch.pio; + long delta; + int r; + unsigned long val; - toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); - kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); - kvm_make_request(KVM_REQ_EVENT, vcpu); - memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); - kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); + if (!io->string) { + if (io->in) { + val = kvm_register_read(vcpu, VCPU_REGS_RAX); + memcpy(&val, 
vcpu->arch.pio_data, io->size); + kvm_register_write(vcpu, VCPU_REGS_RAX, val); + } + } else { + if (io->in) { + r = pio_copy_data(vcpu); + if (r) + goto out; + } + delta = 1; + if (io->rep) { + delta *= io->cur_count; + /* + * The size of the register should really depend on + * current address size. + */ + val = kvm_register_read(vcpu, VCPU_REGS_RCX); + val -= delta; + kvm_register_write(vcpu, VCPU_REGS_RCX, val); + } + if (io->down) + delta = -delta; + delta *= io->size; + if (io->in) { + val = kvm_register_read(vcpu, VCPU_REGS_RDI); + val += delta; + kvm_register_write(vcpu, VCPU_REGS_RDI, val); + } else { + val = kvm_register_read(vcpu, VCPU_REGS_RSI); + val += delta; + kvm_register_write(vcpu, VCPU_REGS_RSI, val); + } + } +out: + io->count -= io->cur_count; + io->cur_count = 0; + + return 0; +} + +static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) +{ + /* TODO: String I/O for in kernel device */ + int r; + + if (vcpu->arch.pio.in) + r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port, + vcpu->arch.pio.size, pd); + else + r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, + vcpu->arch.pio.port, vcpu->arch.pio.size, + pd); return r; } -EXPORT_SYMBOL_GPL(x86_emulate_instruction); -int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) +static int pio_string_write(struct kvm_vcpu *vcpu) { - unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX); - int ret = emulator_pio_out_emulated(size, port, &val, 1, vcpu); - /* do not return to emulator after return from userspace */ - vcpu->arch.pio.count = 0; - return ret; + struct kvm_pio_request *io = &vcpu->arch.pio; + void *pd = vcpu->arch.pio_data; + int i, r = 0; + + for (i = 0; i < io->cur_count; i++) { + if (kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, + io->port, io->size, pd)) { + r = -EOPNOTSUPP; + break; + } + pd += io->size; + } + return r; } -EXPORT_SYMBOL_GPL(kvm_fast_pio_out); -static void tsc_bad(void *info) +int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port) { - kvm___this_cpu_write(cpu_tsc_khz, 0); + unsigned long val; + + trace_kvm_pio(!in, port, size, 1); + + vcpu->run->exit_reason = KVM_EXIT_IO; + vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; + vcpu->run->io.size = vcpu->arch.pio.size = size; + vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; + vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1; + vcpu->run->io.port = vcpu->arch.pio.port = port; + vcpu->arch.pio.in = in; + vcpu->arch.pio.string = 0; + vcpu->arch.pio.down = 0; + vcpu->arch.pio.rep = 0; + + if (!vcpu->arch.pio.in) { + val = kvm_register_read(vcpu, VCPU_REGS_RAX); + memcpy(vcpu->arch.pio_data, &val, 4); + } + + if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { + complete_pio(vcpu); + return 1; + } + return 0; } +EXPORT_SYMBOL_GPL(kvm_emulate_pio); -static void tsc_khz_changed(void *data) +int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in, + int size, unsigned long count, int down, + gva_t address, int rep, unsigned port) { - struct cpufreq_freqs *freq = data; - unsigned long khz = 0; + unsigned now, in_page; + int ret = 0; + + trace_kvm_pio(!in, port, size, count); - if (data) - khz = freq->new; - else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) - khz = cpufreq_quick_get(raw_smp_processor_id()); - if (!khz) - khz = tsc_khz; - kvm___this_cpu_write(cpu_tsc_khz, khz); + vcpu->run->exit_reason = KVM_EXIT_IO; + vcpu->run->io.direction = in ? 
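/*
 * [Editor's sketch] For string I/O, complete_pio() above advances RSI/RDI
 * by size * count and, for REP, decrements RCX by the iterations consumed;
 * the direction flag flips the sign of the pointer delta. The register
 * arithmetic in isolation, for a "rep insw" with DF = 0:
 */
#include <stdio.h>

int main(void)
{
        long rcx = 10, rdi = 0x8000;
        int size = 2, cur_count = 4, down = 0;
        long delta = cur_count;

        rcx -= delta;                   /* iterations consumed this round */
        delta = down ? -delta : delta;  /* DF = 1 walks memory backwards  */
        rdi += delta * size;            /* bytes actually transferred     */
        printf("rcx=%ld rdi=%#lx\n", rcx, rdi);  /* rcx=6 rdi=0x8008 */
        return 0;
}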
KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; + vcpu->run->io.size = vcpu->arch.pio.size = size; + vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; + vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count; + vcpu->run->io.port = vcpu->arch.pio.port = port; + vcpu->arch.pio.in = in; + vcpu->arch.pio.string = 1; + vcpu->arch.pio.down = down; + vcpu->arch.pio.rep = rep; + + if (!count) { + kvm_x86_ops->skip_emulated_instruction(vcpu); + return 1; + } + + if (!down) + in_page = PAGE_SIZE - offset_in_page(address); + else + in_page = offset_in_page(address) + size; + now = min(count, (unsigned long)in_page / size); + if (!now) + now = 1; + if (down) { + /* + * String I/O in reverse. Yuck. Kill the guest, fix later. + */ + pr_unimpl(vcpu, "guest string pio down\n"); + kvm_inject_gp(vcpu, 0); + return 1; + } + vcpu->run->io.count = now; + vcpu->arch.pio.cur_count = now; + + if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count) + kvm_x86_ops->skip_emulated_instruction(vcpu); + + vcpu->arch.pio.guest_gva = address; + + if (!vcpu->arch.pio.in) { + /* string PIO write */ + ret = pio_copy_data(vcpu); + if (ret == X86EMUL_PROPAGATE_FAULT) + return 1; + if (ret == 0 && !pio_string_write(vcpu)) { + complete_pio(vcpu); + if (vcpu->arch.pio.count == 0) + ret = 1; + } + } + /* no string PIO read support yet */ + + return ret; +} +EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); + +static void bounce_off(void *info) +{ + /* nothing */ } static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, @@ -4562,60 +3731,21 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va struct kvm_vcpu *vcpu; int i, send_ipi = 0; - /* - * We allow guests to temporarily run on slowing clocks, - * provided we notify them after, or to run on accelerating - * clocks, provided we notify them before. Thus time never - * goes backwards. - * - * However, we have a problem. We can't atomically update - * the frequency of a given CPU from this function; it is - * merely a notifier, which can be called from any CPU. - * Changing the TSC frequency at arbitrary points in time - * requires a recomputation of local variables related to - * the TSC for each VCPU. We must flag these local variables - * to be updated and be sure the update takes place with the - * new frequency before any guests proceed. - * - * Unfortunately, the combination of hotplug CPU and frequency - * change creates an intractable locking scenario; the order - * of when these callouts happen is undefined with respect to - * CPU hotplug, and they can race with each other. As such, - * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is - * undefined; you can actually have a CPU frequency change take - * place in between the computation of X and the setting of the - * variable. To protect against this problem, all updates of - * the per_cpu tsc_khz variable are done in an interrupt - * protected IPI, and all callers wishing to update the value - * must wait for a synchronous IPI to complete (which is trivial - * if the caller is on the CPU already). This establishes the - * necessary total order on variable updates. - * - * Note that because a guest time update may take place - * anytime after the setting of the VCPU's request bit, the - * correct TSC value must be set before the request. 
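/*
 * [Editor's sketch] kvm_emulate_pio_string() below clamps a REP count so a
 * single exit never crosses a guest page: at most the bytes left in the
 * current page divided by the element size, with a minimum of one element.
 * The clamp, runnable standalone:
 */
#include <stdio.h>

#define PAGE_SIZE 4096UL

static unsigned long clamp_count(unsigned long count, unsigned long addr,
                                 int size, int down)
{
        unsigned long in_page = down ? (addr % PAGE_SIZE) + size
                                     : PAGE_SIZE - (addr % PAGE_SIZE);
        unsigned long now = count < in_page / size ? count : in_page / size;

        return now ? now : 1;
}

int main(void)
{
        /* 100 words requested, only 16 bytes left in the page -> 8 now */
        printf("%lu\n", clamp_count(100, 0x1ff0, 2, 0));
        return 0;
}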
However, - * to ensure the update actually makes it to any guest which - * starts running in hardware virtualization between the set - * and the acquisition of the spinlock, we must also ping the - * CPU after setting the request bit. - * - */ - if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) return 0; if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) return 0; - - smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); + per_cpu(cpu_tsc_khz, freq->cpu) = freq->new; spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { kvm_for_each_vcpu(i, vcpu, kvm) { if (vcpu->cpu != freq->cpu) continue; - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + if (!kvm_request_guest_time_update(vcpu)) + continue; if (vcpu->cpu != smp_processor_id()) - send_ipi = 1; + send_ipi++; } } spin_unlock(&kvm_lock); @@ -4633,106 +3763,34 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va * guest context is entered kvmclock will be updated, * so the guest will not see stale values. */ - smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); + smp_call_function_single(freq->cpu, bounce_off, NULL, 1); } return 0; } static struct notifier_block kvmclock_cpufreq_notifier_block = { - .notifier_call = kvmclock_cpufreq_notifier -}; - -static int kvmclock_cpu_notifier(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - - switch (action) { - case CPU_ONLINE: - case CPU_DOWN_FAILED: - smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); - break; - case CPU_DOWN_PREPARE: - smp_call_function_single(cpu, tsc_bad, NULL, 1); - break; - } - return NOTIFY_OK; -} - -static struct notifier_block kvmclock_cpu_notifier_block = { - .notifier_call = kvmclock_cpu_notifier, - .priority = -INT_MAX + .notifier_call = kvmclock_cpufreq_notifier }; static void kvm_timer_init(void) { int cpu; - max_tsc_khz = tsc_khz; - register_hotcpu_notifier(&kvmclock_cpu_notifier_block); if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { -#ifdef CONFIG_CPU_FREQ - struct cpufreq_policy policy; - memset(&policy, 0, sizeof(policy)); - cpu = get_cpu(); - cpufreq_get_policy(&policy, cpu); - if (policy.cpuinfo.max_freq) - max_tsc_khz = policy.cpuinfo.max_freq; - put_cpu(); -#endif cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); + for_each_online_cpu(cpu) { + unsigned long khz = cpufreq_get(cpu); + if (!khz) + khz = tsc_khz; + per_cpu(cpu_tsc_khz, cpu) = khz; + } + } else { + for_each_possible_cpu(cpu) + per_cpu(cpu_tsc_khz, cpu) = tsc_khz; } - pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz); - for_each_online_cpu(cpu) - smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); -} - -static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); - -static int kvm_is_in_guest(void) -{ - return percpu_read(current_vcpu) != NULL; } -static int kvm_is_user_mode(void) -{ - int user_mode = 3; - - if (percpu_read(current_vcpu)) - user_mode = kvm_x86_ops->get_cpl(percpu_read(current_vcpu)); - - return user_mode != 0; -} - -static unsigned long kvm_get_guest_ip(void) -{ - unsigned long ip = 0; - - if (percpu_read(current_vcpu)) - ip = kvm_rip_read(percpu_read(current_vcpu)); - - return ip; -} - -static struct perf_guest_info_callbacks kvm_guest_cbs = { - .is_in_guest = kvm_is_in_guest, - .is_user_mode = kvm_is_user_mode, - .get_guest_ip = kvm_get_guest_ip, -}; - -void kvm_before_handle_nmi(struct kvm_vcpu *vcpu) -{ - percpu_write(current_vcpu, vcpu); -} -EXPORT_SYMBOL_GPL(kvm_before_handle_nmi); - -void 
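/*
 * [Editor's sketch] The two early returns in the notifier encode the
 * ordering rule the removed comment described: update guest clocks before
 * a frequency increase and after a decrease, so guest time never jumps
 * backwards. As a standalone predicate:
 */
#include <stdio.h>

enum phase { PRECHANGE, POSTCHANGE };

static int should_update(enum phase val, unsigned long old, unsigned long new)
{
        if (val == PRECHANGE && old > new)  /* slowing: wait for POSTCHANGE */
                return 0;
        if (val == POSTCHANGE && old < new) /* speeding up: handled at PRE  */
                return 0;
        return 1;
}

int main(void)
{
        printf("%d %d\n", should_update(PRECHANGE, 2000, 3000),   /* 1 */
                          should_update(POSTCHANGE, 2000, 3000)); /* 0 */
        return 0;
}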
kvm_after_handle_nmi(struct kvm_vcpu *vcpu) -{ - percpu_write(current_vcpu, NULL); -} -EXPORT_SYMBOL_GPL(kvm_after_handle_nmi); - int kvm_arch_init(void *opaque) { int r; @@ -4751,10 +3809,6 @@ int kvm_arch_init(void *opaque) } if (ops->disabled_by_bios()) { printk(KERN_ERR "kvm: disabled by bios\n"); -#ifndef KVM_TBOOT_ENABLED_WORKS - - printk(KERN_ERR "kvm: if TXT is enabled in the BIOS, disable it\n"); -#endif r = -EOPNOTSUPP; goto out; } @@ -4765,20 +3819,14 @@ int kvm_arch_init(void *opaque) kvm_init_msr_list(); - kvm_xstate_size_init(); - kvm_x86_ops = ops; kvm_mmu_set_nonpresent_ptes(0ull, 0ull); + kvm_mmu_set_base_ptes(PT_PRESENT_MASK); kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0); kvm_timer_init(); - perf_register_guest_info_callbacks(&kvm_guest_cbs); - - if (kvm_cpu_has_xsave) - host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - return 0; out: @@ -4787,12 +3835,9 @@ out: void kvm_arch_exit(void) { - perf_unregister_guest_info_callbacks(&kvm_guest_cbs); - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); - unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block); kvm_x86_ops = NULL; kvm_mmu_module_exit(); } @@ -4942,23 +3987,88 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu) kvm_x86_ops->patch_hypercall(vcpu, instruction); - return emulator_write_emulated(rip, instruction, 3, NULL, vcpu); + return emulator_write_emulated(rip, instruction, 3, vcpu); +} + +static u64 mk_cr_64(u64 curr_cr, u32 new_val) +{ + return (curr_cr & ~((1ULL << 32) - 1)) | new_val; } void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) { - struct kvm_desc_ptr dt = { limit, base }; + struct descriptor_table dt = { limit, base }; kvm_x86_ops->set_gdt(vcpu, &dt); } void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) { - struct kvm_desc_ptr dt = { limit, base }; + struct descriptor_table dt = { limit, base }; kvm_x86_ops->set_idt(vcpu, &dt); } +void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, + unsigned long *rflags) +{ + kvm_lmsw(vcpu, msw); + *rflags = kvm_get_rflags(vcpu); +} + +unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) +{ + unsigned long value; + + switch (cr) { + case 0: + value = kvm_read_cr0(vcpu); + break; + case 2: + value = vcpu->arch.cr2; + break; + case 3: + value = vcpu->arch.cr3; + break; + case 4: + value = kvm_read_cr4(vcpu); + break; + case 8: + value = kvm_get_cr8(vcpu); + break; + default: + vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); + return 0; + } + + return value; +} + +void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, + unsigned long *rflags) +{ + switch (cr) { + case 0: + kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); + *rflags = kvm_get_rflags(vcpu); + break; + case 2: + vcpu->arch.cr2 = val; + break; + case 3: + kvm_set_cr3(vcpu, val); + break; + case 4: + kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); + break; + case 8: + kvm_set_cr8(vcpu, val & 0xfUL); + break; + default: + vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); + } +} + static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) { struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; @@ -5022,13 +4132,9 @@ int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0); - if (!best || best->eax < 0x80000008) - goto not_found; best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); if (best) 
return best->eax & 0xff; -not_found: return 36; } @@ -5109,10 +4215,10 @@ static void vapic_exit(struct kvm_vcpu *vcpu) if (!apic || !apic->vapic_addr) return; - idx = srcu_read_lock(&vcpu->kvm->srcu); + idx = kvm_srcu_read_lock(&vcpu->kvm->srcu); kvm_release_page_dirty(apic->vapic_page); mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); - srcu_read_unlock(&vcpu->kvm->srcu, idx); + kvm_srcu_read_unlock(&vcpu->kvm->srcu, idx); } static void update_cr8_intercept(struct kvm_vcpu *vcpu) @@ -5142,13 +4248,9 @@ static void inject_pending_event(struct kvm_vcpu *vcpu) { /* try to reinject previous events if any */ if (vcpu->arch.exception.pending) { - trace_kvm_inj_exception(vcpu->arch.exception.nr, - vcpu->arch.exception.has_error_code, - vcpu->arch.exception.error_code); kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, vcpu->arch.exception.has_error_code, - vcpu->arch.exception.error_code, - vcpu->arch.exception.reinject); + vcpu->arch.exception.error_code); return; } @@ -5178,84 +4280,44 @@ static void inject_pending_event(struct kvm_vcpu *vcpu) } } -static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) -{ - if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && - !vcpu->guest_xcr0_loaded) { - /* kvm_set_xcr() also depends on this */ - xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); - vcpu->guest_xcr0_loaded = 1; - } -} - -static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) -{ - if (vcpu->guest_xcr0_loaded) { - if (vcpu->arch.xcr0 != host_xcr0) - xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); - vcpu->guest_xcr0_loaded = 0; - } -} - static int vcpu_enter_guest(struct kvm_vcpu *vcpu) { int r; bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && vcpu->run->request_interrupt_window; - if (vcpu->requests) { - if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) + if (vcpu->requests) + if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) kvm_mmu_unload(vcpu); - if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) + + r = kvm_mmu_reload(vcpu); + if (unlikely(r)) + goto out; + + if (vcpu->requests) { + if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) __kvm_migrate_timers(vcpu); - if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { - r = kvm_guest_time_update(vcpu); - if (unlikely(r)) - goto out; - } - if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) + if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests)) + kvm_write_guest_time(vcpu); + if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) kvm_mmu_sync_roots(vcpu); - if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) + if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) kvm_x86_ops->tlb_flush(vcpu); - if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { + if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, + &vcpu->requests)) { vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; r = 0; goto out; } - if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { + if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; r = 0; goto out; } - if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) { + if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) { vcpu->fpu_active = 0; kvm_x86_ops->fpu_deactivate(vcpu); } - if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) { - /* Page is swapped out. 
Do synthetic halt */ - vcpu->arch.apf.halted = true; - r = 1; - goto out; - } - } - - r = kvm_mmu_reload(vcpu); - if (unlikely(r)) - goto out; - - if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { - inject_pending_event(vcpu); - - /* enable NMI/IRQ window open exits if needed */ - if (vcpu->arch.nmi_pending) - kvm_x86_ops->enable_nmi_window(vcpu); - else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) - kvm_x86_ops->enable_irq_window(vcpu); - - if (kvm_lapic_enabled(vcpu)) { - update_cr8_intercept(vcpu); - kvm_lapic_sync_to_vapic(vcpu); - } } preempt_disable(); @@ -5263,25 +4325,34 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_x86_ops->prepare_guest_switch(vcpu); if (vcpu->fpu_active) kvm_load_guest_fpu(vcpu); - kvm_load_guest_xcr0(vcpu); - - atomic_set(&vcpu->guest_mode, 1); - smp_wmb(); local_irq_disable(); - if (!atomic_read(&vcpu->guest_mode) || vcpu->requests - || need_resched() || signal_pending(current)) { - atomic_set(&vcpu->guest_mode, 0); - smp_wmb(); + clear_bit(KVM_REQ_KICK, &vcpu->requests); + smp_mb__after_clear_bit(); + + if (vcpu->requests || need_resched() || signal_pending(current)) { + set_bit(KVM_REQ_KICK, &vcpu->requests); local_irq_enable(); preempt_enable(); - kvm_x86_ops->cancel_injection(vcpu); r = 1; goto out; } - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + inject_pending_event(vcpu); + + /* enable NMI/IRQ window open exits if needed */ + if (vcpu->arch.nmi_pending) + kvm_x86_ops->enable_nmi_window(vcpu); + else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) + kvm_x86_ops->enable_irq_window(vcpu); + + if (kvm_lapic_enabled(vcpu)) { + update_cr8_intercept(vcpu); + kvm_lapic_sync_to_vapic(vcpu); + } + + kvm_srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); kvm_guest_enter(); @@ -5306,10 +4377,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (hw_breakpoint_active()) hw_breakpoint_restore(); - kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); - - atomic_set(&vcpu->guest_mode, 0); - smp_wmb(); + set_bit(KVM_REQ_KICK, &vcpu->requests); local_irq_enable(); ++vcpu->stat.exits; @@ -5326,7 +4394,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) preempt_enable(); - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + vcpu->srcu_idx = kvm_srcu_read_lock(&vcpu->kvm->srcu); /* * Profile KVM exit RIPs: @@ -5360,26 +4428,24 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } - vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); + vcpu->srcu_idx = kvm_srcu_read_lock(&kvm->srcu); vapic_enter(vcpu); r = 1; while (r > 0) { - if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && - !vcpu->arch.apf.halted) + if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) r = vcpu_enter_guest(vcpu); else { - srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); + kvm_srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_vcpu_block(vcpu); - vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); - if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) + vcpu->srcu_idx = kvm_srcu_read_lock(&kvm->srcu); + if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) { switch(vcpu->arch.mp_state) { case KVM_MP_STATE_HALTED: vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; case KVM_MP_STATE_RUNNABLE: - vcpu->arch.apf.halted = false; break; case KVM_MP_STATE_SIPI_RECEIVED: default: @@ -5401,22 +4467,20 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) vcpu->run->exit_reason = KVM_EXIT_INTR; ++vcpu->stat.request_irq_exits; } - - kvm_check_async_pf_completion(vcpu); - if (signal_pending(current)) { r = -EINTR; vcpu->run->exit_reason = KVM_EXIT_INTR; 
++vcpu->stat.signal_exits; } if (need_resched()) { - srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); + kvm_srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_resched(vcpu); - vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); + vcpu->srcu_idx = kvm_srcu_read_lock(&kvm->srcu); } } - srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); + kvm_srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); + post_kvm_run_save(vcpu); vapic_exit(vcpu); @@ -5428,8 +4492,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) int r; sigset_t sigsaved; - if (!tsk_used_math(current) && kvm_init_fpu(current)) - return -ENOMEM; + vcpu_load(vcpu); if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); @@ -5442,23 +4505,29 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } /* re-sync apic's tpr */ - if (!irqchip_in_kernel(vcpu->kvm)) { - if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) { - r = -EINVAL; + if (!irqchip_in_kernel(vcpu->kvm)) + kvm_set_cr8(vcpu, kvm_run->cr8); + + if (vcpu->arch.pio.cur_count) { + vcpu->srcu_idx = kvm_srcu_read_lock(&vcpu->kvm->srcu); + r = complete_pio(vcpu); + kvm_srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + if (r) goto out; - } } + if (vcpu->mmio_needed) { + memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); + vcpu->mmio_read_completed = 1; + vcpu->mmio_needed = 0; - if (vcpu->arch.pio.count || vcpu->mmio_needed) { - if (vcpu->mmio_needed) { - memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); - vcpu->mmio_read_completed = 1; - vcpu->mmio_needed = 0; - } - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - if (r != EMULATE_DONE) { + vcpu->srcu_idx = kvm_srcu_read_lock(&vcpu->kvm->srcu); + r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0, + EMULTYPE_NO_DECODE); + kvm_srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + if (r == EMULATE_DO_MMIO) { + /* + * Read-modify-write. Back to userspace. 
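/*
 * [Editor's sketch] The EMULATE_DO_MMIO / KVM_EXIT_IO paths in
 * kvm_arch_vcpu_ioctl_run() bounce to userspace, which services the access
 * and calls KVM_RUN again; on re-entry the code above copies
 * kvm_run->mmio.data back and finishes the instruction. A minimal view of
 * the userspace side of that contract (error handling omitted, device
 * model stubbed out):
 */
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

void run_loop(int vcpu_fd, struct kvm_run *run)
{
        for (;;) {
                ioctl(vcpu_fd, KVM_RUN, 0);
                switch (run->exit_reason) {
                case KVM_EXIT_MMIO:
                        if (!run->mmio.is_write)
                                /* a real VMM fills in the device's data */
                                memset(run->mmio.data, 0, run->mmio.len);
                        break;  /* next KVM_RUN completes the emulation */
                case KVM_EXIT_IO:
                        /* data sits in the shared page at io.data_offset */
                        break;
                default:
                        return;
                }
        }
}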
+ */ r = 0; goto out; } @@ -5470,15 +4539,17 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) r = __vcpu_run(vcpu); out: - post_kvm_run_save(vcpu); if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &sigsaved, NULL); + vcpu_put(vcpu); return r; } int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { + vcpu_load(vcpu); + regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); @@ -5501,11 +4572,15 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) regs->rip = kvm_rip_read(vcpu); regs->rflags = kvm_get_rflags(vcpu); + vcpu_put(vcpu); + return 0; } int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { + vcpu_load(vcpu); + kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); @@ -5530,11 +4605,17 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) vcpu->arch.exception.pending = false; - kvm_make_request(KVM_REQ_EVENT, vcpu); + vcpu_put(vcpu); return 0; } +void kvm_get_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + kvm_x86_ops->get_segment(vcpu, var, seg); +} + void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) { struct kvm_segment cs; @@ -5548,7 +4629,9 @@ EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { - struct kvm_desc_ptr dt; + struct descriptor_table dt; + + vcpu_load(vcpu); kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); @@ -5561,15 +4644,15 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); kvm_x86_ops->get_idt(vcpu, &dt); - sregs->idt.limit = dt.size; - sregs->idt.base = dt.address; + sregs->idt.limit = dt.limit; + sregs->idt.base = dt.base; kvm_x86_ops->get_gdt(vcpu, &dt); - sregs->gdt.limit = dt.size; - sregs->gdt.base = dt.address; + sregs->gdt.limit = dt.limit; + sregs->gdt.base = dt.base; sregs->cr0 = kvm_read_cr0(vcpu); sregs->cr2 = vcpu->arch.cr2; - sregs->cr3 = kvm_read_cr3(vcpu); + sregs->cr3 = vcpu->arch.cr3; sregs->cr4 = kvm_read_cr4(vcpu); sregs->cr8 = kvm_get_cr8(vcpu); sregs->efer = vcpu->arch.efer; @@ -5581,44 +4664,586 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, set_bit(vcpu->arch.interrupt.nr, (unsigned long *)sregs->interrupt_bitmap); + vcpu_put(vcpu); + return 0; } int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { + vcpu_load(vcpu); mp_state->mp_state = vcpu->arch.mp_state; + vcpu_put(vcpu); return 0; } int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { + vcpu_load(vcpu); vcpu->arch.mp_state = mp_state->mp_state; - kvm_make_request(KVM_REQ_EVENT, vcpu); + vcpu_put(vcpu); return 0; } -int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, - bool has_error_code, u32 error_code) +static void kvm_set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + kvm_x86_ops->set_segment(vcpu, var, seg); +} + +static void seg_desct_to_kvm_desct(struct kvm_desc_struct *seg_desc, u16 selector, + struct kvm_segment *kvm_desct) +{ + kvm_desct->base = kvm_get_desc_base(seg_desc); + kvm_desct->limit = kvm_get_desc_limit(seg_desc); + if (seg_desc->g) { + kvm_desct->limit <<= 12; + 
kvm_desct->limit |= 0xfff; + } + kvm_desct->selector = selector; + kvm_desct->type = seg_desc->type; + kvm_desct->present = seg_desc->p; + kvm_desct->dpl = seg_desc->dpl; + kvm_desct->db = seg_desc->d; + kvm_desct->s = seg_desc->s; + kvm_desct->l = seg_desc->l; + kvm_desct->g = seg_desc->g; + kvm_desct->avl = seg_desc->avl; + if (!selector) + kvm_desct->unusable = 1; + else + kvm_desct->unusable = 0; + kvm_desct->padding = 0; +} + +static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu, + u16 selector, + struct descriptor_table *dtable) +{ + if (selector & 1 << 2) { + struct kvm_segment kvm_seg; + + kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR); + + if (kvm_seg.unusable) + dtable->limit = 0; + else + dtable->limit = kvm_seg.limit; + dtable->base = kvm_seg.base; + } + else + kvm_x86_ops->get_gdt(vcpu, dtable); +} + +/* allowed just for 8 bytes segments */ +static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, + struct kvm_desc_struct *seg_desc) { - struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; + struct descriptor_table dtable; + u16 index = selector >> 3; int ret; + u32 err; + gva_t addr; + + get_segment_descriptor_dtable(vcpu, selector, &dtable); + + if (dtable.limit < index * 8 + 7) { + kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); + return X86EMUL_PROPAGATE_FAULT; + } + addr = dtable.base + index * 8; + ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof(*seg_desc), + vcpu, &err); + if (ret == X86EMUL_PROPAGATE_FAULT) + kvm_inject_page_fault(vcpu, addr, err); - init_emulate_ctxt(vcpu); + return ret; +} - ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, - tss_selector, reason, has_error_code, - error_code); +/* allowed just for 8 bytes segments */ +static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, + struct kvm_desc_struct *seg_desc) +{ + struct descriptor_table dtable; + u16 index = selector >> 3; + + get_segment_descriptor_dtable(vcpu, selector, &dtable); + + if (dtable.limit < index * 8 + 7) + return 1; + return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu, NULL); +} +static gpa_t get_tss_base_addr_write(struct kvm_vcpu *vcpu, + struct kvm_desc_struct *seg_desc) +{ + u32 base_addr = kvm_get_desc_base(seg_desc); + + return kvm_mmu_gva_to_gpa_write(vcpu, base_addr, NULL); +} + +static gpa_t get_tss_base_addr_read(struct kvm_vcpu *vcpu, + struct kvm_desc_struct *seg_desc) +{ + u32 base_addr = kvm_get_desc_base(seg_desc); + + return kvm_mmu_gva_to_gpa_read(vcpu, base_addr, NULL); +} + +static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) +{ + struct kvm_segment kvm_seg; + + kvm_get_segment(vcpu, &kvm_seg, seg); + return kvm_seg.selector; +} + +static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) +{ + struct kvm_segment segvar = { + .base = selector << 4, + .limit = 0xffff, + .selector = selector, + .type = 3, + .present = 1, + .dpl = 3, + .db = 0, + .s = 1, + .l = 0, + .g = 0, + .avl = 0, + .unusable = 0, + }; + kvm_x86_ops->set_segment(vcpu, &segvar, seg); + return X86EMUL_CONTINUE; +} + +static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg) +{ + return (seg != VCPU_SREG_LDTR) && + (seg != VCPU_SREG_TR) && + (kvm_get_rflags(vcpu) & X86_EFLAGS_VM); +} + +int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg) +{ + struct kvm_segment kvm_seg; + struct kvm_desc_struct seg_desc; + u8 dpl, rpl, cpl; + unsigned err_vec = GP_VECTOR; + u32 err_code = 0; + bool null_selector = !(selector & ~0x3); /* 
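/*
 * [Editor's sketch] A segment selector is index:TI:RPL; bit 2 (TI) picks
 * LDT versus GDT in get_segment_descriptor_dtable(), and each descriptor
 * occupies 8 bytes at index * 8, which is why the bounds check above reads
 * "limit < index * 8 + 7". Decoding in isolation:
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint16_t selector = 0x002b;         /* e.g. a user data segment */
        unsigned index = selector >> 3;     /* 5 */
        unsigned ti = (selector >> 2) & 1;  /* 0 = GDT, 1 = LDT */
        unsigned rpl = selector & 3;        /* 3 */
        unsigned limit = 0x27;              /* room for 5 descriptors */

        printf("index=%u ti=%u rpl=%u fits=%d\n", index, ti, rpl,
               !(limit < index * 8 + 7));   /* index 5 needs limit >= 0x2f */
        return 0;
}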
+int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg)
+{
+ struct kvm_segment kvm_seg;
+ struct kvm_desc_struct seg_desc;
+ u8 dpl, rpl, cpl;
+ unsigned err_vec = GP_VECTOR;
+ u32 err_code = 0;
+ bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
+ int ret;
+
+ if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu))
+ return kvm_load_realmode_segment(vcpu, selector, seg);
+
+ /* NULL selector is not valid for TR, CS and SS */
+ if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR)
+ && null_selector)
+ goto exception;
+
+ /* TR should be in GDT only */
+ if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
+ goto exception;
+
+ ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc);
if (ret)
- return EMULATE_FAIL;
+ return ret;
- memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
- kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
- kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
- kvm_make_request(KVM_REQ_EVENT, vcpu);
- return EMULATE_DONE;
+ seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg);
+
+ if (null_selector) { /* for NULL selector skip all following checks */
+ kvm_seg.unusable = 1;
+ goto load;
+ }
+
+ err_code = selector & 0xfffc;
+ err_vec = GP_VECTOR;
+
+ /* can't load a system descriptor into a segment selector */
+ if (seg <= VCPU_SREG_GS && !kvm_seg.s)
+ goto exception;
+
+ if (!kvm_seg.present) {
+ err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
+ goto exception;
+ }
+
+ rpl = selector & 3;
+ dpl = kvm_seg.dpl;
+ cpl = kvm_x86_ops->get_cpl(vcpu);
+
+ switch (seg) {
+ case VCPU_SREG_SS:
+ /*
+ * segment is not a writable data segment or segment
+ * selector's RPL != CPL or segment descriptor's DPL != CPL
+ */
+ if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl)
+ goto exception;
+ break;
+ case VCPU_SREG_CS:
+ if (!(kvm_seg.type & 8))
+ goto exception;
+
+ if (kvm_seg.type & 4) {
+ /* conforming */
+ if (dpl > cpl)
+ goto exception;
+ } else {
+ /* nonconforming */
+ if (rpl > cpl || dpl != cpl)
+ goto exception;
+ }
+ /* CS(RPL) <- CPL */
+ selector = (selector & 0xfffc) | cpl;
+ break;
+ case VCPU_SREG_TR:
+ if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9))
+ goto exception;
+ break;
+ case VCPU_SREG_LDTR:
+ if (kvm_seg.s || kvm_seg.type != 2)
+ goto exception;
+ break;
+ default: /* DS, ES, FS, or GS */
+ /*
+ * segment is not a data or readable code segment or
+ * ((segment is a data or nonconforming code segment)
+ * and (both RPL and CPL > DPL))
+ */
+ if ((kvm_seg.type & 0xa) == 0x8 ||
+ (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl)))
+ goto exception;
+ break;
+ }
+
+ if (!kvm_seg.unusable && kvm_seg.s) {
+ /* mark segment as accessed */
+ kvm_seg.type |= 1;
+ seg_desc.type |= 1;
+ save_guest_segment_descriptor(vcpu, selector, &seg_desc);
+ }
+load:
+ kvm_set_segment(vcpu, &kvm_seg, seg);
+ return X86EMUL_CONTINUE;
+exception:
+ kvm_queue_exception_e(vcpu, err_vec, err_code);
+ return X86EMUL_PROPAGATE_FAULT;
+}
+
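The privilege checks above follow the SDM: RPL comes from the low two bits of the selector, DPL from the descriptor, and CPL from the current code segment. A compact sketch of just the data-segment rule from the default case (hypothetical helper, simplified from the code above):

#include <stdbool.h>
#include <stdint.h>

/*
 * Hypothetical mirror of the DS/ES/FS/GS rule: reject execute-only
 * code segments, and reject data or non-conforming code segments
 * when both RPL and CPL exceed the descriptor's DPL.
 */
static bool data_seg_load_ok(uint8_t type, uint8_t rpl, uint8_t dpl, uint8_t cpl)
{
	if ((type & 0xa) == 0x8) /* code segment without the readable bit */
		return false;
	if ((type & 0xc) != 0xc && /* not a conforming code segment */
	    rpl > dpl && cpl > dpl)
		return false;
	return true;
}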
+static void save_state_to_tss32(struct kvm_vcpu *vcpu,
+ struct tss_segment_32 *tss)
+{
+ tss->cr3 = vcpu->arch.cr3;
+ tss->eip = kvm_rip_read(vcpu);
+ tss->eflags = kvm_get_rflags(vcpu);
+ tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
+ tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
+ tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
+ tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
+ tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
+ tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
+ tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
+ tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
+ tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
+ tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
+ tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
+ tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
+ tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
+}
+
+static void kvm_load_segment_selector(struct kvm_vcpu *vcpu, u16 sel, int seg)
+{
+ struct kvm_segment kvm_seg;
+ kvm_get_segment(vcpu, &kvm_seg, seg);
+ kvm_seg.selector = sel;
+ kvm_set_segment(vcpu, &kvm_seg, seg);
+}
+
+static int load_state_from_tss32(struct kvm_vcpu *vcpu,
+ struct tss_segment_32 *tss)
+{
+ kvm_set_cr3(vcpu, tss->cr3);
+
+ kvm_rip_write(vcpu, tss->eip);
+ kvm_set_rflags(vcpu, tss->eflags | 2);
+
+ kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
+ kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
+ kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
+ kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
+ kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
+ kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
+ kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
+
+ /*
+ * SDM says that segment selectors are loaded before segment
+ * descriptors
+ */
+ kvm_load_segment_selector(vcpu, tss->ldt_selector, VCPU_SREG_LDTR);
+ kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
+ kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
+ kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
+ kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
+ kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS);
+ kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS);
+
+ /*
+ * Now load segment descriptors. If a fault happens at this stage
+ * it is handled in the context of the new task
+ */
+ if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, VCPU_SREG_LDTR))
+ return 1;
+
+ if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
+ return 1;
+
+ if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
+ return 1;
+
+ if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
+ return 1;
+
+ if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
+ return 1;
+
+ if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS))
+ return 1;
+
+ if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS))
+ return 1;
+ return 0;
+}
+
+static void save_state_to_tss16(struct kvm_vcpu *vcpu,
+ struct tss_segment_16 *tss)
+{
+ tss->ip = kvm_rip_read(vcpu);
+ tss->flag = kvm_get_rflags(vcpu);
+ tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
+ tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
+ tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
+ tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
+ tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
+ tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
+
+ tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
+ tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
+ tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
+ tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
+ tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
+}
+
+static int load_state_from_tss16(struct kvm_vcpu *vcpu,
+ struct tss_segment_16 *tss)
+{
+ kvm_rip_write(vcpu, tss->ip);
+ kvm_set_rflags(vcpu, tss->flag | 2);
+ kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
+ kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
+ kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
+ kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
+ kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
+ kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
+ kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
+
+ /*
+ * SDM says that segment selectors are loaded before segment
+ * descriptors
+ */
+ kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR);
+ kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
+ kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
+ kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
+ kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
+
+ /*
+ * Now load segment descriptors. If a fault happens at this stage
+ * it is handled in the context of the new task
+ */
+ if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR))
+ return 1;
+
+ if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
+ return 1;
+
+ if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
+ return 1;
+
+ if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
+ return 1;
+
+ if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
+ return 1;
+ return 0;
+}
+
+static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
+ u16 old_tss_sel, u32 old_tss_base,
+ struct kvm_desc_struct *nseg_desc)
+{
+ struct tss_segment_16 tss_segment_16;
+ int ret = 0;
+
+ if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
+ sizeof tss_segment_16))
+ goto out;
+
+ save_state_to_tss16(vcpu, &tss_segment_16);
+
+ if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
+ sizeof tss_segment_16))
+ goto out;
+
+ if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
+ &tss_segment_16, sizeof tss_segment_16))
+ goto out;
+
+ if (old_tss_sel != 0xffff) {
+ tss_segment_16.prev_task_link = old_tss_sel;
+
+ if (kvm_write_guest(vcpu->kvm,
+ get_tss_base_addr_write(vcpu, nseg_desc),
+ &tss_segment_16.prev_task_link,
+ sizeof tss_segment_16.prev_task_link))
+ goto out;
+ }
+
+ if (load_state_from_tss16(vcpu, &tss_segment_16))
+ goto out;
+
+ ret = 1;
+out:
+ return ret;
+}
+
+static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
+ u16 old_tss_sel, u32 old_tss_base,
+ struct kvm_desc_struct *nseg_desc)
+{
+ struct tss_segment_32 tss_segment_32;
+ int ret = 0;
+
+ if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
+ sizeof tss_segment_32))
+ goto out;
+
+ save_state_to_tss32(vcpu, &tss_segment_32);
+
+ if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
+ sizeof tss_segment_32))
+ goto out;
+
+ if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
+ &tss_segment_32, sizeof tss_segment_32))
+ goto out;
+
+ if (old_tss_sel != 0xffff) {
+ tss_segment_32.prev_task_link = old_tss_sel;
+
+ if (kvm_write_guest(vcpu->kvm,
+ get_tss_base_addr_write(vcpu, nseg_desc),
+ &tss_segment_32.prev_task_link,
+ sizeof tss_segment_32.prev_task_link))
+ goto out;
+ }
+
+ if (load_state_from_tss32(vcpu, &tss_segment_32))
+ goto out;
+
+ ret = 1;
+out:
+ return ret;
+}
+
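kvm_task_switch() below dispatches on bit 3 of the TSS descriptor type: types 9/11 are the available/busy 32-bit TSS, types 1/3 their 16-bit counterparts, and bit 1 is the busy flag that the code clears on the outgoing task and sets on the incoming one. A sketch of those predicates (hypothetical helpers, not part of the patch):

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical predicates over a system (S=0) TSS descriptor type. */
static bool tss_is_32bit(uint8_t type) { return type & 8; } /* 9 or 11 */
static bool tss_is_busy(uint8_t type) { return type & 2; }  /* 3 or 11 */

static uint8_t tss_set_busy(uint8_t type) { return type | (1 << 1); }
static uint8_t tss_clear_busy(uint8_t type) { return (uint8_t)(type & ~(1 << 1)); }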
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
+{
+ struct kvm_segment tr_seg;
+ struct kvm_desc_struct cseg_desc;
+ struct kvm_desc_struct nseg_desc;
+ int ret = 0;
+ u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
+ u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
+ u32 desc_limit;
+
+ old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL);
+
+ /* FIXME: Handle errors. Failure to read either TSS or their
+ * descriptors should generate a page fault.
+ */
+ if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
+ goto out;
+
+ if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
+ goto out;
+
+ if (reason != TASK_SWITCH_IRET) {
+ int cpl;
+
+ cpl = kvm_x86_ops->get_cpl(vcpu);
+ if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) {
+ kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+ return 1;
+ }
+ }
+
+ desc_limit = kvm_get_desc_limit(&nseg_desc);
+ if (!nseg_desc.p ||
+ ((desc_limit < 0x67 && (nseg_desc.type & 8)) ||
+ desc_limit < 0x2b)) {
+ kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
+ return 1;
+ }
+
+ if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
+ cseg_desc.type &= ~(1 << 1); /* clear the B flag */
+ save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
+ }
+
+ if (reason == TASK_SWITCH_IRET) {
+ u32 eflags = kvm_get_rflags(vcpu);
+ kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
+ }
+
+ /* set back link to prev task only if NT bit is set in eflags;
+ note that old_tss_sel is not used after this point */
+ if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
+ old_tss_sel = 0xffff;
+
+ if (nseg_desc.type & 8)
+ ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
+ old_tss_base, &nseg_desc);
+ else
+ ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
+ old_tss_base, &nseg_desc);
+
+ if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
+ u32 eflags = kvm_get_rflags(vcpu);
+ kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT);
+ }
+
+ if (reason != TASK_SWITCH_IRET) {
+ nseg_desc.type |= (1 << 1);
+ save_guest_segment_descriptor(vcpu, tss_selector,
+ &nseg_desc);
+ }
+
+ kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0(vcpu) | X86_CR0_TS);
+ seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
+ tr_seg.type = 11;
+ kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
+out:
+ return ret;
}
EXPORT_SYMBOL_GPL(kvm_task_switch);

@@ -5627,19 +5252,20 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
{
int mmu_reset_needed = 0;
int pending_vec, max_bits;
- struct kvm_desc_ptr dt;
+ struct descriptor_table dt;

- dt.size = sregs->idt.limit;
- dt.address = sregs->idt.base;
+ vcpu_load(vcpu);
+
+ dt.limit = sregs->idt.limit;
+ dt.base = sregs->idt.base;
kvm_x86_ops->set_idt(vcpu, &dt);
- dt.size = sregs->gdt.limit;
- dt.address = sregs->gdt.base;
+ dt.limit = sregs->gdt.limit;
+ dt.base = sregs->gdt.base;
kvm_x86_ops->set_gdt(vcpu, &dt);

vcpu->arch.cr2 = sregs->cr2;
- mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
+ mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
vcpu->arch.cr3 = sregs->cr3;
- __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);

kvm_set_cr8(vcpu, sregs->cr8);
@@ -5653,10 +5279,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
kvm_x86_ops->set_cr4(vcpu, sregs->cr4);

- if (sregs->cr4 & X86_CR4_OSXSAVE)
- update_cpuid(vcpu);
if (!is_long_mode(vcpu) && is_pae(vcpu)) {
- load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
+ load_pdptrs(vcpu, vcpu->arch.cr3);
mmu_reset_needed = 1;
}
@@ -5691,7 +5315,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
!is_protmode(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;

- kvm_make_request(KVM_REQ_EVENT, vcpu);
+ vcpu_put(vcpu);

return 0;
}
@@ -5702,10 +5326,12 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
unsigned long rflags;
int i, r;

+ vcpu_load(vcpu);
+
if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
r = -EBUSY;
if (vcpu->arch.exception.pending)
- goto out;
+ goto unlock_out;
if (dbg->control & KVM_GUESTDBG_INJECT_DB)
kvm_queue_exception(vcpu, DB_VECTOR);
else
@@ -5733,9 +5359,11 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
}

- if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
- vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
- get_segment_base(vcpu, VCPU_SREG_CS);
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
+ vcpu->arch.singlestep_cs =
+ get_segment_selector(vcpu, VCPU_SREG_CS);
+ vcpu->arch.singlestep_rip = kvm_rip_read(vcpu);
+ }

/*
* Trigger an rflags update that will inject or remove the trace
@@ -5747,12 +5375,34 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
r = 0;

-out:
+unlock_out:
+ vcpu_put(vcpu);

return r;
}

/*
+ * fxsave fpu state. Taken from x86_64/processor.h. To be killed when
+ * we have asm/x86/processor.h
+ */
+struct fxsave {
+ u16 cwd;
+ u16 swd;
+ u16 twd;
+ u16 fop;
+ u64 rip;
+ u64 rdp;
+ u32 mxcsr;
+ u32 mxcsr_mask;
+ u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
+#ifdef CONFIG_X86_64
+ u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
+#else
+ u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
+#endif
+};
+
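The fxsave image that kvm_fx_save()/kvm_fx_restore() operate on further down corresponds to the hardware FXSAVE/FXRSTOR area: 512 bytes that must be 16-byte aligned (hence the BUG_ON alignment check added in kvm_arch_vcpu_setup() below); the struct above only names the fields KVM copies. A minimal sketch of driving the instructions directly (hypothetical, x86-only, gcc-style inline asm):

#include <stdint.h>

/* FXSAVE/FXRSTOR use a 512-byte area that must be 16-byte aligned. */
static uint8_t fx_area[512] __attribute__((aligned(16)));

static void fx_save_area(void)
{
	__asm__ __volatile__("fxsave %0" : "=m" (fx_area));
}

static void fx_restore_area(void)
{
	__asm__ __volatile__("fxrstor %0" : : "m" (fx_area));
}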
+/*
* Translate a guest virtual address to a guest physical address.
*/
int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
@@ -5762,21 +5412,24 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
gpa_t gpa;
int idx;

- idx = srcu_read_lock(&vcpu->kvm->srcu);
+ vcpu_load(vcpu);
+ idx = kvm_srcu_read_lock(&vcpu->kvm->srcu);
gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ kvm_srcu_read_unlock(&vcpu->kvm->srcu, idx);
tr->physical_address = gpa;
tr->valid = gpa != UNMAPPED_GVA;
tr->writeable = 1;
tr->usermode = 0;
+ vcpu_put(vcpu);

return 0;
}

int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
- struct kvm_i387_fxsave_struct *fxsave =
- &vcpu->arch.guest_fpu.state->fxsave;
+ struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
+
+ vcpu_load(vcpu);

memcpy(fpu->fpr, fxsave->st_space, 128);
fpu->fcw = fxsave->cwd;
@@ -5787,13 +5440,16 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
fpu->last_dp = fxsave->rdp;
memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);

+ vcpu_put(vcpu);
+
return 0;
}

int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
- struct kvm_i387_fxsave_struct *fxsave =
- &vcpu->arch.guest_fpu.state->fxsave;
+ struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
+
+ vcpu_load(vcpu);

memcpy(fxsave->st_space, fpu->fpr, 128);
fxsave->cwd = fpu->fcw;
@@ -5804,63 +5460,61 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
fxsave->rdp = fpu->last_dp;
memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);

+ vcpu_put(vcpu);
+
return 0;
}

-int fx_init(struct kvm_vcpu *vcpu)
+void fx_init(struct kvm_vcpu *vcpu)
{
- int err;
-
- err = kvm_fpu_alloc(&vcpu->arch.guest_fpu);
- if (err)
- return err;
-
- kvm_fpu_finit(&vcpu->arch.guest_fpu);
+ unsigned after_mxcsr_mask;

/*
- * Ensure guest xcr0 is valid for loading
+ * Touch the fpu the first time in non-atomic context: if
+ * this is the first fpu instruction, the exception handler
+ * will fire before the instruction returns and it'll have to
+ * allocate ram with GFP_KERNEL.
*/
- vcpu->arch.xcr0 = XSTATE_FP;
+ if (!used_math())
+ kvm_fx_save(&vcpu->arch.host_fx_image);

- vcpu->arch.cr0 |= X86_CR0_ET;
+ /* Initialize guest FPU by resetting ours and saving into guest's */
+ preempt_disable();
+ kvm_fx_save(&vcpu->arch.host_fx_image);
+ kvm_fx_finit();
+ kvm_fx_save(&vcpu->arch.guest_fx_image);
+ kvm_fx_restore(&vcpu->arch.host_fx_image);
+ preempt_enable();

- return 0;
+ vcpu->arch.cr0 |= X86_CR0_ET;
+ after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
+ vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
+ memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
+ 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
}
EXPORT_SYMBOL_GPL(fx_init);

-static void fx_free(struct kvm_vcpu *vcpu)
-{
- kvm_fpu_free(&vcpu->arch.guest_fpu);
-}
-
void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
{
if (vcpu->guest_fpu_loaded)
return;

- /*
- * Restore all possible states in the guest,
- * and assume host would use all available bits.
- * Guest xcr0 would be loaded later.
- */
- kvm_put_guest_xcr0(vcpu);
vcpu->guest_fpu_loaded = 1;
- unlazy_fpu(current);
- kvm_fpu_restore_checking(&vcpu->arch.guest_fpu);
+ kvm_fx_save(&vcpu->arch.host_fx_image);
+ kvm_fx_restore(&vcpu->arch.guest_fx_image);
trace_kvm_fpu(1);
}

void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
{
- kvm_put_guest_xcr0(vcpu);
-
if (!vcpu->guest_fpu_loaded)
return;

vcpu->guest_fpu_loaded = 0;
- kvm_fpu_save_init(&vcpu->arch.guest_fpu);
+ kvm_fx_save(&vcpu->arch.guest_fx_image);
+ kvm_fx_restore(&vcpu->arch.host_fx_image);
++vcpu->stat.fpu_reload;
- kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
+ set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests);
trace_kvm_fpu(0);
}

@@ -5871,18 +5525,12 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
vcpu->arch.time_page = NULL;
}

- free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
- fx_free(vcpu);
kvm_x86_ops->vcpu_free(vcpu);
}

struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
unsigned int id)
{
- if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
- printk_once(KERN_WARNING
- "kvm: SMP vm created on host with unstable TSC; "
- "guest TSC will not be reliable\n");
return kvm_x86_ops->vcpu_create(kvm, id);
}

@@ -5890,6 +5538,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
{
int r;

+ /* We do fxsave: this must be aligned. */
+ BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
+
vcpu->arch.mtrr_state.have_fixed = 1;
vcpu_load(vcpu);
r = kvm_arch_vcpu_reset(vcpu);
@@ -5907,13 +5558,10 @@ free_vcpu:

void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
- vcpu->arch.apf.msr_val = 0;
-
vcpu_load(vcpu);
kvm_mmu_unload(vcpu);
vcpu_put(vcpu);

- fx_free(vcpu);
kvm_x86_ops->vcpu_free(vcpu);
}

@@ -5927,27 +5575,22 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
vcpu->arch.dr6 = DR6_FIXED_1;
vcpu->arch.dr7 = DR7_FIXED_1;

- kvm_make_request(KVM_REQ_EVENT, vcpu);
- vcpu->arch.apf.msr_val = 0;
-
- kvm_clear_async_pf_completion_queue(vcpu);
- kvm_async_pf_hash_reset(vcpu);
- vcpu->arch.apf.halted = false;
-
return kvm_x86_ops->vcpu_reset(vcpu);
}

int kvm_arch_hardware_enable(void *garbage)
{
- struct kvm *kvm;
- struct kvm_vcpu *vcpu;
- int i;
+ /*
+ * Since this may be called from a hotplug notification,
+ * we can't get the CPU frequency directly.
+ */
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+ int cpu = raw_smp_processor_id();
+ per_cpu(cpu_tsc_khz, cpu) = 0;
+ }

kvm_shared_msr_cpu_online();
- list_for_each_entry(kvm, &vm_list, vm_list)
- kvm_for_each_vcpu(i, vcpu, kvm)
- if (vcpu->cpu == smp_processor_id())
- kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+
return kvm_x86_ops->hardware_enable(garbage);
}

@@ -5981,11 +5624,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
BUG_ON(vcpu->kvm == NULL);
kvm = vcpu->kvm;

- vcpu->arch.emulate_ctxt.ops = &emulate_ops;
- vcpu->arch.walk_mmu = &vcpu->arch.mmu;
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
- vcpu->arch.mmu.translate_gpa = translate_gpa;
- vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
else
@@ -5998,9 +5637,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
}
vcpu->arch.pio_data = page_address(page);

- if (!kvm->arch.virtual_tsc_khz)
- kvm_arch_set_tsc_khz(kvm, max_tsc_khz);
-
r = kvm_mmu_create(vcpu);
if (r < 0)
goto fail_free_pio_data;
@@ -6019,14 +5655,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
}
vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;

- if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
- goto fail_free_mce_banks;
-
- kvm_async_pf_hash_reset(vcpu);
-
return 0;
-fail_free_mce_banks:
- kfree(vcpu->arch.mce_banks);
fail_free_lapic:
kvm_free_lapic(vcpu);
fail_mmu_destroy:
@@ -6043,23 +5672,34 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
kfree(vcpu->arch.mce_banks);
kvm_free_lapic(vcpu);
- idx = srcu_read_lock(&vcpu->kvm->srcu);
+ idx = kvm_srcu_read_lock(&vcpu->kvm->srcu);
kvm_mmu_destroy(vcpu);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ kvm_srcu_read_unlock(&vcpu->kvm->srcu, idx);
free_page((unsigned long)vcpu->arch.pio_data);
}

-int kvm_arch_init_vm(struct kvm *kvm)
+struct kvm *kvm_arch_create_vm(void)
{
+ struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
+
+ if (!kvm)
+ return ERR_PTR(-ENOMEM);
+
+ kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
+ if (!kvm->arch.aliases) {
+ kfree(kvm);
+ return ERR_PTR(-ENOMEM);
+ }
+
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);

/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);

- spin_lock_init(&kvm->arch.tsc_write_lock);
+ rdtscll(kvm->arch.vm_init_tsc);

- return 0;
+ return kvm;
}
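kvm_arch_create_vm() above uses the usual kernel unwind pattern for chained allocations: each failure frees everything allocated so far and reports -ENOMEM through ERR_PTR(). A generic userspace sketch of the same shape (hypothetical types, not from this file):

#include <stdlib.h>

struct vm { void *aliases; };

/* Hypothetical: allocate a vm and its aliases table, unwinding on failure. */
static struct vm *vm_create(void)
{
	struct vm *vm = calloc(1, sizeof(*vm));

	if (!vm)
		return NULL;
	vm->aliases = calloc(1, 4096);
	if (!vm->aliases) {
		free(vm); /* undo the earlier allocation before failing */
		return NULL;
	}
	return vm;
}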
static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
@@ -6077,10 +5717,8 @@ static void kvm_free_vcpus(struct kvm *kvm)
/*
* Unpin any mmu pages first.
*/
- kvm_for_each_vcpu(i, vcpu, kvm) {
- kvm_clear_async_pf_completion_queue(vcpu);
+ kvm_for_each_vcpu(i, vcpu, kvm)
kvm_unload_vcpu_mmu(vcpu);
- }
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_arch_vcpu_free(vcpu);

@@ -6095,19 +5733,23 @@ static void kvm_free_vcpus(struct kvm *kvm)
void kvm_arch_sync_events(struct kvm *kvm)
{
kvm_free_all_assigned_devices(kvm);
- kvm_free_pit(kvm);
}

void kvm_arch_destroy_vm(struct kvm *kvm)
{
kvm_iommu_unmap_guest(kvm);
+ kvm_free_pit(kvm);
kfree(kvm->arch.vpic);
kfree(kvm->arch.vioapic);
kvm_free_vcpus(kvm);
+ kvm_free_physmem(kvm);
if (kvm->arch.apic_access_page)
put_page(kvm->arch.apic_access_page);
if (kvm->arch.ept_identity_pagetable)
put_page(kvm->arch.ept_identity_pagetable);
+ kvm_cleanup_srcu_struct(&kvm->srcu);
+ kfree(kvm->arch.aliases);
+ kfree(kvm);
}

int kvm_arch_prepare_memory_region(struct kvm *kvm,
@@ -6117,11 +5759,6 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
int user_alloc)
{
int npages = memslot->npages;
- int map_flags = MAP_PRIVATE | MAP_ANONYMOUS;
-
- /* Prevent internal slot pages from being moved by fork()/COW. */
- if (memslot->id >= KVM_MEMORY_SLOTS)
- map_flags = MAP_SHARED | MAP_ANONYMOUS;

/* To keep backward compatibility with older userspace,
* x86 needs to handle the !user_alloc case.
@@ -6134,7 +5771,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
userspace_addr = do_mmap(NULL, 0,
npages * PAGE_SIZE,
PROT_READ | PROT_WRITE,
- map_flags,
+ MAP_PRIVATE | MAP_ANONYMOUS,
0);
up_write(&current->mm->mmap_sem);

@@ -6188,9 +5825,7 @@ void kvm_arch_flush_shadow(struct kvm *kvm)

int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
- return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
- !vcpu->arch.apf.halted)
- || !list_empty_careful(&vcpu->async_pf.done)
+ return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
|| vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
|| vcpu->arch.nmi_pending ||
(kvm_arch_interrupt_allowed(vcpu) &&
@@ -6209,7 +5844,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
me = get_cpu();
if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
- if (atomic_xchg(&vcpu->guest_mode, 0))
+ if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
kvm_smp_send_reschedule(cpu);
put_cpu();
}
@@ -6219,22 +5854,13 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
return kvm_x86_ops->interrupt_allowed(vcpu);
}

-bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
-{
- unsigned long current_rip = kvm_rip_read(vcpu) +
- get_segment_base(vcpu, VCPU_SREG_CS);
-
- return current_rip == linear_rip;
-}
-EXPORT_SYMBOL_GPL(kvm_is_linear_rip);
-
unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
{
unsigned long rflags;

rflags = kvm_x86_ops->get_rflags(vcpu);
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
- rflags &= ~X86_EFLAGS_TF;
+ rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF);
return rflags;
}
EXPORT_SYMBOL_GPL(kvm_get_rflags);
@@ -6242,154 +5868,14 @@ EXPORT_SYMBOL_GPL(kvm_get_rflags);
void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
- kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
- rflags |= X86_EFLAGS_TF;
+ vcpu->arch.singlestep_cs ==
+ get_segment_selector(vcpu, VCPU_SREG_CS) &&
+ vcpu->arch.singlestep_rip == kvm_rip_read(vcpu))
+ rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
kvm_x86_ops->set_rflags(vcpu, rflags);
- kvm_make_request(KVM_REQ_EVENT, vcpu);
}
EXPORT_SYMBOL_GPL(kvm_set_rflags);

-void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
-{
- int r;
-
- if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) ||
- is_error_page(work->page))
- return;
-
- r = kvm_mmu_reload(vcpu);
- if (unlikely(r))
- return;
-
- if (!vcpu->arch.mmu.direct_map &&
- work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu))
- return;
-
- vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true);
-}
-
-static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
-{
- return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
-}
-
-static inline u32 kvm_async_pf_next_probe(u32 key)
-{
- return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);
-}
-
-static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
- u32 key = kvm_async_pf_hash_fn(gfn);
-
- while (vcpu->arch.apf.gfns[key] != ~0)
- key = kvm_async_pf_next_probe(key);
-
- vcpu->arch.apf.gfns[key] = gfn;
-}
-
-static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
- int i;
- u32 key = kvm_async_pf_hash_fn(gfn);
-
- for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
- (vcpu->arch.apf.gfns[key] != gfn &&
- vcpu->arch.apf.gfns[key] != ~0); i++)
- key = kvm_async_pf_next_probe(key);
-
- return key;
-}
-
-bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
- return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
-}
-
-static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
- u32 i, j, k;
-
- i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
- while (true) {
- vcpu->arch.apf.gfns[i] = ~0;
- do {
- j = kvm_async_pf_next_probe(j);
- if (vcpu->arch.apf.gfns[j] == ~0)
- return;
- k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
- /*
- * k lies cyclically in ]i,j]
- * | i.k.j |
- * |....j i.k.| or |.k..j i...|
- */
- } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
- vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
- i = j;
- }
-}
-
-static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
-{
-
- return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
- sizeof(val));
-}
-
-void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
- struct kvm_async_pf *work)
-{
- struct x86_exception fault;
-
- trace_kvm_async_pf_not_present(work->arch.token, work->gva);
- kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
-
- if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
- (vcpu->arch.apf.send_user_only &&
- kvm_x86_ops->get_cpl(vcpu) == 0))
- kvm_make_request(KVM_REQ_APF_HALT, vcpu);
- else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
- fault.vector = PF_VECTOR;
- fault.error_code_valid = true;
- fault.error_code = 0;
- fault.nested_page_fault = false;
- fault.address = work->arch.token;
- kvm_inject_page_fault(vcpu, &fault);
- }
-}
-
-void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
- struct kvm_async_pf *work)
-{
- struct x86_exception fault;
-
- trace_kvm_async_pf_ready(work->arch.token, work->gva);
- if (is_error_page(work->page))
- work->arch.token = ~0; /* broadcast wakeup */
- else
- kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
-
- if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) &&
- !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
- fault.vector = PF_VECTOR;
- fault.error_code_valid = true;
- fault.error_code = 0;
- fault.nested_page_fault = false;
- fault.address = work->arch.token;
- kvm_inject_page_fault(vcpu, &fault);
- }
- vcpu->arch.apf.halted = false;
-}
-
-bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
-{
- if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
- return true;
- else
- return !kvm_event_needs_reinjection(vcpu) &&
- kvm_x86_ops->interrupt_allowed(vcpu);
-}
-
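The removed kvm_del_async_pf_gfn() above is Knuth-style deletion from a linear-probed hash table: after slot i is emptied, the scan moves back any later entry whose home slot k falls outside the cyclic interval (i, j], so lookups never hit a hole mid-chain. A compact sketch of that interval test (hypothetical helper mirroring the removed while-condition):

#include <stdbool.h>
#include <stdint.h>

/*
 * Hypothetical: is home slot k inside the cyclic interval (i, j]?
 * If it is, the entry at j may stay where it is; otherwise it must
 * move into the freshly emptied slot i to keep probe chains intact.
 */
static bool home_in_cyclic_range(uint32_t i, uint32_t j, uint32_t k)
{
	return (i <= j) ? (i < k && k <= j) : (i < k || k <= j);
}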
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
@@ -6401,4 +5887,3 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
-EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);