path: root/linux/x86/x86.c
Diffstat (limited to 'linux/x86/x86.c')
-rw-r--r--  linux/x86/x86.c | 3733
1 file changed, 1609 insertions(+), 2124 deletions(-)
diff --git a/linux/x86/x86.c b/linux/x86/x86.c
index a6233d2..1071014 100644
--- a/linux/x86/x86.c
+++ b/linux/x86/x86.c
@@ -46,7 +46,6 @@
* Copyright (C) 2006 Qumranet, Inc.
* Copyright (C) 2008 Qumranet, Inc.
* Copyright IBM Corporation, 2008
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
*
* Authors:
* Avi Kivity <avi@qumranet.com>
@@ -81,23 +80,17 @@
#include <linux/user-return-notifier.h>
#include <linux/srcu.h>
#include <linux/slab.h>
-#include <linux/perf_event.h>
-#include <linux/uaccess.h>
-#include <linux/hash.h>
#include <trace/events/kvm.h>
-
+#undef TRACE_INCLUDE_FILE
#define CREATE_TRACE_POINTS
#include "trace.h"
#include <asm/debugreg.h>
+#include <asm/uaccess.h>
#include <asm/msr.h>
#include <asm/desc.h>
#include <asm/mtrr.h>
#include <asm/mce.h>
-#include <asm/i387.h>
-#include <asm/xcr.h>
-#include <asm/pvclock.h>
-#include <asm/div64.h>
#define MAX_IO_MSRS 256
#define CR0_RESERVED_BITS \
@@ -108,13 +101,12 @@
(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
| X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
| X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
- | X86_CR4_OSXSAVE \
| X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
#define KVM_MAX_MCE_BANKS 32
-#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
+#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P
/* EFER defaults:
* - enable syscall per default because it's emulated by KVM
@@ -172,7 +164,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "halt_exits", VCPU_STAT(halt_exits) },
{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
{ "hypercalls", VCPU_STAT(hypercalls) },
- { "request_irq", VCPU_STAT(request_irq_exits) },
+ { "kvm_request_irq", VCPU_STAT(request_irq_exits) },
{ "irq_exits", VCPU_STAT(irq_exits) },
{ "host_state_reload", VCPU_STAT(host_state_reload) },
{ "efer_reload", VCPU_STAT(efer_reload) },
@@ -194,15 +186,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ NULL }
};
-u64 __read_mostly host_xcr0;
-
-static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
-{
- int i;
- for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
- vcpu->arch.apf.gfns[i] = ~0;
-}
-
static void kvm_on_user_return(struct kvm_user_return_notifier *urn)
{
unsigned slot;
@@ -280,6 +263,34 @@ static void drop_user_return_notifiers(void *ignore)
kvm_on_user_return(&smsr->urn);
}
+unsigned long segment_base(u16 selector)
+{
+ struct descriptor_table gdt;
+ struct kvm_desc_struct *d;
+ unsigned long table_base;
+ unsigned long v;
+
+ if (selector == 0)
+ return 0;
+
+ kvm_get_gdt(&gdt);
+ table_base = gdt.base;
+
+ if (selector & 4) { /* from ldt */
+ u16 ldt_selector = kvm_read_ldt();
+
+ table_base = segment_base(ldt_selector);
+ }
+ d = (struct kvm_desc_struct *)(table_base + (selector & ~7));
+ v = kvm_get_desc_base(d);
+#ifdef CONFIG_X86_64
+ if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
+ v |= ((unsigned long)((struct kvm_ldttss_desc64 *)d)->base3) << 32;
+#endif
+ return v;
+}
+EXPORT_SYMBOL_GPL(segment_base);
+
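As an aside, the "selector & 4" and "selector & ~7" tests in segment_base() above decode standard x86 selector fields; a minimal userspace sketch of the same decoding (the selector value 0x2b is illustrative only):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint16_t selector = 0x2b;             /* illustrative value only */
	unsigned rpl   = selector & 3;        /* requested privilege level */
	unsigned ti    = (selector >> 2) & 1; /* table indicator: the "& 4" test above */
	unsigned index = selector >> 3;       /* descriptor index: the "& ~7" mask above */

	printf("index=%u table=%s rpl=%u\n", index, ti ? "LDT" : "GDT", rpl);
	return 0;
}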
u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
{
if (irqchip_in_kernel(vcpu->kvm))
@@ -321,21 +332,17 @@ static int exception_class(int vector)
}
static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
- unsigned nr, bool has_error, u32 error_code,
- bool reinject)
+ unsigned nr, bool has_error, u32 error_code)
{
u32 prev_nr;
int class1, class2;
- kvm_make_request(KVM_REQ_EVENT, vcpu);
-
if (!vcpu->arch.exception.pending) {
queue:
vcpu->arch.exception.pending = true;
vcpu->arch.exception.has_error_code = has_error;
vcpu->arch.exception.nr = nr;
vcpu->arch.exception.error_code = error_code;
- vcpu->arch.exception.reinject = reinject;
return;
}
@@ -343,7 +350,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
prev_nr = vcpu->arch.exception.nr;
if (prev_nr == DF_VECTOR) {
/* triple fault -> shutdown */
- kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
return;
}
class1 = exception_class(prev_nr);
@@ -364,59 +371,30 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
- kvm_multiple_exception(vcpu, nr, false, 0, false);
+ kvm_multiple_exception(vcpu, nr, false, 0);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception);
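For reference, the class merging in kvm_multiple_exception() above implements the Intel SDM's double-fault table; a minimal standalone restatement (merges_to_df() is an invented name, not kernel code):

/*
 * Two contributory exceptions, or a page fault followed by a
 * contributory exception or another page fault, merge into #DF;
 * a further fault while delivering #DF is a triple fault.
 */
enum exc_class { EXCPT_BENIGN, EXCPT_CONTRIBUTORY, EXCPT_PF };

static int merges_to_df(enum exc_class prev, enum exc_class cur)
{
	return (prev == EXCPT_CONTRIBUTORY && cur == EXCPT_CONTRIBUTORY)
	    || (prev == EXCPT_PF && cur != EXCPT_BENIGN);
}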
-void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
-{
- kvm_multiple_exception(vcpu, nr, false, 0, true);
-}
-EXPORT_SYMBOL_GPL(kvm_requeue_exception);
-
-void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
-{
- if (err)
- kvm_inject_gp(vcpu, 0);
- else
- kvm_x86_ops->skip_emulated_instruction(vcpu);
-}
-EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
-
-void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
+ u32 error_code)
{
++vcpu->stat.pf_guest;
- vcpu->arch.cr2 = fault->address;
- kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
-}
-
-void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
-{
- if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
- vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
- else
- vcpu->arch.mmu.inject_page_fault(vcpu, fault);
+ vcpu->arch.cr2 = addr;
+ kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
}
void kvm_inject_nmi(struct kvm_vcpu *vcpu)
{
- kvm_make_request(KVM_REQ_EVENT, vcpu);
vcpu->arch.nmi_pending = 1;
}
EXPORT_SYMBOL_GPL(kvm_inject_nmi);
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
- kvm_multiple_exception(vcpu, nr, true, error_code, false);
+ kvm_multiple_exception(vcpu, nr, true, error_code);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
-void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
-{
- kvm_multiple_exception(vcpu, nr, true, error_code, true);
-}
-EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
-
/*
* Checks if cpl <= required_cpl; if true, return true. Otherwise queue
* a #GP and return false.
@@ -431,49 +409,18 @@ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
EXPORT_SYMBOL_GPL(kvm_require_cpl);
/*
- * This function will be used to read from the physical memory of the currently
- * running guest. The difference to kvm_read_guest_page is that this function
- * can read from guest physical or from the guest's guest physical memory.
- */
-int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
- gfn_t ngfn, void *data, int offset, int len,
- u32 access)
-{
- gfn_t real_gfn;
- gpa_t ngpa;
-
- ngpa = gfn_to_gpa(ngfn);
- real_gfn = mmu->translate_gpa(vcpu, ngpa, access);
- if (real_gfn == UNMAPPED_GVA)
- return -EFAULT;
-
- real_gfn = gpa_to_gfn(real_gfn);
-
- return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len);
-}
-EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
-
-int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
- void *data, int offset, int len, u32 access)
-{
- return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
- data, offset, len, access);
-}
-
-/*
* Load the pae pdptrs. Return true if they are all valid.
*/
-int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
+int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
{
gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
int i;
int ret;
- u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
+ u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
- ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
- offset * sizeof(u64), sizeof(pdpte),
- PFERR_USER_MASK|PFERR_WRITE_MASK);
+ ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
+ offset * sizeof(u64), sizeof(pdpte));
if (ret < 0) {
ret = 0;
goto out;
@@ -487,7 +434,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
}
ret = 1;
- memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
+ memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
__set_bit(VCPU_EXREG_PDPTR,
(unsigned long *)&vcpu->arch.regs_avail);
__set_bit(VCPU_EXREG_PDPTR,
@@ -500,10 +447,8 @@ EXPORT_SYMBOL_GPL(load_pdptrs);
static bool pdptrs_changed(struct kvm_vcpu *vcpu)
{
- u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
+ u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
bool changed = true;
- int offset;
- gfn_t gfn;
int r;
if (is_long_mode(vcpu) || !is_pae(vcpu))
@@ -513,181 +458,132 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
(unsigned long *)&vcpu->arch.regs_avail))
return true;
- gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT;
- offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1);
- r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
- PFERR_USER_MASK | PFERR_WRITE_MASK);
+ r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
if (r < 0)
goto out;
- changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
+ changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
out:
return changed;
}
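The offset computation in load_pdptrs() above is terse; a minimal worked sketch of the same arithmetic, assuming 4 KiB pages (the CR3 value is invented for illustration):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t cr3 = 0x12345060;	/* invented example value */
	unsigned page_mask = 4096 - 1;
	/* In PAE mode the 32-byte-aligned PDPT may sit anywhere in its
	 * page; this recovers its offset in u64 (8-byte) units. */
	unsigned offset = ((cr3 & page_mask) >> 5) << 2;

	printf("PDPT starts at byte %u within its page\n", offset * 8);
	return 0;
}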
-int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
- unsigned long old_cr0 = kvm_read_cr0(vcpu);
- unsigned long update_bits = X86_CR0_PG | X86_CR0_WP |
- X86_CR0_CD | X86_CR0_NW;
-
cr0 |= X86_CR0_ET;
#ifdef CONFIG_X86_64
- if (cr0 & 0xffffffff00000000UL)
- return 1;
+ if (cr0 & 0xffffffff00000000UL) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
#endif
cr0 &= ~CR0_RESERVED_BITS;
- if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
- return 1;
+ if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
- if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
- return 1;
+ if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
#ifdef CONFIG_X86_64
if ((vcpu->arch.efer & EFER_LME)) {
int cs_db, cs_l;
- if (!is_pae(vcpu))
- return 1;
+ if (!is_pae(vcpu)) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
- if (cs_l)
- return 1;
+ if (cs_l) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+
+ }
} else
#endif
- if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
- kvm_read_cr3(vcpu)))
- return 1;
+ if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
+
}
kvm_x86_ops->set_cr0(vcpu, cr0);
+ vcpu->arch.cr0 = cr0;
- if ((cr0 ^ old_cr0) & X86_CR0_PG)
- kvm_clear_async_pf_completion_queue(vcpu);
-
- if ((cr0 ^ old_cr0) & update_bits)
- kvm_mmu_reset_context(vcpu);
- return 0;
+ kvm_mmu_reset_context(vcpu);
+ return;
}
EXPORT_SYMBOL_GPL(kvm_set_cr0);
void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
- (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
+ kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0ful) | (msw & 0x0f));
}
EXPORT_SYMBOL_GPL(kvm_lmsw);
-int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
+void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
- u64 xcr0;
-
- /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */
- if (index != XCR_XFEATURE_ENABLED_MASK)
- return 1;
- xcr0 = xcr;
- if (kvm_x86_ops->get_cpl(vcpu) != 0)
- return 1;
- if (!(xcr0 & XSTATE_FP))
- return 1;
- if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
- return 1;
- if (xcr0 & ~host_xcr0)
- return 1;
- vcpu->arch.xcr0 = xcr0;
- vcpu->guest_xcr0_loaded = 0;
- return 0;
-}
+ unsigned long old_cr4 = kvm_read_cr4(vcpu);
+ unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
-int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
-{
- if (__kvm_set_xcr(vcpu, index, xcr)) {
+ if (cr4 & CR4_RESERVED_BITS) {
kvm_inject_gp(vcpu, 0);
- return 1;
- }
- return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_set_xcr);
-
-static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best;
-
- best = kvm_find_cpuid_entry(vcpu, 1, 0);
- return best && (best->ecx & bit(X86_FEATURE_XSAVE));
-}
-
-static void update_cpuid(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best;
-
- best = kvm_find_cpuid_entry(vcpu, 1, 0);
- if (!best)
return;
-
- /* Update OSXSAVE bit */
- if (kvm_cpu_has_xsave && best->function == 0x1) {
- best->ecx &= ~(bit(X86_FEATURE_OSXSAVE));
- if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
- best->ecx |= bit(X86_FEATURE_OSXSAVE);
}
-}
-
-int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
-{
- unsigned long old_cr4 = kvm_read_cr4(vcpu);
- unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
-
- if (cr4 & CR4_RESERVED_BITS)
- return 1;
-
- if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE))
- return 1;
if (is_long_mode(vcpu)) {
- if (!(cr4 & X86_CR4_PAE))
- return 1;
+ if (!(cr4 & X86_CR4_PAE)) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
&& ((cr4 ^ old_cr4) & pdptr_bits)
- && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
- kvm_read_cr3(vcpu)))
- return 1;
-
- if (cr4 & X86_CR4_VMXE)
- return 1;
+ && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
+ if (cr4 & X86_CR4_VMXE) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
kvm_x86_ops->set_cr4(vcpu, cr4);
-
- if ((cr4 ^ old_cr4) & pdptr_bits)
- kvm_mmu_reset_context(vcpu);
-
- if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
- update_cpuid(vcpu);
-
- return 0;
+ vcpu->arch.cr4 = cr4;
+ vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
+ kvm_mmu_reset_context(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_set_cr4);
-int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
- if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
+ if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
kvm_mmu_sync_roots(vcpu);
kvm_mmu_flush_tlb(vcpu);
- return 0;
+ return;
}
if (is_long_mode(vcpu)) {
- if (cr3 & CR3_L_MODE_RESERVED_BITS)
- return 1;
+ if (cr3 & CR3_L_MODE_RESERVED_BITS) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
} else {
if (is_pae(vcpu)) {
- if (cr3 & CR3_PAE_RESERVED_BITS)
- return 1;
- if (is_paging(vcpu) &&
- !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
- return 1;
+ if (cr3 & CR3_PAE_RESERVED_BITS) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
+ if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
}
/*
* We don't check reserved bits in nonpae mode, because
@@ -705,23 +601,24 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
* to debug) behavior on the guest side.
*/
if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
- return 1;
- vcpu->arch.cr3 = cr3;
- __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
- vcpu->arch.mmu.new_cr3(vcpu);
- return 0;
+ kvm_inject_gp(vcpu, 0);
+ else {
+ vcpu->arch.cr3 = cr3;
+ vcpu->arch.mmu.new_cr3(vcpu);
+ }
}
EXPORT_SYMBOL_GPL(kvm_set_cr3);
-int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
+void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
- if (cr8 & CR8_RESERVED_BITS)
- return 1;
+ if (cr8 & CR8_RESERVED_BITS) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
if (irqchip_in_kernel(vcpu->kvm))
kvm_lapic_set_tpr(vcpu, cr8);
else
vcpu->arch.cr8 = cr8;
- return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr8);
@@ -734,90 +631,11 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_get_cr8);
-static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
+static inline u32 bit(int bitno)
{
- switch (dr) {
- case 0 ... 3:
- vcpu->arch.db[dr] = val;
- if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
- vcpu->arch.eff_db[dr] = val;
- break;
- case 4:
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
- return 1; /* #UD */
- /* fall through */
- case 6:
- if (val & 0xffffffff00000000ULL)
- return -1; /* #GP */
- vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
- break;
- case 5:
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
- return 1; /* #UD */
- /* fall through */
- default: /* 7 */
- if (val & 0xffffffff00000000ULL)
- return -1; /* #GP */
- vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
- if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
- kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
- vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK);
- }
- break;
- }
-
- return 0;
+ return 1 << (bitno & 31);
}
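For reference, bit() is used later in this file with the kernel's CPUID feature-word constants; a minimal sketch, assuming X86_FEATURE_SVM's kernel encoding of word 6, bit 2 (6*32 + 2 = 194):

/* bit(194) == 1 << (194 & 31) == 1 << 2, matching CPUID
 * 0x80000001.ECX bit 2 (SVM). Illustrative only. */
static int cpuid_ecx_has_svm(u32 ecx_80000001)
{
	return (ecx_80000001 & bit(194)) != 0;
}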
-int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
-{
- int res;
-
- res = __kvm_set_dr(vcpu, dr, val);
- if (res > 0)
- kvm_queue_exception(vcpu, UD_VECTOR);
- else if (res < 0)
- kvm_inject_gp(vcpu, 0);
-
- return res;
-}
-EXPORT_SYMBOL_GPL(kvm_set_dr);
-
-static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
-{
- switch (dr) {
- case 0 ... 3:
- *val = vcpu->arch.db[dr];
- break;
- case 4:
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
- return 1;
- /* fall through */
- case 6:
- *val = vcpu->arch.dr6;
- break;
- case 5:
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
- return 1;
- /* fall through */
- default: /* 7 */
- *val = vcpu->arch.dr7;
- break;
- }
-
- return 0;
-}
-
-int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
-{
- if (_kvm_get_dr(vcpu, dr, val)) {
- kvm_queue_exception(vcpu, UD_VECTOR);
- return 1;
- }
- return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_get_dr);
-
/*
* List of msr numbers which we expose to userspace through KVM_GET_MSRS
* and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
@@ -827,67 +645,67 @@ EXPORT_SYMBOL_GPL(kvm_get_dr);
* kvm-specific. Those are put in the beginning of the list.
*/
-#define KVM_SAVE_MSRS_BEGIN 8
+#define KVM_SAVE_MSRS_BEGIN 5
static u32 msrs_to_save[] = {
MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
- MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
- HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN,
+ HV_X64_MSR_APIC_ASSIST_PAGE,
MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
- MSR_STAR,
+ MSR_K6_STAR,
#ifdef CONFIG_X86_64
MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
- MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
+ MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
};
static unsigned num_msrs_to_save;
static u32 emulated_msrs[] = {
MSR_IA32_MISC_ENABLE,
- MSR_IA32_MCG_STATUS,
- MSR_IA32_MCG_CTL,
};
-static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
+static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
- u64 old_efer = vcpu->arch.efer;
-
- if (efer & efer_reserved_bits)
- return 1;
+ if (efer & efer_reserved_bits) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
if (is_paging(vcpu)
- && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
- return 1;
+ && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
if (efer & EFER_FFXSR) {
struct kvm_cpuid_entry2 *feat;
feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
- if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT)))
- return 1;
+ if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
}
if (efer & EFER_SVME) {
struct kvm_cpuid_entry2 *feat;
feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
- if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM)))
- return 1;
+ if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
}
+ kvm_x86_ops->set_efer(vcpu, efer);
+
efer &= ~EFER_LMA;
efer |= vcpu->arch.efer & EFER_LMA;
- kvm_x86_ops->set_efer(vcpu, efer);
+ vcpu->arch.efer = efer;
vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
-
- /* Update reserved bits */
- if ((efer ^ old_efer) & EFER_NX)
- kvm_mmu_reset_context(vcpu);
-
- return 0;
+ kvm_mmu_reset_context(vcpu);
}
void kvm_enable_efer_bits(u64 mask)
@@ -917,28 +735,20 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
{
- int version;
- int r;
+ static int version;
struct pvclock_wall_clock wc;
struct timespec boot;
if (!wall_clock)
return;
- r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
- if (r)
- return;
-
- if (version & 1)
- ++version; /* first time write, random junk */
-
- ++version;
+ version++;
kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
/*
* The guest calculates current wall clock time by adding
- * system time (updated by kvm_guest_time_update below) to the
+ * system time (updated by kvm_write_guest_time below) to the
* wall clock specified here. guest system time equals host
* system time for us, thus we must fill in host boot time here.
*/
@@ -966,230 +776,64 @@ static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
return quotient;
}
-static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
- s8 *pshift, u32 *pmultiplier)
+static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
{
- uint64_t scaled64;
+ uint64_t nsecs = 1000000000LL;
int32_t shift = 0;
uint64_t tps64;
uint32_t tps32;
- tps64 = base_khz * 1000LL;
- scaled64 = scaled_khz * 1000LL;
- while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
+ tps64 = tsc_khz * 1000LL;
+ while (tps64 > nsecs*2) {
tps64 >>= 1;
shift--;
}
tps32 = (uint32_t)tps64;
- while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
- if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
- scaled64 >>= 1;
- else
- tps32 <<= 1;
+ while (tps32 <= (uint32_t)nsecs) {
+ tps32 <<= 1;
shift++;
}
- *pshift = shift;
- *pmultiplier = div_frac(scaled64, tps32);
+ hv_clock->tsc_shift = shift;
+ hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
- pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n",
- __func__, base_khz, scaled_khz, shift, *pmultiplier);
-}
-
-static inline u64 get_kernel_ns(void)
-{
- struct timespec ts;
-
- WARN_ON(preemptible());
- ktime_get_ts(&ts);
- kvm_monotonic_to_bootbased(&ts);
- return timespec_to_ns(&ts);
+ pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
+ __func__, tsc_khz, hv_clock->tsc_shift,
+ hv_clock->tsc_to_system_mul);
}
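The (tsc_shift, tsc_to_system_mul) pair produced above feeds the usual pvclock fixed-point conversion; a minimal sketch, assuming tsc_to_system_mul is a 32.32 fraction (tsc_delta_to_ns() is an invented name that mirrors, not reproduces, the pvclock math):

#include <stdint.h>

static uint64_t tsc_delta_to_ns(uint64_t delta, int8_t shift, uint32_t mul)
{
	/* Pre-scale by 2^shift, then apply the 32.32 multiplier with a
	 * 128-bit intermediate so the product cannot overflow. */
	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;
	return (uint64_t)(((__uint128_t)delta * mul) >> 32);
}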
static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
-unsigned long max_tsc_khz;
-
-static inline int kvm_tsc_changes_freq(void)
-{
- int cpu = get_cpu();
- int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
- cpufreq_quick_get(cpu) != 0;
- put_cpu();
- return ret;
-}
-
-static inline u64 nsec_to_cycles(u64 nsec)
-{
- u64 ret;
-
- WARN_ON(preemptible());
- if (kvm_tsc_changes_freq())
- printk_once(KERN_WARNING
- "kvm: unreliable cycle conversion on adjustable rate TSC\n");
- ret = nsec * kvm___this_cpu_read(cpu_tsc_khz);
- do_div(ret, USEC_PER_SEC);
- return ret;
-}
-static void kvm_arch_set_tsc_khz(struct kvm *kvm, u32 this_tsc_khz)
-{
- /* Compute a scale to convert nanoseconds in TSC cycles */
- kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
- &kvm->arch.virtual_tsc_shift,
- &kvm->arch.virtual_tsc_mult);
- kvm->arch.virtual_tsc_khz = this_tsc_khz;
-}
-
-static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
-{
- u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec,
- vcpu->kvm->arch.virtual_tsc_mult,
- vcpu->kvm->arch.virtual_tsc_shift);
- tsc += vcpu->arch.last_tsc_write;
- return tsc;
-}
-
-void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
-{
- struct kvm *kvm = vcpu->kvm;
- u64 offset, ns, elapsed;
- unsigned long flags;
- s64 sdiff;
-
- spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
- offset = data - kvm_native_read_tsc();
- ns = get_kernel_ns();
- elapsed = ns - kvm->arch.last_tsc_nsec;
- sdiff = data - kvm->arch.last_tsc_write;
- if (sdiff < 0)
- sdiff = -sdiff;
-
- /*
- * Special case: close write to TSC within 5 seconds of
- * another CPU is interpreted as an attempt to synchronize
- * The 5 seconds is to accommodate host load / swapping as
- * well as any reset of TSC during the boot process.
- *
- * In that case, for a reliable TSC, we can match TSC offsets,
- * or make a best guess using the elapsed value.
- */
- if (sdiff < nsec_to_cycles(5ULL * NSEC_PER_SEC) &&
- elapsed < 5ULL * NSEC_PER_SEC) {
- if (!kvm_check_tsc_unstable()) {
- offset = kvm->arch.last_tsc_offset;
- pr_debug("kvm: matched tsc offset for %llu\n", data);
- } else {
- u64 delta = nsec_to_cycles(elapsed);
- offset += delta;
- pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
- }
- ns = kvm->arch.last_tsc_nsec;
- }
- kvm->arch.last_tsc_nsec = ns;
- kvm->arch.last_tsc_write = data;
- kvm->arch.last_tsc_offset = offset;
- kvm_x86_ops->write_tsc_offset(vcpu, offset);
- spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
-
- /* Reset of TSC must disable overshoot protection below */
- vcpu->arch.hv_clock.tsc_timestamp = 0;
- vcpu->arch.last_tsc_write = data;
- vcpu->arch.last_tsc_nsec = ns;
-}
-EXPORT_SYMBOL_GPL(kvm_write_tsc);
-
-static int kvm_guest_time_update(struct kvm_vcpu *v)
+static void kvm_write_guest_time(struct kvm_vcpu *v)
{
+ struct timespec ts;
unsigned long flags;
struct kvm_vcpu_arch *vcpu = &v->arch;
void *shared_kaddr;
unsigned long this_tsc_khz;
- s64 kernel_ns, max_kernel_ns;
- u64 tsc_timestamp;
- /* Keep irq disabled to prevent changes to the clock */
- local_irq_save(flags);
- kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp);
- kernel_ns = get_kernel_ns();
- this_tsc_khz = kvm___this_cpu_read(cpu_tsc_khz);
+ if ((!vcpu->time_page))
+ return;
- if (unlikely(this_tsc_khz == 0)) {
- local_irq_restore(flags);
- kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
- return 1;
- }
-
- /*
- * We may have to catch up the TSC to match elapsed wall clock
- * time for two reasons, even if kvmclock is used.
- * 1) CPU could have been running below the maximum TSC rate
- * 2) Broken TSC compensation resets the base at each VCPU
- * entry to avoid unknown leaps of TSC even when running
- * again on the same CPU. This may cause apparent elapsed
- * time to disappear, and the guest to stand still or run
- * very slowly.
- */
- if (vcpu->tsc_catchup) {
- u64 tsc = compute_guest_tsc(v, kernel_ns);
- if (tsc > tsc_timestamp) {
- kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp);
- tsc_timestamp = tsc;
- }
+ this_tsc_khz = get_cpu_var(cpu_tsc_khz);
+ if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) {
+ kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
+ vcpu->hv_clock_tsc_khz = this_tsc_khz;
}
+ put_cpu_var(cpu_tsc_khz);
+ /* Keep irq disabled to prevent changes to the clock */
+ local_irq_save(flags);
+ kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
+ ktime_get_ts(&ts);
+ kvm_monotonic_to_bootbased(&ts);
local_irq_restore(flags);
- if (!vcpu->time_page)
- return 0;
-
- /*
- * Time as measured by the TSC may go backwards when resetting the base
- * tsc_timestamp. The reason for this is that the TSC resolution is
- * higher than the resolution of the other clock scales. Thus, many
- * possible measurements of the TSC correspond to one measurement of any
- * other clock, and so a spread of values is possible. This is not a
- * problem for the computation of the nanosecond clock; with TSC rates
- * around 1GHZ, there can only be a few cycles which correspond to one
- * nanosecond value, and any path through this code will inevitably
- * take longer than that. However, with the kernel_ns value itself,
- * the precision may be much lower, down to HZ granularity. If the
- * first sampling of TSC against kernel_ns ends in the low part of the
- * range, and the second in the high end of the range, we can get:
- *
- * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new
- *
- * As the sampling errors potentially range in the thousands of cycles,
- * it is possible such a time value has already been observed by the
- * guest. To protect against this, we must compute the system time as
- * observed by the guest and ensure the new system time is greater.
- */
- max_kernel_ns = 0;
- if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) {
- max_kernel_ns = vcpu->last_guest_tsc -
- vcpu->hv_clock.tsc_timestamp;
- max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
- vcpu->hv_clock.tsc_to_system_mul,
- vcpu->hv_clock.tsc_shift);
- max_kernel_ns += vcpu->last_kernel_ns;
- }
-
- if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
- kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz,
- &vcpu->hv_clock.tsc_shift,
- &vcpu->hv_clock.tsc_to_system_mul);
- vcpu->hw_tsc_khz = this_tsc_khz;
- }
-
- if (max_kernel_ns > kernel_ns)
- kernel_ns = max_kernel_ns;
-
/* With all the info we got, fill in the values */
- vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
- vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
- vcpu->last_kernel_ns = kernel_ns;
- vcpu->last_guest_tsc = tsc_timestamp;
- vcpu->hv_clock.flags = 0;
+
+ vcpu->hv_clock.system_time = ts.tv_nsec +
+ (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
/*
* The interface expects us to write an even number signaling that the
@@ -1206,7 +850,16 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
kunmap_atomic(shared_kaddr, KM_USER0);
mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
- return 0;
+}
+
+static int kvm_request_guest_time_update(struct kvm_vcpu *v)
+{
+ struct kvm_vcpu_arch *vcpu = &v->arch;
+
+ if (!vcpu->time_page)
+ return 0;
+ set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests);
+ return 1;
}
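The even/odd version field this code maintains is a seqlock; a minimal sketch of the matching guest-side read loop under pvclock's protocol (read_system_time() is an invented name):

#include <stdint.h>

static uint64_t read_system_time(const volatile uint32_t *version,
				 const volatile uint64_t *system_time)
{
	uint32_t v;
	uint64_t t;

	do {
		v = *version;
		__sync_synchronize();	/* read version before payload */
		t = *system_time;
		__sync_synchronize();	/* re-check version after payload */
	} while ((v & 1) || v != *version);	/* odd = update in flight */

	return t;
}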
static bool msr_mtrr_valid(unsigned msr)
@@ -1469,38 +1122,14 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
return 0;
}
-static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
-{
- gpa_t gpa = data & ~0x3f;
-
- /* Bits 2:5 are reserved, should be zero */
- if (data & 0x3c)
- return 1;
-
- vcpu->arch.apf.msr_val = data;
-
- if (!(data & KVM_ASYNC_PF_ENABLED)) {
- kvm_clear_async_pf_completion_queue(vcpu);
- kvm_async_pf_hash_reset(vcpu);
- return 0;
- }
-
- if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa))
- return 1;
-
- vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
- kvm_async_pf_wakeup_all(vcpu);
- return 0;
-}
-
int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
switch (msr) {
case MSR_EFER:
- return set_efer(vcpu, data);
+ set_efer(vcpu, data);
+ break;
case MSR_K7_HWCR:
data &= ~(u64)0x40; /* ignore flush filter disable */
- data &= ~(u64)0x100; /* ignore ignne emulation enable */
if (data != 0) {
pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
data);
@@ -1543,12 +1172,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
case MSR_IA32_MISC_ENABLE:
vcpu->arch.ia32_misc_enable_msr = data;
break;
- case MSR_KVM_WALL_CLOCK_NEW:
case MSR_KVM_WALL_CLOCK:
vcpu->kvm->arch.wall_clock = data;
kvm_write_wall_clock(vcpu->kvm, data);
break;
- case MSR_KVM_SYSTEM_TIME_NEW:
case MSR_KVM_SYSTEM_TIME: {
if (vcpu->arch.time_page) {
kvm_release_page_dirty(vcpu->arch.time_page);
@@ -1556,7 +1183,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
}
vcpu->arch.time = data;
- kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
/* we verify if the enable bit is set... */
if (!(data & 1))
@@ -1572,12 +1198,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
kvm_release_page_clean(vcpu->arch.time_page);
vcpu->arch.time_page = NULL;
}
+
+ kvm_request_guest_time_update(vcpu);
break;
}
- case MSR_KVM_ASYNC_PF_EN:
- if (kvm_pv_enable_async_pf(vcpu, data))
- return 1;
- break;
case MSR_IA32_MCG_CTL:
case MSR_IA32_MCG_STATUS:
case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
@@ -1612,16 +1236,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
"0x%x data 0x%llx\n", msr, data);
break;
- case MSR_K7_CLK_CTL:
- /*
- * Ignore all writes to this no longer documented MSR.
- * Writes are only relevant for old K7 processors,
- * all pre-dating SVM, but a recommended workaround from
- * AMD for these chips. It is possible to specify the
- * affected processor models on the command line, hence
- * the need to ignore the workaround.
- */
- break;
case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
if (kvm_hv_msr_partition_wide(msr)) {
int r;
@@ -1814,20 +1428,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case 0xcd: /* fsb frequency */
data = 3;
break;
- /*
- * MSR_EBC_FREQUENCY_ID
- * Conservative value valid for even the basic CPU models.
- * Models 0,1: 000 in bits 23:21 indicating a bus speed of
- * 100MHz, model 2 000 in bits 18:16 indicating 100MHz,
- * and 266MHz for model 3, or 4. Set Core Clock
- * Frequency to System Bus Frequency Ratio to 1 (bits
- * 31:24) even though these are only valid for CPU
- * models > 2, however guests may end up dividing or
- * multiplying by zero otherwise.
- */
- case MSR_EBC_FREQUENCY_ID:
- data = 1 << 24;
- break;
case MSR_IA32_APICBASE:
data = kvm_get_apic_base(vcpu);
break;
@@ -1847,16 +1447,11 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
data = vcpu->arch.efer;
break;
case MSR_KVM_WALL_CLOCK:
- case MSR_KVM_WALL_CLOCK_NEW:
data = vcpu->kvm->arch.wall_clock;
break;
case MSR_KVM_SYSTEM_TIME:
- case MSR_KVM_SYSTEM_TIME_NEW:
data = vcpu->arch.time;
break;
- case MSR_KVM_ASYNC_PF_EN:
- data = vcpu->arch.apf.msr_val;
- break;
case MSR_IA32_P5_MC_ADDR:
case MSR_IA32_P5_MC_TYPE:
case MSR_IA32_MCG_CAP:
@@ -1864,18 +1459,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_IA32_MCG_STATUS:
case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
return get_msr_mce(vcpu, msr, pdata);
- case MSR_K7_CLK_CTL:
- /*
- * Provide expected ramp-up count for K7. All other
- * are set to zero, indicating minimum divisors for
- * every field.
- *
- * This prevents guest kernels on AMD host with CPU
- * type 6, model 8 and higher from exploding due to
- * the rdmsr failing.
- */
- data = 0x20000000;
- break;
case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
if (kvm_hv_msr_partition_wide(msr)) {
int r;
@@ -1913,11 +1496,15 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
{
int i, idx;
- idx = srcu_read_lock(&vcpu->kvm->srcu);
+ vcpu_load(vcpu);
+
+ idx = kvm_srcu_read_lock(&vcpu->kvm->srcu);
for (i = 0; i < msrs->nmsrs; ++i)
if (do_msr(vcpu, entries[i].index, &entries[i].data))
break;
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ kvm_srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+ vcpu_put(vcpu);
return i;
}
@@ -1947,7 +1534,7 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *user_msrs,
r = -ENOMEM;
size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
- entries = kmalloc(size, GFP_KERNEL);
+ entries = vmalloc(size);
if (!entries)
goto out;
@@ -1966,7 +1553,7 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *user_msrs,
r = n;
out_free:
- kfree(entries);
+ vfree(entries);
out:
return r;
}
@@ -1988,7 +1575,6 @@ int kvm_dev_ioctl_check_extension(long ext)
#ifdef CONFIG_MMU_NOTIFIER
case KVM_CAP_SYNC_MMU:
#endif
- case KVM_CAP_USER_NMI:
case KVM_CAP_REINJECT_CONTROL:
case KVM_CAP_IRQ_INJECT_STATUS:
case KVM_CAP_ASSIGN_DEV_IRQ:
@@ -2006,12 +1592,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_HYPERV_VAPIC:
case KVM_CAP_HYPERV_SPIN:
case KVM_CAP_PCI_SEGMENT:
- case KVM_CAP_DEBUGREGS:
case KVM_CAP_X86_ROBUST_SINGLESTEP:
- case KVM_CAP_XSAVE:
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,38)
- case KVM_CAP_ASYNC_PF:
-#endif
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -2035,9 +1616,6 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_MCE:
r = KVM_MAX_MCE_BANKS;
break;
- case KVM_CAP_XCRS:
- r = kvm_cpu_has_xsave;
- break;
default:
r = 0;
break;
@@ -2114,51 +1692,22 @@ out:
return r;
}
-static void wbinvd_ipi(void *garbage)
-{
- wbinvd();
-}
-
-static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
-{
- return vcpu->kvm->arch.iommu_domain &&
- !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY);
-}
-
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
- /* Address WBINVD may be executed by guest */
- if (need_emulate_wbinvd(vcpu)) {
- if (kvm_x86_ops->has_wbinvd_exit())
- cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
- else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
- smp_call_function_single(vcpu->cpu,
- wbinvd_ipi, NULL, 1);
- }
-
kvm_x86_ops->vcpu_load(vcpu, cpu);
- if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) {
- /* Make sure TSC doesn't go backwards */
- s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
- kvm_native_read_tsc() - vcpu->arch.last_host_tsc;
- if (tsc_delta < 0)
- mark_tsc_unstable("KVM discovered backwards TSC");
- if (kvm_check_tsc_unstable()) {
- kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta);
- vcpu->arch.tsc_catchup = 1;
- kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
- }
- if (vcpu->cpu != cpu)
- kvm_migrate_timers(vcpu);
- vcpu->cpu = cpu;
+ if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
+ unsigned long khz = cpufreq_quick_get(cpu);
+ if (!khz)
+ khz = tsc_khz;
+ per_cpu(cpu_tsc_khz, cpu) = khz;
}
+ kvm_request_guest_time_update(vcpu);
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
- kvm_x86_ops->vcpu_put(vcpu);
kvm_put_guest_fpu(vcpu);
- vcpu->arch.last_host_tsc = kvm_native_read_tsc();
+ kvm_x86_ops->vcpu_put(vcpu);
}
static int is_efer_nx(void)
@@ -2207,6 +1756,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
if (copy_from_user(cpuid_entries, entries,
cpuid->nent * sizeof(struct kvm_cpuid_entry)))
goto out_free;
+ vcpu_load(vcpu);
for (i = 0; i < cpuid->nent; i++) {
vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
@@ -2224,7 +1774,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
r = 0;
kvm_apic_set_version(vcpu);
kvm_x86_ops->cpuid_update(vcpu);
- update_cpuid(vcpu);
+ vcpu_put(vcpu);
out_free:
vfree(cpuid_entries);
@@ -2245,10 +1795,11 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
goto out;
+ vcpu_load(vcpu);
vcpu->arch.cpuid_nent = cpuid->nent;
kvm_apic_set_version(vcpu);
kvm_x86_ops->cpuid_update(vcpu);
- update_cpuid(vcpu);
+ vcpu_put(vcpu);
return 0;
out:
@@ -2275,11 +1826,6 @@ out:
return r;
}
-static void cpuid_mask(u32 *word, int wordnum)
-{
- *word &= boot_cpu_data.x86_capability[wordnum];
-}
-
static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
u32 index)
{
@@ -2328,20 +1874,19 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
/* cpuid 1.ecx */
const u32 kvm_supported_word4_x86_features =
- F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
+ F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ |
0 /* DS-CPL, VMX, SMX, EST */ |
0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
0 /* Reserved, DCA */ | F(XMM4_1) |
F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
- 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
- F(F16C);
+ 0 /* Reserved, XSAVE, OSXSAVE */;
/* cpuid 0x80000001.ecx */
const u32 kvm_supported_word6_x86_features =
- F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
+ F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
- F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) |
- 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
+ F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) |
+ 0 /* SKINIT */ | 0 /* WDT */;
/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
@@ -2350,13 +1895,11 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
switch (function) {
case 0:
- entry->eax = min(entry->eax, (u32)0xd);
+ entry->eax = min(entry->eax, (u32)0xb);
break;
case 1:
entry->edx &= kvm_supported_word0_x86_features;
- cpuid_mask(&entry->edx, 0);
entry->ecx &= kvm_supported_word4_x86_features;
- cpuid_mask(&entry->ecx, 4);
/* we support x2apic emulation even if host does not support
* it since we emulate x2apic in software */
entry->ecx |= F(X2APIC);
@@ -2410,51 +1953,14 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
}
break;
}
- case 0xd: {
- int i;
-
- entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
- for (i = 1; *nent < maxnent; ++i) {
- if (entry[i - 1].eax == 0 && i != 2)
- break;
- do_cpuid_1_ent(&entry[i], function, i);
- entry[i].flags |=
- KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
- ++*nent;
- }
- break;
- }
- case KVM_CPUID_SIGNATURE: {
- char signature[12] = "KVMKVMKVM\0\0";
- u32 *sigptr = (u32 *)signature;
- entry->eax = 0;
- entry->ebx = sigptr[0];
- entry->ecx = sigptr[1];
- entry->edx = sigptr[2];
- break;
- }
- case KVM_CPUID_FEATURES:
- entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
- (1 << KVM_FEATURE_NOP_IO_DELAY) |
- (1 << KVM_FEATURE_CLOCKSOURCE2) |
- (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
- entry->ebx = 0;
- entry->ecx = 0;
- entry->edx = 0;
- break;
case 0x80000000:
entry->eax = min(entry->eax, 0x8000001a);
break;
case 0x80000001:
entry->edx &= kvm_supported_word1_x86_features;
- cpuid_mask(&entry->edx, 1);
entry->ecx &= kvm_supported_word6_x86_features;
- cpuid_mask(&entry->ecx, 6);
break;
}
-
- kvm_x86_ops->set_supported_cpuid(function, entry);
-
put_cpu();
}
@@ -2490,23 +1996,6 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
do_cpuid_ent(&cpuid_entries[nent], func, 0,
&nent, cpuid->nent);
-
-
-
- r = -E2BIG;
- if (nent >= cpuid->nent)
- goto out_free;
-
- do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent,
- cpuid->nent);
-
- r = -E2BIG;
- if (nent >= cpuid->nent)
- goto out_free;
-
- do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_FEATURES, 0, &nent,
- cpuid->nent);
-
r = -E2BIG;
if (nent >= cpuid->nent)
goto out_free;
@@ -2527,7 +2016,9 @@ out:
static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
+ vcpu_load(vcpu);
memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
+ vcpu_put(vcpu);
return 0;
}
@@ -2535,9 +2026,11 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
+ vcpu_load(vcpu);
memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
kvm_apic_post_state_restore(vcpu);
update_cr8_intercept(vcpu);
+ vcpu_put(vcpu);
return 0;
}
@@ -2549,16 +2042,20 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
return -EINVAL;
if (irqchip_in_kernel(vcpu->kvm))
return -ENXIO;
+ vcpu_load(vcpu);
kvm_queue_interrupt(vcpu, irq->irq, false);
- kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ vcpu_put(vcpu);
return 0;
}
static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
{
+ vcpu_load(vcpu);
kvm_inject_nmi(vcpu);
+ vcpu_put(vcpu);
return 0;
}
@@ -2624,7 +2121,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
printk(KERN_DEBUG "kvm: set_mce: "
"injects mce exception while "
"previous one is in progress!\n");
- kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
return 0;
}
if (banks[1] & MCI_STATUS_VAL)
@@ -2649,43 +2146,38 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
struct kvm_vcpu_events *events)
{
- events->exception.injected =
- vcpu->arch.exception.pending &&
- !kvm_exception_is_soft(vcpu->arch.exception.nr);
+ vcpu_load(vcpu);
+
+ events->exception.injected = vcpu->arch.exception.pending;
events->exception.nr = vcpu->arch.exception.nr;
events->exception.has_error_code = vcpu->arch.exception.has_error_code;
- events->exception.pad = 0;
events->exception.error_code = vcpu->arch.exception.error_code;
- events->interrupt.injected =
- vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft;
+ events->interrupt.injected = vcpu->arch.interrupt.pending;
events->interrupt.nr = vcpu->arch.interrupt.nr;
- events->interrupt.soft = 0;
- events->interrupt.shadow =
- kvm_x86_ops->get_interrupt_shadow(vcpu,
- KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI);
+ events->interrupt.soft = vcpu->arch.interrupt.soft;
events->nmi.injected = vcpu->arch.nmi_injected;
events->nmi.pending = vcpu->arch.nmi_pending;
events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
- events->nmi.pad = 0;
events->sipi_vector = vcpu->arch.sipi_vector;
events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
- | KVM_VCPUEVENT_VALID_SIPI_VECTOR
- | KVM_VCPUEVENT_VALID_SHADOW);
- memset(&events->reserved, 0, sizeof(events->reserved));
+ | KVM_VCPUEVENT_VALID_SIPI_VECTOR);
+
+ vcpu_put(vcpu);
}
static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
struct kvm_vcpu_events *events)
{
if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
- | KVM_VCPUEVENT_VALID_SIPI_VECTOR
- | KVM_VCPUEVENT_VALID_SHADOW))
+ | KVM_VCPUEVENT_VALID_SIPI_VECTOR))
return -EINVAL;
+ vcpu_load(vcpu);
+
vcpu->arch.exception.pending = events->exception.injected;
vcpu->arch.exception.nr = events->exception.nr;
vcpu->arch.exception.has_error_code = events->exception.has_error_code;
@@ -2696,9 +2188,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
vcpu->arch.interrupt.soft = events->interrupt.soft;
if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm))
kvm_pic_clear_isr_ack(vcpu->kvm);
- if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
- kvm_x86_ops->set_interrupt_shadow(vcpu,
- events->interrupt.shadow);
vcpu->arch.nmi_injected = events->nmi.injected;
if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
@@ -2708,134 +2197,34 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
vcpu->arch.sipi_vector = events->sipi_vector;
- kvm_make_request(KVM_REQ_EVENT, vcpu);
-
- return 0;
-}
-
-static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
- struct kvm_debugregs *dbgregs)
-{
- memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
- dbgregs->dr6 = vcpu->arch.dr6;
- dbgregs->dr7 = vcpu->arch.dr7;
- dbgregs->flags = 0;
- memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
-}
-
-static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
- struct kvm_debugregs *dbgregs)
-{
- if (dbgregs->flags)
- return -EINVAL;
-
- memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
- vcpu->arch.dr6 = dbgregs->dr6;
- vcpu->arch.dr7 = dbgregs->dr7;
+ vcpu_put(vcpu);
return 0;
}
-static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
- struct kvm_xsave *guest_xsave)
-{
- if (kvm_cpu_has_xsave)
- memcpy(guest_xsave->region,
- &vcpu->arch.guest_fpu.state->xsave,
- kvm_xstate_size);
- else {
- memcpy(guest_xsave->region,
- &vcpu->arch.guest_fpu.state->fxsave,
- sizeof(struct kvm_i387_fxsave_struct));
- *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
- XSTATE_FPSSE;
- }
-}
-
-static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
- struct kvm_xsave *guest_xsave)
-{
- u64 xstate_bv =
- *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
-
- if (kvm_cpu_has_xsave)
- memcpy(&vcpu->arch.guest_fpu.state->xsave,
- guest_xsave->region, kvm_xstate_size);
- else {
- if (xstate_bv & ~XSTATE_FPSSE)
- return -EINVAL;
- memcpy(&vcpu->arch.guest_fpu.state->fxsave,
- guest_xsave->region, sizeof(struct kvm_i387_fxsave_struct));
- }
- return 0;
-}
-
-static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
- struct kvm_xcrs *guest_xcrs)
-{
- if (!kvm_cpu_has_xsave) {
- guest_xcrs->nr_xcrs = 0;
- return;
- }
-
- guest_xcrs->nr_xcrs = 1;
- guest_xcrs->flags = 0;
- guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
- guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
-}
-
-static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
- struct kvm_xcrs *guest_xcrs)
-{
- int i, r = 0;
-
- if (!kvm_cpu_has_xsave)
- return -EINVAL;
-
- if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
- return -EINVAL;
-
- for (i = 0; i < guest_xcrs->nr_xcrs; i++)
- /* Only support XCR0 currently */
- if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) {
- r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
- guest_xcrs->xcrs[0].value);
- break;
- }
- if (r)
- r = -EINVAL;
- return r;
-}
-
long kvm_arch_vcpu_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
struct kvm_vcpu *vcpu = filp->private_data;
void *argp = (void *)arg;
int r;
- union {
- struct kvm_lapic_state *lapic;
- struct kvm_xsave *xsave;
- struct kvm_xcrs *xcrs;
- void *buffer;
- } u;
+ struct kvm_lapic_state *lapic = NULL;
- u.buffer = NULL;
switch (ioctl) {
case KVM_GET_LAPIC: {
r = -EINVAL;
if (!vcpu->arch.apic)
goto out;
- u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
+ lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
r = -ENOMEM;
- if (!u.lapic)
+ if (!lapic)
goto out;
- r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
+ r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic);
if (r)
goto out;
r = -EFAULT;
- if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
+ if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state)))
goto out;
r = 0;
break;
@@ -2844,14 +2233,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = -EINVAL;
if (!vcpu->arch.apic)
goto out;
- u.lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
+ lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
r = -ENOMEM;
- if (!u.lapic)
+ if (!lapic)
goto out;
r = -EFAULT;
- if (copy_from_user(u.lapic, argp, sizeof(struct kvm_lapic_state)))
+ if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state)))
goto out;
- r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
+ r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic);
if (r)
goto out;
r = 0;
@@ -2991,90 +2380,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
break;
}
- case KVM_GET_DEBUGREGS: {
- struct kvm_debugregs dbgregs;
-
- kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
-
- r = -EFAULT;
- if (copy_to_user(argp, &dbgregs,
- sizeof(struct kvm_debugregs)))
- break;
- r = 0;
- break;
- }
- case KVM_SET_DEBUGREGS: {
- struct kvm_debugregs dbgregs;
-
- r = -EFAULT;
- if (copy_from_user(&dbgregs, argp,
- sizeof(struct kvm_debugregs)))
- break;
-
- r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
- break;
- }
- case KVM_GET_XSAVE: {
- u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
- r = -ENOMEM;
- if (!u.xsave)
- break;
-
- kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
-
- r = -EFAULT;
- if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
- break;
- r = 0;
- break;
- }
- case KVM_SET_XSAVE: {
- u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
- r = -ENOMEM;
- if (!u.xsave)
- break;
-
- r = -EFAULT;
- if (copy_from_user(u.xsave, argp, sizeof(struct kvm_xsave)))
- break;
-
- r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
- break;
- }
- case KVM_GET_XCRS: {
- u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
- r = -ENOMEM;
- if (!u.xcrs)
- break;
-
- kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
-
- r = -EFAULT;
- if (copy_to_user(argp, u.xcrs,
- sizeof(struct kvm_xcrs)))
- break;
- r = 0;
- break;
- }
- case KVM_SET_XCRS: {
- u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
- r = -ENOMEM;
- if (!u.xcrs)
- break;
-
- r = -EFAULT;
- if (copy_from_user(u.xcrs, argp,
- sizeof(struct kvm_xcrs)))
- break;
-
- r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
- break;
- }
default:
r = -EINVAL;
}
out:
- kfree(u.buffer);
+ kfree(lapic);
return r;
}
@@ -3114,7 +2424,116 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
{
- return kvm->arch.n_max_mmu_pages;
+ return kvm->arch.n_alloc_mmu_pages;
+}
+
+gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn)
+{
+ int i;
+ struct kvm_mem_alias *alias;
+ struct kvm_mem_aliases *aliases;
+
+ aliases = rcu_dereference(kvm->arch.aliases);
+
+ for (i = 0; i < aliases->naliases; ++i) {
+ alias = &aliases->aliases[i];
+ if (alias->flags & KVM_ALIAS_INVALID)
+ continue;
+ if (gfn >= alias->base_gfn
+ && gfn < alias->base_gfn + alias->npages)
+ return alias->target_gfn + gfn - alias->base_gfn;
+ }
+ return gfn;
+}
+
+gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
+{
+ int i;
+ struct kvm_mem_alias *alias;
+ struct kvm_mem_aliases *aliases;
+
+ aliases = rcu_dereference(kvm->arch.aliases);
+
+ for (i = 0; i < aliases->naliases; ++i) {
+ alias = &aliases->aliases[i];
+ if (gfn >= alias->base_gfn
+ && gfn < alias->base_gfn + alias->npages)
+ return alias->target_gfn + gfn - alias->base_gfn;
+ }
+ return gfn;
+}
+
+/*
+ * Set a new alias region. Aliases map a portion of physical memory into
+ * another portion. This is useful for memory windows, for example the PC
+ * VGA region.
+ */
+static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
+ struct kvm_memory_alias *alias)
+{
+ int r, n;
+ struct kvm_mem_alias *p;
+ struct kvm_mem_aliases *aliases, *old_aliases;
+
+ r = -EINVAL;
+ /* General sanity checks */
+ if (alias->memory_size & (PAGE_SIZE - 1))
+ goto out;
+ if (alias->guest_phys_addr & (PAGE_SIZE - 1))
+ goto out;
+ if (alias->slot >= KVM_ALIAS_SLOTS)
+ goto out;
+ if (alias->guest_phys_addr + alias->memory_size
+ < alias->guest_phys_addr)
+ goto out;
+ if (alias->target_phys_addr + alias->memory_size
+ < alias->target_phys_addr)
+ goto out;
+
+ r = -ENOMEM;
+ aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
+ if (!aliases)
+ goto out;
+
+ mutex_lock(&kvm->slots_lock);
+
+ /* invalidate any gfn reference in case of deletion/shrinking */
+ memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
+ aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID;
+ old_aliases = kvm->arch.aliases;
+ rcu_assign_pointer(kvm->arch.aliases, aliases);
+ kvm_synchronize_srcu_expedited(&kvm->srcu);
+ kvm_mmu_zap_all(kvm);
+ kfree(old_aliases);
+
+ r = -ENOMEM;
+ aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
+ if (!aliases)
+ goto out_unlock;
+
+ memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
+
+ p = &aliases->aliases[alias->slot];
+ p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
+ p->npages = alias->memory_size >> PAGE_SHIFT;
+ p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
+ p->flags &= ~(KVM_ALIAS_INVALID);
+
+ for (n = KVM_ALIAS_SLOTS; n > 0; --n)
+ if (aliases->aliases[n - 1].npages)
+ break;
+ aliases->naliases = n;
+
+ old_aliases = kvm->arch.aliases;
+ rcu_assign_pointer(kvm->arch.aliases, aliases);
+ kvm_synchronize_srcu_expedited(&kvm->srcu);
+ kfree(old_aliases);
+ r = 0;
+
+out_unlock:
+ mutex_unlock(&kvm->slots_lock);
+out:
+ return r;
}
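Both halves of this ioctl use the same copy/publish/reclaim idiom; a pared-down analogue using plain RCU in place of SRCU (struct cfg and update_cfg() are invented for illustration; this is kernel-style code, not a standalone program):

#include <linux/slab.h>
#include <linux/rcupdate.h>

struct cfg { int value; };

static struct cfg *active;		/* published pointer */

static void update_cfg(int new_value)
{
	struct cfg *n = kzalloc(sizeof(*n), GFP_KERNEL);
	struct cfg *old;

	if (!n)
		return;
	n->value = new_value;
	old = active;
	rcu_assign_pointer(active, n);	/* publish the new copy */
	synchronize_rcu();		/* wait out existing readers */
	kfree(old);			/* now safe to reclaim */
}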
static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
@@ -3150,18 +2569,18 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
r = 0;
switch (chip->chip_id) {
case KVM_IRQCHIP_PIC_MASTER:
- spin_lock(&pic_irqchip(kvm)->lock);
+ raw_spin_lock(&pic_irqchip(kvm)->lock);
memcpy(&pic_irqchip(kvm)->pics[0],
&chip->chip.pic,
sizeof(struct kvm_pic_state));
- spin_unlock(&pic_irqchip(kvm)->lock);
+ raw_spin_unlock(&pic_irqchip(kvm)->lock);
break;
case KVM_IRQCHIP_PIC_SLAVE:
- spin_lock(&pic_irqchip(kvm)->lock);
+ raw_spin_lock(&pic_irqchip(kvm)->lock);
memcpy(&pic_irqchip(kvm)->pics[1],
&chip->chip.pic,
sizeof(struct kvm_pic_state));
- spin_unlock(&pic_irqchip(kvm)->lock);
+ raw_spin_unlock(&pic_irqchip(kvm)->lock);
break;
case KVM_IRQCHIP_IOAPIC:
r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
@@ -3204,7 +2623,6 @@ static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
sizeof(ps->channels));
ps->flags = kvm->arch.vpit->pit_state.flags;
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
- memset(&ps->reserved, 0, sizeof(ps->reserved));
return r;
}
@@ -3246,6 +2664,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
struct kvm_memory_slot *memslot;
unsigned long n;
unsigned long is_dirty = 0;
+ unsigned long *dirty_bitmap = NULL;
mutex_lock(&kvm->slots_lock);
@@ -3260,47 +2679,42 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
n = kvm_dirty_bitmap_bytes(memslot);
+ r = -ENOMEM;
+ dirty_bitmap = vmalloc(n);
+ if (!dirty_bitmap)
+ goto out;
+ memset(dirty_bitmap, 0, n);
+
for (i = 0; !is_dirty && i < n/sizeof(long); i++)
is_dirty = memslot->dirty_bitmap[i];
/* If nothing is dirty, don't bother messing with page tables. */
if (is_dirty) {
struct kvm_memslots *slots, *old_slots;
- unsigned long *dirty_bitmap;
- dirty_bitmap = memslot->dirty_bitmap_head;
- if (memslot->dirty_bitmap == dirty_bitmap)
- dirty_bitmap += n / sizeof(long);
- memset(dirty_bitmap, 0, n);
+ spin_lock(&kvm->mmu_lock);
+ kvm_mmu_slot_remove_write_access(kvm, log->slot);
+ spin_unlock(&kvm->mmu_lock);
- r = -ENOMEM;
slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
if (!slots)
- goto out;
+ goto out_free;
+
memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
- slots->generation++;
old_slots = kvm->memslots;
rcu_assign_pointer(kvm->memslots, slots);
kvm_synchronize_srcu_expedited(&kvm->srcu);
dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap;
kfree(old_slots);
-
- spin_lock(&kvm->mmu_lock);
- kvm_mmu_slot_remove_write_access(kvm, log->slot);
- spin_unlock(&kvm->mmu_lock);
-
- r = -EFAULT;
- if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
- goto out;
- } else {
- r = -EFAULT;
- if (clear_user(log->dirty_bitmap, n))
- goto out;
}
r = 0;
+ if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
+ r = -EFAULT;
+out_free:
+ vfree(dirty_bitmap);
out:
mutex_unlock(&kvm->slots_lock);
return r;
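
The rewritten dirty-log path above is an instance of the kernel's RCU publish idiom: build a private copy of the structure, install it with rcu_assign_pointer(), wait out all SRCU readers, and only then reclaim the old copy (the kvm_synchronize_srcu_expedited() used here appears to be a compat wrapper around the upstream synchronize_srcu_expedited()). A minimal sketch of the same pattern under illustrative names — struct snapshot and publish_snapshot() are not from this patch:

#include <linux/slab.h>
#include <linux/srcu.h>

struct snapshot {
	unsigned long *dirty_bitmap;
};

static struct snapshot *cur_snap;

static int publish_snapshot(struct srcu_struct *srcu, unsigned long *fresh)
{
	struct snapshot *next, *old;

	next = kzalloc(sizeof(*next), GFP_KERNEL);
	if (!next)
		return -ENOMEM;
	next->dirty_bitmap = fresh;

	old = cur_snap;
	rcu_assign_pointer(cur_snap, next);	/* readers now see 'next' */
	synchronize_srcu_expedited(srcu);	/* drain readers of 'old' */
	kfree(old);				/* safe: nobody holds it */
	return 0;
}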
@@ -3320,6 +2734,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
union {
struct kvm_pit_state ps;
struct kvm_pit_state2 ps2;
+ struct kvm_memory_alias alias;
struct kvm_pit_config pit_config;
} u;
@@ -3340,6 +2755,22 @@ long kvm_arch_vm_ioctl(struct file *filp,
goto out;
break;
}
+ case KVM_SET_MEMORY_REGION: {
+ struct kvm_memory_region kvm_mem;
+ struct kvm_userspace_memory_region kvm_userspace_mem;
+
+ r = -EFAULT;
+ if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
+ goto out;
+ kvm_userspace_mem.slot = kvm_mem.slot;
+ kvm_userspace_mem.flags = kvm_mem.flags;
+ kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
+ kvm_userspace_mem.memory_size = kvm_mem.memory_size;
+ r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
+ if (r)
+ goto out;
+ break;
+ }
case KVM_SET_NR_MMU_PAGES:
r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
if (r)
@@ -3348,6 +2779,14 @@ long kvm_arch_vm_ioctl(struct file *filp,
case KVM_GET_NR_MMU_PAGES:
r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
break;
+ case KVM_SET_MEMORY_ALIAS:
+ r = -EFAULT;
+ if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
+ goto out;
+ r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
+ if (r)
+ goto out;
+ break;
case KVM_CREATE_IRQCHIP: {
struct kvm_pic *vpic;
@@ -3360,10 +2799,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
if (vpic) {
r = kvm_ioapic_init(kvm);
if (r) {
- mutex_lock(&kvm->slots_lock);
kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
&vpic->dev);
- mutex_unlock(&kvm->slots_lock);
kfree(vpic);
goto create_irqchip_unlock;
}
@@ -3374,12 +2811,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
smp_wmb();
r = kvm_setup_default_irq_routing(kvm);
if (r) {
- mutex_lock(&kvm->slots_lock);
mutex_lock(&kvm->irq_lock);
kvm_ioapic_destroy(kvm);
kvm_destroy_pic(kvm);
mutex_unlock(&kvm->irq_lock);
- mutex_unlock(&kvm->slots_lock);
}
create_irqchip_unlock:
mutex_unlock(&kvm->lock);
@@ -3412,13 +2847,11 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = -EFAULT;
if (copy_from_user(&irq_event, argp, sizeof irq_event))
goto out;
- r = -ENXIO;
if (irqchip_in_kernel(kvm)) {
__s32 status;
status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
irq_event.irq, irq_event.level);
if (ioctl == KVM_IRQ_LINE_STATUS) {
- r = -EFAULT;
irq_event.status = status;
if (copy_to_user(argp, &irq_event,
sizeof irq_event))
@@ -3555,6 +2988,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
break;
}
case KVM_SET_CLOCK: {
+ struct timespec now;
struct kvm_clock_data user_ns;
u64 now_ns;
s64 delta;
@@ -3568,23 +3002,21 @@ long kvm_arch_vm_ioctl(struct file *filp,
goto out;
r = 0;
- local_irq_disable();
- now_ns = get_kernel_ns();
+ ktime_get_ts(&now);
+ now_ns = timespec_to_ns(&now);
delta = user_ns.clock - now_ns;
- local_irq_enable();
kvm->arch.kvmclock_offset = delta;
break;
}
case KVM_GET_CLOCK: {
+ struct timespec now;
struct kvm_clock_data user_ns;
u64 now_ns;
- local_irq_disable();
- now_ns = get_kernel_ns();
+ ktime_get_ts(&now);
+ now_ns = timespec_to_ns(&now);
user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
- local_irq_enable();
user_ns.flags = 0;
- memset(&user_ns.pad, 0, sizeof(user_ns.pad));
r = -EFAULT;
if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
@@ -3635,86 +3067,52 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
}
-static void kvm_set_segment(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg)
-{
- kvm_x86_ops->set_segment(vcpu, var, seg);
-}
-
-void kvm_get_segment(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg)
-{
- kvm_x86_ops->get_segment(vcpu, var, seg);
-}
-
-static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
-{
- return gpa;
-}
-
-static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
-{
- gpa_t t_gpa;
- struct x86_exception exception;
-
- BUG_ON(!mmu_is_nested(vcpu));
-
- /* NPT walks are always user-walks */
- access |= PFERR_USER_MASK;
- t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &exception);
-
- return t_gpa;
-}
-
-gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
- struct x86_exception *exception)
+gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
- return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
+ return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
}
- gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
- struct x86_exception *exception)
+ gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
access |= PFERR_FETCH_MASK;
- return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
+ return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
}
-gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
- struct x86_exception *exception)
+gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
access |= PFERR_WRITE_MASK;
- return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
+ return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
}
 /* used to access any guest's mapped memory without checking CPL */
-gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
- struct x86_exception *exception)
+gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
{
- return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception);
+ return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error);
}
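
For reference, the four wrappers above differ only in the page-fault error-code bits they hand to gva_to_gpa(); condensed:

/*
 * accessor                    access mask passed to gva_to_gpa()
 * kvm_mmu_gva_to_gpa_read     PFERR_USER_MASK if CPL == 3, else 0
 * kvm_mmu_gva_to_gpa_fetch    (same)          | PFERR_FETCH_MASK
 * kvm_mmu_gva_to_gpa_write    (same)          | PFERR_WRITE_MASK
 * kvm_mmu_gva_to_gpa_system   0, so the CPL check is deliberately skipped
 */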
static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
struct kvm_vcpu *vcpu, u32 access,
- struct x86_exception *exception)
+ u32 *error)
{
void *data = val;
int r = X86EMUL_CONTINUE;
while (bytes) {
- gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
- exception);
+ gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error);
unsigned offset = addr & (PAGE_SIZE-1);
unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
int ret;
- if (gpa == UNMAPPED_GVA)
- return X86EMUL_PROPAGATE_FAULT;
+ if (gpa == UNMAPPED_GVA) {
+ r = X86EMUL_PROPAGATE_FAULT;
+ goto out;
+ }
ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
if (ret < 0) {
- r = X86EMUL_IO_NEEDED;
+ r = X86EMUL_UNHANDLEABLE;
goto out;
}
@@ -3728,52 +3126,46 @@ out:
/* used for instruction fetching */
static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes,
- struct kvm_vcpu *vcpu,
- struct x86_exception *exception)
+ struct kvm_vcpu *vcpu, u32 *error)
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
- access | PFERR_FETCH_MASK,
- exception);
+ access | PFERR_FETCH_MASK, error);
}
static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
- struct kvm_vcpu *vcpu,
- struct x86_exception *exception)
+ struct kvm_vcpu *vcpu, u32 *error)
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
- exception);
+ error);
}
static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
- struct kvm_vcpu *vcpu,
- struct x86_exception *exception)
+ struct kvm_vcpu *vcpu, u32 *error)
{
- return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
+ return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error);
}
-static int kvm_write_guest_virt_system(gva_t addr, void *val,
- unsigned int bytes,
- struct kvm_vcpu *vcpu,
- struct x86_exception *exception)
+static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
+ struct kvm_vcpu *vcpu, u32 *error)
{
void *data = val;
int r = X86EMUL_CONTINUE;
while (bytes) {
- gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
- PFERR_WRITE_MASK,
- exception);
+ gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error);
unsigned offset = addr & (PAGE_SIZE-1);
unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
int ret;
- if (gpa == UNMAPPED_GVA)
- return X86EMUL_PROPAGATE_FAULT;
+ if (gpa == UNMAPPED_GVA) {
+ r = X86EMUL_PROPAGATE_FAULT;
+ goto out;
+ }
ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
if (ret < 0) {
- r = X86EMUL_IO_NEEDED;
+ r = X86EMUL_UNHANDLEABLE;
goto out;
}
@@ -3785,13 +3177,14 @@ out:
return r;
}
+
static int emulator_read_emulated(unsigned long addr,
void *val,
unsigned int bytes,
- struct x86_exception *exception,
struct kvm_vcpu *vcpu)
{
gpa_t gpa;
+ u32 error_code;
if (vcpu->mmio_read_completed) {
memcpy(val, vcpu->mmio_data, bytes);
@@ -3801,17 +3194,19 @@ static int emulator_read_emulated(unsigned long addr,
return X86EMUL_CONTINUE;
}
- gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, exception);
+ gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code);
- if (gpa == UNMAPPED_GVA)
+ if (gpa == UNMAPPED_GVA) {
+ kvm_inject_page_fault(vcpu, addr, error_code);
return X86EMUL_PROPAGATE_FAULT;
+ }
/* For APIC access vmexit */
if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
goto mmio;
- if (kvm_read_guest_virt(addr, val, bytes, vcpu, exception)
- == X86EMUL_CONTINUE)
+ if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL)
+ == X86EMUL_CONTINUE)
return X86EMUL_CONTINUE;
mmio:
@@ -3826,16 +3221,15 @@ mmio:
trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
vcpu->mmio_needed = 1;
- vcpu->run->exit_reason = KVM_EXIT_MMIO;
- vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
- vcpu->run->mmio.len = vcpu->mmio_size = bytes;
- vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0;
+ vcpu->mmio_phys_addr = gpa;
+ vcpu->mmio_size = bytes;
+ vcpu->mmio_is_write = 0;
- return X86EMUL_IO_NEEDED;
+ return X86EMUL_UNHANDLEABLE;
}
int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
- const void *val, int bytes)
+ const void *val, int bytes)
{
int ret;
@@ -3849,15 +3243,17 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
static int emulator_write_emulated_onepage(unsigned long addr,
const void *val,
unsigned int bytes,
- struct x86_exception *exception,
struct kvm_vcpu *vcpu)
{
gpa_t gpa;
+ u32 error_code;
- gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code);
- if (gpa == UNMAPPED_GVA)
+ if (gpa == UNMAPPED_GVA) {
+ kvm_inject_page_fault(vcpu, addr, error_code);
return X86EMUL_PROPAGATE_FAULT;
+ }
/* For APIC access vmexit */
if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -3875,185 +3271,72 @@ mmio:
return X86EMUL_CONTINUE;
vcpu->mmio_needed = 1;
- vcpu->run->exit_reason = KVM_EXIT_MMIO;
- vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
- vcpu->run->mmio.len = vcpu->mmio_size = bytes;
- vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
- memcpy(vcpu->run->mmio.data, val, bytes);
+ vcpu->mmio_phys_addr = gpa;
+ vcpu->mmio_size = bytes;
+ vcpu->mmio_is_write = 1;
+ memcpy(vcpu->mmio_data, val, bytes);
return X86EMUL_CONTINUE;
}
int emulator_write_emulated(unsigned long addr,
- const void *val,
- unsigned int bytes,
- struct x86_exception *exception,
- struct kvm_vcpu *vcpu)
+ const void *val,
+ unsigned int bytes,
+ struct kvm_vcpu *vcpu)
{
/* Crossing a page boundary? */
if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
int rc, now;
now = -addr & ~PAGE_MASK;
- rc = emulator_write_emulated_onepage(addr, val, now, exception,
- vcpu);
+ rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
if (rc != X86EMUL_CONTINUE)
return rc;
addr += now;
val += now;
bytes -= now;
}
- return emulator_write_emulated_onepage(addr, val, bytes, exception,
- vcpu);
+ return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
}
-
-#define CMPXCHG_TYPE(t, ptr, old, new) \
- (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
-
-#ifdef CONFIG_X86_64
-# define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
-#else
-# define CMPXCHG64(ptr, old, new) \
- (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
-#endif
+EXPORT_SYMBOL_GPL(emulator_write_emulated);
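
The page-boundary split in emulator_write_emulated() leans on the identity that -addr & ~PAGE_MASK is the distance from addr to the next page boundary (note ~PAGE_MASK == PAGE_SIZE - 1). A worked example, assuming 4 KiB pages:

/*
 * addr = 0x1ff8, bytes = 16 (the write straddles 0x2000):
 *	now = -0x1ff8 & 0xfff = 0x8	-> first write: 8 bytes at 0x1ff8
 *	addr = 0x2000, bytes = 8	-> second write stays in one page
 */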
static int emulator_cmpxchg_emulated(unsigned long addr,
const void *old,
const void *new,
unsigned int bytes,
- struct x86_exception *exception,
struct kvm_vcpu *vcpu)
{
- gpa_t gpa;
- struct page *page;
- char *kaddr;
- bool exchanged;
-
- /* a guest's cmpxchg8b has to be emulated atomically */
- if (bytes > 8 || (bytes & (bytes - 1)))
- goto emul_write;
-
- gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
-
- if (gpa == UNMAPPED_GVA ||
- (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
- goto emul_write;
-
- if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
- goto emul_write;
-
- page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
- if (is_error_page(page)) {
- kvm_release_page_clean(page);
- goto emul_write;
- }
-
- kaddr = kmap_atomic(page, KM_USER0);
- kaddr += offset_in_page(gpa);
- switch (bytes) {
- case 1:
- exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
- break;
- case 2:
- exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
- break;
- case 4:
- exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
- break;
- case 8:
- exchanged = CMPXCHG64(kaddr, old, new);
- break;
- default:
- BUG();
- }
- kunmap_atomic(kaddr, KM_USER0);
- kvm_release_page_dirty(page);
-
- if (!exchanged)
- return X86EMUL_CMPXCHG_FAILED;
-
- kvm_mmu_pte_write(vcpu, gpa, new, bytes, 1);
-
- return X86EMUL_CONTINUE;
-
-emul_write:
printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
+#ifndef CONFIG_X86_64
+ /* a guest's cmpxchg8b has to be emulated atomically */
+ if (bytes == 8) {
+ gpa_t gpa;
+ struct page *page;
+ char *kaddr;
+ u64 val;
- return emulator_write_emulated(addr, new, bytes, exception, vcpu);
-}
-
-static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
-{
- /* TODO: String I/O for an in-kernel device */
- int r;
-
- if (vcpu->arch.pio.in)
- r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
- vcpu->arch.pio.size, pd);
- else
- r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
- vcpu->arch.pio.port, vcpu->arch.pio.size,
- pd);
- return r;
-}
-
-
-static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
- unsigned int count, struct kvm_vcpu *vcpu)
-{
- if (vcpu->arch.pio.count)
- goto data_avail;
-
- trace_kvm_pio(0, port, size, count);
-
- vcpu->arch.pio.port = port;
- vcpu->arch.pio.in = 1;
- vcpu->arch.pio.count = count;
- vcpu->arch.pio.size = size;
-
- if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
- data_avail:
- memcpy(val, vcpu->arch.pio_data, size * count);
- vcpu->arch.pio.count = 0;
- return 1;
- }
-
- vcpu->run->exit_reason = KVM_EXIT_IO;
- vcpu->run->io.direction = KVM_EXIT_IO_IN;
- vcpu->run->io.size = size;
- vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
- vcpu->run->io.count = count;
- vcpu->run->io.port = port;
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
- return 0;
-}
+ if (gpa == UNMAPPED_GVA ||
+ (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+ goto emul_write;
-static int emulator_pio_out_emulated(int size, unsigned short port,
- const void *val, unsigned int count,
- struct kvm_vcpu *vcpu)
-{
- trace_kvm_pio(1, port, size, count);
+ if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
+ goto emul_write;
- vcpu->arch.pio.port = port;
- vcpu->arch.pio.in = 0;
- vcpu->arch.pio.count = count;
- vcpu->arch.pio.size = size;
+ val = *(u64 *)new;
- memcpy(vcpu->arch.pio_data, val, size * count);
+ page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
- if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
- vcpu->arch.pio.count = 0;
- return 1;
+ kaddr = kmap_atomic(page, KM_USER0);
+ set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
+ kunmap_atomic(kaddr, KM_USER0);
+ kvm_release_page_dirty(page);
}
+emul_write:
+#endif
- vcpu->run->exit_reason = KVM_EXIT_IO;
- vcpu->run->io.direction = KVM_EXIT_IO_OUT;
- vcpu->run->io.size = size;
- vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
- vcpu->run->io.count = count;
- vcpu->run->io.port = port;
-
- return 0;
+ return emulator_write_emulated(addr, new, bytes, vcpu);
}
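
The printk_once() above marks the shortcut this function now takes: except for the 32-bit cmpxchg8b case, the exchange is emulated as an unconditional write, so a concurrent update from another vCPU can be silently overwritten. A sketch of the semantics a full emulation would preserve — cmpxchg8b_would_succeed() is illustrative, not from this patch:

#include <linux/types.h>
#include <asm/cmpxchg.h>

static bool cmpxchg8b_would_succeed(u64 *p, u64 old, u64 new)
{
	/* atomic on the host: the store happens only if *p still equals old */
	return cmpxchg64(p, old, new) == old;
}
/* the write fallback above instead performs *p = new regardless of *p */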
static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
@@ -4067,25 +3350,6 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
return X86EMUL_CONTINUE;
}
-int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
-{
- if (!need_emulate_wbinvd(vcpu))
- return X86EMUL_CONTINUE;
-
- if (kvm_x86_ops->has_wbinvd_exit()) {
- int cpu = get_cpu();
-
- cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
- smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
- wbinvd_ipi, NULL, 1);
- put_cpu();
- cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
- } else
- wbinvd();
- return X86EMUL_CONTINUE;
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
-
int emulate_clts(struct kvm_vcpu *vcpu)
{
kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
@@ -4093,194 +3357,42 @@ int emulate_clts(struct kvm_vcpu *vcpu)
return X86EMUL_CONTINUE;
}
-int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu)
-{
- return _kvm_get_dr(vcpu, dr, dest);
-}
-
-int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu)
-{
-
- return __kvm_set_dr(vcpu, dr, value);
-}
-
-static u64 mk_cr_64(u64 curr_cr, u32 new_val)
-{
- return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
-}
-
-static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
-{
- unsigned long value;
-
- switch (cr) {
- case 0:
- value = kvm_read_cr0(vcpu);
- break;
- case 2:
- value = vcpu->arch.cr2;
- break;
- case 3:
- value = kvm_read_cr3(vcpu);
- break;
- case 4:
- value = kvm_read_cr4(vcpu);
- break;
- case 8:
- value = kvm_get_cr8(vcpu);
- break;
- default:
- vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
- return 0;
- }
-
- return value;
-}
-
-static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
-{
- int res = 0;
-
- switch (cr) {
- case 0:
- res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
- break;
- case 2:
- vcpu->arch.cr2 = val;
- break;
- case 3:
- res = kvm_set_cr3(vcpu, val);
- break;
- case 4:
- res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
- break;
- case 8:
- res = kvm_set_cr8(vcpu, val);
- break;
- default:
- vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
- res = -1;
- }
-
- return res;
-}
-
-static int emulator_get_cpl(struct kvm_vcpu *vcpu)
-{
- return kvm_x86_ops->get_cpl(vcpu);
-}
-
-static void emulator_get_gdt(struct kvm_desc_ptr *dt, struct kvm_vcpu *vcpu)
-{
- kvm_x86_ops->get_gdt(vcpu, dt);
-}
-
-static void emulator_get_idt(struct kvm_desc_ptr *dt, struct kvm_vcpu *vcpu)
-{
- kvm_x86_ops->get_idt(vcpu, dt);
-}
-
-static unsigned long emulator_get_cached_segment_base(int seg,
- struct kvm_vcpu *vcpu)
+int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
{
- return get_segment_base(vcpu, seg);
+ return kvm_x86_ops->get_dr(ctxt->vcpu, dr, dest);
}
-static bool emulator_get_cached_descriptor(struct kvm_desc_struct *desc, int seg,
- struct kvm_vcpu *vcpu)
+int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
{
- struct kvm_segment var;
-
- kvm_get_segment(vcpu, &var, seg);
-
- if (var.unusable)
- return false;
+ unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
- if (var.g)
- var.limit >>= 12;
- kvm_set_desc_limit(desc, var.limit);
- kvm_set_desc_base(desc, (unsigned long)var.base);
- desc->type = var.type;
- desc->s = var.s;
- desc->dpl = var.dpl;
- desc->p = var.present;
- desc->avl = var.avl;
- desc->l = var.l;
- desc->d = var.db;
- desc->g = var.g;
-
- return true;
+ return kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask);
}
-static void emulator_set_cached_descriptor(struct kvm_desc_struct *desc, int seg,
- struct kvm_vcpu *vcpu)
+void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
{
- struct kvm_segment var;
-
- /* needed to preserve selector */
- kvm_get_segment(vcpu, &var, seg);
-
- var.base = kvm_get_desc_base(desc);
- var.limit = kvm_get_desc_limit(desc);
- if (desc->g)
- var.limit = (var.limit << 12) | 0xfff;
- var.type = desc->type;
- var.present = desc->p;
- var.dpl = desc->dpl;
- var.db = desc->d;
- var.s = desc->s;
- var.l = desc->l;
- var.g = desc->g;
- var.avl = desc->avl;
- var.present = desc->p;
- var.unusable = !var.present;
- var.padding = 0;
-
- kvm_set_segment(vcpu, &var, seg);
- return;
-}
+ u8 opcodes[4];
+ unsigned long rip = kvm_rip_read(vcpu);
+ unsigned long rip_linear;
-static u16 emulator_get_segment_selector(int seg, struct kvm_vcpu *vcpu)
-{
- struct kvm_segment kvm_seg;
+ if (!printk_ratelimit())
+ return;
- kvm_get_segment(vcpu, &kvm_seg, seg);
- return kvm_seg.selector;
-}
+ rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
-static void emulator_set_segment_selector(u16 sel, int seg,
- struct kvm_vcpu *vcpu)
-{
- struct kvm_segment kvm_seg;
+ kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL);
- kvm_get_segment(vcpu, &kvm_seg, seg);
- kvm_seg.selector = sel;
- kvm_set_segment(vcpu, &kvm_seg, seg);
+ printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
+ context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
}
+EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
static struct x86_emulate_ops emulate_ops = {
.read_std = kvm_read_guest_virt_system,
- .write_std = kvm_write_guest_virt_system,
.fetch = kvm_fetch_guest_virt,
.read_emulated = emulator_read_emulated,
.write_emulated = emulator_write_emulated,
.cmpxchg_emulated = emulator_cmpxchg_emulated,
- .pio_in_emulated = emulator_pio_in_emulated,
- .pio_out_emulated = emulator_pio_out_emulated,
- .get_cached_descriptor = emulator_get_cached_descriptor,
- .set_cached_descriptor = emulator_set_cached_descriptor,
- .get_segment_selector = emulator_get_segment_selector,
- .set_segment_selector = emulator_set_segment_selector,
- .get_cached_segment_base = emulator_get_cached_segment_base,
- .get_gdt = emulator_get_gdt,
- .get_idt = emulator_get_idt,
- .get_cr = emulator_get_cr,
- .set_cr = emulator_set_cr,
- .cpl = emulator_get_cpl,
- .get_dr = emulator_get_dr,
- .set_dr = emulator_set_dr,
- .set_msr = kvm_set_msr,
- .get_msr = kvm_get_msr,
};
static void cache_all_regs(struct kvm_vcpu *vcpu)
@@ -4291,134 +3403,14 @@ static void cache_all_regs(struct kvm_vcpu *vcpu)
vcpu->arch.regs_dirty = ~0;
}
-static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
+int emulate_instruction(struct kvm_vcpu *vcpu,
+ unsigned long cr2,
+ u16 error_code,
+ int emulation_type)
{
- u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask);
- /*
- * an sti; sti; sequence only disables interrupts for the first
- * instruction. So, if the last instruction, be it emulated or
- * not, left the system with the INT_STI flag enabled, it
- * means that the last instruction was an sti. We should not
- * leave the flag on in this case. The same goes for mov ss.
- */
- if (!(int_shadow & mask))
- kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
-}
-
-static void inject_emulated_exception(struct kvm_vcpu *vcpu)
-{
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
- if (ctxt->exception.vector == PF_VECTOR)
- kvm_propagate_fault(vcpu, &ctxt->exception);
- else if (ctxt->exception.error_code_valid)
- kvm_queue_exception_e(vcpu, ctxt->exception.vector,
- ctxt->exception.error_code);
- else
- kvm_queue_exception(vcpu, ctxt->exception.vector);
-}
-
-static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
-{
- struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
- int cs_db, cs_l;
-
- cache_all_regs(vcpu);
-
- kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
-
- vcpu->arch.emulate_ctxt.vcpu = vcpu;
- vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
- vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
- vcpu->arch.emulate_ctxt.mode =
- (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
- (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
- ? X86EMUL_MODE_VM86 : cs_l
- ? X86EMUL_MODE_PROT64 : cs_db
- ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
- memset(c, 0, sizeof(struct decode_cache));
- memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
-}
-
-int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq)
-{
- struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
- int ret;
-
- init_emulate_ctxt(vcpu);
-
- vcpu->arch.emulate_ctxt.decode.op_bytes = 2;
- vcpu->arch.emulate_ctxt.decode.ad_bytes = 2;
- vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip;
- ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq);
-
- if (ret != X86EMUL_CONTINUE)
- return EMULATE_FAIL;
-
- vcpu->arch.emulate_ctxt.eip = c->eip;
- memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
- kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
- kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
-
- if (irq == NMI_VECTOR)
- vcpu->arch.nmi_pending = false;
- else
- vcpu->arch.interrupt.pending = false;
-
- return EMULATE_DONE;
-}
-EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
-
-static int handle_emulation_failure(struct kvm_vcpu *vcpu)
-{
- int r = EMULATE_DONE;
-
- ++vcpu->stat.insn_emulation_fail;
- trace_kvm_emulate_insn_failed(vcpu);
- if (!is_guest_mode(vcpu)) {
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
- r = EMULATE_FAIL;
- }
- kvm_queue_exception(vcpu, UD_VECTOR);
-
- return r;
-}
-
-static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
-{
- gpa_t gpa;
-
- if (tdp_enabled)
- return false;
-
- /*
- * if emulation failed due to an access to a shadowed page
- * table, try to unshadow the page and re-enter the
- * guest to let the CPU execute the instruction.
- */
- if (kvm_mmu_unprotect_page_virt(vcpu, gva))
- return true;
-
- gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
-
- if (gpa == UNMAPPED_GVA)
- return true; /* let cpu generate fault */
-
- if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT)))
- return true;
-
- return false;
-}
-
-int x86_emulate_instruction(struct kvm_vcpu *vcpu,
- unsigned long cr2,
- int emulation_type,
- void *insn,
- int insn_len)
-{
- int r;
- struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
+ int r, shadow_mask;
+ struct decode_cache *c;
+ struct kvm_run *run = vcpu->run;
kvm_clear_exception_queue(vcpu);
vcpu->arch.mmio_fault_cr2 = cr2;
@@ -4430,20 +3422,27 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
*/
cache_all_regs(vcpu);
+ vcpu->mmio_is_write = 0;
+ vcpu->arch.pio.string = 0;
+
if (!(emulation_type & EMULTYPE_NO_DECODE)) {
- init_emulate_ctxt(vcpu);
- vcpu->arch.emulate_ctxt.interruptibility = 0;
- vcpu->arch.emulate_ctxt.have_exception = false;
- vcpu->arch.emulate_ctxt.perm_ok = false;
+ int cs_db, cs_l;
+ kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
- r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len);
- if (r == X86EMUL_PROPAGATE_FAULT)
- goto done;
+ vcpu->arch.emulate_ctxt.vcpu = vcpu;
+ vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
+ vcpu->arch.emulate_ctxt.mode =
+ (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
+ (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
+ ? X86EMUL_MODE_VM86 : cs_l
+ ? X86EMUL_MODE_PROT64 : cs_db
+ ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
- trace_kvm_emulate_insn_start(vcpu);
+ r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
/* Only allow emulation of specific instructions on #UD
 * (namely VMMCALL, sysenter, sysexit, syscall) */
+ c = &vcpu->arch.emulate_ctxt.decode;
if (emulation_type & EMULTYPE_TRAP_UD) {
if (!c->twobyte)
return EMULATE_FAIL;
@@ -4471,11 +3470,10 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
++vcpu->stat.insn_emulation;
if (r) {
- if (reexecute_instruction(vcpu, cr2))
+ ++vcpu->stat.insn_emulation_fail;
+ if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
return EMULATE_DONE;
- if (emulation_type & EMULTYPE_SKIP)
- return EMULATE_FAIL;
- return handle_emulation_failure(vcpu);
+ return EMULATE_FAIL;
}
}
@@ -4484,74 +3482,245 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
return EMULATE_DONE;
}
- /* this is needed for the vmware backdoor interface to work since it
- changes register values during the IO operation */
- memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
+ r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
+ shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
+
+ if (r == 0)
+ kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
+
+ if (vcpu->arch.pio.string)
+ return EMULATE_DO_MMIO;
-restart:
- r = x86_emulate_insn(&vcpu->arch.emulate_ctxt);
+ if ((r || vcpu->mmio_is_write) && run) {
+ run->exit_reason = KVM_EXIT_MMIO;
+ run->mmio.phys_addr = vcpu->mmio_phys_addr;
+ memcpy(run->mmio.data, vcpu->mmio_data, 8);
+ run->mmio.len = vcpu->mmio_size;
+ run->mmio.is_write = vcpu->mmio_is_write;
+ }
- if (r == EMULATION_FAILED) {
- if (reexecute_instruction(vcpu, cr2))
+ if (r) {
+ if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
return EMULATE_DONE;
+ if (!vcpu->mmio_needed) {
+ kvm_report_emulation_failure(vcpu, "mmio");
+ return EMULATE_FAIL;
+ }
+ return EMULATE_DO_MMIO;
+ }
+
+ kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+
+ if (vcpu->mmio_is_write) {
+ vcpu->mmio_needed = 0;
+ return EMULATE_DO_MMIO;
+ }
+
+ return EMULATE_DONE;
+}
+EXPORT_SYMBOL_GPL(emulate_instruction);
+
+static int pio_copy_data(struct kvm_vcpu *vcpu)
+{
+ void *p = vcpu->arch.pio_data;
+ gva_t q = vcpu->arch.pio.guest_gva;
+ unsigned bytes;
+ int ret;
+ u32 error_code;
- return handle_emulation_failure(vcpu);
- }
-
-done:
- if (vcpu->arch.emulate_ctxt.have_exception) {
- inject_emulated_exception(vcpu);
- r = EMULATE_DONE;
- } else if (vcpu->arch.pio.count) {
- if (!vcpu->arch.pio.in)
- vcpu->arch.pio.count = 0;
- r = EMULATE_DO_MMIO;
- } else if (vcpu->mmio_needed) {
- if (vcpu->mmio_is_write)
- vcpu->mmio_needed = 0;
- r = EMULATE_DO_MMIO;
- } else if (r == EMULATION_RESTART)
- goto restart;
+ bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
+ if (vcpu->arch.pio.in)
+ ret = kvm_write_guest_virt(q, p, bytes, vcpu, &error_code);
else
- r = EMULATE_DONE;
+ ret = kvm_read_guest_virt(q, p, bytes, vcpu, &error_code);
+
+ if (ret == X86EMUL_PROPAGATE_FAULT)
+ kvm_inject_page_fault(vcpu, q, error_code);
+
+ return ret;
+}
+
+int complete_pio(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pio_request *io = &vcpu->arch.pio;
+ long delta;
+ int r;
+ unsigned long val;
- toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility);
- kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
- kvm_make_request(KVM_REQ_EVENT, vcpu);
- memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
- kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
+ if (!io->string) {
+ if (io->in) {
+ val = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ memcpy(&val, vcpu->arch.pio_data, io->size);
+ kvm_register_write(vcpu, VCPU_REGS_RAX, val);
+ }
+ } else {
+ if (io->in) {
+ r = pio_copy_data(vcpu);
+ if (r)
+ goto out;
+ }
+ delta = 1;
+ if (io->rep) {
+ delta *= io->cur_count;
+ /*
+ * The size of the register should really depend on
+ * the current address size.
+ */
+ val = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ val -= delta;
+ kvm_register_write(vcpu, VCPU_REGS_RCX, val);
+ }
+ if (io->down)
+ delta = -delta;
+ delta *= io->size;
+ if (io->in) {
+ val = kvm_register_read(vcpu, VCPU_REGS_RDI);
+ val += delta;
+ kvm_register_write(vcpu, VCPU_REGS_RDI, val);
+ } else {
+ val = kvm_register_read(vcpu, VCPU_REGS_RSI);
+ val += delta;
+ kvm_register_write(vcpu, VCPU_REGS_RSI, val);
+ }
+ }
+out:
+ io->count -= io->cur_count;
+ io->cur_count = 0;
+
+ return 0;
+}
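
The register bookkeeping in complete_pio() reduces to one signed stride, scaled by elements for RCX and by bytes for RSI/RDI. A sketch of that arithmetic — pio_stride() is an illustrative helper, not in the file:

/*
 * e.g. "rep outsw", cur_count = 4, size = 2, EFLAGS.DF clear:
 *	RCX -= 4;  RSI += 4 * 2 = 8
 * with DF set (io->down), the stride is negated: RSI -= 8
 */
static long pio_stride(struct kvm_pio_request *io)
{
	long delta = io->rep ? io->cur_count : 1;

	if (io->down)			/* DF set: string ops run backwards */
		delta = -delta;
	return delta * io->size;	/* bytes added to RSI (out) / RDI (in) */
}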
+
+static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
+{
+ /* TODO: String I/O for an in-kernel device */
+ int r;
+
+ if (vcpu->arch.pio.in)
+ r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
+ vcpu->arch.pio.size, pd);
+ else
+ r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
+ vcpu->arch.pio.port, vcpu->arch.pio.size,
+ pd);
return r;
}
-EXPORT_SYMBOL_GPL(x86_emulate_instruction);
-int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
+static int pio_string_write(struct kvm_vcpu *vcpu)
{
- unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
- int ret = emulator_pio_out_emulated(size, port, &val, 1, vcpu);
- /* do not return to emulator after return from userspace */
- vcpu->arch.pio.count = 0;
- return ret;
+ struct kvm_pio_request *io = &vcpu->arch.pio;
+ void *pd = vcpu->arch.pio_data;
+ int i, r = 0;
+
+ for (i = 0; i < io->cur_count; i++) {
+ if (kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
+ io->port, io->size, pd)) {
+ r = -EOPNOTSUPP;
+ break;
+ }
+ pd += io->size;
+ }
+ return r;
}
-EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
-static void tsc_bad(void *info)
+int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
{
- kvm___this_cpu_write(cpu_tsc_khz, 0);
+ unsigned long val;
+
+ trace_kvm_pio(!in, port, size, 1);
+
+ vcpu->run->exit_reason = KVM_EXIT_IO;
+ vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
+ vcpu->run->io.size = vcpu->arch.pio.size = size;
+ vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
+ vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
+ vcpu->run->io.port = vcpu->arch.pio.port = port;
+ vcpu->arch.pio.in = in;
+ vcpu->arch.pio.string = 0;
+ vcpu->arch.pio.down = 0;
+ vcpu->arch.pio.rep = 0;
+
+ if (!vcpu->arch.pio.in) {
+ val = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ memcpy(vcpu->arch.pio_data, &val, 4);
+ }
+
+ if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
+ complete_pio(vcpu);
+ return 1;
+ }
+ return 0;
}
+EXPORT_SYMBOL_GPL(kvm_emulate_pio);
-static void tsc_khz_changed(void *data)
+int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
+ int size, unsigned long count, int down,
+ gva_t address, int rep, unsigned port)
{
- struct cpufreq_freqs *freq = data;
- unsigned long khz = 0;
+ unsigned now, in_page;
+ int ret = 0;
+
+ trace_kvm_pio(!in, port, size, count);
- if (data)
- khz = freq->new;
- else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
- khz = cpufreq_quick_get(raw_smp_processor_id());
- if (!khz)
- khz = tsc_khz;
- kvm___this_cpu_write(cpu_tsc_khz, khz);
+ vcpu->run->exit_reason = KVM_EXIT_IO;
+ vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
+ vcpu->run->io.size = vcpu->arch.pio.size = size;
+ vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
+ vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
+ vcpu->run->io.port = vcpu->arch.pio.port = port;
+ vcpu->arch.pio.in = in;
+ vcpu->arch.pio.string = 1;
+ vcpu->arch.pio.down = down;
+ vcpu->arch.pio.rep = rep;
+
+ if (!count) {
+ kvm_x86_ops->skip_emulated_instruction(vcpu);
+ return 1;
+ }
+
+ if (!down)
+ in_page = PAGE_SIZE - offset_in_page(address);
+ else
+ in_page = offset_in_page(address) + size;
+ now = min(count, (unsigned long)in_page / size);
+ if (!now)
+ now = 1;
+ if (down) {
+ /*
+ * String I/O in reverse. Yuck. Kill the guest, fix later.
+ */
+ pr_unimpl(vcpu, "guest string pio down\n");
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+ vcpu->run->io.count = now;
+ vcpu->arch.pio.cur_count = now;
+
+ if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
+ kvm_x86_ops->skip_emulated_instruction(vcpu);
+
+ vcpu->arch.pio.guest_gva = address;
+
+ if (!vcpu->arch.pio.in) {
+ /* string PIO write */
+ ret = pio_copy_data(vcpu);
+ if (ret == X86EMUL_PROPAGATE_FAULT)
+ return 1;
+ if (ret == 0 && !pio_string_write(vcpu)) {
+ complete_pio(vcpu);
+ if (vcpu->arch.pio.count == 0)
+ ret = 1;
+ }
+ }
+ /* no string PIO read support yet */
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
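
The clamp that computes 'now' caps each pass at a page boundary so pio_copy_data() never has to cross pages. A worked example, assuming 4 KiB pages:

/*
 * address = 0x1ffe, size = 2, count = 8, direction up:
 *	in_page = PAGE_SIZE - offset_in_page(0x1ffe) = 2
 *	now = min(8, 2 / 2) = 1		-> only the element before 0x2000
 * if size were 4, in_page / size would be 0 and now is forced to 1,
 * so an element that straddles the boundary is still attempted.
 */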
+
+static void bounce_off(void *info)
+{
+ /* nothing */
}
static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
@@ -4562,60 +3731,21 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
struct kvm_vcpu *vcpu;
int i, send_ipi = 0;
- /*
- * We allow guests to temporarily run on slowing clocks,
- * provided we notify them after, or to run on accelerating
- * clocks, provided we notify them before. Thus time never
- * goes backwards.
- *
- * However, we have a problem. We can't atomically update
- * the frequency of a given CPU from this function; it is
- * merely a notifier, which can be called from any CPU.
- * Changing the TSC frequency at arbitrary points in time
- * requires a recomputation of local variables related to
- * the TSC for each VCPU. We must flag these local variables
- * to be updated and be sure the update takes place with the
- * new frequency before any guests proceed.
- *
- * Unfortunately, the combination of hotplug CPU and frequency
- * change creates an intractable locking scenario; the order
- * of when these callouts happen is undefined with respect to
- * CPU hotplug, and they can race with each other. As such,
- * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is
- * undefined; you can actually have a CPU frequency change take
- * place in between the computation of X and the setting of the
- * variable. To protect against this problem, all updates of
- * the per_cpu tsc_khz variable are done in an interrupt
- * protected IPI, and all callers wishing to update the value
- * must wait for a synchronous IPI to complete (which is trivial
- * if the caller is on the CPU already). This establishes the
- * necessary total order on variable updates.
- *
- * Note that because a guest time update may take place
- * anytime after the setting of the VCPU's request bit, the
- * correct TSC value must be set before the request. However,
- * to ensure the update actually makes it to any guest which
- * starts running in hardware virtualization between the set
- * and the acquisition of the spinlock, we must also ping the
- * CPU after setting the request bit.
- *
- */
-
if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
return 0;
if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
return 0;
-
- smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
+ per_cpu(cpu_tsc_khz, freq->cpu) = freq->new;
spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
kvm_for_each_vcpu(i, vcpu, kvm) {
if (vcpu->cpu != freq->cpu)
continue;
- kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ if (!kvm_request_guest_time_update(vcpu))
+ continue;
if (vcpu->cpu != smp_processor_id())
- send_ipi = 1;
+ send_ipi++;
}
}
spin_unlock(&kvm_lock);
@@ -4633,106 +3763,34 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
* guest context is entered kvmclock will be updated,
* so the guest will not see stale values.
*/
- smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
+ smp_call_function_single(freq->cpu, bounce_off, NULL, 1);
}
return 0;
}
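
The two early returns at the top of the notifier encode the invariant spelled out in the comment block deleted above: a guest may run on a slowing clock if updated afterwards, or on an accelerating clock if updated beforehand, so kvmclock never appears to run backwards. Annotated:

/*
 * PRECHANGE  && old > new : slow-down,  handled at POSTCHANGE instead
 * POSTCHANGE && old < new : speed-up,   already handled at PRECHANGE
 * every other edge falls through and refreshes per-cpu cpu_tsc_khz
 */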
static struct notifier_block kvmclock_cpufreq_notifier_block = {
- .notifier_call = kvmclock_cpufreq_notifier
-};
-
-static int kvmclock_cpu_notifier(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
-{
- unsigned int cpu = (unsigned long)hcpu;
-
- switch (action) {
- case CPU_ONLINE:
- case CPU_DOWN_FAILED:
- smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
- break;
- case CPU_DOWN_PREPARE:
- smp_call_function_single(cpu, tsc_bad, NULL, 1);
- break;
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block kvmclock_cpu_notifier_block = {
- .notifier_call = kvmclock_cpu_notifier,
- .priority = -INT_MAX
+ .notifier_call = kvmclock_cpufreq_notifier
};
static void kvm_timer_init(void)
{
int cpu;
- max_tsc_khz = tsc_khz;
- register_hotcpu_notifier(&kvmclock_cpu_notifier_block);
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
-#ifdef CONFIG_CPU_FREQ
- struct cpufreq_policy policy;
- memset(&policy, 0, sizeof(policy));
- cpu = get_cpu();
- cpufreq_get_policy(&policy, cpu);
- if (policy.cpuinfo.max_freq)
- max_tsc_khz = policy.cpuinfo.max_freq;
- put_cpu();
-#endif
cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
+ for_each_online_cpu(cpu) {
+ unsigned long khz = cpufreq_get(cpu);
+ if (!khz)
+ khz = tsc_khz;
+ per_cpu(cpu_tsc_khz, cpu) = khz;
+ }
+ } else {
+ for_each_possible_cpu(cpu)
+ per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
}
- pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz);
- for_each_online_cpu(cpu)
- smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
-}
-
-static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
-
-static int kvm_is_in_guest(void)
-{
- return percpu_read(current_vcpu) != NULL;
}
-static int kvm_is_user_mode(void)
-{
- int user_mode = 3;
-
- if (percpu_read(current_vcpu))
- user_mode = kvm_x86_ops->get_cpl(percpu_read(current_vcpu));
-
- return user_mode != 0;
-}
-
-static unsigned long kvm_get_guest_ip(void)
-{
- unsigned long ip = 0;
-
- if (percpu_read(current_vcpu))
- ip = kvm_rip_read(percpu_read(current_vcpu));
-
- return ip;
-}
-
-static struct perf_guest_info_callbacks kvm_guest_cbs = {
- .is_in_guest = kvm_is_in_guest,
- .is_user_mode = kvm_is_user_mode,
- .get_guest_ip = kvm_get_guest_ip,
-};
-
-void kvm_before_handle_nmi(struct kvm_vcpu *vcpu)
-{
- percpu_write(current_vcpu, vcpu);
-}
-EXPORT_SYMBOL_GPL(kvm_before_handle_nmi);
-
-void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
-{
- percpu_write(current_vcpu, NULL);
-}
-EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
-
int kvm_arch_init(void *opaque)
{
int r;
@@ -4751,10 +3809,6 @@ int kvm_arch_init(void *opaque)
}
if (ops->disabled_by_bios()) {
printk(KERN_ERR "kvm: disabled by bios\n");
-#ifndef KVM_TBOOT_ENABLED_WORKS
-
- printk(KERN_ERR "kvm: if TXT is enabled in the BIOS, disable it\n");
-#endif
r = -EOPNOTSUPP;
goto out;
}
@@ -4765,20 +3819,14 @@ int kvm_arch_init(void *opaque)
kvm_init_msr_list();
- kvm_xstate_size_init();
-
kvm_x86_ops = ops;
kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
+ kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
PT_DIRTY_MASK, PT64_NX_MASK, 0);
kvm_timer_init();
- perf_register_guest_info_callbacks(&kvm_guest_cbs);
-
- if (kvm_cpu_has_xsave)
- host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-
return 0;
out:
@@ -4787,12 +3835,9 @@ out:
void kvm_arch_exit(void)
{
- perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
-
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
- unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block);
kvm_x86_ops = NULL;
kvm_mmu_module_exit();
}
@@ -4942,23 +3987,88 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
kvm_x86_ops->patch_hypercall(vcpu, instruction);
- return emulator_write_emulated(rip, instruction, 3, NULL, vcpu);
+ return emulator_write_emulated(rip, instruction, 3, vcpu);
+}
+
+static u64 mk_cr_64(u64 curr_cr, u32 new_val)
+{
+ return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
}
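
mk_cr_64() splices the 32-bit value the guest wrote into the low half of a 64-bit control register while preserving the high half. Worked through:

/*
 * curr_cr = 0x000000ff00000011, new_val = 0x8005003b
 * mask    = ~((1ULL << 32) - 1) = 0xffffffff00000000
 * result  = (curr_cr & mask) | new_val = 0x000000ff8005003b
 */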
void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
{
- struct kvm_desc_ptr dt = { limit, base };
+ struct descriptor_table dt = { limit, base };
kvm_x86_ops->set_gdt(vcpu, &dt);
}
void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
{
- struct kvm_desc_ptr dt = { limit, base };
+ struct descriptor_table dt = { limit, base };
kvm_x86_ops->set_idt(vcpu, &dt);
}
+void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
+ unsigned long *rflags)
+{
+ kvm_lmsw(vcpu, msw);
+ *rflags = kvm_get_rflags(vcpu);
+}
+
+unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
+{
+ unsigned long value;
+
+ switch (cr) {
+ case 0:
+ value = kvm_read_cr0(vcpu);
+ break;
+ case 2:
+ value = vcpu->arch.cr2;
+ break;
+ case 3:
+ value = vcpu->arch.cr3;
+ break;
+ case 4:
+ value = kvm_read_cr4(vcpu);
+ break;
+ case 8:
+ value = kvm_get_cr8(vcpu);
+ break;
+ default:
+ vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
+ return 0;
+ }
+
+ return value;
+}
+
+void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
+ unsigned long *rflags)
+{
+ switch (cr) {
+ case 0:
+ kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
+ *rflags = kvm_get_rflags(vcpu);
+ break;
+ case 2:
+ vcpu->arch.cr2 = val;
+ break;
+ case 3:
+ kvm_set_cr3(vcpu, val);
+ break;
+ case 4:
+ kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
+ break;
+ case 8:
+ kvm_set_cr8(vcpu, val & 0xfUL);
+ break;
+ default:
+ vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
+ }
+}
+
static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
{
struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
@@ -5022,13 +4132,9 @@ int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
- best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
- if (!best || best->eax < 0x80000008)
- goto not_found;
best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
if (best)
return best->eax & 0xff;
-not_found:
return 36;
}
@@ -5109,10 +4215,10 @@ static void vapic_exit(struct kvm_vcpu *vcpu)
if (!apic || !apic->vapic_addr)
return;
- idx = srcu_read_lock(&vcpu->kvm->srcu);
+ idx = kvm_srcu_read_lock(&vcpu->kvm->srcu);
kvm_release_page_dirty(apic->vapic_page);
mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ kvm_srcu_read_unlock(&vcpu->kvm->srcu, idx);
}
static void update_cr8_intercept(struct kvm_vcpu *vcpu)
@@ -5142,13 +4248,9 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
{
/* try to reinject previous events if any */
if (vcpu->arch.exception.pending) {
- trace_kvm_inj_exception(vcpu->arch.exception.nr,
- vcpu->arch.exception.has_error_code,
- vcpu->arch.exception.error_code);
kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
vcpu->arch.exception.has_error_code,
- vcpu->arch.exception.error_code,
- vcpu->arch.exception.reinject);
+ vcpu->arch.exception.error_code);
return;
}
@@ -5178,84 +4280,44 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
}
}
-static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
-{
- if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
- !vcpu->guest_xcr0_loaded) {
- /* kvm_set_xcr() also depends on this */
- xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
- vcpu->guest_xcr0_loaded = 1;
- }
-}
-
-static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
-{
- if (vcpu->guest_xcr0_loaded) {
- if (vcpu->arch.xcr0 != host_xcr0)
- xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
- vcpu->guest_xcr0_loaded = 0;
- }
-}
-
static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
{
int r;
bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
vcpu->run->request_interrupt_window;
- if (vcpu->requests) {
- if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
+ if (vcpu->requests)
+ if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
kvm_mmu_unload(vcpu);
- if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
+
+ r = kvm_mmu_reload(vcpu);
+ if (unlikely(r))
+ goto out;
+
+ if (vcpu->requests) {
+ if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
__kvm_migrate_timers(vcpu);
- if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
- r = kvm_guest_time_update(vcpu);
- if (unlikely(r))
- goto out;
- }
- if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
+ if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests))
+ kvm_write_guest_time(vcpu);
+ if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
kvm_mmu_sync_roots(vcpu);
- if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
+ if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
kvm_x86_ops->tlb_flush(vcpu);
- if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
+ if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
+ &vcpu->requests)) {
vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
r = 0;
goto out;
}
- if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
+ if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
r = 0;
goto out;
}
- if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) {
+ if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) {
vcpu->fpu_active = 0;
kvm_x86_ops->fpu_deactivate(vcpu);
}
- if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
- /* Page is swapped out. Do synthetic halt */
- vcpu->arch.apf.halted = true;
- r = 1;
- goto out;
- }
- }
-
- r = kvm_mmu_reload(vcpu);
- if (unlikely(r))
- goto out;
-
- if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
- inject_pending_event(vcpu);
-
- /* enable NMI/IRQ window open exits if needed */
- if (vcpu->arch.nmi_pending)
- kvm_x86_ops->enable_nmi_window(vcpu);
- else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
- kvm_x86_ops->enable_irq_window(vcpu);
-
- if (kvm_lapic_enabled(vcpu)) {
- update_cr8_intercept(vcpu);
- kvm_lapic_sync_to_vapic(vcpu);
- }
}
preempt_disable();
@@ -5263,25 +4325,34 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_x86_ops->prepare_guest_switch(vcpu);
if (vcpu->fpu_active)
kvm_load_guest_fpu(vcpu);
- kvm_load_guest_xcr0(vcpu);
-
- atomic_set(&vcpu->guest_mode, 1);
- smp_wmb();
local_irq_disable();
- if (!atomic_read(&vcpu->guest_mode) || vcpu->requests
- || need_resched() || signal_pending(current)) {
- atomic_set(&vcpu->guest_mode, 0);
- smp_wmb();
+ clear_bit(KVM_REQ_KICK, &vcpu->requests);
+ smp_mb__after_clear_bit();
+
+ if (vcpu->requests || need_resched() || signal_pending(current)) {
+ set_bit(KVM_REQ_KICK, &vcpu->requests);
local_irq_enable();
preempt_enable();
- kvm_x86_ops->cancel_injection(vcpu);
r = 1;
goto out;
}
- srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+ inject_pending_event(vcpu);
+
+ /* enable NMI/IRQ window open exits if needed */
+ if (vcpu->arch.nmi_pending)
+ kvm_x86_ops->enable_nmi_window(vcpu);
+ else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
+ kvm_x86_ops->enable_irq_window(vcpu);
+
+ if (kvm_lapic_enabled(vcpu)) {
+ update_cr8_intercept(vcpu);
+ kvm_lapic_sync_to_vapic(vcpu);
+ }
+
+ kvm_srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
kvm_guest_enter();
@@ -5306,10 +4377,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (hw_breakpoint_active())
hw_breakpoint_restore();
- kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
-
- atomic_set(&vcpu->guest_mode, 0);
- smp_wmb();
+ set_bit(KVM_REQ_KICK, &vcpu->requests);
local_irq_enable();
++vcpu->stat.exits;
@@ -5326,7 +4394,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
preempt_enable();
- vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ vcpu->srcu_idx = kvm_srcu_read_lock(&vcpu->kvm->srcu);
/*
* Profile KVM exit RIPs:
@@ -5360,26 +4428,24 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
}
- vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+ vcpu->srcu_idx = kvm_srcu_read_lock(&kvm->srcu);
vapic_enter(vcpu);
r = 1;
while (r > 0) {
- if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
- !vcpu->arch.apf.halted)
+ if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
r = vcpu_enter_guest(vcpu);
else {
- srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+ kvm_srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
kvm_vcpu_block(vcpu);
- vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
- if (kvm_check_request(KVM_REQ_UNHALT, vcpu))
+ vcpu->srcu_idx = kvm_srcu_read_lock(&kvm->srcu);
+ if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
{
switch(vcpu->arch.mp_state) {
case KVM_MP_STATE_HALTED:
vcpu->arch.mp_state =
KVM_MP_STATE_RUNNABLE;
case KVM_MP_STATE_RUNNABLE:
- vcpu->arch.apf.halted = false;
break;
case KVM_MP_STATE_SIPI_RECEIVED:
default:
@@ -5401,22 +4467,20 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
vcpu->run->exit_reason = KVM_EXIT_INTR;
++vcpu->stat.request_irq_exits;
}
-
- kvm_check_async_pf_completion(vcpu);
-
if (signal_pending(current)) {
r = -EINTR;
vcpu->run->exit_reason = KVM_EXIT_INTR;
++vcpu->stat.signal_exits;
}
if (need_resched()) {
- srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+ kvm_srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
kvm_resched(vcpu);
- vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+ vcpu->srcu_idx = kvm_srcu_read_lock(&kvm->srcu);
}
}
- srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+ kvm_srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+ post_kvm_run_save(vcpu);
vapic_exit(vcpu);
@@ -5428,8 +4492,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
int r;
sigset_t sigsaved;
- if (!tsk_used_math(current) && kvm_init_fpu(current))
- return -ENOMEM;
+ vcpu_load(vcpu);
if (vcpu->sigset_active)
sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
@@ -5442,23 +4505,29 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
}
/* re-sync apic's tpr */
- if (!irqchip_in_kernel(vcpu->kvm)) {
- if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
- r = -EINVAL;
+ if (!irqchip_in_kernel(vcpu->kvm))
+ kvm_set_cr8(vcpu, kvm_run->cr8);
+
+ if (vcpu->arch.pio.cur_count) {
+ vcpu->srcu_idx = kvm_srcu_read_lock(&vcpu->kvm->srcu);
+ r = complete_pio(vcpu);
+ kvm_srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+ if (r)
goto out;
- }
}
+ if (vcpu->mmio_needed) {
+ memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
+ vcpu->mmio_read_completed = 1;
+ vcpu->mmio_needed = 0;
- if (vcpu->arch.pio.count || vcpu->mmio_needed) {
- if (vcpu->mmio_needed) {
- memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
- vcpu->mmio_read_completed = 1;
- vcpu->mmio_needed = 0;
- }
- vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
- r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
- srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
- if (r != EMULATE_DONE) {
+ vcpu->srcu_idx = kvm_srcu_read_lock(&vcpu->kvm->srcu);
+ r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0,
+ EMULTYPE_NO_DECODE);
+ kvm_srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+ if (r == EMULATE_DO_MMIO) {
+ /*
+ * Read-modify-write. Back to userspace.
+ */
r = 0;
goto out;
}
@@ -5470,15 +4539,17 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
r = __vcpu_run(vcpu);
out:
- post_kvm_run_save(vcpu);
if (vcpu->sigset_active)
sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+ vcpu_put(vcpu);
return r;
}
int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
+ vcpu_load(vcpu);
+
regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
@@ -5501,11 +4572,15 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
regs->rip = kvm_rip_read(vcpu);
regs->rflags = kvm_get_rflags(vcpu);
+ vcpu_put(vcpu);
+
return 0;
}
int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
+ vcpu_load(vcpu);
+
kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
@@ -5530,11 +4605,17 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
vcpu->arch.exception.pending = false;
- kvm_make_request(KVM_REQ_EVENT, vcpu);
+ vcpu_put(vcpu);
return 0;
}
+void kvm_get_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ kvm_x86_ops->get_segment(vcpu, var, seg);
+}
+
void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
{
struct kvm_segment cs;
@@ -5548,7 +4629,9 @@ EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
struct kvm_sregs *sregs)
{
- struct kvm_desc_ptr dt;
+ struct descriptor_table dt;
+
+ vcpu_load(vcpu);
kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
@@ -5561,15 +4644,15 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
kvm_x86_ops->get_idt(vcpu, &dt);
- sregs->idt.limit = dt.size;
- sregs->idt.base = dt.address;
+ sregs->idt.limit = dt.limit;
+ sregs->idt.base = dt.base;
kvm_x86_ops->get_gdt(vcpu, &dt);
- sregs->gdt.limit = dt.size;
- sregs->gdt.base = dt.address;
+ sregs->gdt.limit = dt.limit;
+ sregs->gdt.base = dt.base;
sregs->cr0 = kvm_read_cr0(vcpu);
sregs->cr2 = vcpu->arch.cr2;
- sregs->cr3 = kvm_read_cr3(vcpu);
+ sregs->cr3 = vcpu->arch.cr3;
sregs->cr4 = kvm_read_cr4(vcpu);
sregs->cr8 = kvm_get_cr8(vcpu);
sregs->efer = vcpu->arch.efer;
@@ -5581,44 +4664,586 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
set_bit(vcpu->arch.interrupt.nr,
(unsigned long *)sregs->interrupt_bitmap);
+ vcpu_put(vcpu);
+
return 0;
}
int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
{
+ vcpu_load(vcpu);
mp_state->mp_state = vcpu->arch.mp_state;
+ vcpu_put(vcpu);
return 0;
}
int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
{
+ vcpu_load(vcpu);
vcpu->arch.mp_state = mp_state->mp_state;
- kvm_make_request(KVM_REQ_EVENT, vcpu);
+ vcpu_put(vcpu);
return 0;
}
-int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
- bool has_error_code, u32 error_code)
+static void kvm_set_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ kvm_x86_ops->set_segment(vcpu, var, seg);
+}
+
+static void seg_desct_to_kvm_desct(struct kvm_desc_struct *seg_desc, u16 selector,
+ struct kvm_segment *kvm_desct)
+{
+ kvm_desct->base = kvm_get_desc_base(seg_desc);
+ kvm_desct->limit = kvm_get_desc_limit(seg_desc);
+ if (seg_desc->g) {
+ kvm_desct->limit <<= 12;
+ kvm_desct->limit |= 0xfff;
+ }
+ kvm_desct->selector = selector;
+ kvm_desct->type = seg_desc->type;
+ kvm_desct->present = seg_desc->p;
+ kvm_desct->dpl = seg_desc->dpl;
+ kvm_desct->db = seg_desc->d;
+ kvm_desct->s = seg_desc->s;
+ kvm_desct->l = seg_desc->l;
+ kvm_desct->g = seg_desc->g;
+ kvm_desct->avl = seg_desc->avl;
+ if (!selector)
+ kvm_desct->unusable = 1;
+ else
+ kvm_desct->unusable = 0;
+ kvm_desct->padding = 0;
+}
+
+static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
+ u16 selector,
+ struct descriptor_table *dtable)
+{
+	if (selector & (1 << 2)) {
+ struct kvm_segment kvm_seg;
+
+ kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
+
+ if (kvm_seg.unusable)
+ dtable->limit = 0;
+ else
+ dtable->limit = kvm_seg.limit;
+ dtable->base = kvm_seg.base;
+	} else
+ kvm_x86_ops->get_gdt(vcpu, dtable);
+}
+
+/* allowed just for 8-byte segment descriptors */
+static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
+ struct kvm_desc_struct *seg_desc)
{
- struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
+ struct descriptor_table dtable;
+ u16 index = selector >> 3;
int ret;
+ u32 err;
+ gva_t addr;
+
+ get_segment_descriptor_dtable(vcpu, selector, &dtable);
+
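+	/*
+	 * Descriptors are 8 bytes wide and the table limit is inclusive,
+	 * so the last byte of entry 'index' lives at index * 8 + 7.
+	 */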
+ if (dtable.limit < index * 8 + 7) {
+ kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+ addr = dtable.base + index * 8;
+ ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof(*seg_desc),
+ vcpu, &err);
+ if (ret == X86EMUL_PROPAGATE_FAULT)
+ kvm_inject_page_fault(vcpu, addr, err);
- init_emulate_ctxt(vcpu);
+ return ret;
+}
- ret = emulator_task_switch(&vcpu->arch.emulate_ctxt,
- tss_selector, reason, has_error_code,
- error_code);
+/* allowed just for 8-byte segment descriptors */
+static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
+ struct kvm_desc_struct *seg_desc)
+{
+ struct descriptor_table dtable;
+ u16 index = selector >> 3;
+
+ get_segment_descriptor_dtable(vcpu, selector, &dtable);
+
+ if (dtable.limit < index * 8 + 7)
+ return 1;
+ return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu, NULL);
+}
+
+static gpa_t get_tss_base_addr_write(struct kvm_vcpu *vcpu,
+ struct kvm_desc_struct *seg_desc)
+{
+ u32 base_addr = kvm_get_desc_base(seg_desc);
+
+ return kvm_mmu_gva_to_gpa_write(vcpu, base_addr, NULL);
+}
+
+static gpa_t get_tss_base_addr_read(struct kvm_vcpu *vcpu,
+ struct kvm_desc_struct *seg_desc)
+{
+ u32 base_addr = kvm_get_desc_base(seg_desc);
+
+ return kvm_mmu_gva_to_gpa_read(vcpu, base_addr, NULL);
+}
+
+static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
+{
+ struct kvm_segment kvm_seg;
+
+ kvm_get_segment(vcpu, &kvm_seg, seg);
+ return kvm_seg.selector;
+}
+
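+/*
+ * In real and VM86 modes a segment register just holds a paragraph
+ * number: the base is selector * 16 and the limit is 64 KiB, e.g.
+ * selector 0xb800 gives base 0xb8000.
+ */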
+static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
+{
+ struct kvm_segment segvar = {
+ .base = selector << 4,
+ .limit = 0xffff,
+ .selector = selector,
+ .type = 3,
+ .present = 1,
+ .dpl = 3,
+ .db = 0,
+ .s = 1,
+ .l = 0,
+ .g = 0,
+ .avl = 0,
+ .unusable = 0,
+ };
+ kvm_x86_ops->set_segment(vcpu, &segvar, seg);
+ return X86EMUL_CONTINUE;
+}
+
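+/*
+ * In VM86 mode the ordinary segment registers behave real-mode style;
+ * only LDTR and TR are still loaded through the protected-mode path.
+ */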
+static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
+{
+ return (seg != VCPU_SREG_LDTR) &&
+ (seg != VCPU_SREG_TR) &&
+ (kvm_get_rflags(vcpu) & X86_EFLAGS_VM);
+}
+
+int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg)
+{
+ struct kvm_segment kvm_seg;
+ struct kvm_desc_struct seg_desc;
+ u8 dpl, rpl, cpl;
+ unsigned err_vec = GP_VECTOR;
+ u32 err_code = 0;
+ bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
+ int ret;
+
+ if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu))
+ return kvm_load_realmode_segment(vcpu, selector, seg);
+
+ /* NULL selector is not valid for TR, CS and SS */
+ if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR)
+ && null_selector)
+ goto exception;
+
+ /* TR should be in GDT only */
+ if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
+ goto exception;
+
+ ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc);
if (ret)
- return EMULATE_FAIL;
+ return ret;
- memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
- kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
- kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
- kvm_make_request(KVM_REQ_EVENT, vcpu);
- return EMULATE_DONE;
+ seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg);
+
+ if (null_selector) { /* for NULL selector skip all following checks */
+ kvm_seg.unusable = 1;
+ goto load;
+ }
+
+ err_code = selector & 0xfffc;
+ err_vec = GP_VECTOR;
+
+	/* can't load a system descriptor into a segment register */
+ if (seg <= VCPU_SREG_GS && !kvm_seg.s)
+ goto exception;
+
+ if (!kvm_seg.present) {
+ err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
+ goto exception;
+ }
+
+ rpl = selector & 3;
+ dpl = kvm_seg.dpl;
+ cpl = kvm_x86_ops->get_cpl(vcpu);
+
+ switch (seg) {
+ case VCPU_SREG_SS:
+ /*
+		 * segment is not a writable data segment, or segment
+		 * selector's RPL != CPL, or descriptor's DPL != CPL
+ */
+ if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl)
+ goto exception;
+ break;
+ case VCPU_SREG_CS:
+ if (!(kvm_seg.type & 8))
+ goto exception;
+
+ if (kvm_seg.type & 4) {
+ /* conforming */
+ if (dpl > cpl)
+ goto exception;
+ } else {
+ /* nonconforming */
+ if (rpl > cpl || dpl != cpl)
+ goto exception;
+ }
+ /* CS(RPL) <- CPL */
+ selector = (selector & 0xfffc) | cpl;
+ break;
+ case VCPU_SREG_TR:
+ if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9))
+ goto exception;
+ break;
+ case VCPU_SREG_LDTR:
+ if (kvm_seg.s || kvm_seg.type != 2)
+ goto exception;
+ break;
+ default: /* DS, ES, FS, or GS */
+ /*
+ * segment is not a data or readable code segment or
+ * ((segment is a data or nonconforming code segment)
+ * and (both RPL and CPL > DPL))
+ */
+ if ((kvm_seg.type & 0xa) == 0x8 ||
+ (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl)))
+ goto exception;
+ break;
+ }
+
+ if (!kvm_seg.unusable && kvm_seg.s) {
+ /* mark segment as accessed */
+ kvm_seg.type |= 1;
+ seg_desc.type |= 1;
+ save_guest_segment_descriptor(vcpu, selector, &seg_desc);
+ }
+load:
+ kvm_set_segment(vcpu, &kvm_seg, seg);
+ return X86EMUL_CONTINUE;
+exception:
+ kvm_queue_exception_e(vcpu, err_vec, err_code);
+ return X86EMUL_PROPAGATE_FAULT;
+}
+
+static void save_state_to_tss32(struct kvm_vcpu *vcpu,
+ struct tss_segment_32 *tss)
+{
+ tss->cr3 = vcpu->arch.cr3;
+ tss->eip = kvm_rip_read(vcpu);
+ tss->eflags = kvm_get_rflags(vcpu);
+ tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
+ tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
+ tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
+ tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
+ tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
+ tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
+ tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
+ tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
+ tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
+ tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
+ tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
+ tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
+ tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
+}
+
+static void kvm_load_segment_selector(struct kvm_vcpu *vcpu, u16 sel, int seg)
+{
+ struct kvm_segment kvm_seg;
+ kvm_get_segment(vcpu, &kvm_seg, seg);
+ kvm_seg.selector = sel;
+ kvm_set_segment(vcpu, &kvm_seg, seg);
+}
+
+static int load_state_from_tss32(struct kvm_vcpu *vcpu,
+ struct tss_segment_32 *tss)
+{
+ kvm_set_cr3(vcpu, tss->cr3);
+
+ kvm_rip_write(vcpu, tss->eip);
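+	/* bit 1 of EFLAGS is reserved and always reads as 1 */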
+ kvm_set_rflags(vcpu, tss->eflags | 2);
+
+ kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
+ kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
+ kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
+ kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
+ kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
+ kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
+ kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
+
+ /*
+ * SDM says that segment selectors are loaded before segment
+ * descriptors
+ */
+ kvm_load_segment_selector(vcpu, tss->ldt_selector, VCPU_SREG_LDTR);
+ kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
+ kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
+ kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
+ kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
+ kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS);
+ kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS);
+
+ /*
+	 * Now load segment descriptors. If a fault happens at this stage
+	 * it is handled in the context of the new task
+ */
+ if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, VCPU_SREG_LDTR))
+ return 1;
+
+ if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
+ return 1;
+
+ if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
+ return 1;
+
+ if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
+ return 1;
+
+ if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
+ return 1;
+
+ if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS))
+ return 1;
+
+ if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS))
+ return 1;
+ return 0;
+}
+
+static void save_state_to_tss16(struct kvm_vcpu *vcpu,
+ struct tss_segment_16 *tss)
+{
+ tss->ip = kvm_rip_read(vcpu);
+ tss->flag = kvm_get_rflags(vcpu);
+ tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
+ tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
+ tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
+ tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
+ tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
+ tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
+
+ tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
+ tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
+ tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
+ tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
+ tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
+}
+
+static int load_state_from_tss16(struct kvm_vcpu *vcpu,
+ struct tss_segment_16 *tss)
+{
+ kvm_rip_write(vcpu, tss->ip);
+ kvm_set_rflags(vcpu, tss->flag | 2);
+ kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
+ kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
+ kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
+ kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
+ kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
+ kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
+ kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
+
+ /*
+ * SDM says that segment selectors are loaded before segment
+ * descriptors
+ */
+ kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR);
+ kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
+ kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
+ kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
+ kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
+
+ /*
+	 * Now load segment descriptors. If a fault happens at this stage
+	 * it is handled in the context of the new task
+ */
+ if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR))
+ return 1;
+
+ if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
+ return 1;
+
+ if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
+ return 1;
+
+ if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
+ return 1;
+
+ if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
+ return 1;
+ return 0;
+}
+
+static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
+ u16 old_tss_sel, u32 old_tss_base,
+ struct kvm_desc_struct *nseg_desc)
+{
+ struct tss_segment_16 tss_segment_16;
+ int ret = 0;
+
+ if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
+ sizeof tss_segment_16))
+ goto out;
+
+ save_state_to_tss16(vcpu, &tss_segment_16);
+
+ if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
+ sizeof tss_segment_16))
+ goto out;
+
+ if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
+ &tss_segment_16, sizeof tss_segment_16))
+ goto out;
+
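+	/*
+	 * old_tss_sel == 0xffff means "no back link"; otherwise store the
+	 * old TSS selector in the new TSS's previous task link field.
+	 */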
+ if (old_tss_sel != 0xffff) {
+ tss_segment_16.prev_task_link = old_tss_sel;
+
+ if (kvm_write_guest(vcpu->kvm,
+ get_tss_base_addr_write(vcpu, nseg_desc),
+ &tss_segment_16.prev_task_link,
+ sizeof tss_segment_16.prev_task_link))
+ goto out;
+ }
+
+ if (load_state_from_tss16(vcpu, &tss_segment_16))
+ goto out;
+
+ ret = 1;
+out:
+ return ret;
+}
+
+static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
+ u16 old_tss_sel, u32 old_tss_base,
+ struct kvm_desc_struct *nseg_desc)
+{
+ struct tss_segment_32 tss_segment_32;
+ int ret = 0;
+
+ if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
+ sizeof tss_segment_32))
+ goto out;
+
+ save_state_to_tss32(vcpu, &tss_segment_32);
+
+ if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
+ sizeof tss_segment_32))
+ goto out;
+
+ if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
+ &tss_segment_32, sizeof tss_segment_32))
+ goto out;
+
+ if (old_tss_sel != 0xffff) {
+ tss_segment_32.prev_task_link = old_tss_sel;
+
+ if (kvm_write_guest(vcpu->kvm,
+ get_tss_base_addr_write(vcpu, nseg_desc),
+ &tss_segment_32.prev_task_link,
+ sizeof tss_segment_32.prev_task_link))
+ goto out;
+ }
+
+ if (load_state_from_tss32(vcpu, &tss_segment_32))
+ goto out;
+
+ ret = 1;
+out:
+ return ret;
+}
+
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
+{
+ struct kvm_segment tr_seg;
+ struct kvm_desc_struct cseg_desc;
+ struct kvm_desc_struct nseg_desc;
+ int ret = 0;
+ u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
+ u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
+ u32 desc_limit;
+
+ old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL);
+
+	/* FIXME: Handle errors. Failure to read either TSS or its
+	 * descriptor should generate a page fault.
+ */
+ if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
+ goto out;
+
+ if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
+ goto out;
+
+ if (reason != TASK_SWITCH_IRET) {
+ int cpl;
+
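+		/*
+		 * For anything but an IRET the switch is privilege checked:
+		 * neither the selector's RPL nor the current CPL may exceed
+		 * the new TSS descriptor's DPL.
+		 */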
+ cpl = kvm_x86_ops->get_cpl(vcpu);
+ if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) {
+ kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+ return 1;
+ }
+ }
+
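+	/*
+	 * A 32-bit TSS (type & 8) must be at least 0x68 bytes and a
+	 * 16-bit TSS at least 0x2c, hence minimum limits of 0x67/0x2b.
+	 */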
+ desc_limit = kvm_get_desc_limit(&nseg_desc);
+ if (!nseg_desc.p ||
+ ((desc_limit < 0x67 && (nseg_desc.type & 8)) ||
+ desc_limit < 0x2b)) {
+ kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
+ return 1;
+ }
+
+ if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
+		cseg_desc.type &= ~(1 << 1); /* clear the busy (B) flag */
+ save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
+ }
+
+ if (reason == TASK_SWITCH_IRET) {
+ u32 eflags = kvm_get_rflags(vcpu);
+ kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
+ }
+
+	/* set the back link to the previous task only if the NT bit is set
+	   in EFLAGS; note that old_tss_sel is not used after this point */
+ if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
+ old_tss_sel = 0xffff;
+
+ if (nseg_desc.type & 8)
+ ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
+ old_tss_base, &nseg_desc);
+ else
+ ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
+ old_tss_base, &nseg_desc);
+
+ if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
+ u32 eflags = kvm_get_rflags(vcpu);
+ kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT);
+ }
+
+ if (reason != TASK_SWITCH_IRET) {
+		nseg_desc.type |= (1 << 1); /* set the busy (B) flag */
+ save_guest_segment_descriptor(vcpu, tss_selector,
+ &nseg_desc);
+ }
+
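+	/* a hardware task switch sets CR0.TS, so FPU state is reloaded lazily */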
+ kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0(vcpu) | X86_CR0_TS);
+ seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
+ tr_seg.type = 11;
+ kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
+out:
+ return ret;
}
EXPORT_SYMBOL_GPL(kvm_task_switch);
@@ -5627,19 +5252,20 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
{
int mmu_reset_needed = 0;
int pending_vec, max_bits;
- struct kvm_desc_ptr dt;
+ struct descriptor_table dt;
- dt.size = sregs->idt.limit;
- dt.address = sregs->idt.base;
+ vcpu_load(vcpu);
+
+ dt.limit = sregs->idt.limit;
+ dt.base = sregs->idt.base;
kvm_x86_ops->set_idt(vcpu, &dt);
- dt.size = sregs->gdt.limit;
- dt.address = sregs->gdt.base;
+ dt.limit = sregs->gdt.limit;
+ dt.base = sregs->gdt.base;
kvm_x86_ops->set_gdt(vcpu, &dt);
vcpu->arch.cr2 = sregs->cr2;
- mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
+ mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
vcpu->arch.cr3 = sregs->cr3;
- __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
kvm_set_cr8(vcpu, sregs->cr8);
@@ -5653,10 +5279,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
- if (sregs->cr4 & X86_CR4_OSXSAVE)
- update_cpuid(vcpu);
if (!is_long_mode(vcpu) && is_pae(vcpu)) {
- load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
+ load_pdptrs(vcpu, vcpu->arch.cr3);
mmu_reset_needed = 1;
}
@@ -5691,7 +5315,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
!is_protmode(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
- kvm_make_request(KVM_REQ_EVENT, vcpu);
+ vcpu_put(vcpu);
return 0;
}
@@ -5702,10 +5326,12 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
unsigned long rflags;
int i, r;
+ vcpu_load(vcpu);
+
if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
r = -EBUSY;
if (vcpu->arch.exception.pending)
- goto out;
+ goto unlock_out;
if (dbg->control & KVM_GUESTDBG_INJECT_DB)
kvm_queue_exception(vcpu, DB_VECTOR);
else
@@ -5733,9 +5359,11 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
}
- if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
- vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
- get_segment_base(vcpu, VCPU_SREG_CS);
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
+ vcpu->arch.singlestep_cs =
+ get_segment_selector(vcpu, VCPU_SREG_CS);
+ vcpu->arch.singlestep_rip = kvm_rip_read(vcpu);
+ }
/*
* Trigger an rflags update that will inject or remove the trace
@@ -5747,12 +5375,34 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
r = 0;
-out:
+unlock_out:
+ vcpu_put(vcpu);
return r;
}
/*
+ * fxsave FPU state. Taken from x86_64/processor.h; to be killed once
+ * we have asm/x86/processor.h.
+ */
+struct fxsave {
+ u16 cwd;
+ u16 swd;
+ u16 twd;
+ u16 fop;
+ u64 rip;
+ u64 rdp;
+ u32 mxcsr;
+ u32 mxcsr_mask;
+ u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
+#ifdef CONFIG_X86_64
+ u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
+#else
+ u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
+#endif
+};
+
+/*
* Translate a guest virtual address to a guest physical address.
*/
int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
@@ -5762,21 +5412,24 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
gpa_t gpa;
int idx;
- idx = srcu_read_lock(&vcpu->kvm->srcu);
+ vcpu_load(vcpu);
+ idx = kvm_srcu_read_lock(&vcpu->kvm->srcu);
gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ kvm_srcu_read_unlock(&vcpu->kvm->srcu, idx);
tr->physical_address = gpa;
tr->valid = gpa != UNMAPPED_GVA;
tr->writeable = 1;
tr->usermode = 0;
+ vcpu_put(vcpu);
return 0;
}
int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
- struct kvm_i387_fxsave_struct *fxsave =
- &vcpu->arch.guest_fpu.state->fxsave;
+ struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
+
+ vcpu_load(vcpu);
memcpy(fpu->fpr, fxsave->st_space, 128);
fpu->fcw = fxsave->cwd;
@@ -5787,13 +5440,16 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
fpu->last_dp = fxsave->rdp;
memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
+ vcpu_put(vcpu);
+
return 0;
}
int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
- struct kvm_i387_fxsave_struct *fxsave =
- &vcpu->arch.guest_fpu.state->fxsave;
+ struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
+
+ vcpu_load(vcpu);
memcpy(fxsave->st_space, fpu->fpr, 128);
fxsave->cwd = fpu->fcw;
@@ -5804,63 +5460,61 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
fxsave->rdp = fpu->last_dp;
memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
+ vcpu_put(vcpu);
+
return 0;
}
-int fx_init(struct kvm_vcpu *vcpu)
+void fx_init(struct kvm_vcpu *vcpu)
{
- int err;
-
- err = kvm_fpu_alloc(&vcpu->arch.guest_fpu);
- if (err)
- return err;
-
- kvm_fpu_finit(&vcpu->arch.guest_fpu);
+ unsigned after_mxcsr_mask;
/*
- * Ensure guest xcr0 is valid for loading
+	 * Touch the FPU for the first time in a non-atomic context: if
+	 * this is the first FPU instruction, the exception handler
+	 * fires before the instruction returns and must allocate
+	 * RAM with GFP_KERNEL.
*/
- vcpu->arch.xcr0 = XSTATE_FP;
+ if (!used_math())
+ kvm_fx_save(&vcpu->arch.host_fx_image);
- vcpu->arch.cr0 |= X86_CR0_ET;
+ /* Initialize guest FPU by resetting ours and saving into guest's */
+ preempt_disable();
+ kvm_fx_save(&vcpu->arch.host_fx_image);
+ kvm_fx_finit();
+ kvm_fx_save(&vcpu->arch.guest_fx_image);
+ kvm_fx_restore(&vcpu->arch.host_fx_image);
+ preempt_enable();
- return 0;
+ vcpu->arch.cr0 |= X86_CR0_ET;
+ after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
+ vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
+ memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
+ 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
}
EXPORT_SYMBOL_GPL(fx_init);
-static void fx_free(struct kvm_vcpu *vcpu)
-{
- kvm_fpu_free(&vcpu->arch.guest_fpu);
-}
-
void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
{
if (vcpu->guest_fpu_loaded)
return;
- /*
- * Restore all possible states in the guest,
- * and assume host would use all available bits.
- * Guest xcr0 would be loaded later.
- */
- kvm_put_guest_xcr0(vcpu);
vcpu->guest_fpu_loaded = 1;
- unlazy_fpu(current);
- kvm_fpu_restore_checking(&vcpu->arch.guest_fpu);
+ kvm_fx_save(&vcpu->arch.host_fx_image);
+ kvm_fx_restore(&vcpu->arch.guest_fx_image);
trace_kvm_fpu(1);
}
void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
{
- kvm_put_guest_xcr0(vcpu);
-
if (!vcpu->guest_fpu_loaded)
return;
vcpu->guest_fpu_loaded = 0;
- kvm_fpu_save_init(&vcpu->arch.guest_fpu);
+ kvm_fx_save(&vcpu->arch.guest_fx_image);
+ kvm_fx_restore(&vcpu->arch.host_fx_image);
++vcpu->stat.fpu_reload;
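+	/* ask the next guest entry to deactivate the guest FPU again */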
- kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
+ set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests);
trace_kvm_fpu(0);
}
@@ -5871,18 +5525,12 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
vcpu->arch.time_page = NULL;
}
- free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
- fx_free(vcpu);
kvm_x86_ops->vcpu_free(vcpu);
}
struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
unsigned int id)
{
- if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
- printk_once(KERN_WARNING
- "kvm: SMP vm created on host with unstable TSC; "
- "guest TSC will not be reliable\n");
return kvm_x86_ops->vcpu_create(kvm, id);
}
@@ -5890,6 +5538,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
{
int r;
+	/* We do fxsave, which requires a 16-byte-aligned save area. */
+ BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
+
vcpu->arch.mtrr_state.have_fixed = 1;
vcpu_load(vcpu);
r = kvm_arch_vcpu_reset(vcpu);
@@ -5907,13 +5558,10 @@ free_vcpu:
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
- vcpu->arch.apf.msr_val = 0;
-
vcpu_load(vcpu);
kvm_mmu_unload(vcpu);
vcpu_put(vcpu);
- fx_free(vcpu);
kvm_x86_ops->vcpu_free(vcpu);
}
@@ -5927,27 +5575,22 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
vcpu->arch.dr6 = DR6_FIXED_1;
vcpu->arch.dr7 = DR7_FIXED_1;
- kvm_make_request(KVM_REQ_EVENT, vcpu);
- vcpu->arch.apf.msr_val = 0;
-
- kvm_clear_async_pf_completion_queue(vcpu);
- kvm_async_pf_hash_reset(vcpu);
- vcpu->arch.apf.halted = false;
-
return kvm_x86_ops->vcpu_reset(vcpu);
}
int kvm_arch_hardware_enable(void *garbage)
{
- struct kvm *kvm;
- struct kvm_vcpu *vcpu;
- int i;
+ /*
+	 * Since this may be called from a hotplug notification,
+ * we can't get the CPU frequency directly.
+ */
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+ int cpu = raw_smp_processor_id();
+ per_cpu(cpu_tsc_khz, cpu) = 0;
+ }
kvm_shared_msr_cpu_online();
- list_for_each_entry(kvm, &vm_list, vm_list)
- kvm_for_each_vcpu(i, vcpu, kvm)
- if (vcpu->cpu == smp_processor_id())
- kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+
return kvm_x86_ops->hardware_enable(garbage);
}
@@ -5981,11 +5624,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
BUG_ON(vcpu->kvm == NULL);
kvm = vcpu->kvm;
- vcpu->arch.emulate_ctxt.ops = &emulate_ops;
- vcpu->arch.walk_mmu = &vcpu->arch.mmu;
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
- vcpu->arch.mmu.translate_gpa = translate_gpa;
- vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
else
@@ -5998,9 +5637,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
}
vcpu->arch.pio_data = page_address(page);
- if (!kvm->arch.virtual_tsc_khz)
- kvm_arch_set_tsc_khz(kvm, max_tsc_khz);
-
r = kvm_mmu_create(vcpu);
if (r < 0)
goto fail_free_pio_data;
@@ -6019,14 +5655,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
}
vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
- if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
- goto fail_free_mce_banks;
-
- kvm_async_pf_hash_reset(vcpu);
-
return 0;
-fail_free_mce_banks:
- kfree(vcpu->arch.mce_banks);
fail_free_lapic:
kvm_free_lapic(vcpu);
fail_mmu_destroy:
@@ -6043,23 +5672,34 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
kfree(vcpu->arch.mce_banks);
kvm_free_lapic(vcpu);
- idx = srcu_read_lock(&vcpu->kvm->srcu);
+ idx = kvm_srcu_read_lock(&vcpu->kvm->srcu);
kvm_mmu_destroy(vcpu);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ kvm_srcu_read_unlock(&vcpu->kvm->srcu, idx);
free_page((unsigned long)vcpu->arch.pio_data);
}
-int kvm_arch_init_vm(struct kvm *kvm)
+struct kvm *kvm_arch_create_vm(void)
{
+ struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
+
+ if (!kvm)
+ return ERR_PTR(-ENOMEM);
+
+ kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
+ if (!kvm->arch.aliases) {
+ kfree(kvm);
+ return ERR_PTR(-ENOMEM);
+ }
+
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
- spin_lock_init(&kvm->arch.tsc_write_lock);
+ rdtscll(kvm->arch.vm_init_tsc);
- return 0;
+ return kvm;
}
static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
@@ -6077,10 +5717,8 @@ static void kvm_free_vcpus(struct kvm *kvm)
/*
* Unpin any mmu pages first.
*/
- kvm_for_each_vcpu(i, vcpu, kvm) {
- kvm_clear_async_pf_completion_queue(vcpu);
+ kvm_for_each_vcpu(i, vcpu, kvm)
kvm_unload_vcpu_mmu(vcpu);
- }
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_arch_vcpu_free(vcpu);
@@ -6095,19 +5733,23 @@ static void kvm_free_vcpus(struct kvm *kvm)
void kvm_arch_sync_events(struct kvm *kvm)
{
kvm_free_all_assigned_devices(kvm);
- kvm_free_pit(kvm);
}
void kvm_arch_destroy_vm(struct kvm *kvm)
{
kvm_iommu_unmap_guest(kvm);
+ kvm_free_pit(kvm);
kfree(kvm->arch.vpic);
kfree(kvm->arch.vioapic);
kvm_free_vcpus(kvm);
+ kvm_free_physmem(kvm);
if (kvm->arch.apic_access_page)
put_page(kvm->arch.apic_access_page);
if (kvm->arch.ept_identity_pagetable)
put_page(kvm->arch.ept_identity_pagetable);
+ kvm_cleanup_srcu_struct(&kvm->srcu);
+ kfree(kvm->arch.aliases);
+ kfree(kvm);
}
int kvm_arch_prepare_memory_region(struct kvm *kvm,
@@ -6117,11 +5759,6 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
int user_alloc)
{
int npages = memslot->npages;
- int map_flags = MAP_PRIVATE | MAP_ANONYMOUS;
-
- /* Prevent internal slot pages from being moved by fork()/COW. */
- if (memslot->id >= KVM_MEMORY_SLOTS)
- map_flags = MAP_SHARED | MAP_ANONYMOUS;
/* To keep backward compatibility with older userspace,
 * x86 needs to handle the !user_alloc case.
@@ -6134,7 +5771,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
userspace_addr = do_mmap(NULL, 0,
npages * PAGE_SIZE,
PROT_READ | PROT_WRITE,
- map_flags,
+ MAP_PRIVATE | MAP_ANONYMOUS,
0);
up_write(&current->mm->mmap_sem);
@@ -6188,9 +5825,7 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
- return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
- !vcpu->arch.apf.halted)
- || !list_empty_careful(&vcpu->async_pf.done)
+ return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
|| vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
|| vcpu->arch.nmi_pending ||
(kvm_arch_interrupt_allowed(vcpu) &&
@@ -6209,7 +5844,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
me = get_cpu();
if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
- if (atomic_xchg(&vcpu->guest_mode, 0))
+ if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
kvm_smp_send_reschedule(cpu);
put_cpu();
}
@@ -6219,22 +5854,13 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
return kvm_x86_ops->interrupt_allowed(vcpu);
}
-bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
-{
- unsigned long current_rip = kvm_rip_read(vcpu) +
- get_segment_base(vcpu, VCPU_SREG_CS);
-
- return current_rip == linear_rip;
-}
-EXPORT_SYMBOL_GPL(kvm_is_linear_rip);
-
unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
{
unsigned long rflags;
rflags = kvm_x86_ops->get_rflags(vcpu);
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
- rflags &= ~X86_EFLAGS_TF;
+ rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF);
return rflags;
}
EXPORT_SYMBOL_GPL(kvm_get_rflags);
@@ -6242,154 +5868,14 @@ EXPORT_SYMBOL_GPL(kvm_get_rflags);
void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
- kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
- rflags |= X86_EFLAGS_TF;
+ vcpu->arch.singlestep_cs ==
+ get_segment_selector(vcpu, VCPU_SREG_CS) &&
+ vcpu->arch.singlestep_rip == kvm_rip_read(vcpu))
+ rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
kvm_x86_ops->set_rflags(vcpu, rflags);
- kvm_make_request(KVM_REQ_EVENT, vcpu);
}
EXPORT_SYMBOL_GPL(kvm_set_rflags);
-void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
-{
- int r;
-
- if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) ||
- is_error_page(work->page))
- return;
-
- r = kvm_mmu_reload(vcpu);
- if (unlikely(r))
- return;
-
- if (!vcpu->arch.mmu.direct_map &&
- work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu))
- return;
-
- vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true);
-}
-
-static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
-{
- return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
-}
-
-static inline u32 kvm_async_pf_next_probe(u32 key)
-{
- return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);
-}
-
-static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
- u32 key = kvm_async_pf_hash_fn(gfn);
-
- while (vcpu->arch.apf.gfns[key] != ~0)
- key = kvm_async_pf_next_probe(key);
-
- vcpu->arch.apf.gfns[key] = gfn;
-}
-
-static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
- int i;
- u32 key = kvm_async_pf_hash_fn(gfn);
-
- for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
- (vcpu->arch.apf.gfns[key] != gfn &&
- vcpu->arch.apf.gfns[key] != ~0); i++)
- key = kvm_async_pf_next_probe(key);
-
- return key;
-}
-
-bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
- return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
-}
-
-static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
- u32 i, j, k;
-
- i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
- while (true) {
- vcpu->arch.apf.gfns[i] = ~0;
- do {
- j = kvm_async_pf_next_probe(j);
- if (vcpu->arch.apf.gfns[j] == ~0)
- return;
- k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
- /*
- * k lies cyclically in ]i,j]
- * | i.k.j |
- * |....j i.k.| or |.k..j i...|
- */
- } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
- vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
- i = j;
- }
-}
-
-static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
-{
-
- return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
- sizeof(val));
-}
-
-void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
- struct kvm_async_pf *work)
-{
- struct x86_exception fault;
-
- trace_kvm_async_pf_not_present(work->arch.token, work->gva);
- kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
-
- if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
- (vcpu->arch.apf.send_user_only &&
- kvm_x86_ops->get_cpl(vcpu) == 0))
- kvm_make_request(KVM_REQ_APF_HALT, vcpu);
- else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
- fault.vector = PF_VECTOR;
- fault.error_code_valid = true;
- fault.error_code = 0;
- fault.nested_page_fault = false;
- fault.address = work->arch.token;
- kvm_inject_page_fault(vcpu, &fault);
- }
-}
-
-void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
- struct kvm_async_pf *work)
-{
- struct x86_exception fault;
-
- trace_kvm_async_pf_ready(work->arch.token, work->gva);
- if (is_error_page(work->page))
- work->arch.token = ~0; /* broadcast wakeup */
- else
- kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
-
- if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) &&
- !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
- fault.vector = PF_VECTOR;
- fault.error_code_valid = true;
- fault.error_code = 0;
- fault.nested_page_fault = false;
- fault.address = work->arch.token;
- kvm_inject_page_fault(vcpu, &fault);
- }
- vcpu->arch.apf.halted = false;
-}
-
-bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
-{
- if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
- return true;
- else
- return !kvm_event_needs_reinjection(vcpu) &&
- kvm_x86_ops->interrupt_allowed(vcpu);
-}
-
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
@@ -6401,4 +5887,3 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
-EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);