author | Josh Wilsdon <jwilsdon@joyent.com> | 2011-04-06 17:32:34 -0700
---|---|---
committer | Josh Wilsdon <jwilsdon@joyent.com> | 2011-04-06 17:42:30 -0700
commit | 44f65dde684a09c2319449bac768974120ed5d7f (patch) |
tree | 81e3a1a603f3e30b8c511ee15c56b10f44f51c07 /linux |
parent | c5e99aab98c3a8ddb8e0e2953c1a3e534d67ca4f (diff) |
download | illumos-kvm-44f65dde684a09c2319449bac768974120ed5d7f.tar.gz |
[HVM-29] Apparently I still made a mess because of that .gitignore. This version should actually match the kvm-kmod-2.6.34.
Diffstat (limited to 'linux')
47 files changed, 5600 insertions, 11684 deletions
diff --git a/linux/include/asm-ia64/kvm.h b/linux/include/asm-ia64/kvm.h deleted file mode 100644 index ce31fac..0000000 --- a/linux/include/asm-ia64/kvm.h +++ /dev/null @@ -1,304 +0,0 @@ -#ifndef KVM_UNIFDEF_H -#define KVM_UNIFDEF_H - -#ifdef __i386__ -#ifndef CONFIG_X86_32 -#define CONFIG_X86_32 1 -#endif -#endif - -#ifdef __x86_64__ -#ifndef CONFIG_X86_64 -#define CONFIG_X86_64 1 -#endif -#endif - -#if defined(__i386__) || defined (__x86_64__) -#ifndef CONFIG_X86 -#define CONFIG_X86 1 -#endif -#endif - -#ifdef __ia64__ -#ifndef CONFIG_IA64 -#define CONFIG_IA64 1 -#endif -#endif - -#ifdef __PPC__ -#ifndef CONFIG_PPC -#define CONFIG_PPC 1 -#endif -#endif - -#ifdef __s390__ -#ifndef CONFIG_S390 -#define CONFIG_S390 1 -#endif -#endif - -#endif -#ifndef __ASM_IA64_KVM_H -#define __ASM_IA64_KVM_H - -/* - * kvm structure definitions for ia64 - * - * Copyright (C) 2007 Xiantao Zhang <xiantao.zhang@intel.com> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple - * Place - Suite 330, Boston, MA 02111-1307 USA. - * - */ - -#include <asm/types.h> -#include <linux/ioctl.h> - -/* Select x86 specific features in <linux/kvm.h> */ -#define __KVM_HAVE_IOAPIC -#define __KVM_HAVE_DEVICE_ASSIGNMENT - -/* Architectural interrupt line count. 
*/ -#define KVM_NR_INTERRUPTS 256 - -#define KVM_IOAPIC_NUM_PINS 48 - -struct kvm_ioapic_state { - __u64 base_address; - __u32 ioregsel; - __u32 id; - __u32 irr; - __u32 pad; - union { - __u64 bits; - struct { - __u8 vector; - __u8 delivery_mode:3; - __u8 dest_mode:1; - __u8 delivery_status:1; - __u8 polarity:1; - __u8 remote_irr:1; - __u8 trig_mode:1; - __u8 mask:1; - __u8 reserve:7; - __u8 reserved[4]; - __u8 dest_id; - } fields; - } redirtbl[KVM_IOAPIC_NUM_PINS]; -}; - -#define KVM_IRQCHIP_PIC_MASTER 0 -#define KVM_IRQCHIP_PIC_SLAVE 1 -#define KVM_IRQCHIP_IOAPIC 2 -#define KVM_NR_IRQCHIPS 3 - -#define KVM_CONTEXT_SIZE 8*1024 - -struct kvm_fpreg { - union { - unsigned long bits[2]; - long double __dummy; /* force 16-byte alignment */ - } u; -}; - -union context { - /* 8K size */ - char dummy[KVM_CONTEXT_SIZE]; - struct { - unsigned long psr; - unsigned long pr; - unsigned long caller_unat; - unsigned long pad; - unsigned long gr[32]; - unsigned long ar[128]; - unsigned long br[8]; - unsigned long cr[128]; - unsigned long rr[8]; - unsigned long ibr[8]; - unsigned long dbr[8]; - unsigned long pkr[8]; - struct kvm_fpreg fr[128]; - }; -}; - -struct thash_data { - union { - struct { - unsigned long p : 1; /* 0 */ - unsigned long rv1 : 1; /* 1 */ - unsigned long ma : 3; /* 2-4 */ - unsigned long a : 1; /* 5 */ - unsigned long d : 1; /* 6 */ - unsigned long pl : 2; /* 7-8 */ - unsigned long ar : 3; /* 9-11 */ - unsigned long ppn : 38; /* 12-49 */ - unsigned long rv2 : 2; /* 50-51 */ - unsigned long ed : 1; /* 52 */ - unsigned long ig1 : 11; /* 53-63 */ - }; - struct { - unsigned long __rv1 : 53; /* 0-52 */ - unsigned long contiguous : 1; /*53 */ - unsigned long tc : 1; /* 54 TR or TC */ - unsigned long cl : 1; - /* 55 I side or D side cache line */ - unsigned long len : 4; /* 56-59 */ - unsigned long io : 1; /* 60 entry is for io or not */ - unsigned long nomap : 1; - /* 61 entry cann't be inserted into machine TLB.*/ - unsigned long checked : 1; - /* 62 for VTLB/VHPT sanity check */ - unsigned long invalid : 1; - /* 63 invalid entry */ - }; - unsigned long page_flags; - }; /* same for VHPT and TLB */ - - union { - struct { - unsigned long rv3 : 2; - unsigned long ps : 6; - unsigned long key : 24; - unsigned long rv4 : 32; - }; - unsigned long itir; - }; - union { - struct { - unsigned long ig2 : 12; - unsigned long vpn : 49; - unsigned long vrn : 3; - }; - unsigned long ifa; - unsigned long vadr; - struct { - unsigned long tag : 63; - unsigned long ti : 1; - }; - unsigned long etag; - }; - union { - struct thash_data *next; - unsigned long rid; - unsigned long gpaddr; - }; -}; - -#define NITRS 8 -#define NDTRS 8 - -struct saved_vpd { - unsigned long vhpi; - unsigned long vgr[16]; - unsigned long vbgr[16]; - unsigned long vnat; - unsigned long vbnat; - unsigned long vcpuid[5]; - unsigned long vpsr; - unsigned long vpr; - union { - unsigned long vcr[128]; - struct { - unsigned long dcr; - unsigned long itm; - unsigned long iva; - unsigned long rsv1[5]; - unsigned long pta; - unsigned long rsv2[7]; - unsigned long ipsr; - unsigned long isr; - unsigned long rsv3; - unsigned long iip; - unsigned long ifa; - unsigned long itir; - unsigned long iipa; - unsigned long ifs; - unsigned long iim; - unsigned long iha; - unsigned long rsv4[38]; - unsigned long lid; - unsigned long ivr; - unsigned long tpr; - unsigned long eoi; - unsigned long irr[4]; - unsigned long itv; - unsigned long pmv; - unsigned long cmcv; - unsigned long rsv5[5]; - unsigned long lrr0; - unsigned long lrr1; - unsigned long rsv6[46]; 
- }; - }; -}; - -struct kvm_regs { - struct saved_vpd vpd; - /*Arch-regs*/ - int mp_state; - unsigned long vmm_rr; - /* TR and TC. */ - struct thash_data itrs[NITRS]; - struct thash_data dtrs[NDTRS]; - /* Bit is set if there is a tr/tc for the region. */ - unsigned char itr_regions; - unsigned char dtr_regions; - unsigned char tc_regions; - - char irq_check; - unsigned long saved_itc; - unsigned long itc_check; - unsigned long timer_check; - unsigned long timer_pending; - unsigned long last_itc; - - unsigned long vrr[8]; - unsigned long ibr[8]; - unsigned long dbr[8]; - unsigned long insvc[4]; /* Interrupt in service. */ - unsigned long xtp; - - unsigned long metaphysical_rr0; /* from kvm_arch (so is pinned) */ - unsigned long metaphysical_rr4; /* from kvm_arch (so is pinned) */ - unsigned long metaphysical_saved_rr0; /* from kvm_arch */ - unsigned long metaphysical_saved_rr4; /* from kvm_arch */ - unsigned long fp_psr; /*used for lazy float register */ - unsigned long saved_gp; - /*for phycial emulation */ - - union context saved_guest; - - unsigned long reserved[64]; /* for future use */ -}; - -struct kvm_sregs { -}; - -struct kvm_fpu { -}; - -#define KVM_IA64_VCPU_STACK_SHIFT 16 -#define KVM_IA64_VCPU_STACK_SIZE (1UL << KVM_IA64_VCPU_STACK_SHIFT) - -struct kvm_ia64_vcpu_stack { - unsigned char stack[KVM_IA64_VCPU_STACK_SIZE]; -}; - -struct kvm_debug_exit_arch { -}; - -/* for KVM_SET_GUEST_DEBUG */ -struct kvm_guest_debug_arch { -}; - -#endif diff --git a/linux/include/asm-ia64/kvm_host.h b/linux/include/asm-ia64/kvm_host.h deleted file mode 100644 index 91e5de5..0000000 --- a/linux/include/asm-ia64/kvm_host.h +++ /dev/null @@ -1,639 +0,0 @@ -#ifndef KVM_UNIFDEF_H -#define KVM_UNIFDEF_H - -#ifdef __i386__ -#ifndef CONFIG_X86_32 -#define CONFIG_X86_32 1 -#endif -#endif - -#ifdef __x86_64__ -#ifndef CONFIG_X86_64 -#define CONFIG_X86_64 1 -#endif -#endif - -#if defined(__i386__) || defined (__x86_64__) -#ifndef CONFIG_X86 -#define CONFIG_X86 1 -#endif -#endif - -#ifdef __ia64__ -#ifndef CONFIG_IA64 -#define CONFIG_IA64 1 -#endif -#endif - -#ifdef __PPC__ -#ifndef CONFIG_PPC -#define CONFIG_PPC 1 -#endif -#endif - -#ifdef __s390__ -#ifndef CONFIG_S390 -#define CONFIG_S390 1 -#endif -#endif - -#endif -/* - * kvm_host.h: used for kvm module, and hold ia64-specific sections. - * - * Copyright (C) 2007, Intel Corporation. - * - * Xiantao Zhang <xiantao.zhang@intel.com> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple - * Place - Suite 330, Boston, MA 02111-1307 USA. 
- * - */ - -#ifndef __ASM_KVM_HOST_H -#define __ASM_KVM_HOST_H - -#define KVM_MEMORY_SLOTS 32 -/* memory slots that does not exposed to userspace */ -#define KVM_PRIVATE_MEM_SLOTS 4 - -#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 - -/* define exit reasons from vmm to kvm*/ -#define EXIT_REASON_VM_PANIC 0 -#define EXIT_REASON_MMIO_INSTRUCTION 1 -#define EXIT_REASON_PAL_CALL 2 -#define EXIT_REASON_SAL_CALL 3 -#define EXIT_REASON_SWITCH_RR6 4 -#define EXIT_REASON_VM_DESTROY 5 -#define EXIT_REASON_EXTERNAL_INTERRUPT 6 -#define EXIT_REASON_IPI 7 -#define EXIT_REASON_PTC_G 8 -#define EXIT_REASON_DEBUG 20 - -/*Define vmm address space and vm data space.*/ -#define KVM_VMM_SIZE (__IA64_UL_CONST(16)<<20) -#define KVM_VMM_SHIFT 24 -#define KVM_VMM_BASE 0xD000000000000000 -#define VMM_SIZE (__IA64_UL_CONST(8)<<20) - -/* - * Define vm_buffer, used by PAL Services, base address. - * Note: vm_buffer is in the VMM-BLOCK, the size must be < 8M - */ -#define KVM_VM_BUFFER_BASE (KVM_VMM_BASE + VMM_SIZE) -#define KVM_VM_BUFFER_SIZE (__IA64_UL_CONST(8)<<20) - -/* - * kvm guest's data area looks as follow: - * - * +----------------------+ ------- KVM_VM_DATA_SIZE - * | vcpu[n]'s data | | ___________________KVM_STK_OFFSET - * | | | / | - * | .......... | | /vcpu's struct&stack | - * | .......... | | /---------------------|---- 0 - * | vcpu[5]'s data | | / vpd | - * | vcpu[4]'s data | |/-----------------------| - * | vcpu[3]'s data | / vtlb | - * | vcpu[2]'s data | /|------------------------| - * | vcpu[1]'s data |/ | vhpt | - * | vcpu[0]'s data |____________________________| - * +----------------------+ | - * | memory dirty log | | - * +----------------------+ | - * | vm's data struct | | - * +----------------------+ | - * | | | - * | | | - * | | | - * | | | - * | | | - * | | | - * | | | - * | vm's p2m table | | - * | | | - * | | | - * | | | | - * vm's data->| | | | - * +----------------------+ ------- 0 - * To support large memory, needs to increase the size of p2m. - * To support more vcpus, needs to ensure it has enough space to - * hold vcpus' data. 
- */ - -#define KVM_VM_DATA_SHIFT 26 -#define KVM_VM_DATA_SIZE (__IA64_UL_CONST(1) << KVM_VM_DATA_SHIFT) -#define KVM_VM_DATA_BASE (KVM_VMM_BASE + KVM_VM_DATA_SIZE) - -#define KVM_P2M_BASE KVM_VM_DATA_BASE -#define KVM_P2M_SIZE (__IA64_UL_CONST(24) << 20) - -#define VHPT_SHIFT 16 -#define VHPT_SIZE (__IA64_UL_CONST(1) << VHPT_SHIFT) -#define VHPT_NUM_ENTRIES (__IA64_UL_CONST(1) << (VHPT_SHIFT-5)) - -#define VTLB_SHIFT 16 -#define VTLB_SIZE (__IA64_UL_CONST(1) << VTLB_SHIFT) -#define VTLB_NUM_ENTRIES (1UL << (VHPT_SHIFT-5)) - -#define VPD_SHIFT 16 -#define VPD_SIZE (__IA64_UL_CONST(1) << VPD_SHIFT) - -#define VCPU_STRUCT_SHIFT 16 -#define VCPU_STRUCT_SIZE (__IA64_UL_CONST(1) << VCPU_STRUCT_SHIFT) - -/* - * This must match KVM_IA64_VCPU_STACK_{SHIFT,SIZE} arch/ia64/include/asm/kvm.h - */ -#define KVM_STK_SHIFT 16 -#define KVM_STK_OFFSET (__IA64_UL_CONST(1)<< KVM_STK_SHIFT) - -#define KVM_VM_STRUCT_SHIFT 19 -#define KVM_VM_STRUCT_SIZE (__IA64_UL_CONST(1) << KVM_VM_STRUCT_SHIFT) - -#define KVM_MEM_DIRY_LOG_SHIFT 19 -#define KVM_MEM_DIRTY_LOG_SIZE (__IA64_UL_CONST(1) << KVM_MEM_DIRY_LOG_SHIFT) - -#ifndef __ASSEMBLY__ - -/*Define the max vcpus and memory for Guests.*/ -#define KVM_MAX_VCPUS (KVM_VM_DATA_SIZE - KVM_P2M_SIZE - KVM_VM_STRUCT_SIZE -\ - KVM_MEM_DIRTY_LOG_SIZE) / sizeof(struct kvm_vcpu_data) -#define KVM_MAX_MEM_SIZE (KVM_P2M_SIZE >> 3 << PAGE_SHIFT) - -#define VMM_LOG_LEN 256 - -#include <asm/types.h> -#include <linux/mm.h> -#include <linux/kvm.h> -#include <linux/kvm_para.h> -#include <linux/kvm_types.h> - -#include <asm/pal.h> -#include <asm/sal.h> -#include <asm/page.h> - -struct kvm_vcpu_data { - char vcpu_vhpt[VHPT_SIZE]; - char vcpu_vtlb[VTLB_SIZE]; - char vcpu_vpd[VPD_SIZE]; - char vcpu_struct[VCPU_STRUCT_SIZE]; -}; - -struct kvm_vm_data { - char kvm_p2m[KVM_P2M_SIZE]; - char kvm_vm_struct[KVM_VM_STRUCT_SIZE]; - char kvm_mem_dirty_log[KVM_MEM_DIRTY_LOG_SIZE]; - struct kvm_vcpu_data vcpu_data[KVM_MAX_VCPUS]; -}; - -#define VCPU_BASE(n) (KVM_VM_DATA_BASE + \ - offsetof(struct kvm_vm_data, vcpu_data[n])) -#define KVM_VM_BASE (KVM_VM_DATA_BASE + \ - offsetof(struct kvm_vm_data, kvm_vm_struct)) -#define KVM_MEM_DIRTY_LOG_BASE KVM_VM_DATA_BASE + \ - offsetof(struct kvm_vm_data, kvm_mem_dirty_log) - -#define VHPT_BASE(n) (VCPU_BASE(n) + offsetof(struct kvm_vcpu_data, vcpu_vhpt)) -#define VTLB_BASE(n) (VCPU_BASE(n) + offsetof(struct kvm_vcpu_data, vcpu_vtlb)) -#define VPD_BASE(n) (VCPU_BASE(n) + offsetof(struct kvm_vcpu_data, vcpu_vpd)) -#define VCPU_STRUCT_BASE(n) (VCPU_BASE(n) + \ - offsetof(struct kvm_vcpu_data, vcpu_struct)) - -/*IO section definitions*/ -#define IOREQ_READ 1 -#define IOREQ_WRITE 0 - -#define STATE_IOREQ_NONE 0 -#define STATE_IOREQ_READY 1 -#define STATE_IOREQ_INPROCESS 2 -#define STATE_IORESP_READY 3 - -/*Guest Physical address layout.*/ -#define GPFN_MEM (0UL << 60) /* Guest pfn is normal mem */ -#define GPFN_FRAME_BUFFER (1UL << 60) /* VGA framebuffer */ -#define GPFN_LOW_MMIO (2UL << 60) /* Low MMIO range */ -#define GPFN_PIB (3UL << 60) /* PIB base */ -#define GPFN_IOSAPIC (4UL << 60) /* IOSAPIC base */ -#define GPFN_LEGACY_IO (5UL << 60) /* Legacy I/O base */ -#define GPFN_GFW (6UL << 60) /* Guest Firmware */ -#define GPFN_PHYS_MMIO (7UL << 60) /* Directed MMIO Range */ - -#define GPFN_IO_MASK (7UL << 60) /* Guest pfn is I/O type */ -#define GPFN_INV_MASK (1UL << 63) /* Guest pfn is invalid */ -#define INVALID_MFN (~0UL) -#define MEM_G (1UL << 30) -#define MEM_M (1UL << 20) -#define MMIO_START (3 * MEM_G) -#define MMIO_SIZE (512 * MEM_M) -#define 
VGA_IO_START 0xA0000UL -#define VGA_IO_SIZE 0x20000 -#define LEGACY_IO_START (MMIO_START + MMIO_SIZE) -#define LEGACY_IO_SIZE (64 * MEM_M) -#define IO_SAPIC_START 0xfec00000UL -#define IO_SAPIC_SIZE 0x100000 -#define PIB_START 0xfee00000UL -#define PIB_SIZE 0x200000 -#define GFW_START (4 * MEM_G - 16 * MEM_M) -#define GFW_SIZE (16 * MEM_M) - -/*Deliver mode, defined for ioapic.c*/ -#define dest_Fixed IOSAPIC_FIXED -#define dest_LowestPrio IOSAPIC_LOWEST_PRIORITY - -#define NMI_VECTOR 2 -#define ExtINT_VECTOR 0 -#define NULL_VECTOR (-1) -#define IA64_SPURIOUS_INT_VECTOR 0x0f - -#define VCPU_LID(v) (((u64)(v)->vcpu_id) << 24) - -/* - *Delivery mode - */ -#define SAPIC_DELIV_SHIFT 8 -#define SAPIC_FIXED 0x0 -#define SAPIC_LOWEST_PRIORITY 0x1 -#define SAPIC_PMI 0x2 -#define SAPIC_NMI 0x4 -#define SAPIC_INIT 0x5 -#define SAPIC_EXTINT 0x7 - -/* - * vcpu->requests bit members for arch - */ -#define KVM_REQ_PTC_G 32 -#define KVM_REQ_RESUME 33 - -#define KVM_HPAGE_GFN_SHIFT(x) 0 -#define KVM_NR_PAGE_SIZES 1 -#define KVM_PAGES_PER_HPAGE(x) 1 - -struct kvm; -struct kvm_vcpu; - -struct kvm_mmio_req { - uint64_t addr; /* physical address */ - uint64_t size; /* size in bytes */ - uint64_t data; /* data (or paddr of data) */ - uint8_t state:4; - uint8_t dir:1; /* 1=read, 0=write */ -}; - -/*Pal data struct */ -struct kvm_pal_call{ - /*In area*/ - uint64_t gr28; - uint64_t gr29; - uint64_t gr30; - uint64_t gr31; - /*Out area*/ - struct ia64_pal_retval ret; -}; - -/* Sal data structure */ -struct kvm_sal_call{ - /*In area*/ - uint64_t in0; - uint64_t in1; - uint64_t in2; - uint64_t in3; - uint64_t in4; - uint64_t in5; - uint64_t in6; - uint64_t in7; - struct sal_ret_values ret; -}; - -/*Guest change rr6*/ -struct kvm_switch_rr6 { - uint64_t old_rr; - uint64_t new_rr; -}; - -union ia64_ipi_a{ - unsigned long val; - struct { - unsigned long rv : 3; - unsigned long ir : 1; - unsigned long eid : 8; - unsigned long id : 8; - unsigned long ib_base : 44; - }; -}; - -union ia64_ipi_d { - unsigned long val; - struct { - unsigned long vector : 8; - unsigned long dm : 3; - unsigned long ig : 53; - }; -}; - -/*ipi check exit data*/ -struct kvm_ipi_data{ - union ia64_ipi_a addr; - union ia64_ipi_d data; -}; - -/*global purge data*/ -struct kvm_ptc_g { - unsigned long vaddr; - unsigned long rr; - unsigned long ps; - struct kvm_vcpu *vcpu; -}; - -/*Exit control data */ -struct exit_ctl_data{ - uint32_t exit_reason; - uint32_t vm_status; - union { - struct kvm_mmio_req ioreq; - struct kvm_pal_call pal_data; - struct kvm_sal_call sal_data; - struct kvm_switch_rr6 rr_data; - struct kvm_ipi_data ipi_data; - struct kvm_ptc_g ptc_g_data; - } u; -}; - -union pte_flags { - unsigned long val; - struct { - unsigned long p : 1; /*0 */ - unsigned long : 1; /* 1 */ - unsigned long ma : 3; /* 2-4 */ - unsigned long a : 1; /* 5 */ - unsigned long d : 1; /* 6 */ - unsigned long pl : 2; /* 7-8 */ - unsigned long ar : 3; /* 9-11 */ - unsigned long ppn : 38; /* 12-49 */ - unsigned long : 2; /* 50-51 */ - unsigned long ed : 1; /* 52 */ - }; -}; - -union ia64_pta { - unsigned long val; - struct { - unsigned long ve : 1; - unsigned long reserved0 : 1; - unsigned long size : 6; - unsigned long vf : 1; - unsigned long reserved1 : 6; - unsigned long base : 49; - }; -}; - -struct thash_cb { - /* THASH base information */ - struct thash_data *hash; /* hash table pointer */ - union ia64_pta pta; - int num; -}; - -struct kvm_vcpu_stat { -}; - -struct kvm_vcpu_arch { - int launched; - int last_exit; - int last_run_cpu; - int vmm_tr_slot; - int 
vm_tr_slot; - int sn_rtc_tr_slot; - -#define KVM_MP_STATE_RUNNABLE 0 -#define KVM_MP_STATE_UNINITIALIZED 1 -#define KVM_MP_STATE_INIT_RECEIVED 2 -#define KVM_MP_STATE_HALTED 3 - int mp_state; - -#define MAX_PTC_G_NUM 3 - int ptc_g_count; - struct kvm_ptc_g ptc_g_data[MAX_PTC_G_NUM]; - - /*halt timer to wake up sleepy vcpus*/ - struct hrtimer hlt_timer; - long ht_active; - - struct kvm_lapic *apic; /* kernel irqchip context */ - struct vpd *vpd; - - /* Exit data for vmm_transition*/ - struct exit_ctl_data exit_data; - - cpumask_t cache_coherent_map; - - unsigned long vmm_rr; - unsigned long host_rr6; - unsigned long psbits[8]; - unsigned long cr_iipa; - unsigned long cr_isr; - unsigned long vsa_base; - unsigned long dirty_log_lock_pa; - unsigned long __gp; - /* TR and TC. */ - struct thash_data itrs[NITRS]; - struct thash_data dtrs[NDTRS]; - /* Bit is set if there is a tr/tc for the region. */ - unsigned char itr_regions; - unsigned char dtr_regions; - unsigned char tc_regions; - /* purge all */ - unsigned long ptce_base; - unsigned long ptce_count[2]; - unsigned long ptce_stride[2]; - /* itc/itm */ - unsigned long last_itc; - long itc_offset; - unsigned long itc_check; - unsigned long timer_check; - unsigned int timer_pending; - unsigned int timer_fired; - - unsigned long vrr[8]; - unsigned long ibr[8]; - unsigned long dbr[8]; - unsigned long insvc[4]; /* Interrupt in service. */ - unsigned long xtp; - - unsigned long metaphysical_rr0; /* from kvm_arch (so is pinned) */ - unsigned long metaphysical_rr4; /* from kvm_arch (so is pinned) */ - unsigned long metaphysical_saved_rr0; /* from kvm_arch */ - unsigned long metaphysical_saved_rr4; /* from kvm_arch */ - unsigned long fp_psr; /*used for lazy float register */ - unsigned long saved_gp; - /*for phycial emulation */ - int mode_flags; - struct thash_cb vtlb; - struct thash_cb vhpt; - char irq_check; - char irq_new_pending; - - unsigned long opcode; - unsigned long cause; - char log_buf[VMM_LOG_LEN]; - union context host; - union context guest; -}; - -struct kvm_vm_stat { - u64 remote_tlb_flush; -}; - -struct kvm_sal_data { - unsigned long boot_ip; - unsigned long boot_gp; -}; - -struct kvm_arch { - spinlock_t dirty_log_lock; - - unsigned long vm_base; - unsigned long metaphysical_rr0; - unsigned long metaphysical_rr4; - unsigned long vmm_init_rr; - - int is_sn2; - - struct kvm_ioapic *vioapic; - struct kvm_vm_stat stat; - struct kvm_sal_data rdv_sal_data; - - struct list_head assigned_dev_head; - struct iommu_domain *iommu_domain; - int iommu_flags; - - unsigned long irq_sources_bitmap; - unsigned long irq_states[KVM_IOAPIC_NUM_PINS]; -}; - -union cpuid3_t { - u64 value; - struct { - u64 number : 8; - u64 revision : 8; - u64 model : 8; - u64 family : 8; - u64 archrev : 8; - u64 rv : 24; - }; -}; - -struct kvm_pt_regs { - /* The following registers are saved by SAVE_MIN: */ - unsigned long b6; /* scratch */ - unsigned long b7; /* scratch */ - - unsigned long ar_csd; /* used by cmp8xchg16 (scratch) */ - unsigned long ar_ssd; /* reserved for future use (scratch) */ - - unsigned long r8; /* scratch (return value register 0) */ - unsigned long r9; /* scratch (return value register 1) */ - unsigned long r10; /* scratch (return value register 2) */ - unsigned long r11; /* scratch (return value register 3) */ - - unsigned long cr_ipsr; /* interrupted task's psr */ - unsigned long cr_iip; /* interrupted task's instruction pointer */ - unsigned long cr_ifs; /* interrupted task's function state */ - - unsigned long ar_unat; /* interrupted task's NaT 
register (preserved) */ - unsigned long ar_pfs; /* prev function state */ - unsigned long ar_rsc; /* RSE configuration */ - /* The following two are valid only if cr_ipsr.cpl > 0: */ - unsigned long ar_rnat; /* RSE NaT */ - unsigned long ar_bspstore; /* RSE bspstore */ - - unsigned long pr; /* 64 predicate registers (1 bit each) */ - unsigned long b0; /* return pointer (bp) */ - unsigned long loadrs; /* size of dirty partition << 16 */ - - unsigned long r1; /* the gp pointer */ - unsigned long r12; /* interrupted task's memory stack pointer */ - unsigned long r13; /* thread pointer */ - - unsigned long ar_fpsr; /* floating point status (preserved) */ - unsigned long r15; /* scratch */ - - /* The remaining registers are NOT saved for system calls. */ - unsigned long r14; /* scratch */ - unsigned long r2; /* scratch */ - unsigned long r3; /* scratch */ - unsigned long r16; /* scratch */ - unsigned long r17; /* scratch */ - unsigned long r18; /* scratch */ - unsigned long r19; /* scratch */ - unsigned long r20; /* scratch */ - unsigned long r21; /* scratch */ - unsigned long r22; /* scratch */ - unsigned long r23; /* scratch */ - unsigned long r24; /* scratch */ - unsigned long r25; /* scratch */ - unsigned long r26; /* scratch */ - unsigned long r27; /* scratch */ - unsigned long r28; /* scratch */ - unsigned long r29; /* scratch */ - unsigned long r30; /* scratch */ - unsigned long r31; /* scratch */ - unsigned long ar_ccv; /* compare/exchange value (scratch) */ - - /* - * Floating point registers that the kernel considers scratch: - */ - struct ia64_fpreg f6; /* scratch */ - struct ia64_fpreg f7; /* scratch */ - struct ia64_fpreg f8; /* scratch */ - struct ia64_fpreg f9; /* scratch */ - struct ia64_fpreg f10; /* scratch */ - struct ia64_fpreg f11; /* scratch */ - - unsigned long r4; /* preserved */ - unsigned long r5; /* preserved */ - unsigned long r6; /* preserved */ - unsigned long r7; /* preserved */ - unsigned long eml_unat; /* used for emulating instruction */ - unsigned long pad0; /* alignment pad */ -}; - -static inline struct kvm_pt_regs *vcpu_regs(struct kvm_vcpu *v) -{ - return (struct kvm_pt_regs *) ((unsigned long) v + KVM_STK_OFFSET) - 1; -} - -typedef int kvm_vmm_entry(void); -typedef void kvm_tramp_entry(union context *host, union context *guest); - -struct kvm_vmm_info{ - struct module *module; - kvm_vmm_entry *vmm_entry; - kvm_tramp_entry *tramp_entry; - unsigned long vmm_ivt; - unsigned long patch_mov_ar; - unsigned long patch_mov_ar_sn2; -}; - -int kvm_highest_pending_irq(struct kvm_vcpu *vcpu); -int kvm_emulate_halt(struct kvm_vcpu *vcpu); -int kvm_pal_emul(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run); -void kvm_sal_emul(struct kvm_vcpu *vcpu); - -#define __KVM_HAVE_ARCH_VM_ALLOC 1 -struct kvm *kvm_arch_alloc_vm(void); -void kvm_arch_free_vm(struct kvm *kvm); - -#endif /* __ASSEMBLY__*/ - -#endif diff --git a/linux/include/asm-ia64/kvm_para.h b/linux/include/asm-ia64/kvm_para.h deleted file mode 100644 index 2e2f499..0000000 --- a/linux/include/asm-ia64/kvm_para.h +++ /dev/null @@ -1,71 +0,0 @@ -#ifndef KVM_UNIFDEF_H -#define KVM_UNIFDEF_H - -#ifdef __i386__ -#ifndef CONFIG_X86_32 -#define CONFIG_X86_32 1 -#endif -#endif - -#ifdef __x86_64__ -#ifndef CONFIG_X86_64 -#define CONFIG_X86_64 1 -#endif -#endif - -#if defined(__i386__) || defined (__x86_64__) -#ifndef CONFIG_X86 -#define CONFIG_X86 1 -#endif -#endif - -#ifdef __ia64__ -#ifndef CONFIG_IA64 -#define CONFIG_IA64 1 -#endif -#endif - -#ifdef __PPC__ -#ifndef CONFIG_PPC -#define CONFIG_PPC 1 -#endif -#endif - 
-#ifdef __s390__ -#ifndef CONFIG_S390 -#define CONFIG_S390 1 -#endif -#endif - -#endif -#ifndef __IA64_KVM_PARA_H -#define __IA64_KVM_PARA_H - -/* - * Copyright (C) 2007 Xiantao Zhang <xiantao.zhang@intel.com> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple - * Place - Suite 330, Boston, MA 02111-1307 USA. - * - */ - -#ifdef __KERNEL__ - -static inline unsigned int kvm_arch_para_features(void) -{ - return 0; -} - -#endif - -#endif diff --git a/linux/include/asm-x86/hyperv.h b/linux/include/asm-x86/hyperv.h index 8b44b46..e33dcc8 100644 --- a/linux/include/asm-x86/hyperv.h +++ b/linux/include/asm-x86/hyperv.h @@ -38,8 +38,8 @@ #endif #endif -#ifndef _ASM_X86_HYPERV_H -#define _ASM_X86_HYPERV_H +#ifndef _ASM_X86_KVM_HYPERV_H +#define _ASM_X86_KVM_HYPERV_H #include <linux/types.h> @@ -54,10 +54,6 @@ #define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004 #define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005 -#define HYPERV_HYPERVISOR_PRESENT_BIT 0x80000000 -#define HYPERV_CPUID_MIN 0x40000005 -#define HYPERV_CPUID_MAX 0x4000ffff - /* * Feature identification. EAX indicates which features are available * to the partition based upon the current partition privileges. @@ -173,9 +169,6 @@ /* MSR used to provide vcpu index */ #define HV_X64_MSR_VP_INDEX 0x40000002 -/* MSR used to read the per-partition time reference counter */ -#define HV_X64_MSR_TIME_REF_COUNT 0x40000020 - /* Define the virtual APIC registers */ #define HV_X64_MSR_EOI 0x40000070 #define HV_X64_MSR_ICR 0x40000071 diff --git a/linux/include/asm-x86/kvm.h b/linux/include/asm-x86/kvm.h index 12ddb51..7cf06d2 100644 --- a/linux/include/asm-x86/kvm.h +++ b/linux/include/asm-x86/kvm.h @@ -61,9 +61,6 @@ #define __KVM_HAVE_PIT_STATE2 #define __KVM_HAVE_XEN_HVM #define __KVM_HAVE_VCPU_EVENTS -#define __KVM_HAVE_DEBUGREGS -#define __KVM_HAVE_XSAVE -#define __KVM_HAVE_XCRS /* Architectural interrupt line count. 
*/ #define KVM_NR_INTERRUPTS 256 @@ -300,11 +297,6 @@ struct kvm_reinject_control { /* When set in flags, include corresponding fields on KVM_SET_VCPU_EVENTS */ #define KVM_VCPUEVENT_VALID_NMI_PENDING 0x00000001 #define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002 -#define KVM_VCPUEVENT_VALID_SHADOW 0x00000004 - -/* Interrupt shadow states */ -#define KVM_X86_SHADOW_INT_MOV_SS 0x01 -#define KVM_X86_SHADOW_INT_STI 0x02 /* for KVM_GET/SET_VCPU_EVENTS */ struct kvm_vcpu_events { @@ -319,7 +311,7 @@ struct kvm_vcpu_events { __u8 injected; __u8 nr; __u8 soft; - __u8 shadow; + __u8 pad; } interrupt; struct { __u8 injected; @@ -332,33 +324,4 @@ struct kvm_vcpu_events { __u32 reserved[10]; }; -/* for KVM_GET/SET_DEBUGREGS */ -struct kvm_debugregs { - __u64 db[4]; - __u64 dr6; - __u64 dr7; - __u64 flags; - __u64 reserved[9]; -}; - -/* for KVM_CAP_XSAVE */ -struct kvm_xsave { - __u32 region[1024]; -}; - -#define KVM_MAX_XCRS 16 - -struct kvm_xcr { - __u32 xcr; - __u32 reserved; - __u64 value; -}; - -struct kvm_xcrs { - __u32 nr_xcrs; - __u32 flags; - struct kvm_xcr xcrs[KVM_MAX_XCRS]; - __u64 padding[16]; -}; - #endif /* _ASM_X86_KVM_H */ diff --git a/linux/include/asm-x86/kvm_emulate.h b/linux/include/asm-x86/kvm_emulate.h index a57ee7e..6f7d0f6 100644 --- a/linux/include/asm-x86/kvm_emulate.h +++ b/linux/include/asm-x86/kvm_emulate.h @@ -51,18 +51,8 @@ #ifndef _ASM_X86_KVM_X86_EMULATE_H #define _ASM_X86_KVM_X86_EMULATE_H - - struct x86_emulate_ctxt; -struct x86_exception { - u8 vector; - bool error_code_valid; - u16 error_code; - bool nested_page_fault; - u64 address; /* cr2 or nested page fault gpa */ -}; - /* * x86_emulate_ops: * @@ -99,10 +89,8 @@ struct x86_exception { #define X86EMUL_UNHANDLEABLE 1 /* Terminate emulation but return success to the caller. */ #define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */ -#define X86EMUL_RETRY_INSTR 3 /* retry the instruction for some reason */ -#define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */ -#define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */ - +#define X86EMUL_RETRY_INSTR 2 /* retry the instruction for some reason */ +#define X86EMUL_CMPXCHG_FAILED 2 /* cmpxchg did not see expected value */ struct x86_emulate_ops { /* * read_std: Read bytes of standard (non-emulated/special) memory. @@ -112,20 +100,9 @@ struct x86_emulate_ops { * @bytes: [IN ] Number of bytes to read from memory. */ int (*read_std)(unsigned long addr, void *val, - unsigned int bytes, struct kvm_vcpu *vcpu, - struct x86_exception *fault); + unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); /* - * write_std: Write bytes of standard (non-emulated/special) memory. - * Used for descriptor writing. - * @addr: [IN ] Linear address to which to write. - * @val: [OUT] Value write to memory, zero-extended to 'u_long'. - * @bytes: [IN ] Number of bytes to write to memory. - */ - int (*write_std)(unsigned long addr, void *val, - unsigned int bytes, struct kvm_vcpu *vcpu, - struct x86_exception *fault); - /* * fetch: Read bytes of standard (non-emulated/special) memory. * Used for instruction fetch. * @addr: [IN ] Linear address from which to read. @@ -133,8 +110,7 @@ struct x86_emulate_ops { * @bytes: [IN ] Number of bytes to read from memory. */ int (*fetch)(unsigned long addr, void *val, - unsigned int bytes, struct kvm_vcpu *vcpu, - struct x86_exception *fault); + unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); /* * read_emulated: Read bytes from emulated/special memory area. 
@@ -145,7 +121,6 @@ struct x86_emulate_ops { int (*read_emulated)(unsigned long addr, void *val, unsigned int bytes, - struct x86_exception *fault, struct kvm_vcpu *vcpu); /* @@ -158,7 +133,6 @@ struct x86_emulate_ops { int (*write_emulated)(unsigned long addr, const void *val, unsigned int bytes, - struct x86_exception *fault, struct kvm_vcpu *vcpu); /* @@ -173,53 +147,15 @@ struct x86_emulate_ops { const void *old, const void *new, unsigned int bytes, - struct x86_exception *fault, struct kvm_vcpu *vcpu); - int (*pio_in_emulated)(int size, unsigned short port, void *val, - unsigned int count, struct kvm_vcpu *vcpu); - - int (*pio_out_emulated)(int size, unsigned short port, const void *val, - unsigned int count, struct kvm_vcpu *vcpu); - - bool (*get_cached_descriptor)(struct kvm_desc_struct *desc, - int seg, struct kvm_vcpu *vcpu); - void (*set_cached_descriptor)(struct kvm_desc_struct *desc, - int seg, struct kvm_vcpu *vcpu); - u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu); - void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu); - unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu); - void (*get_gdt)(struct kvm_desc_ptr *dt, struct kvm_vcpu *vcpu); - void (*get_idt)(struct kvm_desc_ptr *dt, struct kvm_vcpu *vcpu); - ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu); - int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); - int (*cpl)(struct kvm_vcpu *vcpu); - int (*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu); - int (*set_dr)(int dr, unsigned long value, struct kvm_vcpu *vcpu); - int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); - int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); }; /* Type, address-of, and value of an instruction's operand. */ struct operand { enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type; unsigned int bytes; - union { - unsigned long orig_val; - u64 orig_val64; - }; - union { - unsigned long *reg; - struct segmented_address { - ulong ea; - unsigned seg; - } mem; - } addr; - union { - unsigned long val; - u64 val64; - char valptr[sizeof(unsigned long) + 2]; - }; + unsigned long val, orig_val, *ptr; }; struct fetch_cache { @@ -228,12 +164,6 @@ struct fetch_cache { unsigned long end; }; -struct read_cache { - u8 data[1024]; - unsigned long pos; - unsigned long end; -}; - struct decode_cache { u8 twobyte; u8 b; @@ -248,29 +178,29 @@ struct decode_cache { bool has_seg_override; u8 seg_override; unsigned int d; - int (*execute)(struct x86_emulate_ctxt *ctxt); unsigned long regs[NR_VCPU_REGS]; - unsigned long eip; + unsigned long eip, eip_orig; /* modrm */ u8 modrm; u8 modrm_mod; u8 modrm_reg; u8 modrm_rm; - u8 modrm_seg; + u8 use_modrm_ea; bool rip_relative; + unsigned long modrm_ea; + void *modrm_ptr; + unsigned long modrm_val; struct fetch_cache fetch; - struct read_cache io_read; - struct read_cache mem_read; }; -struct x86_emulate_ctxt { - struct x86_emulate_ops *ops; +#define X86_SHADOW_INT_MOV_SS 1 +#define X86_SHADOW_INT_STI 2 +struct x86_emulate_ctxt { /* Register state before/after emulation. */ struct kvm_vcpu *vcpu; unsigned long eflags; - unsigned long eip; /* eip before instruction emulation */ /* Emulated execution mode, represented by an X86EMUL_MODE value. 
*/ int mode; u32 cs_base; @@ -278,11 +208,6 @@ struct x86_emulate_ctxt { /* interruptibility state, as a result of execution of STI or MOV SS */ int interruptibility; - bool perm_ok; /* do not check permissions if true */ - - bool have_exception; - struct x86_exception exception; - /* decode cache */ struct decode_cache decode; }; @@ -305,14 +230,9 @@ struct x86_emulate_ctxt { #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 #endif -int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len); -#define EMULATION_FAILED -1 -#define EMULATION_OK 0 -#define EMULATION_RESTART 1 -int x86_emulate_insn(struct x86_emulate_ctxt *ctxt); -int emulator_task_switch(struct x86_emulate_ctxt *ctxt, - u16 tss_selector, int reason, - bool has_error_code, u32 error_code); -int emulate_int_real(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, int irq); +int x86_decode_insn(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops); +int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops); + #endif /* _ASM_X86_KVM_X86_EMULATE_H */ diff --git a/linux/include/asm-x86/kvm_host.h b/linux/include/asm-x86/kvm_host.h index 0c5e1f7..3090d71 100644 --- a/linux/include/asm-x86/kvm_host.h +++ b/linux/include/asm-x86/kvm_host.h @@ -55,7 +55,6 @@ #include <linux/mm.h> #include <linux/mmu_notifier.h> #include <linux/tracepoint.h> -#include <linux/cpumask.h> #include <linux/kvm.h> #include <linux/kvm_para.h> @@ -80,14 +79,11 @@ 0xFFFFFF0000000000ULL) #define INVALID_PAGE (~(hpa_t)0) -#define VALID_PAGE(x) ((x) != INVALID_PAGE) - #define UNMAPPED_GVA (~(gpa_t)0) /* KVM Hugepage definitions for x86 */ #define KVM_NR_PAGE_SIZES 3 -#define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 9) -#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x)) +#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + (((x) - 1) * 9)) #define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) #define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) #define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) @@ -113,24 +109,23 @@ #define IOPL_SHIFT 12 +#define KVM_ALIAS_SLOTS 4 + #define KVM_PERMILLE_MMU_PAGES 20 #define KVM_MIN_ALLOC_MMU_PAGES 64 #define KVM_MMU_HASH_SHIFT 10 #define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT) #define KVM_MIN_FREE_MMU_PAGES 5 #define KVM_REFILL_PAGES 25 -#define KVM_MAX_CPUID_ENTRIES 80 +#define KVM_MAX_CPUID_ENTRIES 40 #define KVM_NR_FIXED_MTRR_REGION 88 #define KVM_NR_VAR_MTRR 8 -#define ASYNC_PF_PER_VCPU 64 - extern spinlock_t kvm_lock; extern struct list_head vm_list; struct kvm_vcpu; struct kvm; -struct kvm_async_pf; enum kvm_reg { VCPU_REGS_RAX = 0, @@ -157,7 +152,6 @@ enum kvm_reg { enum kvm_reg_ex { VCPU_EXREG_PDPTR = NR_VCPU_REGS, - VCPU_EXREG_CR3, }; enum { @@ -217,15 +211,15 @@ struct kvm_pte_chain { union kvm_mmu_page_role { unsigned word; struct { + unsigned glevels:4; unsigned level:4; - unsigned cr4_pae:1; unsigned quadrant:2; unsigned pad_for_nice_hex_output:6; unsigned direct:1; unsigned access:3; unsigned invalid:1; + unsigned cr4_pge:1; unsigned nxe:1; - unsigned cr0_wp:1; }; }; @@ -233,6 +227,8 @@ struct kvm_mmu_page { struct list_head link; struct hlist_node hash_link; + struct list_head oos_link; + /* * The following two entries are used to key the shadow page in the * hash table. @@ -248,9 +244,9 @@ struct kvm_mmu_page { * in this shadow page. */ DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); - bool multimapped; /* More than one parent_pte? */ - bool unsync; + int multimapped; /* More than one parent_pte? 
*/ int root_count; /* Currently serving as active root */ + bool unsync; unsigned int unsync_children; union { u64 *parent_pte; /* !multimapped */ @@ -268,9 +264,14 @@ struct kvm_pv_mmu_op_buffer { struct kvm_pio_request { unsigned long count; + int cur_count; + gva_t guest_gva; int in; int port; int size; + int string; + int down; + int rep; }; /* @@ -280,16 +281,10 @@ struct kvm_pio_request { */ struct kvm_mmu { void (*new_cr3)(struct kvm_vcpu *vcpu); - void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root); - unsigned long (*get_cr3)(struct kvm_vcpu *vcpu); - int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err, - bool prefault); - void (*inject_page_fault)(struct kvm_vcpu *vcpu, - struct x86_exception *fault); + int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); void (*free)(struct kvm_vcpu *vcpu); gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, - struct x86_exception *exception); - gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access); + u32 *error); void (*prefetch_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page); int (*sync_page)(struct kvm_vcpu *vcpu, @@ -299,18 +294,13 @@ struct kvm_mmu { int root_level; int shadow_root_level; union kvm_mmu_page_role base_role; - bool direct_map; u64 *pae_root; - u64 *lm_root; u64 rsvd_bits_mask[2][4]; - - bool nx; - - u64 pdptrs[4]; /* pae */ }; struct kvm_vcpu_arch { + u64 host_tsc; /* * rip and regs accesses must go through * kvm_{register,rip}_{read,write} functions. @@ -327,6 +317,7 @@ struct kvm_vcpu_arch { unsigned long cr4_guest_owned_bits; unsigned long cr8; u32 hflags; + u64 pdptrs[4]; /* pae */ u64 efer; u64 apic_base; struct kvm_lapic *apic; /* kernel irqchip context */ @@ -336,31 +327,7 @@ struct kvm_vcpu_arch { u64 ia32_misc_enable_msr; bool tpr_access_reporting; - /* - * Paging state of the vcpu - * - * If the vcpu runs in guest mode with two level paging this still saves - * the paging mode of the l1 guest. This context is always used to - * handle faults. - */ struct kvm_mmu mmu; - - /* - * Paging state of an L2 guest (used for nested npt) - * - * This context will save all necessary information to walk page tables - * of the an L2 guest. This context is only initialized for page table - * walking and not for faulting since we never handle l2 page faults on - * the host. - */ - struct kvm_mmu nested_mmu; - - /* - * Pointer to the mmu context currently used for - * gva_to_gpa translations. 
- */ - struct kvm_mmu *walk_mmu; - /* only needed in kvm_pv_mmu_op() path, but it's hot so * put it here to avoid allocation */ struct kvm_pv_mmu_op_buffer mmu_op_buffer; @@ -381,8 +348,8 @@ struct kvm_vcpu_arch { unsigned long mmu_seq; } update_pte; - struct kvm_compat_fpu guest_fpu; - u64 xcr0; + struct i387_fxsave_struct host_fx_image; + struct i387_fxsave_struct guest_fx_image; gva_t mmio_fault_cr2; struct kvm_pio_request pio; @@ -393,7 +360,6 @@ struct kvm_vcpu_arch { struct kvm_queued_exception { bool pending; bool has_error_code; - bool reinject; u8 nr; u32 error_code; } exception; @@ -413,16 +379,10 @@ struct kvm_vcpu_arch { struct x86_emulate_ctxt emulate_ctxt; gpa_t time; - struct kvm_pvclock_vcpu_time_info hv_clock; - unsigned int hw_tsc_khz; + struct pvclock_vcpu_time_info hv_clock; + unsigned int hv_clock_tsc_khz; unsigned int time_offset; struct page *time_page; - u64 last_host_tsc; - u64 last_guest_tsc; - u64 last_kernel_ns; - u64 last_tsc_nsec; - u64 last_tsc_write; - bool tsc_catchup; bool nmi_pending; bool nmi_injected; @@ -442,28 +402,33 @@ struct kvm_vcpu_arch { u64 *mce_banks; /* used for guest single stepping over the given code position */ + u16 singlestep_cs; unsigned long singlestep_rip; - /* fields used by HYPER-V emulation */ u64 hv_vapic; +}; - cpumask_var_t wbinvd_dirty_mask; +struct kvm_mem_alias { + gfn_t base_gfn; + unsigned long npages; + gfn_t target_gfn; +#define KVM_ALIAS_INVALID 1UL + unsigned long flags; +}; - struct { - bool halted; - gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)]; - struct gfn_to_hva_cache data; - u64 msr_val; - u32 id; - bool send_user_only; - } apf; +#define KVM_ARCH_HAS_UNALIAS_INSTANTIATION + +struct kvm_mem_aliases { + struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; + int naliases; }; struct kvm_arch { - unsigned int n_used_mmu_pages; + struct kvm_mem_aliases *aliases; + + unsigned int n_free_mmu_pages; unsigned int n_requested_mmu_pages; - unsigned int n_max_mmu_pages; - atomic_t invlpg_counter; + unsigned int n_alloc_mmu_pages; struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; /* * Hash table of struct kvm_mmu_page. 
@@ -487,24 +452,14 @@ struct kvm_arch { gpa_t ept_identity_map_addr; unsigned long irq_sources_bitmap; + u64 vm_init_tsc; s64 kvmclock_offset; - spinlock_t tsc_write_lock; - u64 last_tsc_nsec; - u64 last_tsc_offset; - u64 last_tsc_write; - u32 virtual_tsc_khz; - u32 virtual_tsc_mult; - s8 virtual_tsc_shift; struct kvm_xen_hvm_config xen_hvm_config; /* fields used by HYPER-V emulation */ u64 hv_guest_os_id; u64 hv_hypercall; - - #ifdef CONFIG_KVM_MMU_AUDIT - int audit_point; - #endif }; struct kvm_vm_stat { @@ -546,6 +501,11 @@ struct kvm_vcpu_stat { u32 nmi_injections; }; +struct descriptor_table { + u16 limit; + unsigned long base; +} __attribute__((packed)); + struct kvm_x86_ops { int (*cpu_has_kvm_support)(void); /* __init */ int (*disabled_by_bios)(void); /* __init */ @@ -578,17 +538,17 @@ struct kvm_x86_ops { struct kvm_segment *var, int seg); void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu); - void (*decache_cr3)(struct kvm_vcpu *vcpu); void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); - void (*get_idt)(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt); - void (*set_idt)(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt); - void (*get_gdt)(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt); - void (*set_gdt)(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt); - void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value); + void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); + void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); + void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); + void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); + int (*get_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long *dest); + int (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value); void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); @@ -607,9 +567,7 @@ struct kvm_x86_ops { void (*set_irq)(struct kvm_vcpu *vcpu); void (*set_nmi)(struct kvm_vcpu *vcpu); void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr, - bool has_error_code, u32 error_code, - bool reinject); - void (*cancel_injection)(struct kvm_vcpu *vcpu); + bool has_error_code, u32 error_code); int (*interrupt_allowed)(struct kvm_vcpu *vcpu); int (*nmi_allowed)(struct kvm_vcpu *vcpu); bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); @@ -622,27 +580,10 @@ struct kvm_x86_ops { u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); int (*get_lpage_level)(void); bool (*rdtscp_supported)(void); - void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment); - - void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); - - void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry); - bool (*has_wbinvd_exit)(void); - - void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); - - void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); const struct trace_print_flags *exit_reasons_str; }; -struct kvm_arch_async_pf { - u32 token; - gfn_t gfn; - unsigned long cr3; - bool direct_map; -}; - extern struct kvm_x86_ops *kvm_x86_ops; int kvm_mmu_module_init(void); @@ -652,6 +593,7 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu); int 
kvm_mmu_create(struct kvm_vcpu *vcpu); int kvm_mmu_setup(struct kvm_vcpu *vcpu); void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); +void kvm_mmu_set_base_ptes(u64 base_pte); void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, u64 dirty_mask, u64 nx_mask, u64 x_mask); @@ -661,7 +603,7 @@ void kvm_mmu_zap_all(struct kvm *kvm); unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); -int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3); +int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, const void *val, int bytes); @@ -680,47 +622,49 @@ enum emulation_result { #define EMULTYPE_NO_DECODE (1 << 0) #define EMULTYPE_TRAP_UD (1 << 1) #define EMULTYPE_SKIP (1 << 2) -int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, - int emulation_type, void *insn, int insn_len); - -static inline int emulate_instruction(struct kvm_vcpu *vcpu, - int emulation_type) -{ - return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0); -} - +int emulate_instruction(struct kvm_vcpu *vcpu, + unsigned long cr2, u16 error_code, int emulation_type); +void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context); void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); +void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, + unsigned long *rflags); +unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr); +void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value, + unsigned long *rflags); void kvm_enable_efer_bits(u64); int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data); int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); struct x86_emulate_ctxt; -int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port); +int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, + int size, unsigned port); +int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in, + int size, unsigned long count, int down, + gva_t address, int rep, unsigned port); void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); int kvm_emulate_halt(struct kvm_vcpu *vcpu); int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); int emulate_clts(struct kvm_vcpu *vcpu); -int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); +int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, + unsigned long *dest); +int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, + unsigned long value); void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); -int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, - bool has_error_code, u32 error_code); +int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason); -int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); -int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); -int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); -int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); -int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); -int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val); +void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); +void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); +void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); +void 
kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); -int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr); int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); @@ -730,20 +674,22 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); -void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr); -void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); -void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); -int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - gfn_t gfn, void *data, int offset, int len, - u32 access); -void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); +void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, + u32 error_code); bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); int kvm_pic_set_irq(void *opaque, int irq, int level); void kvm_inject_nmi(struct kvm_vcpu *vcpu); -int fx_init(struct kvm_vcpu *vcpu); +void fx_init(struct kvm_vcpu *vcpu); + +int emulator_write_emulated(unsigned long addr, + const void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu); + +unsigned long segment_base(u16 selector); void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, @@ -754,29 +700,27 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); -gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, - struct x86_exception *exception); -gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, - struct x86_exception *exception); -gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, - struct x86_exception *exception); -gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, - struct x86_exception *exception); +gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); +gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); +gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); +gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); int kvm_fix_hypercall(struct kvm_vcpu *vcpu); -int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, - void *insn, int insn_len); +int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); void kvm_enable_tdp(void); void kvm_disable_tdp(void); +int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); int complete_pio(struct kvm_vcpu *vcpu); bool kvm_check_iopl(struct kvm_vcpu *vcpu); +struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn); + static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) { struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); @@ -784,6 +728,20 @@ static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) return (struct kvm_mmu_page *)page_private(page); } +static inline u16 kvm_read_fs(void) +{ + u16 seg; + 
asm("mov %%fs, %0" : "=g"(seg)); + return seg; +} + +static inline u16 kvm_read_gs(void) +{ + u16 seg; + asm("mov %%gs, %0" : "=g"(seg)); + return seg; +} + static inline u16 kvm_read_ldt(void) { u16 ldt; @@ -791,11 +749,38 @@ static inline u16 kvm_read_ldt(void) return ldt; } +static inline void kvm_load_fs(u16 sel) +{ + asm("mov %0, %%fs" : : "rm"(sel)); +} + +static inline void kvm_load_gs(u16 sel) +{ + asm("mov %0, %%gs" : : "rm"(sel)); +} + static inline void kvm_load_ldt(u16 sel) { asm("lldt %0" : : "rm"(sel)); } +static inline void kvm_get_idt(struct descriptor_table *table) +{ + asm("sidt %0" : "=m"(*table)); +} + +static inline void kvm_get_gdt(struct descriptor_table *table) +{ + asm("sgdt %0" : "=m"(*table)); +} + +static inline unsigned long kvm_read_tr_base(void) +{ + u16 tr; + asm("str %0" : "=g"(tr)); + return segment_base(tr); +} + #ifdef CONFIG_X86_64 static inline unsigned long read_msr(unsigned long msr) { @@ -806,6 +791,21 @@ static inline unsigned long read_msr(unsigned long msr) } #endif +static inline void kvm_fx_save(struct i387_fxsave_struct *image) +{ + asm("fxsave (%0)":: "r" (image)); +} + +static inline void kvm_fx_restore(struct i387_fxsave_struct *image) +{ + asm("fxrstor (%0)":: "r" (image)); +} + +static inline void kvm_fx_finit(void) +{ + asm("finit"); +} + static inline u32 get_rdx_init_val(void) { return 0x600; /* P6 family */ @@ -835,25 +835,20 @@ enum { #define HF_VINTR_MASK (1 << 2) #define HF_NMI_MASK (1 << 3) #define HF_IRET_MASK (1 << 4) -#define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */ /* * Hardware virtualization extension instructions may fault if a * reboot turns off virtualization while processes are running. * Trap the fault and ignore the instruction if that happens. */ -asmlinkage void kvm_spurious_fault(void); -extern bool kvm_rebooting; +asmlinkage void kvm_handle_fault_on_reboot(void); #define __kvm_handle_fault_on_reboot(insn) \ "666: " insn "\n\t" \ - "668: \n\t" \ ".pushsection .fixup, \"ax\" \n" \ "667: \n\t" \ - "cmpb $0, kvm_rebooting \n\t" \ - "jne 668b \n\t" \ __ASM_SIZE(push) " $666b \n\t" \ - "call kvm_spurious_fault \n\t" \ + "jmp kvm_handle_fault_on_reboot \n\t" \ ".popsection \n\t" \ ".pushsection __ex_table, \"a\" \n\t" \ _ASM_PTR " 666b, 667b \n\t" \ @@ -862,7 +857,6 @@ extern bool kvm_rebooting; #define KVM_ARCH_WANT_MMU_NOTIFIER int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); int kvm_age_hva(struct kvm *kvm, unsigned long hva); -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); @@ -872,17 +866,4 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v); void kvm_define_shared_msr(unsigned index, u32 msr); void kvm_set_shared_msr(unsigned index, u64 val, u64 mask); -bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); - -void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, - struct kvm_async_pf *work); -void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, - struct kvm_async_pf *work); -void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, - struct kvm_async_pf *work); -bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu); -extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); - -void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); - #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/linux/include/asm-x86/kvm_para.h b/linux/include/asm-x86/kvm_para.h index 5f575a4..a1a5c1f 100644 --- 
a/linux/include/asm-x86/kvm_para.h +++ b/linux/include/asm-x86/kvm_para.h @@ -56,30 +56,12 @@ #define KVM_FEATURE_CLOCKSOURCE 0 #define KVM_FEATURE_NOP_IO_DELAY 1 #define KVM_FEATURE_MMU_OP 2 -/* This indicates that the new set of kvmclock msrs - * are available. The use of 0x11 and 0x12 is deprecated - */ -#define KVM_FEATURE_CLOCKSOURCE2 3 -#define KVM_FEATURE_ASYNC_PF 4 - -/* The last 8 bits are used to indicate how to interpret the flags field - * in pvclock structure. If no bits are set, all flags are ignored. - */ -#define KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 24 #define MSR_KVM_WALL_CLOCK 0x11 #define MSR_KVM_SYSTEM_TIME 0x12 -/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */ -#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00 -#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 -#define MSR_KVM_ASYNC_PF_EN 0x4b564d02 - #define KVM_MAX_MMU_OP_BATCH 32 -#define KVM_ASYNC_PF_ENABLED (1 << 0) -#define KVM_ASYNC_PF_SEND_ALWAYS (1 << 1) - /* Operations for KVM_HC_MMU_OP */ #define KVM_MMU_OP_WRITE_PTE 1 #define KVM_MMU_OP_FLUSH_TLB 2 @@ -106,20 +88,10 @@ struct kvm_mmu_op_release_pt { __u64 pt_phys; }; -#define KVM_PV_REASON_PAGE_NOT_PRESENT 1 -#define KVM_PV_REASON_PAGE_READY 2 - -struct kvm_vcpu_pv_apf_data { - __u32 reason; - __u8 pad[60]; - __u32 enabled; -}; - #ifdef __KERNEL__ #include <asm/processor.h> extern void kvmclock_init(void); -extern int kvm_register_clock(char *txt); /* This instruction is vmcall. On non-VT architectures, it will generate a @@ -213,21 +185,6 @@ static inline unsigned int kvm_arch_para_features(void) return cpuid_eax(KVM_CPUID_FEATURES); } -#ifdef CONFIG_KVM_GUEST -void __init kvm_guest_init(void); -void kvm_async_pf_task_wait(u32 token); -void kvm_async_pf_task_wake(u32 token); -u32 kvm_read_and_reset_pf_reason(void); -#else -#define kvm_guest_init() do { } while (0) -#define kvm_async_pf_task_wait(T) do {} while(0) -#define kvm_async_pf_task_wake(T) do {} while(0) -static inline u32 kvm_read_and_reset_pf_reason(void) -{ - return 0; -} #endif -#endif /* __KERNEL__ */ - #endif /* _ASM_X86_KVM_PARA_H */ diff --git a/linux/include/asm-x86/svm.h b/linux/include/asm-x86/svm.h index 5529318..b7f0e58 100644 --- a/linux/include/asm-x86/svm.h +++ b/linux/include/asm-x86/svm.h @@ -87,13 +87,14 @@ enum { INTERCEPT_MONITOR, INTERCEPT_MWAIT, INTERCEPT_MWAIT_COND, - INTERCEPT_XSETBV, }; struct __attribute__ ((__packed__)) vmcb_control_area { - u32 intercept_cr; - u32 intercept_dr; + u16 intercept_cr_read; + u16 intercept_cr_write; + u16 intercept_dr_read; + u16 intercept_dr_write; u32 intercept_exceptions; u64 intercept; u8 reserved_1[42]; @@ -120,19 +121,12 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u32 event_inj_err; u64 nested_cr3; u64 lbr_ctl; - u32 clean; - u32 reserved_5; - u64 next_rip; - u8 insn_len; - u8 insn_bytes[15]; - u8 reserved_6[800]; + u8 reserved_5[832]; }; #define TLB_CONTROL_DO_NOTHING 0 #define TLB_CONTROL_FLUSH_ALL_ASID 1 -#define TLB_CONTROL_FLUSH_ASID 3 -#define TLB_CONTROL_FLUSH_ASID_LOCAL 7 #define V_TPR_MASK 0x0f @@ -161,10 +155,6 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT) #define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT) -#define SVM_VM_CR_VALID_MASK 0x001fULL -#define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL -#define SVM_VM_CR_SVM_DIS_MASK 0x0010ULL - struct __attribute__ ((__packed__)) vmcb_seg { u16 selector; u16 attrib; @@ -248,31 +238,19 @@ struct __attribute__ ((__packed__)) vmcb { #define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK #define 
SVM_SELECTOR_CODE_MASK (1 << 3) -#define INTERCEPT_CR0_READ 0 -#define INTERCEPT_CR3_READ 3 -#define INTERCEPT_CR4_READ 4 -#define INTERCEPT_CR8_READ 8 -#define INTERCEPT_CR0_WRITE (16 + 0) -#define INTERCEPT_CR3_WRITE (16 + 3) -#define INTERCEPT_CR4_WRITE (16 + 4) -#define INTERCEPT_CR8_WRITE (16 + 8) - -#define INTERCEPT_DR0_READ 0 -#define INTERCEPT_DR1_READ 1 -#define INTERCEPT_DR2_READ 2 -#define INTERCEPT_DR3_READ 3 -#define INTERCEPT_DR4_READ 4 -#define INTERCEPT_DR5_READ 5 -#define INTERCEPT_DR6_READ 6 -#define INTERCEPT_DR7_READ 7 -#define INTERCEPT_DR0_WRITE (16 + 0) -#define INTERCEPT_DR1_WRITE (16 + 1) -#define INTERCEPT_DR2_WRITE (16 + 2) -#define INTERCEPT_DR3_WRITE (16 + 3) -#define INTERCEPT_DR4_WRITE (16 + 4) -#define INTERCEPT_DR5_WRITE (16 + 5) -#define INTERCEPT_DR6_WRITE (16 + 6) -#define INTERCEPT_DR7_WRITE (16 + 7) +#define INTERCEPT_CR0_MASK 1 +#define INTERCEPT_CR3_MASK (1 << 3) +#define INTERCEPT_CR4_MASK (1 << 4) +#define INTERCEPT_CR8_MASK (1 << 8) + +#define INTERCEPT_DR0_MASK 1 +#define INTERCEPT_DR1_MASK (1 << 1) +#define INTERCEPT_DR2_MASK (1 << 2) +#define INTERCEPT_DR3_MASK (1 << 3) +#define INTERCEPT_DR4_MASK (1 << 4) +#define INTERCEPT_DR5_MASK (1 << 5) +#define INTERCEPT_DR6_MASK (1 << 6) +#define INTERCEPT_DR7_MASK (1 << 7) #define SVM_EVTINJ_VEC_MASK 0xff @@ -300,9 +278,6 @@ struct __attribute__ ((__packed__)) vmcb { #define SVM_EXITINFOSHIFT_TS_REASON_IRET 36 #define SVM_EXITINFOSHIFT_TS_REASON_JMP 38 -#define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44 - -#define SVM_EXITINFO_REG_MASK 0x0F #define SVM_EXIT_READ_CR0 0x000 #define SVM_EXIT_READ_CR3 0x003 @@ -374,7 +349,6 @@ struct __attribute__ ((__packed__)) vmcb { #define SVM_EXIT_MONITOR 0x08a #define SVM_EXIT_MWAIT 0x08b #define SVM_EXIT_MWAIT_COND 0x08c -#define SVM_EXIT_XSETBV 0x08d #define SVM_EXIT_NPF 0x400 #define SVM_EXIT_ERR -1 diff --git a/linux/include/asm-x86/vmx.h b/linux/include/asm-x86/vmx.h index 0174fa5..cf2af50 100644 --- a/linux/include/asm-x86/vmx.h +++ b/linux/include/asm-x86/vmx.h @@ -65,8 +65,6 @@ * */ -#include <linux/types.h> - /* * Definitions of Primary Processor-Based VM-Execution Controls. 
*/ @@ -106,23 +104,15 @@ #define PIN_BASED_NMI_EXITING 0x00000008 #define PIN_BASED_VIRTUAL_NMIS 0x00000020 -#define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000002 #define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 -#define VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL 0x00001000 #define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 #define VM_EXIT_SAVE_IA32_PAT 0x00040000 #define VM_EXIT_LOAD_IA32_PAT 0x00080000 -#define VM_EXIT_SAVE_IA32_EFER 0x00100000 -#define VM_EXIT_LOAD_IA32_EFER 0x00200000 -#define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000 -#define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000002 #define VM_ENTRY_IA32E_MODE 0x00000200 #define VM_ENTRY_SMM 0x00000400 #define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 -#define VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL 0x00002000 #define VM_ENTRY_LOAD_IA32_PAT 0x00004000 -#define VM_ENTRY_LOAD_IA32_EFER 0x00008000 /* VMCS Encodings */ enum vmcs_field { @@ -170,8 +160,6 @@ enum vmcs_field { GUEST_IA32_DEBUGCTL_HIGH = 0x00002803, GUEST_IA32_PAT = 0x00002804, GUEST_IA32_PAT_HIGH = 0x00002805, - GUEST_IA32_EFER = 0x00002806, - GUEST_IA32_EFER_HIGH = 0x00002807, GUEST_PDPTR0 = 0x0000280a, GUEST_PDPTR0_HIGH = 0x0000280b, GUEST_PDPTR1 = 0x0000280c, @@ -182,8 +170,6 @@ enum vmcs_field { GUEST_PDPTR3_HIGH = 0x00002811, HOST_IA32_PAT = 0x00002c00, HOST_IA32_PAT_HIGH = 0x00002c01, - HOST_IA32_EFER = 0x00002c02, - HOST_IA32_EFER_HIGH = 0x00002c03, PIN_BASED_VM_EXEC_CONTROL = 0x00004000, CPU_BASED_VM_EXEC_CONTROL = 0x00004002, EXCEPTION_BITMAP = 0x00004004, @@ -287,7 +273,6 @@ enum vmcs_field { #define EXIT_REASON_TASK_SWITCH 9 #define EXIT_REASON_CPUID 10 #define EXIT_REASON_HLT 12 -#define EXIT_REASON_INVD 13 #define EXIT_REASON_INVLPG 14 #define EXIT_REASON_RDPMC 15 #define EXIT_REASON_RDTSC 16 @@ -306,7 +291,6 @@ enum vmcs_field { #define EXIT_REASON_IO_INSTRUCTION 30 #define EXIT_REASON_MSR_READ 31 #define EXIT_REASON_MSR_WRITE 32 -#define EXIT_REASON_INVALID_STATE 33 #define EXIT_REASON_MWAIT_INSTRUCTION 36 #define EXIT_REASON_MONITOR_INSTRUCTION 39 #define EXIT_REASON_PAUSE_INSTRUCTION 40 @@ -316,7 +300,6 @@ enum vmcs_field { #define EXIT_REASON_EPT_VIOLATION 48 #define EXIT_REASON_EPT_MISCONFIG 49 #define EXIT_REASON_WBINVD 54 -#define EXIT_REASON_XSETBV 55 /* * Interruption-information format @@ -345,12 +328,6 @@ enum vmcs_field { #define GUEST_INTR_STATE_SMI 0x00000004 #define GUEST_INTR_STATE_NMI 0x00000008 -/* GUEST_ACTIVITY_STATE flags */ -#define GUEST_ACTIVITY_ACTIVE 0 -#define GUEST_ACTIVITY_HLT 1 -#define GUEST_ACTIVITY_SHUTDOWN 2 -#define GUEST_ACTIVITY_WAIT_SIPI 3 - /* * Exit Qualifications for MOV for Control Register Access */ @@ -432,9 +409,6 @@ enum vmcs_field { #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) -#define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */ -#define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */ - #define VMX_EPT_DEFAULT_GAW 3 #define VMX_EPT_MAX_GAW 0x4 #define VMX_EPT_MT_EPTE_SHIFT 3 @@ -460,10 +434,6 @@ enum vmcs_field { #define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08" #define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08" -struct vmx_msr_entry { - u32 index; - u32 reserved; - u64 value; -} __aligned(16); + #endif diff --git a/linux/include/linux/kvm.h b/linux/include/linux/kvm.h index 88fccc0..416465c 100644 --- a/linux/include/linux/kvm.h +++ b/linux/include/linux/kvm.h @@ -200,7 +200,6 @@ struct kvm_pit_config { #define KVM_EXIT_DCR 15 #define KVM_EXIT_NMI 16 #define KVM_EXIT_INTERNAL_ERROR 17 -#define KVM_EXIT_OSI 18 /* For 
KVM_EXIT_INTERNAL_ERROR */ #define KVM_INTERNAL_ERROR_EMULATION 1 @@ -300,10 +299,6 @@ struct kvm_run { __u32 ndata; __u64 data[16]; } internal; - /* KVM_EXIT_OSI */ - struct { - __u64 gprs[32]; - } osi; /* Fix the size of the union. */ char padding[256]; }; @@ -445,23 +440,6 @@ struct kvm_ioeventfd { __u8 pad[36]; }; -/* for KVM_ENABLE_CAP */ -struct kvm_enable_cap { - /* in */ - __u32 cap; - __u32 flags; - __u64 args[4]; - __u8 pad[64]; -}; - -/* for KVM_PPC_GET_PVINFO */ -struct kvm_ppc_pvinfo { - /* out */ - __u32 flags; - __u32 hcall[4]; - __u8 pad[108]; -}; - #define KVMIO 0xAE /* @@ -563,24 +541,7 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_HYPERV_VAPIC 45 #define KVM_CAP_HYPERV_SPIN 46 #define KVM_CAP_PCI_SEGMENT 47 -#define KVM_CAP_PPC_PAIRED_SINGLES 48 -#define KVM_CAP_INTR_SHADOW 49 -#ifdef __KVM_HAVE_DEBUGREGS -#define KVM_CAP_DEBUGREGS 50 -#endif #define KVM_CAP_X86_ROBUST_SINGLESTEP 51 -#define KVM_CAP_PPC_OSI 52 -#define KVM_CAP_PPC_UNSET_IRQ 53 -#define KVM_CAP_ENABLE_CAP 54 -#ifdef __KVM_HAVE_XSAVE -#define KVM_CAP_XSAVE 55 -#endif -#ifdef __KVM_HAVE_XCRS -#define KVM_CAP_XCRS 56 -#endif -#define KVM_CAP_PPC_GET_PVINFO 57 -#define KVM_CAP_PPC_IRQ_LEVEL 58 -#define KVM_CAP_ASYNC_PF 59 #ifdef KVM_CAP_IRQ_ROUTING @@ -670,7 +631,6 @@ struct kvm_clock_data { */ #define KVM_CREATE_VCPU _IO(KVMIO, 0x41) #define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log) -/* KVM_SET_MEMORY_ALIAS is obsolete: */ #define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias) #define KVM_SET_NR_MMU_PAGES _IO(KVMIO, 0x44) #define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45) @@ -715,8 +675,6 @@ struct kvm_clock_data { /* Available with KVM_CAP_PIT_STATE2 */ #define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct kvm_pit_state2) #define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2) -/* Available with KVM_CAP_PPC_GET_PVINFO */ -#define KVM_PPC_GET_PVINFO _IOW(KVMIO, 0xa1, struct kvm_ppc_pvinfo) /* * ioctls for vcpu fds @@ -770,16 +728,6 @@ struct kvm_clock_data { /* Available with KVM_CAP_VCPU_EVENTS */ #define KVM_GET_VCPU_EVENTS _IOR(KVMIO, 0x9f, struct kvm_vcpu_events) #define KVM_SET_VCPU_EVENTS _IOW(KVMIO, 0xa0, struct kvm_vcpu_events) -/* Available with KVM_CAP_DEBUGREGS */ -#define KVM_GET_DEBUGREGS _IOR(KVMIO, 0xa1, struct kvm_debugregs) -#define KVM_SET_DEBUGREGS _IOW(KVMIO, 0xa2, struct kvm_debugregs) -#define KVM_ENABLE_CAP _IOW(KVMIO, 0xa3, struct kvm_enable_cap) -/* Available with KVM_CAP_XSAVE */ -#define KVM_GET_XSAVE _IOR(KVMIO, 0xa4, struct kvm_xsave) -#define KVM_SET_XSAVE _IOW(KVMIO, 0xa5, struct kvm_xsave) -/* Available with KVM_CAP_XCRS */ -#define KVM_GET_XCRS _IOR(KVMIO, 0xa6, struct kvm_xcrs) -#define KVM_SET_XCRS _IOW(KVMIO, 0xa7, struct kvm_xcrs) #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) diff --git a/linux/include/linux/kvm_host.h b/linux/include/linux/kvm_host.h index 84546bd..14f3920 100644 --- a/linux/include/linux/kvm_host.h +++ b/linux/include/linux/kvm_host.h @@ -56,8 +56,6 @@ #include <linux/mm.h> #include <linux/preempt.h> #include <linux/msi.h> -#include <linux/slab.h> -#include <linux/rcupdate.h> #include <asm/signal.h> #include <linux/kvm.h> @@ -78,11 +76,9 @@ #define KVM_REQ_PENDING_TIMER 5 #define KVM_REQ_UNHALT 6 #define KVM_REQ_MMU_SYNC 7 -#define KVM_REQ_CLOCK_UPDATE 8 +#define KVM_REQ_KVMCLOCK_UPDATE 8 #define KVM_REQ_KICK 9 #define KVM_REQ_DEACTIVATE_FPU 10 -#define KVM_REQ_EVENT 11 -#define KVM_REQ_APF_HALT 12 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 @@ -117,27 +113,6 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, int 
kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, struct kvm_io_device *dev); -#ifdef CONFIG_KVM_ASYNC_PF -struct kvm_async_pf { - struct work_struct work; - struct list_head link; - struct list_head queue; - struct kvm_vcpu *vcpu; - struct mm_struct *mm; - gva_t gva; - unsigned long addr; - struct kvm_arch_async_pf arch; - struct page *page; - bool done; -}; - -void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu); -void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu); -int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, - struct kvm_arch_async_pf *arch); -int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); -#endif - struct kvm_vcpu { struct kvm *kvm; #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -146,14 +121,13 @@ struct kvm_vcpu { int vcpu_id; struct mutex mutex; int cpu; - atomic_t guest_mode; struct kvm_run *run; unsigned long requests; unsigned long guest_debug; int srcu_idx; int fpu_active; - int guest_fpu_loaded, guest_xcr0_loaded; + int guest_fpu_loaded; wait_queue_head_t wq; int sigset_active; sigset_t sigset; @@ -168,40 +142,21 @@ struct kvm_vcpu { gpa_t mmio_phys_addr; #endif -#ifdef CONFIG_KVM_ASYNC_PF - struct { - u32 queued; - struct list_head queue; - struct list_head done; - spinlock_t lock; - } async_pf; -#endif - struct kvm_vcpu_arch arch; }; -/* - * Some of the bitops functions do not support too long bitmaps. - * This number must be determined not to exceed such limits. - */ -#define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1) - -struct kvm_lpage_info { - unsigned long rmap_pde; - int write_count; -}; - struct kvm_memory_slot { gfn_t base_gfn; unsigned long npages; unsigned long flags; unsigned long *rmap; unsigned long *dirty_bitmap; - unsigned long *dirty_bitmap_head; - struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; + struct { + unsigned long rmap_pde; + int write_count; + } *lpage_info[KVM_NR_PAGE_SIZES - 1]; unsigned long userspace_addr; int user_alloc; - int id; }; static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memslot) @@ -245,7 +200,6 @@ struct kvm_irq_routing_table {}; struct kvm_memslots { int nmemslots; - u64 generation; struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS]; }; @@ -283,11 +237,7 @@ struct kvm { struct mutex irq_lock; #ifdef CONFIG_HAVE_KVM_IRQCHIP - /* - * Update side is protected by irq_lock and, - * if configured, irqfds.lock. - */ - struct kvm_irq_routing_table __rcu *irq_routing; + struct kvm_irq_routing_table *irq_routing; struct hlist_head mask_notifier_list; struct hlist_head irq_ack_notifier_list; #endif @@ -297,7 +247,6 @@ struct kvm { unsigned long mmu_notifier_seq; long mmu_notifier_count; #endif - long tlbs_dirty; }; /* The guest did something we don't support. 
*/ @@ -328,31 +277,23 @@ void kvm_vcpu_uninit(struct kvm_vcpu *vcpu); void vcpu_load(struct kvm_vcpu *vcpu); void vcpu_put(struct kvm_vcpu *vcpu); -int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, +int kvm_init(void *opaque, unsigned int vcpu_size, struct module *module); void kvm_exit(void); void kvm_get_kvm(struct kvm *kvm); void kvm_put_kvm(struct kvm *kvm); -static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm) -{ - return rcu_dereference_check(kvm->memslots, - srcu_read_lock_held(&kvm->srcu) - || lockdep_is_held(&kvm->slots_lock)); -} - #define HPA_MSB ((sizeof(hpa_t) * 8) - 1) #define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB) static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; } +struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva); extern struct page *bad_page; extern pfn_t bad_pfn; int is_error_page(struct page *page); int is_error_pfn(pfn_t pfn); -int is_hwpoison_pfn(pfn_t pfn); -int is_fault_pfn(pfn_t pfn); int kvm_is_error_hva(unsigned long addr); int kvm_set_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, @@ -371,9 +312,8 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, int user_alloc); void kvm_disable_largepages(void); void kvm_arch_flush_shadow(struct kvm *kvm); - -int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, - int nr_pages); +gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn); +gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn); struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn); @@ -382,13 +322,7 @@ void kvm_release_page_dirty(struct page *page); void kvm_set_page_dirty(struct page *page); void kvm_set_page_accessed(struct page *page); -pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr); -pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn); -pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async, - bool write_fault, bool *writable); pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); -pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, - bool *writable); pfn_t gfn_to_pfn_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn); int memslot_id(struct kvm *kvm, gfn_t gfn); @@ -407,25 +341,18 @@ int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, int offset, int len); int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, unsigned long len); -int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, - void *data, unsigned long len); -int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, - gpa_t gpa); int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn); unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn); void mark_page_dirty(struct kvm *kvm, gfn_t gfn); -void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot, - gfn_t gfn); void kvm_vcpu_block(struct kvm_vcpu *vcpu); void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu); void kvm_resched(struct kvm_vcpu *vcpu); void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); - void kvm_flush_remote_tlbs(struct kvm *kvm); void kvm_reload_remote_mmus(struct kvm *kvm); @@ -491,19 +418,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); void kvm_free_physmem(struct kvm *kvm); -#ifndef 
__KVM_HAVE_ARCH_VM_ALLOC -static inline struct kvm *kvm_arch_alloc_vm(void) -{ - return kzalloc(sizeof(struct kvm), GFP_KERNEL); -} - -static inline void kvm_arch_free_vm(struct kvm *kvm) -{ - kfree(kvm); -} -#endif - -int kvm_arch_init_vm(struct kvm *kvm); +struct kvm *kvm_arch_create_vm(void); void kvm_arch_destroy_vm(struct kvm *kvm); void kvm_free_all_assigned_devices(struct kvm *kvm); void kvm_arch_sync_events(struct kvm *kvm); @@ -519,8 +434,16 @@ struct kvm_irq_ack_notifier { void (*irq_acked)(struct kvm_irq_ack_notifier *kian); }; +#define KVM_ASSIGNED_MSIX_PENDING 0x1 +struct kvm_guest_msix_entry { + u32 vector; + u16 entry; + u16 flags; +}; + struct kvm_assigned_dev_kernel { struct kvm_irq_ack_notifier ack_notifier; + struct work_struct interrupt_work; struct list_head list; int assigned_dev_id; int host_segnr; @@ -531,14 +454,13 @@ struct kvm_assigned_dev_kernel { bool host_irq_disabled; struct msix_entry *host_msix_entries; int guest_irq; - struct msix_entry *guest_msix_entries; + struct kvm_guest_msix_entry *guest_msix_entries; unsigned long irq_requested_type; int irq_source_id; int flags; struct pci_dev *dev; struct kvm *kvm; - spinlock_t intx_lock; - char irq_name[32]; + spinlock_t assigned_dev_lock; }; struct kvm_irq_mask_notifier { @@ -551,8 +473,7 @@ void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, struct kvm_irq_mask_notifier *kimn); void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, struct kvm_irq_mask_notifier *kimn); -void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, - bool mask); +void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask); #ifdef __KVM_HAVE_IOAPIC void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic, @@ -560,8 +481,6 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic, unsigned long *deliver_bitmask); #endif int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level); -int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm, - int irq_source_id, int level); void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); void kvm_register_irq_ack_notifier(struct kvm *kvm, struct kvm_irq_ack_notifier *kian); @@ -583,7 +502,8 @@ int kvm_deassign_device(struct kvm *kvm, struct kvm_assigned_dev_kernel *assigned_dev); #else /* CONFIG_IOMMU_API */ static inline int kvm_iommu_map_pages(struct kvm *kvm, - struct kvm_memory_slot *slot) + gfn_t base_gfn, + unsigned long npages) { return 0; } @@ -623,22 +543,11 @@ static inline void kvm_guest_exit(void) current->flags &= ~PF_VCPU; } -static inline unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, - gfn_t gfn) -{ - return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE; -} - static inline gpa_t gfn_to_gpa(gfn_t gfn) { return (gpa_t)gfn << PAGE_SHIFT; } -static inline gfn_t gpa_to_gfn(gpa_t gpa) -{ - return (gfn_t)(gpa >> PAGE_SHIFT); -} - static inline hpa_t pfn_to_hpa(pfn_t pfn) { return (hpa_t)pfn << PAGE_SHIFT; @@ -681,6 +590,10 @@ static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_se } #endif +#ifndef KVM_ARCH_HAS_UNALIAS_INSTANTIATION +#define unalias_gfn_instantiation unalias_gfn +#endif + #ifdef CONFIG_HAVE_KVM_IRQCHIP #define KVM_MAX_IRQ_ROUTES 1024 @@ -703,28 +616,17 @@ static inline void kvm_free_irq_routing(struct kvm *kvm) {} void kvm_eventfd_init(struct kvm *kvm); int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags); void kvm_irqfd_release(struct kvm *kvm); -void kvm_irq_routing_update(struct kvm *, struct 
kvm_irq_routing_table *); int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args); #else static inline void kvm_eventfd_init(struct kvm *kvm) {} - static inline int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags) { return -EINVAL; } static inline void kvm_irqfd_release(struct kvm *kvm) {} - -#ifdef CONFIG_HAVE_KVM_IRQCHIP -static inline void kvm_irq_routing_update(struct kvm *kvm, - struct kvm_irq_routing_table *irq_rt) -{ - rcu_assign_pointer(kvm->irq_routing, irq_rt); -} -#endif - static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) { return -ENOSYS; @@ -754,25 +656,5 @@ static inline long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, #endif -static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu) -{ - set_bit(req, &vcpu->requests); -} - -static inline bool kvm_make_check_request(int req, struct kvm_vcpu *vcpu) -{ - return test_and_set_bit(req, &vcpu->requests); -} - -static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu) -{ - if (test_bit(req, &vcpu->requests)) { - clear_bit(req, &vcpu->requests); - return true; - } else { - return false; - } -} - #endif diff --git a/linux/include/linux/kvm_para.h b/linux/include/linux/kvm_para.h index e13e53d..921be06 100644 --- a/linux/include/linux/kvm_para.h +++ b/linux/include/linux/kvm_para.h @@ -57,8 +57,6 @@ #define KVM_HC_VAPIC_POLL_IRQ 1 #define KVM_HC_MMU_OP 2 -#define KVM_HC_FEATURES 3 -#define KVM_HC_PPC_MAP_MAGIC_PAGE 4 /* * hypercalls use architecture specific @@ -66,6 +64,11 @@ #include <asm/kvm_para.h> #ifdef __KERNEL__ +#ifdef CONFIG_KVM_GUEST +void __init kvm_guest_init(void); +#else +#define kvm_guest_init() do { } while (0) +#endif static inline int kvm_para_has_feature(unsigned int feature) { diff --git a/linux/include/linux/kvm_types.h b/linux/include/linux/kvm_types.h index e35297a..c65f89e 100644 --- a/linux/include/linux/kvm_types.h +++ b/linux/include/linux/kvm_types.h @@ -72,11 +72,11 @@ typedef unsigned long gva_t; typedef u64 gpa_t; -typedef u64 gfn_t; +typedef unsigned long gfn_t; typedef unsigned long hva_t; typedef u64 hpa_t; -typedef u64 hfn_t; +typedef unsigned long hfn_t; typedef hfn_t pfn_t; @@ -107,11 +107,4 @@ struct kvm_lapic_irq { u32 dest_id; }; -struct gfn_to_hva_cache { - u64 generation; - gpa_t gpa; - unsigned long hva; - struct kvm_memory_slot *memslot; -}; - #endif /* __KVM_TYPES_H__ */ diff --git a/linux/include/trace/events/kvm.h b/linux/include/trace/events/kvm.h index 1c1c1f7..35d92f2 100644 --- a/linux/include/trace/events/kvm.h +++ b/linux/include/trace/events/kvm.h @@ -45,36 +45,7 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM kvm - -#define ERSN(x) { KVM_EXIT_##x, "KVM_EXIT_" #x } - -#define kvm_trace_exit_reason \ - ERSN(UNKNOWN), ERSN(EXCEPTION), ERSN(IO), ERSN(HYPERCALL), \ - ERSN(DEBUG), ERSN(HLT), ERSN(MMIO), ERSN(IRQ_WINDOW_OPEN), \ - ERSN(SHUTDOWN), ERSN(FAIL_ENTRY), ERSN(INTR), ERSN(SET_TPR), \ - ERSN(TPR_ACCESS), ERSN(S390_SIEIC), ERSN(S390_RESET), ERSN(DCR),\ - ERSN(NMI), ERSN(INTERNAL_ERROR), ERSN(OSI) - -TRACE_EVENT(kvm_userspace_exit, - TP_PROTO(__u32 reason, int errno), - TP_ARGS(reason, errno), - - TP_STRUCT__entry( - __field( __u32, reason ) - __field( int, errno ) - ), - - TP_fast_assign( - __entry->reason = reason; - __entry->errno = errno; - ), - - TP_printk("reason %s (%d)", - __entry->errno < 0 ? - (__entry->errno == -EINTR ? "restart" : "error") : - __print_symbolic(__entry->reason, kvm_trace_exit_reason), - __entry->errno < 0 ? 
-__entry->errno : __entry->reason) -); +#define TRACE_INCLUDE_FILE kvm #if defined(__KVM_HAVE_IOAPIC) TRACE_EVENT(kvm_set_irq, @@ -255,97 +226,6 @@ TRACE_EVENT(kvm_age_page, __entry->referenced ? "YOUNG" : "OLD") ); -#ifdef CONFIG_KVM_ASYNC_PF -DECLARE_EVENT_CLASS(kvm_async_get_page_class, - - TP_PROTO(u64 gva, u64 gfn), - - TP_ARGS(gva, gfn), - - TP_STRUCT__entry( - __field(__u64, gva) - __field(u64, gfn) - ), - - TP_fast_assign( - __entry->gva = gva; - __entry->gfn = gfn; - ), - - TP_printk("gva = %#llx, gfn = %#llx", __entry->gva, __entry->gfn) -); - -DEFINE_EVENT(kvm_async_get_page_class, kvm_try_async_get_page, - - TP_PROTO(u64 gva, u64 gfn), - - TP_ARGS(gva, gfn) -); - -DEFINE_EVENT(kvm_async_get_page_class, kvm_async_pf_doublefault, - - TP_PROTO(u64 gva, u64 gfn), - - TP_ARGS(gva, gfn) -); - -DECLARE_EVENT_CLASS(kvm_async_pf_nopresent_ready, - - TP_PROTO(u64 token, u64 gva), - - TP_ARGS(token, gva), - - TP_STRUCT__entry( - __field(__u64, token) - __field(__u64, gva) - ), - - TP_fast_assign( - __entry->token = token; - __entry->gva = gva; - ), - - TP_printk("token %#llx gva %#llx", __entry->token, __entry->gva) - -); - -DEFINE_EVENT(kvm_async_pf_nopresent_ready, kvm_async_pf_not_present, - - TP_PROTO(u64 token, u64 gva), - - TP_ARGS(token, gva) -); - -DEFINE_EVENT(kvm_async_pf_nopresent_ready, kvm_async_pf_ready, - - TP_PROTO(u64 token, u64 gva), - - TP_ARGS(token, gva) -); - -TRACE_EVENT( - kvm_async_pf_completed, - TP_PROTO(unsigned long address, struct page *page, u64 gva), - TP_ARGS(address, page, gva), - - TP_STRUCT__entry( - __field(unsigned long, address) - __field(pfn_t, pfn) - __field(u64, gva) - ), - - TP_fast_assign( - __entry->address = address; - __entry->pfn = page ? page_to_pfn(page) : 0; - __entry->gva = gva; - ), - - TP_printk("gva %#llx address %#lx pfn %#llx", __entry->gva, - __entry->address, __entry->pfn) -); - -#endif - #endif /* _TRACE_KVM_MAIN_H */ /* This part must be outside protection */ diff --git a/linux/usr/include/asm-x86/hyperv.h b/linux/usr/include/asm-x86/hyperv.h index 5df477a..e153a2b 100644 --- a/linux/usr/include/asm-x86/hyperv.h +++ b/linux/usr/include/asm-x86/hyperv.h @@ -1,5 +1,5 @@ -#ifndef _ASM_X86_HYPERV_H -#define _ASM_X86_HYPERV_H +#ifndef _ASM_X86_KVM_HYPERV_H +#define _ASM_X86_KVM_HYPERV_H #include <linux/types.h> @@ -14,10 +14,6 @@ #define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004 #define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005 -#define HYPERV_HYPERVISOR_PRESENT_BIT 0x80000000 -#define HYPERV_CPUID_MIN 0x40000005 -#define HYPERV_CPUID_MAX 0x4000ffff - /* * Feature identification. EAX indicates which features are available * to the partition based upon the current partition privileges. @@ -133,9 +129,6 @@ /* MSR used to provide vcpu index */ #define HV_X64_MSR_VP_INDEX 0x40000002 -/* MSR used to read the per-partition time reference counter */ -#define HV_X64_MSR_TIME_REF_COUNT 0x40000020 - /* Define the virtual APIC registers */ #define HV_X64_MSR_EOI 0x40000070 #define HV_X64_MSR_ICR 0x40000071 diff --git a/linux/usr/include/asm-x86/kvm.h b/linux/usr/include/asm-x86/kvm.h index 4d8dcbd..f46b79f 100644 --- a/linux/usr/include/asm-x86/kvm.h +++ b/linux/usr/include/asm-x86/kvm.h @@ -21,9 +21,6 @@ #define __KVM_HAVE_PIT_STATE2 #define __KVM_HAVE_XEN_HVM #define __KVM_HAVE_VCPU_EVENTS -#define __KVM_HAVE_DEBUGREGS -#define __KVM_HAVE_XSAVE -#define __KVM_HAVE_XCRS /* Architectural interrupt line count. 
*/ #define KVM_NR_INTERRUPTS 256 @@ -260,11 +257,6 @@ struct kvm_reinject_control { /* When set in flags, include corresponding fields on KVM_SET_VCPU_EVENTS */ #define KVM_VCPUEVENT_VALID_NMI_PENDING 0x00000001 #define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002 -#define KVM_VCPUEVENT_VALID_SHADOW 0x00000004 - -/* Interrupt shadow states */ -#define KVM_X86_SHADOW_INT_MOV_SS 0x01 -#define KVM_X86_SHADOW_INT_STI 0x02 /* for KVM_GET/SET_VCPU_EVENTS */ struct kvm_vcpu_events { @@ -279,7 +271,7 @@ struct kvm_vcpu_events { __u8 injected; __u8 nr; __u8 soft; - __u8 shadow; + __u8 pad; } interrupt; struct { __u8 injected; @@ -292,33 +284,4 @@ struct kvm_vcpu_events { __u32 reserved[10]; }; -/* for KVM_GET/SET_DEBUGREGS */ -struct kvm_debugregs { - __u64 db[4]; - __u64 dr6; - __u64 dr7; - __u64 flags; - __u64 reserved[9]; -}; - -/* for KVM_CAP_XSAVE */ -struct kvm_xsave { - __u32 region[1024]; -}; - -#define KVM_MAX_XCRS 16 - -struct kvm_xcr { - __u32 xcr; - __u32 reserved; - __u64 value; -}; - -struct kvm_xcrs { - __u32 nr_xcrs; - __u32 flags; - struct kvm_xcr xcrs[KVM_MAX_XCRS]; - __u64 padding[16]; -}; - #endif /* _ASM_X86_KVM_H */ diff --git a/linux/usr/include/asm-x86/kvm_para.h b/linux/usr/include/asm-x86/kvm_para.h index 834d71e..d91d4fa 100644 --- a/linux/usr/include/asm-x86/kvm_para.h +++ b/linux/usr/include/asm-x86/kvm_para.h @@ -16,30 +16,12 @@ #define KVM_FEATURE_CLOCKSOURCE 0 #define KVM_FEATURE_NOP_IO_DELAY 1 #define KVM_FEATURE_MMU_OP 2 -/* This indicates that the new set of kvmclock msrs - * are available. The use of 0x11 and 0x12 is deprecated - */ -#define KVM_FEATURE_CLOCKSOURCE2 3 -#define KVM_FEATURE_ASYNC_PF 4 - -/* The last 8 bits are used to indicate how to interpret the flags field - * in pvclock structure. If no bits are set, all flags are ignored. - */ -#define KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 24 #define MSR_KVM_WALL_CLOCK 0x11 #define MSR_KVM_SYSTEM_TIME 0x12 -/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */ -#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00 -#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 -#define MSR_KVM_ASYNC_PF_EN 0x4b564d02 - #define KVM_MAX_MMU_OP_BATCH 32 -#define KVM_ASYNC_PF_ENABLED (1 << 0) -#define KVM_ASYNC_PF_SEND_ALWAYS (1 << 1) - /* Operations for KVM_HC_MMU_OP */ #define KVM_MMU_OP_WRITE_PTE 1 #define KVM_MMU_OP_FLUSH_TLB 2 @@ -66,14 +48,5 @@ struct kvm_mmu_op_release_pt { __u64 pt_phys; }; -#define KVM_PV_REASON_PAGE_NOT_PRESENT 1 -#define KVM_PV_REASON_PAGE_READY 2 - -struct kvm_vcpu_pv_apf_data { - __u32 reason; - __u8 pad[60]; - __u32 enabled; -}; - #endif /* _ASM_X86_KVM_PARA_H */ diff --git a/linux/usr/include/linux/kvm.h b/linux/usr/include/linux/kvm.h index 27efcfb..fcebd4c 100644 --- a/linux/usr/include/linux/kvm.h +++ b/linux/usr/include/linux/kvm.h @@ -160,7 +160,6 @@ struct kvm_pit_config { #define KVM_EXIT_DCR 15 #define KVM_EXIT_NMI 16 #define KVM_EXIT_INTERNAL_ERROR 17 -#define KVM_EXIT_OSI 18 /* For KVM_EXIT_INTERNAL_ERROR */ #define KVM_INTERNAL_ERROR_EMULATION 1 @@ -260,10 +259,6 @@ struct kvm_run { __u32 ndata; __u64 data[16]; } internal; - /* KVM_EXIT_OSI */ - struct { - __u64 gprs[32]; - } osi; /* Fix the size of the union. 
*/ char padding[256]; }; @@ -405,23 +400,6 @@ struct kvm_ioeventfd { __u8 pad[36]; }; -/* for KVM_ENABLE_CAP */ -struct kvm_enable_cap { - /* in */ - __u32 cap; - __u32 flags; - __u64 args[4]; - __u8 pad[64]; -}; - -/* for KVM_PPC_GET_PVINFO */ -struct kvm_ppc_pvinfo { - /* out */ - __u32 flags; - __u32 hcall[4]; - __u8 pad[108]; -}; - #define KVMIO 0xAE /* @@ -523,24 +501,7 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_HYPERV_VAPIC 45 #define KVM_CAP_HYPERV_SPIN 46 #define KVM_CAP_PCI_SEGMENT 47 -#define KVM_CAP_PPC_PAIRED_SINGLES 48 -#define KVM_CAP_INTR_SHADOW 49 -#ifdef __KVM_HAVE_DEBUGREGS -#define KVM_CAP_DEBUGREGS 50 -#endif #define KVM_CAP_X86_ROBUST_SINGLESTEP 51 -#define KVM_CAP_PPC_OSI 52 -#define KVM_CAP_PPC_UNSET_IRQ 53 -#define KVM_CAP_ENABLE_CAP 54 -#ifdef __KVM_HAVE_XSAVE -#define KVM_CAP_XSAVE 55 -#endif -#ifdef __KVM_HAVE_XCRS -#define KVM_CAP_XCRS 56 -#endif -#define KVM_CAP_PPC_GET_PVINFO 57 -#define KVM_CAP_PPC_IRQ_LEVEL 58 -#define KVM_CAP_ASYNC_PF 59 #ifdef KVM_CAP_IRQ_ROUTING @@ -630,7 +591,6 @@ struct kvm_clock_data { */ #define KVM_CREATE_VCPU _IO(KVMIO, 0x41) #define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log) -/* KVM_SET_MEMORY_ALIAS is obsolete: */ #define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias) #define KVM_SET_NR_MMU_PAGES _IO(KVMIO, 0x44) #define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45) @@ -675,8 +635,6 @@ struct kvm_clock_data { /* Available with KVM_CAP_PIT_STATE2 */ #define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct kvm_pit_state2) #define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2) -/* Available with KVM_CAP_PPC_GET_PVINFO */ -#define KVM_PPC_GET_PVINFO _IOW(KVMIO, 0xa1, struct kvm_ppc_pvinfo) /* * ioctls for vcpu fds @@ -730,16 +688,6 @@ struct kvm_clock_data { /* Available with KVM_CAP_VCPU_EVENTS */ #define KVM_GET_VCPU_EVENTS _IOR(KVMIO, 0x9f, struct kvm_vcpu_events) #define KVM_SET_VCPU_EVENTS _IOW(KVMIO, 0xa0, struct kvm_vcpu_events) -/* Available with KVM_CAP_DEBUGREGS */ -#define KVM_GET_DEBUGREGS _IOR(KVMIO, 0xa1, struct kvm_debugregs) -#define KVM_SET_DEBUGREGS _IOW(KVMIO, 0xa2, struct kvm_debugregs) -#define KVM_ENABLE_CAP _IOW(KVMIO, 0xa3, struct kvm_enable_cap) -/* Available with KVM_CAP_XSAVE */ -#define KVM_GET_XSAVE _IOR(KVMIO, 0xa4, struct kvm_xsave) -#define KVM_SET_XSAVE _IOW(KVMIO, 0xa5, struct kvm_xsave) -/* Available with KVM_CAP_XCRS */ -#define KVM_GET_XCRS _IOR(KVMIO, 0xa6, struct kvm_xcrs) -#define KVM_SET_XCRS _IOW(KVMIO, 0xa7, struct kvm_xcrs) #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) diff --git a/linux/usr/include/linux/kvm_para.h b/linux/usr/include/linux/kvm_para.h index b315e27..eca8259 100644 --- a/linux/usr/include/linux/kvm_para.h +++ b/linux/usr/include/linux/kvm_para.h @@ -17,8 +17,6 @@ #define KVM_HC_VAPIC_POLL_IRQ 1 #define KVM_HC_MMU_OP 2 -#define KVM_HC_FEATURES 3 -#define KVM_HC_PPC_MAP_MAGIC_PAGE 4 /* * hypercalls use architecture specific diff --git a/linux/x86/assigned-dev.c b/linux/x86/assigned-dev.c index 86dac05..b383411 100644 --- a/linux/x86/assigned-dev.c +++ b/linux/x86/assigned-dev.c @@ -41,7 +41,7 @@ /* * Kernel-based Virtual Machine - device assignment support * - * Copyright (C) 2010 Red Hat, Inc. and/or its affiliates. + * Copyright (C) 2006-9 Red Hat, Inc * * This work is licensed under the terms of the GNU GPL, version 2. See * the COPYING file in the top-level directory. 
@@ -95,31 +95,60 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel return index; } -static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id) +static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) { - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - u32 vector; - int index; + struct kvm_assigned_dev_kernel *assigned_dev; + struct kvm *kvm; + int i; - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) { - spin_lock(&assigned_dev->intx_lock); - disable_irq_nosync(irq); - assigned_dev->host_irq_disabled = true; - spin_unlock(&assigned_dev->intx_lock); - } + assigned_dev = container_of(work, struct kvm_assigned_dev_kernel, + interrupt_work); + kvm = assigned_dev->kvm; + spin_lock_irq(&assigned_dev->assigned_dev_lock); if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { - index = find_index_from_host_irq(assigned_dev, irq); - if (index >= 0) { - vector = assigned_dev-> - guest_msix_entries[index].vector; + struct kvm_guest_msix_entry *guest_entries = + assigned_dev->guest_msix_entries; + for (i = 0; i < assigned_dev->entries_nr; i++) { + if (!(guest_entries[i].flags & + KVM_ASSIGNED_MSIX_PENDING)) + continue; + guest_entries[i].flags &= ~KVM_ASSIGNED_MSIX_PENDING; kvm_set_irq(assigned_dev->kvm, - assigned_dev->irq_source_id, vector, 1); + assigned_dev->irq_source_id, + guest_entries[i].vector, 1); } } else kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, assigned_dev->guest_irq, 1); + spin_unlock_irq(&assigned_dev->assigned_dev_lock); +} + +static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) +{ + unsigned long flags; + struct kvm_assigned_dev_kernel *assigned_dev = + (struct kvm_assigned_dev_kernel *) dev_id; + + spin_lock_irqsave(&assigned_dev->assigned_dev_lock, flags); + if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { + int index = find_index_from_host_irq(assigned_dev, irq); + if (index < 0) + goto out; + assigned_dev->guest_msix_entries[index].flags |= + KVM_ASSIGNED_MSIX_PENDING; + } + + schedule_work(&assigned_dev->interrupt_work); + + if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) { + disable_irq_nosync(irq); + assigned_dev->host_irq_disabled = true; + } + +out: + spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags); return IRQ_HANDLED; } @@ -127,6 +156,7 @@ static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id) static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) { struct kvm_assigned_dev_kernel *dev; + unsigned long flags; if (kian->gsi == -1) return; @@ -139,12 +169,12 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) /* The guest irq may be shared so this ack may be * from another device. 
*/ - spin_lock(&dev->intx_lock); + spin_lock_irqsave(&dev->assigned_dev_lock, flags); if (dev->host_irq_disabled) { enable_irq(dev->host_irq); dev->host_irq_disabled = false; } - spin_unlock(&dev->intx_lock); + spin_unlock_irqrestore(&dev->assigned_dev_lock, flags); } static void deassign_guest_irq(struct kvm *kvm, @@ -153,9 +183,6 @@ static void deassign_guest_irq(struct kvm *kvm, kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier); assigned_dev->ack_notifier.gsi = -1; - kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, - assigned_dev->guest_irq, 0); - if (assigned_dev->irq_source_id != -1) kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id); assigned_dev->irq_source_id = -1; @@ -167,19 +194,28 @@ static void deassign_host_irq(struct kvm *kvm, struct kvm_assigned_dev_kernel *assigned_dev) { /* - * We disable irq here to prevent further events. + * In kvm_free_device_irq, cancel_work_sync return true if: + * 1. work is scheduled, and then cancelled. + * 2. work callback is executed. + * + * The first one ensured that the irq is disabled and no more events + * would happen. But for the second one, the irq may be enabled (e.g. + * for MSI). So we disable irq here to prevent further events. * * Notice this maybe result in nested disable if the interrupt type is * INTx, but it's OK for we are going to free it. * * If this function is a part of VM destroy, please ensure that till * now, the kvm state is still legal for probably we also have to wait - * on a currently running IRQ handler. + * interrupt_work done. */ if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { int i; for (i = 0; i < assigned_dev->entries_nr; i++) - disable_irq(assigned_dev->host_msix_entries[i].vector); + disable_irq_nosync(assigned_dev-> + host_msix_entries[i].vector); + + cancel_work_sync(&assigned_dev->interrupt_work); for (i = 0; i < assigned_dev->entries_nr; i++) free_irq(assigned_dev->host_msix_entries[i].vector, @@ -191,7 +227,8 @@ static void deassign_host_irq(struct kvm *kvm, pci_disable_msix(assigned_dev->dev); } else { /* Deal with MSI and INTx */ - disable_irq(assigned_dev->host_irq); + disable_irq_nosync(assigned_dev->host_irq); + cancel_work_sync(&assigned_dev->interrupt_work); free_irq(assigned_dev->host_irq, (void *)assigned_dev); @@ -237,8 +274,7 @@ static void kvm_free_assigned_device(struct kvm *kvm, { kvm_free_assigned_irq(kvm, assigned_dev); - __pci_reset_function(assigned_dev->dev); - pci_restore_state(assigned_dev->dev); + pci_reset_function(assigned_dev->dev); pci_release_regions(assigned_dev->dev); pci_disable_device(assigned_dev->dev); @@ -271,8 +307,8 @@ static int assigned_device_enable_host_intx(struct kvm *kvm, * on the same interrupt line is not a happy situation: there * are going to be long delays in accepting, acking, etc. 
*/ - if (kvm_request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, - IRQF_ONESHOT, dev->irq_name, (void *)dev)) + if (request_irq(dev->host_irq, kvm_assigned_dev_intr, + 0, "kvm_assigned_intx_device", (void *)dev)) return -EIO; return 0; } @@ -290,8 +326,8 @@ static int assigned_device_enable_host_msi(struct kvm *kvm, } dev->host_irq = dev->dev->irq; - if (kvm_request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, - 0, dev->irq_name, (void *)dev)) { + if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 0, + "kvm_assigned_msi_device", (void *)dev)) { pci_disable_msi(dev->dev); return -EIO; } @@ -316,19 +352,16 @@ static int assigned_device_enable_host_msix(struct kvm *kvm, return r; for (i = 0; i < dev->entries_nr; i++) { - r = kvm_request_threaded_irq(dev->host_msix_entries[i].vector, - NULL, kvm_assigned_dev_thread, - 0, dev->irq_name, (void *)dev); + r = request_irq(dev->host_msix_entries[i].vector, + kvm_assigned_dev_intr, 0, + "kvm_assigned_msix_device", + (void *)dev); + /* FIXME: free requested_irq's on failure */ if (r) - goto err; + return r; } return 0; -err: - for (i -= 1; i >= 0; i--) - free_irq(dev->host_msix_entries[i].vector, (void *)dev); - pci_disable_msix(dev->dev); - return r; } #endif @@ -375,9 +408,6 @@ static int assign_host_irq(struct kvm *kvm, if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK) return r; - snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s", - pci_name(dev->dev)); - switch (host_irq_type) { case KVM_DEV_IRQ_HOST_INTX: r = assigned_device_enable_host_intx(kvm, dev); @@ -454,6 +484,9 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, struct kvm_assigned_dev_kernel *match; unsigned long host_irq_type, guest_irq_type; + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + if (!irqchip_in_kernel(kvm)) return r; @@ -555,7 +588,6 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, } pci_reset_function(dev); - pci_save_state(dev); match->assigned_dev_id = assigned_dev->assigned_dev_id; match->host_segnr = assigned_dev->segnr; @@ -563,10 +595,12 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, match->host_devfn = assigned_dev->devfn; match->flags = assigned_dev->flags; match->dev = dev; - spin_lock_init(&match->intx_lock); + spin_lock_init(&match->assigned_dev_lock); match->irq_source_id = -1; match->kvm = kvm; match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; + INIT_WORK(&match->interrupt_work, + kvm_assigned_dev_interrupt_work_handler); list_add(&match->list, &kvm->arch.assigned_dev_head); @@ -586,7 +620,6 @@ out: mutex_unlock(&kvm->lock); return r; out_list_del: - pci_restore_state(dev); list_del(&match->list); pci_release_regions(dev); out_disable: @@ -659,9 +692,9 @@ static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm, r = -ENOMEM; goto msix_nr_out; } - adev->guest_msix_entries = - kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr, - GFP_KERNEL); + adev->guest_msix_entries = kzalloc( + sizeof(struct kvm_guest_msix_entry) * + entry_nr->entry_nr, GFP_KERNEL); if (!adev->guest_msix_entries) { kfree(adev->host_msix_entries); r = -ENOMEM; @@ -713,8 +746,8 @@ msix_entry_out: long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, unsigned long arg) { - void *argp = (void *)arg; - int r; + void __user *argp = (void __user *)arg; + int r = -ENOTTY; switch (ioctl) { case KVM_ASSIGN_PCI_DEVICE: { @@ -732,6 +765,7 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, r = -EOPNOTSUPP; break; } +#ifdef KVM_CAP_ASSIGN_DEV_IRQ case KVM_ASSIGN_DEV_IRQ: { struct kvm_assigned_irq 
assigned_irq; @@ -754,6 +788,8 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, goto out; break; } +#endif +#ifdef KVM_CAP_DEVICE_DEASSIGNMENT case KVM_DEASSIGN_PCI_DEVICE: { struct kvm_assigned_pci_dev assigned_dev; @@ -765,10 +801,11 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, goto out; break; } +#endif #ifdef KVM_CAP_IRQ_ROUTING case KVM_SET_GSI_ROUTING: { struct kvm_irq_routing routing; - struct kvm_irq_routing *urouting; + struct kvm_irq_routing __user *urouting; struct kvm_irq_routing_entry *entries; r = -EFAULT; @@ -817,9 +854,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, break; } #endif - default: - r = -ENOTTY; - break; } out: return r; diff --git a/linux/x86/coalesced_mmio.c b/linux/x86/coalesced_mmio.c index 850773e..d84dadf 100644 --- a/linux/x86/coalesced_mmio.c +++ b/linux/x86/coalesced_mmio.c @@ -42,7 +42,6 @@ * KVM coalesced MMIO * * Copyright (c) 2008 Bull S.A.S. - * Copyright 2009 Red Hat, Inc. and/or its affiliates. * * Author: Laurent Vivier <Laurent.Vivier@bull.net> * @@ -161,10 +160,8 @@ int kvm_coalesced_mmio_init(struct kvm *kvm) return ret; out_free_dev: - kvm->coalesced_mmio_dev = NULL; kfree(dev); out_free_page: - kvm->coalesced_mmio_ring = NULL; __free_page(page); out_err: return ret; @@ -182,7 +179,7 @@ int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm, struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev; if (dev == NULL) - return -ENXIO; + return -EINVAL; mutex_lock(&kvm->slots_lock); if (dev->nb_zones >= KVM_COALESCED_MMIO_ZONE_MAX) { @@ -205,7 +202,7 @@ int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm, struct kvm_coalesced_mmio_zone *z; if (dev == NULL) - return -ENXIO; + return -EINVAL; mutex_lock(&kvm->slots_lock); diff --git a/linux/x86/emulate.c b/linux/x86/emulate.c index cffdf3c..48e50c7 100644 --- a/linux/x86/emulate.c +++ b/linux/x86/emulate.c @@ -49,7 +49,6 @@ * privileged instructions: * * Copyright (C) 2006 Qumranet - * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Avi Kivity <avi@qumranet.com> * Yaniv Kamay <yaniv@qumranet.com> @@ -60,13 +59,20 @@ * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 */ +#ifndef __KERNEL__ +#include <stdio.h> +#include <stdint.h> +#include <public/xen.h> +#define DPRINTF(_f, _a ...) printf(_f , ## _a) +#else #include <linux/kvm_host.h> #include "kvm_cache_regs.h" +#define DPRINTF(x...) do {} while (0) +#endif #include <linux/module.h> #include <asm/kvm_emulate.h> #include "x86.h" -#include "tss.h" /* * Opcode effective-address decode tables. @@ -83,13 +89,11 @@ #define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ #define DstReg (2<<1) /* Register operand. */ #define DstMem (3<<1) /* Memory operand. */ -#define DstAcc (4<<1) /* Destination Accumulator */ -#define DstDI (5<<1) /* Destination is in ES:(E)DI */ -#define DstMem64 (6<<1) /* 64bit memory operand */ -#define DstImmUByte (7<<1) /* 8-bit unsigned immediate operand */ +#define DstAcc (4<<1) /* Destination Accumulator */ #define DstMask (7<<1) /* Source operand type. */ #define SrcNone (0<<4) /* No source operand. */ +#define SrcImplicit (0<<4) /* Source operand is implicit in the opcode. */ #define SrcReg (1<<4) /* Register operand. */ #define SrcMem (2<<4) /* Memory operand. */ #define SrcMem16 (3<<4) /* Memory operand (16-bit). */ @@ -99,11 +103,6 @@ #define SrcOne (7<<4) /* Implied '1' */ #define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. 
*/ #define SrcImmU (9<<4) /* Immediate operand, unsigned */ -#define SrcSI (0xa<<4) /* Source is in the DS:RSI */ -#define SrcImmFAddr (0xb<<4) /* Source is immediate far address */ -#define SrcMemFAddr (0xc<<4) /* Source is far address in memory */ -#define SrcAcc (0xd<<4) /* Source Accumulator */ -#define SrcImmU16 (0xe<<4) /* Immediate operand, unsigned, 16 bits */ #define SrcMask (0xf<<4) /* Generic ModRM decode. */ #define ModRM (1<<8) @@ -115,10 +114,8 @@ #define Stack (1<<13) /* Stack instruction (push/pop) */ #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ +#define GroupMask 0xff /* Group number stored in bits 0:7 */ /* Misc flags */ -#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */ -#define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */ -#define Undefined (1<<25) /* No Such Instruction */ #define Lock (1<<26) /* lock prefix is allowed for the instruction */ #define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ #define No64 (1<<28) @@ -127,30 +124,285 @@ #define Src2CL (1<<29) #define Src2ImmByte (2<<29) #define Src2One (3<<29) -#define Src2Imm (4<<29) +#define Src2Imm16 (4<<29) #define Src2Mask (7<<29) -#define X2(x...) x, x -#define X3(x...) X2(x), x -#define X4(x...) X2(x), X2(x) -#define X5(x...) X4(x), x -#define X6(x...) X4(x), X2(x) -#define X7(x...) X4(x), X3(x) -#define X8(x...) X4(x), X4(x) -#define X16(x...) X8(x), X8(x) - -struct opcode { - u32 flags; - union { - int (*execute)(struct x86_emulate_ctxt *ctxt); - struct opcode *group; - struct group_dual *gdual; - } u; +enum { + Group1_80, Group1_81, Group1_82, Group1_83, + Group1A, Group3_Byte, Group3, Group4, Group5, Group7, + Group8, Group9, }; -struct group_dual { - struct opcode mod012[8]; - struct opcode mod3[8]; +static u32 opcode_table[256] = { + /* 0x00 - 0x07 */ + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, + ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, + /* 0x08 - 0x0F */ + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, + ImplicitOps | Stack | No64, 0, + /* 0x10 - 0x17 */ + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, + ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, + /* 0x18 - 0x1F */ + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, + ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, + /* 0x20 - 0x27 */ + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, + /* 0x28 - 0x2F */ + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x30 - 0x37 */ + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x38 - 0x3F */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | 
ModRM, DstReg | SrcMem | ModRM, + ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, + 0, 0, + /* 0x40 - 0x47 */ + DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, + /* 0x48 - 0x4F */ + DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, + /* 0x50 - 0x57 */ + SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, + SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, + /* 0x58 - 0x5F */ + DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, + DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, + /* 0x60 - 0x67 */ + ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, + 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , + 0, 0, 0, 0, + /* 0x68 - 0x6F */ + SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0, + SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ + SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ + /* 0x70 - 0x77 */ + SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, + SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, + /* 0x78 - 0x7F */ + SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, + SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, + /* 0x80 - 0x87 */ + Group | Group1_80, Group | Group1_81, + Group | Group1_82, Group | Group1_83, + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, + /* 0x88 - 0x8F */ + ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, + ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstMem | SrcReg | ModRM | Mov, ModRM | DstReg, + DstReg | SrcMem | ModRM | Mov, Group | Group1A, + /* 0x90 - 0x97 */ + DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, + /* 0x98 - 0x9F */ + 0, 0, SrcImm | Src2Imm16 | No64, 0, + ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, + /* 0xA0 - 0xA7 */ + ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, + ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, + ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, + ByteOp | ImplicitOps | String, ImplicitOps | String, + /* 0xA8 - 0xAF */ + 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, + ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, + ByteOp | ImplicitOps | String, ImplicitOps | String, + /* 0xB0 - 0xB7 */ + ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, + ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, + ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, + ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, + /* 0xB8 - 0xBF */ + DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, + DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, + DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, + DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, + /* 0xC0 - 0xC7 */ + ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, + 0, ImplicitOps | Stack, 0, 0, + ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, + /* 0xC8 - 0xCF */ + 0, 0, 0, ImplicitOps | Stack, + ImplicitOps, SrcImmByte, ImplicitOps | No64, ImplicitOps, + /* 0xD0 - 0xD7 */ + ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, + ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, + 0, 0, 0, 0, + /* 0xD8 - 0xDF */ + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xE0 - 0xE7 */ + 0, 0, 0, 0, + ByteOp | SrcImmUByte, SrcImmUByte, + ByteOp | SrcImmUByte, SrcImmUByte, + /* 0xE8 - 0xEF */ + SrcImm | 
Stack, SrcImm | ImplicitOps, + SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps, + SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, + SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, + /* 0xF0 - 0xF7 */ + 0, 0, 0, 0, + ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3, + /* 0xF8 - 0xFF */ + ImplicitOps, 0, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, +}; + +static u32 twobyte_table[256] = { + /* 0x00 - 0x0F */ + 0, Group | GroupDual | Group7, 0, 0, + 0, ImplicitOps, ImplicitOps | Priv, 0, + ImplicitOps | Priv, ImplicitOps | Priv, 0, 0, + 0, ImplicitOps | ModRM, 0, 0, + /* 0x10 - 0x1F */ + 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, + /* 0x20 - 0x2F */ + ModRM | ImplicitOps | Priv, ModRM | Priv, + ModRM | ImplicitOps | Priv, ModRM | Priv, + 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x30 - 0x3F */ + ImplicitOps | Priv, 0, ImplicitOps | Priv, 0, + ImplicitOps, ImplicitOps | Priv, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x40 - 0x47 */ + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + /* 0x48 - 0x4F */ + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + /* 0x50 - 0x5F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x60 - 0x6F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x70 - 0x7F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x80 - 0x8F */ + SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, + SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, + /* 0x90 - 0x9F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xA0 - 0xA7 */ + ImplicitOps | Stack, ImplicitOps | Stack, + 0, DstMem | SrcReg | ModRM | BitOp, + DstMem | SrcReg | Src2ImmByte | ModRM, + DstMem | SrcReg | Src2CL | ModRM, 0, 0, + /* 0xA8 - 0xAF */ + ImplicitOps | Stack, ImplicitOps | Stack, + 0, DstMem | SrcReg | ModRM | BitOp | Lock, + DstMem | SrcReg | Src2ImmByte | ModRM, + DstMem | SrcReg | Src2CL | ModRM, + ModRM, 0, + /* 0xB0 - 0xB7 */ + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, + 0, DstMem | SrcReg | ModRM | BitOp | Lock, + 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem16 | ModRM | Mov, + /* 0xB8 - 0xBF */ + 0, 0, + Group | Group8, DstMem | SrcReg | ModRM | BitOp | Lock, + 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem16 | ModRM | Mov, + /* 0xC0 - 0xCF */ + 0, 0, 0, DstMem | SrcReg | ModRM | Mov, + 0, 0, 0, Group | GroupDual | Group9, + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xD0 - 0xDF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xE0 - 0xEF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xF0 - 0xFF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +static u32 group_table[] = { + [Group1_80*8] = + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM, + [Group1_81*8] = + DstMem | SrcImm | ModRM | 
Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM, + [Group1_82*8] = + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64, + [Group1_83*8] = + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM, + [Group1A*8] = + DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, + [Group3_Byte*8] = + ByteOp | SrcImm | DstMem | ModRM, 0, + ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, + 0, 0, 0, 0, + [Group3*8] = + DstMem | SrcImm | ModRM, 0, + DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, + 0, 0, 0, 0, + [Group4*8] = + ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, + 0, 0, 0, 0, 0, 0, + [Group5*8] = + DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, + SrcMem | ModRM | Stack, 0, + SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0, + [Group7*8] = + 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv, + SrcNone | ModRM | DstMem | Mov, 0, + SrcMem16 | ModRM | Mov | Priv, SrcMem | ModRM | ByteOp | Priv, + [Group8*8] = + 0, 0, 0, 0, + DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock, + [Group9*8] = + 0, ImplicitOps | ModRM | Lock, 0, 0, 0, 0, 0, 0, +}; + +static u32 group2_table[] = { + [Group7*8] = + SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM, + SrcNone | ModRM | DstMem | Mov, 0, + SrcMem16 | ModRM | Mov, 0, + [Group9*8] = + 0, 0, 0, 0, 0, 0, 0, 0, }; /* EFLAGS bit definitions. 
*/ @@ -172,9 +424,6 @@ struct group_dual { #define EFLG_PF (1<<2) #define EFLG_CF (1<<0) -#define EFLG_RESERVED_ZEROS_MASK 0xffc0802a -#define EFLG_RESERVED_ONE_MASK 2 - /* * Instruction emulation: * Most instructions are emulated directly via a fragment of inline assembly @@ -227,13 +476,13 @@ struct group_dual { #define ON64(x) #endif -#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix, _dsttype) \ +#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix) \ do { \ __asm__ __volatile__ ( \ _PRE_EFLAGS("0", "4", "2") \ _op _suffix " %"_x"3,%1; " \ _POST_EFLAGS("0", "4", "2") \ - : "=m" (_eflags), "+q" (*(_dsttype*)&(_dst).val),\ + : "=m" (_eflags), "=m" ((_dst).val), \ "=&r" (_tmp) \ : _y ((_src).val), "i" (EFLAGS_MASK)); \ } while (0) @@ -246,13 +495,13 @@ struct group_dual { \ switch ((_dst).bytes) { \ case 2: \ - ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w",u16);\ + ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w"); \ break; \ case 4: \ - ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l",u32);\ + ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l"); \ break; \ case 8: \ - ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q",u64)); \ + ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q")); \ break; \ } \ } while (0) @@ -262,7 +511,7 @@ struct group_dual { unsigned long _tmp; \ switch ((_dst).bytes) { \ case 1: \ - ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b",u8); \ + ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b"); \ break; \ default: \ __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ @@ -349,91 +598,16 @@ struct group_dual { } \ } while (0) -#define __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, _suffix) \ - do { \ - unsigned long _tmp; \ - \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "4", "1") \ - _op _suffix " %5; " \ - _POST_EFLAGS("0", "4", "1") \ - : "=m" (_eflags), "=&r" (_tmp), \ - "+a" (_rax), "+d" (_rdx) \ - : "i" (EFLAGS_MASK), "m" ((_src).val), \ - "a" (_rax), "d" (_rdx)); \ - } while (0) - -#define __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _suffix, _ex) \ - do { \ - unsigned long _tmp; \ - \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "5", "1") \ - "1: \n\t" \ - _op _suffix " %6; " \ - "2: \n\t" \ - _POST_EFLAGS("0", "5", "1") \ - ".pushsection .fixup,\"ax\" \n\t" \ - "3: movb $1, %4 \n\t" \ - "jmp 2b \n\t" \ - ".popsection \n\t" \ - _ASM_EXTABLE(1b, 3b) \ - : "=m" (_eflags), "=&r" (_tmp), \ - "+a" (_rax), "+d" (_rdx), "+qm"(_ex) \ - : "i" (EFLAGS_MASK), "m" ((_src).val), \ - "a" (_rax), "d" (_rdx)); \ - } while (0) - -/* instruction has only one source operand, destination is implicit (e.g. 
mul, div, imul, idiv) */ -#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \ - do { \ - switch((_src).bytes) { \ - case 1: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "b"); break; \ - case 2: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "w"); break; \ - case 4: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "l"); break; \ - case 8: ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "q")); break; \ - } \ - } while (0) - -#define emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _ex) \ - do { \ - switch((_src).bytes) { \ - case 1: \ - __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ - _eflags, "b", _ex); \ - break; \ - case 2: \ - __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ - _eflags, "w", _ex); \ - break; \ - case 4: \ - __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ - _eflags, "l", _ex); \ - break; \ - case 8: ON64( \ - __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ - _eflags, "q", _ex)); \ - break; \ - } \ - } while (0) - /* Fetch next part of the instruction being emulated. */ #define insn_fetch(_type, _size, _eip) \ ({ unsigned long _x; \ rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \ - if (rc != X86EMUL_CONTINUE) \ + if (rc != 0) \ goto done; \ (_eip) += (_size); \ (_type)_x; \ }) -#define insn_fetch_arr(_arr, _size, _eip) \ -({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \ - if (rc != X86EMUL_CONTINUE) \ - goto done; \ - (_eip) += (_size); \ -}) - static inline unsigned long ad_mask(struct decode_cache *c) { return (1UL << (c->ad_bytes << 3)) - 1; @@ -450,9 +624,9 @@ address_mask(struct decode_cache *c, unsigned long reg) } static inline unsigned long -register_address(struct decode_cache *c, unsigned long reg) +register_address(struct decode_cache *c, unsigned long base, unsigned long reg) { - return address_mask(c, reg); + return base + address_mask(c, reg); } static inline void @@ -475,102 +649,69 @@ static void set_seg_override(struct decode_cache *c, int seg) c->seg_override = seg; } -static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, int seg) +static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) { if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) return 0; - return ops->get_cached_segment_base(seg, ctxt->vcpu); + return kvm_x86_ops->get_segment_base(ctxt->vcpu, seg); } -static unsigned seg_override(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - struct decode_cache *c) +static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt, + struct decode_cache *c) { if (!c->has_seg_override) return 0; - return c->seg_override; -} - -static ulong linear(struct x86_emulate_ctxt *ctxt, - struct segmented_address addr) -{ - struct decode_cache *c = &ctxt->decode; - ulong la; - - la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea; - if (c->ad_bytes != 8) - la &= (u32)-1; - return la; -} - -static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, - u32 error, bool valid) -{ - ctxt->exception.vector = vec; - ctxt->exception.error_code = error; - ctxt->exception.error_code_valid = valid; - return X86EMUL_PROPAGATE_FAULT; -} - -static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err) -{ - return emulate_exception(ctxt, GP_VECTOR, err, true); -} - -static int emulate_ud(struct x86_emulate_ctxt *ctxt) -{ - return emulate_exception(ctxt, UD_VECTOR, 0, false); + return seg_base(ctxt, c->seg_override); } -static int emulate_ts(struct x86_emulate_ctxt *ctxt, int err) +static unsigned long es_base(struct 
x86_emulate_ctxt *ctxt) { - return emulate_exception(ctxt, TS_VECTOR, err, true); + return seg_base(ctxt, VCPU_SREG_ES); } -static int emulate_de(struct x86_emulate_ctxt *ctxt) +static unsigned long ss_base(struct x86_emulate_ctxt *ctxt) { - return emulate_exception(ctxt, DE_VECTOR, 0, false); + return seg_base(ctxt, VCPU_SREG_SS); } static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, - unsigned long eip, u8 *dest) + unsigned long linear, u8 *dest) { struct fetch_cache *fc = &ctxt->decode.fetch; int rc; - int size, cur_size; + int size; - if (eip == fc->end) { - cur_size = fc->end - fc->start; - size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip)); - rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size, - size, ctxt->vcpu, &ctxt->exception); - if (rc != X86EMUL_CONTINUE) + if (linear < fc->start || linear >= fc->end) { + size = min(15UL, PAGE_SIZE - offset_in_page(linear)); + rc = ops->fetch(linear, fc->data, size, ctxt->vcpu, NULL); + if (rc) return rc; - fc->end += size; + fc->start = linear; + fc->end = linear + size; } - *dest = fc->data[eip - fc->start]; - return X86EMUL_CONTINUE; + *dest = fc->data[linear - fc->start]; + return 0; } static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, unsigned long eip, void *dest, unsigned size) { - int rc; + int rc = 0; /* x86 instructions are limited to 15 bytes. */ - if (eip + size - ctxt->eip > 15) + if (eip + size - ctxt->decode.eip_orig > 15) return X86EMUL_UNHANDLEABLE; + eip += ctxt->cs_base; while (size--) { rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++); - if (rc != X86EMUL_CONTINUE) + if (rc) return rc; } - return X86EMUL_CONTINUE; + return 0; } /* @@ -591,7 +732,7 @@ static void *decode_register(u8 modrm_reg, unsigned long *regs, static int read_descriptor(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, - struct segmented_address addr, + void *ptr, u16 *size, unsigned long *address, int op_bytes) { int rc; @@ -599,13 +740,12 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt, if (op_bytes == 2) op_bytes = 3; *address = 0; - rc = ops->read_std(linear(ctxt, addr), (unsigned long *)size, 2, - ctxt->vcpu, &ctxt->exception); - if (rc != X86EMUL_CONTINUE) + rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, + ctxt->vcpu, NULL); + if (rc) return rc; - addr.ea += 2; - rc = ops->read_std(linear(ctxt, addr), address, op_bytes, - ctxt->vcpu, &ctxt->exception); + rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, + ctxt->vcpu, NULL); return rc; } @@ -644,24 +784,6 @@ static int test_cc(unsigned int condition, unsigned int flags) return (!!rc ^ (condition & 1)); } -static void fetch_register_operand(struct operand *op) -{ - switch (op->bytes) { - case 1: - op->val = *(u8 *)op->addr.reg; - break; - case 2: - op->val = *(u16 *)op->addr.reg; - break; - case 4: - op->val = *(u32 *)op->addr.reg; - break; - case 8: - op->val = *(u64 *)op->addr.reg; - break; - } -} - static void decode_register_operand(struct operand *op, struct decode_cache *c, int inhibit_bytereg) @@ -673,25 +795,34 @@ static void decode_register_operand(struct operand *op, reg = (c->b & 7) | ((c->rex_prefix & 1) << 3); op->type = OP_REG; if ((c->d & ByteOp) && !inhibit_bytereg) { - op->addr.reg = decode_register(reg, c->regs, highbyte_regs); + op->ptr = decode_register(reg, c->regs, highbyte_regs); + op->val = *(u8 *)op->ptr; op->bytes = 1; } else { - op->addr.reg = decode_register(reg, c->regs, 0); + op->ptr = decode_register(reg, c->regs, 0); op->bytes = 
c->op_bytes; + switch (op->bytes) { + case 2: + op->val = *(u16 *)op->ptr; + break; + case 4: + op->val = *(u32 *)op->ptr; + break; + case 8: + op->val = *(u64 *) op->ptr; + break; + } } - fetch_register_operand(op); op->orig_val = op->val; } static int decode_modrm(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - struct operand *op) + struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; u8 sib; int index_reg = 0, base_reg = 0, scale; - int rc = X86EMUL_CONTINUE; - ulong modrm_ea = 0; + int rc = 0; if (c->rex_prefix) { c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */ @@ -703,19 +834,16 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, c->modrm_mod |= (c->modrm & 0xc0) >> 6; c->modrm_reg |= (c->modrm & 0x38) >> 3; c->modrm_rm |= (c->modrm & 0x07); - c->modrm_seg = VCPU_SREG_DS; + c->modrm_ea = 0; + c->use_modrm_ea = 1; if (c->modrm_mod == 3) { - op->type = OP_REG; - op->bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - op->addr.reg = decode_register(c->modrm_rm, + c->modrm_ptr = decode_register(c->modrm_rm, c->regs, c->d & ByteOp); - fetch_register_operand(op); + c->modrm_val = *(unsigned long *)c->modrm_ptr; return rc; } - op->type = OP_MEM; - if (c->ad_bytes == 2) { unsigned bx = c->regs[VCPU_REGS_RBX]; unsigned bp = c->regs[VCPU_REGS_RBP]; @@ -726,46 +854,47 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, switch (c->modrm_mod) { case 0: if (c->modrm_rm == 6) - modrm_ea += insn_fetch(u16, 2, c->eip); + c->modrm_ea += insn_fetch(u16, 2, c->eip); break; case 1: - modrm_ea += insn_fetch(s8, 1, c->eip); + c->modrm_ea += insn_fetch(s8, 1, c->eip); break; case 2: - modrm_ea += insn_fetch(u16, 2, c->eip); + c->modrm_ea += insn_fetch(u16, 2, c->eip); break; } switch (c->modrm_rm) { case 0: - modrm_ea += bx + si; + c->modrm_ea += bx + si; break; case 1: - modrm_ea += bx + di; + c->modrm_ea += bx + di; break; case 2: - modrm_ea += bp + si; + c->modrm_ea += bp + si; break; case 3: - modrm_ea += bp + di; + c->modrm_ea += bp + di; break; case 4: - modrm_ea += si; + c->modrm_ea += si; break; case 5: - modrm_ea += di; + c->modrm_ea += di; break; case 6: if (c->modrm_mod != 0) - modrm_ea += bp; + c->modrm_ea += bp; break; case 7: - modrm_ea += bx; + c->modrm_ea += bx; break; } if (c->modrm_rm == 2 || c->modrm_rm == 3 || (c->modrm_rm == 6 && c->modrm_mod != 0)) - c->modrm_seg = VCPU_SREG_SS; - modrm_ea = (u16)modrm_ea; + if (!c->has_seg_override) + set_seg_override(c, VCPU_SREG_SS); + c->modrm_ea = (u16)c->modrm_ea; } else { /* 32/64-bit ModR/M decode. 
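Aside (not part of the patch): the 16-bit ModR/M decode above walks the classic base-register combinations (bx+si, bx+di, bp+si, bp+di, si, di, bp/disp16, bx). A small standalone sketch of the same effective-address rule, assuming the displacement has already been fetched:

#include <stdint.h>
#include <stdio.h>

/* 16-bit effective address: the same base combinations as the decode above. */
static uint16_t ea16(unsigned mod, unsigned rm, uint16_t disp,
                     uint16_t bx, uint16_t bp, uint16_t si, uint16_t di)
{
	uint16_t base;

	switch (rm) {
	case 0: base = bx + si; break;
	case 1: base = bx + di; break;
	case 2: base = bp + si; break;
	case 3: base = bp + di; break;
	case 4: base = si;      break;
	case 5: base = di;      break;
	case 6: base = (mod == 0) ? 0 : bp; break;  /* mod=0, rm=6: disp16 only */
	default: base = bx;     break;
	}
	if (mod == 0 && rm != 6)	/* mod=0 carries no displacement otherwise */
		disp = 0;
	return (uint16_t)(base + disp);
}

int main(void)
{
	/* [bp + di + 0x10] with bp=0x2000, di=0x0004 -> 0x2014 */
	printf("%#x\n", (unsigned)ea16(1, 3, 0x10, 0, 0x2000, 0, 0x0004));
	return 0;
}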
*/ if ((c->modrm_rm & 7) == 4) { @@ -775,377 +904,358 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, scale = sib >> 6; if ((base_reg & 7) == 5 && c->modrm_mod == 0) - modrm_ea += insn_fetch(s32, 4, c->eip); + c->modrm_ea += insn_fetch(s32, 4, c->eip); else - modrm_ea += c->regs[base_reg]; + c->modrm_ea += c->regs[base_reg]; if (index_reg != 4) - modrm_ea += c->regs[index_reg] << scale; + c->modrm_ea += c->regs[index_reg] << scale; } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) { if (ctxt->mode == X86EMUL_MODE_PROT64) c->rip_relative = 1; } else - modrm_ea += c->regs[c->modrm_rm]; + c->modrm_ea += c->regs[c->modrm_rm]; switch (c->modrm_mod) { case 0: if (c->modrm_rm == 5) - modrm_ea += insn_fetch(s32, 4, c->eip); + c->modrm_ea += insn_fetch(s32, 4, c->eip); break; case 1: - modrm_ea += insn_fetch(s8, 1, c->eip); + c->modrm_ea += insn_fetch(s8, 1, c->eip); break; case 2: - modrm_ea += insn_fetch(s32, 4, c->eip); + c->modrm_ea += insn_fetch(s32, 4, c->eip); break; } } - op->addr.mem.ea = modrm_ea; done: return rc; } static int decode_abs(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - struct operand *op) + struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; - int rc = X86EMUL_CONTINUE; + int rc = 0; - op->type = OP_MEM; switch (c->ad_bytes) { case 2: - op->addr.mem.ea = insn_fetch(u16, 2, c->eip); + c->modrm_ea = insn_fetch(u16, 2, c->eip); break; case 4: - op->addr.mem.ea = insn_fetch(u32, 4, c->eip); + c->modrm_ea = insn_fetch(u32, 4, c->eip); break; case 8: - op->addr.mem.ea = insn_fetch(u64, 8, c->eip); + c->modrm_ea = insn_fetch(u64, 8, c->eip); break; } done: return rc; } -static void fetch_bit_operand(struct decode_cache *c) +int +x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { - long sv = 0, mask; + struct decode_cache *c = &ctxt->decode; + int rc = 0; + int mode = ctxt->mode; + int def_op_bytes, def_ad_bytes, group; - if (c->dst.type == OP_MEM && c->src.type == OP_REG) { - mask = ~(c->dst.bytes * 8 - 1); + /* Shadow copy of register state. Committed on successful emulation. */ - if (c->src.bytes == 2) - sv = (s16)c->src.val & (s16)mask; - else if (c->src.bytes == 4) - sv = (s32)c->src.val & (s32)mask; + memset(c, 0, sizeof(struct decode_cache)); + c->eip = c->eip_orig = kvm_rip_read(ctxt->vcpu); + ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); + memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); - c->dst.addr.mem.ea += (sv >> 3); + switch (mode) { + case X86EMUL_MODE_REAL: + case X86EMUL_MODE_VM86: + case X86EMUL_MODE_PROT16: + def_op_bytes = def_ad_bytes = 2; + break; + case X86EMUL_MODE_PROT32: + def_op_bytes = def_ad_bytes = 4; + break; +#ifdef CONFIG_X86_64 + case X86EMUL_MODE_PROT64: + def_op_bytes = 4; + def_ad_bytes = 8; + break; +#endif + default: + return -1; } - /* only subword offset */ - c->src.val &= (c->dst.bytes << 3) - 1; -} - -static int read_emulated(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - unsigned long addr, void *dest, unsigned size) -{ - int rc; - struct read_cache *mc = &ctxt->decode.mem_read; - - while (size) { - int n = min(size, 8u); - size -= n; - if (mc->pos < mc->end) - goto read_cached; + c->op_bytes = def_op_bytes; + c->ad_bytes = def_ad_bytes; - rc = ops->read_emulated(addr, mc->data + mc->end, n, - &ctxt->exception, ctxt->vcpu); - if (rc != X86EMUL_CONTINUE) - return rc; - mc->end += n; + /* Legacy prefixes. 
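Aside (not part of the patch): in 32/64-bit mode the decode above folds an optional SIB byte into the effective address as base + (index << scale) + displacement, with index register 4 meaning "no index" and base 5 under mod 0 meaning disp32 only. A standalone sketch, assuming the displacement is already known:

#include <stdint.h>
#include <stdio.h>

/* 32-bit SIB address: base + (index << scale) + disp. */
static uint32_t ea_sib(uint8_t sib, unsigned mod, const uint32_t regs[8],
                       int32_t disp)
{
	unsigned scale = sib >> 6;
	unsigned index = (sib >> 3) & 7;
	unsigned base  = sib & 7;
	uint32_t ea = (uint32_t)disp;

	if (!(base == 5 && mod == 0))	/* base 5 with mod 0: disp32 only */
		ea += regs[base];
	if (index != 4)			/* index 4 (ESP) means no index */
		ea += regs[index] << scale;
	return ea;
}

int main(void)
{
	uint32_t regs[8] = { 0 };

	regs[3] = 0x1000;	/* EBX */
	regs[6] = 0x20;		/* ESI */
	/* scale=2, index=ESI, base=EBX, disp8=8 -> 0x1000 + 0x80 + 8 */
	printf("%#x\n", (unsigned)ea_sib((2 << 6) | (6 << 3) | 3, 1, regs, 8));
	return 0;
}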
*/ + for (;;) { + switch (c->b = insn_fetch(u8, 1, c->eip)) { + case 0x66: /* operand-size override */ + /* switch between 2/4 bytes */ + c->op_bytes = def_op_bytes ^ 6; + break; + case 0x67: /* address-size override */ + if (mode == X86EMUL_MODE_PROT64) + /* switch between 4/8 bytes */ + c->ad_bytes = def_ad_bytes ^ 12; + else + /* switch between 2/4 bytes */ + c->ad_bytes = def_ad_bytes ^ 6; + break; + case 0x26: /* ES override */ + case 0x2e: /* CS override */ + case 0x36: /* SS override */ + case 0x3e: /* DS override */ + set_seg_override(c, (c->b >> 3) & 3); + break; + case 0x64: /* FS override */ + case 0x65: /* GS override */ + set_seg_override(c, c->b & 7); + break; + case 0x40 ... 0x4f: /* REX */ + if (mode != X86EMUL_MODE_PROT64) + goto done_prefixes; + c->rex_prefix = c->b; + continue; + case 0xf0: /* LOCK */ + c->lock_prefix = 1; + break; + case 0xf2: /* REPNE/REPNZ */ + c->rep_prefix = REPNE_PREFIX; + break; + case 0xf3: /* REP/REPE/REPZ */ + c->rep_prefix = REPE_PREFIX; + break; + default: + goto done_prefixes; + } - read_cached: - memcpy(dest, mc->data + mc->pos, n); - mc->pos += n; - dest += n; - addr += n; - } - return X86EMUL_CONTINUE; -} + /* Any legacy prefix after a REX prefix nullifies its effect. */ -static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - unsigned int size, unsigned short port, - void *dest) -{ - struct read_cache *rc = &ctxt->decode.io_read; - - if (rc->pos == rc->end) { /* refill pio read ahead */ - struct decode_cache *c = &ctxt->decode; - unsigned int in_page, n; - unsigned int count = c->rep_prefix ? - address_mask(c, c->regs[VCPU_REGS_RCX]) : 1; - in_page = (ctxt->eflags & EFLG_DF) ? - offset_in_page(c->regs[VCPU_REGS_RDI]) : - PAGE_SIZE - offset_in_page(c->regs[VCPU_REGS_RDI]); - n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size, - count); - if (n == 0) - n = 1; - rc->pos = rc->end = 0; - if (!ops->pio_in_emulated(size, port, rc->data, n, ctxt->vcpu)) - return 0; - rc->end = n * size; + c->rex_prefix = 0; } - memcpy(dest, rc->data + rc->pos, size); - rc->pos += size; - return 1; -} - -static u32 desc_limit_scaled(struct kvm_desc_struct *desc) -{ - u32 limit = kvm_get_desc_limit(desc); - - return desc->g ? (limit << 12) | 0xfff : limit; -} - -static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - u16 selector, struct kvm_desc_ptr *dt) -{ - if (selector & 1 << 2) { - struct kvm_desc_struct desc; - memset (dt, 0, sizeof *dt); - if (!ops->get_cached_descriptor(&desc, VCPU_SREG_LDTR, ctxt->vcpu)) - return; - - dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? 
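Aside (not part of the patch): the desc_limit_scaled() helper above answers to the granularity bit: with G set the 20-bit limit counts 4 KiB units, so the byte limit becomes (limit << 12) | 0xfff, which is how a 0xfffff limit covers the full 4 GiB. A standalone sketch:

#include <stdint.h>
#include <stdio.h>

/* With the granularity bit set, the 20-bit limit counts 4 KiB units. */
static uint32_t limit_scaled(uint32_t limit, int g_bit)
{
	return g_bit ? (limit << 12) | 0xfff : limit;
}

int main(void)
{
	printf("%#x\n", (unsigned)limit_scaled(0xfffff, 1));  /* 0xffffffff */
	printf("%#x\n", (unsigned)limit_scaled(0x67, 0));     /* byte-granular */
	return 0;
}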
*/ - dt->address = kvm_get_desc_base(&desc); - } else - ops->get_gdt(dt, ctxt->vcpu); -} - -/* allowed just for 8 bytes segments */ -static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - u16 selector, struct kvm_desc_struct *desc) -{ - struct kvm_desc_ptr dt; - u16 index = selector >> 3; - int ret; - ulong addr; - - get_descriptor_table_ptr(ctxt, ops, selector, &dt); - - if (dt.size < index * 8 + 7) - return emulate_gp(ctxt, selector & 0xfffc); - addr = dt.address + index * 8; - ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, - &ctxt->exception); - - return ret; -} - -/* allowed just for 8 bytes segments */ -static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - u16 selector, struct kvm_desc_struct *desc) -{ - struct kvm_desc_ptr dt; - u16 index = selector >> 3; - ulong addr; - int ret; - - get_descriptor_table_ptr(ctxt, ops, selector, &dt); - - if (dt.size < index * 8 + 7) - return emulate_gp(ctxt, selector & 0xfffc); - - addr = dt.address + index * 8; - ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, - &ctxt->exception); +done_prefixes: - return ret; -} + /* REX prefix. */ + if (c->rex_prefix) + if (c->rex_prefix & 8) + c->op_bytes = 8; /* REX.W */ -static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - u16 selector, int seg) -{ - struct kvm_desc_struct seg_desc; - u8 dpl, rpl, cpl; - unsigned err_vec = GP_VECTOR; - u32 err_code = 0; - bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ - int ret; - - memset(&seg_desc, 0, sizeof seg_desc); - - if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86) - || ctxt->mode == X86EMUL_MODE_REAL) { - /* set real mode segment descriptor */ - kvm_set_desc_base(&seg_desc, selector << 4); - kvm_set_desc_limit(&seg_desc, 0xffff); - seg_desc.type = 3; - seg_desc.p = 1; - seg_desc.s = 1; - goto load; + /* Opcode byte(s). */ + c->d = opcode_table[c->b]; + if (c->d == 0) { + /* Two-byte opcode? */ + if (c->b == 0x0f) { + c->twobyte = 1; + c->b = insn_fetch(u8, 1, c->eip); + c->d = twobyte_table[c->b]; + } } - /* NULL selector is not valid for TR, CS and SS */ - if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR) - && null_selector) - goto exception; + if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { + kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction"); + return -1; + } - /* TR should be in GDT only */ - if (seg == VCPU_SREG_TR && (selector & (1 << 2))) - goto exception; + if (c->d & Group) { + group = c->d & GroupMask; + c->modrm = insn_fetch(u8, 1, c->eip); + --c->eip; - if (null_selector) /* for NULL selector skip all following checks */ - goto load; + group = (group << 3) + ((c->modrm >> 3) & 7); + if ((c->d & GroupDual) && (c->modrm >> 6) == 3) + c->d = group2_table[group]; + else + c->d = group_table[group]; + } - ret = read_segment_descriptor(ctxt, ops, selector, &seg_desc); - if (ret != X86EMUL_CONTINUE) - return ret; + /* Unrecognised? */ + if (c->d == 0) { + DPRINTF("Cannot emulate %02x\n", c->b); + return -1; + } - err_code = selector & 0xfffc; - err_vec = GP_VECTOR; + if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) + c->op_bytes = 8; - /* can't load system descriptor into segment selecor */ - if (seg <= VCPU_SREG_GS && !seg_desc.s) - goto exception; + /* ModRM and SIB bytes. 
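Aside (not part of the patch): the descriptor-table helpers above carve a selector into its three architectural fields: a descriptor index in bits 15..3, the table indicator (GDT/LDT) in bit 2, and the requested privilege level in bits 1..0. A standalone sketch:

#include <stdint.h>
#include <stdio.h>

struct selector_fields {
	unsigned index;		/* bits 15..3: descriptor index */
	int ti;			/* bit 2: 0 = GDT, 1 = LDT */
	int rpl;		/* bits 1..0: requested privilege level */
};

static struct selector_fields split_selector(uint16_t sel)
{
	struct selector_fields f = {
		.index = sel >> 3,
		.ti    = (sel >> 2) & 1,
		.rpl   = sel & 3,
	};
	return f;
}

int main(void)
{
	struct selector_fields f = split_selector(0x002b);

	printf("index=%u ti=%d rpl=%d\n", f.index, f.ti, f.rpl);  /* 5 0 3 */
	return 0;
}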
*/ + if (c->d & ModRM) + rc = decode_modrm(ctxt, ops); + else if (c->d & MemAbs) + rc = decode_abs(ctxt, ops); + if (rc) + goto done; - if (!seg_desc.p) { - err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR; - goto exception; - } + if (!c->has_seg_override) + set_seg_override(c, VCPU_SREG_DS); - rpl = selector & 3; - dpl = seg_desc.dpl; - cpl = ops->cpl(ctxt->vcpu); + if (!(!c->twobyte && c->b == 0x8d)) + c->modrm_ea += seg_override_base(ctxt, c); - switch (seg) { - case VCPU_SREG_SS: + if (c->ad_bytes != 8) + c->modrm_ea = (u32)c->modrm_ea; + /* + * Decode and fetch the source operand: register, memory + * or immediate. + */ + switch (c->d & SrcMask) { + case SrcNone: + break; + case SrcReg: + decode_register_operand(&c->src, c, 0); + break; + case SrcMem16: + c->src.bytes = 2; + goto srcmem_common; + case SrcMem32: + c->src.bytes = 4; + goto srcmem_common; + case SrcMem: + c->src.bytes = (c->d & ByteOp) ? 1 : + c->op_bytes; + /* Don't fetch the address for invlpg: it could be unmapped. */ + if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7) + break; + srcmem_common: /* - * segment is not a writable data segment or segment - * selector's RPL != CPL or segment selector's RPL != CPL + * For instructions with a ModR/M byte, switch to register + * access if Mod = 3. */ - if (rpl != cpl || (seg_desc.type & 0xa) != 0x2 || dpl != cpl) - goto exception; - break; - case VCPU_SREG_CS: - if (!(seg_desc.type & 8)) - goto exception; - - if (seg_desc.type & 4) { - /* conforming */ - if (dpl > cpl) - goto exception; - } else { - /* nonconforming */ - if (rpl > cpl || dpl != cpl) - goto exception; + if ((c->d & ModRM) && c->modrm_mod == 3) { + c->src.type = OP_REG; + c->src.val = c->modrm_val; + c->src.ptr = c->modrm_ptr; + break; } - /* CS(RPL) <- CPL */ - selector = (selector & 0xfffc) | cpl; + c->src.type = OP_MEM; break; - case VCPU_SREG_TR: - if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9)) - goto exception; + case SrcImm: + case SrcImmU: + c->src.type = OP_IMM; + c->src.ptr = (unsigned long *)c->eip; + c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + if (c->src.bytes == 8) + c->src.bytes = 4; + /* NB. Immediates are sign-extended as necessary. 
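Aside (not part of the patch): the SrcImm handling around this point fetches the immediate bytes through a signed type so they sign-extend into the operand, then masks the value back down for the unsigned SrcImmU forms. A standalone sketch of that pattern (the helper name is made up):

#include <stdint.h>
#include <stdio.h>

/* Fetch an immediate through a signed type so it sign-extends, then mask
 * it back down for the "unsigned immediate" case. */
static long fetch_imm(const uint8_t *p, int bytes, int is_unsigned)
{
	long val = 0;

	switch (bytes) {
	case 1:
		val = *(const int8_t *)p;
		break;
	case 2:
		val = (int16_t)(p[0] | (p[1] << 8));
		break;
	case 4:
		val = (int32_t)(p[0] | (p[1] << 8) | (p[2] << 16) |
				((uint32_t)p[3] << 24));
		break;
	}
	if (is_unsigned) {
		switch (bytes) {
		case 1: val &= 0xff;   break;
		case 2: val &= 0xffff; break;
		case 4: val = (long)(uint32_t)val; break;
		}
	}
	return val;
}

int main(void)
{
	uint8_t imm8 = 0xf0;

	printf("signed %ld, unsigned %ld\n",
	       fetch_imm(&imm8, 1, 0), fetch_imm(&imm8, 1, 1));  /* -16, 240 */
	return 0;
}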
*/ + switch (c->src.bytes) { + case 1: + c->src.val = insn_fetch(s8, 1, c->eip); + break; + case 2: + c->src.val = insn_fetch(s16, 2, c->eip); + break; + case 4: + c->src.val = insn_fetch(s32, 4, c->eip); + break; + } + if ((c->d & SrcMask) == SrcImmU) { + switch (c->src.bytes) { + case 1: + c->src.val &= 0xff; + break; + case 2: + c->src.val &= 0xffff; + break; + case 4: + c->src.val &= 0xffffffff; + break; + } + } break; - case VCPU_SREG_LDTR: - if (seg_desc.s || seg_desc.type != 2) - goto exception; + case SrcImmByte: + case SrcImmUByte: + c->src.type = OP_IMM; + c->src.ptr = (unsigned long *)c->eip; + c->src.bytes = 1; + if ((c->d & SrcMask) == SrcImmByte) + c->src.val = insn_fetch(s8, 1, c->eip); + else + c->src.val = insn_fetch(u8, 1, c->eip); break; - default: /* DS, ES, FS, or GS */ - /* - * segment is not a data or readable code segment or - * ((segment is a data or nonconforming code segment) - * and (both RPL and CPL > DPL)) - */ - if ((seg_desc.type & 0xa) == 0x8 || - (((seg_desc.type & 0xc) != 0xc) && - (rpl > dpl && cpl > dpl))) - goto exception; + case SrcOne: + c->src.bytes = 1; + c->src.val = 1; break; } - if (seg_desc.s) { - /* mark segment as accessed */ - seg_desc.type |= 1; - ret = write_segment_descriptor(ctxt, ops, selector, &seg_desc); - if (ret != X86EMUL_CONTINUE) - return ret; - } -load: - ops->set_segment_selector(selector, seg, ctxt->vcpu); - ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu); - return X86EMUL_CONTINUE; -exception: - emulate_exception(ctxt, err_vec, err_code, true); - return X86EMUL_PROPAGATE_FAULT; -} - -static void write_register_operand(struct operand *op) -{ - /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ - switch (op->bytes) { - case 1: - *(u8 *)op->addr.reg = (u8)op->val; + /* + * Decode and fetch the second source operand: register, memory + * or immediate. + */ + switch (c->d & Src2Mask) { + case Src2None: break; - case 2: - *(u16 *)op->addr.reg = (u16)op->val; + case Src2CL: + c->src2.bytes = 1; + c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8; break; - case 4: - *op->addr.reg = (u32)op->val; - break; /* 64b: zero-extend */ - case 8: - *op->addr.reg = op->val; + case Src2ImmByte: + c->src2.type = OP_IMM; + c->src2.ptr = (unsigned long *)c->eip; + c->src2.bytes = 1; + c->src2.val = insn_fetch(u8, 1, c->eip); + break; + case Src2Imm16: + c->src2.type = OP_IMM; + c->src2.ptr = (unsigned long *)c->eip; + c->src2.bytes = 2; + c->src2.val = insn_fetch(u16, 2, c->eip); + break; + case Src2One: + c->src2.bytes = 1; + c->src2.val = 1; break; } -} - -static inline int writeback(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - int rc; - struct decode_cache *c = &ctxt->decode; - switch (c->dst.type) { - case OP_REG: - write_register_operand(&c->dst); - break; - case OP_MEM: - if (c->lock_prefix) - rc = ops->cmpxchg_emulated( - linear(ctxt, c->dst.addr.mem), - &c->dst.orig_val, - &c->dst.val, - c->dst.bytes, - &ctxt->exception, - ctxt->vcpu); - else - rc = ops->write_emulated( - linear(ctxt, c->dst.addr.mem), - &c->dst.val, - c->dst.bytes, - &ctxt->exception, - ctxt->vcpu); - if (rc != X86EMUL_CONTINUE) - return rc; + /* Decode and fetch the destination operand: register or memory. */ + switch (c->d & DstMask) { + case ImplicitOps: + /* Special instructions do their own operand decoding. 
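Aside (not part of the patch): the Src2CL operand decoded above feeds the double-precision shifts (0F A5 SHLD and 0F AD SHRD), which shift bits from the source register into the destination with the count taken from CL. A standalone sketch of the 32-bit SHLD data path:

#include <stdint.h>
#include <stdio.h>

/* SHLD: shift dst left by count, filling the vacated low bits from src. */
static uint32_t shld32(uint32_t dst, uint32_t src, unsigned count)
{
	count &= 31;
	if (count == 0)
		return dst;
	return (dst << count) | (src >> (32 - count));
}

int main(void)
{
	/* 0x12345678 shifted left by 8 with 0x9abcdef0 feeding in -> 0x3456789a */
	printf("%#x\n", (unsigned)shld32(0x12345678, 0x9abcdef0, 8));
	return 0;
}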
*/ + return 0; + case DstReg: + decode_register_operand(&c->dst, c, + c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); break; - case OP_NONE: - /* no writeback */ + case DstMem: + if ((c->d & ModRM) && c->modrm_mod == 3) { + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.type = OP_REG; + c->dst.val = c->dst.orig_val = c->modrm_val; + c->dst.ptr = c->modrm_ptr; + break; + } + c->dst.type = OP_MEM; break; - default: + case DstAcc: + c->dst.type = OP_REG; + c->dst.bytes = c->op_bytes; + c->dst.ptr = &c->regs[VCPU_REGS_RAX]; + switch (c->op_bytes) { + case 1: + c->dst.val = *(u8 *)c->dst.ptr; + break; + case 2: + c->dst.val = *(u16 *)c->dst.ptr; + break; + case 4: + c->dst.val = *(u32 *)c->dst.ptr; + break; + } + c->dst.orig_val = c->dst.val; break; } - return X86EMUL_CONTINUE; + + if (c->rip_relative) + c->modrm_ea += c->eip; + +done: + return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; } -static inline void emulate_push(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static inline void emulate_push(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; @@ -1153,8 +1263,8 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt, c->dst.bytes = c->op_bytes; c->dst.val = c->src.val; register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); - c->dst.addr.mem.ea = register_address(c, c->regs[VCPU_REGS_RSP]); - c->dst.addr.mem.seg = VCPU_SREG_SS; + c->dst.ptr = (void *) register_address(c, ss_base(ctxt), + c->regs[VCPU_REGS_RSP]); } static int emulate_pop(struct x86_emulate_ctxt *ctxt, @@ -1163,11 +1273,10 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt, { struct decode_cache *c = &ctxt->decode; int rc; - struct segmented_address addr; - addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]); - addr.seg = VCPU_SREG_SS; - rc = read_emulated(ctxt, ops, linear(ctxt, addr), dest, len); + rc = ops->read_emulated(register_address(c, ss_base(ctxt), + c->regs[VCPU_REGS_RSP]), + dest, len, ctxt->vcpu); if (rc != X86EMUL_CONTINUE) return rc; @@ -1182,7 +1291,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, int rc; unsigned long val, change_mask; int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; - int cpl = ops->cpl(ctxt->vcpu); + int cpl = kvm_x86_ops->get_cpl(ctxt->vcpu); rc = emulate_pop(ctxt, ops, &val, len); if (rc != X86EMUL_CONTINUE) @@ -1201,8 +1310,10 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, change_mask |= EFLG_IF; break; case X86EMUL_MODE_VM86: - if (iopl < 3) - return emulate_gp(ctxt, 0); + if (iopl < 3) { + kvm_inject_gp(ctxt->vcpu, 0); + return X86EMUL_PROPAGATE_FAULT; + } change_mask |= EFLG_IF; break; default: /* real mode */ @@ -1216,14 +1327,15 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, return rc; } -static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, int seg) +static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg) { struct decode_cache *c = &ctxt->decode; + struct kvm_segment segment; - c->src.val = ops->get_segment_selector(seg, ctxt->vcpu); + kvm_x86_ops->get_segment(ctxt->vcpu, &segment, seg); - emulate_push(ctxt, ops); + c->src.val = segment.selector; + emulate_push(ctxt); } static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, @@ -1234,45 +1346,33 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, int rc; rc = emulate_pop(ctxt, ops, &selector, c->op_bytes); - if (rc != X86EMUL_CONTINUE) + if (rc != 0) return rc; - rc = load_segment_descriptor(ctxt, ops, (u16)selector, seg); + rc = 
kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, seg); return rc; } -static int emulate_pusha(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static void emulate_pusha(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; unsigned long old_esp = c->regs[VCPU_REGS_RSP]; - int rc = X86EMUL_CONTINUE; int reg = VCPU_REGS_RAX; while (reg <= VCPU_REGS_RDI) { (reg == VCPU_REGS_RSP) ? (c->src.val = old_esp) : (c->src.val = c->regs[reg]); - emulate_push(ctxt, ops); - - rc = writeback(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - return rc; - + emulate_push(ctxt); ++reg; } - - /* Disable writeback. */ - c->dst.type = OP_NONE; - - return rc; } static int emulate_popa(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; - int rc = X86EMUL_CONTINUE; + int rc = 0; int reg = VCPU_REGS_RDI; while (reg >= VCPU_REGS_RAX) { @@ -1283,160 +1383,23 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt, } rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes); - if (rc != X86EMUL_CONTINUE) + if (rc != 0) break; --reg; } return rc; } -int emulate_int_real(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, int irq) -{ - struct decode_cache *c = &ctxt->decode; - int rc; - struct kvm_desc_ptr dt; - gva_t cs_addr; - gva_t eip_addr; - u16 cs, eip; - - /* TODO: Add limit checks */ - c->src.val = ctxt->eflags; - emulate_push(ctxt, ops); - rc = writeback(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - return rc; - - ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC); - - c->src.val = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); - emulate_push(ctxt, ops); - rc = writeback(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - return rc; - - c->src.val = c->eip; - emulate_push(ctxt, ops); - rc = writeback(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - return rc; - - c->dst.type = OP_NONE; - - ops->get_idt(&dt, ctxt->vcpu); - - eip_addr = dt.address + (irq << 2); - cs_addr = dt.address + (irq << 2) + 2; - - rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &ctxt->exception); - if (rc != X86EMUL_CONTINUE) - return rc; - - rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &ctxt->exception); - if (rc != X86EMUL_CONTINUE) - return rc; - - rc = load_segment_descriptor(ctxt, ops, cs, VCPU_SREG_CS); - if (rc != X86EMUL_CONTINUE) - return rc; - - c->eip = eip; - - return rc; -} - -static int emulate_int(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, int irq) -{ - switch(ctxt->mode) { - case X86EMUL_MODE_REAL: - return emulate_int_real(ctxt, ops, irq); - case X86EMUL_MODE_VM86: - case X86EMUL_MODE_PROT16: - case X86EMUL_MODE_PROT32: - case X86EMUL_MODE_PROT64: - default: - /* Protected mode interrupts unimplemented yet */ - return X86EMUL_UNHANDLEABLE; - } -} - -static int emulate_iret_real(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - int rc = X86EMUL_CONTINUE; - unsigned long temp_eip = 0; - unsigned long temp_eflags = 0; - unsigned long cs = 0; - unsigned long mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_TF | - EFLG_IF | EFLG_DF | EFLG_OF | EFLG_IOPL | EFLG_NT | EFLG_RF | - EFLG_AC | EFLG_ID | (1 << 1); /* Last one is the reserved bit */ - unsigned long vm86_mask = EFLG_VM | EFLG_VIF | EFLG_VIP; - - /* TODO: Add stack limit check */ - - rc = emulate_pop(ctxt, ops, &temp_eip, c->op_bytes); - - if (rc != X86EMUL_CONTINUE) - return rc; - - if (temp_eip & ~0xffff) - return emulate_gp(ctxt, 0); - - rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); - - if (rc 
!= X86EMUL_CONTINUE) - return rc; - - rc = emulate_pop(ctxt, ops, &temp_eflags, c->op_bytes); - - if (rc != X86EMUL_CONTINUE) - return rc; - - rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS); - - if (rc != X86EMUL_CONTINUE) - return rc; - - c->eip = temp_eip; - - - if (c->op_bytes == 4) - ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask)); - else if (c->op_bytes == 2) { - ctxt->eflags &= ~0xffff; - ctxt->eflags |= temp_eflags; - } - - ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */ - ctxt->eflags |= EFLG_RESERVED_ONE_MASK; - - return rc; -} - -static inline int emulate_iret(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops* ops) -{ - switch(ctxt->mode) { - case X86EMUL_MODE_REAL: - return emulate_iret_real(ctxt, ops); - case X86EMUL_MODE_VM86: - case X86EMUL_MODE_PROT16: - case X86EMUL_MODE_PROT32: - case X86EMUL_MODE_PROT64: - default: - /* iret from protected mode unimplemented yet */ - return X86EMUL_UNHANDLEABLE; - } -} - static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; + int rc; - return emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes); + rc = emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes); + if (rc != 0) + return rc; + return 0; } static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) @@ -1472,9 +1435,7 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; - unsigned long *rax = &c->regs[VCPU_REGS_RAX]; - unsigned long *rdx = &c->regs[VCPU_REGS_RDX]; - u8 de = 0; + int rc = 0; switch (c->modrm_reg) { case 0 ... 1: /* test */ @@ -1486,26 +1447,12 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, case 3: /* neg */ emulate_1op("neg", c->dst, ctxt->eflags); break; - case 4: /* mul */ - emulate_1op_rax_rdx("mul", c->src, *rax, *rdx, ctxt->eflags); - break; - case 5: /* imul */ - emulate_1op_rax_rdx("imul", c->src, *rax, *rdx, ctxt->eflags); - break; - case 6: /* div */ - emulate_1op_rax_rdx_ex("div", c->src, *rax, *rdx, - ctxt->eflags, de); - break; - case 7: /* idiv */ - emulate_1op_rax_rdx_ex("idiv", c->src, *rax, *rdx, - ctxt->eflags, de); - break; default: - return X86EMUL_UNHANDLEABLE; + DPRINTF("Cannot emulate %02x\n", c->b); + rc = X86EMUL_UNHANDLEABLE; + break; } - if (de) - return emulate_de(ctxt); - return X86EMUL_CONTINUE; + return rc; } static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, @@ -1525,37 +1472,48 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, old_eip = c->eip; c->eip = c->src.val; c->src.val = old_eip; - emulate_push(ctxt, ops); + emulate_push(ctxt); break; } case 4: /* jmp abs */ c->eip = c->src.val; break; case 6: /* push */ - emulate_push(ctxt, ops); + emulate_push(ctxt); break; } - return X86EMUL_CONTINUE; + return 0; } static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) + struct x86_emulate_ops *ops, + unsigned long memop) { struct decode_cache *c = &ctxt->decode; - u64 old = c->dst.orig_val64; + u64 old, new; + int rc; + + rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) + return rc; if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) { + c->regs[VCPU_REGS_RAX] = (u32) (old >> 0); c->regs[VCPU_REGS_RDX] = (u32) (old >> 32); ctxt->eflags &= ~EFLG_ZF; + } else { - c->dst.val64 = ((u64)c->regs[VCPU_REGS_RCX] << 32) | - (u32) c->regs[VCPU_REGS_RBX]; + 
new = ((u64)c->regs[VCPU_REGS_RCX] << 32) | + (u32) c->regs[VCPU_REGS_RBX]; + rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) + return rc; ctxt->eflags |= EFLG_ZF; } - return X86EMUL_CONTINUE; + return 0; } static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, @@ -1566,108 +1524,153 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, unsigned long cs; rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes); - if (rc != X86EMUL_CONTINUE) + if (rc) return rc; if (c->op_bytes == 4) c->eip = (u32)c->eip; rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); - if (rc != X86EMUL_CONTINUE) + if (rc) return rc; - rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS); + rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, VCPU_SREG_CS); return rc; } -static int emulate_load_segment(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, int seg) +static inline int writeback(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) { - struct decode_cache *c = &ctxt->decode; - unsigned short sel; int rc; + struct decode_cache *c = &ctxt->decode; - memcpy(&sel, c->src.valptr + c->op_bytes, 2); - - rc = load_segment_descriptor(ctxt, ops, sel, seg); - if (rc != X86EMUL_CONTINUE) - return rc; + switch (c->dst.type) { + case OP_REG: + /* The 4-byte case *is* correct: + * in 64-bit mode we zero-extend. + */ + switch (c->dst.bytes) { + case 1: + *(u8 *)c->dst.ptr = (u8)c->dst.val; + break; + case 2: + *(u16 *)c->dst.ptr = (u16)c->dst.val; + break; + case 4: + *c->dst.ptr = (u32)c->dst.val; + break; /* 64b: zero-ext */ + case 8: + *c->dst.ptr = c->dst.val; + break; + } + break; + case OP_MEM: + if (c->lock_prefix) + rc = ops->cmpxchg_emulated( + (unsigned long)c->dst.ptr, + &c->dst.orig_val, + &c->dst.val, + c->dst.bytes, + ctxt->vcpu); + else + rc = ops->write_emulated( + (unsigned long)c->dst.ptr, + &c->dst.val, + c->dst.bytes, + ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) + return rc; + break; + case OP_NONE: + /* no writeback */ + break; + default: + break; + } + return 0; +} - c->dst.val = c->src.val; - return rc; +static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask) +{ + u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask); + /* + * an sti; sti; sequence only disable interrupts for the first + * instruction. So, if the last instruction, be it emulated or + * not, left the system with the INT_STI flag enabled, it + * means that the last instruction is an sti. We should not + * leave the flag on in this case. 
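Aside (not part of the patch): the group-9 handler above emulates CMPXCHG8B: EDX:EAX is compared with the 64-bit memory operand; on a match ECX:EBX is stored and ZF is set, otherwise the memory value is loaded into EDX:EAX and ZF is cleared. A standalone sketch of those semantics, without the guest-memory plumbing:

#include <stdint.h>
#include <stdio.h>

/* CMPXCHG8B on a plain 64-bit variable; returns the resulting ZF. */
static int cmpxchg8b(uint64_t *mem, uint32_t *eax, uint32_t *edx,
                     uint32_t ebx, uint32_t ecx)
{
	uint64_t expect = ((uint64_t)*edx << 32) | *eax;

	if (*mem == expect) {
		*mem = ((uint64_t)ecx << 32) | ebx;
		return 1;			/* ZF set */
	}
	*eax = (uint32_t)*mem;
	*edx = (uint32_t)(*mem >> 32);
	return 0;				/* ZF clear */
}

int main(void)
{
	uint64_t m = 0x1111111122222222ull;
	uint32_t eax = 0x22222222, edx = 0x11111111;
	int zf = cmpxchg8b(&m, &eax, &edx, 0xdeadbeef, 0xcafef00d);

	printf("zf=%d mem=%#llx\n", zf, (unsigned long long)m);
	return 0;
}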
The same goes for mov ss + */ + if (!(int_shadow & mask)) + ctxt->interruptibility = mask; } static inline void setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, struct kvm_desc_struct *cs, - struct kvm_desc_struct *ss) + struct kvm_segment *cs, struct kvm_segment *ss) { - memset(cs, 0, sizeof(struct kvm_desc_struct)); - ops->get_cached_descriptor(cs, VCPU_SREG_CS, ctxt->vcpu); - memset(ss, 0, sizeof(struct kvm_desc_struct)); + memset(cs, 0, sizeof(struct kvm_segment)); + kvm_x86_ops->get_segment(ctxt->vcpu, cs, VCPU_SREG_CS); + memset(ss, 0, sizeof(struct kvm_segment)); cs->l = 0; /* will be adjusted later */ - kvm_set_desc_base(cs, 0); /* flat segment */ + cs->base = 0; /* flat segment */ cs->g = 1; /* 4kb granularity */ - kvm_set_desc_limit(cs, 0xfffff); /* 4GB limit */ + cs->limit = 0xffffffff; /* 4GB limit */ cs->type = 0x0b; /* Read, Execute, Accessed */ cs->s = 1; cs->dpl = 0; /* will be adjusted later */ - cs->p = 1; - cs->d = 1; + cs->present = 1; + cs->db = 1; - kvm_set_desc_base(ss, 0); /* flat segment */ - kvm_set_desc_limit(ss, 0xfffff); /* 4GB limit */ + ss->unusable = 0; + ss->base = 0; /* flat segment */ + ss->limit = 0xffffffff; /* 4GB limit */ ss->g = 1; /* 4kb granularity */ ss->s = 1; ss->type = 0x03; /* Read/Write, Accessed */ - ss->d = 1; /* 32bit stack segment */ + ss->db = 1; /* 32bit stack segment */ ss->dpl = 0; - ss->p = 1; + ss->present = 1; } static int -emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) +emulate_syscall(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; - struct kvm_desc_struct cs, ss; + struct kvm_segment cs, ss; u64 msr_data; - u16 cs_sel, ss_sel; /* syscall is not available in real mode */ - if (ctxt->mode == X86EMUL_MODE_REAL || - ctxt->mode == X86EMUL_MODE_VM86) - return emulate_ud(ctxt); + if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86) + return X86EMUL_UNHANDLEABLE; - setup_syscalls_segments(ctxt, ops, &cs, &ss); + setup_syscalls_segments(ctxt, &cs, &ss); - ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); msr_data >>= 32; - cs_sel = (u16)(msr_data & 0xfffc); - ss_sel = (u16)(msr_data + 8); + cs.selector = (u16)(msr_data & 0xfffc); + ss.selector = (u16)(msr_data + 8); if (is_long_mode(ctxt->vcpu)) { - cs.d = 0; + cs.db = 0; cs.l = 1; } - ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); - ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); - ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); - ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); + kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); + kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); c->regs[VCPU_REGS_RCX] = c->eip; if (is_long_mode(ctxt->vcpu)) { #ifdef CONFIG_X86_64 c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; - ops->get_msr(ctxt->vcpu, - ctxt->mode == X86EMUL_MODE_PROT64 ? - MSR_LSTAR : MSR_CSTAR, &msr_data); + kvm_x86_ops->get_msr(ctxt->vcpu, + ctxt->mode == X86EMUL_MODE_PROT64 ? 
+ MSR_LSTAR : MSR_CSTAR, &msr_data); c->eip = msr_data; - ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data); + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data); ctxt->eflags &= ~(msr_data | EFLG_RF); #endif } else { /* legacy mode */ - ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); c->eip = (u32)msr_data; ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); @@ -1677,77 +1680,81 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) } static int -emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) +emulate_sysenter(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; - struct kvm_desc_struct cs, ss; + struct kvm_segment cs, ss; u64 msr_data; - u16 cs_sel, ss_sel; /* inject #GP if in real mode */ - if (ctxt->mode == X86EMUL_MODE_REAL) - return emulate_gp(ctxt, 0); + if (ctxt->mode == X86EMUL_MODE_REAL) { + kvm_inject_gp(ctxt->vcpu, 0); + return X86EMUL_UNHANDLEABLE; + } /* XXX sysenter/sysexit have not been tested in 64bit mode. * Therefore, we inject an #UD. */ if (ctxt->mode == X86EMUL_MODE_PROT64) - return emulate_ud(ctxt); + return X86EMUL_UNHANDLEABLE; - setup_syscalls_segments(ctxt, ops, &cs, &ss); + setup_syscalls_segments(ctxt, &cs, &ss); - ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); switch (ctxt->mode) { case X86EMUL_MODE_PROT32: - if ((msr_data & 0xfffc) == 0x0) - return emulate_gp(ctxt, 0); + if ((msr_data & 0xfffc) == 0x0) { + kvm_inject_gp(ctxt->vcpu, 0); + return X86EMUL_PROPAGATE_FAULT; + } break; case X86EMUL_MODE_PROT64: - if (msr_data == 0x0) - return emulate_gp(ctxt, 0); + if (msr_data == 0x0) { + kvm_inject_gp(ctxt->vcpu, 0); + return X86EMUL_PROPAGATE_FAULT; + } break; } ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); - cs_sel = (u16)msr_data; - cs_sel &= ~SELECTOR_RPL_MASK; - ss_sel = cs_sel + 8; - ss_sel &= ~SELECTOR_RPL_MASK; + cs.selector = (u16)msr_data; + cs.selector &= ~SELECTOR_RPL_MASK; + ss.selector = cs.selector + 8; + ss.selector &= ~SELECTOR_RPL_MASK; if (ctxt->mode == X86EMUL_MODE_PROT64 || is_long_mode(ctxt->vcpu)) { - cs.d = 0; + cs.db = 0; cs.l = 1; } - ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); - ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); - ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); - ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); + kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); + kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); - ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); c->eip = msr_data; - ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); c->regs[VCPU_REGS_RSP] = msr_data; return X86EMUL_CONTINUE; } static int -emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) +emulate_sysexit(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; - struct kvm_desc_struct cs, ss; + struct kvm_segment cs, ss; u64 msr_data; int usermode; - u16 cs_sel, ss_sel; /* inject #GP if in real mode or Virtual 8086 mode */ if (ctxt->mode == X86EMUL_MODE_REAL || - ctxt->mode == X86EMUL_MODE_VM86) - return emulate_gp(ctxt, 0); + ctxt->mode == X86EMUL_MODE_VM86) { + kvm_inject_gp(ctxt->vcpu, 0); + return X86EMUL_UNHANDLEABLE; + } - setup_syscalls_segments(ctxt, ops, &cs, &ss); 
+ setup_syscalls_segments(ctxt, &cs, &ss); if ((c->rex_prefix & 0x8) != 0x0) usermode = X86EMUL_MODE_PROT64; @@ -1756,39 +1763,40 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) cs.dpl = 3; ss.dpl = 3; - ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); switch (usermode) { case X86EMUL_MODE_PROT32: - cs_sel = (u16)(msr_data + 16); - if ((msr_data & 0xfffc) == 0x0) - return emulate_gp(ctxt, 0); - ss_sel = (u16)(msr_data + 24); + cs.selector = (u16)(msr_data + 16); + if ((msr_data & 0xfffc) == 0x0) { + kvm_inject_gp(ctxt->vcpu, 0); + return X86EMUL_PROPAGATE_FAULT; + } + ss.selector = (u16)(msr_data + 24); break; case X86EMUL_MODE_PROT64: - cs_sel = (u16)(msr_data + 32); - if (msr_data == 0x0) - return emulate_gp(ctxt, 0); - ss_sel = cs_sel + 8; - cs.d = 0; + cs.selector = (u16)(msr_data + 32); + if (msr_data == 0x0) { + kvm_inject_gp(ctxt->vcpu, 0); + return X86EMUL_PROPAGATE_FAULT; + } + ss.selector = cs.selector + 8; + cs.db = 0; cs.l = 1; break; } - cs_sel |= SELECTOR_RPL_MASK; - ss_sel |= SELECTOR_RPL_MASK; + cs.selector |= SELECTOR_RPL_MASK; + ss.selector |= SELECTOR_RPL_MASK; - ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); - ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); - ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); - ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); + kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); + kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); - c->eip = c->regs[VCPU_REGS_RDX]; - c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX]; + c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX]; + c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX]; return X86EMUL_CONTINUE; } -static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt) { int iopl; if (ctxt->mode == X86EMUL_MODE_REAL) @@ -1796,32 +1804,32 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt, if (ctxt->mode == X86EMUL_MODE_VM86) return true; iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; - return ops->cpl(ctxt->vcpu) > iopl; + return kvm_x86_ops->get_cpl(ctxt->vcpu) > iopl; } static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, u16 port, u16 len) { - struct kvm_desc_struct tr_seg; + struct kvm_segment tr_seg; int r; u16 io_bitmap_ptr; u8 perm, bit_idx = port & 0x7; unsigned mask = (1 << len) - 1; - ops->get_cached_descriptor(&tr_seg, VCPU_SREG_TR, ctxt->vcpu); - if (!tr_seg.p) + kvm_get_segment(ctxt->vcpu, &tr_seg, VCPU_SREG_TR); + if (tr_seg.unusable) return false; - if (desc_limit_scaled(&tr_seg) < 103) + if (tr_seg.limit < 103) return false; - r = ops->read_std(kvm_get_desc_base(&tr_seg) + 102, &io_bitmap_ptr, 2, - ctxt->vcpu, NULL); + r = ops->read_std(tr_seg.base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, + NULL); if (r != X86EMUL_CONTINUE) return false; - if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg)) + if (io_bitmap_ptr + port/8 > tr_seg.limit) return false; - r = ops->read_std(kvm_get_desc_base(&tr_seg) + io_bitmap_ptr + port/8, - &perm, 1, ctxt->vcpu, NULL); + r = ops->read_std(tr_seg.base + io_bitmap_ptr + port/8, &perm, 1, + ctxt->vcpu, NULL); if (r != X86EMUL_CONTINUE) return false; if ((perm >> bit_idx) & mask) @@ -1833,1231 +1841,118 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, u16 port, u16 len) { - if 
(ctxt->perm_ok) - return true; - - if (emulator_bad_iopl(ctxt, ops)) + if (emulator_bad_iopl(ctxt)) if (!emulator_io_port_access_allowed(ctxt, ops, port, len)) return false; - - ctxt->perm_ok = true; - return true; } -static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - struct tss_segment_16 *tss) -{ - struct decode_cache *c = &ctxt->decode; - - tss->ip = c->eip; - tss->flag = ctxt->eflags; - tss->ax = c->regs[VCPU_REGS_RAX]; - tss->cx = c->regs[VCPU_REGS_RCX]; - tss->dx = c->regs[VCPU_REGS_RDX]; - tss->bx = c->regs[VCPU_REGS_RBX]; - tss->sp = c->regs[VCPU_REGS_RSP]; - tss->bp = c->regs[VCPU_REGS_RBP]; - tss->si = c->regs[VCPU_REGS_RSI]; - tss->di = c->regs[VCPU_REGS_RDI]; - - tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu); - tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); - tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu); - tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu); - tss->ldt = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu); -} - -static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - struct tss_segment_16 *tss) -{ - struct decode_cache *c = &ctxt->decode; - int ret; - - c->eip = tss->ip; - ctxt->eflags = tss->flag | 2; - c->regs[VCPU_REGS_RAX] = tss->ax; - c->regs[VCPU_REGS_RCX] = tss->cx; - c->regs[VCPU_REGS_RDX] = tss->dx; - c->regs[VCPU_REGS_RBX] = tss->bx; - c->regs[VCPU_REGS_RSP] = tss->sp; - c->regs[VCPU_REGS_RBP] = tss->bp; - c->regs[VCPU_REGS_RSI] = tss->si; - c->regs[VCPU_REGS_RDI] = tss->di; - - /* - * SDM says that segment selectors are loaded before segment - * descriptors - */ - ops->set_segment_selector(tss->ldt, VCPU_SREG_LDTR, ctxt->vcpu); - ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu); - ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu); - ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu); - ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu); - - /* - * Now load segment descriptors. 
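Aside (not part of the patch): the 16-bit task-switch helpers above shuttle registers and segment selectors through a tss_segment_16 image. A rough standalone sketch of the architectural 16-bit TSS layout those fields come from; treat the struct as illustrative, not the header's actual definition:

#include <stdint.h>
#include <stdio.h>

/* Field order per the 16-bit TSS format in the SDM. */
struct tss16 {
	uint16_t prev_task_link;
	uint16_t sp0, ss0, sp1, ss1, sp2, ss2;
	uint16_t ip, flag;
	uint16_t ax, cx, dx, bx, sp, bp, si, di;
	uint16_t es, cs, ss, ds;
	uint16_t ldt;
};

int main(void)
{
	/* 44 bytes, which matches the minimum descriptor limit of 0x2b
	 * enforced by the task-switch code. */
	printf("sizeof(struct tss16) = %zu\n", sizeof(struct tss16));
	return 0;
}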
If fault happenes at this stage - * it is handled in a context of new task - */ - ret = load_segment_descriptor(ctxt, ops, tss->ldt, VCPU_SREG_LDTR); - if (ret != X86EMUL_CONTINUE) - return ret; - ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES); - if (ret != X86EMUL_CONTINUE) - return ret; - ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS); - if (ret != X86EMUL_CONTINUE) - return ret; - ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS); - if (ret != X86EMUL_CONTINUE) - return ret; - ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS); - if (ret != X86EMUL_CONTINUE) - return ret; - - return X86EMUL_CONTINUE; -} - -static int task_switch_16(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - u16 tss_selector, u16 old_tss_sel, - ulong old_tss_base, struct kvm_desc_struct *new_desc) -{ - struct tss_segment_16 tss_seg; - int ret; - u32 new_tss_base = kvm_get_desc_base(new_desc); - - ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, - &ctxt->exception); - if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ - return ret; - - save_state_to_tss16(ctxt, ops, &tss_seg); - - ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, - &ctxt->exception); - if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ - return ret; - - ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, - &ctxt->exception); - if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ - return ret; - - if (old_tss_sel != 0xffff) { - tss_seg.prev_task_link = old_tss_sel; - - ret = ops->write_std(new_tss_base, - &tss_seg.prev_task_link, - sizeof tss_seg.prev_task_link, - ctxt->vcpu, &ctxt->exception); - if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ - return ret; - } - - return load_state_from_tss16(ctxt, ops, &tss_seg); -} - -static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - struct tss_segment_32 *tss) -{ - struct decode_cache *c = &ctxt->decode; - - tss->cr3 = ops->get_cr(3, ctxt->vcpu); - tss->eip = c->eip; - tss->eflags = ctxt->eflags; - tss->eax = c->regs[VCPU_REGS_RAX]; - tss->ecx = c->regs[VCPU_REGS_RCX]; - tss->edx = c->regs[VCPU_REGS_RDX]; - tss->ebx = c->regs[VCPU_REGS_RBX]; - tss->esp = c->regs[VCPU_REGS_RSP]; - tss->ebp = c->regs[VCPU_REGS_RBP]; - tss->esi = c->regs[VCPU_REGS_RSI]; - tss->edi = c->regs[VCPU_REGS_RDI]; - - tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu); - tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); - tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu); - tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu); - tss->fs = ops->get_segment_selector(VCPU_SREG_FS, ctxt->vcpu); - tss->gs = ops->get_segment_selector(VCPU_SREG_GS, ctxt->vcpu); - tss->ldt_selector = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu); -} - -static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - struct tss_segment_32 *tss) -{ - struct decode_cache *c = &ctxt->decode; - int ret; - - if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) - return emulate_gp(ctxt, 0); - c->eip = tss->eip; - ctxt->eflags = tss->eflags | 2; - c->regs[VCPU_REGS_RAX] = tss->eax; - c->regs[VCPU_REGS_RCX] = tss->ecx; - c->regs[VCPU_REGS_RDX] = tss->edx; - c->regs[VCPU_REGS_RBX] = tss->ebx; - c->regs[VCPU_REGS_RSP] = tss->esp; - c->regs[VCPU_REGS_RBP] = tss->ebp; - c->regs[VCPU_REGS_RSI] = 
tss->esi; - c->regs[VCPU_REGS_RDI] = tss->edi; - - /* - * SDM says that segment selectors are loaded before segment - * descriptors - */ - ops->set_segment_selector(tss->ldt_selector, VCPU_SREG_LDTR, ctxt->vcpu); - ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu); - ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu); - ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu); - ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu); - ops->set_segment_selector(tss->fs, VCPU_SREG_FS, ctxt->vcpu); - ops->set_segment_selector(tss->gs, VCPU_SREG_GS, ctxt->vcpu); - - /* - * Now load segment descriptors. If fault happenes at this stage - * it is handled in a context of new task - */ - ret = load_segment_descriptor(ctxt, ops, tss->ldt_selector, VCPU_SREG_LDTR); - if (ret != X86EMUL_CONTINUE) - return ret; - ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES); - if (ret != X86EMUL_CONTINUE) - return ret; - ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS); - if (ret != X86EMUL_CONTINUE) - return ret; - ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS); - if (ret != X86EMUL_CONTINUE) - return ret; - ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS); - if (ret != X86EMUL_CONTINUE) - return ret; - ret = load_segment_descriptor(ctxt, ops, tss->fs, VCPU_SREG_FS); - if (ret != X86EMUL_CONTINUE) - return ret; - ret = load_segment_descriptor(ctxt, ops, tss->gs, VCPU_SREG_GS); - if (ret != X86EMUL_CONTINUE) - return ret; - - return X86EMUL_CONTINUE; -} - -static int task_switch_32(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - u16 tss_selector, u16 old_tss_sel, - ulong old_tss_base, struct kvm_desc_struct *new_desc) -{ - struct tss_segment_32 tss_seg; - int ret; - u32 new_tss_base = kvm_get_desc_base(new_desc); - - ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, - &ctxt->exception); - if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ - return ret; - - save_state_to_tss32(ctxt, ops, &tss_seg); - - ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, - &ctxt->exception); - if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ - return ret; - - ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, - &ctxt->exception); - if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ - return ret; - - if (old_tss_sel != 0xffff) { - tss_seg.prev_task_link = old_tss_sel; - - ret = ops->write_std(new_tss_base, - &tss_seg.prev_task_link, - sizeof tss_seg.prev_task_link, - ctxt->vcpu, &ctxt->exception); - if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ - return ret; - } - - return load_state_from_tss32(ctxt, ops, &tss_seg); -} - -static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - u16 tss_selector, int reason, - bool has_error_code, u32 error_code) -{ - struct kvm_desc_struct curr_tss_desc, next_tss_desc; - int ret; - u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu); - ulong old_tss_base = - ops->get_cached_segment_base(VCPU_SREG_TR, ctxt->vcpu); - u32 desc_limit; - - /* FIXME: old_tss_base == ~0 ? 
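Aside (not part of the patch): emulator_do_task_switch(), which begins here, sanity-checks the incoming TSS descriptor's limit before switching: a 32-bit TSS needs at least 0x68 bytes (limit 0x67) and a 16-bit one at least 0x2c (limit 0x2b), otherwise #TS is raised. A standalone sketch of that check:

#include <stdint.h>
#include <stdio.h>

/* Minimum TSS descriptor limits: 0x67 for a 32-bit TSS, 0x2b for 16-bit. */
static int tss_limit_ok(uint32_t limit, int is_32bit_tss)
{
	return limit >= (is_32bit_tss ? 0x67u : 0x2bu);
}

int main(void)
{
	printf("%d %d\n", tss_limit_ok(0x67, 1), tss_limit_ok(0x2a, 0));  /* 1 0 */
	return 0;
}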
*/ - - ret = read_segment_descriptor(ctxt, ops, tss_selector, &next_tss_desc); - if (ret != X86EMUL_CONTINUE) - return ret; - ret = read_segment_descriptor(ctxt, ops, old_tss_sel, &curr_tss_desc); - if (ret != X86EMUL_CONTINUE) - return ret; - - /* FIXME: check that next_tss_desc is tss */ - - if (reason != TASK_SWITCH_IRET) { - if ((tss_selector & 3) > next_tss_desc.dpl || - ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) - return emulate_gp(ctxt, 0); - } - - desc_limit = desc_limit_scaled(&next_tss_desc); - if (!next_tss_desc.p || - ((desc_limit < 0x67 && (next_tss_desc.type & 8)) || - desc_limit < 0x2b)) { - emulate_ts(ctxt, tss_selector & 0xfffc); - return X86EMUL_PROPAGATE_FAULT; - } - - if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { - curr_tss_desc.type &= ~(1 << 1); /* clear busy flag */ - write_segment_descriptor(ctxt, ops, old_tss_sel, - &curr_tss_desc); - } - - if (reason == TASK_SWITCH_IRET) - ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT; - - /* set back link to prev task only if NT bit is set in eflags - note that old_tss_sel is not used afetr this point */ - if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) - old_tss_sel = 0xffff; - - if (next_tss_desc.type & 8) - ret = task_switch_32(ctxt, ops, tss_selector, old_tss_sel, - old_tss_base, &next_tss_desc); - else - ret = task_switch_16(ctxt, ops, tss_selector, old_tss_sel, - old_tss_base, &next_tss_desc); - if (ret != X86EMUL_CONTINUE) - return ret; - - if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) - ctxt->eflags = ctxt->eflags | X86_EFLAGS_NT; - - if (reason != TASK_SWITCH_IRET) { - next_tss_desc.type |= (1 << 1); /* set busy flag */ - write_segment_descriptor(ctxt, ops, tss_selector, - &next_tss_desc); - } - - ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu); - ops->set_cached_descriptor(&next_tss_desc, VCPU_SREG_TR, ctxt->vcpu); - ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu); - - if (has_error_code) { - struct decode_cache *c = &ctxt->decode; - - c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2; - c->lock_prefix = 0; - c->src.val = (unsigned long) error_code; - emulate_push(ctxt, ops); - } - - return ret; -} - -int emulator_task_switch(struct x86_emulate_ctxt *ctxt, - u16 tss_selector, int reason, - bool has_error_code, u32 error_code) -{ - struct x86_emulate_ops *ops = ctxt->ops; - struct decode_cache *c = &ctxt->decode; - int rc; - - c->eip = ctxt->eip; - c->dst.type = OP_NONE; - - rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason, - has_error_code, error_code); - - if (rc == X86EMUL_CONTINUE) { - rc = writeback(ctxt, ops); - if (rc == X86EMUL_CONTINUE) - ctxt->eip = c->eip; - } - - return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; -} - -static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg, - int reg, struct operand *op) -{ - struct decode_cache *c = &ctxt->decode; - int df = (ctxt->eflags & EFLG_DF) ? 
-1 : 1; - - register_address_increment(c, &c->regs[reg], df * op->bytes); - op->addr.mem.ea = register_address(c, c->regs[reg]); - op->addr.mem.seg = seg; -} - -static int em_push(struct x86_emulate_ctxt *ctxt) -{ - emulate_push(ctxt, ctxt->ops); - return X86EMUL_CONTINUE; -} - -static int em_das(struct x86_emulate_ctxt *ctxt) -{ - struct decode_cache *c = &ctxt->decode; - u8 al, old_al; - bool af, cf, old_cf; - - cf = ctxt->eflags & X86_EFLAGS_CF; - al = c->dst.val; - - old_al = al; - old_cf = cf; - cf = false; - af = ctxt->eflags & X86_EFLAGS_AF; - if ((al & 0x0f) > 9 || af) { - al -= 6; - cf = old_cf | (al >= 250); - af = true; - } else { - af = false; - } - if (old_al > 0x99 || old_cf) { - al -= 0x60; - cf = true; - } - - c->dst.val = al; - /* Set PF, ZF, SF */ - c->src.type = OP_IMM; - c->src.val = 0; - c->src.bytes = 1; - emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); - ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF); - if (cf) - ctxt->eflags |= X86_EFLAGS_CF; - if (af) - ctxt->eflags |= X86_EFLAGS_AF; - return X86EMUL_CONTINUE; -} - -static int em_call_far(struct x86_emulate_ctxt *ctxt) -{ - struct decode_cache *c = &ctxt->decode; - u16 sel, old_cs; - ulong old_eip; - int rc; - - old_cs = ctxt->ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); - old_eip = c->eip; - - memcpy(&sel, c->src.valptr + c->op_bytes, 2); - if (load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS)) - return X86EMUL_CONTINUE; - - c->eip = 0; - memcpy(&c->eip, c->src.valptr, c->op_bytes); - - c->src.val = old_cs; - emulate_push(ctxt, ctxt->ops); - rc = writeback(ctxt, ctxt->ops); - if (rc != X86EMUL_CONTINUE) - return rc; - - c->src.val = old_eip; - emulate_push(ctxt, ctxt->ops); - rc = writeback(ctxt, ctxt->ops); - if (rc != X86EMUL_CONTINUE) - return rc; - - c->dst.type = OP_NONE; - - return X86EMUL_CONTINUE; -} - -static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) -{ - struct decode_cache *c = &ctxt->decode; - int rc; - - c->dst.type = OP_REG; - c->dst.addr.reg = &c->eip; - c->dst.bytes = c->op_bytes; - rc = emulate_pop(ctxt, ctxt->ops, &c->dst.val, c->op_bytes); - if (rc != X86EMUL_CONTINUE) - return rc; - register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.val); - return X86EMUL_CONTINUE; -} - -static int em_imul(struct x86_emulate_ctxt *ctxt) -{ - struct decode_cache *c = &ctxt->decode; - - emulate_2op_SrcV_nobyte("imul", c->src, c->dst, ctxt->eflags); - return X86EMUL_CONTINUE; -} - -static int em_imul_3op(struct x86_emulate_ctxt *ctxt) -{ - struct decode_cache *c = &ctxt->decode; - - c->dst.val = c->src2.val; - return em_imul(ctxt); -} - -static int em_cwd(struct x86_emulate_ctxt *ctxt) -{ - struct decode_cache *c = &ctxt->decode; - - c->dst.type = OP_REG; - c->dst.bytes = c->src.bytes; - c->dst.addr.reg = &c->regs[VCPU_REGS_RDX]; - c->dst.val = ~((c->src.val >> (c->src.bytes * 8 - 1)) - 1); - - return X86EMUL_CONTINUE; -} - -static int em_rdtsc(struct x86_emulate_ctxt *ctxt) -{ - unsigned cpl = ctxt->ops->cpl(ctxt->vcpu); - struct decode_cache *c = &ctxt->decode; - u64 tsc = 0; - - if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD)) - return emulate_gp(ctxt, 0); - ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc); - c->regs[VCPU_REGS_RAX] = (u32)tsc; - c->regs[VCPU_REGS_RDX] = tsc >> 32; - return X86EMUL_CONTINUE; -} - -static int em_mov(struct x86_emulate_ctxt *ctxt) -{ - struct decode_cache *c = &ctxt->decode; - c->dst.val = c->src.val; - return X86EMUL_CONTINUE; -} - -#define D(_y) { .flags = (_y) } -#define N D(0) -#define G(_f, _g) { .flags = ((_f) | 
Group), .u.group = (_g) } -#define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) } -#define I(_f, _e) { .flags = (_f), .u.execute = (_e) } - -#define D2bv(_f) D((_f) | ByteOp), D(_f) -#define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e) - -#define D6ALU(_f) D2bv((_f) | DstMem | SrcReg | ModRM), \ - D2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock), \ - D2bv(((_f) & ~Lock) | DstAcc | SrcImm) - - -static struct opcode group1[] = { - X7(D(Lock)), N -}; - -static struct opcode group1A[] = { - D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N, -}; - -static struct opcode group3[] = { - D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM), - D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), - X4(D(SrcMem | ModRM)), -}; - -static struct opcode group4[] = { - D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock), - N, N, N, N, N, N, -}; - -static struct opcode group5[] = { - D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), - D(SrcMem | ModRM | Stack), - I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far), - D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps), - D(SrcMem | ModRM | Stack), N, -}; - -static struct group_dual group7 = { { - N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv), - D(SrcNone | ModRM | DstMem | Mov), N, - D(SrcMem16 | ModRM | Mov | Priv), - D(SrcMem | ModRM | ByteOp | Priv | NoAccess), -}, { - D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv), - D(SrcNone | ModRM | DstMem | Mov), N, - D(SrcMem16 | ModRM | Mov | Priv), N, -} }; - -static struct opcode group8[] = { - N, N, N, N, - D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock), - D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock), -}; - -static struct group_dual group9 = { { - N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N, -}, { - N, N, N, N, N, N, N, N, -} }; - -static struct opcode group11[] = { - I(DstMem | SrcImm | ModRM | Mov, em_mov), X7(D(Undefined)), -}; - -static struct opcode opcode_table[256] = { - /* 0x00 - 0x07 */ - D6ALU(Lock), - D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), - /* 0x08 - 0x0F */ - D6ALU(Lock), - D(ImplicitOps | Stack | No64), N, - /* 0x10 - 0x17 */ - D6ALU(Lock), - D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), - /* 0x18 - 0x1F */ - D6ALU(Lock), - D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), - /* 0x20 - 0x27 */ - D6ALU(Lock), N, N, - /* 0x28 - 0x2F */ - D6ALU(Lock), N, I(ByteOp | DstAcc | No64, em_das), - /* 0x30 - 0x37 */ - D6ALU(Lock), N, N, - /* 0x38 - 0x3F */ - D6ALU(0), N, N, - /* 0x40 - 0x4F */ - X16(D(DstReg)), - /* 0x50 - 0x57 */ - X8(I(SrcReg | Stack, em_push)), - /* 0x58 - 0x5F */ - X8(D(DstReg | Stack)), - /* 0x60 - 0x67 */ - D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), - N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ , - N, N, N, N, - /* 0x68 - 0x6F */ - I(SrcImm | Mov | Stack, em_push), - I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op), - I(SrcImmByte | Mov | Stack, em_push), - I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op), - D2bv(DstDI | Mov | String), /* insb, insw/insd */ - D2bv(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */ - /* 0x70 - 0x7F */ - X16(D(SrcImmByte)), - /* 0x80 - 0x87 */ - G(ByteOp | DstMem | SrcImm | ModRM | Group, group1), - G(DstMem | SrcImm | ModRM | Group, group1), - G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1), - G(DstMem | SrcImmByte | ModRM | Group, 
group1), - D2bv(DstMem | SrcReg | ModRM), D2bv(DstMem | SrcReg | ModRM | Lock), - /* 0x88 - 0x8F */ - I2bv(DstMem | SrcReg | ModRM | Mov, em_mov), - I2bv(DstReg | SrcMem | ModRM | Mov, em_mov), - D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg), - D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A), - /* 0x90 - 0x97 */ - X8(D(SrcAcc | DstReg)), - /* 0x98 - 0x9F */ - D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), - I(SrcImmFAddr | No64, em_call_far), N, - D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N, - /* 0xA0 - 0xA7 */ - I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), - I2bv(DstMem | SrcAcc | Mov | MemAbs, em_mov), - I2bv(SrcSI | DstDI | Mov | String, em_mov), - D2bv(SrcSI | DstDI | String), - /* 0xA8 - 0xAF */ - D2bv(DstAcc | SrcImm), - I2bv(SrcAcc | DstDI | Mov | String, em_mov), - I2bv(SrcSI | DstAcc | Mov | String, em_mov), - D2bv(SrcAcc | DstDI | String), - /* 0xB0 - 0xB7 */ - X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)), - /* 0xB8 - 0xBF */ - X8(I(DstReg | SrcImm | Mov, em_mov)), - /* 0xC0 - 0xC7 */ - D2bv(DstMem | SrcImmByte | ModRM), - I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm), - D(ImplicitOps | Stack), - D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64), - G(ByteOp, group11), G(0, group11), - /* 0xC8 - 0xCF */ - N, N, N, D(ImplicitOps | Stack), - D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps), - /* 0xD0 - 0xD7 */ - D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM), - N, N, N, N, - /* 0xD8 - 0xDF */ - N, N, N, N, N, N, N, N, - /* 0xE0 - 0xE7 */ - X4(D(SrcImmByte)), - D2bv(SrcImmUByte | DstAcc), D2bv(SrcAcc | DstImmUByte), - /* 0xE8 - 0xEF */ - D(SrcImm | Stack), D(SrcImm | ImplicitOps), - D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps), - D2bv(SrcNone | DstAcc), D2bv(SrcAcc | ImplicitOps), - /* 0xF0 - 0xF7 */ - N, N, N, N, - D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3), - /* 0xF8 - 0xFF */ - D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), - D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5), -}; - -static struct opcode twobyte_table[256] = { - /* 0x00 - 0x0F */ - N, GD(0, &group7), N, N, - N, D(ImplicitOps), D(ImplicitOps | Priv), N, - D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N, - N, D(ImplicitOps | ModRM), N, N, - /* 0x10 - 0x1F */ - N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N, - /* 0x20 - 0x2F */ - D(ModRM | DstMem | Priv | Op3264), D(ModRM | DstMem | Priv | Op3264), - D(ModRM | SrcMem | Priv | Op3264), D(ModRM | SrcMem | Priv | Op3264), - N, N, N, N, - N, N, N, N, N, N, N, N, - /* 0x30 - 0x3F */ - D(ImplicitOps | Priv), I(ImplicitOps, em_rdtsc), - D(ImplicitOps | Priv), N, - D(ImplicitOps), D(ImplicitOps | Priv), N, N, - N, N, N, N, N, N, N, N, - /* 0x40 - 0x4F */ - X16(D(DstReg | SrcMem | ModRM | Mov)), - /* 0x50 - 0x5F */ - N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, - /* 0x60 - 0x6F */ - N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, - /* 0x70 - 0x7F */ - N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, - /* 0x80 - 0x8F */ - X16(D(SrcImm)), - /* 0x90 - 0x9F */ - X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), - /* 0xA0 - 0xA7 */ - D(ImplicitOps | Stack), D(ImplicitOps | Stack), - N, D(DstMem | SrcReg | ModRM | BitOp), - D(DstMem | SrcReg | Src2ImmByte | ModRM), - D(DstMem | SrcReg | Src2CL | ModRM), N, N, - /* 0xA8 - 0xAF */ - D(ImplicitOps | Stack), D(ImplicitOps | Stack), - N, D(DstMem | SrcReg | ModRM | BitOp | Lock), - D(DstMem | SrcReg | Src2ImmByte | ModRM), - D(DstMem | SrcReg | Src2CL 
| ModRM), - D(ModRM), I(DstReg | SrcMem | ModRM, em_imul), - /* 0xB0 - 0xB7 */ - D2bv(DstMem | SrcReg | ModRM | Lock), - D(DstReg | SrcMemFAddr | ModRM), D(DstMem | SrcReg | ModRM | BitOp | Lock), - D(DstReg | SrcMemFAddr | ModRM), D(DstReg | SrcMemFAddr | ModRM), - D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), - /* 0xB8 - 0xBF */ - N, N, - G(BitOp, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock), - D(DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), - D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), - /* 0xC0 - 0xCF */ - D2bv(DstMem | SrcReg | ModRM | Lock), - N, D(DstMem | SrcReg | ModRM | Mov), - N, N, N, GD(0, &group9), - N, N, N, N, N, N, N, N, - /* 0xD0 - 0xDF */ - N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, - /* 0xE0 - 0xEF */ - N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, - /* 0xF0 - 0xFF */ - N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N -}; - -#undef D -#undef N -#undef G -#undef GD -#undef I - -#undef D2bv -#undef I2bv -#undef D6ALU - -static unsigned imm_size(struct decode_cache *c) -{ - unsigned size; - - size = (c->d & ByteOp) ? 1 : c->op_bytes; - if (size == 8) - size = 4; - return size; -} - -static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op, - unsigned size, bool sign_extension) -{ - struct decode_cache *c = &ctxt->decode; - struct x86_emulate_ops *ops = ctxt->ops; - int rc = X86EMUL_CONTINUE; - - op->type = OP_IMM; - op->bytes = size; - op->addr.mem.ea = c->eip; - /* NB. Immediates are sign-extended as necessary. */ - switch (op->bytes) { - case 1: - op->val = insn_fetch(s8, 1, c->eip); - break; - case 2: - op->val = insn_fetch(s16, 2, c->eip); - break; - case 4: - op->val = insn_fetch(s32, 4, c->eip); - break; - } - if (!sign_extension) { - switch (op->bytes) { - case 1: - op->val &= 0xff; - break; - case 2: - op->val &= 0xffff; - break; - case 4: - op->val &= 0xffffffff; - break; - } - } -done: - return rc; -} - int -x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) +x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { - struct x86_emulate_ops *ops = ctxt->ops; + unsigned long memop = 0; + u64 msr_data; + unsigned long saved_eip = 0; struct decode_cache *c = &ctxt->decode; - int rc = X86EMUL_CONTINUE; - int mode = ctxt->mode; - int def_op_bytes, def_ad_bytes, dual, goffset; - struct opcode opcode, *g_mod012, *g_mod3; - struct operand memop = { .type = OP_NONE }; - - c->eip = ctxt->eip; - c->fetch.start = c->eip; - c->fetch.end = c->fetch.start + insn_len; - if (insn_len > 0) - memcpy(c->fetch.data, insn, insn_len); - ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS); - - switch (mode) { - case X86EMUL_MODE_REAL: - case X86EMUL_MODE_VM86: - case X86EMUL_MODE_PROT16: - def_op_bytes = def_ad_bytes = 2; - break; - case X86EMUL_MODE_PROT32: - def_op_bytes = def_ad_bytes = 4; - break; -#ifdef CONFIG_X86_64 - case X86EMUL_MODE_PROT64: - def_op_bytes = 4; - def_ad_bytes = 8; - break; -#endif - default: - return -1; - } - - c->op_bytes = def_op_bytes; - c->ad_bytes = def_ad_bytes; - - /* Legacy prefixes. 
*/ - for (;;) { - switch (c->b = insn_fetch(u8, 1, c->eip)) { - case 0x66: /* operand-size override */ - /* switch between 2/4 bytes */ - c->op_bytes = def_op_bytes ^ 6; - break; - case 0x67: /* address-size override */ - if (mode == X86EMUL_MODE_PROT64) - /* switch between 4/8 bytes */ - c->ad_bytes = def_ad_bytes ^ 12; - else - /* switch between 2/4 bytes */ - c->ad_bytes = def_ad_bytes ^ 6; - break; - case 0x26: /* ES override */ - case 0x2e: /* CS override */ - case 0x36: /* SS override */ - case 0x3e: /* DS override */ - set_seg_override(c, (c->b >> 3) & 3); - break; - case 0x64: /* FS override */ - case 0x65: /* GS override */ - set_seg_override(c, c->b & 7); - break; - case 0x40 ... 0x4f: /* REX */ - if (mode != X86EMUL_MODE_PROT64) - goto done_prefixes; - c->rex_prefix = c->b; - continue; - case 0xf0: /* LOCK */ - c->lock_prefix = 1; - break; - case 0xf2: /* REPNE/REPNZ */ - c->rep_prefix = REPNE_PREFIX; - break; - case 0xf3: /* REP/REPE/REPZ */ - c->rep_prefix = REPE_PREFIX; - break; - default: - goto done_prefixes; - } - - /* Any legacy prefix after a REX prefix nullifies its effect. */ - - c->rex_prefix = 0; - } - -done_prefixes: - - /* REX prefix. */ - if (c->rex_prefix & 8) - c->op_bytes = 8; /* REX.W */ - - /* Opcode byte(s). */ - opcode = opcode_table[c->b]; - /* Two-byte opcode? */ - if (c->b == 0x0f) { - c->twobyte = 1; - c->b = insn_fetch(u8, 1, c->eip); - opcode = twobyte_table[c->b]; - } - c->d = opcode.flags; - - if (c->d & Group) { - dual = c->d & GroupDual; - c->modrm = insn_fetch(u8, 1, c->eip); - --c->eip; - - if (c->d & GroupDual) { - g_mod012 = opcode.u.gdual->mod012; - g_mod3 = opcode.u.gdual->mod3; - } else - g_mod012 = g_mod3 = opcode.u.group; - - c->d &= ~(Group | GroupDual); - - goffset = (c->modrm >> 3) & 7; - - if ((c->modrm >> 6) == 3) - opcode = g_mod3[goffset]; - else - opcode = g_mod012[goffset]; - c->d |= opcode.flags; - } - - c->execute = opcode.u.execute; - - /* Unrecognised? */ - if (c->d == 0 || (c->d & Undefined)) - return -1; - - if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) - c->op_bytes = 8; - - if (c->d & Op3264) { - if (mode == X86EMUL_MODE_PROT64) - c->op_bytes = 8; - else - c->op_bytes = 4; - } - - /* ModRM and SIB bytes. */ - if (c->d & ModRM) { - rc = decode_modrm(ctxt, ops, &memop); - if (!c->has_seg_override) - set_seg_override(c, c->modrm_seg); - } else if (c->d & MemAbs) - rc = decode_abs(ctxt, ops, &memop); - if (rc != X86EMUL_CONTINUE) - goto done; - - if (!c->has_seg_override) - set_seg_override(c, VCPU_SREG_DS); - - memop.addr.mem.seg = seg_override(ctxt, ops, c); - - if (memop.type == OP_MEM && c->ad_bytes != 8) - memop.addr.mem.ea = (u32)memop.addr.mem.ea; - - if (memop.type == OP_MEM && c->rip_relative) - memop.addr.mem.ea += c->eip; - - /* - * Decode and fetch the source operand: register, memory - * or immediate. - */ - switch (c->d & SrcMask) { - case SrcNone: - break; - case SrcReg: - decode_register_operand(&c->src, c, 0); - break; - case SrcMem16: - memop.bytes = 2; - goto srcmem_common; - case SrcMem32: - memop.bytes = 4; - goto srcmem_common; - case SrcMem: - memop.bytes = (c->d & ByteOp) ? 
1 : - c->op_bytes; - srcmem_common: - c->src = memop; - break; - case SrcImmU16: - rc = decode_imm(ctxt, &c->src, 2, false); - break; - case SrcImm: - rc = decode_imm(ctxt, &c->src, imm_size(c), true); - break; - case SrcImmU: - rc = decode_imm(ctxt, &c->src, imm_size(c), false); - break; - case SrcImmByte: - rc = decode_imm(ctxt, &c->src, 1, true); - break; - case SrcImmUByte: - rc = decode_imm(ctxt, &c->src, 1, false); - break; - case SrcAcc: - c->src.type = OP_REG; - c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->src.addr.reg = &c->regs[VCPU_REGS_RAX]; - fetch_register_operand(&c->src); - break; - case SrcOne: - c->src.bytes = 1; - c->src.val = 1; - break; - case SrcSI: - c->src.type = OP_MEM; - c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->src.addr.mem.ea = - register_address(c, c->regs[VCPU_REGS_RSI]); - c->src.addr.mem.seg = seg_override(ctxt, ops, c), - c->src.val = 0; - break; - case SrcImmFAddr: - c->src.type = OP_IMM; - c->src.addr.mem.ea = c->eip; - c->src.bytes = c->op_bytes + 2; - insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); - break; - case SrcMemFAddr: - memop.bytes = c->op_bytes + 2; - goto srcmem_common; - break; - } + unsigned int port; + int io_dir_in; + int rc = 0; - if (rc != X86EMUL_CONTINUE) - goto done; + ctxt->interruptibility = 0; - /* - * Decode and fetch the second source operand: register, memory - * or immediate. + /* Shadow copy of register state. Committed on successful emulation. + * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't + * modify them. */ - switch (c->d & Src2Mask) { - case Src2None: - break; - case Src2CL: - c->src2.bytes = 1; - c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8; - break; - case Src2ImmByte: - rc = decode_imm(ctxt, &c->src2, 1, true); - break; - case Src2One: - c->src2.bytes = 1; - c->src2.val = 1; - break; - case Src2Imm: - rc = decode_imm(ctxt, &c->src2, imm_size(c), true); - break; - } - if (rc != X86EMUL_CONTINUE) - goto done; - - /* Decode and fetch the destination operand: register or memory. */ - switch (c->d & DstMask) { - case DstReg: - decode_register_operand(&c->dst, c, - c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); - break; - case DstImmUByte: - c->dst.type = OP_IMM; - c->dst.addr.mem.ea = c->eip; - c->dst.bytes = 1; - c->dst.val = insn_fetch(u8, 1, c->eip); - break; - case DstMem: - case DstMem64: - c->dst = memop; - if ((c->d & DstMask) == DstMem64) - c->dst.bytes = 8; - else - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - if (c->d & BitOp) - fetch_bit_operand(c); - c->dst.orig_val = c->dst.val; - break; - case DstAcc: - c->dst.type = OP_REG; - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.addr.reg = &c->regs[VCPU_REGS_RAX]; - fetch_register_operand(&c->dst); - c->dst.orig_val = c->dst.val; - break; - case DstDI: - c->dst.type = OP_MEM; - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.addr.mem.ea = - register_address(c, c->regs[VCPU_REGS_RDI]); - c->dst.addr.mem.seg = VCPU_SREG_ES; - c->dst.val = 0; - break; - case ImplicitOps: - /* Special instructions do their own operand decoding. */ - default: - c->dst.type = OP_NONE; /* Disable writeback. */ - return 0; - } - -done: - return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; -} - -static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) -{ - struct decode_cache *c = &ctxt->decode; - - /* The second termination condition only applies for REPE - * and REPNE. 
Test if the repeat string operation prefix is - * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the - * corresponding termination condition according to: - * - if REPE/REPZ and ZF = 0 then done - * - if REPNE/REPNZ and ZF = 1 then done - */ - if (((c->b == 0xa6) || (c->b == 0xa7) || - (c->b == 0xae) || (c->b == 0xaf)) - && (((c->rep_prefix == REPE_PREFIX) && - ((ctxt->eflags & EFLG_ZF) == 0)) - || ((c->rep_prefix == REPNE_PREFIX) && - ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)))) - return true; - - return false; -} - -int -x86_emulate_insn(struct x86_emulate_ctxt *ctxt) -{ - struct x86_emulate_ops *ops = ctxt->ops; - u64 msr_data; - struct decode_cache *c = &ctxt->decode; - int rc = X86EMUL_CONTINUE; - int saved_dst_type = c->dst.type; - int irq; /* Used for int 3, int, and into */ - - ctxt->decode.mem_read.pos = 0; - - if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { - rc = emulate_ud(ctxt); - goto done; - } + memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); + saved_eip = c->eip; /* LOCK prefix is allowed only with some instructions */ - if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) { - rc = emulate_ud(ctxt); - goto done; - } - - if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) { - rc = emulate_ud(ctxt); + if (c->lock_prefix && !(c->d & Lock)) { + kvm_queue_exception(ctxt->vcpu, UD_VECTOR); goto done; } /* Privileged instruction can be executed only in CPL=0 */ - if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { - rc = emulate_gp(ctxt, 0); + if ((c->d & Priv) && kvm_x86_ops->get_cpl(ctxt->vcpu)) { + kvm_inject_gp(ctxt->vcpu, 0); goto done; } + if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs)) + memop = c->modrm_ea; + if (c->rep_prefix && (c->d & String)) { /* All REP prefixes have the same first termination condition */ - if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { - ctxt->eip = c->eip; + if (c->regs[VCPU_REGS_RCX] == 0) { + kvm_rip_write(ctxt->vcpu, c->eip); goto done; } + /* The second termination condition only applies for REPE + * and REPNE. 
Test if the repeat string operation prefix is + * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the + * corresponding termination condition according to: + * - if REPE/REPZ and ZF = 0 then done + * - if REPNE/REPNZ and ZF = 1 then done + */ + if ((c->b == 0xa6) || (c->b == 0xa7) || + (c->b == 0xae) || (c->b == 0xaf)) { + if ((c->rep_prefix == REPE_PREFIX) && + ((ctxt->eflags & EFLG_ZF) == 0)) { + kvm_rip_write(ctxt->vcpu, c->eip); + goto done; + } + if ((c->rep_prefix == REPNE_PREFIX) && + ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) { + kvm_rip_write(ctxt->vcpu, c->eip); + goto done; + } + } + c->regs[VCPU_REGS_RCX]--; + c->eip = kvm_rip_read(ctxt->vcpu); } - if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) { - rc = read_emulated(ctxt, ops, linear(ctxt, c->src.addr.mem), - c->src.valptr, c->src.bytes); - if (rc != X86EMUL_CONTINUE) - goto done; - c->src.orig_val64 = c->src.val64; - } - - if (c->src2.type == OP_MEM) { - rc = read_emulated(ctxt, ops, linear(ctxt, c->src2.addr.mem), - &c->src2.val, c->src2.bytes); + if (c->src.type == OP_MEM) { + c->src.ptr = (unsigned long *)memop; + c->src.val = 0; + rc = ops->read_emulated((unsigned long)c->src.ptr, + &c->src.val, + c->src.bytes, + ctxt->vcpu); if (rc != X86EMUL_CONTINUE) goto done; + c->src.orig_val = c->src.val; } if ((c->d & DstMask) == ImplicitOps) goto special_insn; - if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { - /* optimisation - avoid slow emulated read if Mov */ - rc = read_emulated(ctxt, ops, linear(ctxt, c->dst.addr.mem), - &c->dst.val, c->dst.bytes); - if (rc != X86EMUL_CONTINUE) - goto done; + if (c->dst.type == OP_MEM) { + c->dst.ptr = (unsigned long *)memop; + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.val = 0; + if (c->d & BitOp) { + unsigned long mask = ~(c->dst.bytes * 8 - 1); + + c->dst.ptr = (void *)c->dst.ptr + + (c->src.val & mask) / 8; + } + if (!(c->d & Mov)) { + /* optimisation - avoid slow emulated read */ + rc = ops->read_emulated((unsigned long)c->dst.ptr, + &c->dst.val, + c->dst.bytes, + ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) + goto done; + } } c->dst.orig_val = c->dst.val; special_insn: - if (c->execute) { - rc = c->execute(ctxt); - if (rc != X86EMUL_CONTINUE) - goto done; - goto writeback; - } - if (c->twobyte) goto twobyte_insn; @@ -3067,37 +1962,43 @@ special_insn: emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); break; case 0x06: /* push es */ - emulate_push_sreg(ctxt, ops, VCPU_SREG_ES); + emulate_push_sreg(ctxt, VCPU_SREG_ES); break; case 0x07: /* pop es */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); + if (rc != 0) + goto done; break; case 0x08 ... 0x0d: or: /* or */ emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); break; case 0x0e: /* push cs */ - emulate_push_sreg(ctxt, ops, VCPU_SREG_CS); + emulate_push_sreg(ctxt, VCPU_SREG_CS); break; case 0x10 ... 0x15: adc: /* adc */ emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); break; case 0x16: /* push ss */ - emulate_push_sreg(ctxt, ops, VCPU_SREG_SS); + emulate_push_sreg(ctxt, VCPU_SREG_SS); break; case 0x17: /* pop ss */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); + if (rc != 0) + goto done; break; case 0x18 ... 0x1d: sbb: /* sbb */ emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); break; case 0x1e: /* push ds */ - emulate_push_sreg(ctxt, ops, VCPU_SREG_DS); + emulate_push_sreg(ctxt, VCPU_SREG_DS); break; case 0x1f: /* pop ds */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); + if (rc != 0) + goto done; break; case 0x20 ... 0x25: and: /* and */ @@ -3121,30 +2022,75 @@ special_insn: case 0x48 ... 
0x4f: /* dec r16/r32 */ emulate_1op("dec", c->dst, ctxt->eflags); break; + case 0x50 ... 0x57: /* push reg */ + emulate_push(ctxt); + break; case 0x58 ... 0x5f: /* pop reg */ pop_instruction: rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes); + if (rc != 0) + goto done; break; case 0x60: /* pusha */ - rc = emulate_pusha(ctxt, ops); + emulate_pusha(ctxt); break; case 0x61: /* popa */ rc = emulate_popa(ctxt, ops); + if (rc != 0) + goto done; break; case 0x63: /* movsxd */ if (ctxt->mode != X86EMUL_MODE_PROT64) goto cannot_emulate; c->dst.val = (s32) c->src.val; break; + case 0x68: /* push imm */ + case 0x6a: /* push imm8 */ + emulate_push(ctxt); + break; case 0x6c: /* insb */ case 0x6d: /* insw/insd */ - c->src.val = c->regs[VCPU_REGS_RDX]; - goto do_io_in; + if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], + (c->d & ByteOp) ? 1 : c->op_bytes)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } + if (kvm_emulate_pio_string(ctxt->vcpu, + 1, + (c->d & ByteOp) ? 1 : c->op_bytes, + c->rep_prefix ? + address_mask(c, c->regs[VCPU_REGS_RCX]) : 1, + (ctxt->eflags & EFLG_DF), + register_address(c, es_base(ctxt), + c->regs[VCPU_REGS_RDI]), + c->rep_prefix, + c->regs[VCPU_REGS_RDX]) == 0) { + c->eip = saved_eip; + return -1; + } + return 0; case 0x6e: /* outsb */ case 0x6f: /* outsw/outsd */ - c->dst.val = c->regs[VCPU_REGS_RDX]; - goto do_io_out; - break; + if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], + (c->d & ByteOp) ? 1 : c->op_bytes)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } + if (kvm_emulate_pio_string(ctxt->vcpu, + 0, + (c->d & ByteOp) ? 1 : c->op_bytes, + c->rep_prefix ? + address_mask(c, c->regs[VCPU_REGS_RCX]) : 1, + (ctxt->eflags & EFLG_DF), + register_address(c, + seg_override_base(ctxt, c), + c->regs[VCPU_REGS_RSI]), + c->rep_prefix, + c->regs[VCPU_REGS_RDX]) == 0) { + c->eip = saved_eip; + return -1; + } + return 0; case 0x70 ... 0x7f: /* jcc (short) */ if (test_cc(c->b, ctxt->eflags)) jmp_rel(c, c->src.val); @@ -3170,30 +2116,49 @@ special_insn: } break; case 0x84 ... 0x85: - test: emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); break; case 0x86 ... 0x87: /* xchg */ xchg: /* Write back the register source. */ - c->src.val = c->dst.val; - write_register_operand(&c->src); + switch (c->dst.bytes) { + case 1: + *(u8 *) c->src.ptr = (u8) c->dst.val; + break; + case 2: + *(u16 *) c->src.ptr = (u16) c->dst.val; + break; + case 4: + *c->src.ptr = (u32) c->dst.val; + break; /* 64b reg: zero-extend */ + case 8: + *c->src.ptr = c->dst.val; + break; + } /* * Write back the memory destination with implicit LOCK * prefix. */ - c->dst.val = c->src.orig_val; + c->dst.val = c->src.val; c->lock_prefix = 1; break; - case 0x8c: /* mov r/m, sreg */ - if (c->modrm_reg > VCPU_SREG_GS) { - rc = emulate_ud(ctxt); - goto done; + case 0x88 ... 
0x8b: /* mov */ + goto mov; + case 0x8c: { /* mov r/m, sreg */ + struct kvm_segment segreg; + + if (c->modrm_reg <= 5) + kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg); + else { + printk(KERN_INFO "0x8c: Invalid segreg in modrm byte 0x%02x\n", + c->modrm); + goto cannot_emulate; } - c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu); + c->dst.val = segreg.selector; break; + } case 0x8d: /* lea r16/r32, m */ - c->dst.val = c->src.addr.mem.ea; + c->dst.val = c->modrm_ea; break; case 0x8e: { /* mov seg, r/m16 */ uint16_t sel; @@ -3202,127 +2167,193 @@ special_insn: if (c->modrm_reg == VCPU_SREG_CS || c->modrm_reg > VCPU_SREG_GS) { - rc = emulate_ud(ctxt); + kvm_queue_exception(ctxt->vcpu, UD_VECTOR); goto done; } if (c->modrm_reg == VCPU_SREG_SS) - ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS; + toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS); - rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg); + rc = kvm_load_segment_descriptor(ctxt->vcpu, sel, c->modrm_reg); c->dst.type = OP_NONE; /* Disable writeback. */ break; } case 0x8f: /* pop (sole member of Grp1a) */ rc = emulate_grp1a(ctxt, ops); + if (rc != 0) + goto done; break; - case 0x90 ... 0x97: /* nop / xchg reg, rax */ - if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX]) + case 0x90: /* nop / xchg r8,rax */ + if (!(c->rex_prefix & 1)) { /* nop */ + c->dst.type = OP_NONE; break; - goto xchg; - case 0x98: /* cbw/cwde/cdqe */ - switch (c->op_bytes) { - case 2: c->dst.val = (s8)c->dst.val; break; - case 4: c->dst.val = (s16)c->dst.val; break; - case 8: c->dst.val = (s32)c->dst.val; break; } - break; + case 0x91 ... 0x97: /* xchg reg,rax */ + c->src.type = c->dst.type = OP_REG; + c->src.bytes = c->dst.bytes = c->op_bytes; + c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX]; + c->src.val = *(c->src.ptr); + goto xchg; case 0x9c: /* pushf */ c->src.val = (unsigned long) ctxt->eflags; - emulate_push(ctxt, ops); + emulate_push(ctxt); break; case 0x9d: /* popf */ c->dst.type = OP_REG; - c->dst.addr.reg = &ctxt->eflags; + c->dst.ptr = (unsigned long *) &ctxt->eflags; c->dst.bytes = c->op_bytes; rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes); + if (rc != X86EMUL_CONTINUE) + goto done; + break; + case 0xa0 ... 0xa1: /* mov */ + c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; + c->dst.val = c->src.val; + break; + case 0xa2 ... 0xa3: /* mov */ + c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX]; + break; + case 0xa4 ... 0xa5: /* movs */ + c->dst.type = OP_MEM; + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.ptr = (unsigned long *)register_address(c, + es_base(ctxt), + c->regs[VCPU_REGS_RDI]); + rc = ops->read_emulated(register_address(c, + seg_override_base(ctxt, c), + c->regs[VCPU_REGS_RSI]), + &c->dst.val, + c->dst.bytes, ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) + goto done; + register_address_increment(c, &c->regs[VCPU_REGS_RSI], + (ctxt->eflags & EFLG_DF) ? -c->dst.bytes + : c->dst.bytes); + register_address_increment(c, &c->regs[VCPU_REGS_RDI], + (ctxt->eflags & EFLG_DF) ? -c->dst.bytes + : c->dst.bytes); break; case 0xa6 ... 0xa7: /* cmps */ + c->src.type = OP_NONE; /* Disable writeback. */ + c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->src.ptr = (unsigned long *)register_address(c, + seg_override_base(ctxt, c), + c->regs[VCPU_REGS_RSI]); + rc = ops->read_emulated((unsigned long)c->src.ptr, + &c->src.val, + c->src.bytes, + ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) + goto done; + c->dst.type = OP_NONE; /* Disable writeback. */ - goto cmp; - case 0xa8 ... 
0xa9: /* test ax, imm */ - goto test; + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.ptr = (unsigned long *)register_address(c, + es_base(ctxt), + c->regs[VCPU_REGS_RDI]); + rc = ops->read_emulated((unsigned long)c->dst.ptr, + &c->dst.val, + c->dst.bytes, + ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) + goto done; + + DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); + + emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); + + register_address_increment(c, &c->regs[VCPU_REGS_RSI], + (ctxt->eflags & EFLG_DF) ? -c->src.bytes + : c->src.bytes); + register_address_increment(c, &c->regs[VCPU_REGS_RDI], + (ctxt->eflags & EFLG_DF) ? -c->dst.bytes + : c->dst.bytes); + + break; + case 0xaa ... 0xab: /* stos */ + c->dst.type = OP_MEM; + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.ptr = (unsigned long *)register_address(c, + es_base(ctxt), + c->regs[VCPU_REGS_RDI]); + c->dst.val = c->regs[VCPU_REGS_RAX]; + register_address_increment(c, &c->regs[VCPU_REGS_RDI], + (ctxt->eflags & EFLG_DF) ? -c->dst.bytes + : c->dst.bytes); + break; + case 0xac ... 0xad: /* lods */ + c->dst.type = OP_REG; + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; + rc = ops->read_emulated(register_address(c, + seg_override_base(ctxt, c), + c->regs[VCPU_REGS_RSI]), + &c->dst.val, + c->dst.bytes, + ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) + goto done; + register_address_increment(c, &c->regs[VCPU_REGS_RSI], + (ctxt->eflags & EFLG_DF) ? -c->dst.bytes + : c->dst.bytes); + break; case 0xae ... 0xaf: /* scas */ - goto cmp; + DPRINTF("Urk! I don't handle SCAS.\n"); + goto cannot_emulate; + case 0xb0 ... 0xbf: /* mov r, imm */ + goto mov; case 0xc0 ... 0xc1: emulate_grp2(ctxt); break; case 0xc3: /* ret */ c->dst.type = OP_REG; - c->dst.addr.reg = &c->eip; + c->dst.ptr = &c->eip; c->dst.bytes = c->op_bytes; goto pop_instruction; - case 0xc4: /* les */ - rc = emulate_load_segment(ctxt, ops, VCPU_SREG_ES); - break; - case 0xc5: /* lds */ - rc = emulate_load_segment(ctxt, ops, VCPU_SREG_DS); + case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ + mov: + c->dst.val = c->src.val; break; case 0xcb: /* ret far */ rc = emulate_ret_far(ctxt, ops); - break; - case 0xcc: /* int3 */ - irq = 3; - goto do_interrupt; - case 0xcd: /* int n */ - irq = c->src.val; - do_interrupt: - rc = emulate_int(ctxt, ops, irq); - break; - case 0xce: /* into */ - if (ctxt->eflags & EFLG_OF) { - irq = 4; - goto do_interrupt; - } - break; - case 0xcf: /* iret */ - rc = emulate_iret(ctxt, ops); + if (rc) + goto done; break; case 0xd0 ... 0xd1: /* Grp2 */ + c->src.val = 1; emulate_grp2(ctxt); break; case 0xd2 ... 0xd3: /* Grp2 */ c->src.val = c->regs[VCPU_REGS_RCX]; emulate_grp2(ctxt); break; - case 0xe0 ... 
0xe2: /* loop/loopz/loopnz */ - register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); - if (address_mask(c, c->regs[VCPU_REGS_RCX]) != 0 && - (c->b == 0xe2 || test_cc(c->b ^ 0x5, ctxt->eflags))) - jmp_rel(c, c->src.val); - break; - case 0xe3: /* jcxz/jecxz/jrcxz */ - if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) - jmp_rel(c, c->src.val); - break; case 0xe4: /* inb */ case 0xe5: /* in */ - goto do_io_in; + port = c->src.val; + io_dir_in = 1; + goto do_io; case 0xe6: /* outb */ case 0xe7: /* out */ - goto do_io_out; + port = c->src.val; + io_dir_in = 0; + goto do_io; case 0xe8: /* call (near) */ { long int rel = c->src.val; c->src.val = (unsigned long) c->eip; jmp_rel(c, rel); - emulate_push(ctxt, ops); + emulate_push(ctxt); break; } case 0xe9: /* jmp rel */ goto jmp; - case 0xea: { /* jmp far */ - unsigned short sel; - jump_far: - memcpy(&sel, c->src.valptr + c->op_bytes, 2); - - if (load_segment_descriptor(ctxt, ops, sel, VCPU_SREG_CS)) + case 0xea: /* jmp far */ + if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, + VCPU_SREG_CS)) goto done; - c->eip = 0; - memcpy(&c->eip, c->src.valptr, c->op_bytes); + c->eip = c->src.val; break; - } case 0xeb: jmp: /* jmp rel short */ jmp_rel(c, c->src.val); @@ -3330,30 +2361,25 @@ special_insn: break; case 0xec: /* in al,dx */ case 0xed: /* in (e/r)ax,dx */ - c->src.val = c->regs[VCPU_REGS_RDX]; - do_io_in: - c->dst.bytes = min(c->dst.bytes, 4u); - if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { - rc = emulate_gp(ctxt, 0); + port = c->regs[VCPU_REGS_RDX]; + io_dir_in = 1; + goto do_io; + case 0xee: /* out al,dx */ + case 0xef: /* out (e/r)ax,dx */ + port = c->regs[VCPU_REGS_RDX]; + io_dir_in = 0; + do_io: + if (!emulator_io_permited(ctxt, ops, port, + (c->d & ByteOp) ? 1 : c->op_bytes)) { + kvm_inject_gp(ctxt->vcpu, 0); goto done; } - if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, - &c->dst.val)) - goto done; /* IO is needed */ - break; - case 0xee: /* out dx,al */ - case 0xef: /* out dx,(e/r)ax */ - c->dst.val = c->regs[VCPU_REGS_RDX]; - do_io_out: - c->src.bytes = min(c->src.bytes, 4u); - if (!emulator_io_permited(ctxt, ops, c->dst.val, - c->src.bytes)) { - rc = emulate_gp(ctxt, 0); - goto done; + if (kvm_emulate_pio(ctxt->vcpu, io_dir_in, + (c->d & ByteOp) ? 1 : c->op_bytes, + port) != 0) { + c->eip = saved_eip; + goto cannot_emulate; } - ops->pio_out_emulated(c->src.bytes, c->dst.val, - &c->src.val, 1, ctxt->vcpu); - c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xf4: /* hlt */ ctxt->vcpu->arch.halt_request = 1; @@ -3361,101 +2387,64 @@ special_insn: case 0xf5: /* cmc */ /* complement carry flag from eflags reg */ ctxt->eflags ^= EFLG_CF; + c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xf6 ... 0xf7: /* Grp3 */ rc = emulate_grp3(ctxt, ops); + if (rc != 0) + goto done; break; case 0xf8: /* clc */ ctxt->eflags &= ~EFLG_CF; - break; - case 0xf9: /* stc */ - ctxt->eflags |= EFLG_CF; + c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xfa: /* cli */ - if (emulator_bad_iopl(ctxt, ops)) { - rc = emulate_gp(ctxt, 0); - goto done; - } else + if (emulator_bad_iopl(ctxt)) + kvm_inject_gp(ctxt->vcpu, 0); + else { ctxt->eflags &= ~X86_EFLAGS_IF; + c->dst.type = OP_NONE; /* Disable writeback. 
*/ + } break; case 0xfb: /* sti */ - if (emulator_bad_iopl(ctxt, ops)) { - rc = emulate_gp(ctxt, 0); - goto done; - } else { - ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; + if (emulator_bad_iopl(ctxt)) + kvm_inject_gp(ctxt->vcpu, 0); + else { + toggle_interruptibility(ctxt, X86_SHADOW_INT_STI); ctxt->eflags |= X86_EFLAGS_IF; + c->dst.type = OP_NONE; /* Disable writeback. */ } break; case 0xfc: /* cld */ ctxt->eflags &= ~EFLG_DF; + c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xfd: /* std */ ctxt->eflags |= EFLG_DF; + c->dst.type = OP_NONE; /* Disable writeback. */ break; - case 0xfe: /* Grp4 */ - grp45: + case 0xfe ... 0xff: /* Grp4/Grp5 */ rc = emulate_grp45(ctxt, ops); + if (rc != 0) + goto done; break; - case 0xff: /* Grp5 */ - if (c->modrm_reg == 5) - goto jump_far; - goto grp45; - default: - goto cannot_emulate; } - if (rc != X86EMUL_CONTINUE) - goto done; - writeback: rc = writeback(ctxt, ops); - if (rc != X86EMUL_CONTINUE) + if (rc != 0) goto done; - /* - * restore dst type in case the decoding will be reused - * (happens for string instruction ) - */ - c->dst.type = saved_dst_type; - - if ((c->d & SrcMask) == SrcSI) - string_addr_inc(ctxt, seg_override(ctxt, ops, c), - VCPU_REGS_RSI, &c->src); - - if ((c->d & DstMask) == DstDI) - string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI, - &c->dst); - - if (c->rep_prefix && (c->d & String)) { - struct read_cache *r = &ctxt->decode.io_read; - register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); - - if (!string_insn_completed(ctxt)) { - /* - * Re-enter guest when pio read ahead buffer is empty - * or, if it is not used, after each 1024 iteration. - */ - if ((r->end != 0 || c->regs[VCPU_REGS_RCX] & 0x3ff) && - (r->end == 0 || r->end != r->pos)) { - /* - * Reset read cache. Usually happens before - * decode, but since instruction is restarted - * we have to do it here. - */ - ctxt->decode.mem_read.end = 0; - return EMULATION_RESTART; - } - goto done; /* skip rip writeback */ - } - } - - ctxt->eip = c->eip; + /* Commit shadow register state. */ + memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); + kvm_rip_write(ctxt->vcpu, c->eip); done: - if (rc == X86EMUL_PROPAGATE_FAULT) - ctxt->have_exception = true; - return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; + if (rc == X86EMUL_UNHANDLEABLE) { + c->eip = saved_eip; + return -1; + } + return 0; twobyte_insn: switch (c->b) { @@ -3469,18 +2458,18 @@ twobyte_insn: goto cannot_emulate; rc = kvm_fix_hypercall(ctxt->vcpu); - if (rc != X86EMUL_CONTINUE) + if (rc) goto done; /* Let the processor re-execute the fixed hypercall */ - c->eip = ctxt->eip; + c->eip = kvm_rip_read(ctxt->vcpu); /* Disable writeback. */ c->dst.type = OP_NONE; break; case 2: /* lgdt */ - rc = read_descriptor(ctxt, ops, c->src.addr.mem, + rc = read_descriptor(ctxt, ops, c->src.ptr, &size, &address, c->op_bytes); - if (rc != X86EMUL_CONTINUE) + if (rc) goto done; realmode_lgdt(ctxt->vcpu, size, address); /* Disable writeback. 
*/ @@ -3491,15 +2480,17 @@ twobyte_insn: switch (c->modrm_rm) { case 1: rc = kvm_fix_hypercall(ctxt->vcpu); + if (rc) + goto done; break; default: goto cannot_emulate; } } else { - rc = read_descriptor(ctxt, ops, c->src.addr.mem, + rc = read_descriptor(ctxt, ops, c->src.ptr, &size, &address, c->op_bytes); - if (rc != X86EMUL_CONTINUE) + if (rc) goto done; realmode_lidt(ctxt->vcpu, size, address); } @@ -3508,20 +2499,15 @@ twobyte_insn: break; case 4: /* smsw */ c->dst.bytes = 2; - c->dst.val = ops->get_cr(0, ctxt->vcpu); + c->dst.val = realmode_get_cr(ctxt->vcpu, 0); break; case 6: /* lmsw */ - ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0eul) | - (c->src.val & 0x0f), ctxt->vcpu); + realmode_lmsw(ctxt->vcpu, (u16)c->src.val, + &ctxt->eflags); c->dst.type = OP_NONE; break; - case 5: /* not defined */ - emulate_ud(ctxt); - rc = X86EMUL_PROPAGATE_FAULT; - goto done; case 7: /* invlpg*/ - emulate_invlpg(ctxt->vcpu, - linear(ctxt, c->src.addr.mem)); + emulate_invlpg(ctxt->vcpu, memop); /* Disable writeback. */ c->dst.type = OP_NONE; break; @@ -3530,93 +2516,91 @@ twobyte_insn: } break; case 0x05: /* syscall */ - rc = emulate_syscall(ctxt, ops); + rc = emulate_syscall(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; + else + goto writeback; break; case 0x06: emulate_clts(ctxt->vcpu); - break; - case 0x09: /* wbinvd */ - kvm_emulate_wbinvd(ctxt->vcpu); + c->dst.type = OP_NONE; break; case 0x08: /* invd */ + case 0x09: /* wbinvd */ case 0x0d: /* GrpP (prefetch) */ case 0x18: /* Grp16 (prefetch/nop) */ + c->dst.type = OP_NONE; break; case 0x20: /* mov cr, reg */ - switch (c->modrm_reg) { - case 1: - case 5 ... 7: - case 9 ... 15: - emulate_ud(ctxt); - rc = X86EMUL_PROPAGATE_FAULT; - goto done; - } - c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu); + if (c->modrm_mod != 3) + goto cannot_emulate; + c->regs[c->modrm_rm] = + realmode_get_cr(ctxt->vcpu, c->modrm_reg); + c->dst.type = OP_NONE; /* no writeback */ break; case 0x21: /* mov from dr to reg */ - if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && - (c->modrm_reg == 4 || c->modrm_reg == 5)) { - emulate_ud(ctxt); - rc = X86EMUL_PROPAGATE_FAULT; - goto done; - } - ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu); + if (c->modrm_mod != 3) + goto cannot_emulate; + rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]); + if (rc) + goto cannot_emulate; + c->dst.type = OP_NONE; /* no writeback */ break; case 0x22: /* mov reg, cr */ - if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) { - emulate_gp(ctxt, 0); - rc = X86EMUL_PROPAGATE_FAULT; - goto done; - } + if (c->modrm_mod != 3) + goto cannot_emulate; + realmode_set_cr(ctxt->vcpu, + c->modrm_reg, c->modrm_val, &ctxt->eflags); c->dst.type = OP_NONE; break; case 0x23: /* mov from reg to dr */ - if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && - (c->modrm_reg == 4 || c->modrm_reg == 5)) { - emulate_ud(ctxt); - rc = X86EMUL_PROPAGATE_FAULT; - goto done; - } - - if (ops->set_dr(c->modrm_reg, c->src.val & - ((ctxt->mode == X86EMUL_MODE_PROT64) ? 
- ~0ULL : ~0U), ctxt->vcpu) < 0) { - /* #UD condition is already handled by the code above */ - emulate_gp(ctxt, 0); - rc = X86EMUL_PROPAGATE_FAULT; - goto done; - } - + if (c->modrm_mod != 3) + goto cannot_emulate; + rc = emulator_set_dr(ctxt, c->modrm_reg, + c->regs[c->modrm_rm]); + if (rc) + goto cannot_emulate; c->dst.type = OP_NONE; /* no writeback */ break; case 0x30: /* wrmsr */ msr_data = (u32)c->regs[VCPU_REGS_RAX] | ((u64)c->regs[VCPU_REGS_RDX] << 32); - if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { - emulate_gp(ctxt, 0); - rc = X86EMUL_PROPAGATE_FAULT; - goto done; + rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data); + if (rc) { + kvm_inject_gp(ctxt->vcpu, 0); + c->eip = kvm_rip_read(ctxt->vcpu); } rc = X86EMUL_CONTINUE; + c->dst.type = OP_NONE; break; case 0x32: /* rdmsr */ - if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { - emulate_gp(ctxt, 0); - rc = X86EMUL_PROPAGATE_FAULT; - goto done; + rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data); + if (rc) { + kvm_inject_gp(ctxt->vcpu, 0); + c->eip = kvm_rip_read(ctxt->vcpu); } else { c->regs[VCPU_REGS_RAX] = (u32)msr_data; c->regs[VCPU_REGS_RDX] = msr_data >> 32; } rc = X86EMUL_CONTINUE; + c->dst.type = OP_NONE; break; case 0x34: /* sysenter */ - rc = emulate_sysenter(ctxt, ops); + rc = emulate_sysenter(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; + else + goto writeback; break; case 0x35: /* sysexit */ - rc = emulate_sysexit(ctxt, ops); + rc = emulate_sysexit(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; + else + goto writeback; break; case 0x40 ... 0x4f: /* cmov */ c->dst.val = c->dst.orig_val = c->src.val; @@ -3626,15 +2610,15 @@ twobyte_insn: case 0x80 ... 0x8f: /* jnz rel, etc*/ if (test_cc(c->b, ctxt->eflags)) jmp_rel(c, c->src.val); - break; - case 0x90 ... 0x9f: /* setcc r/m8 */ - c->dst.val = test_cc(c->b, ctxt->eflags); + c->dst.type = OP_NONE; break; case 0xa0: /* push fs */ - emulate_push_sreg(ctxt, ops, VCPU_SREG_FS); + emulate_push_sreg(ctxt, VCPU_SREG_FS); break; case 0xa1: /* pop fs */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); + if (rc != 0) + goto done; break; case 0xa3: bt: /* bt */ @@ -3648,13 +2632,17 @@ twobyte_insn: emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); break; case 0xa8: /* push gs */ - emulate_push_sreg(ctxt, ops, VCPU_SREG_GS); + emulate_push_sreg(ctxt, VCPU_SREG_GS); break; case 0xa9: /* pop gs */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); + if (rc != 0) + goto done; break; case 0xab: bts: /* bts */ + /* only subword offset */ + c->src.val &= (c->dst.bytes << 3) - 1; emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); break; case 0xac: /* shrd imm8, r, r/m */ @@ -3677,22 +2665,15 @@ twobyte_insn: } else { /* Failure: write the value we saw to EAX. */ c->dst.type = OP_REG; - c->dst.addr.reg = (unsigned long *)&c->regs[VCPU_REGS_RAX]; + c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; } break; - case 0xb2: /* lss */ - rc = emulate_load_segment(ctxt, ops, VCPU_SREG_SS); - break; case 0xb3: btr: /* btr */ + /* only subword offset */ + c->src.val &= (c->dst.bytes << 3) - 1; emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags); break; - case 0xb4: /* lfs */ - rc = emulate_load_segment(ctxt, ops, VCPU_SREG_FS); - break; - case 0xb5: /* lgs */ - rc = emulate_load_segment(ctxt, ops, VCPU_SREG_GS); - break; case 0xb6 ... 0xb7: /* movzx */ c->dst.bytes = c->op_bytes; c->dst.val = (c->d & ByteOp) ? 
(u8) c->src.val @@ -3712,60 +2693,31 @@ twobyte_insn: break; case 0xbb: btc: /* btc */ + /* only subword offset */ + c->src.val &= (c->dst.bytes << 3) - 1; emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags); break; - case 0xbc: { /* bsf */ - u8 zf; - __asm__ ("bsf %2, %0; setz %1" - : "=r"(c->dst.val), "=q"(zf) - : "r"(c->src.val)); - ctxt->eflags &= ~X86_EFLAGS_ZF; - if (zf) { - ctxt->eflags |= X86_EFLAGS_ZF; - c->dst.type = OP_NONE; /* Disable writeback. */ - } - break; - } - case 0xbd: { /* bsr */ - u8 zf; - __asm__ ("bsr %2, %0; setz %1" - : "=r"(c->dst.val), "=q"(zf) - : "r"(c->src.val)); - ctxt->eflags &= ~X86_EFLAGS_ZF; - if (zf) { - ctxt->eflags |= X86_EFLAGS_ZF; - c->dst.type = OP_NONE; /* Disable writeback. */ - } - break; - } case 0xbe ... 0xbf: /* movsx */ c->dst.bytes = c->op_bytes; c->dst.val = (c->d & ByteOp) ? (s8) c->src.val : (s16) c->src.val; break; - case 0xc0 ... 0xc1: /* xadd */ - emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); - /* Write back the register source. */ - c->src.val = c->dst.orig_val; - write_register_operand(&c->src); - break; case 0xc3: /* movnti */ c->dst.bytes = c->op_bytes; c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val : (u64) c->src.val; break; case 0xc7: /* Grp9 (cmpxchg8b) */ - rc = emulate_grp9(ctxt, ops); + rc = emulate_grp9(ctxt, ops, memop); + if (rc != 0) + goto done; + c->dst.type = OP_NONE; break; - default: - goto cannot_emulate; } - - if (rc != X86EMUL_CONTINUE) - goto done; - goto writeback; cannot_emulate: + DPRINTF("Cannot emulate %02x\n", c->b); + c->eip = saved_eip; return -1; } diff --git a/linux/x86/eventfd.c b/linux/x86/eventfd.c index babaeed..c3cb2fc 100644 --- a/linux/x86/eventfd.c +++ b/linux/x86/eventfd.c @@ -42,7 +42,6 @@ * kvm eventfd support - use eventfd objects to signal various KVM events * * Copyright 2009 Novell. All Rights Reserved. - * Copyright 2010 Red Hat, Inc. and/or its affiliates. 
* * Author: * Gregory Haskins <ghaskins@novell.com> @@ -85,19 +84,14 @@ #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,33) struct _irqfd { - /* Used for MSI fast-path */ - struct kvm *kvm; - wait_queue_t wait; - /* Update side is protected by irqfds.lock */ - struct kvm_kernel_irq_routing_entry __rcu *irq_entry; - /* Used for level IRQ fast-path */ - int gsi; - struct work_struct inject; - /* Used for setup/shutdown */ - struct eventfd_ctx *eventfd; - struct list_head list; - poll_table pt; - struct work_struct shutdown; + struct kvm *kvm; + struct eventfd_ctx *eventfd; + int gsi; + struct list_head list; + poll_table pt; + wait_queue_t wait; + struct work_struct inject; + struct work_struct shutdown; }; static struct workqueue_struct *irqfd_cleanup_wq; @@ -171,22 +165,14 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) { struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait); unsigned long flags = (unsigned long)key; - struct kvm_kernel_irq_routing_entry *irq; - struct kvm *kvm = irqfd->kvm; - if (flags & POLLIN) { - rcu_read_lock(); - irq = rcu_dereference(irqfd->irq_entry); + if (flags & POLLIN) /* An event has been signaled, inject an interrupt */ - if (irq) - kvm_set_msi(irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1); - else - schedule_work(&irqfd->inject); - rcu_read_unlock(); - } + schedule_work(&irqfd->inject); if (flags & POLLHUP) { /* The eventfd is closing, detach from KVM */ + struct kvm *kvm = irqfd->kvm; unsigned long flags; spin_lock_irqsave(&kvm->irqfds.lock, flags); @@ -217,31 +203,9 @@ irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, add_wait_queue(wqh, &irqfd->wait); } -/* Must be called under irqfds.lock */ -static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd, - struct kvm_irq_routing_table *irq_rt) -{ - struct kvm_kernel_irq_routing_entry *e; - struct hlist_node *n; - - if (irqfd->gsi >= irq_rt->nr_rt_entries) { - rcu_assign_pointer(irqfd->irq_entry, NULL); - return; - } - - hlist_for_each_entry(e, n, &irq_rt->map[irqfd->gsi], link) { - /* Only fast-path MSI. 
*/ - if (e->type == KVM_IRQ_ROUTING_MSI) - rcu_assign_pointer(irqfd->irq_entry, e); - else - rcu_assign_pointer(irqfd->irq_entry, NULL); - } -} - static int kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi) { - struct kvm_irq_routing_table *irq_rt; struct _irqfd *irqfd, *tmp; struct file *file = NULL; struct eventfd_ctx *eventfd = NULL; @@ -255,8 +219,8 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi) irqfd->kvm = kvm; irqfd->gsi = gsi; INIT_LIST_HEAD(&irqfd->list); - INIT_WORK(&irqfd->inject, irqfd_inject); - INIT_WORK(&irqfd->shutdown, irqfd_shutdown); + kvm_INIT_WORK(&irqfd->inject, irqfd_inject); + kvm_INIT_WORK(&irqfd->shutdown, irqfd_shutdown); file = eventfd_fget(fd); if (IS_ERR(file)) { @@ -291,13 +255,10 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi) goto fail; } - irq_rt = rcu_dereference_protected(kvm->irq_routing, - lockdep_is_held(&kvm->irqfds.lock)); - irqfd_update(kvm, irqfd, irq_rt); - events = file->f_op->poll(file, &irqfd->pt); list_add_tail(&irqfd->list, &kvm->irqfds.items); + spin_unlock_irq(&kvm->irqfds.lock); /* * Check if there was an event already pending on the eventfd @@ -306,8 +267,6 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi) if (events & POLLIN) schedule_work(&irqfd->inject); - spin_unlock_irq(&kvm->irqfds.lock); - /* * do not drop the file until the irqfd is fully initialized, otherwise * we might race against the POLLHUP @@ -351,17 +310,8 @@ kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi) spin_lock_irq(&kvm->irqfds.lock); list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) { - if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) { - /* - * This rcu_assign_pointer is needed for when - * another thread calls kvm_irqfd_update before - * we flush workqueue below. - * It is paired with synchronize_rcu done by caller - * of that function. - */ - rcu_assign_pointer(irqfd->irq_entry, NULL); + if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) irqfd_deactivate(irqfd); - } } spin_unlock_irq(&kvm->irqfds.lock); @@ -411,25 +361,6 @@ kvm_irqfd_release(struct kvm *kvm) } /* - * Change irq_routing and irqfd. - * Caller must invoke synchronize_rcu afterwards. - */ -void kvm_irq_routing_update(struct kvm *kvm, - struct kvm_irq_routing_table *irq_rt) -{ - struct _irqfd *irqfd; - - spin_lock_irq(&kvm->irqfds.lock); - - rcu_assign_pointer(kvm->irq_routing, irq_rt); - - list_for_each_entry(irqfd, &kvm->irqfds.items, list) - irqfd_update(kvm, irqfd, irq_rt); - - spin_unlock_irq(&kvm->irqfds.lock); -} - -/* * create a host-wide workqueue for issuing deferred shutdown requests * aggregated from all vm* instances. We need our own isolated single-thread * queue to prevent deadlock against flushing the normal work-queue. @@ -697,9 +628,4 @@ kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) #else void kvm_eventfd_init(struct kvm *kvm) { } void kvm_irqfd_release(struct kvm *kvm) { } -void kvm_irq_routing_update(struct kvm *kvm, - struct kvm_irq_routing_table *irq_rt) -{ - rcu_assign_pointer(kvm->irq_routing, irq_rt); -} #endif diff --git a/linux/x86/i8254.c b/linux/x86/i8254.c index 64aa827..13dc226 100644 --- a/linux/x86/i8254.c +++ b/linux/x86/i8254.c @@ -45,7 +45,6 @@ * Copyright (c) 2006 Intel Corporation * Copyright (c) 2007 Keir Fraser, XenSource Inc * Copyright (c) 2008 Intel Corporation - * Copyright 2009 Red Hat, Inc. and/or its affiliates. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -74,7 +73,6 @@ #include <linux/kvm_host.h> #include <linux/slab.h> -#include <linux/workqueue.h> #include "irq.h" #include "i8254.h" @@ -272,26 +270,24 @@ static void pit_latch_status(struct kvm *kvm, int channel) } } +int pit_has_pending_timer(struct kvm_vcpu *vcpu) +{ + struct kvm_pit *pit = vcpu->kvm->arch.vpit; + + if (pit && kvm_vcpu_is_bsp(vcpu) && pit->pit_state.irq_ack) + return atomic_read(&pit->pit_state.pit_timer.pending); + return 0; +} + static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) { struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, irq_ack_notifier); - int value; - - spin_lock(&ps->inject_lock); - value = atomic_dec_return(&ps->pit_timer.pending); - if (value < 0) - /* spurious acks can be generated if, for example, the - * PIC is being reset. Handle it gracefully here - */ + raw_spin_lock(&ps->inject_lock); + if (atomic_dec_return(&ps->pit_timer.pending) < 0) atomic_inc(&ps->pit_timer.pending); - else if (value > 0) - /* in this case, we had multiple outstanding pit interrupts - * that we needed to inject. Reinject - */ - queue_work(ps->pit->wq, &ps->pit->expired); ps->irq_ack = 1; - spin_unlock(&ps->inject_lock); + raw_spin_unlock(&ps->inject_lock); } void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) @@ -303,14 +299,14 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) return; timer = &pit->pit_state.pit_timer.timer; - if (hrtimer_cancel(timer)) + if (hrtimer_cancel_p(timer)) kvm_hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } -static void destroy_pit_timer(struct kvm_pit *pit) +static void destroy_pit_timer(struct kvm_timer *pt) { - hrtimer_cancel(&pit->pit_state.pit_timer.timer); - cancel_work_sync(&pit->expired); + pr_debug("pit: " "execute del timer!\n"); + hrtimer_cancel_p(&pt->timer); } static bool kpit_is_periodic(struct kvm_timer *ktimer) @@ -324,60 +320,6 @@ static struct kvm_timer_ops kpit_ops = { .is_periodic = kpit_is_periodic, }; -static void pit_do_work(struct work_struct *work) -{ - struct kvm_pit *pit = container_of(work, struct kvm_pit, expired); - struct kvm *kvm = pit->kvm; - struct kvm_vcpu *vcpu; - int i; - struct kvm_kpit_state *ps = &pit->pit_state; - int inject = 0; - - /* Try to inject pending interrupts when - * last one has been acked. - */ - spin_lock(&ps->inject_lock); - if (ps->irq_ack) { - ps->irq_ack = 0; - inject = 1; - } - spin_unlock(&ps->inject_lock); - if (inject) { - kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); - kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); - - /* - * Provides NMI watchdog support via Virtual Wire mode. - * The route is: PIT -> PIC -> LVT0 in NMI mode. - * - * Note: Our Virtual Wire implementation is simplified, only - * propagating PIT interrupts to all VCPUs when they have set - * LVT0 to NMI delivery. Other PIC interrupts are just sent to - * VCPU0, and only if its LVT0 is in EXTINT mode. 
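The i8254.c hunks above and below drop the upstream workqueue driven injection (pit_do_work() and pit_timer_fn()) and go back to counting timer expirations in pit_state.pit_timer.pending, injecting from the vcpu entry path once the guest has acked the previous tick. The following userspace sketch restates that handshake; the type names and the use of C11 atomics in place of the kernel's atomic_t and raw spinlock are simplifications for illustration.

#include <stdatomic.h>
#include <stdbool.h>

struct pit_sketch {
	atomic_int pending;   /* expirations not yet delivered to the guest */
	bool irq_ack;         /* last injected interrupt was acknowledged   */
};

/* hrtimer callback side: only accumulate, never inject from here. */
void timer_expired(struct pit_sketch *p)
{
	atomic_fetch_add(&p->pending, 1);
}

/* vcpu entry side (kvm_inject_pit_timer_irqs below plays this role):
 * inject at most one interrupt, and only after the previous ack. */
bool try_inject(struct pit_sketch *p)
{
	if (atomic_load(&p->pending) && p->irq_ack) {
		p->irq_ack = false;     /* re-armed by the ack notifier */
		return true;            /* caller pulses the IRQ line   */
	}
	return false;
}

/* ack notifier side (kvm_pit_ack_irq above): consume one pending tick,
 * tolerating spurious acks such as a PIC reset. */
void irq_acked(struct pit_sketch *p)
{
	if (atomic_fetch_sub(&p->pending, 1) <= 0)
		atomic_fetch_add(&p->pending, 1);
	p->irq_ack = true;
}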
- */ - if (kvm->arch.vapics_in_nmi_mode > 0) - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_apic_nmi_wd_deliver(vcpu); - } -} - -static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) -{ - struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); - struct kvm_pit *pt = ktimer->kvm->arch.vpit; - - if (ktimer->reinject || !atomic_read(&ktimer->pending)) { - atomic_inc(&ktimer->pending); - queue_work(pt->wq, &pt->expired); - } - - if (ktimer->t_ops->is_periodic(ktimer)) { - kvm_hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); - return HRTIMER_RESTART; - } else - return HRTIMER_NORESTART; -} - static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) { struct kvm_timer *pt = &ps->pit_timer; @@ -388,19 +330,20 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) pr_debug("pit: " "create pit timer, interval is %llu nsec\n", interval); /* TODO The new value only affected after the retriggered */ - hrtimer_cancel(&pt->timer); - cancel_work_sync(&ps->pit->expired); + hrtimer_cancel_p(&pt->timer); pt->period = interval; ps->is_periodic = is_period; - pt->timer.function = pit_timer_fn; + pt->timer.function = kvm_timer_fn; + hrtimer_data_pointer(&pt->timer); pt->t_ops = &kpit_ops; pt->kvm = ps->pit->kvm; + pt->vcpu = pt->kvm->bsp_vcpu; atomic_set(&pt->pending, 0); ps->irq_ack = 1; - hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval), + hrtimer_start_p(&pt->timer, ktime_add_ns(ktime_get(), interval), HRTIMER_MODE_ABS); } @@ -444,7 +387,7 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) } break; default: - destroy_pit_timer(kvm->arch.vpit); + destroy_pit_timer(&ps->pit_timer); } } @@ -723,23 +666,14 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) mutex_init(&pit->pit_state.lock); mutex_lock(&pit->pit_state.lock); - spin_lock_init(&pit->pit_state.inject_lock); - - pit->wq = create_singlethread_workqueue("kvm-pit-wq"); - if (!pit->wq) { - mutex_unlock(&pit->pit_state.lock); - kvm_free_irq_source_id(kvm, pit->irq_source_id); - kfree(pit); - return NULL; - } - INIT_WORK(&pit->expired, pit_do_work); + raw_spin_lock_init(&pit->pit_state.inject_lock); kvm->arch.vpit = pit; pit->kvm = kvm; pit_state = &pit->pit_state; pit_state->pit = pit; - hrtimer_init(&pit_state->pit_timer.timer, + hrtimer_init_p(&pit_state->pit_timer.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); pit_state->irq_ack_notifier.gsi = 0; pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq; @@ -774,7 +708,7 @@ fail: kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier); kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); kvm_free_irq_source_id(kvm, pit->irq_source_id); - destroy_workqueue(pit->wq); + kfree(pit); return NULL; } @@ -784,20 +718,61 @@ void kvm_free_pit(struct kvm *kvm) struct hrtimer *timer; if (kvm->arch.vpit) { - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &kvm->arch.vpit->dev); - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, - &kvm->arch.vpit->speaker_dev); kvm_unregister_irq_mask_notifier(kvm, 0, &kvm->arch.vpit->mask_notifier); kvm_unregister_irq_ack_notifier(kvm, &kvm->arch.vpit->pit_state.irq_ack_notifier); mutex_lock(&kvm->arch.vpit->pit_state.lock); timer = &kvm->arch.vpit->pit_state.pit_timer.timer; - hrtimer_cancel(timer); - cancel_work_sync(&kvm->arch.vpit->expired); + hrtimer_cancel_p(timer); kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id); mutex_unlock(&kvm->arch.vpit->pit_state.lock); - destroy_workqueue(kvm->arch.vpit->wq); kfree(kvm->arch.vpit); } } + +static void 
__inject_pit_timer_intr(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + int i; + + kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); + kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); + + /* + * Provides NMI watchdog support via Virtual Wire mode. + * The route is: PIT -> PIC -> LVT0 in NMI mode. + * + * Note: Our Virtual Wire implementation is simplified, only + * propagating PIT interrupts to all VCPUs when they have set + * LVT0 to NMI delivery. Other PIC interrupts are just sent to + * VCPU0, and only if its LVT0 is in EXTINT mode. + */ + if (kvm->arch.vapics_in_nmi_mode > 0) + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_apic_nmi_wd_deliver(vcpu); +} + +void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) +{ + struct kvm_pit *pit = vcpu->kvm->arch.vpit; + struct kvm *kvm = vcpu->kvm; + struct kvm_kpit_state *ps; + + if (pit) { + int inject = 0; + ps = &pit->pit_state; + + /* Try to inject pending interrupts when + * last one has been acked. + */ + raw_spin_lock(&ps->inject_lock); + if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) { + ps->irq_ack = 0; + inject = 1; + } + raw_spin_unlock(&ps->inject_lock); + if (inject) + __inject_pit_timer_intr(kvm); + } +} diff --git a/linux/x86/i8254.h b/linux/x86/i8254.h index 46d08ca..900d6b0 100644 --- a/linux/x86/i8254.h +++ b/linux/x86/i8254.h @@ -27,7 +27,7 @@ struct kvm_kpit_state { u32 speaker_data_on; struct mutex lock; struct kvm_pit *pit; - spinlock_t inject_lock; + raw_spinlock_t inject_lock; unsigned long irq_ack; struct kvm_irq_ack_notifier irq_ack_notifier; }; @@ -40,8 +40,6 @@ struct kvm_pit { struct kvm_kpit_state pit_state; int irq_source_id; struct kvm_irq_mask_notifier mask_notifier; - struct workqueue_struct *wq; - struct work_struct expired; }; #define KVM_PIT_BASE_ADDRESS 0x40 diff --git a/linux/x86/i8259.c b/linux/x86/i8259.c index d316fc6..76c25b5 100644 --- a/linux/x86/i8259.c +++ b/linux/x86/i8259.c @@ -43,7 +43,6 @@ * * Copyright (c) 2003-2004 Fabrice Bellard * Copyright (c) 2007 Intel Corporation - * Copyright 2009 Red Hat, Inc. and/or its affiliates. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -74,44 +73,6 @@ #include <linux/kvm_host.h> #include "trace.h" -static void pic_irq_request(struct kvm *kvm, int level); - -static void pic_lock(struct kvm_pic *s) - __acquires(&s->lock) -{ - spin_lock(&s->lock); -} - -static void pic_unlock(struct kvm_pic *s) - __releases(&s->lock) -{ - bool wakeup = s->wakeup_needed; - struct kvm_vcpu *vcpu, *found = NULL; - int i; - - s->wakeup_needed = false; - - spin_unlock(&s->lock); - - if (wakeup) { - kvm_for_each_vcpu(i, vcpu, s->kvm) { - if (kvm_apic_accept_pic_intr(vcpu)) { - found = vcpu; - break; - } - } - - if (!found) - found = s->kvm->bsp_vcpu; - - if (!found) - return; - - kvm_make_request(KVM_REQ_EVENT, found); - kvm_vcpu_kick(found); - } -} - static void pic_clear_isr(struct kvm_kpic_state *s, int irq) { s->isr &= ~(1 << irq); @@ -124,19 +85,19 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq) * Other interrupt may be delivered to PIC while lock is dropped but * it should be safe since PIC state is already updated at this stage. 
*/ - pic_unlock(s->pics_state); + raw_spin_unlock(&s->pics_state->lock); kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); - pic_lock(s->pics_state); + raw_spin_lock(&s->pics_state->lock); } void kvm_pic_clear_isr_ack(struct kvm *kvm) { struct kvm_pic *s = pic_irqchip(kvm); - pic_lock(s); + raw_spin_lock(&s->lock); s->pics[0].isr_ack = 0xff; s->pics[1].isr_ack = 0xff; - pic_unlock(s); + raw_spin_unlock(&s->lock); } /* @@ -229,14 +190,17 @@ static void pic_update_irq(struct kvm_pic *s) pic_set_irq1(&s->pics[0], 2, 0); } irq = pic_get_irq(&s->pics[0]); - pic_irq_request(s->kvm, irq >= 0); + if (irq >= 0) + s->irq_request(s->irq_request_opaque, 1); + else + s->irq_request(s->irq_request_opaque, 0); } void kvm_pic_update_irq(struct kvm_pic *s) { - pic_lock(s); + raw_spin_lock(&s->lock); pic_update_irq(s); - pic_unlock(s); + raw_spin_unlock(&s->lock); } int kvm_pic_set_irq(void *opaque, int irq, int level) @@ -244,14 +208,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level) struct kvm_pic *s = opaque; int ret = -1; - pic_lock(s); + raw_spin_lock(&s->lock); if (irq >= 0 && irq < PIC_NUM_PINS) { ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); pic_update_irq(s); trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, s->pics[irq >> 3].imr, ret == 0); } - pic_unlock(s); + raw_spin_unlock(&s->lock); return ret; } @@ -281,7 +245,7 @@ int kvm_pic_read_irq(struct kvm *kvm) int irq, irq2, intno; struct kvm_pic *s = pic_irqchip(kvm); - pic_lock(s); + raw_spin_lock(&s->lock); irq = pic_get_irq(&s->pics[0]); if (irq >= 0) { pic_intack(&s->pics[0], irq); @@ -306,7 +270,7 @@ int kvm_pic_read_irq(struct kvm *kvm) intno = s->pics[0].irq_base + irq; } pic_update_irq(s); - pic_unlock(s); + raw_spin_unlock(&s->lock); return intno; } @@ -314,7 +278,8 @@ int kvm_pic_read_irq(struct kvm *kvm) void kvm_pic_reset(struct kvm_kpic_state *s) { int irq; - struct kvm_vcpu *vcpu0 = s->pics_state->kvm->bsp_vcpu; + struct kvm *kvm = s->pics_state->irq_request_opaque; + struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu; u8 irr = s->irr, isr = s->imr; s->last_irr = 0; @@ -349,17 +314,14 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) addr &= 1; if (addr == 0) { if (val & 0x10) { - s->init4 = val & 1; - s->last_irr = 0; - s->imr = 0; - s->priority_add = 0; - s->special_mask = 0; - s->read_reg_select = 0; - if (!s->init4) { - s->special_fully_nested_mode = 0; - s->auto_eoi = 0; - } + kvm_pic_reset(s); /* init */ + /* + * deassert a pending interrupt + */ + s->pics_state->irq_request(s->pics_state-> + irq_request_opaque, 0); s->init_state = 1; + s->init4 = val & 1; if (val & 0x02) printk(KERN_ERR "single mode not supported"); if (val & 0x08) @@ -411,20 +373,10 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) } } else switch (s->init_state) { - case 0: { /* normal mode */ - u8 imr_diff = s->imr ^ val, - off = (s == &s->pics_state->pics[0]) ? 
0 : 8; + case 0: /* normal mode */ s->imr = val; - for (irq = 0; irq < PIC_NUM_PINS/2; irq++) - if (imr_diff & (1 << irq)) - kvm_fire_mask_notifiers( - s->pics_state->kvm, - SELECT_PIC(irq + off), - irq + off, - !!(s->imr & (1 << irq))); pic_update_irq(s->pics_state); break; - } case 1: s->irq_base = val & 0xf8; s->init_state = 2; @@ -532,7 +484,7 @@ static int picdev_write(struct kvm_io_device *this, printk(KERN_ERR "PIC: non byte write\n"); return 0; } - pic_lock(s); + raw_spin_lock(&s->lock); switch (addr) { case 0x20: case 0x21: @@ -545,7 +497,7 @@ static int picdev_write(struct kvm_io_device *this, elcr_ioport_write(&s->pics[addr & 1], addr, data); break; } - pic_unlock(s); + raw_spin_unlock(&s->lock); return 0; } @@ -562,7 +514,7 @@ static int picdev_read(struct kvm_io_device *this, printk(KERN_ERR "PIC: non byte read\n"); return 0; } - pic_lock(s); + raw_spin_lock(&s->lock); switch (addr) { case 0x20: case 0x21: @@ -576,15 +528,16 @@ static int picdev_read(struct kvm_io_device *this, break; } *(unsigned char *)val = data; - pic_unlock(s); + raw_spin_unlock(&s->lock); return 0; } /* * callback when PIC0 irq status changed */ -static void pic_irq_request(struct kvm *kvm, int level) +static void pic_irq_request(void *opaque, int level) { + struct kvm *kvm = opaque; struct kvm_vcpu *vcpu = kvm->bsp_vcpu; struct kvm_pic *s = pic_irqchip(kvm); int irq = pic_get_irq(&s->pics[0]); @@ -592,7 +545,7 @@ static void pic_irq_request(struct kvm *kvm, int level) s->output = level; if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) { s->pics[0].isr_ack &= ~(1 << irq); - s->wakeup_needed = true; + kvm_vcpu_kick(vcpu); } } @@ -609,14 +562,14 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); if (!s) return NULL; - spin_lock_init(&s->lock); + raw_spin_lock_init(&s->lock); s->kvm = kvm; s->pics[0].elcr_mask = 0xf8; s->pics[1].elcr_mask = 0xde; + s->irq_request = pic_irq_request; + s->irq_request_opaque = kvm; s->pics[0].pics_state = s; s->pics[1].pics_state = s; - s->pics[0].isr_ack = 0xff; - s->pics[1].isr_ack = 0xff; /* * Initialize PIO device diff --git a/linux/x86/ioapic.c b/linux/x86/ioapic.c index f19b670..c87f9c0 100644 --- a/linux/x86/ioapic.c +++ b/linux/x86/ioapic.c @@ -40,7 +40,6 @@ #endif /* * Copyright (C) 2001 MandrakeSoft S.A. - * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * MandrakeSoft S.A. * 43, rue d'Aboukir @@ -192,7 +191,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) update_handled_vectors(ioapic); mask_after = e->fields.mask; if (mask_before != mask_after) - kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after); + kvm_fire_mask_notifiers(ioapic->kvm, index, mask_after); if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG && ioapic->irr & (1 << index)) ioapic_service(ioapic, index); @@ -233,13 +232,12 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq) int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) { - u32 old_irr; + u32 old_irr = ioapic->irr; u32 mask = 1 << irq; union kvm_ioapic_redirect_entry entry; int ret = 1; spin_lock(&ioapic->lock); - old_irr = ioapic->irr; if (irq >= 0 && irq < IOAPIC_NUM_PINS) { entry = ioapic->redirtbl[irq]; level ^= entry.fields.polarity; diff --git a/linux/x86/iommu.c b/linux/x86/iommu.c index 381142f..20bf46f 100644 --- a/linux/x86/iommu.c +++ b/linux/x86/iommu.c @@ -56,8 +56,6 @@ * * Copyright (C) 2006-2008 Intel Corporation * Copyright IBM Corporation, 2008 - * Copyright 2010 Red Hat, Inc. 
and/or its affiliates. - * * Author: Allen M. Kay <allen.m.kay@intel.com> * Author: Weidong Han <weidong.han@intel.com> * Author: Ben-Ami Yassour <benami@il.ibm.com> @@ -74,30 +72,12 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm); static void kvm_iommu_put_pages(struct kvm *kvm, gfn_t base_gfn, unsigned long npages); -static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot, - gfn_t gfn, unsigned long size) -{ - gfn_t end_gfn; - pfn_t pfn; - - pfn = gfn_to_pfn_memslot(kvm, slot, gfn); - end_gfn = gfn + (size >> PAGE_SHIFT); - gfn += 1; - - if (is_error_pfn(pfn)) - return pfn; - - while (gfn < end_gfn) - gfn_to_pfn_memslot(kvm, slot, gfn++); - - return pfn; -} - int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) { - gfn_t gfn, end_gfn; + gfn_t gfn = slot->base_gfn; + unsigned long npages = slot->npages; pfn_t pfn; - int r = 0; + int i, r = 0; struct iommu_domain *domain = kvm->arch.iommu_domain; int flags; @@ -105,79 +85,46 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) if (!domain) return 0; - gfn = slot->base_gfn; - end_gfn = gfn + slot->npages; - flags = IOMMU_READ | IOMMU_WRITE; if (kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY) flags |= IOMMU_CACHE; - - while (gfn < end_gfn) { - unsigned long page_size; - - /* Check if already mapped */ - if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn))) { - gfn += 1; - continue; - } - - /* Get the page size we could use to map */ - page_size = kvm_host_page_size(kvm, gfn); - - /* Make sure the page_size does not exceed the memslot */ - while ((gfn + (page_size >> PAGE_SHIFT)) > end_gfn) - page_size >>= 1; - - /* Make sure gfn is aligned to the page size we want to map */ - while ((gfn << PAGE_SHIFT) & (page_size - 1)) - page_size >>= 1; - - /* - * Pin all pages we are about to map in memory. This is - * important because we unmap and unpin in 4kb steps later. 
- */ - pfn = kvm_pin_pages(kvm, slot, gfn, page_size); - if (is_error_pfn(pfn)) { - gfn += 1; + for (i = 0; i < npages; i++) { + /* check if already mapped */ + if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn))) continue; - } - /* Map into IO address space */ - r = iommu_map(domain, gfn_to_gpa(gfn), pfn_to_hpa(pfn), - get_order(page_size), flags); + pfn = gfn_to_pfn_memslot(kvm, slot, gfn); + r = iommu_map_range(domain, + gfn_to_gpa(gfn), + pfn_to_hpa(pfn), + PAGE_SIZE, flags); if (r) { printk(KERN_ERR "kvm_iommu_map_address:" - "iommu failed to map pfn=%llx\n", pfn); + "iommu failed to map pfn=%lx\n", pfn); goto unmap_pages; } - - gfn += page_size >> PAGE_SHIFT; - - + gfn++; } - return 0; unmap_pages: - kvm_iommu_put_pages(kvm, slot->base_gfn, gfn); + kvm_iommu_put_pages(kvm, slot->base_gfn, i); return r; } static int kvm_iommu_map_memslots(struct kvm *kvm) { - int i, idx, r = 0; + int i, r = 0; struct kvm_memslots *slots; - idx = srcu_read_lock(&kvm->srcu); - slots = kvm_memslots(kvm); + slots = rcu_dereference(kvm->memslots); for (i = 0; i < slots->nmemslots; i++) { r = kvm_iommu_map_pages(kvm, &slots->memslots[i]); if (r) break; } - srcu_read_unlock(&kvm->srcu, idx); return r; } @@ -282,62 +229,40 @@ out_unmap: return r; } -static void kvm_unpin_pages(struct kvm *kvm, pfn_t pfn, unsigned long npages) -{ - unsigned long i; - - for (i = 0; i < npages; ++i) - kvm_release_pfn_clean(pfn + i); -} - static void kvm_iommu_put_pages(struct kvm *kvm, gfn_t base_gfn, unsigned long npages) { - struct iommu_domain *domain; - gfn_t end_gfn, gfn; + gfn_t gfn = base_gfn; pfn_t pfn; + struct iommu_domain *domain = kvm->arch.iommu_domain; + unsigned long i; u64 phys; - domain = kvm->arch.iommu_domain; - end_gfn = base_gfn + npages; - gfn = base_gfn; - /* check if iommu exists and in use */ if (!domain) return; - while (gfn < end_gfn) { - unsigned long unmap_pages; - int order; - - /* Get physical address */ + for (i = 0; i < npages; i++) { phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn)); - pfn = phys >> PAGE_SHIFT; - - /* Unmap address from IO address space */ - order = iommu_unmap(domain, gfn_to_gpa(gfn), 0); - unmap_pages = 1ULL << order; - - /* Unpin all pages we just unmapped to not leak any memory */ - kvm_unpin_pages(kvm, pfn, unmap_pages); - - gfn += unmap_pages; + pfn = phys >> PAGE_SHIFT; + kvm_release_pfn_clean(pfn); + gfn++; } + + iommu_unmap_range(domain, gfn_to_gpa(base_gfn), PAGE_SIZE * npages); } static int kvm_iommu_unmap_memslots(struct kvm *kvm) { - int i, idx; + int i; struct kvm_memslots *slots; - idx = srcu_read_lock(&kvm->srcu); - slots = kvm_memslots(kvm); + slots = rcu_dereference(kvm->memslots); for (i = 0; i < slots->nmemslots; i++) { kvm_iommu_put_pages(kvm, slots->memslots[i].base_gfn, slots->memslots[i].npages); } - srcu_read_unlock(&kvm->srcu, idx); return 0; } diff --git a/linux/x86/irq.c b/linux/x86/irq.c index d1372c0..255aaf7 100644 --- a/linux/x86/irq.c +++ b/linux/x86/irq.c @@ -41,7 +41,6 @@ /* * irq.c: API for in kernel interrupt controller * Copyright (c) 2007, Intel Corporation. - * Copyright 2009 Red Hat, Inc. and/or its affiliates. 
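The iommu.c hunks above trade the hugepage-aware iommu_map() loop and explicit page pinning for the older walk that maps each memslot one 4 KiB page at a time through iommu_map_range(). The only address arithmetic the walk relies on is that gfn_to_gpa() and pfn_to_hpa() are plain shifts by PAGE_SHIFT; the standalone example below works that through for a tiny slot, assuming PAGE_SHIFT is 12 as on x86.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12   /* 4 KiB pages, as on x86 */

uint64_t gfn_to_gpa(uint64_t gfn) { return gfn << PAGE_SHIFT; }
uint64_t pfn_to_hpa(uint64_t pfn) { return pfn << PAGE_SHIFT; }

int main(void)
{
	uint64_t base_gfn = 0x100;   /* slot starts at guest frame 0x100 (1 MiB) */
	uint64_t npages = 4;         /* tiny slot, just for the printout         */

	for (uint64_t i = 0; i < npages; i++)
		printf("gfn 0x%llx -> gpa 0x%llx (one iommu mapping per page)\n",
		       (unsigned long long)(base_gfn + i),
		       (unsigned long long)gfn_to_gpa(base_gfn + i));
	return 0;
}

Because exactly one page is mapped per iteration, the error path can undo precisely i pages, which is why the failure case above calls kvm_iommu_put_pages() with a plain page count.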
* * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -73,7 +72,12 @@ */ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) { - return apic_has_pending_timer(vcpu); + int ret; + + ret = pit_has_pending_timer(vcpu); + ret |= apic_has_pending_timer(vcpu); + + return ret; } EXPORT_SYMBOL(kvm_cpu_has_pending_timer); @@ -125,6 +129,7 @@ EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) { kvm_inject_apic_timer_irqs(vcpu); + kvm_inject_pit_timer_irqs(vcpu); /* TODO: PIT, RTC etc. */ } EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); diff --git a/linux/x86/irq.h b/linux/x86/irq.h index 90f1923..a4cb03d 100644 --- a/linux/x86/irq.h +++ b/linux/x86/irq.h @@ -38,11 +38,14 @@ struct kvm; struct kvm_vcpu; +typedef void irq_request_func(void *opaque, int level); + struct kvm_kpic_state { u8 last_irr; /* edge detection */ u8 irr; /* interrupt request register */ u8 imr; /* interrupt mask register */ u8 isr; /* interrupt service register */ + u8 isr_ack; /* interrupt ack detection */ u8 priority_add; /* highest irq priority */ u8 irq_base; u8 read_reg_select; @@ -55,16 +58,16 @@ struct kvm_kpic_state { u8 init4; /* true if 4 byte init */ u8 elcr; /* PIIX edge/trigger selection */ u8 elcr_mask; - u8 isr_ack; /* interrupt ack detection */ struct kvm_pic *pics_state; }; struct kvm_pic { - spinlock_t lock; - bool wakeup_needed; + raw_spinlock_t lock; unsigned pending_acks; struct kvm *kvm; struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ + irq_request_func *irq_request; + void *irq_request_opaque; int output; /* intr from master PIC */ struct kvm_io_device dev; void (*ack_notifier)(void *opaque, int irq); diff --git a/linux/x86/irq_comm.c b/linux/x86/irq_comm.c index 646d331..bbf8e05 100644 --- a/linux/x86/irq_comm.c +++ b/linux/x86/irq_comm.c @@ -57,7 +57,6 @@ * Authors: * Yaozu (Eddie) Dong <Eddie.dong@intel.com> * - * Copyright 2010 Red Hat, Inc. and/or its affiliates. 
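The irq.h hunk above gives struct kvm_pic back its irq_request callback and opaque cookie and drops the wakeup_needed flag, so the PIC reports its INTR output by invoking a function pointer again (pic_irq_request() in the i8259.c hunks). The fragment below is only a distilled restatement of that pattern with stand-in types, not the kernel's definitions.

/* The PIC stores a callback plus an opaque pointer and calls it every
 * time its INTR output is re-evaluated; in this tree the callback is
 * pic_irq_request() and the cookie is the struct kvm pointer. */
typedef void irq_request_func(void *opaque, int level);

struct pic_sketch {
	irq_request_func *irq_request;
	void *irq_request_opaque;
	int output;                      /* current INTR level */
};

void pic_report_output(struct pic_sketch *s, int level)
{
	s->output = level;
	s->irq_request(s->irq_request_opaque, level);
}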
*/ #include <linux/kvm_host.h> @@ -140,7 +139,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, if (r < 0) r = 0; r += kvm_apic_set_irq(vcpu, irq); - } else if (kvm_lapic_enabled(vcpu)) { + } else { if (!lowest) lowest = vcpu; else if (kvm_apic_compare_prio(vcpu, lowest) < 0) @@ -154,8 +153,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, return r; } -int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int irq_source_id, int level) +static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level) { struct kvm_lapic_irq irq; @@ -319,19 +318,15 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, synchronize_rcu(); } -void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, - bool mask) +void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask) { struct kvm_irq_mask_notifier *kimn; struct hlist_node *n; - int gsi; rcu_read_lock(); - gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin]; - if (gsi != -1) - hlist_for_each_entry_rcu(kimn, n, &kvm->mask_notifier_list, link) - if (kimn->irq == gsi) - kimn->func(kimn, mask); + hlist_for_each_entry_rcu(kimn, n, &kvm->mask_notifier_list, link) + if (kimn->irq == irq) + kimn->func(kimn, mask); rcu_read_unlock(); } @@ -449,9 +444,8 @@ int kvm_set_irq_routing(struct kvm *kvm, mutex_lock(&kvm->irq_lock); old = kvm->irq_routing; - kvm_irq_routing_update(kvm, new); + rcu_assign_pointer(kvm->irq_routing, new); mutex_unlock(&kvm->irq_lock); - synchronize_rcu(); new = old; diff --git a/linux/x86/kvm_cache_regs.h b/linux/x86/kvm_cache_regs.h index 3377d53..cff851c 100644 --- a/linux/x86/kvm_cache_regs.h +++ b/linux/x86/kvm_cache_regs.h @@ -36,20 +36,11 @@ static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val) static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) { - might_sleep(); /* on svm */ - if (!test_bit(VCPU_EXREG_PDPTR, (unsigned long *)&vcpu->arch.regs_avail)) kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); - return vcpu->arch.walk_mmu->pdptrs[index]; -} - -static inline u64 kvm_pdptr_read_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, int index) -{ - load_pdptrs(vcpu, mmu, mmu->get_cr3(vcpu)); - - return mmu->pdptrs[index]; + return vcpu->arch.pdptrs[index]; } static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) @@ -73,37 +64,9 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) return vcpu->arch.cr4 & mask; } -static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu) -{ - if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) - kvm_x86_ops->decache_cr3(vcpu); - return vcpu->arch.cr3; -} - static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu) { return kvm_read_cr4_bits(vcpu, ~0UL); } -static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu) -{ - return (kvm_register_read(vcpu, VCPU_REGS_RAX) & -1u) - | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32); -} - -static inline void enter_guest_mode(struct kvm_vcpu *vcpu) -{ - vcpu->arch.hflags |= HF_GUEST_MASK; -} - -static inline void leave_guest_mode(struct kvm_vcpu *vcpu) -{ - vcpu->arch.hflags &= ~HF_GUEST_MASK; -} - -static inline bool is_guest_mode(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.hflags & HF_GUEST_MASK; -} - #endif diff --git a/linux/x86/kvm_main.c b/linux/x86/kvm_main.c index 355948e..19f1924 100644 --- a/linux/x86/kvm_main.c +++ b/linux/x86/kvm_main.c @@ -45,7 +45,6 @@ * machines without emulation or binary translation. 
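In the irq_comm.c hunk above, kvm_fire_mask_notifiers() goes back to matching notifiers directly on the IRQ number instead of first resolving an (irqchip, pin) pair through the RCU protected routing table. Below is a plain C mock of that simpler dispatch; the singly linked list stands in for the kernel's RCU hlist and is purely illustrative.

#include <stdbool.h>
#include <stddef.h>

struct mask_notifier {
	int irq;                                         /* line being watched */
	void (*func)(struct mask_notifier *n, bool masked);
	struct mask_notifier *next;
};

/* Invoke every notifier registered for exactly this IRQ number. */
void fire_mask_notifiers(struct mask_notifier *list, int irq, bool masked)
{
	for (struct mask_notifier *n = list; n != NULL; n = n->next)
		if (n->irq == irq)
			n->func(n, masked);
}

The i8254.c and ioapic.c hunks earlier are the callers affected by this signature change: the PIT keeps its notifier on line 0, and the IOAPIC now fires notifiers with the redirection table index directly.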
* * Copyright (C) 2006 Qumranet, Inc. - * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Authors: * Avi Kivity <avi@qumranet.com> @@ -95,12 +94,11 @@ #include <asm-generic/bitops/le.h> #include "coalesced_mmio.h" -#include "async_pf.h" #define CREATE_TRACE_POINTS #include <trace/events/kvm.h> -MODULE_INFO(version, "kvm-kmod-2.6.38-rc7"); +MODULE_INFO(version, "kvm-kmod-2.6.34"); MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); @@ -131,40 +129,15 @@ static void hardware_disable_all(void); static void kvm_io_bus_destroy(struct kvm_io_bus *bus); -bool kvm_rebooting; -EXPORT_SYMBOL_GPL(kvm_rebooting); +static bool kvm_rebooting; static bool largepages_enabled = true; -static struct page *hwpoison_page; -static pfn_t hwpoison_pfn; - -static struct page *fault_page; -static pfn_t fault_pfn; - inline int kvm_is_mmio_pfn(pfn_t pfn) { if (pfn_valid(pfn)) { - int reserved; - struct page *tail = pfn_to_page(pfn); - struct page *head = compound_trans_head(tail); - reserved = PageReserved(head); - if (head != tail) { - /* - * "head" is not a dangling pointer - * (compound_trans_head takes care of that) - * but the hugepage may have been splitted - * from under us (and we may not hold a - * reference count on the head page so it can - * be reused before we run PageReferenced), so - * we've to check PageTail before returning - * what we just read. - */ - smp_rmb(); - if (PageTail(tail)) - return reserved; - } - return PageReserved(tail); + struct page *page = compound_head(pfn_to_page(pfn)); + return PageReserved(page); } return true; @@ -210,7 +183,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) raw_spin_lock(&kvm->requests_lock); me = smp_processor_id(); kvm_for_each_vcpu(i, vcpu, kvm) { - if (kvm_make_check_request(req, vcpu)) + if (test_and_set_bit(req, &vcpu->requests)) continue; cpu = vcpu->cpu; if (cpus != NULL && cpu != -1 && cpu != me) @@ -229,12 +202,8 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) void kvm_flush_remote_tlbs(struct kvm *kvm) { - int dirty_count = kvm->tlbs_dirty; - - smp_mb(); if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) ++kvm->stat.remote_tlb_flush; - cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); } void kvm_reload_remote_mmus(struct kvm *kvm) @@ -252,7 +221,6 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) vcpu->kvm = kvm; vcpu->vcpu_id = id; init_waitqueue_head(&vcpu->wq); - kvm_async_pf_vcpu_init(vcpu); page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!page) { @@ -311,12 +279,12 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, * pte after kvm_unmap_hva returned, without noticing the page * is going to be freed. 
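The mmu notifier hunks above and below keep KVM's usual count plus sequence protocol (mmu_notifier_count and mmu_notifier_seq) and only swap srcu_read_lock()/srcu_read_unlock() for the kvm_srcu_* compatibility wrappers. The standalone mock below restates the retry protocol those counters implement for the page fault path; it deliberately ignores the lock that really orders the updates.

#include <stdbool.h>

struct mmu_sync {
	unsigned long seq;   /* bumped whenever a notifier has finished      */
	long count;          /* nonzero while a range invalidation is in flight */
};

void invalidate_start(struct mmu_sync *s)
{
	s->count++;          /* faults must not install new mappings now     */
}

void invalidate_end(struct mmu_sync *s)
{
	s->seq++;            /* seq first, then count, so a racing fault     */
	s->count--;          /* is guaranteed to notice one of the changes   */
}

/* Fault path: snapshot seq before translating a gfn, then ask whether
 * the translation is still safe to install or must be retried. */
bool mmu_retry_needed(const struct mmu_sync *s, unsigned long snapshot_seq)
{
	return s->count != 0 || s->seq != snapshot_seq;
}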
*/ - idx = srcu_read_lock(&kvm->srcu); + idx = kvm_srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); kvm->mmu_notifier_seq++; - need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty; + need_tlb_flush = kvm_unmap_hva(kvm, address); spin_unlock(&kvm->mmu_lock); - srcu_read_unlock(&kvm->srcu, idx); + kvm_srcu_read_unlock(&kvm->srcu, idx); /* we've to flush the tlb before the pages can be freed */ if (need_tlb_flush) @@ -335,12 +303,12 @@ void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, struct kvm *kvm = mmu_notifier_to_kvm(mn); int idx; - idx = srcu_read_lock(&kvm->srcu); + idx = kvm_srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); kvm->mmu_notifier_seq++; kvm_set_spte_hva(kvm, address, pte); spin_unlock(&kvm->mmu_lock); - srcu_read_unlock(&kvm->srcu, idx); + kvm_srcu_read_unlock(&kvm->srcu, idx); } static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, @@ -351,7 +319,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, struct kvm *kvm = mmu_notifier_to_kvm(mn); int need_tlb_flush = 0, idx; - idx = srcu_read_lock(&kvm->srcu); + idx = kvm_srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); /* * The count increase must become visible at unlock time as no @@ -361,9 +329,8 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, kvm->mmu_notifier_count++; for (; start < end; start += PAGE_SIZE) need_tlb_flush |= kvm_unmap_hva(kvm, start); - need_tlb_flush |= kvm->tlbs_dirty; spin_unlock(&kvm->mmu_lock); - srcu_read_unlock(&kvm->srcu, idx); + kvm_srcu_read_unlock(&kvm->srcu, idx); /* we've to flush the tlb before the pages can be freed */ if (need_tlb_flush) @@ -403,11 +370,11 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, struct kvm *kvm = mmu_notifier_to_kvm(mn); int young, idx; - idx = srcu_read_lock(&kvm->srcu); + idx = kvm_srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); young = kvm_age_hva(kvm, address); spin_unlock(&kvm->mmu_lock); - srcu_read_unlock(&kvm->srcu, idx); + kvm_srcu_read_unlock(&kvm->srcu, idx); if (young) kvm_flush_remote_tlbs(kvm); @@ -415,33 +382,15 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, return young; } -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,38) -static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long address) -{ - struct kvm *kvm = mmu_notifier_to_kvm(mn); - int young, idx; - - idx = srcu_read_lock(&kvm->srcu); - spin_lock(&kvm->mmu_lock); - young = kvm_test_age_hva(kvm, address); - spin_unlock(&kvm->mmu_lock); - srcu_read_unlock(&kvm->srcu, idx); - - return young; -} -#endif - static void kvm_mmu_notifier_release(struct mmu_notifier *mn, struct mm_struct *mm) { struct kvm *kvm = mmu_notifier_to_kvm(mn); int idx; - idx = srcu_read_lock(&kvm->srcu); + idx = kvm_srcu_read_lock(&kvm->srcu); kvm_arch_flush_shadow(kvm); - srcu_read_unlock(&kvm->srcu, idx); + kvm_srcu_read_unlock(&kvm->srcu, idx); } static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { @@ -449,9 +398,6 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, .clear_flush_young = kvm_mmu_notifier_clear_flush_young, -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,38) - .test_young = kvm_mmu_notifier_test_young, -#endif #ifdef MMU_NOTIFIER_HAS_CHANGE_PTE .change_pte = kvm_mmu_notifier_change_pte, #endif @@ -475,15 +421,11 @@ static int 
kvm_init_mmu_notifier(struct kvm *kvm) static struct kvm *kvm_create_vm(void) { - int r, i; - struct kvm *kvm = kvm_arch_alloc_vm(); - - if (!kvm) - return ERR_PTR(-ENOMEM); + int r = 0, i; + struct kvm *kvm = kvm_arch_create_vm(); - r = kvm_arch_init_vm(kvm); - if (r) - goto out_err_nodisable; + if (IS_ERR(kvm)) + goto out; r = hardware_enable_all(); if (r) @@ -497,19 +439,23 @@ static struct kvm *kvm_create_vm(void) r = -ENOMEM; kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); if (!kvm->memslots) - goto out_err_nosrcu; - if (init_srcu_struct(&kvm->srcu)) - goto out_err_nosrcu; + goto out_err; + if (kvm_init_srcu_struct(&kvm->srcu)) + goto out_err; for (i = 0; i < KVM_NR_BUSES; i++) { kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL); - if (!kvm->buses[i]) + if (!kvm->buses[i]) { + kvm_cleanup_srcu_struct(&kvm->srcu); goto out_err; + } } r = kvm_init_mmu_notifier(kvm); - if (r) + if (r) { + kvm_cleanup_srcu_struct(&kvm->srcu); goto out_err; + } kvm->mm = current->mm; mmget(&kvm->mm->mm_count); @@ -523,35 +469,22 @@ static struct kvm *kvm_create_vm(void) spin_lock(&kvm_lock); list_add(&kvm->vm_list, &vm_list); spin_unlock(&kvm_lock); - +#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET + kvm_coalesced_mmio_init(kvm); +#endif +out: return kvm; out_err: - cleanup_srcu_struct(&kvm->srcu); -out_err_nosrcu: hardware_disable_all(); out_err_nodisable: for (i = 0; i < KVM_NR_BUSES; i++) kfree(kvm->buses[i]); kfree(kvm->memslots); - kvm_arch_free_vm(kvm); + kfree(kvm); return ERR_PTR(r); } -static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) -{ - if (!memslot->dirty_bitmap) - return; - - if (2 * kvm_dirty_bitmap_bytes(memslot) > PAGE_SIZE) - vfree(memslot->dirty_bitmap_head); - else - kfree(memslot->dirty_bitmap_head); - - memslot->dirty_bitmap = NULL; - memslot->dirty_bitmap_head = NULL; -} - /* * Free any memory in @free but not in @dont. */ @@ -564,7 +497,7 @@ static void kvm_free_physmem_slot(struct kvm_memory_slot *free, vfree(free->rmap); if (!dont || free->dirty_bitmap != dont->dirty_bitmap) - kvm_destroy_dirty_bitmap(free); + vfree(free->dirty_bitmap); for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { @@ -575,6 +508,7 @@ static void kvm_free_physmem_slot(struct kvm_memory_slot *free, } free->npages = 0; + free->dirty_bitmap = NULL; free->rmap = NULL; } @@ -608,9 +542,6 @@ static void kvm_destroy_vm(struct kvm *kvm) kvm_arch_flush_shadow(kvm); #endif kvm_arch_destroy_vm(kvm); - kvm_free_physmem(kvm); - cleanup_srcu_struct(&kvm->srcu); - kvm_arch_free_vm(kvm); hardware_disable_all(); mmdrop(mm); } @@ -640,27 +571,6 @@ static int kvm_vm_release(struct inode *inode, struct file *filp) } /* - * Allocation size is twice as large as the actual dirty bitmap size. - * This makes it possible to do double buffering: see x86's - * kvm_vm_ioctl_get_dirty_log(). - */ -static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) -{ - unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); - - if (dirty_bytes > PAGE_SIZE) - memslot->dirty_bitmap = vzalloc(dirty_bytes); - else - memslot->dirty_bitmap = kzalloc(dirty_bytes, GFP_KERNEL); - - if (!memslot->dirty_bitmap) - return -ENOMEM; - - memslot->dirty_bitmap_head = memslot->dirty_bitmap; - return 0; -} - -/* * Allocate some memory and give it an address in the guest physical address * space. 
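The kvm_main.c hunks above drop the upstream kvm_create_dirty_bitmap()/kvm_destroy_dirty_bitmap() helpers, so the per-slot dirty log goes back to a single vmalloc'ed buffer sized by kvm_dirty_bitmap_bytes() with no double buffering. The standalone example below reproduces that sizing rule, one bit per guest page rounded up to a whole unsigned long; the exact rounding is the usual KVM convention and is stated here as an assumption rather than something visible in these hunks.

#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))

/* One dirty bit per page, padded out to a whole number of longs. */
unsigned long dirty_bitmap_bytes(unsigned long npages)
{
	unsigned long aligned = (npages + BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1);
	return aligned / 8;
}

int main(void)
{
	/* a 1 GiB slot of 4 KiB pages: 262144 pages -> 32768 bytes of bitmap */
	printf("%lu bytes\n", dirty_bitmap_bytes(262144));
	return 0;
}

The comment deleted above explains why upstream doubled this allocation: the second half served as a scratch copy for kvm_vm_ioctl_get_dirty_log().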
* @@ -697,16 +607,11 @@ int __kvm_set_memory_region(struct kvm *kvm, base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; npages = mem->memory_size >> PAGE_SHIFT; - r = -EINVAL; - if (npages > KVM_MEM_MAX_NR_PAGES) - goto out; - if (!npages) mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; new = old = *memslot; - new.id = mem->slot; new.base_gfn = base_gfn; new.npages = npages; new.flags = mem->flags; @@ -737,11 +642,13 @@ int __kvm_set_memory_region(struct kvm *kvm, /* Allocate if a slot is being created */ #ifndef CONFIG_S390 if (npages && !new.rmap) { - new.rmap = vzalloc(npages * sizeof(*new.rmap)); + new.rmap = vmalloc(npages * sizeof(struct page *)); if (!new.rmap) goto out_free; + memset(new.rmap, 0, npages * sizeof(*new.rmap)); + new.user_alloc = user_alloc; new.userspace_addr = mem->userspace_addr; } @@ -760,18 +667,21 @@ int __kvm_set_memory_region(struct kvm *kvm, if (new.lpage_info[i]) continue; - lpages = 1 + ((base_gfn + npages - 1) - >> KVM_HPAGE_GFN_SHIFT(level)); - lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level); + lpages = 1 + (base_gfn + npages - 1) / + KVM_PAGES_PER_HPAGE(level); + lpages -= base_gfn / KVM_PAGES_PER_HPAGE(level); - new.lpage_info[i] = vzalloc(lpages * sizeof(*new.lpage_info[i])); + new.lpage_info[i] = vmalloc(lpages * sizeof(*new.lpage_info[i])); if (!new.lpage_info[i]) goto out_free; - if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) + memset(new.lpage_info[i], 0, + lpages * sizeof(*new.lpage_info[i])); + + if (base_gfn % KVM_PAGES_PER_HPAGE(level)) new.lpage_info[i][0].write_count = 1; - if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) + if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE(level)) new.lpage_info[i][lpages - 1].write_count = 1; ugfn = new.userspace_addr >> PAGE_SHIFT; /* @@ -789,8 +699,12 @@ skip_lpage: /* Allocate page dirty bitmap if needed */ if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { - if (kvm_create_dirty_bitmap(&new) < 0) + unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(&new); + + new.dirty_bitmap = vmalloc(dirty_bytes); + if (!new.dirty_bitmap) goto out_free; + memset(new.dirty_bitmap, 0, dirty_bytes); /* destroy any largepage mappings for dirty tracking */ if (old.npages) flush_shadow = 1; @@ -809,7 +723,6 @@ skip_lpage: memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); if (mem->slot >= slots->nmemslots) slots->nmemslots = mem->slot + 1; - slots->generation++; slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID; old_memslots = kvm->memslots; @@ -830,12 +743,14 @@ skip_lpage: if (r) goto out_free; +#ifdef CONFIG_DMAR /* map the pages in iommu page table */ if (npages) { r = kvm_iommu_map_pages(kvm, &new); if (r) goto out_free; } +#endif r = -ENOMEM; slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); @@ -844,7 +759,6 @@ skip_lpage: memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); if (mem->slot >= slots->nmemslots) slots->nmemslots = mem->slot + 1; - slots->generation++; /* actual memory is freed via old in kvm_free_physmem_slot below */ if (!npages) { @@ -942,28 +856,16 @@ EXPORT_SYMBOL_GPL(kvm_disable_largepages); int is_error_page(struct page *page) { - return page == bad_page || page == hwpoison_page || page == fault_page; + return page == bad_page; } EXPORT_SYMBOL_GPL(is_error_page); int is_error_pfn(pfn_t pfn) { - return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn; + return pfn == bad_pfn; } EXPORT_SYMBOL_GPL(is_error_pfn); -int is_hwpoison_pfn(pfn_t pfn) -{ - return pfn == hwpoison_pfn; -} -EXPORT_SYMBOL_GPL(is_hwpoison_pfn); - -int is_fault_pfn(pfn_t pfn) -{ - return 
pfn == fault_pfn; -} -EXPORT_SYMBOL_GPL(is_fault_pfn); - static inline unsigned long bad_hva(void) { return PAGE_OFFSET; @@ -975,10 +877,10 @@ int kvm_is_error_hva(unsigned long addr) } EXPORT_SYMBOL_GPL(kvm_is_error_hva); -static struct kvm_memory_slot *__gfn_to_memslot(struct kvm_memslots *slots, - gfn_t gfn) +struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn) { int i; + struct kvm_memslots *slots = rcu_dereference(kvm->memslots); for (i = 0; i < slots->nmemslots; ++i) { struct kvm_memory_slot *memslot = &slots->memslots[i]; @@ -989,18 +891,20 @@ static struct kvm_memory_slot *__gfn_to_memslot(struct kvm_memslots *slots, } return NULL; } +EXPORT_SYMBOL_GPL(gfn_to_memslot_unaliased); struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) { - return __gfn_to_memslot(kvm_memslots(kvm), gfn); + gfn = unalias_gfn(kvm, gfn); + return gfn_to_memslot_unaliased(kvm, gfn); } -EXPORT_SYMBOL_GPL(gfn_to_memslot); int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) { int i; - struct kvm_memslots *slots = kvm_memslots(kvm); + struct kvm_memslots *slots = rcu_dereference(kvm->memslots); + gfn = unalias_gfn_instantiation(kvm, gfn); for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { struct kvm_memory_slot *memslot = &slots->memslots[i]; @@ -1042,9 +946,10 @@ out: int memslot_id(struct kvm *kvm, gfn_t gfn) { int i; - struct kvm_memslots *slots = kvm_memslots(kvm); + struct kvm_memslots *slots = rcu_dereference(kvm->memslots); struct kvm_memory_slot *memslot = NULL; + gfn = unalias_gfn(kvm, gfn); for (i = 0; i < slots->nmemslots; ++i) { memslot = &slots->memslots[i]; @@ -1056,179 +961,76 @@ int memslot_id(struct kvm *kvm, gfn_t gfn) return memslot - slots->memslots; } -static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, - gfn_t *nr_pages) +unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) { + struct kvm_memory_slot *slot; + + gfn = unalias_gfn_instantiation(kvm, gfn); + slot = gfn_to_memslot_unaliased(kvm, gfn); if (!slot || slot->flags & KVM_MEMSLOT_INVALID) return bad_hva(); - - if (nr_pages) - *nr_pages = slot->npages - (gfn - slot->base_gfn); - - return gfn_to_hva_memslot(slot, gfn); -} - -unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) -{ - return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL); + return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE); } EXPORT_SYMBOL_GPL(gfn_to_hva); -static pfn_t get_fault_pfn(void) -{ - get_page(fault_page); - return fault_pfn; -} - -static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic, - bool *async, bool write_fault, bool *writable) +static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr) { struct page *page[1]; - int npages = 0; + int npages; pfn_t pfn; - /* we can do it either atomically or asynchronously, not both */ - BUG_ON(atomic && async); - - BUG_ON(!write_fault && !writable); - - if (writable) - *writable = true; + might_sleep(); - if (atomic || async) - npages = kvm___get_user_pages_fast(addr, 1, 1, page); - - if (unlikely(npages != 1) && !atomic) { - might_sleep(); - - if (writable) - *writable = write_fault; - - npages = get_user_pages_fast(addr, 1, write_fault, page); - - /* map read fault as writable if possible */ - if (unlikely(!write_fault) && npages == 1) { - struct page *wpage[1]; - - npages = kvm___get_user_pages_fast(addr, 1, 1, wpage); - if (npages == 1) { - *writable = true; - put_page(page[0]); - page[0] = wpage[0]; - } - npages = 1; - } - } + npages = get_user_pages_fast(addr, 1, 1, page); if (unlikely(npages != 1)) { struct 
vm_area_struct *vma; - if (atomic) - return get_fault_pfn(); - down_read(&current->mm->mmap_sem); - if (is_hwpoison_address(addr)) { + vma = find_vma(current->mm, addr); + + if (vma == NULL || addr < vma->vm_start || + !(vma->vm_flags & VM_PFNMAP)) { up_read(&current->mm->mmap_sem); - get_page(hwpoison_page); - return page_to_pfn(hwpoison_page); + get_page(bad_page); + return page_to_pfn(bad_page); } - vma = find_vma_intersection(current->mm, addr, addr+1); - - if (vma == NULL) - pfn = get_fault_pfn(); - else if ((vma->vm_flags & VM_PFNMAP)) { - pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + - vma->vm_pgoff; - BUG_ON(!kvm_is_mmio_pfn(pfn)); - } else { - if (async && (vma->vm_flags & VM_WRITE)) - *async = true; - pfn = get_fault_pfn(); - } + pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; up_read(&current->mm->mmap_sem); + BUG_ON(!kvm_is_mmio_pfn(pfn)); } else pfn = page_to_pfn(page[0]); return pfn; } -pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr) -{ - return hva_to_pfn(kvm, addr, true, NULL, true, NULL); -} -EXPORT_SYMBOL_GPL(hva_to_pfn_atomic); - -static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async, - bool write_fault, bool *writable) +pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) { unsigned long addr; - if (async) - *async = false; - addr = gfn_to_hva(kvm, gfn); if (kvm_is_error_hva(addr)) { get_page(bad_page); return page_to_pfn(bad_page); } -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,38) - async = NULL; -#endif - return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable); -} - -pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) -{ - return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL); -} -EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic); - -pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async, - bool write_fault, bool *writable) -{ - return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable); -} -EXPORT_SYMBOL_GPL(gfn_to_pfn_async); - -pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) -{ - return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL); + return hva_to_pfn(kvm, addr); } EXPORT_SYMBOL_GPL(gfn_to_pfn); -pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, - bool *writable) +static unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn) { - return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable); + return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE); } -EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); pfn_t gfn_to_pfn_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn) { unsigned long addr = gfn_to_hva_memslot(slot, gfn); - return hva_to_pfn(kvm, addr, false, NULL, true, NULL); + return hva_to_pfn(kvm, addr); } -int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, - int nr_pages) -{ - unsigned long addr; - gfn_t entry; - - addr = gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, &entry); - if (kvm_is_error_hva(addr)) - return -1; - - if (entry < nr_pages) - return 0; - - return kvm___get_user_pages_fast(addr, nr_pages, 1, pages); -} -EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); - struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) { pfn_t pfn; @@ -1402,51 +1204,9 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, return 0; } -int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, - gpa_t gpa) -{ - struct kvm_memslots *slots = kvm_memslots(kvm); - int offset = offset_in_page(gpa); - gfn_t gfn = gpa >> PAGE_SHIFT; - - ghc->gpa = gpa; - ghc->generation = slots->generation; - ghc->memslot = __gfn_to_memslot(slots, 
gfn); - ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL); - if (!kvm_is_error_hva(ghc->hva)) - ghc->hva += offset; - else - return -EFAULT; - - return 0; -} -EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); - -int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, - void *data, unsigned long len) -{ - struct kvm_memslots *slots = kvm_memslots(kvm); - int r; - - if (slots->generation != ghc->generation) - kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa); - - if (kvm_is_error_hva(ghc->hva)) - return -EFAULT; - - r = copy_to_user((void *)ghc->hva, data, len); - if (r) - return -EFAULT; - mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT); - - return 0; -} -EXPORT_SYMBOL_GPL(kvm_write_guest_cached); - int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) { - return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page, - offset, len); + return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len); } EXPORT_SYMBOL_GPL(kvm_clear_guest_page); @@ -1469,24 +1229,24 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) } EXPORT_SYMBOL_GPL(kvm_clear_guest); -void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot, - gfn_t gfn) +void mark_page_dirty(struct kvm *kvm, gfn_t gfn) { + struct kvm_memory_slot *memslot; + + gfn = unalias_gfn(kvm, gfn); + memslot = gfn_to_memslot_unaliased(kvm, gfn); if (memslot && memslot->dirty_bitmap) { unsigned long rel_gfn = gfn - memslot->base_gfn; + unsigned long *p = memslot->dirty_bitmap + + rel_gfn / BITS_PER_LONG; + int offset = rel_gfn % BITS_PER_LONG; - generic___set_le_bit(rel_gfn, memslot->dirty_bitmap); + /* avoid RMW */ + if (!generic_test_le_bit(offset, p)) + generic___set_le_bit(offset, p); } } -void mark_page_dirty(struct kvm *kvm, gfn_t gfn) -{ - struct kvm_memory_slot *memslot; - - memslot = gfn_to_memslot(kvm, gfn); - mark_page_dirty_in_slot(kvm, memslot, gfn); -} - /* * The vCPU has executed a HLT instruction with in-kernel mode enabled. */ @@ -1498,7 +1258,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); if (kvm_arch_vcpu_runnable(vcpu)) { - kvm_make_request(KVM_REQ_UNHALT, vcpu); + set_bit(KVM_REQ_UNHALT, &vcpu->requests); break; } if (kvm_cpu_has_pending_timer(vcpu)) @@ -1558,7 +1318,7 @@ static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } static struct vm_operations_struct kvm_vcpu_vm_ops = { - .fault = kvm_vcpu_fault, + .VMA_OPS_FAULT(fault) = VMA_OPS_FAULT_FUNC(kvm_vcpu_fault), }; static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) @@ -1580,9 +1340,6 @@ static struct file_operations kvm_vcpu_fops = { .unlocked_ioctl = kvm_vcpu_ioctl, .compat_ioctl = kvm_vcpu_ioctl, .mmap = kvm_vcpu_mmap, -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,35) - .llseek = noop_llseek, -#endif }; /* @@ -1590,7 +1347,7 @@ static struct file_operations kvm_vcpu_fops = { */ static int create_vcpu_fd(struct kvm_vcpu *vcpu) { - return kvm_anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR); + return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR); } /* @@ -1672,25 +1429,12 @@ static long kvm_vcpu_ioctl(struct file *filp, if (vcpu->kvm->mm != current->mm) return -EIO; - -#if defined(CONFIG_S390) || defined(CONFIG_PPC) - /* - * Special cases: vcpu ioctls that are asynchronous to vcpu execution, - * so vcpu_load() would break it. 
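In the mark_page_dirty() hunk above, the gfn is first made relative to its memslot, then split into a word index and a bit offset, and the bit is only written when it is not already set so the common case avoids a read-modify-write store. The userspace sketch below mirrors that arithmetic; it leaves out the little-endian bit helpers (generic_test_le_bit() and friends) that the kernel uses, which is a deliberate simplification.

#define BITS_PER_LONG (8 * sizeof(unsigned long))

/* Mark one guest page dirty in a slot-local bitmap. */
void mark_dirty(unsigned long *bitmap, unsigned long base_gfn, unsigned long gfn)
{
	unsigned long rel_gfn = gfn - base_gfn;                  /* page index inside the slot */
	unsigned long *word = bitmap + rel_gfn / BITS_PER_LONG;  /* which long holds the bit   */
	unsigned long bit = 1UL << (rel_gfn % BITS_PER_LONG);    /* bit inside that long       */

	if (!(*word & bit))      /* skip the store if the page is already marked */
		*word |= bit;
}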
- */ - if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT) - return kvm_arch_vcpu_ioctl(filp, ioctl, arg); -#endif - - - vcpu_load(vcpu); switch (ioctl) { case KVM_RUN: r = -EINVAL; if (arg) goto out; r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); - trace_kvm_userspace_exit(vcpu->run->exit_reason, r); break; case KVM_GET_REGS: { struct kvm_regs *kvm_regs; @@ -1827,7 +1571,7 @@ out_free2: goto out; p = &sigset; } - r = kvm_vcpu_ioctl_set_sigmask(vcpu, p); + r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); break; } case KVM_GET_FPU: { @@ -1862,7 +1606,6 @@ out_free2: r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); } out: - vcpu_put(vcpu); kfree(fpu); kfree(kvm_sregs); return r; @@ -1913,6 +1656,7 @@ static long kvm_vm_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&zone, argp, sizeof zone)) goto out; + r = -ENXIO; r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); if (r) goto out; @@ -1924,6 +1668,7 @@ static long kvm_vm_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&zone, argp, sizeof zone)) goto out; + r = -ENXIO; r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); if (r) goto out; @@ -2041,7 +1786,7 @@ static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } static struct vm_operations_struct kvm_vm_vm_ops = { - .fault = kvm_vm_fault, + .VMA_OPS_FAULT(fault) = VMA_OPS_FAULT_FUNC(kvm_vm_fault), }; static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma) @@ -2057,31 +1802,21 @@ static struct file_operations kvm_vm_fops = { .compat_ioctl = kvm_vm_compat_ioctl, #endif .mmap = kvm_vm_mmap, -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,35) - .llseek = noop_llseek, -#endif }; static int kvm_dev_ioctl_create_vm(void) { - int r; + int fd; struct kvm *kvm; kvm = kvm_create_vm(); if (IS_ERR(kvm)) return PTR_ERR(kvm); -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET - r = kvm_coalesced_mmio_init(kvm); - if (r < 0) { - kvm_put_kvm(kvm); - return r; - } -#endif - r = kvm_anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); - if (r < 0) + fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); + if (fd < 0) kvm_put_kvm(kvm); - return r; + return fd; } static long kvm_dev_ioctl_check_extension_generic(long arg) @@ -2153,9 +1888,6 @@ out: static struct file_operations kvm_chardev_ops = { .unlocked_ioctl = kvm_dev_ioctl, .compat_ioctl = kvm_dev_ioctl, -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,35) - .llseek = noop_llseek, -#endif }; static struct miscdevice kvm_dev = { @@ -2164,7 +1896,7 @@ static struct miscdevice kvm_dev = { &kvm_chardev_ops, }; -static void hardware_enable_nolock(void *junk) +static void hardware_enable(void *junk) { int cpu = raw_smp_processor_id(); int r; @@ -2184,14 +1916,7 @@ static void hardware_enable_nolock(void *junk) } } -static void hardware_enable(void *junk) -{ - spin_lock(&kvm_lock); - hardware_enable_nolock(junk); - spin_unlock(&kvm_lock); -} - -static void hardware_disable_nolock(void *junk) +static void hardware_disable(void *junk) { int cpu = raw_smp_processor_id(); @@ -2201,20 +1926,13 @@ static void hardware_disable_nolock(void *junk) kvm_arch_hardware_disable(NULL); } -static void hardware_disable(void *junk) -{ - spin_lock(&kvm_lock); - hardware_disable_nolock(junk); - spin_unlock(&kvm_lock); -} - static void hardware_disable_all_nolock(void) { BUG_ON(!kvm_usage_count); kvm_usage_count--; if (!kvm_usage_count) - kvm_on_each_cpu(hardware_disable_nolock, NULL, 1); + kvm_on_each_cpu(hardware_disable, NULL, 1); } static void hardware_disable_all(void) @@ -2233,7 +1951,7 @@ static int hardware_enable_all(void) 
kvm_usage_count++; if (kvm_usage_count == 1) { atomic_set(&hardware_enable_failed, 0); - kvm_on_each_cpu(hardware_enable_nolock, NULL, 1); + kvm_on_each_cpu(hardware_enable, NULL, 1); if (atomic_read(&hardware_enable_failed)) { hardware_disable_all_nolock(); @@ -2261,30 +1979,31 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, cpu); hardware_disable(NULL); break; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,28) - case CPU_STARTING: -#else + case CPU_UP_CANCELED: + printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", + cpu); + smp_call_function_single(cpu, hardware_disable, NULL, 1); + break; case CPU_ONLINE: -#endif printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", cpu); -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,28) - hardware_enable(NULL); -#else smp_call_function_single(cpu, hardware_enable, NULL, 1); -#endif break; } return NOTIFY_OK; } -asmlinkage void kvm_spurious_fault(void) +asmlinkage void kvm_handle_fault_on_reboot(void) { + if (kvm_rebooting) + /* spin while reset goes on */ + while (true) + ; /* Fault while not rebooting. We want the trace. */ BUG(); } -EXPORT_SYMBOL_GPL(kvm_spurious_fault); +EXPORT_SYMBOL_GPL(kvm_handle_fault_on_reboot); static int kvm_reboot(struct notifier_block *notifier, unsigned long val, void *v) @@ -2297,7 +2016,7 @@ static int kvm_reboot(struct notifier_block *notifier, unsigned long val, */ printk(KERN_INFO "kvm: exiting hardware virtualization\n"); kvm_rebooting = true; - kvm_on_each_cpu(hardware_disable_nolock, NULL, 1); + kvm_on_each_cpu(hardware_disable, NULL, 1); return NOTIFY_OK; } @@ -2323,9 +2042,7 @@ int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, const void *val) { int i; - struct kvm_io_bus *bus; - - bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); + struct kvm_io_bus *bus = rcu_dereference(kvm->buses[bus_idx]); for (i = 0; i < bus->dev_count; i++) if (!kvm_iodevice_write(bus->devs[i], addr, len, val)) return 0; @@ -2337,9 +2054,8 @@ int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, void *val) { int i; - struct kvm_io_bus *bus; + struct kvm_io_bus *bus = rcu_dereference(kvm->buses[bus_idx]); - bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); for (i = 0; i < bus->dev_count; i++) if (!kvm_iodevice_read(bus->devs[i], addr, len, val)) return 0; @@ -2403,6 +2119,7 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, static struct notifier_block kvm_cpu_notifier = { .notifier_call = kvm_cpu_hotplug, + .priority = 20, /* must be > scheduler priority */ }; static int __vm_stat_get(void *_offset, u64 *val) @@ -2469,16 +2186,14 @@ static void kvm_exit_debug(void) static int kvm_suspend(struct sys_device *dev, pm_message_t state) { if (kvm_usage_count) - hardware_disable_nolock(NULL); + hardware_disable(NULL); return 0; } static int kvm_resume(struct sys_device *dev) { - if (kvm_usage_count) { - WARN_ON(spin_is_locked(&kvm_lock)); - hardware_enable_nolock(NULL); - } + if (kvm_usage_count) + hardware_enable(NULL); return 0; } @@ -2518,17 +2233,22 @@ static void kvm_sched_out(struct preempt_notifier *pn, kvm_fire_urn(); } -int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, +int kvm_init(void *opaque, unsigned int vcpu_size, struct module *module) { int r; int cpu; - r = kvm_init_srcu(); + r = kvm_init_anon_inodes(); if (r) return r; + r = kvm_init_srcu(); + if (r) + goto cleanup_anon_inodes; + preempt_notifier_sys_init(); + hrtimer_kallsyms_resolve(); r = kvm_arch_init(opaque); if (r) 
@@ -2543,24 +2263,6 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 
 	bad_pfn = page_to_pfn(bad_page);
 
-	hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-
-	if (hwpoison_page == NULL) {
-		r = -ENOMEM;
-		goto out_free_0;
-	}
-
-	hwpoison_pfn = page_to_pfn(hwpoison_page);
-
-	fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-
-	if (fault_page == NULL) {
-		r = -ENOMEM;
-		goto out_free_0;
-	}
-
-	fault_pfn = page_to_pfn(fault_page);
-
 	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
 		r = -ENOMEM;
 		goto out_free_0;
@@ -2592,19 +2294,14 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 		goto out_free_4;
 
 	/* A kmem cache lets us meet the alignment requirements of fx_save. */
-	if (!vcpu_align)
-		vcpu_align = __alignof__(struct kvm_vcpu);
-	kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align,
+	kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
+					   __alignof__(struct kvm_vcpu),
 					   0, NULL);
 	if (!kvm_vcpu_cache) {
 		r = -ENOMEM;
 		goto out_free_5;
 	}
 
-	r = kvm_async_pf_init();
-	if (r)
-		goto out_free;
-
 	kvm_chardev_ops.owner = module;
 IF_ANON_INODES_DOES_REFCOUNTS(	kvm_vm_fops.owner = module;)
 IF_ANON_INODES_DOES_REFCOUNTS(	kvm_vcpu_fops.owner = module;)
@@ -2612,7 +2309,7 @@ IF_ANON_INODES_DOES_REFCOUNTS(	kvm_vcpu_fops.owner = module;)
 	r = misc_register(&kvm_dev);
 	if (r) {
 		printk(KERN_ERR "kvm: misc device register failed\n");
-		goto out_unreg;
+		goto out_free;
 	}
 
 	kvm_preempt_ops.sched_in = kvm_sched_in;
@@ -2620,14 +2317,12 @@ IF_ANON_INODES_DOES_REFCOUNTS(	kvm_vcpu_fops.owner = module;)
 
 	kvm_init_debug();
 
-	printk("loaded kvm module (kvm-kmod-2.6.38-rc7)\n");
+	printk("loaded kvm module (kvm-kmod-2.6.34)\n");
 
 	kvm_clock_warn_suspend_bug();
 
 	return 0;
 
-out_unreg:
-	kvm_async_pf_deinit();
 out_free:
 	kmem_cache_destroy(kvm_vcpu_cache);
 out_free_5:
@@ -2643,37 +2338,35 @@ out_free_1:
 out_free_0a:
 	free_cpumask_var(cpus_hardware_enabled);
 out_free_0:
-	if (fault_page)
-		__free_page(fault_page);
-	if (hwpoison_page)
-		__free_page(hwpoison_page);
 	__free_page(bad_page);
 out:
 	kvm_arch_exit();
 out_fail:
 	preempt_notifier_sys_exit();
 	kvm_exit_srcu();
+cleanup_anon_inodes:
+	kvm_exit_anon_inodes();
 	return r;
 }
 EXPORT_SYMBOL_GPL(kvm_init);
 
 void kvm_exit(void)
 {
+	tracepoint_synchronize_unregister();
 	kvm_exit_debug();
 	misc_deregister(&kvm_dev);
 	kmem_cache_destroy(kvm_vcpu_cache);
-	kvm_async_pf_deinit();
 	sysdev_unregister(&kvm_sysdev);
 	sysdev_class_unregister(&kvm_sysdev_class);
 	unregister_reboot_notifier(&kvm_reboot_notifier);
 	unregister_cpu_notifier(&kvm_cpu_notifier);
-	kvm_on_each_cpu(hardware_disable_nolock, NULL, 1);
+	kvm_on_each_cpu(hardware_disable, NULL, 1);
 	kvm_arch_hardware_unsetup();
 	kvm_arch_exit();
 	free_cpumask_var(cpus_hardware_enabled);
-	__free_page(hwpoison_page);
 	__free_page(bad_page);
 	preempt_notifier_sys_exit();
 	kvm_exit_srcu();
+	kvm_exit_anon_inodes();
 }
 EXPORT_SYMBOL_GPL(kvm_exit);
diff --git a/linux/x86/kvm_timer.h b/linux/x86/kvm_timer.h
index 64bc6ea..55c7524 100644
--- a/linux/x86/kvm_timer.h
+++ b/linux/x86/kvm_timer.h
@@ -10,7 +10,9 @@ struct kvm_timer {
 };
 
 struct kvm_timer_ops {
-	bool (*is_periodic)(struct kvm_timer *);
+	bool (*is_periodic)(struct kvm_timer *);
 };
 
+enum hrtimer_restart kvm_timer_fn(struct hrtimer *data);
+
diff --git a/linux/x86/lapic.c b/linux/x86/lapic.c
index d3ddaa4..890f40f 100644
--- a/linux/x86/lapic.c
+++ b/linux/x86/lapic.c
@@ -45,7 +45,6 @@
  * Copyright (C) 2006 Qumranet, Inc.
  * Copyright (C) 2007 Novell
  * Copyright (C) 2007 Intel
- * Copyright 2009 Red Hat, Inc.
and/or its affiliates. * * Authors: * Dor Laor <dor.laor@qumranet.com> @@ -299,10 +298,9 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic) static void apic_update_ppr(struct kvm_lapic *apic) { - u32 tpr, isrv, ppr, old_ppr; + u32 tpr, isrv, ppr; int isr; - old_ppr = apic_get_reg(apic, APIC_PROCPRI); tpr = apic_get_reg(apic, APIC_TASKPRI); isr = apic_find_highest_isr(apic); isrv = (isr != -1) ? isr : 0; @@ -315,11 +313,7 @@ static void apic_update_ppr(struct kvm_lapic *apic) apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x", apic, ppr, isr, isrv); - if (old_ppr != ppr) { - apic_set_reg(apic, APIC_PROCPRI, ppr); - if (ppr < old_ppr) - kvm_make_request(KVM_REQ_EVENT, apic->vcpu); - } + apic_set_reg(apic, APIC_PROCPRI, ppr); } static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) @@ -374,7 +368,7 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, "dest_mode 0x%x, short_hand 0x%x\n", target, source, dest, dest_mode, short_hand); - ASSERT(target); + ASSERT(!target); switch (short_hand) { case APIC_DEST_NOSHORT: if (dest_mode == 0) @@ -436,7 +430,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, break; } - kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); break; @@ -462,7 +455,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, "INIT on a runnable vcpu %d\n", vcpu->vcpu_id); vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; - kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); } else { apic_debug("Ignoring de-assert INIT to vcpu %d\n", @@ -477,7 +469,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, result = 1; vcpu->arch.sipi_vector = vector; vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; - kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); } break; @@ -523,7 +514,6 @@ static void apic_set_eoi(struct kvm_lapic *apic) trigger_mode = IOAPIC_EDGE_TRIG; if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); - kvm_make_request(KVM_REQ_EVENT, apic->vcpu); } static void apic_send_ipi(struct kvm_lapic *apic) @@ -583,7 +573,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write) struct kvm_vcpu *vcpu = apic->vcpu; struct kvm_run *run = vcpu->run; - kvm_make_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu); + set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests); run->tpr_access.rip = kvm_rip_read(vcpu); run->tpr_access.is_write = write; } @@ -726,7 +716,7 @@ static void start_apic_timer(struct kvm_lapic *apic) apic->lapic_timer.period = NSEC_PER_MSEC/2; } - hrtimer_start(&apic->lapic_timer.timer, + hrtimer_start_p(&apic->lapic_timer.timer, ktime_add_ns(now, apic->lapic_timer.period), HRTIMER_MODE_ABS); @@ -841,7 +831,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) break; case APIC_TMICT: - hrtimer_cancel(&apic->lapic_timer.timer); + hrtimer_cancel_p(&apic->lapic_timer.timer); apic_set_reg(apic, APIC_TMICT, val); start_apic_timer(apic); break; @@ -913,7 +903,7 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) if (!vcpu->arch.apic) return; - hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer); + hrtimer_cancel_p(&vcpu->arch.apic->lapic_timer.timer); if (vcpu->arch.apic->regs_page) __free_page(vcpu->arch.apic->regs_page); @@ -989,7 +979,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) ASSERT(apic != NULL); /* Stop the timer in case it's a reset to an active apic */ - hrtimer_cancel(&apic->lapic_timer.timer); + hrtimer_cancel_p(&apic->lapic_timer.timer); 
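/*
 * Editor's note -- illustrative sketch, not part of the patch. The lapic.c
 * hunks above switch hrtimer_cancel()/hrtimer_start() calls over to
 * hrtimer_cancel_p()/hrtimer_start_p(): kvm-kmod calls the timer API through
 * function pointers resolved at load time (see the hrtimer_kallsyms_resolve()
 * call added to kvm_init() earlier in this diff). A rough user-space analogue
 * of "resolve a symbol by name once, then always call through the _p pointer",
 * using dlsym() on the already-loaded libc instead of kallsyms; all names here
 * are stand-ins.
 */
#define _GNU_SOURCE
#include <dlfcn.h>
#include <stdio.h>
#include <stdlib.h>

/* Pointer the rest of the code calls instead of the direct symbol. */
static int (*puts_p)(const char *);

static void resolve_symbols(void)
{
	/* RTLD_DEFAULT searches the objects already mapped into the process,
	 * much as kallsyms searches the running kernel's symbol table. */
	puts_p = (int (*)(const char *))dlsym(RTLD_DEFAULT, "puts");
	if (!puts_p) {
		fprintf(stderr, "symbol lookup failed: %s\n", dlerror());
		exit(1);
	}
}

int main(void)
{
	resolve_symbols();
	puts_p("called through a runtime-resolved pointer");
	return 0;
}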
apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24); kvm_apic_set_version(apic->vcpu); @@ -1105,16 +1095,17 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) vcpu->arch.apic = apic; - apic->regs_page = alloc_page(GFP_KERNEL|__GFP_ZERO); + apic->regs_page = alloc_page(GFP_KERNEL); if (apic->regs_page == NULL) { printk(KERN_ERR "malloc apic regs error for vcpu %x\n", vcpu->vcpu_id); goto nomem_free_apic; } apic->regs = page_address(apic->regs_page); + memset(apic->regs, 0, PAGE_SIZE); apic->vcpu = vcpu; - hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, + hrtimer_init_p(&apic->lapic_timer.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); apic->lapic_timer.timer.function = kvm_timer_fn; apic->lapic_timer.t_ops = &lapic_timer_ops; @@ -1155,11 +1146,13 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0); int r = 0; - if (!apic_hw_enabled(vcpu->arch.apic)) - r = 1; - if ((lvt0 & APIC_LVT_MASKED) == 0 && - GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) - r = 1; + if (kvm_vcpu_is_bsp(vcpu)) { + if (!apic_hw_enabled(vcpu->arch.apic)) + r = 1; + if ((lvt0 & APIC_LVT_MASKED) == 0 && + GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) + r = 1; + } return r; } @@ -1196,11 +1189,10 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) kvm_apic_set_version(vcpu); apic_update_ppr(apic); - hrtimer_cancel(&apic->lapic_timer.timer); + hrtimer_cancel_p(&apic->lapic_timer.timer); update_divide_count(apic); start_apic_timer(apic); apic->irr_pending = true; - kvm_make_request(KVM_REQ_EVENT, vcpu); } void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) @@ -1212,7 +1204,7 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) return; timer = &apic->lapic_timer.timer; - if (hrtimer_cancel(timer)) + if (hrtimer_cancel_p(timer)) kvm_hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } diff --git a/linux/x86/mmu.c b/linux/x86/mmu.c index 9713914..a1d4d0c 100644 --- a/linux/x86/mmu.c +++ b/linux/x86/mmu.c @@ -47,7 +47,6 @@ * MMU support * * Copyright (C) 2006 Qumranet, Inc. - * Copyright 2010 Red Hat, Inc. and/or its affiliates. 
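/*
 * Editor's note -- illustrative sketch, not part of the patch. The
 * kvm_create_lapic() hunk above allocates the vcpu's APIC register page with
 * plain alloc_page(GFP_KERNEL) and zeroes it with memset() instead of relying
 * on __GFP_ZERO. The local APIC state is just a page of 32-bit registers read
 * and written by byte offset through apic_get_reg()/apic_set_reg(). A small
 * user-space model of that layout; malloc stands in for alloc_page() and the
 * two offsets mirror the x86 LAPIC register map.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define REG_PAGE_SIZE 4096
#define APIC_TASKPRI  0x80   /* task priority register offset */
#define APIC_TMICT    0x380  /* timer initial count offset */

struct lapic {
	uint8_t *regs;           /* one zeroed page of register state */
};

static uint32_t apic_get_reg(struct lapic *apic, unsigned off)
{
	return *(uint32_t *)(apic->regs + off);
}

static void apic_set_reg(struct lapic *apic, unsigned off, uint32_t val)
{
	*(uint32_t *)(apic->regs + off) = val;
}

int main(void)
{
	struct lapic apic;

	/* malloc + memset plays the role of alloc_page() + memset(). */
	apic.regs = malloc(REG_PAGE_SIZE);
	if (!apic.regs)
		return 1;
	memset(apic.regs, 0, REG_PAGE_SIZE);

	apic_set_reg(&apic, APIC_TMICT, 100000);
	printf("TPR=%u TMICT=%u\n",
	       apic_get_reg(&apic, APIC_TASKPRI),
	       apic_get_reg(&apic, APIC_TMICT));

	free(apic.regs);
	return 0;
}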
* * Authors: * Yaniv Kamay <yaniv@qumranet.com> @@ -58,11 +57,9 @@ * */ -#include "irq.h" #include "mmu.h" #include "x86.h" #include "kvm_cache_regs.h" -#include "x86.h" #include <linux/kvm_host.h> #include <asm/types.h> @@ -75,7 +72,6 @@ #include <linux/srcu.h> #include <linux/slab.h> -#include <linux/uaccess.h> #include <asm/page.h> #include <asm/cmpxchg.h> @@ -91,25 +87,15 @@ */ bool tdp_enabled = false; -enum { - AUDIT_PRE_PAGE_FAULT, - AUDIT_POST_PAGE_FAULT, - AUDIT_PRE_PTE_WRITE, - AUDIT_POST_PTE_WRITE, - AUDIT_PRE_SYNC, - AUDIT_POST_SYNC -}; +#undef MMU_DEBUG -char *audit_point_name[] = { - "pre page fault", - "post page fault", - "pre pte write", - "post pte write", - "pre sync", - "post sync" -}; +#undef AUDIT -#undef MMU_DEBUG +#ifdef AUDIT +static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg); +#else +static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {} +#endif #ifdef MMU_DEBUG @@ -123,7 +109,7 @@ char *audit_point_name[] = { #endif -#ifdef MMU_DEBUG +#if defined(MMU_DEBUG) || defined(AUDIT) static int dbg = 0; module_param(dbg, bool, 0644); #endif @@ -141,11 +127,11 @@ module_param(oos_shadow, bool, 0644); } #endif -#define PTE_PREFETCH_NUM 8 - #define PT_FIRST_AVAIL_BITS_SHIFT 9 #define PT64_SECOND_AVAIL_BITS_SHIFT 52 +#define VALID_PAGE(x) ((x) != INVALID_PAGE) + #define PT64_LEVEL_BITS 9 #define PT64_LEVEL_SHIFT(level) \ @@ -202,6 +188,7 @@ module_param(oos_shadow, bool, 0644); #include <trace/events/kvm.h> +#undef TRACE_INCLUDE_FILE #define CREATE_TRACE_POINTS #include "mmutrace.h" @@ -227,15 +214,20 @@ struct kvm_shadow_walk_iterator { shadow_walk_okay(&(_walker)); \ shadow_walk_next(&(_walker))) -typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte); + +struct kvm_unsync_walk { + int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk); +}; + +typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp); static struct kmem_cache *pte_chain_cache; static struct kmem_cache *rmap_desc_cache; static struct kmem_cache *mmu_page_header_cache; -static struct percpu_counter kvm_total_used_mmu_pages; static u64 __read_mostly shadow_trap_nonpresent_pte; static u64 __read_mostly shadow_notrap_nonpresent_pte; +static u64 __read_mostly shadow_base_present_pte; static u64 __read_mostly shadow_nx_mask; static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ static u64 __read_mostly shadow_user_mask; @@ -254,6 +246,12 @@ void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) } EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes); +void kvm_mmu_set_base_ptes(u64 base_pte) +{ + shadow_base_present_pte = base_pte; +} +EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes); + void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, u64 dirty_mask, u64 nx_mask, u64 x_mask) { @@ -265,7 +263,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); -static bool is_write_protection(struct kvm_vcpu *vcpu) +static int is_write_protection(struct kvm_vcpu *vcpu) { return kvm_read_cr0_bits(vcpu, X86_CR0_WP); } @@ -329,70 +327,13 @@ static gfn_t pse36_gfn_delta(u32 gpte) static void __set_spte(u64 *sptep, u64 spte) { - kvm_set_64bit(sptep, spte); -} - -static u64 __xchg_spte(u64 *sptep, u64 new_spte) -{ #ifdef CONFIG_X86_64 - return xchg(sptep, new_spte); + set_64bit((unsigned long *)sptep, spte); #else - u64 old_spte; - - do { - old_spte = *sptep; - } while (cmpxchg64(sptep, old_spte, new_spte) != old_spte); - - return old_spte; + set_64bit((unsigned long 
long *)sptep, spte); #endif } -static bool spte_has_volatile_bits(u64 spte) -{ - if (!shadow_accessed_mask) - return false; - - if (!is_shadow_present_pte(spte)) - return false; - - if ((spte & shadow_accessed_mask) && - (!is_writable_pte(spte) || (spte & shadow_dirty_mask))) - return false; - - return true; -} - -static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask) -{ - return (old_spte & bit_mask) && !(new_spte & bit_mask); -} - -static void update_spte(u64 *sptep, u64 new_spte) -{ - u64 mask, old_spte = *sptep; - - WARN_ON(!is_rmap_spte(new_spte)); - - new_spte |= old_spte & shadow_dirty_mask; - - mask = shadow_accessed_mask; - if (is_writable_pte(old_spte)) - mask |= shadow_dirty_mask; - - if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask) - __set_spte(sptep, new_spte); - else - old_spte = __xchg_spte(sptep, new_spte); - - if (!shadow_accessed_mask) - return; - - if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) - kvm_set_pfn_accessed(spte_to_pfn(old_spte)); - if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) - kvm_set_pfn_dirty(spte_to_pfn(old_spte)); -} - static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, struct kmem_cache *base_cache, int min) { @@ -409,11 +350,10 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, return 0; } -static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc, - struct kmem_cache *cache) +static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) { while (mc->nobjs) - kmem_cache_free(cache, mc->objects[--mc->nobjs]); + kfree(mc->objects[--mc->nobjs]); } static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, @@ -427,6 +367,7 @@ static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, page = alloc_page(GFP_KERNEL); if (!page) return -ENOMEM; + set_page_private(page, 0); cache->objects[cache->nobjs++] = page_address(page); } return 0; @@ -447,7 +388,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) if (r) goto out; r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, - rmap_desc_cache, 4 + PTE_PREFETCH_NUM); + rmap_desc_cache, 4); if (r) goto out; r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); @@ -461,11 +402,10 @@ out: static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) { - mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache); - mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache); + mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache); + mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache); mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); - mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache, - mmu_page_header_cache); + mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); } static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, @@ -486,7 +426,7 @@ static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu) static void mmu_free_pte_chain(struct kvm_pte_chain *pc) { - kmem_cache_free(pte_chain_cache, pc); + kfree(pc); } static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) @@ -497,66 +437,53 @@ static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) { - kmem_cache_free(rmap_desc_cache, rd); -} - -static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) -{ - if (!sp->role.direct) - return sp->gfns[index]; - - return sp->gfn + (index << ((sp->role.level - 1) * 
PT64_LEVEL_BITS)); -} - -static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) -{ - if (sp->role.direct) - BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index)); - else - sp->gfns[index] = gfn; + kfree(rd); } /* - * Return the pointer to the large page information for a given gfn, - * handling slots that are not large page aligned. + * Return the pointer to the largepage write count for a given + * gfn, handling slots that are not large page aligned. */ -static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, - struct kvm_memory_slot *slot, - int level) +static int *slot_largepage_idx(gfn_t gfn, + struct kvm_memory_slot *slot, + int level) { unsigned long idx; - idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - - (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); - return &slot->lpage_info[level - 2][idx]; + idx = (gfn / KVM_PAGES_PER_HPAGE(level)) - + (slot->base_gfn / KVM_PAGES_PER_HPAGE(level)); + return &slot->lpage_info[level - 2][idx].write_count; } static void account_shadowed(struct kvm *kvm, gfn_t gfn) { struct kvm_memory_slot *slot; - struct kvm_lpage_info *linfo; + int *write_count; int i; - slot = gfn_to_memslot(kvm, gfn); + gfn = unalias_gfn(kvm, gfn); + + slot = gfn_to_memslot_unaliased(kvm, gfn); for (i = PT_DIRECTORY_LEVEL; i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { - linfo = lpage_info_slot(gfn, slot, i); - linfo->write_count += 1; + write_count = slot_largepage_idx(gfn, slot, i); + *write_count += 1; } } static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) { struct kvm_memory_slot *slot; - struct kvm_lpage_info *linfo; + int *write_count; int i; - slot = gfn_to_memslot(kvm, gfn); + gfn = unalias_gfn(kvm, gfn); for (i = PT_DIRECTORY_LEVEL; i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { - linfo = lpage_info_slot(gfn, slot, i); - linfo->write_count -= 1; - WARN_ON(linfo->write_count < 0); + slot = gfn_to_memslot_unaliased(kvm, gfn); + write_count = slot_largepage_idx(gfn, slot, i); + *write_count -= 1; + WARN_ON(*write_count < 0); } } @@ -565,12 +492,13 @@ static int has_wrprotected_page(struct kvm *kvm, int level) { struct kvm_memory_slot *slot; - struct kvm_lpage_info *linfo; + int *largepage_idx; - slot = gfn_to_memslot(kvm, gfn); + gfn = unalias_gfn(kvm, gfn); + slot = gfn_to_memslot_unaliased(kvm, gfn); if (slot) { - linfo = lpage_info_slot(gfn, slot, level); - return linfo->write_count; + largepage_idx = slot_largepage_idx(gfn, slot, level); + return *largepage_idx; } return 1; @@ -594,18 +522,14 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn) return ret; } -static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn) +static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) { struct kvm_memory_slot *slot; + int host_level, level, max_level; + slot = gfn_to_memslot(vcpu->kvm, large_gfn); if (slot && slot->dirty_bitmap) - return true; - return false; -} - -static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) -{ - int host_level, level, max_level; + return PT_PAGE_TABLE_LEVEL; host_level = host_mapping_level(vcpu->kvm, large_gfn); @@ -624,20 +548,22 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) /* * Take gfn and return the reverse mapping to it. 
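/*
 * Editor's note -- illustrative sketch, not part of the patch. The hunks above
 * go back to indexing a memslot's per-level write_count array with
 * (gfn / KVM_PAGES_PER_HPAGE(level)) - (base_gfn / KVM_PAGES_PER_HPAGE(level)),
 * which account_shadowed()/unaccount_shadowed() bump so that huge-page
 * mappings are refused over write-protected ranges. A minimal user-space model
 * of that indexing for a single level (2MB pages); the sizes and structure are
 * stand-ins for the kernel's lpage_info bookkeeping.
 */
#include <stdio.h>
#include <stdlib.h>

#define PAGES_PER_HPAGE 512   /* 4KB pages per 2MB page */

struct memslot {
	unsigned long base_gfn;
	unsigned long npages;
	int *write_count;         /* one counter per 2MB region in the slot */
};

static int *slot_largepage_idx(struct memslot *slot, unsigned long gfn)
{
	unsigned long idx = (gfn / PAGES_PER_HPAGE) -
			    (slot->base_gfn / PAGES_PER_HPAGE);
	return &slot->write_count[idx];
}

static void account_shadowed(struct memslot *slot, unsigned long gfn)
{
	/* This 2MB region now contains a shadowed (write-protected) 4KB page. */
	*slot_largepage_idx(slot, gfn) += 1;
}

static int has_wrprotected_page(struct memslot *slot, unsigned long gfn)
{
	return *slot_largepage_idx(slot, gfn) != 0;
}

int main(void)
{
	struct memslot slot = { .base_gfn = 0x100000, .npages = 4096 };
	unsigned long nregions = (slot.npages + PAGES_PER_HPAGE - 1) / PAGES_PER_HPAGE;

	slot.write_count = calloc(nregions, sizeof(int));
	account_shadowed(&slot, 0x100203);     /* a gfn inside the slot */
	printf("region write-protected? %d\n",
	       has_wrprotected_page(&slot, 0x100200));
	free(slot.write_count);
	return 0;
}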
+ * Note: gfn must be unaliased before this function get called */ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) { struct kvm_memory_slot *slot; - struct kvm_lpage_info *linfo; + unsigned long idx; slot = gfn_to_memslot(kvm, gfn); if (likely(level == PT_PAGE_TABLE_LEVEL)) return &slot->rmap[gfn - slot->base_gfn]; - linfo = lpage_info_slot(gfn, slot, level); + idx = (gfn / KVM_PAGES_PER_HPAGE(level)) - + (slot->base_gfn / KVM_PAGES_PER_HPAGE(level)); - return &linfo->rmap_pde; + return &slot->lpage_info[level - 2][idx].rmap_pde; } /* @@ -662,8 +588,9 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) if (!is_rmap_spte(*spte)) return count; + gfn = unalias_gfn(vcpu->kvm, gfn); sp = page_header(__pa(spte)); - kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); + sp->gfns[spte - sp->spt] = gfn; rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); if (!*rmapp) { rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); @@ -674,7 +601,6 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) desc->sptes[0] = (u64 *)*rmapp; desc->sptes[1] = spte; *rmapp = (unsigned long)desc | 1; - ++count; } else { rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); @@ -687,7 +613,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) desc = desc->more; } for (i = 0; desc->sptes[i]; ++i) - ++count; + ; desc->sptes[i] = spte; } return count; @@ -721,25 +647,32 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) struct kvm_rmap_desc *desc; struct kvm_rmap_desc *prev_desc; struct kvm_mmu_page *sp; - gfn_t gfn; + pfn_t pfn; unsigned long *rmapp; int i; + if (!is_rmap_spte(*spte)) + return; sp = page_header(__pa(spte)); - gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); - rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); + pfn = spte_to_pfn(*spte); + if (*spte & shadow_accessed_mask) + kvm_set_pfn_accessed(pfn); + if (is_writable_pte(*spte)) + kvm_set_pfn_dirty(pfn); + rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level); if (!*rmapp) { - printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte); + printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); BUG(); } else if (!(*rmapp & 1)) { - rmap_printk("rmap_remove: %p 1->0\n", spte); + rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte); if ((u64 *)*rmapp != spte) { - printk(KERN_ERR "rmap_remove: %p 1->BUG\n", spte); + printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n", + spte, *spte); BUG(); } *rmapp = 0; } else { - rmap_printk("rmap_remove: %p many->many\n", spte); + rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte); desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); prev_desc = NULL; while (desc) { @@ -753,41 +686,15 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) prev_desc = desc; desc = desc->more; } - pr_err("rmap_remove: %p many->many\n", spte); + pr_err("rmap_remove: %p %llx many->many\n", spte, *spte); BUG(); } } -static int set_spte_track_bits(u64 *sptep, u64 new_spte) -{ - pfn_t pfn; - u64 old_spte = *sptep; - - if (!spte_has_volatile_bits(old_spte)) - __set_spte(sptep, new_spte); - else - old_spte = __xchg_spte(sptep, new_spte); - - if (!is_rmap_spte(old_spte)) - return 0; - - pfn = spte_to_pfn(old_spte); - if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) - kvm_set_pfn_accessed(pfn); - if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask)) - kvm_set_pfn_dirty(pfn); - return 1; -} - -static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) -{ - if (set_spte_track_bits(sptep, new_spte)) - 
rmap_remove(kvm, sptep); -} - static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) { struct kvm_rmap_desc *desc; + struct kvm_rmap_desc *prev_desc; u64 *prev_spte; int i; @@ -799,6 +706,7 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) return NULL; } desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); + prev_desc = NULL; prev_spte = NULL; while (desc) { for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) { @@ -817,6 +725,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) u64 *spte; int i, write_protected = 0; + gfn = unalias_gfn(kvm, gfn); rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL); spte = rmap_next(kvm, rmapp, NULL); @@ -825,11 +734,18 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) BUG_ON(!(*spte & PT_PRESENT_MASK)); rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); if (is_writable_pte(*spte)) { - update_spte(spte, *spte & ~PT_WRITABLE_MASK); + __set_spte(spte, *spte & ~PT_WRITABLE_MASK); write_protected = 1; } spte = rmap_next(kvm, rmapp, spte); } + if (write_protected) { + pfn_t pfn; + + spte = rmap_next(kvm, rmapp, NULL); + pfn = spte_to_pfn(*spte); + kvm_set_pfn_dirty(pfn); + } /* check for huge page mappings */ for (i = PT_DIRECTORY_LEVEL; @@ -842,9 +758,9 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); if (is_writable_pte(*spte)) { - drop_spte(kvm, spte, - shadow_trap_nonpresent_pte); + rmap_remove(kvm, spte); --kvm->stat.lpages; + __set_spte(spte, shadow_trap_nonpresent_pte); spte = NULL; write_protected = 1; } @@ -864,7 +780,8 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, while ((spte = rmap_next(kvm, rmapp, NULL))) { BUG_ON(!(*spte & PT_PRESENT_MASK)); rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); - drop_spte(kvm, spte, shadow_trap_nonpresent_pte); + rmap_remove(kvm, spte); + __set_spte(spte, shadow_trap_nonpresent_pte); need_tlb_flush = 1; } return need_tlb_flush; @@ -886,7 +803,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte); need_flush = 1; if (pte_write(*ptep)) { - drop_spte(kvm, spte, shadow_trap_nonpresent_pte); + rmap_remove(kvm, spte); + __set_spte(spte, shadow_trap_nonpresent_pte); spte = rmap_next(kvm, rmapp, NULL); } else { new_spte = *spte &~ (PT64_BASE_ADDR_MASK); @@ -894,8 +812,9 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, new_spte &= ~PT_WRITABLE_MASK; new_spte &= ~SPTE_HOST_WRITEABLE; - new_spte &= ~shadow_accessed_mask; - set_spte_track_bits(spte, new_spte); + if (is_writable_pte(*spte)) + kvm_set_pfn_dirty(spte_to_pfn(*spte)); + __set_spte(spte, new_spte); spte = rmap_next(kvm, rmapp, spte); } } @@ -915,7 +834,7 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, int retval = 0; struct kvm_memslots *slots; - slots = kvm_memslots(kvm); + slots = rcu_dereference(kvm->memslots); for (i = 0; i < slots->nmemslots; i++) { struct kvm_memory_slot *memslot = &slots->memslots[i]; @@ -925,16 +844,15 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, end = start + (memslot->npages << PAGE_SHIFT); if (hva >= start && hva < end) { gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; - gfn_t gfn = memslot->base_gfn + gfn_offset; ret = handler(kvm, &memslot->rmap[gfn_offset], data); for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { - struct 
kvm_lpage_info *linfo; - - linfo = lpage_info_slot(gfn, memslot, - PT_DIRECTORY_LEVEL + j); - ret |= handler(kvm, &linfo->rmap_pde, data); + int idx = gfn_offset; + idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j); + ret |= handler(kvm, + &memslot->lpage_info[j][idx].rmap_pde, + data); } trace_kvm_age_page(hva, memslot, ret); retval |= ret; @@ -985,35 +903,6 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, return young; } -static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, - unsigned long data) -{ - u64 *spte; - int young = 0; - - /* - * If there's no access bit in the secondary pte set by the - * hardware it's up to gup-fast/gup to set the access bit in - * the primary pte or in the page structure. - */ - if (!shadow_accessed_mask) - goto out; - - spte = rmap_next(kvm, rmapp, NULL); - while (spte) { - u64 _spte = *spte; - BUG_ON(!(_spte & PT_PRESENT_MASK)); - young = _spte & PT_ACCESSED_MASK; - if (young) { - young = 1; - break; - } - spte = rmap_next(kvm, rmapp, spte); - } -out: - return young; -} - #define RMAP_RECYCLE_THRESHOLD 1000 static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) @@ -1023,6 +912,7 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) sp = page_header(__pa(spte)); + gfn = unalias_gfn(vcpu->kvm, gfn); rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); kvm_unmap_rmapp(vcpu->kvm, rmapp, 0); @@ -1034,11 +924,6 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva) return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp); } -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) -{ - return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp); -} - #ifdef MMU_DEBUG static int is_empty_shadow_page(u64 *spt) { @@ -1055,28 +940,14 @@ static int is_empty_shadow_page(u64 *spt) } #endif -/* - * This value is the sum of all of the kvm instances's - * kvm->arch.n_used_mmu_pages values. 
We need a global, - * aggregate version in order to make the slab shrinker - * faster - */ -static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr) -{ - kvm->arch.n_used_mmu_pages += nr; - percpu_counter_add(&kvm_total_used_mmu_pages, nr); -} - static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) { ASSERT(is_empty_shadow_page(sp->spt)); - hlist_del(&sp->hash_link); list_del(&sp->link); __free_page(virt_to_page(sp->spt)); - if (!sp->role.direct) - __free_page(virt_to_page(sp->gfns)); - kmem_cache_free(mmu_page_header_cache, sp); - kvm_mod_used_mmu_pages(kvm, -1); + __free_page(virt_to_page(sp->gfns)); + kfree(sp); + ++kvm->arch.n_free_mmu_pages; } static unsigned kvm_page_table_hashfn(gfn_t gfn) @@ -1085,21 +956,20 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn) } static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, - u64 *parent_pte, int direct) + u64 *parent_pte) { struct kvm_mmu_page *sp; sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp); sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); - if (!direct) - sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, - PAGE_SIZE); + sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); + INIT_LIST_HEAD(&sp->oos_link); bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); sp->multimapped = 0; sp->parent_pte = parent_pte; - kvm_mod_used_mmu_pages(vcpu->kvm, +1); + --vcpu->kvm->arch.n_free_mmu_pages; return sp; } @@ -1178,7 +1048,9 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, BUG(); } -static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn) + +static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + mmu_parent_walk_fn fn) { struct kvm_pte_chain *pte_chain; struct hlist_node *node; @@ -1187,37 +1059,64 @@ static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn) if (!sp->multimapped && sp->parent_pte) { parent_sp = page_header(__pa(sp->parent_pte)); - fn(parent_sp, sp->parent_pte); + fn(vcpu, parent_sp); + mmu_parent_walk(vcpu, parent_sp, fn); return; } - hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { - u64 *spte = pte_chain->parent_ptes[i]; - - if (!spte) + if (!pte_chain->parent_ptes[i]) break; - parent_sp = page_header(__pa(spte)); - fn(parent_sp, spte); + parent_sp = page_header(__pa(pte_chain->parent_ptes[i])); + fn(vcpu, parent_sp); + mmu_parent_walk(vcpu, parent_sp, fn); } } -static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte); -static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) +static void kvm_mmu_update_unsync_bitmap(u64 *spte) { - mmu_parent_walk(sp, mark_unsync); + unsigned int index; + struct kvm_mmu_page *sp = page_header(__pa(spte)); + + index = spte - sp->spt; + if (!__test_and_set_bit(index, sp->unsync_child_bitmap)) + sp->unsync_children++; + WARN_ON(!sp->unsync_children); } -static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte) +static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp) { - unsigned int index; + struct kvm_pte_chain *pte_chain; + struct hlist_node *node; + int i; - index = spte - sp->spt; - if (__test_and_set_bit(index, sp->unsync_child_bitmap)) + if (!sp->parent_pte) return; - if (sp->unsync_children++) + + if (!sp->multimapped) { + kvm_mmu_update_unsync_bitmap(sp->parent_pte); return; 
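/*
 * Editor's note -- illustrative sketch, not part of the patch. The hunks above
 * restore the older parent-walk scheme: when a shadow page becomes unsync,
 * every ancestor records which of its 512 slots points at an unsync child
 * (unsync_child_bitmap) and keeps a running unsync_children count, following
 * parent_pte pointers up to the root. A simplified single-parent user-space
 * model of that bookkeeping; the structure names are stand-ins for the
 * kernel's kvm_mmu_page and its pte-chain handling is omitted.
 */
#include <stdio.h>

#define SLOTS 512

struct shadow_page {
	struct shadow_page *parent;      /* page holding the pte that maps us */
	int parent_index;                /* which slot in that parent we occupy */
	unsigned long bitmap[SLOTS / (8 * sizeof(unsigned long))];
	int unsync_children;
};

static int test_and_set_bit(unsigned long *map, int bit)
{
	unsigned long mask = 1UL << (bit % (8 * sizeof(unsigned long)));
	unsigned long *word = &map[bit / (8 * sizeof(unsigned long))];
	int was_set = (*word & mask) != 0;

	*word |= mask;
	return was_set;
}

/* In the spirit of kvm_mmu_mark_parents_unsync(): tell every ancestor that
 * the subtree under this slot now contains an unsync page. */
static void mark_parents_unsync(struct shadow_page *sp)
{
	for (; sp->parent; sp = sp->parent) {
		if (test_and_set_bit(sp->parent->bitmap, sp->parent_index))
			break;                    /* ancestors already know */
		sp->parent->unsync_children++;
	}
}

int main(void)
{
	struct shadow_page root = {0}, dir = {0}, leaf = {0};

	dir.parent = &root;  dir.parent_index = 3;
	leaf.parent = &dir;  leaf.parent_index = 42;

	mark_parents_unsync(&leaf);
	printf("root unsync children: %d, dir unsync children: %d\n",
	       root.unsync_children, dir.unsync_children);
	return 0;
}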
- kvm_mmu_mark_parents_unsync(sp); + } + + hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) + for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { + if (!pte_chain->parent_ptes[i]) + break; + kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]); + } +} + +static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + kvm_mmu_update_parents_unsync(sp); + return 1; +} + +static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp) +{ + mmu_parent_walk(vcpu, sp, unsync_walk_fn); + kvm_mmu_update_parents_unsync(sp); } static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, @@ -1276,40 +1175,35 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, int i, ret, nr_unsync_leaf = 0; for_each_unsync_children(sp->unsync_child_bitmap, i) { - struct kvm_mmu_page *child; u64 ent = sp->spt[i]; - if (!is_shadow_present_pte(ent) || is_large_pte(ent)) - goto clear_child_bitmap; - - child = page_header(ent & PT64_BASE_ADDR_MASK); - - if (child->unsync_children) { - if (mmu_pages_add(pvec, child, i)) - return -ENOSPC; - - ret = __mmu_unsync_walk(child, pvec); - if (!ret) - goto clear_child_bitmap; - else if (ret > 0) - nr_unsync_leaf += ret; - else - return ret; - } else if (child->unsync) { - nr_unsync_leaf++; - if (mmu_pages_add(pvec, child, i)) - return -ENOSPC; - } else - goto clear_child_bitmap; - - continue; + if (is_shadow_present_pte(ent) && !is_large_pte(ent)) { + struct kvm_mmu_page *child; + child = page_header(ent & PT64_BASE_ADDR_MASK); + + if (child->unsync_children) { + if (mmu_pages_add(pvec, child, i)) + return -ENOSPC; + + ret = __mmu_unsync_walk(child, pvec); + if (!ret) + __clear_bit(i, sp->unsync_child_bitmap); + else if (ret > 0) + nr_unsync_leaf += ret; + else + return ret; + } -clear_child_bitmap: - __clear_bit(i, sp->unsync_child_bitmap); - sp->unsync_children--; - WARN_ON((int)sp->unsync_children < 0); + if (child->unsync) { + nr_unsync_leaf++; + if (mmu_pages_add(pvec, child, i)) + return -ENOSPC; + } + } } + if (find_first_bit(sp->unsync_child_bitmap, 512) == 512) + sp->unsync_children = 0; return nr_unsync_leaf; } @@ -1324,44 +1218,48 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp, return __mmu_unsync_walk(sp, pvec); } +static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) +{ + unsigned index; + struct hlist_head *bucket; + struct kvm_mmu_page *sp; + struct hlist_node *node; + + pgprintk("%s: looking for gfn %lx\n", __func__, gfn); + index = kvm_page_table_hashfn(gfn); + bucket = &kvm->arch.mmu_page_hash[index]; + hlist_for_each_entry(sp, node, bucket, hash_link) + if (sp->gfn == gfn && !sp->role.direct + && !sp->role.invalid) { + pgprintk("%s: found role %x\n", + __func__, sp->role.word); + return sp; + } + return NULL; +} + static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) { WARN_ON(!sp->unsync); - trace_kvm_mmu_sync_page(sp); sp->unsync = 0; --kvm->stat.mmu_unsync; } -static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, - struct list_head *invalid_list); -static void kvm_mmu_commit_zap_page(struct kvm *kvm, - struct list_head *invalid_list); - -#define for_each_gfn_sp(kvm, sp, gfn, pos) \ - hlist_for_each_entry(sp, pos, \ - &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ - if ((sp)->gfn != (gfn)) {} else +static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp); -#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos) \ - hlist_for_each_entry(sp, pos, \ - 
&(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ - if ((sp)->gfn != (gfn) || (sp)->role.direct || \ - (sp)->role.invalid) {} else - -/* @sp->gfn should be write-protected at the call site */ -static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - struct list_head *invalid_list, bool clear_unsync) +static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { - if (sp->role.cr4_pae != !!is_pae(vcpu)) { - kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); + if (sp->role.glevels != vcpu->arch.mmu.root_level) { + kvm_mmu_zap_page(vcpu->kvm, sp); return 1; } - if (clear_unsync) - kvm_unlink_unsync_page(vcpu->kvm, sp); - + trace_kvm_mmu_sync_page(sp); + if (rmap_write_protect(vcpu->kvm, sp->gfn)) + kvm_flush_remote_tlbs(vcpu->kvm); + kvm_unlink_unsync_page(vcpu->kvm, sp); if (vcpu->arch.mmu.sync_page(vcpu, sp)) { - kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); + kvm_mmu_zap_page(vcpu->kvm, sp); return 1; } @@ -1369,52 +1267,6 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return 0; } -static int kvm_sync_page_transient(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp) -{ - LIST_HEAD(invalid_list); - int ret; - - ret = __kvm_sync_page(vcpu, sp, &invalid_list, false); - if (ret) - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); - - return ret; -} - -static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - struct list_head *invalid_list) -{ - return __kvm_sync_page(vcpu, sp, invalid_list, true); -} - -/* @gfn should be write-protected at the call site */ -static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - struct kvm_mmu_page *s; - struct hlist_node *node; - LIST_HEAD(invalid_list); - bool flush = false; - - for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { - if (!s->unsync) - continue; - - WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); - kvm_unlink_unsync_page(vcpu->kvm, s); - if ((s->role.cr4_pae != !!is_pae(vcpu)) || - (vcpu->arch.mmu.sync_page(vcpu, s))) { - kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list); - continue; - } - flush = true; - } - - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); - if (flush) - kvm_mmu_flush_tlb(vcpu); -} - struct mmu_page_path { struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1]; unsigned int idx[PT64_ROOT_LEVEL-1]; @@ -1481,7 +1333,6 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp; struct mmu_page_path parents; struct kvm_mmu_pages pages; - LIST_HEAD(invalid_list); kvm_mmu_pages_init(parent, &parents, &pages); while (mmu_unsync_walk(parent, &pages)) { @@ -1494,10 +1345,9 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, kvm_flush_remote_tlbs(vcpu->kvm); for_each_sp(pages, sp, parents, i) { - kvm_sync_page(vcpu, sp, &invalid_list); + kvm_sync_page(vcpu, sp); mmu_pages_clear_parents(&parents); } - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); cond_resched_lock(&vcpu->kvm->mmu_lock); kvm_mmu_pages_init(parent, &parents, &pages); } @@ -1512,57 +1362,50 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, u64 *parent_pte) { union kvm_mmu_page_role role; + unsigned index; unsigned quadrant; + struct hlist_head *bucket; struct kvm_mmu_page *sp; - struct hlist_node *node; - bool need_sync = false; + struct hlist_node *node, *tmp; role = vcpu->arch.mmu.base_role; role.level = level; role.direct = direct; - if (role.direct) - role.cr4_pae = 0; role.access = access; - if (!vcpu->arch.mmu.direct_map - && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { + if 
(vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; } - for_each_gfn_sp(vcpu->kvm, sp, gfn, node) { - if (!need_sync && sp->unsync) - need_sync = true; - - if (sp->role.word != role.word) - continue; - - if (sp->unsync && kvm_sync_page_transient(vcpu, sp)) - break; + index = kvm_page_table_hashfn(gfn); + bucket = &vcpu->kvm->arch.mmu_page_hash[index]; + hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link) + if (sp->gfn == gfn) { + if (sp->unsync) + if (kvm_sync_page(vcpu, sp)) + continue; - mmu_page_add_parent_pte(vcpu, sp, parent_pte); - if (sp->unsync_children) { - kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); - kvm_mmu_mark_parents_unsync(sp); - } else if (sp->unsync) - kvm_mmu_mark_parents_unsync(sp); + if (sp->role.word != role.word) + continue; - trace_kvm_mmu_get_page(sp, false); - return sp; - } + mmu_page_add_parent_pte(vcpu, sp, parent_pte); + if (sp->unsync_children) { + set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); + kvm_mmu_mark_parents_unsync(vcpu, sp); + } + trace_kvm_mmu_get_page(sp, false); + return sp; + } ++vcpu->kvm->stat.mmu_cache_miss; - sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct); + sp = kvm_mmu_alloc_page(vcpu, parent_pte); if (!sp) return sp; sp->gfn = gfn; sp->role = role; - hlist_add_head(&sp->hash_link, - &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]); + hlist_add_head(&sp->hash_link, bucket); if (!direct) { if (rmap_write_protect(vcpu->kvm, gfn)) kvm_flush_remote_tlbs(vcpu->kvm); - if (level > PT_PAGE_TABLE_LEVEL && need_sync) - kvm_sync_pages(vcpu, gfn); - account_shadowed(vcpu->kvm, gfn); } if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) @@ -1579,12 +1422,6 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, iterator->addr = addr; iterator->shadow_addr = vcpu->arch.mmu.root_hpa; iterator->level = vcpu->arch.mmu.shadow_root_level; - - if (iterator->level == PT64_ROOT_LEVEL && - vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL && - !vcpu->arch.mmu.direct_map) - --iterator->level; - if (iterator->level == PT32E_ROOT_LEVEL) { iterator->shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; @@ -1615,47 +1452,6 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) --iterator->level; } -static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) -{ - u64 spte; - - spte = __pa(sp->spt) - | PT_PRESENT_MASK | PT_ACCESSED_MASK - | PT_WRITABLE_MASK | PT_USER_MASK; - __set_spte(sptep, spte); -} - -static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) -{ - if (is_large_pte(*sptep)) { - drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); - kvm_flush_remote_tlbs(vcpu->kvm); - } -} - -static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, - unsigned direct_access) -{ - if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) { - struct kvm_mmu_page *child; - - /* - * For the direct sp, if the guest pte's dirty bit - * changed form clean to dirty, it will corrupt the - * sp's access: allow writable in the read-only sp, - * so we should update the spte at this point to get - * a new sp with the correct access. 
- */ - child = page_header(*sptep & PT64_BASE_ADDR_MASK); - if (child->role.access == direct_access) - return; - - mmu_page_remove_parent_pte(child, sptep); - __set_spte(sptep, shadow_trap_nonpresent_pte); - kvm_flush_remote_tlbs(vcpu->kvm); - } -} - static void kvm_mmu_page_unlink_children(struct kvm *kvm, struct kvm_mmu_page *sp) { @@ -1676,8 +1472,7 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm, } else { if (is_large_pte(ent)) --kvm->stat.lpages; - drop_spte(kvm, &pt[i], - shadow_trap_nonpresent_pte); + rmap_remove(kvm, &pt[i]); } } pt[i] = shadow_trap_nonpresent_pte; @@ -1719,8 +1514,7 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) } static int mmu_zap_unsync_children(struct kvm *kvm, - struct kvm_mmu_page *parent, - struct list_head *invalid_list) + struct kvm_mmu_page *parent) { int i, zapped = 0; struct mmu_page_path parents; @@ -1734,7 +1528,7 @@ static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp; for_each_sp(pages, sp, parents, i) { - kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); + kvm_mmu_zap_page(kvm, sp); mmu_pages_clear_parents(&parents); zapped++; } @@ -1744,113 +1538,110 @@ static int mmu_zap_unsync_children(struct kvm *kvm, return zapped; } -static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, - struct list_head *invalid_list) +static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) { int ret; - trace_kvm_mmu_prepare_zap_page(sp); + trace_kvm_mmu_zap_page(sp); ++kvm->stat.mmu_shadow_zapped; - ret = mmu_zap_unsync_children(kvm, sp, invalid_list); + ret = mmu_zap_unsync_children(kvm, sp); kvm_mmu_page_unlink_children(kvm, sp); kvm_mmu_unlink_parents(kvm, sp); + kvm_flush_remote_tlbs(kvm); if (!sp->role.invalid && !sp->role.direct) unaccount_shadowed(kvm, sp->gfn); if (sp->unsync) kvm_unlink_unsync_page(kvm, sp); if (!sp->root_count) { - /* Count self */ - ret++; - list_move(&sp->link, invalid_list); + hlist_del(&sp->hash_link); + kvm_mmu_free_page(kvm, sp); } else { + sp->role.invalid = 1; list_move(&sp->link, &kvm->arch.active_mmu_pages); kvm_reload_remote_mmus(kvm); } - - sp->role.invalid = 1; kvm_mmu_reset_last_pte_updated(kvm); return ret; } -static void kvm_mmu_commit_zap_page(struct kvm *kvm, - struct list_head *invalid_list) -{ - struct kvm_mmu_page *sp; - - if (list_empty(invalid_list)) - return; - - kvm_flush_remote_tlbs(kvm); - - do { - sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); - WARN_ON(!sp->role.invalid || sp->root_count); - kvm_mmu_free_page(kvm, sp); - } while (!list_empty(invalid_list)); - -} - /* * Changing the number of mmu pages allocated to the vm - * Note: if goal_nr_mmu_pages is too small, you will get dead lock + * Note: if kvm_nr_mmu_pages is too small, you will get dead lock */ -void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages) +void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) { - LIST_HEAD(invalid_list); + int used_pages; + + used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; + used_pages = max(0, used_pages); + /* * If we set the number of mmu pages to be smaller be than the * number of actived pages , we must to free some mmu pages before we * change the value */ - if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { - while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages && + if (used_pages > kvm_nr_mmu_pages) { + while (used_pages > kvm_nr_mmu_pages && !list_empty(&kvm->arch.active_mmu_pages)) { struct kvm_mmu_page *page; page = 
container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - kvm_mmu_prepare_zap_page(kvm, page, &invalid_list); - kvm_mmu_commit_zap_page(kvm, &invalid_list); + used_pages -= kvm_mmu_zap_page(kvm, page); + used_pages--; } - goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; + kvm_nr_mmu_pages = used_pages; + kvm->arch.n_free_mmu_pages = 0; } + else + kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages + - kvm->arch.n_alloc_mmu_pages; - kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages; + kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages; } static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) { + unsigned index; + struct hlist_head *bucket; struct kvm_mmu_page *sp; - struct hlist_node *node; - LIST_HEAD(invalid_list); + struct hlist_node *node, *n; int r; - pgprintk("%s: looking for gfn %llx\n", __func__, gfn); + pgprintk("%s: looking for gfn %lx\n", __func__, gfn); r = 0; - - for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { - pgprintk("%s: gfn %llx role %x\n", __func__, gfn, - sp->role.word); - r = 1; - kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); - } - kvm_mmu_commit_zap_page(kvm, &invalid_list); + index = kvm_page_table_hashfn(gfn); + bucket = &kvm->arch.mmu_page_hash[index]; + hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) + if (sp->gfn == gfn && !sp->role.direct) { + pgprintk("%s: gfn %lx role %x\n", __func__, gfn, + sp->role.word); + r = 1; + if (kvm_mmu_zap_page(kvm, sp)) + n = bucket->first; + } return r; } static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) { + unsigned index; + struct hlist_head *bucket; struct kvm_mmu_page *sp; - struct hlist_node *node; - LIST_HEAD(invalid_list); - - for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { - pgprintk("%s: zap %llx %x\n", - __func__, gfn, sp->role.word); - kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); + struct hlist_node *node, *nn; + + index = kvm_page_table_hashfn(gfn); + bucket = &kvm->arch.mmu_page_hash[index]; + hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) { + if (sp->gfn == gfn && !sp->role.direct + && !sp->role.invalid) { + pgprintk("%s: zap %lx %x\n", + __func__, gfn, sp->role.word); + if (kvm_mmu_zap_page(kvm, sp)) + nn = bucket->first; + } } - kvm_mmu_commit_zap_page(kvm, &invalid_list); } static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) @@ -1875,6 +1666,20 @@ static void mmu_convert_notrap(struct kvm_mmu_page *sp) } } +struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) +{ + struct page *page; + + gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); + + if (gpa == UNMAPPED_GVA) + return NULL; + + page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); + + return page; +} + /* * The function is based on mtrr_type_lookup() in * arch/x86/kernel/cpu/mtrr/generic.c @@ -1980,51 +1785,47 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) } EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type); -static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { + unsigned index; + struct hlist_head *bucket; + struct kvm_mmu_page *s; + struct hlist_node *node, *n; + trace_kvm_mmu_unsync_page(sp); + index = kvm_page_table_hashfn(sp->gfn); + bucket = &vcpu->kvm->arch.mmu_page_hash[index]; + /* don't unsync if pagetable is shadowed with multiple roles */ + hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { + if (s->gfn != sp->gfn || s->role.direct) + continue; + if (s->role.word != sp->role.word) + return 1; + } ++vcpu->kvm->stat.mmu_unsync; sp->unsync = 1; 
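/*
 * Editor's note -- illustrative sketch, not part of the patch. Several hunks
 * above (kvm_mmu_unprotect_page, mmu_unshadow, kvm_unsync_page) go back to
 * open-coding the shadow-page lookup: hash the gfn into
 * kvm->arch.mmu_page_hash and walk the bucket's hlist, skipping pages whose
 * role does not match. A minimal user-space model of that bucket walk; the
 * hash size and role flags here are stand-ins.
 */
#include <stdio.h>

#define NBUCKETS 1024   /* same spirit as the kernel's fixed-size mmu page hash */

struct mmu_page {
	unsigned long gfn;
	int direct;              /* direct-mapped pages are skipped by lookups */
	int invalid;             /* zapped but still referenced */
	struct mmu_page *next;   /* hash-bucket chain */
};

static struct mmu_page *hash[NBUCKETS];

static unsigned page_table_hashfn(unsigned long gfn)
{
	return gfn % NBUCKETS;
}

static void insert_page(struct mmu_page *sp)
{
	unsigned b = page_table_hashfn(sp->gfn);

	sp->next = hash[b];
	hash[b] = sp;
}

/* Find a live, indirect shadow page for this gfn, as kvm_mmu_lookup_page does. */
static struct mmu_page *lookup_page(unsigned long gfn)
{
	struct mmu_page *sp;

	for (sp = hash[page_table_hashfn(gfn)]; sp; sp = sp->next)
		if (sp->gfn == gfn && !sp->direct && !sp->invalid)
			return sp;
	return NULL;
}

int main(void)
{
	struct mmu_page a = { .gfn = 0x1234, .direct = 1 };
	struct mmu_page b = { .gfn = 0x1234 };

	insert_page(&a);
	insert_page(&b);
	printf("found %p (expect the non-direct page %p)\n",
	       (void *)lookup_page(0x1234), (void *)&b);
	return 0;
}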
- kvm_mmu_mark_parents_unsync(sp); - mmu_convert_notrap(sp); -} - -static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - struct kvm_mmu_page *s; - struct hlist_node *node; + kvm_mmu_mark_parents_unsync(vcpu, sp); - for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { - if (s->unsync) - continue; - WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); - __kvm_unsync_page(vcpu, s); - } + mmu_convert_notrap(sp); + return 0; } static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) { - struct kvm_mmu_page *s; - struct hlist_node *node; - bool need_unsync = false; - - for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { - if (!can_unsync) - return 1; + struct kvm_mmu_page *shadow; - if (s->role.level != PT_PAGE_TABLE_LEVEL) + shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); + if (shadow) { + if (shadow->role.level != PT_PAGE_TABLE_LEVEL) return 1; - - if (!need_unsync && !s->unsync) { - if (!oos_shadow) - return 1; - need_unsync = true; - } + if (shadow->unsync) + return 0; + if (can_unsync && oos_shadow) + return kvm_unsync_page(vcpu, shadow); + return 1; } - if (need_unsync) - kvm_unsync_pages(vcpu, gfn); return 0; } @@ -2032,9 +1833,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, int user_fault, int write_fault, int dirty, int level, gfn_t gfn, pfn_t pfn, bool speculative, - bool can_unsync, bool host_writable) + bool can_unsync, bool reset_host_protection) { - u64 spte, entry = *sptep; + u64 spte; int ret = 0; /* @@ -2042,7 +1843,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, * whether the guest actually used the pte (in order to detect * demand paging). */ - spte = PT_PRESENT_MASK; + spte = shadow_base_present_pte | shadow_dirty_mask; if (!speculative) spte |= shadow_accessed_mask; if (!dirty) @@ -2059,30 +1860,23 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, kvm_is_mmio_pfn(pfn)); - if (host_writable) + if (reset_host_protection) spte |= SPTE_HOST_WRITEABLE; - else - pte_access &= ~ACC_WRITE_MASK; spte |= (u64)pfn << PAGE_SHIFT; if ((pte_access & ACC_WRITE_MASK) - || (!vcpu->arch.mmu.direct_map && write_fault - && !is_write_protection(vcpu) && !user_fault)) { + || (write_fault && !is_write_protection(vcpu) && !user_fault)) { if (level > PT_PAGE_TABLE_LEVEL && has_wrprotected_page(vcpu->kvm, gfn, level)) { ret = 1; - drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); - goto done; + spte = shadow_trap_nonpresent_pte; + goto set_pte; } spte |= PT_WRITABLE_MASK; - if (!vcpu->arch.mmu.direct_map - && !(pte_access & ACC_WRITE_MASK)) - spte &= ~PT_USER_MASK; - /* * Optimization: for pte sync, if spte was writable the hash * lookup is unnecessary (and expensive). Write protection @@ -2093,7 +1887,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, goto set_pte; if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { - pgprintk("%s: found shadow page for %llx, marking ro\n", + pgprintk("%s: found shadow page for %lx, marking ro\n", __func__, gfn); ret = 1; pte_access &= ~ACC_WRITE_MASK; @@ -2106,16 +1900,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, mark_page_dirty(vcpu->kvm, gfn); set_pte: - update_spte(sptep, spte); - /* - * If we overwrite a writable spte with a read-only one we - * should flush remote TLBs. Otherwise rmap_write_protect - * will find a read-only spte, even though the writable spte - * might be cached on a CPU's TLB. 
- */ - if (is_writable_pte(entry) && !is_writable_pte(*sptep)) - kvm_flush_remote_tlbs(vcpu->kvm); -done: + __set_spte(sptep, spte); return ret; } @@ -2124,13 +1909,14 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, int user_fault, int write_fault, int dirty, int *ptwrite, int level, gfn_t gfn, pfn_t pfn, bool speculative, - bool host_writable) + bool reset_host_protection) { int was_rmapped = 0; + int was_writable = is_writable_pte(*sptep); int rmap_count; pgprintk("%s: spte %llx access %x write_fault %d" - " user_fault %d gfn %llx\n", + " user_fault %d gfn %lx\n", __func__, *sptep, pt_access, write_fault, user_fault, gfn); @@ -2146,27 +1932,24 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, child = page_header(pte & PT64_BASE_ADDR_MASK); mmu_page_remove_parent_pte(child, sptep); - __set_spte(sptep, shadow_trap_nonpresent_pte); - kvm_flush_remote_tlbs(vcpu->kvm); } else if (pfn != spte_to_pfn(*sptep)) { - pgprintk("hfn old %llx new %llx\n", + pgprintk("hfn old %lx new %lx\n", spte_to_pfn(*sptep), pfn); - drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); - kvm_flush_remote_tlbs(vcpu->kvm); + rmap_remove(vcpu->kvm, sptep); } else was_rmapped = 1; } if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, dirty, level, gfn, pfn, speculative, true, - host_writable)) { + reset_host_protection)) { if (write_fault) *ptwrite = 1; - kvm_mmu_flush_tlb(vcpu); + kvm_x86_ops->tlb_flush(vcpu); } pgprintk("%s: setting spte %llx\n", __func__, *sptep); - pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", + pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", is_large_pte(*sptep)? "2MB" : "4kB", *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, *sptep, sptep); @@ -2176,10 +1959,15 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, page_header_update_slot(vcpu->kvm, sptep, gfn); if (!was_rmapped) { rmap_count = rmap_add(vcpu, sptep, gfn); + kvm_release_pfn_clean(pfn); if (rmap_count > RMAP_RECYCLE_THRESHOLD) rmap_recycle(vcpu, sptep, gfn); + } else { + if (was_writable) + kvm_release_pfn_dirty(pfn); + else + kvm_release_pfn_clean(pfn); } - kvm_release_pfn_clean(pfn); if (speculative) { vcpu->arch.last_pte_updated = sptep; vcpu->arch.last_pte_gfn = gfn; @@ -2190,108 +1978,8 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) { } -static struct kvm_memory_slot * -pte_prefetch_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) -{ - struct kvm_memory_slot *slot; - - slot = gfn_to_memslot(vcpu->kvm, gfn); - if (!slot || slot->flags & KVM_MEMSLOT_INVALID || - (no_dirty_log && slot->dirty_bitmap)) - slot = NULL; - - return slot; -} - -static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, - bool no_dirty_log) -{ - struct kvm_memory_slot *slot; - unsigned long hva; - - slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, no_dirty_log); - if (!slot) { - get_page(bad_page); - return page_to_pfn(bad_page); - } - - hva = gfn_to_hva_memslot(slot, gfn); - - return hva_to_pfn_atomic(vcpu->kvm, hva); -} - -static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, - u64 *start, u64 *end) -{ - struct page *pages[PTE_PREFETCH_NUM]; - unsigned access = sp->role.access; - int i, ret; - gfn_t gfn; - - gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt); - if (!pte_prefetch_gfn_to_memslot(vcpu, gfn, access & ACC_WRITE_MASK)) - return -1; - - ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start); - if (ret <= 0) - return -1; - - for (i = 0; i < ret; i++, gfn++, start++) - mmu_set_spte(vcpu, start, 
ACC_ALL, - access, 0, 0, 1, NULL, - sp->role.level, gfn, - page_to_pfn(pages[i]), true, true); - - return 0; -} - -static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, u64 *sptep) -{ - u64 *spte, *start = NULL; - int i; - - WARN_ON(!sp->role.direct); - - i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); - spte = sp->spt + i; - - for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { - if (*spte != shadow_trap_nonpresent_pte || spte == sptep) { - if (!start) - continue; - if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0) - break; - start = NULL; - } else if (!start) - start = spte; - } -} - -static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) -{ - struct kvm_mmu_page *sp; - - /* - * Since it's no accessed bit on EPT, it's no way to - * distinguish between actually accessed translations - * and prefetched, so disable pte prefetch if EPT is - * enabled. - */ - if (!shadow_accessed_mask) - return; - - sp = page_header(__pa(sptep)); - if (sp->role.level > PT_PAGE_TABLE_LEVEL) - return; - - __direct_pte_prefetch(vcpu, sp, sptep); -} - static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, - int map_writable, int level, gfn_t gfn, pfn_t pfn, - bool prefault) + int level, gfn_t gfn, pfn_t pfn) { struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; @@ -2300,21 +1988,15 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { if (iterator.level == level) { - unsigned pte_access = ACC_ALL; - - mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access, + mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, 0, write, 1, &pt_write, - level, gfn, pfn, prefault, map_writable); - direct_pte_prefetch(vcpu, iterator.sptep); + level, gfn, pfn, false, true); ++vcpu->stat.pf_fixed; break; } if (*iterator.sptep == shadow_trap_nonpresent_pte) { - u64 base_addr = iterator.addr; - - base_addr &= PT64_LVL_ADDR_MASK(iterator.level); - pseudo_gfn = base_addr >> PAGE_SHIFT; + pseudo_gfn = (iterator.addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, iterator.level - 1, 1, ACC_ALL, iterator.sptep); @@ -2327,126 +2009,45 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, __set_spte(iterator.sptep, __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK - | shadow_user_mask | shadow_x_mask - | shadow_accessed_mask); + | shadow_user_mask | shadow_x_mask); } } return pt_write; } -static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) -{ - kvm_siginfo_t info; - - info.si_signo = SIGBUS; - info.si_errno = 0; - info.si_code = BUS_MCEERR_AR; - info.si_addr = (void *)address; - info.si_addr_lsb = PAGE_SHIFT; - - send_sig_info(SIGBUS, (siginfo_t *)&info, tsk); -} - -static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) -{ - kvm_release_pfn_clean(pfn); - if (is_hwpoison_pfn(pfn)) { - kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current); - return 0; - } else if (is_fault_pfn(pfn)) - return -EFAULT; - - return 1; -} - -static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, - gfn_t *gfnp, pfn_t *pfnp, int *levelp) -{ - pfn_t pfn = *pfnp; - gfn_t gfn = *gfnp; - int level = *levelp; - - /* - * Check if it's a transparent hugepage. If this would be an - * hugetlbfs page, level wouldn't be set to - * PT_PAGE_TABLE_LEVEL and there would be no adjustment done - * here. 
- */ - if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) && - level == PT_PAGE_TABLE_LEVEL && - PageTransCompound(pfn_to_page(pfn)) && - !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) { - unsigned long mask; - /* - * mmu_notifier_retry was successful and we hold the - * mmu_lock here, so the pmd can't become splitting - * from under us, and in turn - * __split_huge_page_refcount() can't run from under - * us and we can safely transfer the refcount from - * PG_tail to PG_head as we switch the pfn to tail to - * head. - */ - *levelp = level = PT_DIRECTORY_LEVEL; - mask = KVM_PAGES_PER_HPAGE(level) - 1; - VM_BUG_ON((gfn & mask) != (pfn & mask)); - if (pfn & mask) { - gfn &= ~mask; - *gfnp = gfn; - kvm_release_pfn_clean(pfn); - pfn &= ~mask; - if (!get_page_unless_zero(pfn_to_page(pfn))) - BUG(); - *pfnp = pfn; - } - } -} - -static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, - gva_t gva, pfn_t *pfn, bool write, bool *writable); - -static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, - bool prefault) +static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) { int r; int level; - int force_pt_level; pfn_t pfn; unsigned long mmu_seq; - bool map_writable; - force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); - if (likely(!force_pt_level)) { - level = mapping_level(vcpu, gfn); - /* - * This path builds a PAE pagetable - so we can map - * 2mb pages at maximum. Therefore check if the level - * is larger than that. - */ - if (level > PT_DIRECTORY_LEVEL) - level = PT_DIRECTORY_LEVEL; + level = mapping_level(vcpu, gfn); + + /* + * This path builds a PAE pagetable - so we can map 2mb pages at + * maximum. Therefore check if the level is larger than that. + */ + if (level > PT_DIRECTORY_LEVEL) + level = PT_DIRECTORY_LEVEL; - gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); - } else - level = PT_PAGE_TABLE_LEVEL; + gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - - if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable)) - return 0; + pfn = gfn_to_pfn(vcpu->kvm, gfn); /* mmio */ - if (is_error_pfn(pfn)) - return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); + if (is_error_pfn(pfn)) { + kvm_release_pfn_clean(pfn); + return 1; + } spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; kvm_mmu_free_some_pages(vcpu); - if (likely(!force_pt_level)) - transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); - r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn, - prefault); + r = __direct_map(vcpu, v, write, level, gfn, pfn); spin_unlock(&vcpu->kvm->mmu_lock); @@ -2463,22 +2064,17 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) { int i; struct kvm_mmu_page *sp; - LIST_HEAD(invalid_list); if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; spin_lock(&vcpu->kvm->mmu_lock); - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL && - (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL || - vcpu->arch.mmu.direct_map)) { + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; sp = page_header(root); --sp->root_count; - if (!sp->root_count && sp->role.invalid) { - kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); - } + if (!sp->root_count && sp->role.invalid) + kvm_mmu_zap_page(vcpu->kvm, sp); vcpu->arch.mmu.root_hpa = INVALID_PAGE; spin_unlock(&vcpu->kvm->mmu_lock); return; @@ -2491,12 +2087,10 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) 
sp = page_header(root); --sp->root_count; if (!sp->root_count && sp->role.invalid) - kvm_mmu_prepare_zap_page(vcpu->kvm, sp, - &invalid_list); + kvm_mmu_zap_page(vcpu->kvm, sp); } vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; } - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); spin_unlock(&vcpu->kvm->mmu_lock); vcpu->arch.mmu.root_hpa = INVALID_PAGE; } @@ -2506,170 +2100,79 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) int ret = 0; if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) { - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); ret = 1; } return ret; } -static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu_page *sp; - unsigned i; - - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { - spin_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_free_some_pages(vcpu); - sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL, - 1, ACC_ALL, NULL); - ++sp->root_count; - spin_unlock(&vcpu->kvm->mmu_lock); - vcpu->arch.mmu.root_hpa = __pa(sp->spt); - } else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) { - for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->arch.mmu.pae_root[i]; - - ASSERT(!VALID_PAGE(root)); - spin_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_free_some_pages(vcpu); - sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), - i << 30, - PT32_ROOT_LEVEL, 1, ACC_ALL, - NULL); - root = __pa(sp->spt); - ++sp->root_count; - spin_unlock(&vcpu->kvm->mmu_lock); - vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; - } - vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); - } else - BUG(); - - return 0; -} - -static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) +static int mmu_alloc_roots(struct kvm_vcpu *vcpu) { - struct kvm_mmu_page *sp; - u64 pdptr, pm_mask; - gfn_t root_gfn; int i; + gfn_t root_gfn; + struct kvm_mmu_page *sp; + int direct = 0; + u64 pdptr; - root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT; - - if (mmu_check_root(vcpu, root_gfn)) - return 1; + root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; - /* - * Do we shadow a long mode page table? If so we need to - * write-protect the guests page table root. - */ - if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; ASSERT(!VALID_PAGE(root)); - - spin_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_free_some_pages(vcpu); - sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, - 0, ACC_ALL, NULL); + if (tdp_enabled) + direct = 1; + if (mmu_check_root(vcpu, root_gfn)) + return 1; + sp = kvm_mmu_get_page(vcpu, root_gfn, 0, + PT64_ROOT_LEVEL, direct, + ACC_ALL, NULL); root = __pa(sp->spt); ++sp->root_count; - spin_unlock(&vcpu->kvm->mmu_lock); vcpu->arch.mmu.root_hpa = root; return 0; } - - /* - * We shadow a 32 bit page table. This may be a legacy 2-level - * or a PAE 3-level page table. In either case we need to be aware that - * the shadow page table may be a PAE or a long mode page table. 
- */ - pm_mask = PT_PRESENT_MASK; - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) - pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; - + direct = !is_paging(vcpu); + if (tdp_enabled) + direct = 1; for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; ASSERT(!VALID_PAGE(root)); if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { - pdptr = kvm_pdptr_read_mmu(vcpu, &vcpu->arch.mmu, i); + pdptr = kvm_pdptr_read(vcpu, i); if (!is_present_gpte(pdptr)) { vcpu->arch.mmu.pae_root[i] = 0; continue; } root_gfn = pdptr >> PAGE_SHIFT; - if (mmu_check_root(vcpu, root_gfn)) - return 1; - } - spin_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_free_some_pages(vcpu); + } else if (vcpu->arch.mmu.root_level == 0) + root_gfn = 0; + if (mmu_check_root(vcpu, root_gfn)) + return 1; sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, - PT32_ROOT_LEVEL, 0, + PT32_ROOT_LEVEL, direct, ACC_ALL, NULL); root = __pa(sp->spt); ++sp->root_count; - spin_unlock(&vcpu->kvm->mmu_lock); - - vcpu->arch.mmu.pae_root[i] = root | pm_mask; + vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; } vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); - - /* - * If we shadow a 32 bit page table with a long mode page - * table we enter this path. - */ - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { - if (vcpu->arch.mmu.lm_root == NULL) { - /* - * The additional page necessary for this is only - * allocated on demand. - */ - - u64 *lm_root; - - lm_root = (void*)get_zeroed_page(GFP_KERNEL); - if (lm_root == NULL) - return 1; - - lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask; - - vcpu->arch.mmu.lm_root = lm_root; - } - - vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root); - } - return 0; } -static int mmu_alloc_roots(struct kvm_vcpu *vcpu) -{ - if (vcpu->arch.mmu.direct_map) - return mmu_alloc_direct_roots(vcpu); - else - return mmu_alloc_shadow_roots(vcpu); -} - static void mmu_sync_roots(struct kvm_vcpu *vcpu) { int i; struct kvm_mmu_page *sp; - if (vcpu->arch.mmu.direct_map) - return; - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; - - trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); - if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; sp = page_header(root); mmu_sync_children(vcpu, sp); - trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); return; } for (i = 0; i < 4; ++i) { @@ -2681,7 +2184,6 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) mmu_sync_children(vcpu, sp); } } - trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); } void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) @@ -2692,24 +2194,15 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) } static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, - u32 access, struct x86_exception *exception) + u32 access, u32 *error) { - if (exception) - exception->error_code = 0; + if (error) + *error = 0; return vaddr; } -static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, - u32 access, - struct x86_exception *exception) -{ - if (exception) - exception->error_code = 0; - return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access); -} - static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, - u32 error_code, bool prefault) + u32 error_code) { gfn_t gfn; int r; @@ -2725,68 +2218,17 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, gfn = gva >> PAGE_SHIFT; return nonpaging_map(vcpu, gva & PAGE_MASK, - error_code & PFERR_WRITE_MASK, gfn, prefault); -} - -static int 
kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) -{ - struct kvm_arch_async_pf arch; - - arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id; - arch.gfn = gfn; - arch.direct_map = vcpu->arch.mmu.direct_map; - arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu); - - return kvm_setup_async_pf(vcpu, gva, gfn, &arch); -} - -static bool can_do_async_pf(struct kvm_vcpu *vcpu) -{ - if (unlikely(!irqchip_in_kernel(vcpu->kvm) || - kvm_event_needs_reinjection(vcpu))) - return false; - - return kvm_x86_ops->interrupt_allowed(vcpu); -} - -static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, - gva_t gva, pfn_t *pfn, bool write, bool *writable) -{ - bool async; - - *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable); - - if (!async) - return false; /* *pfn has correct page already */ - - put_page(pfn_to_page(*pfn)); - - if (!prefault && can_do_async_pf(vcpu)) { - trace_kvm_try_async_get_page(gva, gfn); - if (kvm_find_async_pf_gfn(vcpu, gfn)) { - trace_kvm_async_pf_doublefault(gva, gfn); - kvm_make_request(KVM_REQ_APF_HALT, vcpu); - return true; - } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn)) - return true; - } - - *pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable); - - return false; + error_code & PFERR_WRITE_MASK, gfn); } -static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, - bool prefault) +static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, + u32 error_code) { pfn_t pfn; int r; int level; - int force_pt_level; gfn_t gfn = gpa >> PAGE_SHIFT; unsigned long mmu_seq; - int write = error_code & PFERR_WRITE_MASK; - bool map_writable; ASSERT(vcpu); ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); @@ -2795,30 +2237,23 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (r) return r; - force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); - if (likely(!force_pt_level)) { - level = mapping_level(vcpu, gfn); - gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); - } else - level = PT_PAGE_TABLE_LEVEL; + level = mapping_level(vcpu, gfn); + + gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - - if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) - return 0; - - /* mmio */ - if (is_error_pfn(pfn)) - return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); + pfn = gfn_to_pfn(vcpu->kvm, gfn); + if (is_error_pfn(pfn)) { + kvm_release_pfn_clean(pfn); + return 1; + } spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; kvm_mmu_free_some_pages(vcpu); - if (likely(!force_pt_level)) - transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); - r = __direct_map(vcpu, gpa, write, map_writable, - level, gfn, pfn, prefault); + r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, + level, gfn, pfn); spin_unlock(&vcpu->kvm->mmu_lock); return r; @@ -2834,9 +2269,10 @@ static void nonpaging_free(struct kvm_vcpu *vcpu) mmu_free_roots(vcpu); } -static int nonpaging_init_context(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) +static int nonpaging_init_context(struct kvm_vcpu *vcpu) { + struct kvm_mmu *context = &vcpu->arch.mmu; + context->new_cr3 = nonpaging_new_cr3; context->page_fault = nonpaging_page_fault; context->gva_to_gpa = nonpaging_gva_to_gpa; @@ -2847,32 +2283,26 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu, context->root_level = 0; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; - context->direct_map = true; - context->nx = false; return 0; } void kvm_mmu_flush_tlb(struct 
kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + kvm_x86_ops->tlb_flush(vcpu); } static void paging_new_cr3(struct kvm_vcpu *vcpu) { - pgprintk("%s: cr3 %lx\n", __func__, kvm_read_cr3(vcpu)); + pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3); mmu_free_roots(vcpu); } -static unsigned long get_cr3(struct kvm_vcpu *vcpu) -{ - return kvm_read_cr3(vcpu); -} - static void inject_page_fault(struct kvm_vcpu *vcpu, - struct x86_exception *fault) + u64 addr, + u32 err_code) { - vcpu->arch.mmu.inject_page_fault(vcpu, fault); + kvm_inject_page_fault(vcpu, addr, err_code); } static void paging_free(struct kvm_vcpu *vcpu) @@ -2880,12 +2310,12 @@ static void paging_free(struct kvm_vcpu *vcpu) nonpaging_free(vcpu); } -static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) +static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level) { int bit7; bit7 = (gpte >> 7) & 1; - return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; + return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0; } #define PTTYPE 64 @@ -2896,33 +2326,26 @@ static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) #include "paging_tmpl.h" #undef PTTYPE -static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, - struct kvm_mmu *context, - int level) +static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) { + struct kvm_mmu *context = &vcpu->arch.mmu; int maxphyaddr = cpuid_maxphyaddr(vcpu); u64 exb_bit_rsvd = 0; - if (!context->nx) + if (!is_nx(vcpu)) exb_bit_rsvd = rsvd_bits(63, 63); switch (level) { case PT32_ROOT_LEVEL: /* no rsvd bits for 2 level 4K page table entries */ context->rsvd_bits_mask[0][1] = 0; context->rsvd_bits_mask[0][0] = 0; - context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; - - if (!is_pse(vcpu)) { - context->rsvd_bits_mask[1][1] = 0; - break; - } - if (is_cpuid_PSE36()) /* 36bits PSE 4MB page */ context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); else /* 32 bits PSE 4MB page */ context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); + context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0]; break; case PT32E_ROOT_LEVEL: context->rsvd_bits_mask[0][2] = @@ -2935,7 +2358,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, context->rsvd_bits_mask[1][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 62) | rsvd_bits(13, 20); /* large page */ - context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; + context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0]; break; case PT64_ROOT_LEVEL: context->rsvd_bits_mask[0][3] = exb_bit_rsvd | @@ -2953,18 +2376,14 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, context->rsvd_bits_mask[1][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51) | rsvd_bits(13, 20); /* large page */ - context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; + context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0]; break; } } -static int paging64_init_context_common(struct kvm_vcpu *vcpu, - struct kvm_mmu *context, - int level) +static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) { - context->nx = is_nx(vcpu); - - reset_rsvds_bits_mask(vcpu, context, level); + struct kvm_mmu *context = &vcpu->arch.mmu; ASSERT(is_pae(vcpu)); context->new_cr3 = paging_new_cr3; @@ -2977,23 +2396,20 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, context->root_level = level; context->shadow_root_level = level; context->root_hpa = INVALID_PAGE; - context->direct_map = false; return 0; } -static int paging64_init_context(struct kvm_vcpu 
*vcpu, - struct kvm_mmu *context) +static int paging64_init_context(struct kvm_vcpu *vcpu) { - return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL); + reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); + return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL); } -static int paging32_init_context(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) +static int paging32_init_context(struct kvm_vcpu *vcpu) { - context->nx = false; - - reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL); + struct kvm_mmu *context = &vcpu->arch.mmu; + reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); context->new_cr3 = paging_new_cr3; context->page_fault = paging32_page_fault; context->gva_to_gpa = paging32_gva_to_gpa; @@ -3004,21 +2420,19 @@ static int paging32_init_context(struct kvm_vcpu *vcpu, context->root_level = PT32_ROOT_LEVEL; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; - context->direct_map = false; return 0; } -static int paging32E_init_context(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) +static int paging32E_init_context(struct kvm_vcpu *vcpu) { - return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL); + reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); + return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); } static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) { - struct kvm_mmu *context = vcpu->arch.walk_mmu; + struct kvm_mmu *context = &vcpu->arch.mmu; - context->base_role.word = 0; context->new_cr3 = nonpaging_new_cr3; context->page_fault = tdp_page_fault; context->free = nonpaging_free; @@ -3027,29 +2441,20 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->invlpg = nonpaging_invlpg; context->shadow_root_level = kvm_x86_ops->get_tdp_level(); context->root_hpa = INVALID_PAGE; - context->direct_map = true; - context->set_cr3 = kvm_x86_ops->set_tdp_cr3; - context->get_cr3 = get_cr3; - context->inject_page_fault = kvm_inject_page_fault; - context->nx = is_nx(vcpu); if (!is_paging(vcpu)) { - context->nx = false; context->gva_to_gpa = nonpaging_gva_to_gpa; context->root_level = 0; } else if (is_long_mode(vcpu)) { - context->nx = is_nx(vcpu); - reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL); + reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); context->gva_to_gpa = paging64_gva_to_gpa; context->root_level = PT64_ROOT_LEVEL; } else if (is_pae(vcpu)) { - context->nx = is_nx(vcpu); - reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL); + reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); context->gva_to_gpa = paging64_gva_to_gpa; context->root_level = PT32E_ROOT_LEVEL; } else { - context->nx = false; - reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL); + reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); context->gva_to_gpa = paging32_gva_to_gpa; context->root_level = PT32_ROOT_LEVEL; } @@ -3057,83 +2462,32 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) return 0; } -int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) +static int init_kvm_softmmu(struct kvm_vcpu *vcpu) { int r; + ASSERT(vcpu); ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); if (!is_paging(vcpu)) - r = nonpaging_init_context(vcpu, context); + r = nonpaging_init_context(vcpu); else if (is_long_mode(vcpu)) - r = paging64_init_context(vcpu, context); + r = paging64_init_context(vcpu); else if (is_pae(vcpu)) - r = paging32E_init_context(vcpu, context); + r = paging32E_init_context(vcpu); else - r = paging32_init_context(vcpu, context); + r = paging32_init_context(vcpu); - vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); - 
vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); + vcpu->arch.mmu.base_role.glevels = vcpu->arch.mmu.root_level; return r; } -EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); - -static int init_kvm_softmmu(struct kvm_vcpu *vcpu) -{ - int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); - - vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3; - vcpu->arch.walk_mmu->get_cr3 = get_cr3; - vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; - - return r; -} - -static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; - - g_context->get_cr3 = get_cr3; - g_context->inject_page_fault = kvm_inject_page_fault; - - /* - * Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The - * translation of l2_gpa to l1_gpa addresses is done using the - * arch.nested_mmu.gva_to_gpa function. Basically the gva_to_gpa - * functions between mmu and nested_mmu are swapped. - */ - if (!is_paging(vcpu)) { - g_context->nx = false; - g_context->root_level = 0; - g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested; - } else if (is_long_mode(vcpu)) { - g_context->nx = is_nx(vcpu); - reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL); - g_context->root_level = PT64_ROOT_LEVEL; - g_context->gva_to_gpa = paging64_gva_to_gpa_nested; - } else if (is_pae(vcpu)) { - g_context->nx = is_nx(vcpu); - reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL); - g_context->root_level = PT32E_ROOT_LEVEL; - g_context->gva_to_gpa = paging64_gva_to_gpa_nested; - } else { - g_context->nx = false; - reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL); - g_context->root_level = PT32_ROOT_LEVEL; - g_context->gva_to_gpa = paging32_gva_to_gpa_nested; - } - - return 0; -} static int init_kvm_mmu(struct kvm_vcpu *vcpu) { vcpu->arch.update_pte.pfn = bad_pfn; - if (mmu_is_nested(vcpu)) - return init_kvm_nested_mmu(vcpu); - else if (tdp_enabled) + if (tdp_enabled) return init_kvm_tdp_mmu(vcpu); else return init_kvm_softmmu(vcpu); @@ -3142,9 +2496,10 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu) static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) { ASSERT(vcpu); - if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) - /* mmu.free() should set root_hpa = INVALID_PAGE */ + if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) { vcpu->arch.mmu.free(vcpu); + vcpu->arch.mmu.root_hpa = INVALID_PAGE; + } } int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) @@ -3161,14 +2516,15 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) r = mmu_topup_memory_caches(vcpu); if (r) goto out; - r = mmu_alloc_roots(vcpu); spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu); + r = mmu_alloc_roots(vcpu); mmu_sync_roots(vcpu); spin_unlock(&vcpu->kvm->mmu_lock); if (r) goto out; /* set_cr3() should ensure TLB has been flushed */ - vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa); + kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); out: return r; } @@ -3178,7 +2534,6 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu) { mmu_free_roots(vcpu); } -EXPORT_SYMBOL_GPL(kvm_mmu_unload); static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, @@ -3190,7 +2545,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, pte = *spte; if (is_shadow_present_pte(pte)) { if (is_last_spte(pte, sp->role.level)) - drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte); + rmap_remove(vcpu->kvm, spte); else { child = page_header(pte & PT64_BASE_ADDR_MASK); mmu_page_remove_parent_pte(child, spte); @@ -3212,7 +2567,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, } 
++vcpu->kvm->stat.mmu_pte_updated; - if (!sp->role.cr4_pae) + if (sp->role.glevels == PT32_ROOT_LEVEL) paging32_update_pte(vcpu, sp, spte, new); else paging64_update_pte(vcpu, sp, spte, new); @@ -3231,15 +2586,11 @@ static bool need_remote_flush(u64 old, u64 new) return (old & ~new & PT64_PERM_MASK) != 0; } -static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page, - bool remote_flush, bool local_flush) +static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new) { - if (zap_page) - return; - - if (remote_flush) + if (need_remote_flush(old, new)) kvm_flush_remote_tlbs(vcpu->kvm); - else if (local_flush) + else kvm_mmu_flush_tlb(vcpu); } @@ -3251,11 +2602,36 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu) } static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - u64 gpte) + const u8 *new, int bytes) { gfn_t gfn; + int r; + u64 gpte = 0; pfn_t pfn; + if (bytes != 4 && bytes != 8) + return; + + /* + * Assume that the pte write on a page table of the same type + * as the current vcpu paging mode. This is nearly always true + * (might be false while changing modes). Note it is verified later + * by update_pte(). + */ + if (is_pae(vcpu)) { + /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ + if ((bytes == 4) && (gpa % 4 == 0)) { + r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8); + if (r) + return; + memcpy((void *)&gpte + (gpa % 8), new, 4); + } else if ((bytes == 8) && (gpa % 8 == 0)) { + memcpy((void *)&gpte, new, 8); + } + } else { + if ((bytes == 4) && (gpa % 4 == 0)) + memcpy((void *)&gpte, new, 4); + } if (!is_present_gpte(gpte)) return; gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; @@ -3289,10 +2665,10 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, bool guest_initiated) { gfn_t gfn = gpa >> PAGE_SHIFT; - union kvm_mmu_page_role mask = { .word = 0 }; struct kvm_mmu_page *sp; - struct hlist_node *node; - LIST_HEAD(invalid_list); + struct hlist_node *node, *n; + struct hlist_head *bucket; + unsigned index; u64 entry, gentry; u64 *spte; unsigned offset = offset_in_page(gpa); @@ -3304,53 +2680,14 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, int flooded = 0; int npte; int r; - int invlpg_counter; - bool remote_flush, local_flush, zap_page; - - zap_page = remote_flush = local_flush = false; pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); - - invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter); - - /* - * Assume that the pte write on a page table of the same type - * as the current vcpu paging mode. This is nearly always true - * (might be false while changing modes). Note it is verified later - * by update_pte(). 
- */ - if ((is_pae(vcpu) && bytes == 4) || !new) { - /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ - if (is_pae(vcpu)) { - gpa &= ~(gpa_t)7; - bytes = 8; - } - r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8)); - if (r) - gentry = 0; - new = (const u8 *)&gentry; - } - - switch (bytes) { - case 4: - gentry = *(const u32 *)new; - break; - case 8: - gentry = *(const u64 *)new; - break; - default: - gentry = 0; - break; - } - - mmu_guess_page_from_pte_write(vcpu, gpa, gentry); + mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes); spin_lock(&vcpu->kvm->mmu_lock); - if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter) - gentry = 0; kvm_mmu_access_page(vcpu, gfn); kvm_mmu_free_some_pages(vcpu); ++vcpu->kvm->stat.mmu_pte_write; - trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); + kvm_mmu_audit(vcpu, "pre pte write"); if (guest_initiated) { if (gfn == vcpu->arch.last_pt_write_gfn && !last_updated_pte_accessed(vcpu)) { @@ -3363,10 +2700,12 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, vcpu->arch.last_pte_updated = NULL; } } - - mask.cr0_wp = mask.cr4_pae = mask.nxe = 1; - for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) { - pte_size = sp->role.cr4_pae ? 8 : 4; + index = kvm_page_table_hashfn(gfn); + bucket = &vcpu->kvm->arch.mmu_page_hash[index]; + hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { + if (sp->gfn != gfn || sp->role.direct || sp->role.invalid) + continue; + pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); misaligned |= bytes < 4; if (misaligned || flooded) { @@ -3382,15 +2721,15 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, */ pgprintk("misaligned: gpa %llx bytes %d role %x\n", gpa, bytes, sp->role.word); - zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp, - &invalid_list); + if (kvm_mmu_zap_page(vcpu->kvm, sp)) + n = bucket->first; ++vcpu->kvm->stat.mmu_flooded; continue; } page_offset = offset; level = sp->role.level; npte = 1; - if (!sp->role.cr4_pae) { + if (sp->role.glevels == PT32_ROOT_LEVEL) { page_offset <<= 1; /* 32->64 */ /* * A 32-bit pde maps 4MB while the shadow pdes map @@ -3407,23 +2746,26 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, if (quadrant != sp->role.quadrant) continue; } - local_flush = true; spte = &sp->spt[page_offset / sizeof(*spte)]; + if ((gpa & (pte_size - 1)) || (bytes < pte_size)) { + gentry = 0; + r = kvm_read_guest_atomic(vcpu->kvm, + gpa & ~(u64)(pte_size - 1), + &gentry, pte_size); + new = (const void *)&gentry; + if (r < 0) + new = NULL; + } while (npte--) { entry = *spte; mmu_pte_write_zap_pte(vcpu, sp, spte); - if (gentry && - !((sp->role.word ^ vcpu->arch.mmu.base_role.word) - & mask.word)) - mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); - if (!remote_flush && need_remote_flush(entry, *spte)) - remote_flush = true; + if (new) + mmu_pte_write_new_pte(vcpu, sp, spte, new); + mmu_pte_write_flush_tlb(vcpu, entry, *spte); ++spte; } } - mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); - trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); + kvm_mmu_audit(vcpu, "post pte write"); spin_unlock(&vcpu->kvm->mmu_lock); if (!is_error_pfn(vcpu->arch.update_pte.pfn)) { kvm_release_pfn_clean(vcpu->arch.update_pte.pfn); @@ -3436,7 +2778,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) gpa_t gpa; int r; - if (vcpu->arch.mmu.direct_map) + if (tdp_enabled) return 0; gpa = 
kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); @@ -3450,27 +2792,23 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { - LIST_HEAD(invalid_list); - - while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES && + while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES && !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { struct kvm_mmu_page *sp; sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + kvm_mmu_zap_page(vcpu->kvm, sp); ++vcpu->kvm->stat.mmu_recycled; } } -int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, - void *insn, int insn_len) +int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) { int r; enum emulation_result er; - r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false); + r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code); if (r < 0) goto out; @@ -3483,15 +2821,18 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, if (r) goto out; - er = x86_emulate_instruction(vcpu, cr2, 0, insn, insn_len); + er = emulate_instruction(vcpu, cr2, error_code, 0); switch (er) { case EMULATE_DONE: return 1; case EMULATE_DO_MMIO: ++vcpu->stat.mmio_exits; - /* fall through */ + return 0; case EMULATE_FAIL: + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; return 0; default: BUG(); @@ -3524,8 +2865,6 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp); static void free_mmu_pages(struct kvm_vcpu *vcpu) { free_page((unsigned long)vcpu->arch.mmu.pae_root); - if (vcpu->arch.mmu.lm_root != NULL) - free_page((unsigned long)vcpu->arch.mmu.lm_root); } static int alloc_mmu_pages(struct kvm_vcpu *vcpu) @@ -3567,6 +2906,15 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu) return init_kvm_mmu(vcpu); } +void kvm_mmu_destroy(struct kvm_vcpu *vcpu) +{ + ASSERT(vcpu); + + destroy_kvm_mmu(vcpu); + free_mmu_pages(vcpu); + mmu_free_memory_caches(vcpu); +} + void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) { struct kvm_mmu_page *sp; @@ -3578,14 +2926,11 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) if (!test_bit(slot, sp->slot_bitmap)) continue; - if (sp->role.level != PT_PAGE_TABLE_LEVEL) - continue; - pt = sp->spt; for (i = 0; i < PT64_ENT_PER_PAGE; ++i) /* avoid RMW */ - if (is_writable_pte(pt[i])) - update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK); + if (pt[i] & PT_WRITABLE_MASK) + pt[i] &= ~PT_WRITABLE_MASK; } kvm_flush_remote_tlbs(kvm); } @@ -3593,67 +2938,58 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) void kvm_mmu_zap_all(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; - LIST_HEAD(invalid_list); spin_lock(&kvm->mmu_lock); -restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) - if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) - goto restart; - - kvm_mmu_commit_zap_page(kvm, &invalid_list); + if (kvm_mmu_zap_page(kvm, sp)) + node = container_of(kvm->arch.active_mmu_pages.next, + struct kvm_mmu_page, link); spin_unlock(&kvm->mmu_lock); + + kvm_flush_remote_tlbs(kvm); } -static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, - struct list_head *invalid_list) +static void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm) { struct kvm_mmu_page *page; page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - return 
kvm_mmu_prepare_zap_page(kvm, page, invalid_list); + kvm_mmu_zap_page(kvm, page); } -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,35) -static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) -#else static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask) -#endif { struct kvm *kvm; struct kvm *kvm_freed = NULL; - - if (nr_to_scan == 0) - goto out; + int cache_count = 0; spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { - int idx, freed_pages; - LIST_HEAD(invalid_list); + int npages, idx; - idx = srcu_read_lock(&kvm->srcu); + idx = kvm_srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); - if (!kvm_freed && nr_to_scan > 0 && - kvm->arch.n_used_mmu_pages > 0) { - freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm, - &invalid_list); + npages = kvm->arch.n_alloc_mmu_pages - + kvm->arch.n_free_mmu_pages; + cache_count += npages; + if (!kvm_freed && nr_to_scan > 0 && npages > 0) { + kvm_mmu_remove_one_alloc_mmu_page(kvm); + cache_count--; kvm_freed = kvm; } nr_to_scan--; - kvm_mmu_commit_zap_page(kvm, &invalid_list); spin_unlock(&kvm->mmu_lock); - srcu_read_unlock(&kvm->srcu, idx); + kvm_srcu_read_unlock(&kvm->srcu, idx); } if (kvm_freed) list_move_tail(&kvm_freed->vm_list, &vm_list); spin_unlock(&kvm_lock); -out: - return percpu_counter_read_positive(&kvm_total_used_mmu_pages); + return cache_count; } static struct shrinker mmu_shrinker = { @@ -3671,6 +3007,12 @@ static void mmu_destroy_caches(void) kmem_cache_destroy(mmu_page_header_cache); } +void kvm_mmu_module_exit(void) +{ + mmu_destroy_caches(); + unregister_shrinker(&mmu_shrinker); +} + int kvm_mmu_module_init(void) { pte_chain_cache = kmem_cache_create("kvm_pte_chain", @@ -3690,9 +3032,6 @@ int kvm_mmu_module_init(void) if (!mmu_page_header_cache) goto nomem; - if (percpu_counter_init(&kvm_total_used_mmu_pages, 0)) - goto nomem; - register_shrinker(&mmu_shrinker); return 0; @@ -3712,8 +3051,7 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) unsigned int nr_pages = 0; struct kvm_memslots *slots; - slots = kvm_memslots(kvm); - + slots = rcu_dereference(kvm->memslots); for (i = 0; i < slots->nmemslots; i++) nr_pages += slots->memslots[i].npages; @@ -3767,7 +3105,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu, static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) { - (void)kvm_set_cr3(vcpu, kvm_read_cr3(vcpu)); + kvm_set_cr3(vcpu, vcpu->arch.cr3); return 1; } @@ -3863,25 +3201,272 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) } EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); -void kvm_mmu_destroy(struct kvm_vcpu *vcpu) +#ifdef AUDIT + +static const char *audit_msg; + +static gva_t canonicalize(gva_t gva) { - ASSERT(vcpu); +#ifdef CONFIG_X86_64 + gva = (long long)(gva << 16) >> 16; +#endif + return gva; +} - destroy_kvm_mmu(vcpu); - free_mmu_pages(vcpu); - mmu_free_memory_caches(vcpu); + +typedef void (*inspect_spte_fn) (struct kvm *kvm, struct kvm_mmu_page *sp, + u64 *sptep); + +static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp, + inspect_spte_fn fn) +{ + int i; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + u64 ent = sp->spt[i]; + + if (is_shadow_present_pte(ent)) { + if (!is_last_spte(ent, sp->role.level)) { + struct kvm_mmu_page *child; + child = page_header(ent & PT64_BASE_ADDR_MASK); + __mmu_spte_walk(kvm, child, fn); + } else + fn(kvm, sp, &sp->spt[i]); + } + } } -#ifdef CONFIG_KVM_MMU_AUDIT -#include "mmu_audit.c" -#else -static void mmu_audit_disable(void) { } -#endif +static void mmu_spte_walk(struct kvm_vcpu *vcpu, 
inspect_spte_fn fn) +{ + int i; + struct kvm_mmu_page *sp; -void kvm_mmu_module_exit(void) + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return; + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + hpa_t root = vcpu->arch.mmu.root_hpa; + sp = page_header(root); + __mmu_spte_walk(vcpu->kvm, sp, fn); + return; + } + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->arch.mmu.pae_root[i]; + + if (root && VALID_PAGE(root)) { + root &= PT64_BASE_ADDR_MASK; + sp = page_header(root); + __mmu_spte_walk(vcpu->kvm, sp, fn); + } + } + return; +} + +static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, + gva_t va, int level) { - mmu_destroy_caches(); - percpu_counter_destroy(&kvm_total_used_mmu_pages); - unregister_shrinker(&mmu_shrinker); - mmu_audit_disable(); + u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK); + int i; + gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1)); + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) { + u64 ent = pt[i]; + + if (ent == shadow_trap_nonpresent_pte) + continue; + + va = canonicalize(va); + if (is_shadow_present_pte(ent) && !is_last_spte(ent, level)) + audit_mappings_page(vcpu, ent, va, level - 1); + else { + gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL); + gfn_t gfn = gpa >> PAGE_SHIFT; + pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn); + hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT; + + if (is_error_pfn(pfn)) { + kvm_release_pfn_clean(pfn); + continue; + } + + if (is_shadow_present_pte(ent) + && (ent & PT64_BASE_ADDR_MASK) != hpa) + printk(KERN_ERR "xx audit error: (%s) levels %d" + " gva %lx gpa %llx hpa %llx ent %llx %d\n", + audit_msg, vcpu->arch.mmu.root_level, + va, gpa, hpa, ent, + is_shadow_present_pte(ent)); + else if (ent == shadow_notrap_nonpresent_pte + && !is_error_hpa(hpa)) + printk(KERN_ERR "audit: (%s) notrap shadow," + " valid guest gva %lx\n", audit_msg, va); + kvm_release_pfn_clean(pfn); + + } + } +} + +static void audit_mappings(struct kvm_vcpu *vcpu) +{ + unsigned i; + + if (vcpu->arch.mmu.root_level == 4) + audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4); + else + for (i = 0; i < 4; ++i) + if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK) + audit_mappings_page(vcpu, + vcpu->arch.mmu.pae_root[i], + i << 30, + 2); } + +static int count_rmaps(struct kvm_vcpu *vcpu) +{ + int nmaps = 0; + int i, j, k, idx; + + idx = kvm_srcu_read_lock(&kvm->srcu); + slots = rcu_dereference(kvm->memslots); + for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { + struct kvm_memory_slot *m = &slots->memslots[i]; + struct kvm_rmap_desc *d; + + for (j = 0; j < m->npages; ++j) { + unsigned long *rmapp = &m->rmap[j]; + + if (!*rmapp) + continue; + if (!(*rmapp & 1)) { + ++nmaps; + continue; + } + d = (struct kvm_rmap_desc *)(*rmapp & ~1ul); + while (d) { + for (k = 0; k < RMAP_EXT; ++k) + if (d->sptes[k]) + ++nmaps; + else + break; + d = d->more; + } + } + } + kvm_srcu_read_unlock(&kvm->srcu, idx); + return nmaps; +} + +void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep) +{ + unsigned long *rmapp; + struct kvm_mmu_page *rev_sp; + gfn_t gfn; + + if (*sptep & PT_WRITABLE_MASK) { + rev_sp = page_header(__pa(sptep)); + gfn = rev_sp->gfns[sptep - rev_sp->spt]; + + if (!gfn_to_memslot(kvm, gfn)) { + if (!printk_ratelimit()) + return; + printk(KERN_ERR "%s: no memslot for gfn %ld\n", + audit_msg, gfn); + printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n", + audit_msg, sptep - rev_sp->spt, + rev_sp->gfn); + dump_stack(); + return; + } + + rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt], + is_large_pte(*sptep)); + if 
(!*rmapp) { + if (!printk_ratelimit()) + return; + printk(KERN_ERR "%s: no rmap for writable spte %llx\n", + audit_msg, *sptep); + dump_stack(); + } + } + +} + +void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu) +{ + mmu_spte_walk(vcpu, inspect_spte_has_rmap); +} + +static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu_page *sp; + int i; + + list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { + u64 *pt = sp->spt; + + if (sp->role.level != PT_PAGE_TABLE_LEVEL) + continue; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + u64 ent = pt[i]; + + if (!(ent & PT_PRESENT_MASK)) + continue; + if (!(ent & PT_WRITABLE_MASK)) + continue; + inspect_spte_has_rmap(vcpu->kvm, sp, &pt[i]); + } + } + return; +} + +static void audit_rmap(struct kvm_vcpu *vcpu) +{ + check_writable_mappings_rmap(vcpu); + count_rmaps(vcpu); +} + +static void audit_write_protection(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu_page *sp; + struct kvm_memory_slot *slot; + unsigned long *rmapp; + u64 *spte; + gfn_t gfn; + + list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { + if (sp->role.direct) + continue; + if (sp->unsync) + continue; + + gfn = unalias_gfn(vcpu->kvm, sp->gfn); + slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn); + rmapp = &slot->rmap[gfn - slot->base_gfn]; + + spte = rmap_next(vcpu->kvm, rmapp, NULL); + while (spte) { + if (*spte & PT_WRITABLE_MASK) + printk(KERN_ERR "%s: (%s) shadow page has " + "writable mappings: gfn %lx role %x\n", + __func__, audit_msg, sp->gfn, + sp->role.word); + spte = rmap_next(vcpu->kvm, rmapp, spte); + } + } +} + +static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) +{ + int olddbg = dbg; + + dbg = 0; + audit_msg = msg; + audit_rmap(vcpu); + audit_write_protection(vcpu); + if (strcmp("pre pte write", audit_msg) != 0) + audit_mappings(vcpu); + audit_writable_sptes_have_rmaps(vcpu); + dbg = olddbg; +} + +#endif diff --git a/linux/x86/mmu.h b/linux/x86/mmu.h index 7086ca8..be66759 100644 --- a/linux/x86/mmu.h +++ b/linux/x86/mmu.h @@ -49,17 +49,10 @@ #define PFERR_FETCH_MASK (1U << 4) int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); -int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); - -static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) -{ - return kvm->arch.n_max_mmu_pages - - kvm->arch.n_used_mmu_pages; -} static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { - if (unlikely(kvm_mmu_available_pages(vcpu->kvm)< KVM_MIN_FREE_MMU_PAGES)) + if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) __kvm_mmu_free_some_pages(vcpu); } diff --git a/linux/x86/mmu_audit.c b/linux/x86/mmu_audit.c deleted file mode 100644 index 3b2d201..0000000 --- a/linux/x86/mmu_audit.c +++ /dev/null @@ -1,344 +0,0 @@ -#ifndef KVM_UNIFDEF_H -#define KVM_UNIFDEF_H - -#ifdef __i386__ -#ifndef CONFIG_X86_32 -#define CONFIG_X86_32 1 -#endif -#endif - -#ifdef __x86_64__ -#ifndef CONFIG_X86_64 -#define CONFIG_X86_64 1 -#endif -#endif - -#if defined(__i386__) || defined (__x86_64__) -#ifndef CONFIG_X86 -#define CONFIG_X86 1 -#endif -#endif - -#ifdef __ia64__ -#ifndef CONFIG_IA64 -#define CONFIG_IA64 1 -#endif -#endif - -#ifdef __PPC__ -#ifndef CONFIG_PPC -#define CONFIG_PPC 1 -#endif -#endif - -#ifdef __s390__ -#ifndef CONFIG_S390 -#define CONFIG_S390 1 -#endif -#endif - -#endif -/* - * mmu_audit.c: - * - * Audit code for KVM MMU - * - * Copyright (C) 2006 Qumranet, Inc. - * Copyright 2010 Red Hat, Inc. and/or its affiliates. 
- * - * Authors: - * Yaniv Kamay <yaniv@qumranet.com> - * Avi Kivity <avi@qumranet.com> - * Marcelo Tosatti <mtosatti@redhat.com> - * Xiao Guangrong <xiaoguangrong@cn.fujitsu.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - */ - -#include <linux/ratelimit.h> - -#define audit_printk(kvm, fmt, args...) \ - printk(KERN_ERR "audit: (%s) error: " \ - fmt, audit_point_name[kvm->arch.audit_point], ##args) - -typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level); - -static void __mmu_spte_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - inspect_spte_fn fn, int level) -{ - int i; - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - u64 *ent = sp->spt; - - fn(vcpu, ent + i, level); - - if (is_shadow_present_pte(ent[i]) && - !is_last_spte(ent[i], level)) { - struct kvm_mmu_page *child; - - child = page_header(ent[i] & PT64_BASE_ADDR_MASK); - __mmu_spte_walk(vcpu, child, fn, level - 1); - } - } -} - -static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) -{ - int i; - struct kvm_mmu_page *sp; - - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) - return; - - if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { - hpa_t root = vcpu->arch.mmu.root_hpa; - - sp = page_header(root); - __mmu_spte_walk(vcpu, sp, fn, PT64_ROOT_LEVEL); - return; - } - - for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->arch.mmu.pae_root[i]; - - if (root && VALID_PAGE(root)) { - root &= PT64_BASE_ADDR_MASK; - sp = page_header(root); - __mmu_spte_walk(vcpu, sp, fn, 2); - } - } - - return; -} - -typedef void (*sp_handler) (struct kvm *kvm, struct kvm_mmu_page *sp); - -static void walk_all_active_sps(struct kvm *kvm, sp_handler fn) -{ - struct kvm_mmu_page *sp; - - list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) - fn(kvm, sp); -} - -static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) -{ - struct kvm_mmu_page *sp; - gfn_t gfn; - pfn_t pfn; - hpa_t hpa; - - sp = page_header(__pa(sptep)); - - if (sp->unsync) { - if (level != PT_PAGE_TABLE_LEVEL) { - audit_printk(vcpu->kvm, "unsync sp: %p " - "level = %d\n", sp, level); - return; - } - - if (*sptep == shadow_notrap_nonpresent_pte) { - audit_printk(vcpu->kvm, "notrap spte in unsync " - "sp: %p\n", sp); - return; - } - } - - if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) { - audit_printk(vcpu->kvm, "notrap spte in direct sp: %p\n", - sp); - return; - } - - if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level)) - return; - - gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); - pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn); - - if (is_error_pfn(pfn)) { - kvm_release_pfn_clean(pfn); - return; - } - - hpa = pfn << PAGE_SHIFT; - if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) - audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx " - "ent %llxn", vcpu->arch.mmu.root_level, pfn, - hpa, *sptep); -} - -static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) -{ - unsigned long *rmapp; - struct kvm_mmu_page *rev_sp; - gfn_t gfn; - - - rev_sp = page_header(__pa(sptep)); - gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); - - if (!gfn_to_memslot(kvm, gfn)) { - if (!printk_ratelimit()) - return; - audit_printk(kvm, "no memslot for gfn %llx\n", gfn); - audit_printk(kvm, "index %ld of sp (gfn=%llx)\n", - (long int)(sptep - rev_sp->spt), rev_sp->gfn); - dump_stack(); - return; - } - - rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level); - if (!*rmapp) { - if (!printk_ratelimit()) - return; - audit_printk(kvm, "no rmap for 
writable spte %llx\n", - *sptep); - dump_stack(); - } -} - -static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level) -{ - if (is_shadow_present_pte(*sptep) && is_last_spte(*sptep, level)) - inspect_spte_has_rmap(vcpu->kvm, sptep); -} - -static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level) -{ - struct kvm_mmu_page *sp = page_header(__pa(sptep)); - - if (vcpu->kvm->arch.audit_point == AUDIT_POST_SYNC && sp->unsync) - audit_printk(vcpu->kvm, "meet unsync sp(%p) after sync " - "root.\n", sp); -} - -static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - int i; - - if (sp->role.level != PT_PAGE_TABLE_LEVEL) - return; - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - if (!is_rmap_spte(sp->spt[i])) - continue; - - inspect_spte_has_rmap(kvm, sp->spt + i); - } -} - -static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - struct kvm_memory_slot *slot; - unsigned long *rmapp; - u64 *spte; - - if (sp->role.direct || sp->unsync || sp->role.invalid) - return; - - slot = gfn_to_memslot(kvm, sp->gfn); - rmapp = &slot->rmap[sp->gfn - slot->base_gfn]; - - spte = rmap_next(kvm, rmapp, NULL); - while (spte) { - if (is_writable_pte(*spte)) - audit_printk(kvm, "shadow page has writable " - "mappings: gfn %llx role %x\n", - sp->gfn, sp->role.word); - spte = rmap_next(kvm, rmapp, spte); - } -} - -static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - check_mappings_rmap(kvm, sp); - audit_write_protection(kvm, sp); -} - -static void audit_all_active_sps(struct kvm *kvm) -{ - walk_all_active_sps(kvm, audit_sp); -} - -static void audit_spte(struct kvm_vcpu *vcpu, u64 *sptep, int level) -{ - audit_sptes_have_rmaps(vcpu, sptep, level); - audit_mappings(vcpu, sptep, level); - audit_spte_after_sync(vcpu, sptep, level); -} - -static void audit_vcpu_spte(struct kvm_vcpu *vcpu) -{ - mmu_spte_walk(vcpu, audit_spte); -} - -static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point) -{ - static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); - - if (!__ratelimit(&ratelimit_state)) - return; - - vcpu->kvm->arch.audit_point = point; - audit_all_active_sps(vcpu->kvm); - audit_vcpu_spte(vcpu); -} - -static bool mmu_audit; - -static void mmu_audit_enable(void) -{ - int ret; - - if (mmu_audit) - return; - - ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL); - WARN_ON(ret); - - mmu_audit = true; -} - -static void mmu_audit_disable(void) -{ - if (!mmu_audit) - return; - - unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL); - tracepoint_synchronize_unregister(); - mmu_audit = false; -} - -static int mmu_audit_set(const char *val, const struct kernel_param *kp) -{ - int ret; - unsigned long enable; - - ret = strict_strtoul(val, 10, &enable); - if (ret < 0) - return -EINVAL; - - switch (enable) { - case 0: - mmu_audit_disable(); - break; - case 1: - mmu_audit_enable(); - break; - default: - return -EINVAL; - } - - return 0; -} - -static struct kernel_param_ops audit_param_ops = { - .set = mmu_audit_set, - .get = param_get_bool, -}; - -module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644); diff --git a/linux/x86/mmutrace.h b/linux/x86/mmutrace.h index b60b4fd..3e4a5c6 100644 --- a/linux/x86/mmutrace.h +++ b/linux/x86/mmutrace.h @@ -6,12 +6,14 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM kvmmmu +#define TRACE_INCLUDE_PATH . 
+#define TRACE_INCLUDE_FILE mmutrace #define KVM_MMU_PAGE_FIELDS \ __field(__u64, gfn) \ __field(__u32, role) \ __field(__u32, root_count) \ - __field(bool, unsync) + __field(__u32, unsync) #define KVM_MMU_PAGE_ASSIGN(sp) \ __entry->gfn = sp->gfn; \ @@ -28,14 +30,14 @@ \ role.word = __entry->role; \ \ - trace_seq_printf(p, "sp gfn %llx %u%s q%u%s %s%s" \ + trace_seq_printf(p, "sp gfn %llx %u/%u q%u%s %s%s %spge" \ " %snxe root %u %s%c", \ - __entry->gfn, role.level, \ - role.cr4_pae ? " pae" : "", \ + __entry->gfn, role.level, role.glevels, \ role.quadrant, \ role.direct ? " direct" : "", \ access_str[role.access], \ role.invalid ? " invalid" : "", \ + role.cr4_pge ? "" : "!", \ role.nxe ? "" : "!", \ __entry->root_count, \ __entry->unsync ? "unsync" : "sync", 0); \ @@ -92,15 +94,15 @@ TRACE_EVENT( TP_printk("pte %llx level %u", __entry->pte, __entry->level) ); -DECLARE_EVENT_CLASS(kvm_mmu_set_bit_class, - +/* We set a pte accessed bit */ +TRACE_EVENT( + kvm_mmu_set_accessed_bit, TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size), - TP_ARGS(table_gfn, index, size), TP_STRUCT__entry( __field(__u64, gpa) - ), + ), TP_fast_assign( __entry->gpa = ((u64)table_gfn << PAGE_SHIFT) @@ -110,20 +112,22 @@ DECLARE_EVENT_CLASS(kvm_mmu_set_bit_class, TP_printk("gpa %llx", __entry->gpa) ); -/* We set a pte accessed bit */ -DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_accessed_bit, - +/* We set a pte dirty bit */ +TRACE_EVENT( + kvm_mmu_set_dirty_bit, TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size), + TP_ARGS(table_gfn, index, size), - TP_ARGS(table_gfn, index, size) -); - -/* We set a pte dirty bit */ -DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_dirty_bit, + TP_STRUCT__entry( + __field(__u64, gpa) + ), - TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size), + TP_fast_assign( + __entry->gpa = ((u64)table_gfn << PAGE_SHIFT) + + index * size; + ), - TP_ARGS(table_gfn, index, size) + TP_printk("gpa %llx", __entry->gpa) ); TRACE_EVENT( @@ -162,64 +166,55 @@ TRACE_EVENT( __entry->created ? "new" : "existing") ); -DECLARE_EVENT_CLASS(kvm_mmu_page_class, - +TRACE_EVENT( + kvm_mmu_sync_page, TP_PROTO(struct kvm_mmu_page *sp), TP_ARGS(sp), TP_STRUCT__entry( KVM_MMU_PAGE_FIELDS - ), + ), TP_fast_assign( KVM_MMU_PAGE_ASSIGN(sp) - ), + ), TP_printk("%s", KVM_MMU_PAGE_PRINTK()) ); -DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_sync_page, - TP_PROTO(struct kvm_mmu_page *sp), - - TP_ARGS(sp) -); - -DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_unsync_page, +TRACE_EVENT( + kvm_mmu_unsync_page, TP_PROTO(struct kvm_mmu_page *sp), + TP_ARGS(sp), - TP_ARGS(sp) -); + TP_STRUCT__entry( + KVM_MMU_PAGE_FIELDS + ), -DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page, - TP_PROTO(struct kvm_mmu_page *sp), + TP_fast_assign( + KVM_MMU_PAGE_ASSIGN(sp) + ), - TP_ARGS(sp) + TP_printk("%s", KVM_MMU_PAGE_PRINTK()) ); TRACE_EVENT( - kvm_mmu_audit, - TP_PROTO(struct kvm_vcpu *vcpu, int audit_point), - TP_ARGS(vcpu, audit_point), + kvm_mmu_zap_page, + TP_PROTO(struct kvm_mmu_page *sp), + TP_ARGS(sp), TP_STRUCT__entry( - __field(struct kvm_vcpu *, vcpu) - __field(int, audit_point) - ), + KVM_MMU_PAGE_FIELDS + ), TP_fast_assign( - __entry->vcpu = vcpu; - __entry->audit_point = audit_point; - ), + KVM_MMU_PAGE_ASSIGN(sp) + ), - TP_printk("vcpu:%d %s", __entry->vcpu->cpu, - audit_point_name[__entry->audit_point]) + TP_printk("%s", KVM_MMU_PAGE_PRINTK()) ); -#endif /* _TRACE_KVMMMU_H */ -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH . 
-#undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_FILE mmutrace +#endif /* _TRACE_KVMMMU_H */ /* This part must be outside protection */ #include <trace/define_trace.h> diff --git a/linux/x86/paging_tmpl.h b/linux/x86/paging_tmpl.h index 6bccc24..81eab9a 100644 --- a/linux/x86/paging_tmpl.h +++ b/linux/x86/paging_tmpl.h @@ -7,7 +7,6 @@ * MMU support * * Copyright (C) 2006 Qumranet, Inc. - * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Authors: * Yaniv Kamay <yaniv@qumranet.com> @@ -67,12 +66,11 @@ struct guest_walker { int level; gfn_t table_gfn[PT_MAX_FULL_LEVELS]; pt_element_t ptes[PT_MAX_FULL_LEVELS]; - pt_element_t prefetch_ptes[PTE_PREFETCH_NUM]; gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; unsigned pt_access; unsigned pte_access; gfn_t gfn; - struct x86_exception fault; + u32 error_code; }; static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) @@ -105,7 +103,7 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; #if PTTYPE == 64 - if (vcpu->arch.mmu.nx) + if (is_nx(vcpu)) access &= ~(gpte >> PT64_NX_SHIFT); #endif return access; @@ -114,42 +112,32 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) /* * Fetch a guest pte for a guest virtual address */ -static int FNAME(walk_addr_generic)(struct guest_walker *walker, - struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - gva_t addr, u32 access) +static int FNAME(walk_addr)(struct guest_walker *walker, + struct kvm_vcpu *vcpu, gva_t addr, + int write_fault, int user_fault, int fetch_fault) { pt_element_t pte; gfn_t table_gfn; - unsigned index, pt_access, uninitialized_var(pte_access); + unsigned index, pt_access, pte_access; gpa_t pte_gpa; - bool eperm, present, rsvd_fault; - int offset, write_fault, user_fault, fetch_fault; - - write_fault = access & PFERR_WRITE_MASK; - user_fault = access & PFERR_USER_MASK; - fetch_fault = access & PFERR_FETCH_MASK; + int rsvd_fault = 0; trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, fetch_fault); walk: - present = true; - eperm = rsvd_fault = false; - walker->level = mmu->root_level; - pte = mmu->get_cr3(vcpu); - + walker->level = vcpu->arch.mmu.root_level; + pte = vcpu->arch.cr3; #if PTTYPE == 64 - if (walker->level == PT32E_ROOT_LEVEL) { - pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3); + if (!is_long_mode(vcpu)) { + pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3); trace_kvm_mmu_paging_element(pte, walker->level); - if (!is_present_gpte(pte)) { - present = false; - goto error; - } + if (!is_present_gpte(pte)) + goto not_present; --walker->level; } #endif ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || - (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); + (vcpu->arch.cr3 & CR3_NONPAE_RESERVED_BITS) == 0); pt_access = ACC_ALL; @@ -157,49 +145,42 @@ walk: index = PT_INDEX(addr, walker->level); table_gfn = gpte_to_gfn(pte); - offset = index * sizeof(pt_element_t); - pte_gpa = gfn_to_gpa(table_gfn) + offset; + pte_gpa = gfn_to_gpa(table_gfn); + pte_gpa += index * sizeof(pt_element_t); walker->table_gfn[walker->level - 1] = table_gfn; walker->pte_gpa[walker->level - 1] = pte_gpa; - if (kvm_read_guest_page_mmu(vcpu, mmu, table_gfn, &pte, - offset, sizeof(pte), - PFERR_USER_MASK|PFERR_WRITE_MASK)) { - present = false; - break; - } + if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) + goto not_present; trace_kvm_mmu_paging_element(pte, walker->level); - if (!is_present_gpte(pte)) { - present = false; - break; - } + if (!is_present_gpte(pte)) + goto 
not_present; - if (is_rsvd_bits_set(&vcpu->arch.mmu, pte, walker->level)) { - rsvd_fault = true; - break; - } + rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level); + if (rsvd_fault) + goto access_error; if (write_fault && !is_writable_pte(pte)) if (user_fault || is_write_protection(vcpu)) - eperm = true; + goto access_error; if (user_fault && !(pte & PT_USER_MASK)) - eperm = true; + goto access_error; #if PTTYPE == 64 - if (fetch_fault && (pte & PT64_NX_MASK)) - eperm = true; + if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK)) + goto access_error; #endif - if (!eperm && !rsvd_fault && !(pte & PT_ACCESSED_MASK)) { + if (!(pte & PT_ACCESSED_MASK)) { trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte)); + mark_page_dirty(vcpu->kvm, table_gfn); if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, pte|PT_ACCESSED_MASK)) goto walk; - mark_page_dirty(vcpu->kvm, table_gfn); pte |= PT_ACCESSED_MASK; } @@ -209,32 +190,21 @@ walk: if ((walker->level == PT_PAGE_TABLE_LEVEL) || ((walker->level == PT_DIRECTORY_LEVEL) && - is_large_pte(pte) && + (pte & PT_PAGE_SIZE_MASK) && (PTTYPE == 64 || is_pse(vcpu))) || ((walker->level == PT_PDPE_LEVEL) && - is_large_pte(pte) && - mmu->root_level == PT64_ROOT_LEVEL)) { + (pte & PT_PAGE_SIZE_MASK) && + is_long_mode(vcpu))) { int lvl = walker->level; - gpa_t real_gpa; - gfn_t gfn; - u32 ac; - gfn = gpte_to_gfn_lvl(pte, lvl); - gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT; + walker->gfn = gpte_to_gfn_lvl(pte, lvl); + walker->gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) + >> PAGE_SHIFT; if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36()) - gfn += pse36_gfn_delta(pte); - - ac = write_fault | fetch_fault | user_fault; - - real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), - ac); - if (real_gpa == UNMAPPED_GVA) - return 0; - - walker->gfn = real_gpa >> PAGE_SHIFT; + walker->gfn += pse36_gfn_delta(pte); break; } @@ -243,18 +213,15 @@ walk: --walker->level; } - if (!present || eperm || rsvd_fault) - goto error; - if (write_fault && !is_dirty_gpte(pte)) { bool ret; trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); + mark_page_dirty(vcpu->kvm, table_gfn); ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, pte|PT_DIRTY_MASK); if (ret) goto walk; - mark_page_dirty(vcpu->kvm, table_gfn); pte |= PT_DIRTY_MASK; walker->ptes[walker->level - 1] = pte; } @@ -262,71 +229,30 @@ walk: walker->pt_access = pt_access; walker->pte_access = pte_access; pgprintk("%s: pte %llx pte_access %x pt_access %x\n", - __func__, (u64)pte, pte_access, pt_access); + __func__, (u64)pte, pt_access, pte_access); return 1; -error: - walker->fault.vector = PF_VECTOR; - walker->fault.error_code_valid = true; - walker->fault.error_code = 0; - if (present) - walker->fault.error_code |= PFERR_PRESENT_MASK; +not_present: + walker->error_code = 0; + goto err; - walker->fault.error_code |= write_fault | user_fault; +access_error: + walker->error_code = PFERR_PRESENT_MASK; - if (fetch_fault && mmu->nx) - walker->fault.error_code |= PFERR_FETCH_MASK; +err: + if (write_fault) + walker->error_code |= PFERR_WRITE_MASK; + if (user_fault) + walker->error_code |= PFERR_USER_MASK; + if (fetch_fault) + walker->error_code |= PFERR_FETCH_MASK; if (rsvd_fault) - walker->fault.error_code |= PFERR_RSVD_MASK; - - walker->fault.address = addr; - walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu; - - trace_kvm_mmu_walker_error(walker->fault.error_code); + walker->error_code |= PFERR_RSVD_MASK; + trace_kvm_mmu_walker_error(walker->error_code); return 
0; } -static int FNAME(walk_addr)(struct guest_walker *walker, - struct kvm_vcpu *vcpu, gva_t addr, u32 access) -{ - return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu, addr, - access); -} - -static int FNAME(walk_addr_nested)(struct guest_walker *walker, - struct kvm_vcpu *vcpu, gva_t addr, - u32 access) -{ - return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu, - addr, access); -} - -static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, u64 *spte, - pt_element_t gpte) -{ - u64 nonpresent = shadow_trap_nonpresent_pte; - - if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) - goto no_present; - - if (!is_present_gpte(gpte)) { - if (!sp->unsync) - nonpresent = shadow_notrap_nonpresent_pte; - goto no_present; - } - - if (!(gpte & PT_ACCESSED_MASK)) - goto no_present; - - return false; - -no_present: - drop_spte(vcpu->kvm, spte, nonpresent); - return true; -} - -static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, +static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, u64 *spte, const void *pte) { pt_element_t gpte; @@ -334,11 +260,13 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, pfn_t pfn; gpte = *(const pt_element_t *)pte; - if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) + if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { + if (!is_present_gpte(gpte)) + __set_spte(spte, shadow_notrap_nonpresent_pte); return; - + } pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); - pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte); if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) return; pfn = vcpu->arch.update_pte.pfn; @@ -348,181 +276,91 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return; kvm_get_pfn(pfn); /* - * we call mmu_set_spte() with host_writable = true beacuse that + * we call mmu_set_spte() with reset_host_protection = true beacuse that * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). 
*/ - mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, - is_dirty_gpte(gpte), NULL, PT_PAGE_TABLE_LEVEL, + mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, + gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL, gpte_to_gfn(gpte), pfn, true, true); } -static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, - struct guest_walker *gw, int level) -{ - pt_element_t curr_pte; - gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1]; - u64 mask; - int r, index; - - if (level == PT_PAGE_TABLE_LEVEL) { - mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1; - base_gpa = pte_gpa & ~mask; - index = (pte_gpa - base_gpa) / sizeof(pt_element_t); - - r = kvm_read_guest_atomic(vcpu->kvm, base_gpa, - gw->prefetch_ptes, sizeof(gw->prefetch_ptes)); - curr_pte = gw->prefetch_ptes[index]; - } else - r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, - &curr_pte, sizeof(curr_pte)); - - return r || curr_pte != gw->ptes[level - 1]; -} - -static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, - u64 *sptep) -{ - struct kvm_mmu_page *sp; - pt_element_t *gptep = gw->prefetch_ptes; - u64 *spte; - int i; - - sp = page_header(__pa(sptep)); - - if (sp->role.level > PT_PAGE_TABLE_LEVEL) - return; - - if (sp->role.direct) - return __direct_pte_prefetch(vcpu, sp, sptep); - - i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); - spte = sp->spt + i; - - for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { - pt_element_t gpte; - unsigned pte_access; - gfn_t gfn; - pfn_t pfn; - bool dirty; - - if (spte == sptep) - continue; - - if (*spte != shadow_trap_nonpresent_pte) - continue; - - gpte = gptep[i]; - - if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) - continue; - - pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); - gfn = gpte_to_gfn(gpte); - dirty = is_dirty_gpte(gpte); - pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, - (pte_access & ACC_WRITE_MASK) && dirty); - if (is_error_pfn(pfn)) { - kvm_release_pfn_clean(pfn); - break; - } - - mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, - dirty, NULL, PT_PAGE_TABLE_LEVEL, gfn, - pfn, true, true); - } -} - /* * Fetch a shadow pte for a specific level in the paging hierarchy. */ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, struct guest_walker *gw, int user_fault, int write_fault, int hlevel, - int *ptwrite, pfn_t pfn, bool map_writable, - bool prefault) + int *ptwrite, pfn_t pfn) { unsigned access = gw->pt_access; - struct kvm_mmu_page *sp = NULL; - bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]); - int top_level; - unsigned direct_access; - struct kvm_shadow_walk_iterator it; + struct kvm_mmu_page *shadow_page; + u64 spte, *sptep = NULL; + int direct; + gfn_t table_gfn; + int r; + int level; + pt_element_t curr_pte; + struct kvm_shadow_walk_iterator iterator; if (!is_present_gpte(gw->ptes[gw->level - 1])) return NULL; - direct_access = gw->pt_access & gw->pte_access; - if (!dirty) - direct_access &= ~ACC_WRITE_MASK; - - top_level = vcpu->arch.mmu.root_level; - if (top_level == PT32E_ROOT_LEVEL) - top_level = PT32_ROOT_LEVEL; - /* - * Verify that the top-level gpte is still there. Since the page - * is a root page, it is either write protected (and cannot be - * changed from now on) or it is invalid (in which case, we don't - * really care if it changes underneath us after this point). 
- */ - if (FNAME(gpte_changed)(vcpu, gw, top_level)) - goto out_gpte_changed; - - for (shadow_walk_init(&it, vcpu, addr); - shadow_walk_okay(&it) && it.level > gw->level; - shadow_walk_next(&it)) { - gfn_t table_gfn; - - drop_large_spte(vcpu, it.sptep); - - sp = NULL; - if (!is_shadow_present_pte(*it.sptep)) { - table_gfn = gw->table_gfn[it.level - 2]; - sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1, - false, access, it.sptep); + for_each_shadow_entry(vcpu, addr, iterator) { + level = iterator.level; + sptep = iterator.sptep; + if (iterator.level == hlevel) { + mmu_set_spte(vcpu, sptep, access, + gw->pte_access & access, + user_fault, write_fault, + gw->ptes[gw->level-1] & PT_DIRTY_MASK, + ptwrite, level, + gw->gfn, pfn, false, true); + break; } - /* - * Verify that the gpte in the page we've just write - * protected is still there. - */ - if (FNAME(gpte_changed)(vcpu, gw, it.level - 1)) - goto out_gpte_changed; - - if (sp) - link_shadow_page(it.sptep, sp); - } - - for (; - shadow_walk_okay(&it) && it.level > hlevel; - shadow_walk_next(&it)) { - gfn_t direct_gfn; - - validate_direct_spte(vcpu, it.sptep, direct_access); - - drop_large_spte(vcpu, it.sptep); - - if (is_shadow_present_pte(*it.sptep)) + if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) continue; - direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); - - sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1, - true, direct_access, it.sptep); - link_shadow_page(it.sptep, sp); - } + if (is_large_pte(*sptep)) { + rmap_remove(vcpu->kvm, sptep); + __set_spte(sptep, shadow_trap_nonpresent_pte); + kvm_flush_remote_tlbs(vcpu->kvm); + } - mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, - user_fault, write_fault, dirty, ptwrite, it.level, - gw->gfn, pfn, prefault, map_writable); - FNAME(pte_prefetch)(vcpu, gw, it.sptep); + if (level <= gw->level) { + int delta = level - gw->level + 1; + direct = 1; + if (!is_dirty_gpte(gw->ptes[level - delta])) + access &= ~ACC_WRITE_MASK; + table_gfn = gpte_to_gfn(gw->ptes[level - delta]); + /* advance table_gfn when emulating 1gb pages with 4k */ + if (delta == 0) + table_gfn += PT_INDEX(addr, level); + } else { + direct = 0; + table_gfn = gw->table_gfn[level - 2]; + } + shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, + direct, access, sptep); + if (!direct) { + r = kvm_read_guest_atomic(vcpu->kvm, + gw->pte_gpa[level - 2], + &curr_pte, sizeof(curr_pte)); + if (r || curr_pte != gw->ptes[level - 2]) { + kvm_mmu_put_page(shadow_page, sptep); + kvm_release_pfn_clean(pfn); + sptep = NULL; + break; + } + } - return it.sptep; + spte = __pa(shadow_page->spt) + | PT_PRESENT_MASK | PT_ACCESSED_MASK + | PT_WRITABLE_MASK | PT_USER_MASK; + *sptep = spte; + } -out_gpte_changed: - if (sp) - kvm_mmu_put_page(sp, it.sptep); - kvm_release_pfn_clean(pfn); - return NULL; + return sptep; } /* @@ -539,22 +377,22 @@ out_gpte_changed: * Returns: 1 if we need to emulate the instruction, 0 otherwise, or * a negative value on error. 
*/ -static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, - bool prefault) +static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, + u32 error_code) { int write_fault = error_code & PFERR_WRITE_MASK; int user_fault = error_code & PFERR_USER_MASK; + int fetch_fault = error_code & PFERR_FETCH_MASK; struct guest_walker walker; u64 *sptep; int write_pt = 0; int r; pfn_t pfn; int level = PT_PAGE_TABLE_LEVEL; - int force_pt_level; unsigned long mmu_seq; - bool map_writable; pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); + kvm_mmu_audit(vcpu, "pre page fault"); r = mmu_topup_memory_caches(vcpu); if (r) @@ -563,52 +401,41 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, /* * Look up the guest pte for the faulting address. */ - r = FNAME(walk_addr)(&walker, vcpu, addr, error_code); + r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, + fetch_fault); /* * The page is not mapped by the guest. Let the guest handle it. */ if (!r) { pgprintk("%s: guest page fault\n", __func__); - if (!prefault) { - inject_page_fault(vcpu, &walker.fault); - /* reset fork detector */ - vcpu->arch.last_pt_write_count = 0; - } + inject_page_fault(vcpu, addr, walker.error_code); + vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ return 0; } - if (walker.level >= PT_DIRECTORY_LEVEL) - force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn); - else - force_pt_level = 1; - if (!force_pt_level) { + if (walker.level >= PT_DIRECTORY_LEVEL) { level = min(walker.level, mapping_level(vcpu, walker.gfn)); walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); } mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - - if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault, - &map_writable)) - return 0; + pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); /* mmio */ - if (is_error_pfn(pfn)) - return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn); + if (is_error_pfn(pfn)) { + pgprintk("gfn %lx is mmio\n", walker.gfn); + kvm_release_pfn_clean(pfn); + return 1; + } spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; - - trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); kvm_mmu_free_some_pages(vcpu); - if (!force_pt_level) - transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, - level, &write_pt, pfn, map_writable, prefault); - (void)sptep; + level, &write_pt, pfn); pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, sptep, *sptep, write_pt); @@ -616,7 +443,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ ++vcpu->stat.pf_fixed; - trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); + kvm_mmu_audit(vcpu, "post page fault (fixed)"); spin_unlock(&vcpu->kvm->mmu_lock); return write_pt; @@ -630,8 +457,6 @@ out_unlock: static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) { struct kvm_shadow_walk_iterator iterator; - struct kvm_mmu_page *sp; - gpa_t pte_gpa = -1; int level; u64 *sptep; int need_flush = 0; @@ -642,83 +467,46 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) level = iterator.level; sptep = iterator.sptep; - sp = page_header(__pa(sptep)); - if (is_last_spte(*sptep, level)) { - int offset, shift; - - if (!sp->unsync) - break; - - shift = PAGE_SHIFT - - (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level; - offset = sp->role.quadrant << shift; - - pte_gpa = (sp->gfn << PAGE_SHIFT) + offset; - 
pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); + if (level == PT_PAGE_TABLE_LEVEL || + ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) || + ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) { if (is_shadow_present_pte(*sptep)) { + rmap_remove(vcpu->kvm, sptep); if (is_large_pte(*sptep)) --vcpu->kvm->stat.lpages; - drop_spte(vcpu->kvm, sptep, - shadow_trap_nonpresent_pte); need_flush = 1; - } else - __set_spte(sptep, shadow_trap_nonpresent_pte); + } + __set_spte(sptep, shadow_trap_nonpresent_pte); break; } - if (!is_shadow_present_pte(*sptep) || !sp->unsync_children) + if (!is_shadow_present_pte(*sptep)) break; } if (need_flush) kvm_flush_remote_tlbs(vcpu->kvm); - - atomic_inc(&vcpu->kvm->arch.invlpg_counter); - spin_unlock(&vcpu->kvm->mmu_lock); - - if (pte_gpa == -1) - return; - - if (mmu_topup_memory_caches(vcpu)) - return; - kvm_mmu_pte_write(vcpu, pte_gpa, NULL, sizeof(pt_element_t), 0); } static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, - struct x86_exception *exception) + u32 *error) { struct guest_walker walker; gpa_t gpa = UNMAPPED_GVA; int r; - r = FNAME(walk_addr)(&walker, vcpu, vaddr, access); + r = FNAME(walk_addr)(&walker, vcpu, vaddr, + !!(access & PFERR_WRITE_MASK), + !!(access & PFERR_USER_MASK), + !!(access & PFERR_FETCH_MASK)); if (r) { gpa = gfn_to_gpa(walker.gfn); gpa |= vaddr & ~PAGE_MASK; - } else if (exception) - *exception = walker.fault; - - return gpa; -} - -static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, - u32 access, - struct x86_exception *exception) -{ - struct guest_walker walker; - gpa_t gpa = UNMAPPED_GVA; - int r; - - r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access); - - if (r) { - gpa = gfn_to_gpa(walker.gfn); - gpa |= vaddr & ~PAGE_MASK; - } else if (exception) - *exception = walker.fault; + } else if (error) + *error = walker.error_code; return gpa; } @@ -757,68 +545,59 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, * Using the cached information from sp->gfns is safe because: * - The spte has a reference to the struct page, so the pfn for a given gfn * can't change unless all sptes pointing to it are nuked first. - * - * Note: - * We should flush all tlbs if spte is dropped even though guest is - * responsible for it. Since if we don't, kvm_mmu_notifier_invalidate_page - * and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't - * used by guest then tlbs are not flushed, so guest is allowed to access the - * freed pages. - * And we increase kvm->tlbs_dirty to delay tlbs flush in this case. + * - Alias changes zap the entire shadow cache. */ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { int i, offset, nr_present; - bool host_writable; - gpa_t first_pte_gpa; + bool reset_host_protection; offset = nr_present = 0; - /* direct kvm_mmu_page can not be unsync. 
*/ - BUG_ON(sp->role.direct); - if (PTTYPE == 32) offset = sp->role.quadrant << PT64_LEVEL_BITS; - first_pte_gpa = gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t); - for (i = 0; i < PT64_ENT_PER_PAGE; i++) { unsigned pte_access; pt_element_t gpte; gpa_t pte_gpa; - gfn_t gfn; + gfn_t gfn = sp->gfns[i]; if (!is_shadow_present_pte(sp->spt[i])) continue; - pte_gpa = first_pte_gpa + i * sizeof(pt_element_t); + pte_gpa = gfn_to_gpa(sp->gfn); + pte_gpa += (i+offset) * sizeof(pt_element_t); if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, sizeof(pt_element_t))) return -EINVAL; - gfn = gpte_to_gfn(gpte); - - if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { - vcpu->kvm->tlbs_dirty++; - continue; - } + if (gpte_to_gfn(gpte) != gfn || !is_present_gpte(gpte) || + !(gpte & PT_ACCESSED_MASK)) { + u64 nonpresent; - if (gfn != sp->gfns[i]) { - drop_spte(vcpu->kvm, &sp->spt[i], - shadow_trap_nonpresent_pte); - vcpu->kvm->tlbs_dirty++; + rmap_remove(vcpu->kvm, &sp->spt[i]); + if (is_present_gpte(gpte)) + nonpresent = shadow_trap_nonpresent_pte; + else + nonpresent = shadow_notrap_nonpresent_pte; + __set_spte(&sp->spt[i], nonpresent); continue; } nr_present++; pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); - host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE; - + if (!(sp->spt[i] & SPTE_HOST_WRITEABLE)) { + pte_access &= ~ACC_WRITE_MASK; + reset_host_protection = 0; + } else { + reset_host_protection = 1; + } set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn, spte_to_pfn(sp->spt[i]), true, false, - host_writable); + reset_host_protection); } return !nr_present; diff --git a/linux/x86/svm.c b/linux/x86/svm.c index bd6a099..174aa10 100644 --- a/linux/x86/svm.c +++ b/linux/x86/svm.c @@ -44,7 +44,6 @@ * AMD SVM support * * Copyright (C) 2006 Qumranet, Inc. - * Copyright 2010 Red Hat, Inc. and/or its affiliates. 
* * Authors: * Yaniv Kamay <yaniv@qumranet.com> @@ -69,16 +68,14 @@ #include <linux/ftrace_event.h> #include <linux/slab.h> -#include <asm/tlbflush.h> #include <asm/desc.h> -#include <asm/kvm_para.h> #include <asm/virtext.h> #include "trace.h" #define __ex(x) __kvm_handle_fault_on_reboot(x) -MODULE_INFO(version, "kvm-kmod-2.6.38-rc7"); +MODULE_INFO(version, "kvm-kmod-2.6.34"); MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); @@ -88,15 +85,10 @@ MODULE_LICENSE("GPL"); #define SEG_TYPE_LDT 2 #define SEG_TYPE_BUSY_TSS16 3 -#define SVM_FEATURE_NPT (1 << 0) -#define SVM_FEATURE_LBRV (1 << 1) -#define SVM_FEATURE_SVML (1 << 2) -#define SVM_FEATURE_NRIP (1 << 3) -#define SVM_FEATURE_TSC_RATE (1 << 4) -#define SVM_FEATURE_VMCB_CLEAN (1 << 5) -#define SVM_FEATURE_FLUSH_ASID (1 << 6) -#define SVM_FEATURE_DECODE_ASSIST (1 << 7) -#define SVM_FEATURE_PAUSE_FILTER (1 << 10) +#define SVM_FEATURE_NPT (1 << 0) +#define SVM_FEATURE_LBRV (1 << 1) +#define SVM_FEATURE_SVML (1 << 2) +#define SVM_FEATURE_PAUSE_FILTER (1 << 10) #define NESTED_EXIT_HOST 0 /* Exit handled on host level */ #define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ @@ -104,8 +96,6 @@ MODULE_LICENSE("GPL"); #define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) -static bool erratum_383_found __read_mostly; - static const u32 host_save_user_msrs[] = { #ifdef CONFIG_X86_64 MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, @@ -121,7 +111,6 @@ struct kvm_vcpu; struct nested_state { struct vmcb *hsave; u64 hsave_msr; - u64 vm_cr_msr; u64 vmcb; /* These are the merged vectors */ @@ -129,32 +118,20 @@ struct nested_state { /* gpa pointers to the real vectors */ u64 vmcb_msrpm; - u64 vmcb_iopm; /* A VMEXIT is required but not yet emulated */ bool exit_required; - /* - * If we vmexit during an instruction emulation we need this to restore - * the l1 guest rip after the emulation - */ - unsigned long vmexit_rip; - unsigned long vmexit_rsp; - unsigned long vmexit_rax; - /* cache for intercepts of the guest */ - u32 intercept_cr; - u32 intercept_dr; + u16 intercept_cr_read; + u16 intercept_cr_write; + u16 intercept_dr_read; + u16 intercept_dr_write; u32 intercept_exceptions; u64 intercept; - /* Nested Paging related state */ - u64 nested_cr3; }; -#define MSRPM_OFFSETS 16 -static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; - struct vcpu_svm { struct kvm_vcpu vcpu; struct vmcb *vmcb; @@ -167,52 +144,20 @@ struct vcpu_svm { u64 next_rip; u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; - struct { - u16 fs; - u16 gs; - u16 ldt; - u64 gs_base; - } host; + u64 host_gs_base; u32 *msrpm; struct nested_state nested; bool nmi_singlestep; - - unsigned int3_injected; - unsigned long int3_rip; - u32 apf_reason; -}; - -#define MSR_INVALID 0xffffffffU - -static struct svm_direct_access_msrs { - u32 index; /* Index of the MSR */ - bool always; /* True if intercept is always on */ -} direct_access_msrs[] = { - { .index = MSR_STAR, .always = true }, - { .index = MSR_IA32_SYSENTER_CS, .always = true }, -#ifdef CONFIG_X86_64 - { .index = MSR_GS_BASE, .always = true }, - { .index = MSR_FS_BASE, .always = true }, - { .index = MSR_KERNEL_GS_BASE, .always = true }, - { .index = MSR_LSTAR, .always = true }, - { .index = MSR_CSTAR, .always = true }, - { .index = MSR_SYSCALL_MASK, .always = true }, -#endif - { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false }, - { .index = MSR_IA32_LASTBRANCHTOIP, .always = false }, - { .index = MSR_IA32_LASTINTFROMIP, .always = false }, - { .index = MSR_IA32_LASTINTTOIP, .always = false }, - { .index = MSR_INVALID, .always = 
false }, }; /* enable NPT for AMD64 and X86 with PAE */ #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) static bool npt_enabled = true; #else -static bool npt_enabled; +static bool npt_enabled = false; #endif static int npt = 1; @@ -225,156 +170,18 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu); static void svm_complete_interrupts(struct vcpu_svm *svm); static int nested_svm_exit_handled(struct vcpu_svm *svm); -static int nested_svm_intercept(struct vcpu_svm *svm); static int nested_svm_vmexit(struct vcpu_svm *svm); static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code); -enum { - VMCB_INTERCEPTS, /* Intercept vectors, TSC offset, - pause filter count */ - VMCB_PERM_MAP, /* IOPM Base and MSRPM Base */ - VMCB_ASID, /* ASID */ - VMCB_INTR, /* int_ctl, int_vector */ - VMCB_NPT, /* npt_en, nCR3, gPAT */ - VMCB_CR, /* CR0, CR3, CR4, EFER */ - VMCB_DR, /* DR6, DR7 */ - VMCB_DT, /* GDT, IDT */ - VMCB_SEG, /* CS, DS, SS, ES, CPL */ - VMCB_CR2, /* CR2 only */ - VMCB_LBR, /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */ - VMCB_DIRTY_MAX, -}; - -/* TPR and CR2 are always written before VMRUN */ -#define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2)) - -static inline void mark_all_dirty(struct vmcb *vmcb) -{ - vmcb->control.clean = 0; -} - -static inline void mark_all_clean(struct vmcb *vmcb) -{ - vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1) - & ~VMCB_ALWAYS_DIRTY_MASK; -} - -static inline void mark_dirty(struct vmcb *vmcb, int bit) -{ - vmcb->control.clean &= ~(1 << bit); -} - static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) { return container_of(vcpu, struct vcpu_svm, vcpu); } -static void recalc_intercepts(struct vcpu_svm *svm) -{ - struct vmcb_control_area *c, *h; - struct nested_state *g; - - mark_dirty(svm->vmcb, VMCB_INTERCEPTS); - - if (!is_guest_mode(&svm->vcpu)) - return; - - c = &svm->vmcb->control; - h = &svm->nested.hsave->control; - g = &svm->nested; - - c->intercept_cr = h->intercept_cr | g->intercept_cr; - c->intercept_dr = h->intercept_dr | g->intercept_dr; - c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions; - c->intercept = h->intercept | g->intercept; -} - -static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm) -{ - if (is_guest_mode(&svm->vcpu)) - return svm->nested.hsave; - else - return svm->vmcb; -} - -static inline void set_cr_intercept(struct vcpu_svm *svm, int bit) -{ - struct vmcb *vmcb = get_host_vmcb(svm); - - vmcb->control.intercept_cr |= (1U << bit); - - recalc_intercepts(svm); -} - -static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit) -{ - struct vmcb *vmcb = get_host_vmcb(svm); - - vmcb->control.intercept_cr &= ~(1U << bit); - - recalc_intercepts(svm); -} - -static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit) -{ - struct vmcb *vmcb = get_host_vmcb(svm); - - return vmcb->control.intercept_cr & (1U << bit); -} - -static inline void set_dr_intercept(struct vcpu_svm *svm, int bit) -{ - struct vmcb *vmcb = get_host_vmcb(svm); - - vmcb->control.intercept_dr |= (1U << bit); - - recalc_intercepts(svm); -} - -static inline void clr_dr_intercept(struct vcpu_svm *svm, int bit) -{ - struct vmcb *vmcb = get_host_vmcb(svm); - - vmcb->control.intercept_dr &= ~(1U << bit); - - recalc_intercepts(svm); -} - -static inline void set_exception_intercept(struct vcpu_svm *svm, int bit) -{ - struct vmcb *vmcb = get_host_vmcb(svm); - - vmcb->control.intercept_exceptions |= (1U << bit); - - recalc_intercepts(svm); -} - -static 
inline void clr_exception_intercept(struct vcpu_svm *svm, int bit) +static inline bool is_nested(struct vcpu_svm *svm) { - struct vmcb *vmcb = get_host_vmcb(svm); - - vmcb->control.intercept_exceptions &= ~(1U << bit); - - recalc_intercepts(svm); -} - -static inline void set_intercept(struct vcpu_svm *svm, int bit) -{ - struct vmcb *vmcb = get_host_vmcb(svm); - - vmcb->control.intercept |= (1ULL << bit); - - recalc_intercepts(svm); -} - -static inline void clr_intercept(struct vcpu_svm *svm, int bit) -{ - struct vmcb *vmcb = get_host_vmcb(svm); - - vmcb->control.intercept &= ~(1ULL << bit); - - recalc_intercepts(svm); + return svm->nested.vmcb; } static inline void enable_gif(struct vcpu_svm *svm) @@ -397,8 +204,8 @@ static unsigned long iopm_base; struct kvm_ldttss_desc { u16 limit0; u16 base0; - unsigned base1:8, type:5, dpl:2, p:1; - unsigned limit1:4, zero0:3, g:1, base2:8; + unsigned base1 : 8, type : 5, dpl : 2, p : 1; + unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8; u32 base3; u32 zero1; } __attribute__((packed)); @@ -428,29 +235,13 @@ static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; #define MSRS_RANGE_SIZE 2048 #define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2) -static u32 svm_msrpm_offset(u32 msr) -{ - u32 offset; - int i; - - for (i = 0; i < NUM_MSR_MAPS; i++) { - if (msr < msrpm_ranges[i] || - msr >= msrpm_ranges[i] + MSRS_IN_RANGE) - continue; - - offset = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */ - offset += (i * MSRS_RANGE_SIZE); /* add range offset */ - - /* Now we have the u8 offset - but need the u32 offset */ - return offset / 4; - } +#define MAX_INST_SIZE 15 - /* MSR not in any range */ - return MSR_INVALID; +static inline u32 svm_has(u32 feat) +{ + return svm_features & feat; } -#define MAX_INST_SIZE 15 - static inline void clgi(void) { asm volatile (__ex(SVM_CLGI)); @@ -463,26 +254,43 @@ static inline void stgi(void) static inline void invlpga(unsigned long addr, u32 asid) { - asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid)); + asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid)); } -static int get_npt_level(void) +static inline void force_new_asid(struct kvm_vcpu *vcpu) { -#ifdef CONFIG_X86_64 - return PT64_ROOT_LEVEL; -#else - return PT32E_ROOT_LEVEL; -#endif + to_svm(vcpu)->asid_generation--; +} + +static inline void flush_guest_tlb(struct kvm_vcpu *vcpu) +{ + force_new_asid(vcpu); } static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) { - vcpu->arch.efer = efer; if (!npt_enabled && !(efer & EFER_LMA)) efer &= ~EFER_LME; to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; - mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); + vcpu->arch.efer = efer; +} + +static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, + bool has_error_code, u32 error_code) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + /* If we are within a nested VM we'd better #VMEXIT and let the + guest handle the exception */ + if (nested_svm_check_exception(svm, nr, has_error_code, error_code)) + return; + + svm->vmcb->control.event_inj = nr + | SVM_EVTINJ_VALID + | (has_error_code ? 
SVM_EVTINJ_VALID_ERR : 0) + | SVM_EVTINJ_TYPE_EXEPT; + svm->vmcb->control.event_inj_err = error_code; } static int is_external_interrupt(u32 info) @@ -497,7 +305,7 @@ static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) u32 ret = 0; if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) - ret |= KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS; + ret |= X86_SHADOW_INT_STI | X86_SHADOW_INT_MOV_SS; return ret & mask; } @@ -516,11 +324,8 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - if (svm->vmcb->control.next_rip != 0) - svm->next_rip = svm->vmcb->control.next_rip; - if (!svm->next_rip) { - if (emulate_instruction(vcpu, EMULTYPE_SKIP) != + if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) != EMULATE_DONE) printk(KERN_DEBUG "%s: NOP\n", __func__); return; @@ -533,67 +338,6 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) svm_set_interrupt_shadow(vcpu, 0); } -static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, - bool has_error_code, u32 error_code, - bool reinject) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - /* - * If we are within a nested VM we'd better #VMEXIT and let the guest - * handle the exception - */ - if (!reinject && - nested_svm_check_exception(svm, nr, has_error_code, error_code)) - return; - - if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) { - unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu); - - /* - * For guest debugging where we have to reinject #BP if some - * INT3 is guest-owned: - * Emulate nRIP by moving RIP forward. Will fail if injection - * raises a fault that is not intercepted. Still better than - * failing in all cases. - */ - skip_emulated_instruction(&svm->vcpu); - rip = kvm_rip_read(&svm->vcpu); - svm->int3_rip = rip + svm->vmcb->save.cs.base; - svm->int3_injected = rip - old_rip; - } - - svm->vmcb->control.event_inj = nr - | SVM_EVTINJ_VALID - | (has_error_code ? 
SVM_EVTINJ_VALID_ERR : 0) - | SVM_EVTINJ_TYPE_EXEPT; - svm->vmcb->control.event_inj_err = error_code; -} - -static void svm_init_erratum_383(void) -{ - u32 low, high; - int err; - u64 val; - - if (!kvm_cpu_has_amd_erratum(kvm_amd_erratum_383)) - return; - - /* Use _safe variants to not break nested virtualization */ - val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err); - if (err) - return; - - val |= (1ULL << 47); - - low = lower_32_bits(val); - high = upper_32_bits(val); - - kvm_native_write_msr_safe(MSR_AMD64_DC_CFG, low, high); - - erratum_383_found = true; -} - static int has_svm(void) { const char *msg; @@ -616,7 +360,7 @@ static int svm_hardware_enable(void *garbage) struct svm_cpu_data *sd; uint64_t efer; - struct kvm_desc_ptr gdt_descr; + struct descriptor_table gdt_descr; struct kvm_desc_struct *gdt; int me = raw_smp_processor_id(); @@ -641,16 +385,14 @@ static int svm_hardware_enable(void *garbage) sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; sd->next_asid = sd->max_asid + 1; - kvm_native_store_gdt(&gdt_descr); - gdt = (struct kvm_desc_struct *)gdt_descr.address; + kvm_get_gdt(&gdt_descr); + gdt = (struct kvm_desc_struct *)gdt_descr.base; sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); wrmsrl(MSR_EFER, efer | EFER_SVME); wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT); - svm_init_erratum_383(); - return 0; } @@ -690,98 +432,42 @@ err_1: } -static bool valid_msr_intercept(u32 index) -{ - int i; - - for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) - if (direct_access_msrs[i].index == index) - return true; - - return false; -} - static void set_msr_interception(u32 *msrpm, unsigned msr, int read, int write) { - u8 bit_read, bit_write; - unsigned long tmp; - u32 offset; - - /* - * If this warning triggers extend the direct_access_msrs list at the - * beginning of the file - */ - WARN_ON(!valid_msr_intercept(msr)); - - offset = svm_msrpm_offset(msr); - bit_read = 2 * (msr & 0x0f); - bit_write = 2 * (msr & 0x0f) + 1; - tmp = msrpm[offset]; - - BUG_ON(offset == MSR_INVALID); - - read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp); - write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp); - - msrpm[offset] = tmp; -} - -static void svm_vcpu_init_msrpm(u32 *msrpm) -{ int i; - memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER)); - - for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { - if (!direct_access_msrs[i].always) - continue; - - set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1); - } -} - -static void add_msr_offset(u32 offset) -{ - int i; - - for (i = 0; i < MSRPM_OFFSETS; ++i) { - - /* Offset already in list? */ - if (msrpm_offsets[i] == offset) + for (i = 0; i < NUM_MSR_MAPS; i++) { + if (msr >= msrpm_ranges[i] && + msr < msrpm_ranges[i] + MSRS_IN_RANGE) { + u32 msr_offset = (i * MSRS_IN_RANGE + msr - + msrpm_ranges[i]) * 2; + + u32 *base = msrpm + (msr_offset / 32); + u32 msr_shift = msr_offset % 32; + u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1); + *base = (*base & ~(0x3 << msr_shift)) | + (mask << msr_shift); return; - - /* Slot used by another offset? */ - if (msrpm_offsets[i] != MSR_INVALID) - continue; - - /* Add offset to list */ - msrpm_offsets[i] = offset; - - return; + } } - - /* - * If this BUG triggers the msrpm_offsets table has an overflow. Just - * increase MSRPM_OFFSETS in this case. 
- */ BUG(); } -static void init_msrpm_offsets(void) +static void svm_vcpu_init_msrpm(u32 *msrpm) { - int i; - - memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets)); - - for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { - u32 offset; - - offset = svm_msrpm_offset(direct_access_msrs[i].index); - BUG_ON(offset == MSR_INVALID); + memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER)); - add_msr_offset(offset); - } +#ifdef CONFIG_X86_64 + set_msr_interception(msrpm, MSR_GS_BASE, 1, 1); + set_msr_interception(msrpm, MSR_FS_BASE, 1, 1); + set_msr_interception(msrpm, MSR_KERNEL_GS_BASE, 1, 1); + set_msr_interception(msrpm, MSR_LSTAR, 1, 1); + set_msr_interception(msrpm, MSR_CSTAR, 1, 1); + set_msr_interception(msrpm, MSR_SYSCALL_MASK, 1, 1); +#endif + set_msr_interception(msrpm, MSR_K6_STAR, 1, 1); + set_msr_interception(msrpm, MSR_IA32_SYSENTER_CS, 1, 1); } static void svm_enable_lbrv(struct vcpu_svm *svm) @@ -822,8 +508,6 @@ static __init int svm_hardware_setup(void) memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER)); iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; - init_msrpm_offsets(); - if (boot_cpu_has(X86_FEATURE_NX)) kvm_enable_efer_bits(EFER_NX); @@ -832,7 +516,7 @@ static __init int svm_hardware_setup(void) if (nested) { printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); - kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); + kvm_enable_efer_bits(EFER_SVME); } for_each_possible_cpu(cpu) { @@ -843,7 +527,7 @@ static __init int svm_hardware_setup(void) svm_features = cpuid_edx(SVM_CPUID_FUNC); - if (!boot_cpu_has(X86_FEATURE_NPT)) + if (!svm_has(SVM_FEATURE_NPT)) npt_enabled = false; if (npt_enabled && !npt) { @@ -880,7 +564,7 @@ static void init_seg(struct vmcb_seg *seg) { seg->selector = 0; seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK | - SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */ + SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */ seg->limit = 0xffff; seg->base = 0; } @@ -893,97 +577,72 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) seg->base = 0; } -static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) -{ - struct vcpu_svm *svm = to_svm(vcpu); - u64 g_tsc_offset = 0; - - if (is_guest_mode(vcpu)) { - g_tsc_offset = svm->vmcb->control.tsc_offset - - svm->nested.hsave->control.tsc_offset; - svm->nested.hsave->control.tsc_offset = offset; - } - - svm->vmcb->control.tsc_offset = offset + g_tsc_offset; - - mark_dirty(svm->vmcb, VMCB_INTERCEPTS); -} - -static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - svm->vmcb->control.tsc_offset += adjustment; - if (is_guest_mode(vcpu)) - svm->nested.hsave->control.tsc_offset += adjustment; - mark_dirty(svm->vmcb, VMCB_INTERCEPTS); -} - static void init_vmcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; svm->vcpu.fpu_active = 1; - svm->vcpu.arch.hflags = 0; - set_cr_intercept(svm, INTERCEPT_CR0_READ); - set_cr_intercept(svm, INTERCEPT_CR3_READ); - set_cr_intercept(svm, INTERCEPT_CR4_READ); - set_cr_intercept(svm, INTERCEPT_CR0_WRITE); - set_cr_intercept(svm, INTERCEPT_CR3_WRITE); - set_cr_intercept(svm, INTERCEPT_CR4_WRITE); - set_cr_intercept(svm, INTERCEPT_CR8_WRITE); - - set_dr_intercept(svm, INTERCEPT_DR0_READ); - set_dr_intercept(svm, INTERCEPT_DR1_READ); - set_dr_intercept(svm, INTERCEPT_DR2_READ); - set_dr_intercept(svm, INTERCEPT_DR3_READ); - set_dr_intercept(svm, INTERCEPT_DR4_READ); - set_dr_intercept(svm, 
INTERCEPT_DR5_READ); - set_dr_intercept(svm, INTERCEPT_DR6_READ); - set_dr_intercept(svm, INTERCEPT_DR7_READ); - - set_dr_intercept(svm, INTERCEPT_DR0_WRITE); - set_dr_intercept(svm, INTERCEPT_DR1_WRITE); - set_dr_intercept(svm, INTERCEPT_DR2_WRITE); - set_dr_intercept(svm, INTERCEPT_DR3_WRITE); - set_dr_intercept(svm, INTERCEPT_DR4_WRITE); - set_dr_intercept(svm, INTERCEPT_DR5_WRITE); - set_dr_intercept(svm, INTERCEPT_DR6_WRITE); - set_dr_intercept(svm, INTERCEPT_DR7_WRITE); - - set_exception_intercept(svm, PF_VECTOR); - set_exception_intercept(svm, UD_VECTOR); - set_exception_intercept(svm, MC_VECTOR); - - set_intercept(svm, INTERCEPT_INTR); - set_intercept(svm, INTERCEPT_NMI); - set_intercept(svm, INTERCEPT_SMI); - set_intercept(svm, INTERCEPT_SELECTIVE_CR0); - set_intercept(svm, INTERCEPT_CPUID); - set_intercept(svm, INTERCEPT_INVD); - set_intercept(svm, INTERCEPT_HLT); - set_intercept(svm, INTERCEPT_INVLPG); - set_intercept(svm, INTERCEPT_INVLPGA); - set_intercept(svm, INTERCEPT_IOIO_PROT); - set_intercept(svm, INTERCEPT_MSR_PROT); - set_intercept(svm, INTERCEPT_TASK_SWITCH); - set_intercept(svm, INTERCEPT_SHUTDOWN); - set_intercept(svm, INTERCEPT_VMRUN); - set_intercept(svm, INTERCEPT_VMMCALL); - set_intercept(svm, INTERCEPT_VMLOAD); - set_intercept(svm, INTERCEPT_VMSAVE); - set_intercept(svm, INTERCEPT_STGI); - set_intercept(svm, INTERCEPT_CLGI); - set_intercept(svm, INTERCEPT_SKINIT); - set_intercept(svm, INTERCEPT_WBINVD); - set_intercept(svm, INTERCEPT_MONITOR); - set_intercept(svm, INTERCEPT_MWAIT); - set_intercept(svm, INTERCEPT_XSETBV); + control->intercept_cr_read = INTERCEPT_CR0_MASK | + INTERCEPT_CR3_MASK | + INTERCEPT_CR4_MASK; + + control->intercept_cr_write = INTERCEPT_CR0_MASK | + INTERCEPT_CR3_MASK | + INTERCEPT_CR4_MASK | + INTERCEPT_CR8_MASK; + + control->intercept_dr_read = INTERCEPT_DR0_MASK | + INTERCEPT_DR1_MASK | + INTERCEPT_DR2_MASK | + INTERCEPT_DR3_MASK | + INTERCEPT_DR4_MASK | + INTERCEPT_DR5_MASK | + INTERCEPT_DR6_MASK | + INTERCEPT_DR7_MASK; + + control->intercept_dr_write = INTERCEPT_DR0_MASK | + INTERCEPT_DR1_MASK | + INTERCEPT_DR2_MASK | + INTERCEPT_DR3_MASK | + INTERCEPT_DR4_MASK | + INTERCEPT_DR5_MASK | + INTERCEPT_DR6_MASK | + INTERCEPT_DR7_MASK; + + control->intercept_exceptions = (1 << PF_VECTOR) | + (1 << UD_VECTOR) | + (1 << MC_VECTOR); + + + control->intercept = (1ULL << INTERCEPT_INTR) | + (1ULL << INTERCEPT_NMI) | + (1ULL << INTERCEPT_SMI) | + (1ULL << INTERCEPT_SELECTIVE_CR0) | + (1ULL << INTERCEPT_CPUID) | + (1ULL << INTERCEPT_INVD) | + (1ULL << INTERCEPT_HLT) | + (1ULL << INTERCEPT_INVLPG) | + (1ULL << INTERCEPT_INVLPGA) | + (1ULL << INTERCEPT_IOIO_PROT) | + (1ULL << INTERCEPT_MSR_PROT) | + (1ULL << INTERCEPT_TASK_SWITCH) | + (1ULL << INTERCEPT_SHUTDOWN) | + (1ULL << INTERCEPT_VMRUN) | + (1ULL << INTERCEPT_VMMCALL) | + (1ULL << INTERCEPT_VMLOAD) | + (1ULL << INTERCEPT_VMSAVE) | + (1ULL << INTERCEPT_STGI) | + (1ULL << INTERCEPT_CLGI) | + (1ULL << INTERCEPT_SKINIT) | + (1ULL << INTERCEPT_WBINVD) | + (1ULL << INTERCEPT_MONITOR) | + (1ULL << INTERCEPT_MWAIT); control->iopm_base_pa = iopm_base; control->msrpm_base_pa = __pa(svm->msrpm); + control->tsc_offset = 0; control->int_ctl = V_INTR_MASKING_MASK; init_seg(&save->es); @@ -1011,19 +670,18 @@ static void init_vmcb(struct vcpu_svm *svm) init_sys_seg(&save->ldtr, SEG_TYPE_LDT); init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); - svm_set_efer(&svm->vcpu, 0); + save->efer = EFER_SVME; save->dr6 = 0xffff0ff0; save->dr7 = 0x400; save->rflags = 2; save->rip = 0x0000fff0; 
svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; - /* - * This is the guest-visible cr0 value. + /* This is the guest-visible cr0 value. * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. */ - svm->vcpu.arch.cr0 = 0; - (void)kvm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET); + svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; + kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0); save->cr4 = X86_CR4_PAE; /* rdx = ?? */ @@ -1031,27 +689,25 @@ static void init_vmcb(struct vcpu_svm *svm) if (npt_enabled) { /* Setup VMCB for Nested Paging */ control->nested_ctl = 1; - clr_intercept(svm, INTERCEPT_TASK_SWITCH); - clr_intercept(svm, INTERCEPT_INVLPG); - clr_exception_intercept(svm, PF_VECTOR); - clr_cr_intercept(svm, INTERCEPT_CR3_READ); - clr_cr_intercept(svm, INTERCEPT_CR3_WRITE); + control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) | + (1ULL << INTERCEPT_INVLPG)); + control->intercept_exceptions &= ~(1 << PF_VECTOR); + control->intercept_cr_read &= ~INTERCEPT_CR3_MASK; + control->intercept_cr_write &= ~INTERCEPT_CR3_MASK; save->g_pat = 0x0007040600070406ULL; save->cr3 = 0; save->cr4 = 0; } - svm->asid_generation = 0; + force_new_asid(&svm->vcpu); svm->nested.vmcb = 0; svm->vcpu.arch.hflags = 0; - if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) { + if (svm_has(SVM_FEATURE_PAUSE_FILTER)) { control->pause_filter_count = 3000; - set_intercept(svm, INTERCEPT_PAUSE); + control->intercept |= (1ULL << INTERCEPT_PAUSE); } - mark_all_dirty(svm->vmcb); - enable_gif(svm); } @@ -1114,27 +770,20 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm_vcpu_init_msrpm(svm->msrpm); svm->nested.msrpm = page_address(nested_msrpm_pages); - svm_vcpu_init_msrpm(svm->nested.msrpm); svm->vmcb = page_address(page); clear_page(svm->vmcb); svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; svm->asid_generation = 0; init_vmcb(svm); - kvm_write_tsc(&svm->vcpu, 0); - - err = fx_init(&svm->vcpu); - if (err) - goto free_page4; + fx_init(&svm->vcpu); svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_bsp(&svm->vcpu)) svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; return &svm->vcpu; -free_page4: - __free_page(hsave_page); free_page3: __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); free_page2: @@ -1167,17 +816,23 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) int i; if (unlikely(cpu != vcpu->cpu)) { + u64 delta; + + if (kvm_check_tsc_unstable()) { + /* + * Make sure that the guest sees a monotonically + * increasing TSC. 
+ */ + delta = vcpu->arch.host_tsc - kvm_native_read_tsc(); + svm->vmcb->control.tsc_offset += delta; + if (is_nested(svm)) + svm->nested.hsave->control.tsc_offset += delta; + } + vcpu->cpu = cpu; + kvm_migrate_timers(vcpu); svm->asid_generation = 0; - mark_all_dirty(svm->vmcb); } -#ifdef CONFIG_X86_64 - rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base); -#endif - savesegment(fs, svm->host.fs); - savesegment(gs, svm->host.gs); - svm->host.ldt = kvm_read_ldt(); - for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); } @@ -1188,16 +843,10 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) int i; ++vcpu->stat.host_state_reload; - kvm_load_ldt(svm->host.ldt); -#ifdef CONFIG_X86_64 - loadsegment(fs, svm->host.fs); - wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); - load_gs_index(svm->host.gs); -#else - loadsegment(gs, svm->host.gs); -#endif for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); + + vcpu->arch.host_tsc = kvm_native_read_tsc(); } static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) @@ -1215,7 +864,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) switch (reg) { case VCPU_EXREG_PDPTR: BUG_ON(!npt_enabled); - load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); + load_pdptrs(vcpu, vcpu->arch.cr3); break; default: BUG(); @@ -1224,12 +873,12 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) static void svm_set_vintr(struct vcpu_svm *svm) { - set_intercept(svm, INTERCEPT_VINTR); + svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR; } static void svm_clear_vintr(struct vcpu_svm *svm) { - clr_intercept(svm, INTERCEPT_VINTR); + svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR); } static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) @@ -1274,8 +923,7 @@ static void svm_get_segment(struct kvm_vcpu *vcpu, var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1; - /* - * AMD's VMCB does not have an explicit unusable field, so emulate it + /* AMD's VMCB does not have an explicit unusable field, so emulate it * for cross vendor migration purposes by "not present" */ var->unusable = !var->present || (var->type == 0); @@ -1311,8 +959,7 @@ static void svm_get_segment(struct kvm_vcpu *vcpu, var->type |= 0x1; break; case VCPU_SREG_SS: - /* - * On AMD CPUs sometimes the DB bit in the segment + /* On AMD CPUs sometimes the DB bit in the segment * descriptor is left as 1, although the whole segment has * been made unusable. Clear it here to pass an Intel VMX * entry check when cross vendor migrating. 
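The svm_get_segment() hunk above unpacks the VMCB's packed segment attribute word via the SVM_SELECTOR_*_SHIFT constants and synthesizes an "unusable" flag, since AMD's VMCB has no such bit. A minimal user-space sketch of that unpacking, assuming the conventional SVM attrib layout (type in bits 0-3, S in bit 4, DPL in bits 5-6, P in bit 7, AVL in bit 8, L in bit 9, DB in bit 10, G in bit 11); the struct and sample value below are illustrative only, not taken from either tree.

#include <stdint.h>
#include <stdio.h>

/* Illustrative decode of a VMCB segment attribute word (assumed layout). */
struct seg_attrs {
	unsigned type, s, dpl, present, avl, l, db, g, unusable;
};

static struct seg_attrs decode_attrib(uint16_t attrib)
{
	struct seg_attrs a;

	a.type    = attrib & 0xf;         /* descriptor type, bits 0-3 */
	a.s       = (attrib >> 4) & 1;    /* code/data vs. system segment */
	a.dpl     = (attrib >> 5) & 3;    /* privilege level */
	a.present = (attrib >> 7) & 1;
	a.avl     = (attrib >> 8) & 1;
	a.l       = (attrib >> 9) & 1;    /* 64-bit code segment */
	a.db      = (attrib >> 10) & 1;
	a.g       = (attrib >> 11) & 1;   /* 4K granularity */
	/* AMD has no explicit "unusable" bit; infer it the way the hunk above does. */
	a.unusable = !a.present || a.type == 0;
	return a;
}

int main(void)
{
	/* 0x93: present, S=1, type=3 -> accessed read/write data segment */
	struct seg_attrs a = decode_attrib(0x93);
	printf("type=%u s=%u dpl=%u p=%u unusable=%u\n",
	       a.type, a.s, a.dpl, a.present, a.unusable);
	return 0;
}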
@@ -1330,48 +977,42 @@ static int svm_get_cpl(struct kvm_vcpu *vcpu) return save->cpl; } -static void svm_get_idt(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt) +static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) { struct vcpu_svm *svm = to_svm(vcpu); - dt->size = svm->vmcb->save.idtr.limit; - dt->address = svm->vmcb->save.idtr.base; + dt->limit = svm->vmcb->save.idtr.limit; + dt->base = svm->vmcb->save.idtr.base; } -static void svm_set_idt(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt) +static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) { struct vcpu_svm *svm = to_svm(vcpu); - svm->vmcb->save.idtr.limit = dt->size; - svm->vmcb->save.idtr.base = dt->address ; - mark_dirty(svm->vmcb, VMCB_DT); + svm->vmcb->save.idtr.limit = dt->limit; + svm->vmcb->save.idtr.base = dt->base ; } -static void svm_get_gdt(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt) +static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) { struct vcpu_svm *svm = to_svm(vcpu); - dt->size = svm->vmcb->save.gdtr.limit; - dt->address = svm->vmcb->save.gdtr.base; + dt->limit = svm->vmcb->save.gdtr.limit; + dt->base = svm->vmcb->save.gdtr.base; } -static void svm_set_gdt(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt) +static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) { struct vcpu_svm *svm = to_svm(vcpu); - svm->vmcb->save.gdtr.limit = dt->size; - svm->vmcb->save.gdtr.base = dt->address ; - mark_dirty(svm->vmcb, VMCB_DT); + svm->vmcb->save.gdtr.limit = dt->limit; + svm->vmcb->save.gdtr.base = dt->base ; } static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) { } -static void svm_decache_cr3(struct kvm_vcpu *vcpu) -{ -} - static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) { } @@ -1387,14 +1028,13 @@ static void update_cr0_intercept(struct vcpu_svm *svm) *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) | (gcr0 & SVM_CR0_SELECTIVE_MASK); - mark_dirty(svm->vmcb, VMCB_CR); if (gcr0 == *hcr0 && svm->vcpu.fpu_active) { - clr_cr_intercept(svm, INTERCEPT_CR0_READ); - clr_cr_intercept(svm, INTERCEPT_CR0_WRITE); + svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK; + svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK; } else { - set_cr_intercept(svm, INTERCEPT_CR0_READ); - set_cr_intercept(svm, INTERCEPT_CR0_WRITE); + svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK; + svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK; } } @@ -1402,31 +1042,6 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_svm *svm = to_svm(vcpu); - if (is_guest_mode(vcpu)) { - /* - * We are here because we run in nested mode, the host kvm - * intercepts cr0 writes but the l1 hypervisor does not. - * But the L1 hypervisor may intercept selective cr0 writes. - * This needs to be checked here. 
- */ - unsigned long old, new; - - /* Remove bits that would trigger a real cr0 write intercept */ - old = vcpu->arch.cr0 & SVM_CR0_SELECTIVE_MASK; - new = cr0 & SVM_CR0_SELECTIVE_MASK; - - if (old == new) { - /* cr0 write with ts and mp unchanged */ - svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; - if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE) { - svm->nested.vmexit_rip = kvm_rip_read(vcpu); - svm->nested.vmexit_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); - svm->nested.vmexit_rax = kvm_register_read(vcpu, VCPU_REGS_RAX); - return; - } - } - } - #ifdef CONFIG_X86_64 if (vcpu->arch.efer & EFER_LME) { if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { @@ -1454,7 +1069,6 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) */ cr0 &= ~(X86_CR0_CD | X86_CR0_NW); svm->vmcb->save.cr0 = cr0; - mark_dirty(svm->vmcb, VMCB_CR); update_cr0_intercept(svm); } @@ -1464,14 +1078,13 @@ static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) - svm_flush_tlb(vcpu); + force_new_asid(vcpu); vcpu->arch.cr4 = cr4; if (!npt_enabled) cr4 |= X86_CR4_PAE; cr4 |= host_cr4_mce; to_svm(vcpu)->vmcb->save.cr4 = cr4; - mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); } static void svm_set_segment(struct kvm_vcpu *vcpu, @@ -1500,25 +1113,26 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, = (svm->vmcb->save.cs.attrib >> SVM_SELECTOR_DPL_SHIFT) & 3; - mark_dirty(svm->vmcb, VMCB_SEG); } static void update_db_intercept(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - clr_exception_intercept(svm, DB_VECTOR); - clr_exception_intercept(svm, BP_VECTOR); + svm->vmcb->control.intercept_exceptions &= + ~((1 << DB_VECTOR) | (1 << BP_VECTOR)); if (svm->nmi_singlestep) - set_exception_intercept(svm, DB_VECTOR); + svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR); if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { if (vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) - set_exception_intercept(svm, DB_VECTOR); + svm->vmcb->control.intercept_exceptions |= + 1 << DB_VECTOR; if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) - set_exception_intercept(svm, BP_VECTOR); + svm->vmcb->control.intercept_exceptions |= + 1 << BP_VECTOR; } else vcpu->guest_debug = 0; } @@ -1532,11 +1146,23 @@ static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) else svm->vmcb->save.dr7 = vcpu->arch.dr7; - mark_dirty(svm->vmcb, VMCB_DR); - update_db_intercept(vcpu); } +static void load_host_msrs(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_X86_64 + wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base); +#endif +} + +static void save_host_msrs(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_X86_64 + rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base); +#endif +} + static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) { if (sd->next_asid > sd->max_asid) { @@ -1547,49 +1173,86 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) svm->asid_generation = sd->asid_generation; svm->vmcb->control.asid = sd->next_asid++; - - mark_dirty(svm->vmcb, VMCB_ASID); } -static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) +static int svm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *dest) { struct vcpu_svm *svm = to_svm(vcpu); - svm->vmcb->save.dr7 = value; - mark_dirty(svm->vmcb, VMCB_DR); + switch (dr) { + case 0 ... 
3: + *dest = vcpu->arch.db[dr]; + break; + case 4: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return EMULATE_FAIL; /* will re-inject UD */ + /* fall through */ + case 6: + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) + *dest = vcpu->arch.dr6; + else + *dest = svm->vmcb->save.dr6; + break; + case 5: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return EMULATE_FAIL; /* will re-inject UD */ + /* fall through */ + case 7: + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) + *dest = vcpu->arch.dr7; + else + *dest = svm->vmcb->save.dr7; + break; + } + + return EMULATE_DONE; } -static int pf_interception(struct vcpu_svm *svm) +static int svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value) { - u64 fault_address = svm->vmcb->control.exit_info_2; - u32 error_code; - int r = 1; + struct vcpu_svm *svm = to_svm(vcpu); - switch (svm->apf_reason) { - default: - error_code = svm->vmcb->control.exit_info_1; - - trace_kvm_page_fault(fault_address, error_code); - if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu)) - kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); - r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code, - svm->vmcb->control.insn_bytes, - svm->vmcb->control.insn_len); + switch (dr) { + case 0 ... 3: + vcpu->arch.db[dr] = value; + if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) + vcpu->arch.eff_db[dr] = value; break; - case KVM_PV_REASON_PAGE_NOT_PRESENT: - svm->apf_reason = 0; - local_irq_disable(); - kvm_async_pf_task_wait(fault_address); - local_irq_enable(); + case 4: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return EMULATE_FAIL; /* will re-inject UD */ + /* fall through */ + case 6: + vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1; break; - case KVM_PV_REASON_PAGE_READY: - svm->apf_reason = 0; - local_irq_disable(); - kvm_async_pf_task_wake(fault_address); - local_irq_enable(); + case 5: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return EMULATE_FAIL; /* will re-inject UD */ + /* fall through */ + case 7: + vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1; + if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { + svm->vmcb->save.dr7 = vcpu->arch.dr7; + vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK); + } break; } - return r; + + return EMULATE_DONE; +} + +static int pf_interception(struct vcpu_svm *svm) +{ + u64 fault_address; + u32 error_code; + + fault_address = svm->vmcb->control.exit_info_2; + error_code = svm->vmcb->control.exit_info_1; + + trace_kvm_page_fault(fault_address, error_code); + if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu)) + kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); + return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); } static int db_interception(struct vcpu_svm *svm) @@ -1612,7 +1275,7 @@ static int db_interception(struct vcpu_svm *svm) } if (svm->vcpu.guest_debug & - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) { + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)){ kvm_run->exit_reason = KVM_EXIT_DEBUG; kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; @@ -1637,7 +1300,7 @@ static int ud_interception(struct vcpu_svm *svm) { int er; - er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD); + er = emulate_instruction(&svm->vcpu, 0, 0, EMULTYPE_TRAP_UD); if (er != EMULATE_DONE) kvm_queue_exception(&svm->vcpu, UD_VECTOR); return 1; @@ -1646,9 +1309,7 @@ static int ud_interception(struct vcpu_svm *svm) static void svm_fpu_activate(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - - clr_exception_intercept(svm, NM_VECTOR); - + 
svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); svm->vcpu.fpu_active = 1; update_cr0_intercept(svm); } @@ -1659,59 +1320,8 @@ static int nm_interception(struct vcpu_svm *svm) return 1; } -static bool is_erratum_383(void) -{ - int err, i; - u64 value; - - if (!erratum_383_found) - return false; - - value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err); - if (err) - return false; - - /* Bit 62 may or may not be set for this mce */ - value &= ~(1ULL << 62); - - if (value != 0xb600000000010015ULL) - return false; - - /* Clear MCi_STATUS registers */ - for (i = 0; i < 6; ++i) - kvm_native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0); - - value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err); - if (!err) { - u32 low, high; - - value &= ~(1ULL << 2); - low = lower_32_bits(value); - high = upper_32_bits(value); - - kvm_native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high); - } - - /* Flush tlb to evict multi-match entries */ - __flush_tlb_all(); - - return true; -} - -static void svm_handle_mce(struct vcpu_svm *svm) +static int mc_interception(struct vcpu_svm *svm) { - if (is_erratum_383()) { - /* - * Erratum 383 triggered. Guest state is corrupt so kill the - * guest. - */ - pr_err("KVM: Guest triggered AMD Erratum 383\n"); - - kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu); - - return; - } - /* * On an #MC intercept the MCE handler is not called automatically in * the host. So do it by hand here. @@ -1720,11 +1330,6 @@ static void svm_handle_mce(struct vcpu_svm *svm) "int $0x12\n"); /* not sure if we ever come back to this point */ - return; -} - -static int mc_interception(struct vcpu_svm *svm) -{ return 1; } @@ -1745,23 +1350,29 @@ static int shutdown_interception(struct vcpu_svm *svm) static int io_interception(struct vcpu_svm *svm) { - struct kvm_vcpu *vcpu = &svm->vcpu; u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? 
*/ int size, in, string; unsigned port; ++svm->vcpu.stat.io_exits; + + svm->next_rip = svm->vmcb->control.exit_info_2; + string = (io_info & SVM_IOIO_STR_MASK) != 0; - in = (io_info & SVM_IOIO_TYPE_MASK) != 0; - if (string || in) - return emulate_instruction(vcpu, 0) == EMULATE_DONE; + if (string) { + if (emulate_instruction(&svm->vcpu, + 0, 0, 0) == EMULATE_DO_MMIO) + return 0; + return 1; + } + + in = (io_info & SVM_IOIO_TYPE_MASK) != 0; port = io_info >> 16; size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; - svm->next_rip = svm->vmcb->control.exit_info_2; - skip_emulated_instruction(&svm->vcpu); - return kvm_fast_pio_out(vcpu, size, port); + skip_emulated_instruction(&svm->vcpu); + return kvm_emulate_pio(&svm->vcpu, in, size, port); } static int nmi_interception(struct vcpu_svm *svm) @@ -1795,56 +1406,6 @@ static int vmmcall_interception(struct vcpu_svm *svm) return 1; } -static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - return svm->nested.nested_cr3; -} - -static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu, - unsigned long root) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - svm->vmcb->control.nested_cr3 = root; - mark_dirty(svm->vmcb, VMCB_NPT); - svm_flush_tlb(vcpu); -} - -static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, - struct x86_exception *fault) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - svm->vmcb->control.exit_code = SVM_EXIT_NPF; - svm->vmcb->control.exit_code_hi = 0; - svm->vmcb->control.exit_info_1 = fault->error_code; - svm->vmcb->control.exit_info_2 = fault->address; - - nested_svm_vmexit(svm); -} - -static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) -{ - int r; - - r = kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu); - - vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3; - vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3; - vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit; - vcpu->arch.mmu.shadow_root_level = get_npt_level(); - vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; - - return r; -} - -static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) -{ - vcpu->arch.walk_mmu = &vcpu->arch.mmu; -} - static int nested_svm_check_permissions(struct vcpu_svm *svm) { if (!(svm->vcpu.arch.efer & EFER_SVME) @@ -1864,9 +1425,7 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm) static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code) { - int vmexit; - - if (!is_guest_mode(&svm->vcpu)) + if (!is_nested(svm)) return 0; svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; @@ -1874,36 +1433,21 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, svm->vmcb->control.exit_info_1 = error_code; svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; - vmexit = nested_svm_intercept(svm); - if (vmexit == NESTED_EXIT_DONE) - svm->nested.exit_required = true; - - return vmexit; + return nested_svm_exit_handled(svm); } -/* This function returns true if it is save to enable the irq window */ -static inline bool nested_svm_intr(struct vcpu_svm *svm) +static inline int nested_svm_intr(struct vcpu_svm *svm) { - if (!is_guest_mode(&svm->vcpu)) - return true; + if (!is_nested(svm)) + return 0; if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) - return true; + return 0; if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) - return false; - - /* - * if vmexit was already requested (by intercepted exception - * for instance) do not overwrite it with "external interrupt" - * vmexit. 
- */ - if (svm->nested.exit_required) - return false; + return 0; - svm->vmcb->control.exit_code = SVM_EXIT_INTR; - svm->vmcb->control.exit_info_1 = 0; - svm->vmcb->control.exit_info_2 = 0; + svm->vmcb->control.exit_code = SVM_EXIT_INTR; if (svm->nested.intercept & 1ULL) { /* @@ -1914,40 +1458,22 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm) */ svm->nested.exit_required = true; trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip); - return false; + return 1; } - return true; -} - -/* This function returns true if it is save to enable the nmi window */ -static inline bool nested_svm_nmi(struct vcpu_svm *svm) -{ - if (!is_guest_mode(&svm->vcpu)) - return true; - - if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI))) - return true; - - svm->vmcb->control.exit_code = SVM_EXIT_NMI; - svm->nested.exit_required = true; - - return false; + return 0; } -static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page) +static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx, struct page **mapped_page) { struct page *page; - might_sleep(); - page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT); if (is_error_page(page)) goto error; - *_page = page; - - return kmap(page); + *mapped_page = page; + return kmap_atomic(page, idx); error: kvm_release_page_clean(page); @@ -1956,55 +1482,62 @@ error: return NULL; } -static void nested_svm_unmap(struct page *page) +static void nested_svm_unmap(void *addr, enum km_type idx, struct page *mapped_page) { - kunmap(page); - kvm_release_page_dirty(page); -} - -static int nested_svm_intercept_ioio(struct vcpu_svm *svm) -{ - unsigned port; - u8 val, bit; - u64 gpa; - - if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT))) - return NESTED_EXIT_HOST; + struct page *page; - port = svm->vmcb->control.exit_info_1 >> 16; - gpa = svm->nested.vmcb_iopm + (port / 8); - bit = port % 8; - val = 0; + if (!addr) + return; - if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, 1)) - val &= (1 << bit); + page = mapped_page; - return val ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; + kunmap_atomic(addr, idx); + kvm_release_page_dirty(page); } -static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) +static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm) { - u32 offset, msr, value; - int write, mask; + u32 param = svm->vmcb->control.exit_info_1 & 1; + u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; + bool ret = false; + u32 t0, t1; + u8 *msrpm; if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT))) - return NESTED_EXIT_HOST; + return false; - msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; - offset = svm_msrpm_offset(msr); - write = svm->vmcb->control.exit_info_1 & 1; - mask = 1 << ((2 * (msr & 0xf)) + write); + { struct page *mapped_page; + msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0, &mapped_page); - if (offset == MSR_INVALID) - return NESTED_EXIT_DONE; + if (!msrpm) + goto out; - /* Offset is in 32 bit units but need in 8 bit units */ - offset *= 4; + switch (msr) { + case 0 ... 0x1fff: + t0 = (msr * 2) % 8; + t1 = msr / 8; + break; + case 0xc0000000 ... 0xc0001fff: + t0 = (8192 + msr - 0xc0000000) * 2; + t1 = (t0 / 8); + t0 %= 8; + break; + case 0xc0010000 ... 
0xc0011fff: + t0 = (16384 + msr - 0xc0010000) * 2; + t1 = (t0 / 8); + t0 %= 8; + break; + default: + ret = true; + goto out; + } + + ret = msrpm[t1] & ((1 << param) << t0); - if (kvm_read_guest(svm->vcpu.kvm, svm->nested.vmcb_msrpm + offset, &value, 4)) - return NESTED_EXIT_DONE; +out: + nested_svm_unmap(msrpm, KM_USER0, mapped_page); } - return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; + return ret; } static int nested_svm_exit_special(struct vcpu_svm *svm) @@ -2014,21 +1547,17 @@ static int nested_svm_exit_special(struct vcpu_svm *svm) switch (exit_code) { case SVM_EXIT_INTR: case SVM_EXIT_NMI: - case SVM_EXIT_EXCP_BASE + MC_VECTOR: return NESTED_EXIT_HOST; - case SVM_EXIT_NPF: /* For now we are always handling NPFs when using them */ + case SVM_EXIT_NPF: if (npt_enabled) return NESTED_EXIT_HOST; break; + /* When we're shadowing, trap PFs */ case SVM_EXIT_EXCP_BASE + PF_VECTOR: - /* When we're shadowing, trap PFs, but not async PF */ - if (!npt_enabled && svm->apf_reason == 0) + if (!npt_enabled) return NESTED_EXIT_HOST; break; - case SVM_EXIT_EXCP_BASE + NM_VECTOR: - nm_interception(svm); - break; default: break; } @@ -2039,7 +1568,7 @@ static int nested_svm_exit_special(struct vcpu_svm *svm) /* * If this function returns true, this #vmexit was already handled */ -static int nested_svm_intercept(struct vcpu_svm *svm) +static int nested_svm_exit_handled(struct vcpu_svm *svm) { u32 exit_code = svm->vmcb->control.exit_code; int vmexit = NESTED_EXIT_HOST; @@ -2048,18 +1577,27 @@ static int nested_svm_intercept(struct vcpu_svm *svm) case SVM_EXIT_MSR: vmexit = nested_svm_exit_handled_msr(svm); break; - case SVM_EXIT_IOIO: - vmexit = nested_svm_intercept_ioio(svm); + case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: { + u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0); + if (svm->nested.intercept_cr_read & cr_bits) + vmexit = NESTED_EXIT_DONE; + break; + } + case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: { + u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0); + if (svm->nested.intercept_cr_write & cr_bits) + vmexit = NESTED_EXIT_DONE; break; - case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: { - u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0); - if (svm->nested.intercept_cr & bit) + } + case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: { + u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0); + if (svm->nested.intercept_dr_read & dr_bits) vmexit = NESTED_EXIT_DONE; break; } - case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: { - u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0); - if (svm->nested.intercept_dr & bit) + case SVM_EXIT_WRITE_DR0 ... 
SVM_EXIT_WRITE_DR7: { + u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0); + if (svm->nested.intercept_dr_write & dr_bits) vmexit = NESTED_EXIT_DONE; break; } @@ -2067,14 +1605,6 @@ static int nested_svm_intercept(struct vcpu_svm *svm) u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); if (svm->nested.intercept_exceptions & excp_bits) vmexit = NESTED_EXIT_DONE; - /* async page fault always cause vmexit */ - else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) && - svm->apf_reason != 0) - vmexit = NESTED_EXIT_DONE; - break; - } - case SVM_EXIT_ERR: { - vmexit = NESTED_EXIT_DONE; break; } default: { @@ -2084,17 +1614,9 @@ static int nested_svm_intercept(struct vcpu_svm *svm) } } - return vmexit; -} - -static int nested_svm_exit_handled(struct vcpu_svm *svm) -{ - int vmexit; - - vmexit = nested_svm_intercept(svm); - - if (vmexit == NESTED_EXIT_DONE) + if (vmexit == NESTED_EXIT_DONE) { nested_svm_vmexit(svm); + } return vmexit; } @@ -2104,8 +1626,10 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *fr struct vmcb_control_area *dst = &dst_vmcb->control; struct vmcb_control_area *from = &from_vmcb->control; - dst->intercept_cr = from->intercept_cr; - dst->intercept_dr = from->intercept_dr; + dst->intercept_cr_read = from->intercept_cr_read; + dst->intercept_cr_write = from->intercept_cr_write; + dst->intercept_dr_read = from->intercept_dr_read; + dst->intercept_dr_write = from->intercept_dr_write; dst->intercept_exceptions = from->intercept_exceptions; dst->intercept = from->intercept; dst->iopm_base_pa = from->iopm_base_pa; @@ -2134,7 +1658,6 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) struct vmcb *nested_vmcb; struct vmcb *hsave = svm->nested.hsave; struct vmcb *vmcb = svm->vmcb; - struct page *page; trace_kvm_nested_vmexit_inject(vmcb->control.exit_code, vmcb->control.exit_info_1, @@ -2142,14 +1665,11 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) vmcb->control.exit_int_info, vmcb->control.exit_int_info_err); - nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page); + { struct page *mapped_page; + nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0, &mapped_page); if (!nested_vmcb) return 1; - /* Exit Guest-Mode */ - leave_guest_mode(&svm->vcpu); - svm->nested.vmcb = 0; - /* Give the current vmcb to the guest */ disable_gif(svm); @@ -2159,11 +1679,9 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) nested_vmcb->save.ds = vmcb->save.ds; nested_vmcb->save.gdtr = vmcb->save.gdtr; nested_vmcb->save.idtr = vmcb->save.idtr; - nested_vmcb->save.efer = svm->vcpu.arch.efer; - nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu); - nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu); + if (npt_enabled) + nested_vmcb->save.cr3 = vmcb->save.cr3; nested_vmcb->save.cr2 = vmcb->save.cr2; - nested_vmcb->save.cr4 = svm->vcpu.arch.cr4; nested_vmcb->save.rflags = vmcb->save.rflags; nested_vmcb->save.rip = vmcb->save.rip; nested_vmcb->save.rsp = vmcb->save.rsp; @@ -2181,7 +1699,6 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; - nested_vmcb->control.next_rip = vmcb->control.next_rip; /* * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have @@ -2212,8 +1729,6 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) kvm_clear_exception_queue(&svm->vcpu); kvm_clear_interrupt_queue(&svm->vcpu); - svm->nested.nested_cr3 = 0; - /* 
Restore selected save entries */ svm->vmcb->save.es = hsave->save.es; svm->vmcb->save.cs = hsave->save.cs; @@ -2229,7 +1744,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) svm->vmcb->save.cr3 = hsave->save.cr3; svm->vcpu.arch.cr3 = hsave->save.cr3; } else { - (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3); + kvm_set_cr3(&svm->vcpu, hsave->save.cr3); } kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax); kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp); @@ -2238,11 +1753,11 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) svm->vmcb->save.cpl = 0; svm->vmcb->control.exit_int_info = 0; - mark_all_dirty(svm->vmcb); + /* Exit nested SVM mode */ + svm->nested.vmcb = 0; - nested_svm_unmap(page); + nested_svm_unmap(nested_vmcb, KM_USER0, mapped_page); } - nested_svm_uninit_mmu_context(&svm->vcpu); kvm_mmu_reset_context(&svm->vcpu); kvm_mmu_load(&svm->vcpu); @@ -2251,47 +1766,20 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) { - /* - * This function merges the msr permission bitmaps of kvm and the - * nested vmcb. It is omptimized in that it only merges the parts where - * the kvm msr permission bitmap may contain zero bits - */ + u32 *nested_msrpm; int i; - if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT))) - return true; - - for (i = 0; i < MSRPM_OFFSETS; i++) { - u32 value, p; - u64 offset; - - if (msrpm_offsets[i] == 0xffffffff) - break; - - p = msrpm_offsets[i]; - offset = svm->nested.vmcb_msrpm + (p * 4); - - if (kvm_read_guest(svm->vcpu.kvm, offset, &value, 4)) - return false; + { struct page *mapped_page; + nested_msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0, &mapped_page); + if (!nested_msrpm) + return false; - svm->nested.msrpm[p] = svm->msrpm[p] | value; - } + for (i=0; i< PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++) + svm->nested.msrpm[i] = svm->msrpm[i] | nested_msrpm[i]; svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm); - return true; -} - -static bool nested_vmcb_checks(struct vmcb *vmcb) -{ - if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0) - return false; - - if (vmcb->control.asid == 0) - return false; - - if (vmcb->control.nested_ctl && !npt_enabled) - return false; + nested_svm_unmap(nested_msrpm, KM_USER0, mapped_page); } return true; } @@ -2301,45 +1789,27 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) struct vmcb *nested_vmcb; struct vmcb *hsave = svm->nested.hsave; struct vmcb *vmcb = svm->vmcb; - struct page *page; - u64 vmcb_gpa; - vmcb_gpa = svm->vmcb->save.rax; - - nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); + { struct page *mapped_page; + nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0, &mapped_page); if (!nested_vmcb) return false; - if (!nested_vmcb_checks(nested_vmcb)) { - nested_vmcb->control.exit_code = SVM_EXIT_ERR; - nested_vmcb->control.exit_code_hi = 0; - nested_vmcb->control.exit_info_1 = 0; - nested_vmcb->control.exit_info_2 = 0; - - nested_svm_unmap(page); + /* nested_vmcb is our indicator if nested SVM is activated */ + svm->nested.vmcb = svm->vmcb->save.rax; - return false; - } - - trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa, + trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, svm->nested.vmcb, nested_vmcb->save.rip, nested_vmcb->control.int_ctl, nested_vmcb->control.event_inj, nested_vmcb->control.nested_ctl); - trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff, - nested_vmcb->control.intercept_cr >> 16, - 
nested_vmcb->control.intercept_exceptions, - nested_vmcb->control.intercept); - /* Clear internal status */ kvm_clear_exception_queue(&svm->vcpu); kvm_clear_interrupt_queue(&svm->vcpu); - /* - * Save the old vmcb, so we don't need to pick what we save, but can - * restore everything when a VMEXIT occurs - */ + /* Save the old vmcb, so we don't need to pick what we save, but + can restore everything when a VMEXIT occurs */ hsave->save.es = vmcb->save.es; hsave->save.cs = vmcb->save.cs; hsave->save.ss = vmcb->save.ss; @@ -2350,13 +1820,13 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); hsave->save.cr4 = svm->vcpu.arch.cr4; hsave->save.rflags = vmcb->save.rflags; - hsave->save.rip = kvm_rip_read(&svm->vcpu); + hsave->save.rip = svm->next_rip; hsave->save.rsp = vmcb->save.rsp; hsave->save.rax = vmcb->save.rax; if (npt_enabled) hsave->save.cr3 = vmcb->save.cr3; else - hsave->save.cr3 = kvm_read_cr3(&svm->vcpu); + hsave->save.cr3 = svm->vcpu.arch.cr3; copy_vmcb_control_area(hsave, vmcb); @@ -2365,12 +1835,6 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) else svm->vcpu.arch.hflags &= ~HF_HIF_MASK; - if (nested_vmcb->control.nested_ctl) { - kvm_mmu_unload(&svm->vcpu); - svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3; - nested_svm_init_mmu_context(&svm->vcpu); - } - /* Load the nested guest state */ svm->vmcb->save.es = nested_vmcb->save.es; svm->vmcb->save.cs = nested_vmcb->save.cs; @@ -2385,17 +1849,14 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) if (npt_enabled) { svm->vmcb->save.cr3 = nested_vmcb->save.cr3; svm->vcpu.arch.cr3 = nested_vmcb->save.cr3; - } else - (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); - - /* Guest paging mode is active - reset mmu */ - kvm_mmu_reset_context(&svm->vcpu); - + } else { + kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); + kvm_mmu_reset_context(&svm->vcpu); + } svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2; kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax); kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp); kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip); - /* In case we don't even reach vcpu_run, the fields are not updated */ svm->vmcb->save.rax = nested_vmcb->save.rax; svm->vmcb->save.rsp = nested_vmcb->save.rsp; @@ -2404,55 +1865,48 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) svm->vmcb->save.dr6 = nested_vmcb->save.dr6; svm->vmcb->save.cpl = nested_vmcb->save.cpl; - svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL; - svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL; + /* We don't want a nested guest to be more powerful than the guest, + so all intercepts are ORed */ + svm->vmcb->control.intercept_cr_read |= + nested_vmcb->control.intercept_cr_read; + svm->vmcb->control.intercept_cr_write |= + nested_vmcb->control.intercept_cr_write; + svm->vmcb->control.intercept_dr_read |= + nested_vmcb->control.intercept_dr_read; + svm->vmcb->control.intercept_dr_write |= + nested_vmcb->control.intercept_dr_write; + svm->vmcb->control.intercept_exceptions |= + nested_vmcb->control.intercept_exceptions; + + svm->vmcb->control.intercept |= nested_vmcb->control.intercept; + + svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa; /* cache intercepts */ - svm->nested.intercept_cr = nested_vmcb->control.intercept_cr; - svm->nested.intercept_dr = nested_vmcb->control.intercept_dr; + svm->nested.intercept_cr_read = nested_vmcb->control.intercept_cr_read; + 
svm->nested.intercept_cr_write = nested_vmcb->control.intercept_cr_write; + svm->nested.intercept_dr_read = nested_vmcb->control.intercept_dr_read; + svm->nested.intercept_dr_write = nested_vmcb->control.intercept_dr_write; svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions; svm->nested.intercept = nested_vmcb->control.intercept; - svm_flush_tlb(&svm->vcpu); + force_new_asid(&svm->vcpu); svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) svm->vcpu.arch.hflags |= HF_VINTR_MASK; else svm->vcpu.arch.hflags &= ~HF_VINTR_MASK; - if (svm->vcpu.arch.hflags & HF_VINTR_MASK) { - /* We only want the cr8 intercept bits of the guest */ - clr_cr_intercept(svm, INTERCEPT_CR8_READ); - clr_cr_intercept(svm, INTERCEPT_CR8_WRITE); - } - - /* We don't want to see VMMCALLs from a nested guest */ - clr_intercept(svm, INTERCEPT_VMMCALL); - - svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl; svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; svm->vmcb->control.int_state = nested_vmcb->control.int_state; svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset; svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; - nested_svm_unmap(page); - - /* Enter Guest-Mode */ - enter_guest_mode(&svm->vcpu); - - /* - * Merge guest and host intercepts - must be called with vcpu in - * guest-mode to take affect here - */ - recalc_intercepts(svm); - - svm->nested.vmcb = vmcb_gpa; + nested_svm_unmap(nested_vmcb, KM_USER0, mapped_page); } enable_gif(svm); - mark_all_dirty(svm->vmcb); - return true; } @@ -2475,7 +1929,6 @@ static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) static int vmload_interception(struct vcpu_svm *svm) { struct vmcb *nested_vmcb; - struct page *page; if (nested_svm_check_permissions(svm)) return 1; @@ -2483,12 +1936,13 @@ static int vmload_interception(struct vcpu_svm *svm) svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; skip_emulated_instruction(&svm->vcpu); - nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); + { struct page *mapped_page; + nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0, &mapped_page); if (!nested_vmcb) return 1; nested_svm_vmloadsave(nested_vmcb, svm->vmcb); - nested_svm_unmap(page); + nested_svm_unmap(nested_vmcb, KM_USER0, mapped_page); } return 1; } @@ -2496,7 +1950,6 @@ static int vmload_interception(struct vcpu_svm *svm) static int vmsave_interception(struct vcpu_svm *svm) { struct vmcb *nested_vmcb; - struct page *page; if (nested_svm_check_permissions(svm)) return 1; @@ -2504,12 +1957,13 @@ static int vmsave_interception(struct vcpu_svm *svm) svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; skip_emulated_instruction(&svm->vcpu); - nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); + { struct page *mapped_page; + nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0, &mapped_page); if (!nested_vmcb) return 1; nested_svm_vmloadsave(svm->vmcb, nested_vmcb); - nested_svm_unmap(page); + nested_svm_unmap(nested_vmcb, KM_USER0, mapped_page); } return 1; } @@ -2519,8 +1973,8 @@ static int vmrun_interception(struct vcpu_svm *svm) if (nested_svm_check_permissions(svm)) return 1; - /* Save rip after vmrun instruction */ - kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3); + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; + skip_emulated_instruction(&svm->vcpu); if (!nested_svm_vmrun(svm)) return 
1; @@ -2549,7 +2003,6 @@ static int stgi_interception(struct vcpu_svm *svm) svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; skip_emulated_instruction(&svm->vcpu); - kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); enable_gif(svm); @@ -2570,8 +2023,6 @@ static int clgi_interception(struct vcpu_svm *svm) svm_clear_vintr(svm); svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; - mark_dirty(svm->vmcb, VMCB_INTR); - return 1; } @@ -2598,19 +2049,6 @@ static int skinit_interception(struct vcpu_svm *svm) return 1; } -static int xsetbv_interception(struct vcpu_svm *svm) -{ - u64 new_bv = kvm_read_edx_eax(&svm->vcpu); - u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX); - - if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) { - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); - } - - return 1; -} - static int invalid_op_interception(struct vcpu_svm *svm) { kvm_queue_exception(&svm->vcpu, UD_VECTOR); @@ -2628,8 +2066,6 @@ static int task_switch_interception(struct vcpu_svm *svm) svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK; uint32_t idt_v = svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID; - bool has_error_code = false; - u32 error_code = 0; tss_selector = (u16)svm->vmcb->control.exit_info_1; @@ -2650,12 +2086,6 @@ static int task_switch_interception(struct vcpu_svm *svm) svm->vcpu.arch.nmi_injected = false; break; case SVM_EXITINTINFO_TYPE_EXEPT: - if (svm->vmcb->control.exit_info_2 & - (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) { - has_error_code = true; - error_code = - (u32)svm->vmcb->control.exit_info_2; - } kvm_clear_exception_queue(&svm->vcpu); break; case SVM_EXITINTINFO_TYPE_INTR: @@ -2672,14 +2102,7 @@ static int task_switch_interception(struct vcpu_svm *svm) (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) skip_emulated_instruction(&svm->vcpu); - if (kvm_task_switch(&svm->vcpu, tss_selector, reason, - has_error_code, error_code) == EMULATE_FAIL) { - svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - svm->vcpu.run->internal.ndata = 0; - return 0; - } - return 1; + return kvm_task_switch(&svm->vcpu, tss_selector, reason); } static int cpuid_interception(struct vcpu_svm *svm) @@ -2692,151 +2115,38 @@ static int cpuid_interception(struct vcpu_svm *svm) static int iret_interception(struct vcpu_svm *svm) { ++svm->vcpu.stat.nmi_window_exits; - clr_intercept(svm, INTERCEPT_IRET); + svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET); svm->vcpu.arch.hflags |= HF_IRET_MASK; return 1; } static int invlpg_interception(struct vcpu_svm *svm) { - if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) - return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; - - kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1); - skip_emulated_instruction(&svm->vcpu); + if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE) + pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); return 1; } static int emulate_on_interception(struct vcpu_svm *svm) { - return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; -} - -#define CR_VALID (1ULL << 63) - -static int cr_interception(struct vcpu_svm *svm) -{ - int reg, cr; - unsigned long val; - int err; - - if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) - return emulate_on_interception(svm); - - if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0)) - return emulate_on_interception(svm); - - reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; - cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0; - - err = 0; - if (cr 
>= 16) { /* mov to cr */ - cr -= 16; - val = kvm_register_read(&svm->vcpu, reg); - switch (cr) { - case 0: - err = kvm_set_cr0(&svm->vcpu, val); - break; - case 3: - err = kvm_set_cr3(&svm->vcpu, val); - break; - case 4: - err = kvm_set_cr4(&svm->vcpu, val); - break; - case 8: - err = kvm_set_cr8(&svm->vcpu, val); - break; - default: - WARN(1, "unhandled write to CR%d", cr); - kvm_queue_exception(&svm->vcpu, UD_VECTOR); - return 1; - } - } else { /* mov from cr */ - switch (cr) { - case 0: - val = kvm_read_cr0(&svm->vcpu); - break; - case 2: - val = svm->vcpu.arch.cr2; - break; - case 3: - val = kvm_read_cr3(&svm->vcpu); - break; - case 4: - val = kvm_read_cr4(&svm->vcpu); - break; - case 8: - val = kvm_get_cr8(&svm->vcpu); - break; - default: - WARN(1, "unhandled read from CR%d", cr); - kvm_queue_exception(&svm->vcpu, UD_VECTOR); - return 1; - } - kvm_register_write(&svm->vcpu, reg, val); - } - kvm_complete_insn_gp(&svm->vcpu, err); - - return 1; -} - -static int cr0_write_interception(struct vcpu_svm *svm) -{ - struct kvm_vcpu *vcpu = &svm->vcpu; - int r; - - r = cr_interception(svm); - - if (svm->nested.vmexit_rip) { - kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip); - kvm_register_write(vcpu, VCPU_REGS_RSP, svm->nested.vmexit_rsp); - kvm_register_write(vcpu, VCPU_REGS_RAX, svm->nested.vmexit_rax); - svm->nested.vmexit_rip = 0; - } - - return r; -} - -static int dr_interception(struct vcpu_svm *svm) -{ - int reg, dr; - unsigned long val; - int err; - - if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS)) - return emulate_on_interception(svm); - - reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; - dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0; - - if (dr >= 16) { /* mov to DRn */ - val = kvm_register_read(&svm->vcpu, reg); - kvm_set_dr(&svm->vcpu, dr - 16, val); - } else { - err = kvm_get_dr(&svm->vcpu, dr, &val); - if (!err) - kvm_register_write(&svm->vcpu, reg, val); - } - - skip_emulated_instruction(&svm->vcpu); - + if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE) + pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); return 1; } static int cr8_write_interception(struct vcpu_svm *svm) { struct kvm_run *kvm_run = svm->vcpu.run; - int r; u8 cr8_prev = kvm_get_cr8(&svm->vcpu); /* instruction emulation calls kvm_set_cr8() */ - r = cr_interception(svm); + emulate_instruction(&svm->vcpu, 0, 0, 0); if (irqchip_in_kernel(svm->vcpu.kvm)) { - clr_cr_intercept(svm, INTERCEPT_CR8_WRITE); - return r; + svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; + return 1; } if (cr8_prev <= kvm_get_cr8(&svm->vcpu)) - return r; + return 1; kvm_run->exit_reason = KVM_EXIT_SET_TPR; return 0; } @@ -2847,12 +2157,17 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) switch (ecx) { case MSR_IA32_TSC: { - struct vmcb *vmcb = get_host_vmcb(svm); + u64 tsc_offset; - *data = vmcb->control.tsc_offset + kvm_native_read_tsc(); + if (is_nested(svm)) + tsc_offset = svm->nested.hsave->control.tsc_offset; + else + tsc_offset = svm->vmcb->control.tsc_offset; + + *data = tsc_offset + kvm_native_read_tsc(); break; } - case MSR_STAR: + case MSR_K6_STAR: *data = svm->vmcb->save.star; break; #ifdef CONFIG_X86_64 @@ -2878,11 +2193,9 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) case MSR_IA32_SYSENTER_ESP: *data = svm->sysenter_esp; break; - /* - * Nobody will change the following 5 values in the VMCB so we can - * safely return them on rdmsr. They will always be 0 until LBRV is - * implemented. 
- */ + /* Nobody will change the following 5 values in the VMCB so + we can safely return them on rdmsr. They will always be 0 + until LBRV is implemented. */ case MSR_IA32_DEBUGCTLMSR: *data = svm->vmcb->save.dbgctl; break; @@ -2902,7 +2215,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) *data = svm->nested.hsave_msr; break; case MSR_VM_CR: - *data = svm->nested.vm_cr_msr; + *data = 0; break; case MSR_IA32_UCODE_REV: *data = 0x01000065; @@ -2932,40 +2245,26 @@ static int rdmsr_interception(struct vcpu_svm *svm) return 1; } -static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data) +static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) { struct vcpu_svm *svm = to_svm(vcpu); - int svm_dis, chg_mask; - - if (data & ~SVM_VM_CR_VALID_MASK) - return 1; - - chg_mask = SVM_VM_CR_VALID_MASK; - if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK) - chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK); - - svm->nested.vm_cr_msr &= ~chg_mask; - svm->nested.vm_cr_msr |= (data & chg_mask); - - svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK; - - /* check for svm_disable while efer.svme is set */ - if (svm_dis && (vcpu->arch.efer & EFER_SVME)) - return 1; + switch (ecx) { + case MSR_IA32_TSC: { + u64 tsc_offset = data - kvm_native_read_tsc(); + u64 g_tsc_offset = 0; - return 0; -} + if (is_nested(svm)) { + g_tsc_offset = svm->vmcb->control.tsc_offset - + svm->nested.hsave->control.tsc_offset; + svm->nested.hsave->control.tsc_offset = tsc_offset; + } -static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) -{ - struct vcpu_svm *svm = to_svm(vcpu); + svm->vmcb->control.tsc_offset = tsc_offset + g_tsc_offset; - switch (ecx) { - case MSR_IA32_TSC: - kvm_write_tsc(vcpu, data); break; - case MSR_STAR: + } + case MSR_K6_STAR: svm->vmcb->save.star = data; break; #ifdef CONFIG_X86_64 @@ -2994,7 +2293,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) svm->vmcb->save.sysenter_esp = data; break; case MSR_IA32_DEBUGCTLMSR: - if (!boot_cpu_has(X86_FEATURE_LBRV)) { + if (!svm_has(SVM_FEATURE_LBRV)) { pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", __func__, data); break; @@ -3003,7 +2302,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) return 1; svm->vmcb->save.dbgctl = data; - mark_dirty(svm->vmcb, VMCB_LBR); if (data & (1ULL<<0)) svm_enable_lbrv(svm); else @@ -3013,7 +2311,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) svm->nested.hsave_msr = data; break; case MSR_VM_CR: - return svm_set_vm_cr(vcpu, data); case MSR_VM_IGNNE: pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); break; @@ -3053,10 +2350,8 @@ static int interrupt_window_interception(struct vcpu_svm *svm) { struct kvm_run *kvm_run = svm->vcpu.run; - kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); svm_clear_vintr(svm); svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; - mark_dirty(svm->vmcb, VMCB_INTR); /* * If the user space waits to inject interrupts, exit as soon as * possible @@ -3079,42 +2374,43 @@ static int pause_interception(struct vcpu_svm *svm) } static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { - [SVM_EXIT_READ_CR0] = cr_interception, - [SVM_EXIT_READ_CR3] = cr_interception, - [SVM_EXIT_READ_CR4] = cr_interception, - [SVM_EXIT_READ_CR8] = cr_interception, + [SVM_EXIT_READ_CR0] = emulate_on_interception, + [SVM_EXIT_READ_CR3] = emulate_on_interception, + [SVM_EXIT_READ_CR4] = emulate_on_interception, + [SVM_EXIT_READ_CR8] = emulate_on_interception, 
[SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, - [SVM_EXIT_WRITE_CR0] = cr0_write_interception, - [SVM_EXIT_WRITE_CR3] = cr_interception, - [SVM_EXIT_WRITE_CR4] = cr_interception, - [SVM_EXIT_WRITE_CR8] = cr8_write_interception, - [SVM_EXIT_READ_DR0] = dr_interception, - [SVM_EXIT_READ_DR1] = dr_interception, - [SVM_EXIT_READ_DR2] = dr_interception, - [SVM_EXIT_READ_DR3] = dr_interception, - [SVM_EXIT_READ_DR4] = dr_interception, - [SVM_EXIT_READ_DR5] = dr_interception, - [SVM_EXIT_READ_DR6] = dr_interception, - [SVM_EXIT_READ_DR7] = dr_interception, - [SVM_EXIT_WRITE_DR0] = dr_interception, - [SVM_EXIT_WRITE_DR1] = dr_interception, - [SVM_EXIT_WRITE_DR2] = dr_interception, - [SVM_EXIT_WRITE_DR3] = dr_interception, - [SVM_EXIT_WRITE_DR4] = dr_interception, - [SVM_EXIT_WRITE_DR5] = dr_interception, - [SVM_EXIT_WRITE_DR6] = dr_interception, - [SVM_EXIT_WRITE_DR7] = dr_interception, + [SVM_EXIT_WRITE_CR0] = emulate_on_interception, + [SVM_EXIT_WRITE_CR3] = emulate_on_interception, + [SVM_EXIT_WRITE_CR4] = emulate_on_interception, + [SVM_EXIT_WRITE_CR8] = cr8_write_interception, + [SVM_EXIT_READ_DR0] = emulate_on_interception, + [SVM_EXIT_READ_DR1] = emulate_on_interception, + [SVM_EXIT_READ_DR2] = emulate_on_interception, + [SVM_EXIT_READ_DR3] = emulate_on_interception, + [SVM_EXIT_READ_DR4] = emulate_on_interception, + [SVM_EXIT_READ_DR5] = emulate_on_interception, + [SVM_EXIT_READ_DR6] = emulate_on_interception, + [SVM_EXIT_READ_DR7] = emulate_on_interception, + [SVM_EXIT_WRITE_DR0] = emulate_on_interception, + [SVM_EXIT_WRITE_DR1] = emulate_on_interception, + [SVM_EXIT_WRITE_DR2] = emulate_on_interception, + [SVM_EXIT_WRITE_DR3] = emulate_on_interception, + [SVM_EXIT_WRITE_DR4] = emulate_on_interception, + [SVM_EXIT_WRITE_DR5] = emulate_on_interception, + [SVM_EXIT_WRITE_DR6] = emulate_on_interception, + [SVM_EXIT_WRITE_DR7] = emulate_on_interception, [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, - [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, - [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, - [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, - [SVM_EXIT_INTR] = intr_interception, + [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, + [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, + [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, + [SVM_EXIT_INTR] = intr_interception, [SVM_EXIT_NMI] = nmi_interception, [SVM_EXIT_SMI] = nop_on_interception, [SVM_EXIT_INIT] = nop_on_interception, [SVM_EXIT_VINTR] = interrupt_window_interception, + /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */ [SVM_EXIT_CPUID] = cpuid_interception, [SVM_EXIT_IRET] = iret_interception, [SVM_EXIT_INVD] = emulate_on_interception, @@ -3122,7 +2418,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_HLT] = halt_interception, [SVM_EXIT_INVLPG] = invlpg_interception, [SVM_EXIT_INVLPGA] = invlpga_interception, - [SVM_EXIT_IOIO] = io_interception, + [SVM_EXIT_IOIO] = io_interception, [SVM_EXIT_MSR] = msr_interception, [SVM_EXIT_TASK_SWITCH] = task_switch_interception, [SVM_EXIT_SHUTDOWN] = shutdown_interception, @@ -3136,123 +2432,16 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_WBINVD] = emulate_on_interception, [SVM_EXIT_MONITOR] = invalid_op_interception, [SVM_EXIT_MWAIT] = invalid_op_interception, - [SVM_EXIT_XSETBV] = xsetbv_interception, [SVM_EXIT_NPF] = pf_interception, }; -void dump_vmcb(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm 
*svm = to_svm(vcpu); - struct vmcb_control_area *control = &svm->vmcb->control; - struct vmcb_save_area *save = &svm->vmcb->save; - - pr_err("VMCB Control Area:\n"); - pr_err("cr_read: %04x\n", control->intercept_cr & 0xffff); - pr_err("cr_write: %04x\n", control->intercept_cr >> 16); - pr_err("dr_read: %04x\n", control->intercept_dr & 0xffff); - pr_err("dr_write: %04x\n", control->intercept_dr >> 16); - pr_err("exceptions: %08x\n", control->intercept_exceptions); - pr_err("intercepts: %016llx\n", control->intercept); - pr_err("pause filter count: %d\n", control->pause_filter_count); - pr_err("iopm_base_pa: %016llx\n", control->iopm_base_pa); - pr_err("msrpm_base_pa: %016llx\n", control->msrpm_base_pa); - pr_err("tsc_offset: %016llx\n", control->tsc_offset); - pr_err("asid: %d\n", control->asid); - pr_err("tlb_ctl: %d\n", control->tlb_ctl); - pr_err("int_ctl: %08x\n", control->int_ctl); - pr_err("int_vector: %08x\n", control->int_vector); - pr_err("int_state: %08x\n", control->int_state); - pr_err("exit_code: %08x\n", control->exit_code); - pr_err("exit_info1: %016llx\n", control->exit_info_1); - pr_err("exit_info2: %016llx\n", control->exit_info_2); - pr_err("exit_int_info: %08x\n", control->exit_int_info); - pr_err("exit_int_info_err: %08x\n", control->exit_int_info_err); - pr_err("nested_ctl: %lld\n", control->nested_ctl); - pr_err("nested_cr3: %016llx\n", control->nested_cr3); - pr_err("event_inj: %08x\n", control->event_inj); - pr_err("event_inj_err: %08x\n", control->event_inj_err); - pr_err("lbr_ctl: %lld\n", control->lbr_ctl); - pr_err("next_rip: %016llx\n", control->next_rip); - pr_err("VMCB State Save Area:\n"); - pr_err("es: s: %04x a: %04x l: %08x b: %016llx\n", - save->es.selector, save->es.attrib, - save->es.limit, save->es.base); - pr_err("cs: s: %04x a: %04x l: %08x b: %016llx\n", - save->cs.selector, save->cs.attrib, - save->cs.limit, save->cs.base); - pr_err("ss: s: %04x a: %04x l: %08x b: %016llx\n", - save->ss.selector, save->ss.attrib, - save->ss.limit, save->ss.base); - pr_err("ds: s: %04x a: %04x l: %08x b: %016llx\n", - save->ds.selector, save->ds.attrib, - save->ds.limit, save->ds.base); - pr_err("fs: s: %04x a: %04x l: %08x b: %016llx\n", - save->fs.selector, save->fs.attrib, - save->fs.limit, save->fs.base); - pr_err("gs: s: %04x a: %04x l: %08x b: %016llx\n", - save->gs.selector, save->gs.attrib, - save->gs.limit, save->gs.base); - pr_err("gdtr: s: %04x a: %04x l: %08x b: %016llx\n", - save->gdtr.selector, save->gdtr.attrib, - save->gdtr.limit, save->gdtr.base); - pr_err("ldtr: s: %04x a: %04x l: %08x b: %016llx\n", - save->ldtr.selector, save->ldtr.attrib, - save->ldtr.limit, save->ldtr.base); - pr_err("idtr: s: %04x a: %04x l: %08x b: %016llx\n", - save->idtr.selector, save->idtr.attrib, - save->idtr.limit, save->idtr.base); - pr_err("tr: s: %04x a: %04x l: %08x b: %016llx\n", - save->tr.selector, save->tr.attrib, - save->tr.limit, save->tr.base); - pr_err("cpl: %d efer: %016llx\n", - save->cpl, save->efer); - pr_err("cr0: %016llx cr2: %016llx\n", - save->cr0, save->cr2); - pr_err("cr3: %016llx cr4: %016llx\n", - save->cr3, save->cr4); - pr_err("dr6: %016llx dr7: %016llx\n", - save->dr6, save->dr7); - pr_err("rip: %016llx rflags: %016llx\n", - save->rip, save->rflags); - pr_err("rsp: %016llx rax: %016llx\n", - save->rsp, save->rax); - pr_err("star: %016llx lstar: %016llx\n", - save->star, save->lstar); - pr_err("cstar: %016llx sfmask: %016llx\n", - save->cstar, save->sfmask); - pr_err("kernel_gs_base: %016llx sysenter_cs: %016llx\n", - save->kernel_gs_base, 
save->sysenter_cs); - pr_err("sysenter_esp: %016llx sysenter_eip: %016llx\n", - save->sysenter_esp, save->sysenter_eip); - pr_err("gpat: %016llx dbgctl: %016llx\n", - save->g_pat, save->dbgctl); - pr_err("br_from: %016llx br_to: %016llx\n", - save->br_from, save->br_to); - pr_err("excp_from: %016llx excp_to: %016llx\n", - save->last_excp_from, save->last_excp_to); - -} - -static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) -{ - struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control; - - *info1 = control->exit_info_1; - *info2 = control->exit_info_2; -} - static int handle_exit(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_run *kvm_run = vcpu->run; u32 exit_code = svm->vmcb->control.exit_code; - trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM); - - if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE)) - vcpu->arch.cr0 = svm->vmcb->save.cr0; - if (npt_enabled) - vcpu->arch.cr3 = svm->vmcb->save.cr3; + trace_kvm_exit(exit_code, svm->vmcb->save.rip); if (unlikely(svm->nested.exit_required)) { nested_svm_vmexit(svm); @@ -3261,7 +2450,7 @@ static int handle_exit(struct kvm_vcpu *vcpu) return 1; } - if (is_guest_mode(vcpu)) { + if (is_nested(svm)) { int vmexit; trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code, @@ -3281,19 +2470,21 @@ static int handle_exit(struct kvm_vcpu *vcpu) svm_complete_interrupts(svm); + if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK)) + vcpu->arch.cr0 = svm->vmcb->save.cr0; + if (npt_enabled) + vcpu->arch.cr3 = svm->vmcb->save.cr3; + if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; kvm_run->fail_entry.hardware_entry_failure_reason = svm->vmcb->control.exit_code; - pr_err("KVM: FAILED VMRUN WITH VMCB:\n"); - dump_vmcb(vcpu); return 0; } if (is_external_interrupt(svm->vmcb->control.exit_int_info) && exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && - exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH && - exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI) + exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH) printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " "exit_code 0x%x\n", __func__, svm->vmcb->control.exit_int_info, @@ -3324,6 +2515,7 @@ static void pre_svm_run(struct vcpu_svm *svm) struct svm_cpu_data *sd = per_cpu(svm_data, cpu); + svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; /* FIXME: handle wraparound of asid_generation */ if (svm->asid_generation != sd->asid_generation) new_asid(svm, sd); @@ -3335,7 +2527,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu) svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; vcpu->arch.hflags |= HF_NMI_MASK; - set_intercept(svm, INTERCEPT_IRET); + svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET); ++vcpu->stat.nmi_injections; } @@ -3343,12 +2535,14 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) { struct vmcb_control_area *control; + trace_kvm_inj_virq(irq); + + ++svm->vcpu.stat.irq_injections; control = &svm->vmcb->control; control->int_vector = irq; control->int_ctl &= ~V_INTR_PRIO_MASK; control->int_ctl |= V_IRQ_MASK | ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); - mark_dirty(svm->vmcb, VMCB_INTR); } static void svm_set_irq(struct kvm_vcpu *vcpu) @@ -3357,9 +2551,6 @@ static void svm_set_irq(struct kvm_vcpu *vcpu) BUG_ON(!(gif_set(svm))); - trace_kvm_inj_virq(vcpu->arch.interrupt.nr); - ++vcpu->stat.irq_injections; - svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | SVM_EVTINJ_VALID | 
SVM_EVTINJ_TYPE_INTR; } @@ -3368,26 +2559,19 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { struct vcpu_svm *svm = to_svm(vcpu); - if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK)) - return; - if (irr == -1) return; if (tpr >= irr) - set_cr_intercept(svm, INTERCEPT_CR8_WRITE); + svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK; } static int svm_nmi_allowed(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb; - int ret; - ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && - !(svm->vcpu.arch.hflags & HF_NMI_MASK); - ret = ret && gif_set(svm) && nested_svm_nmi(svm); - - return ret; + return !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && + !(svm->vcpu.arch.hflags & HF_NMI_MASK); } static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu) @@ -3403,10 +2587,10 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) if (masked) { svm->vcpu.arch.hflags |= HF_NMI_MASK; - set_intercept(svm, INTERCEPT_IRET); + svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET); } else { svm->vcpu.arch.hflags &= ~HF_NMI_MASK; - clr_intercept(svm, INTERCEPT_IRET); + svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET); } } @@ -3422,7 +2606,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) ret = !!(vmcb->save.rflags & X86_EFLAGS_IF); - if (is_guest_mode(vcpu)) + if (is_nested(svm)) return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK); return ret; @@ -3432,13 +2616,13 @@ static void enable_irq_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - /* - * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes - * 1, because that's a separate STGI/VMRUN intercept. The next time we - * get that intercept, this function will be called again though and - * we'll get the vintr intercept. - */ - if (gif_set(svm) && nested_svm_intr(svm)) { + nested_svm_intr(svm); + + /* In case GIF=0 we can't rely on the CPU to tell us when + * GIF becomes 1, because that's a separate STGI/VMRUN intercept. + * The next time we get that intercept, this function will be + * called again though and we'll get the vintr intercept. */ + if (gif_set(svm)) { svm_set_vintr(svm); svm_inject_irq(svm, 0x0); } @@ -3452,10 +2636,9 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) == HF_NMI_MASK) return; /* IRET will cause a vm exit */ - /* - * Something prevents NMI from been injected. Single step over possible - * problem (IRET or exception injection or interrupt shadow) - */ + /* Something prevents NMI from been injected. 
Single step over + possible problem (IRET or exception injection or interrupt + shadow) */ svm->nmi_singlestep = true; svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); update_db_intercept(vcpu); @@ -3468,12 +2651,7 @@ static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) static void svm_flush_tlb(struct kvm_vcpu *vcpu) { - struct vcpu_svm *svm = to_svm(vcpu); - - if (static_cpu_has(X86_FEATURE_FLUSHBYASID)) - svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; - else - svm->asid_generation--; + force_new_asid(vcpu); } static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) @@ -3484,10 +2662,7 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK)) - return; - - if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) { + if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) { int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; kvm_set_cr8(vcpu, cr8); } @@ -3498,9 +2673,6 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); u64 cr8; - if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK)) - return; - cr8 = kvm_get_cr8(vcpu); svm->vmcb->control.int_ctl &= ~V_TPR_MASK; svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; @@ -3511,14 +2683,9 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) u8 vector; int type; u32 exitintinfo = svm->vmcb->control.exit_int_info; - unsigned int3_injected = svm->int3_injected; - svm->int3_injected = 0; - - if (svm->vcpu.arch.hflags & HF_IRET_MASK) { + if (svm->vcpu.arch.hflags & HF_IRET_MASK) svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); - kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); - } svm->vcpu.arch.nmi_injected = false; kvm_clear_exception_queue(&svm->vcpu); @@ -3527,8 +2694,6 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) if (!(exitintinfo & SVM_EXITINTINFO_VALID)) return; - kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); - vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK; type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK; @@ -3537,25 +2702,18 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) svm->vcpu.arch.nmi_injected = true; break; case SVM_EXITINTINFO_TYPE_EXEPT: - /* - * In case of software exceptions, do not reinject the vector, - * but re-execute the instruction instead. Rewind RIP first - * if we emulated INT3 before. 
- */ - if (kvm_exception_is_soft(vector)) { - if (vector == BP_VECTOR && int3_injected && - kvm_is_linear_rip(&svm->vcpu, svm->int3_rip)) - kvm_rip_write(&svm->vcpu, - kvm_rip_read(&svm->vcpu) - - int3_injected); + /* In case of software exception do not reinject an exception + vector, but re-execute and instruction instead */ + if (is_nested(svm)) + break; + if (kvm_exception_is_soft(vector)) break; - } if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { u32 err = svm->vmcb->control.exit_int_info_err; - kvm_requeue_exception_e(&svm->vcpu, vector, err); + kvm_queue_exception_e(&svm->vcpu, vector, err); } else - kvm_requeue_exception(&svm->vcpu, vector); + kvm_queue_exception(&svm->vcpu, vector); break; case SVM_EXITINTINFO_TYPE_INTR: kvm_queue_interrupt(&svm->vcpu, vector, false); @@ -3565,17 +2723,6 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) } } -static void svm_cancel_injection(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - struct vmcb_control_area *control = &svm->vmcb->control; - - control->exit_int_info = control->event_inj; - control->exit_int_info_err = control->event_inj_err; - control->event_inj = 0; - svm_complete_interrupts(svm); -} - #ifdef CONFIG_X86_64 #define R "r" #else @@ -3585,10 +2732,9 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) static void svm_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - - svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; - svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; - svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; + u16 fs_selector; + u16 gs_selector; + u16 ldt_selector; /* * A vmexit emulation is required before the vcpu can be executed @@ -3597,11 +2743,22 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) if (unlikely(svm->nested.exit_required)) return; + svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; + svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; + svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; + pre_svm_run(svm); sync_lapic_to_cr8(vcpu); + save_host_msrs(vcpu); + fs_selector = kvm_read_fs(); + gs_selector = kvm_read_gs(); + ldt_selector = kvm_read_ldt(); svm->vmcb->save.cr2 = vcpu->arch.cr2; + /* required for live migration with NPT */ + if (npt_enabled) + svm->vmcb->save.cr3 = vcpu->arch.cr3; clgi(); @@ -3678,11 +2835,15 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) #endif ); -#ifdef CONFIG_X86_64 - wrmsrl(MSR_GS_BASE, svm->host.gs_base); -#else - loadsegment(fs, svm->host.fs); -#endif + vcpu->arch.cr2 = svm->vmcb->save.cr2; + vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; + vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; + vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; + + kvm_load_fs(fs_selector); + kvm_load_gs(gs_selector); + kvm_load_ldt(ldt_selector); + load_host_msrs(vcpu); reload_tss(vcpu); @@ -3690,35 +2851,14 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) stgi(); - vcpu->arch.cr2 = svm->vmcb->save.cr2; - vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; - vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; - vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; - sync_cr8_to_lapic(vcpu); svm->next_rip = 0; - svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; - - /* if exit due to PF check for async PF */ - if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) - svm->apf_reason = kvm_read_and_reset_pf_reason(); - if (npt_enabled) { vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR); } - - /* - * We need to handle MC 
intercepts here before the vcpu has a chance to - * change the physical cpu - */ - if (unlikely(svm->vmcb->control.exit_code == - SVM_EXIT_EXCP_BASE + MC_VECTOR)) - svm_handle_mce(svm); - - mark_all_clean(svm->vmcb); } #undef R @@ -3727,23 +2867,14 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) { struct vcpu_svm *svm = to_svm(vcpu); - svm->vmcb->save.cr3 = root; - mark_dirty(svm->vmcb, VMCB_CR); - svm_flush_tlb(vcpu); -} - -static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - svm->vmcb->control.nested_cr3 = root; - mark_dirty(svm->vmcb, VMCB_NPT); - - /* Also sync guest cr3 here in case we live migrate */ - svm->vmcb->save.cr3 = kvm_read_cr3(vcpu); - mark_dirty(svm->vmcb, VMCB_CR); + if (npt_enabled) { + svm->vmcb->control.nested_cr3 = root; + force_new_asid(vcpu); + return; + } - svm_flush_tlb(vcpu); + svm->vmcb->save.cr3 = root; + force_new_asid(vcpu); } static int is_disabled(void) @@ -3778,61 +2909,43 @@ static bool svm_cpu_has_accelerated_tpr(void) return false; } -static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) +static int get_npt_level(void) { - return 0; +#ifdef CONFIG_X86_64 + return PT64_ROOT_LEVEL; +#else + return PT32E_ROOT_LEVEL; +#endif } -static void svm_cpuid_update(struct kvm_vcpu *vcpu) +static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { + return 0; } -static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) +static void svm_cpuid_update(struct kvm_vcpu *vcpu) { - switch (func) { - case 0x80000001: - if (nested) - entry->ecx |= (1 << 2); /* Set SVM bit */ - break; - case 0x8000000A: - entry->eax = 1; /* SVM revision 1 */ - entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper - ASID emulation to nested SVM */ - entry->ecx = 0; /* Reserved */ - entry->edx = 0; /* Per default do not support any - additional features */ - - /* Support next_rip if host supports it */ - if (boot_cpu_has(X86_FEATURE_NRIPS)) - entry->edx |= SVM_FEATURE_NRIP; - - /* Support NPT for the guest if enabled */ - if (npt_enabled) - entry->edx |= SVM_FEATURE_NPT; - - break; - } } static const struct trace_print_flags svm_exit_reasons_str[] = { - { SVM_EXIT_READ_CR0, "read_cr0" }, - { SVM_EXIT_READ_CR3, "read_cr3" }, - { SVM_EXIT_READ_CR4, "read_cr4" }, - { SVM_EXIT_READ_CR8, "read_cr8" }, - { SVM_EXIT_WRITE_CR0, "write_cr0" }, - { SVM_EXIT_WRITE_CR3, "write_cr3" }, - { SVM_EXIT_WRITE_CR4, "write_cr4" }, - { SVM_EXIT_WRITE_CR8, "write_cr8" }, - { SVM_EXIT_READ_DR0, "read_dr0" }, - { SVM_EXIT_READ_DR1, "read_dr1" }, - { SVM_EXIT_READ_DR2, "read_dr2" }, - { SVM_EXIT_READ_DR3, "read_dr3" }, - { SVM_EXIT_WRITE_DR0, "write_dr0" }, - { SVM_EXIT_WRITE_DR1, "write_dr1" }, - { SVM_EXIT_WRITE_DR2, "write_dr2" }, - { SVM_EXIT_WRITE_DR3, "write_dr3" }, - { SVM_EXIT_WRITE_DR5, "write_dr5" }, - { SVM_EXIT_WRITE_DR7, "write_dr7" }, + { SVM_EXIT_READ_CR0, "read_cr0" }, + { SVM_EXIT_READ_CR3, "read_cr3" }, + { SVM_EXIT_READ_CR4, "read_cr4" }, + { SVM_EXIT_READ_CR8, "read_cr8" }, + { SVM_EXIT_WRITE_CR0, "write_cr0" }, + { SVM_EXIT_WRITE_CR3, "write_cr3" }, + { SVM_EXIT_WRITE_CR4, "write_cr4" }, + { SVM_EXIT_WRITE_CR8, "write_cr8" }, + { SVM_EXIT_READ_DR0, "read_dr0" }, + { SVM_EXIT_READ_DR1, "read_dr1" }, + { SVM_EXIT_READ_DR2, "read_dr2" }, + { SVM_EXIT_READ_DR3, "read_dr3" }, + { SVM_EXIT_WRITE_DR0, "write_dr0" }, + { SVM_EXIT_WRITE_DR1, "write_dr1" }, + { SVM_EXIT_WRITE_DR2, "write_dr2" }, + { SVM_EXIT_WRITE_DR3, "write_dr3" }, + { SVM_EXIT_WRITE_DR5, 
"write_dr5" }, + { SVM_EXIT_WRITE_DR7, "write_dr7" }, { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, @@ -3863,7 +2976,6 @@ static const struct trace_print_flags svm_exit_reasons_str[] = { { SVM_EXIT_WBINVD, "wbinvd" }, { SVM_EXIT_MONITOR, "monitor" }, { SVM_EXIT_MWAIT, "mwait" }, - { SVM_EXIT_XSETBV, "xsetbv" }, { SVM_EXIT_NPF, "npf" }, { -1, NULL } }; @@ -3878,17 +2990,12 @@ static bool svm_rdtscp_supported(void) return false; } -static bool svm_has_wbinvd_exit(void) -{ - return true; -} - static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - set_exception_intercept(svm, NM_VECTOR); update_cr0_intercept(svm); + svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR; } static struct kvm_x86_ops svm_x86_ops = { @@ -3918,7 +3025,6 @@ static struct kvm_x86_ops svm_x86_ops = { .get_cpl = svm_get_cpl, .get_cs_db_l_bits = kvm_get_cs_db_l_bits, .decache_cr0_guest_bits = svm_decache_cr0_guest_bits, - .decache_cr3 = svm_decache_cr3, .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, .set_cr0 = svm_set_cr0, .set_cr3 = svm_set_cr3, @@ -3928,7 +3034,8 @@ static struct kvm_x86_ops svm_x86_ops = { .set_idt = svm_set_idt, .get_gdt = svm_get_gdt, .set_gdt = svm_set_gdt, - .set_dr7 = svm_set_dr7, + .get_dr = svm_get_dr, + .set_dr = svm_set_dr, .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, @@ -3946,7 +3053,6 @@ static struct kvm_x86_ops svm_x86_ops = { .set_irq = svm_set_irq, .set_nmi = svm_inject_nmi, .queue_exception = svm_queue_exception, - .cancel_injection = svm_cancel_injection, .interrupt_allowed = svm_interrupt_allowed, .nmi_allowed = svm_nmi_allowed, .get_nmi_mask = svm_get_nmi_mask, @@ -3959,29 +3065,18 @@ static struct kvm_x86_ops svm_x86_ops = { .get_tdp_level = get_npt_level, .get_mt_mask = svm_get_mt_mask, - .get_exit_info = svm_get_exit_info, .exit_reasons_str = svm_exit_reasons_str, - .get_lpage_level = svm_get_lpage_level, .cpuid_update = svm_cpuid_update, .rdtscp_supported = svm_rdtscp_supported, - - .set_supported_cpuid = svm_set_supported_cpuid, - - .has_wbinvd_exit = svm_has_wbinvd_exit, - - .write_tsc_offset = svm_write_tsc_offset, - .adjust_tsc_offset = svm_adjust_tsc_offset, - - .set_tdp_cr3 = set_tdp_cr3, }; static int __init svm_init(void) { return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm), - __alignof__(struct vcpu_svm), THIS_MODULE); + THIS_MODULE); } static void __exit svm_exit(void) diff --git a/linux/x86/timer.c b/linux/x86/timer.c index 1fa2aa0..1a67e02 100644 --- a/linux/x86/timer.c +++ b/linux/x86/timer.c @@ -38,20 +38,6 @@ #endif #endif -/* - * Kernel-based Virtual Machine driver for Linux - * - * This module enables machines with Intel VT-x extensions to run virtual - * machines without emulation or binary translation. - * - * timer support - * - * Copyright 2010 Red Hat, Inc. and/or its affiliates. - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - */ - #include <linux/kvm_host.h> #include <linux/kvm.h> #include <linux/hrtimer.h> @@ -66,13 +52,12 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer) /* * There is a race window between reading and incrementing, but we do * not care about potentially loosing timer events in the !reinject - * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked - * in vcpu_enter_guest. + * case anyway. 
*/ if (ktimer->reinject || !atomic_read(&ktimer->pending)) { atomic_inc(&ktimer->pending); /* FIXME: this code should not know anything about vcpus */ - kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); + set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); } if (waitqueue_active(q)) diff --git a/linux/x86/trace.h b/linux/x86/trace.h index 1357d7c..6ad30a2 100644 --- a/linux/x86/trace.h +++ b/linux/x86/trace.h @@ -5,6 +5,8 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM kvm +#define TRACE_INCLUDE_PATH arch/x86/kvm +#define TRACE_INCLUDE_FILE trace /* * Tracepoint for guest mode entry. @@ -178,36 +180,27 @@ TRACE_EVENT(kvm_apic, #define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val) #define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val) -#define KVM_ISA_VMX 1 -#define KVM_ISA_SVM 2 - /* * Tracepoint for kvm guest exit: */ TRACE_EVENT(kvm_exit, - TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa), - TP_ARGS(exit_reason, vcpu, isa), + TP_PROTO(unsigned int exit_reason, unsigned long guest_rip), + TP_ARGS(exit_reason, guest_rip), TP_STRUCT__entry( __field( unsigned int, exit_reason ) __field( unsigned long, guest_rip ) - __field( u32, isa ) - __field( u64, info1 ) - __field( u64, info2 ) ), TP_fast_assign( __entry->exit_reason = exit_reason; - __entry->guest_rip = kvm_rip_read(vcpu); - __entry->isa = isa; - kvm_x86_ops->get_exit_info(vcpu, &__entry->info1, - &__entry->info2); + __entry->guest_rip = guest_rip; ), - TP_printk("reason %s rip 0x%lx info %llx %llx", + TP_printk("reason %s rip 0x%lx", ftrace_print_symbols_seq(p, __entry->exit_reason, kvm_x86_ops->exit_reasons_str), - __entry->guest_rip, __entry->info1, __entry->info2) + __entry->guest_rip) ); /* @@ -228,38 +221,6 @@ TRACE_EVENT(kvm_inj_virq, TP_printk("irq %u", __entry->irq) ); -#define EXS(x) { x##_VECTOR, "#" #x } - -#define kvm_trace_sym_exc \ - EXS(DE), EXS(DB), EXS(BP), EXS(OF), EXS(BR), EXS(UD), EXS(NM), \ - EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), \ - EXS(MF), EXS(MC) - -/* - * Tracepoint for kvm interrupt injection: - */ -TRACE_EVENT(kvm_inj_exception, - TP_PROTO(unsigned exception, bool has_error, unsigned error_code), - TP_ARGS(exception, has_error, error_code), - - TP_STRUCT__entry( - __field( u8, exception ) - __field( u8, has_error ) - __field( u32, error_code ) - ), - - TP_fast_assign( - __entry->exception = exception; - __entry->has_error = has_error; - __entry->error_code = error_code; - ), - - TP_printk("%s (0x%x)", - __print_symbolic(__entry->exception, kvm_trace_sym_exc), - /* FIXME: don't print error_code if not present */ - __entry->has_error ? __entry->error_code : 0) -); - /* * Tracepoint for page fault. */ @@ -452,34 +413,12 @@ TRACE_EVENT(kvm_nested_vmrun, ), TP_printk("rip: 0x%016llx vmcb: 0x%016llx nrip: 0x%016llx int_ctl: 0x%08x " - "event_inj: 0x%08x npt: %s", + "event_inj: 0x%08x npt: %s\n", __entry->rip, __entry->vmcb, __entry->nested_rip, __entry->int_ctl, __entry->event_inj, __entry->npt ? 
"on" : "off") ); -TRACE_EVENT(kvm_nested_intercepts, - TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions, __u64 intercept), - TP_ARGS(cr_read, cr_write, exceptions, intercept), - - TP_STRUCT__entry( - __field( __u16, cr_read ) - __field( __u16, cr_write ) - __field( __u32, exceptions ) - __field( __u64, intercept ) - ), - - TP_fast_assign( - __entry->cr_read = cr_read; - __entry->cr_write = cr_write; - __entry->exceptions = exceptions; - __entry->intercept = intercept; - ), - - TP_printk("cr_read: %04x cr_write: %04x excp: %08x intercept: %016llx", - __entry->cr_read, __entry->cr_write, __entry->exceptions, - __entry->intercept) -); /* * Tracepoint for #VMEXIT while nested */ @@ -508,7 +447,7 @@ TRACE_EVENT(kvm_nested_vmexit, __entry->exit_int_info_err = exit_int_info_err; ), TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx " - "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x", + "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n", __entry->rip, ftrace_print_symbols_seq(p, __entry->exit_code, kvm_x86_ops->exit_reasons_str), @@ -543,7 +482,7 @@ TRACE_EVENT(kvm_nested_vmexit_inject, ), TP_printk("reason: %s ext_inf1: 0x%016llx " - "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x", + "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n", ftrace_print_symbols_seq(p, __entry->exit_code, kvm_x86_ops->exit_reasons_str), __entry->exit_info1, __entry->exit_info2, @@ -565,7 +504,7 @@ TRACE_EVENT(kvm_nested_intr_vmexit, __entry->rip = rip ), - TP_printk("rip: 0x%016llx", __entry->rip) + TP_printk("rip: 0x%016llx\n", __entry->rip) ); /* @@ -587,7 +526,7 @@ TRACE_EVENT(kvm_invlpga, __entry->address = address; ), - TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx", + TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx\n", __entry->rip, __entry->asid, __entry->address) ); @@ -608,102 +547,11 @@ TRACE_EVENT(kvm_skinit, __entry->slb = slb; ), - TP_printk("rip: 0x%016llx slb: 0x%08x", + TP_printk("rip: 0x%016llx slb: 0x%08x\n", __entry->rip, __entry->slb) ); -#define __print_insn(insn, ilen) ({ \ - int i; \ - const char *ret = p->buffer + p->len; \ - \ - for (i = 0; i < ilen; ++i) \ - trace_seq_printf(p, " %02x", insn[i]); \ - trace_seq_printf(p, "%c", 0); \ - ret; \ - }) - -#define KVM_EMUL_INSN_F_CR0_PE (1 << 0) -#define KVM_EMUL_INSN_F_EFL_VM (1 << 1) -#define KVM_EMUL_INSN_F_CS_D (1 << 2) -#define KVM_EMUL_INSN_F_CS_L (1 << 3) - -#define kvm_trace_symbol_emul_flags \ - { 0, "real" }, \ - { KVM_EMUL_INSN_F_CR0_PE \ - | KVM_EMUL_INSN_F_EFL_VM, "vm16" }, \ - { KVM_EMUL_INSN_F_CR0_PE, "prot16" }, \ - { KVM_EMUL_INSN_F_CR0_PE \ - | KVM_EMUL_INSN_F_CS_D, "prot32" }, \ - { KVM_EMUL_INSN_F_CR0_PE \ - | KVM_EMUL_INSN_F_CS_L, "prot64" } - -#define kei_decode_mode(mode) ({ \ - u8 flags = 0xff; \ - switch (mode) { \ - case X86EMUL_MODE_REAL: \ - flags = 0; \ - break; \ - case X86EMUL_MODE_VM86: \ - flags = KVM_EMUL_INSN_F_EFL_VM; \ - break; \ - case X86EMUL_MODE_PROT16: \ - flags = KVM_EMUL_INSN_F_CR0_PE; \ - break; \ - case X86EMUL_MODE_PROT32: \ - flags = KVM_EMUL_INSN_F_CR0_PE \ - | KVM_EMUL_INSN_F_CS_D; \ - break; \ - case X86EMUL_MODE_PROT64: \ - flags = KVM_EMUL_INSN_F_CR0_PE \ - | KVM_EMUL_INSN_F_CS_L; \ - break; \ - } \ - flags; \ - }) - -TRACE_EVENT(kvm_emulate_insn, - TP_PROTO(struct kvm_vcpu *vcpu, __u8 failed), - TP_ARGS(vcpu, failed), - - TP_STRUCT__entry( - __field( __u64, rip ) - __field( __u32, csbase ) - __field( __u8, len ) - __array( __u8, insn, 15 ) - __field( __u8, flags ) - __field( __u8, failed ) - ), - - TP_fast_assign( - __entry->rip = 
vcpu->arch.emulate_ctxt.decode.fetch.start; - __entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS); - __entry->len = vcpu->arch.emulate_ctxt.decode.eip - - vcpu->arch.emulate_ctxt.decode.fetch.start; - memcpy(__entry->insn, - vcpu->arch.emulate_ctxt.decode.fetch.data, - 15); - __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt.mode); - __entry->failed = failed; - ), - - TP_printk("%x:%llx:%s (%s)%s", - __entry->csbase, __entry->rip, - __print_insn(__entry->insn, __entry->len), - __print_symbolic(__entry->flags, - kvm_trace_symbol_emul_flags), - __entry->failed ? " failed" : "" - ) - ); - -#define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0) -#define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1) - #endif /* _TRACE_KVM_H */ -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH arch/x86/kvm -#undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_FILE trace - /* This part must be outside protection */ #include <trace/define_trace.h> diff --git a/linux/x86/vmx.c b/linux/x86/vmx.c index ed4cf97..e29837e 100644 --- a/linux/x86/vmx.c +++ b/linux/x86/vmx.c @@ -45,7 +45,6 @@ * machines without emulation or binary translation. * * Copyright (C) 2006 Qumranet, Inc. - * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Authors: * Avi Kivity <avi@qumranet.com> @@ -68,7 +67,6 @@ #include <linux/moduleparam.h> #include <linux/ftrace_event.h> #include <linux/slab.h> -#include <linux/tboot.h> #include "kvm_cache_regs.h" #include "x86.h" @@ -77,14 +75,12 @@ #include <asm/vmx.h> #include <asm/virtext.h> #include <asm/mce.h> -#include <asm/i387.h> -#include <asm/xcr.h> #include "trace.h" #define __ex(x) __kvm_handle_fault_on_reboot(x) -MODULE_INFO(version, "kvm-kmod-2.6.38-rc7"); +MODULE_INFO(version, "kvm-kmod-2.6.34"); MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); @@ -107,12 +103,6 @@ module_param_named(unrestricted_guest, static int __read_mostly emulate_invalid_guest_state = 0; module_param(emulate_invalid_guest_state, bool, S_IRUGO); -static int __read_mostly vmm_exclusive = 1; -module_param(vmm_exclusive, bool, S_IRUGO); - -static int __read_mostly yield_on_hlt = 1; -module_param(yield_on_hlt, bool, S_IRUGO); - #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) #define KVM_GUEST_CR0_MASK \ @@ -149,8 +139,6 @@ module_param(ple_gap, int, S_IRUGO); static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; module_param(ple_window, int, S_IRUGO); -#define NR_AUTOLOAD_MSRS 1 - struct vmcs { u32 revision_id; u32 abort; @@ -169,7 +157,6 @@ struct vcpu_vmx { unsigned long host_rsp; int launched; u8 fail; - u32 exit_intr_info; u32 idt_vectoring_info; struct shared_msr_entry *guest_msrs; int nmsrs; @@ -179,11 +166,6 @@ struct vcpu_vmx { u64 msr_guest_kernel_gs_base; #endif struct vmcs *vmcs; - struct msr_autoload { - unsigned nr; - struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS]; - struct vmx_msr_entry host[NR_AUTOLOAD_MSRS]; - } msr_autoload; struct { int loaded; u16 fs_sel, gs_sel, ldt_sel; @@ -199,6 +181,11 @@ struct vcpu_vmx { u32 limit; u32 ar; } tr, es, ds, fs, gs; + struct { + bool pending; + u8 vector; + unsigned rip; + } irq; } rmode; int vpid; bool emulation_required; @@ -219,22 +206,16 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) static int init_rmode(struct kvm *kvm); static u64 construct_eptp(unsigned long root_hpa); -static void kvm_cpu_vmxon(u64 addr); -static void kvm_cpu_vmxoff(void); -static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); static 
DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu); -static DEFINE_PER_CPU(struct kvm_desc_ptr, host_gdt); static unsigned long *vmx_io_bitmap_a; static unsigned long *vmx_io_bitmap_b; static unsigned long *vmx_msr_bitmap_legacy; static unsigned long *vmx_msr_bitmap_longmode; -static bool cpu_has_load_ia32_efer; - static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); static DEFINE_SPINLOCK(vmx_vpid_lock); @@ -283,67 +264,67 @@ static u64 host_efer; static void ept_save_pdptrs(struct kvm_vcpu *vcpu); /* - * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it + * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it * away by decrementing the array size. */ static const u32 vmx_msr_index[] = { #ifdef CONFIG_X86_64 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, #endif - MSR_EFER, MSR_TSC_AUX, MSR_STAR, + MSR_EFER, MSR_TSC_AUX, MSR_K6_STAR, }; #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) -static inline bool is_page_fault(u32 intr_info) +static inline int is_page_fault(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | INTR_INFO_VALID_MASK)) == (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK); } -static inline bool is_no_device(u32 intr_info) +static inline int is_no_device(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | INTR_INFO_VALID_MASK)) == (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); } -static inline bool is_invalid_opcode(u32 intr_info) +static inline int is_invalid_opcode(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | INTR_INFO_VALID_MASK)) == (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK); } -static inline bool is_external_interrupt(u32 intr_info) +static inline int is_external_interrupt(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); } -static inline bool is_machine_check(u32 intr_info) +static inline int is_machine_check(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | INTR_INFO_VALID_MASK)) == (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK); } -static inline bool cpu_has_vmx_msr_bitmap(void) +static inline int cpu_has_vmx_msr_bitmap(void) { return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS; } -static inline bool cpu_has_vmx_tpr_shadow(void) +static inline int cpu_has_vmx_tpr_shadow(void) { return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW; } -static inline bool vm_need_tpr_shadow(struct kvm *kvm) +static inline int vm_need_tpr_shadow(struct kvm *kvm) { return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)); } -static inline bool cpu_has_secondary_exec_ctrls(void) +static inline int cpu_has_secondary_exec_ctrls(void) { return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; @@ -363,105 +344,84 @@ static inline bool cpu_has_vmx_flexpriority(void) static inline bool cpu_has_vmx_ept_execute_only(void) { - return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT; + return !!(vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT); } static inline bool cpu_has_vmx_eptp_uncacheable(void) { - return vmx_capability.ept & VMX_EPTP_UC_BIT; + return !!(vmx_capability.ept & VMX_EPTP_UC_BIT); } static inline bool cpu_has_vmx_eptp_writeback(void) { - return vmx_capability.ept & VMX_EPTP_WB_BIT; + return !!(vmx_capability.ept & 
VMX_EPTP_WB_BIT); } static inline bool cpu_has_vmx_ept_2m_page(void) { - return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT; + return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT); } static inline bool cpu_has_vmx_ept_1g_page(void) { - return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT; -} - -static inline bool cpu_has_vmx_ept_4levels(void) -{ - return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; -} - -static inline bool cpu_has_vmx_invept_individual_addr(void) -{ - return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT; + return !!(vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT); } -static inline bool cpu_has_vmx_invept_context(void) +static inline int cpu_has_vmx_invept_individual_addr(void) { - return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT; + return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT); } -static inline bool cpu_has_vmx_invept_global(void) +static inline int cpu_has_vmx_invept_context(void) { - return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT; + return !!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT); } -static inline bool cpu_has_vmx_invvpid_single(void) +static inline int cpu_has_vmx_invept_global(void) { - return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT; + return !!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT); } -static inline bool cpu_has_vmx_invvpid_global(void) -{ - return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; -} - -static inline bool cpu_has_vmx_ept(void) +static inline int cpu_has_vmx_ept(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_EPT; } -static inline bool cpu_has_vmx_unrestricted_guest(void) +static inline int cpu_has_vmx_unrestricted_guest(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_UNRESTRICTED_GUEST; } -static inline bool cpu_has_vmx_ple(void) +static inline int cpu_has_vmx_ple(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_PAUSE_LOOP_EXITING; } -static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm) +static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) { return flexpriority_enabled && irqchip_in_kernel(kvm); } -static inline bool cpu_has_vmx_vpid(void) +static inline int cpu_has_vmx_vpid(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_VPID; } -static inline bool cpu_has_vmx_rdtscp(void) +static inline int cpu_has_vmx_rdtscp(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_RDTSCP; } -static inline bool cpu_has_virtual_nmis(void) +static inline int cpu_has_virtual_nmis(void) { return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; } -static inline bool cpu_has_vmx_wbinvd_exit(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_WBINVD_EXITING; -} - static inline bool report_flexpriority(void) { return flexpriority_enabled; @@ -519,26 +479,13 @@ static void vmcs_clear(struct vmcs *vmcs) u8 error; asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0" - : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) + : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) : "cc", "memory"); if (error) printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", vmcs, phys_addr); } -static void vmcs_load(struct vmcs *vmcs) -{ - u64 phys_addr = __pa(vmcs); - u8 error; - - asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" - : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) - : "cc", "memory"); - if (error) - printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", - vmcs, phys_addr); -} - static void __vcpu_clear(void *arg) { struct vcpu_vmx *vmx = arg; @@ -548,6 +495,7 @@ 
static void __vcpu_clear(void *arg) vmcs_clear(vmx->vmcs); if (per_cpu(current_vmcs, cpu) == vmx->vmcs) per_cpu(current_vmcs, cpu) = NULL; + rdtscll(vmx->vcpu.arch.host_tsc); list_del(&vmx->local_vcpus_link); vmx->vcpu.cpu = -1; vmx->launched = 0; @@ -560,27 +508,12 @@ static void vcpu_clear(struct vcpu_vmx *vmx) smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1); } -static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) +static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx) { if (vmx->vpid == 0) return; - if (cpu_has_vmx_invvpid_single()) - __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); -} - -static inline void vpid_sync_vcpu_global(void) -{ - if (cpu_has_vmx_invvpid_global()) - __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0); -} - -static inline void vpid_sync_context(struct vcpu_vmx *vmx) -{ - if (cpu_has_vmx_invvpid_single()) - vpid_sync_vcpu_single(vmx); - else - vpid_sync_vcpu_global(); + __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); } static inline void ept_sync_global(void) @@ -612,10 +545,10 @@ static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa) static unsigned long vmcs_readl(unsigned long field) { - unsigned long value = 0; + unsigned long value; asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX) - : "+a"(value) : "d"(field) : "cc"); + : "=a"(value) : "d"(field) : "cc"); return value; } @@ -703,69 +636,16 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) vmcs_write32(EXCEPTION_BITMAP, eb); } -static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) -{ - unsigned i; - struct msr_autoload *m = &vmx->msr_autoload; - - if (msr == MSR_EFER && cpu_has_load_ia32_efer) { - vmcs_clear_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER); - vmcs_clear_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER); - return; - } - - for (i = 0; i < m->nr; ++i) - if (m->guest[i].index == msr) - break; - - if (i == m->nr) - return; - --m->nr; - m->guest[i] = m->guest[m->nr]; - m->host[i] = m->host[m->nr]; - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr); - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr); -} - -static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, - u64 guest_val, u64 host_val) -{ - unsigned i; - struct msr_autoload *m = &vmx->msr_autoload; - - if (msr == MSR_EFER && cpu_has_load_ia32_efer) { - vmcs_write64(GUEST_IA32_EFER, guest_val); - vmcs_write64(HOST_IA32_EFER, host_val); - vmcs_set_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER); - vmcs_set_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER); - return; - } - - for (i = 0; i < m->nr; ++i) - if (m->guest[i].index == msr) - break; - - if (i == m->nr) { - ++m->nr; - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr); - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr); - } - - m->guest[i].index = msr; - m->guest[i].value = guest_val; - m->host[i].index = msr; - m->host[i].value = host_val; -} - static void reload_tss(void) { /* * VT restores TR but not its size. Useless. 
*/ - struct kvm_desc_ptr *gdt = &__get_cpu_var(host_gdt); + struct descriptor_table gdt; struct kvm_desc_struct *descs; - descs = (void *)gdt->address; + kvm_get_gdt(&gdt); + descs = (void *)gdt.base; descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ load_TR_desc(); } @@ -792,56 +672,9 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) guest_efer |= host_efer & ignore_bits; vmx->guest_msrs[efer_offset].data = guest_efer; vmx->guest_msrs[efer_offset].mask = ~ignore_bits; - - clear_atomic_switch_msr(vmx, MSR_EFER); - /* On ept, can't emulate nx, and must switch nx atomically */ - if (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX)) { - guest_efer = vmx->vcpu.arch.efer; - if (!(guest_efer & EFER_LMA)) - guest_efer &= ~EFER_LME; - add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer); - return false; - } - return true; } -static unsigned long segment_base(u16 selector) -{ - struct kvm_desc_ptr *gdt = &__get_cpu_var(host_gdt); - struct kvm_desc_struct *d; - unsigned long table_base; - unsigned long v; - - if (!(selector & ~3)) - return 0; - - table_base = gdt->address; - - if (selector & 4) { /* from ldt */ - u16 ldt_selector = kvm_read_ldt(); - - if (!(ldt_selector & ~3)) - return 0; - - table_base = segment_base(ldt_selector); - } - d = (struct kvm_desc_struct *)(table_base + (selector & ~7)); - v = kvm_get_desc_base(d); -#ifdef CONFIG_X86_64 - if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) - v |= ((unsigned long)((struct kvm_ldttss_desc64 *)d)->base3) << 32; -#endif - return v; -} - -static inline unsigned long kvm_read_tr_base(void) -{ - u16 tr; - asm("str %0" : "=g"(tr)); - return segment_base(tr); -} - static void vmx_save_host_state(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -857,7 +690,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) */ vmx->host_state.ldt_sel = kvm_read_ldt(); vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; - savesegment(fs, vmx->host_state.fs_sel); + vmx->host_state.fs_sel = kvm_read_fs(); if (!(vmx->host_state.fs_sel & 7)) { vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); vmx->host_state.fs_reload_needed = 0; @@ -865,7 +698,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) vmcs_write16(HOST_FS_SELECTOR, 0); vmx->host_state.fs_reload_needed = 1; } - savesegment(gs, vmx->host_state.gs_sel); + vmx->host_state.gs_sel = kvm_read_gs(); if (!(vmx->host_state.gs_sel & 7)) vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); else { @@ -882,9 +715,10 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) #endif #ifdef CONFIG_X86_64 - rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); - if (is_long_mode(&vmx->vcpu)) + if (is_long_mode(&vmx->vcpu)) { + rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + } #endif for (i = 0; i < vmx->save_nmsrs; ++i) kvm_set_shared_msr(vmx->guest_msrs[i].index, @@ -894,32 +728,35 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) static void __vmx_load_host_state(struct vcpu_vmx *vmx) { + unsigned long flags; + if (!vmx->host_state.loaded) return; ++vmx->vcpu.stat.host_state_reload; vmx->host_state.loaded = 0; -#ifdef CONFIG_X86_64 - if (is_long_mode(&vmx->vcpu)) - rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); -#endif + if (vmx->host_state.fs_reload_needed) + kvm_load_fs(vmx->host_state.fs_sel); if (vmx->host_state.gs_ldt_reload_needed) { kvm_load_ldt(vmx->host_state.ldt_sel); + /* + * If we have to reload gs, we must 
take care to + * preserve our gs base. + */ + local_irq_save(flags); + kvm_load_gs(vmx->host_state.gs_sel); #ifdef CONFIG_X86_64 - load_gs_index(vmx->host_state.gs_sel); -#else - loadsegment(gs, vmx->host_state.gs_sel); + wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); #endif + local_irq_restore(flags); } - if (vmx->host_state.fs_reload_needed) - loadsegment(fs, vmx->host_state.fs_sel); reload_tss(); #ifdef CONFIG_X86_64 - wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); + if (is_long_mode(&vmx->vcpu)) { + rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); + } #endif - if (current_thread_info()->status & TS_USEDFPU) - clts(); - kvm_load_gdt(&__get_cpu_var(host_gdt)); } static void vmx_load_host_state(struct vcpu_vmx *vmx) @@ -936,47 +773,62 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx) static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); + u64 phys_addr = __pa(vmx->vmcs); + u64 tsc_this, delta, new_offset; - if (!vmm_exclusive) - kvm_cpu_vmxon(phys_addr); - else if (vcpu->cpu != cpu) + if (vcpu->cpu != cpu) { vcpu_clear(vmx); + kvm_migrate_timers(vcpu); + set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests); + local_irq_disable(); + list_add(&vmx->local_vcpus_link, + &per_cpu(vcpus_on_cpu, cpu)); + local_irq_enable(); + } if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { + u8 error; + per_cpu(current_vmcs, cpu) = vmx->vmcs; - vmcs_load(vmx->vmcs); + asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" + : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) + : "cc"); + if (error) + printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", + vmx->vmcs, phys_addr); } if (vcpu->cpu != cpu) { - struct kvm_desc_ptr *gdt = &__get_cpu_var(host_gdt); + struct descriptor_table dt; unsigned long sysenter_esp; - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); - local_irq_disable(); - list_add(&vmx->local_vcpus_link, - &per_cpu(vcpus_on_cpu, cpu)); - local_irq_enable(); - + vcpu->cpu = cpu; /* * Linux uses per-cpu TSS and GDT, so set these when switching * processors. */ vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */ - vmcs_writel(HOST_GDTR_BASE, gdt->address); /* 22.2.4 */ + kvm_get_gdt(&dt); + vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */ rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ + + /* + * Make sure the time stamp counter is monotonous. 
+ */ + rdtscll(tsc_this); + if (tsc_this < vcpu->arch.host_tsc) { + delta = vcpu->arch.host_tsc - tsc_this; + new_offset = vmcs_read64(TSC_OFFSET) + delta; + vmcs_write64(TSC_OFFSET, new_offset); + } } } static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { __vmx_load_host_state(to_vmx(vcpu)); - if (!vmm_exclusive) { - __vcpu_clear(to_vmx(vcpu)); - kvm_cpu_vmxoff(); - } } static void vmx_fpu_activate(struct kvm_vcpu *vcpu) @@ -1035,9 +887,9 @@ static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) int ret = 0; if (interruptibility & GUEST_INTR_STATE_STI) - ret |= KVM_X86_SHADOW_INT_STI; + ret |= X86_SHADOW_INT_STI; if (interruptibility & GUEST_INTR_STATE_MOV_SS) - ret |= KVM_X86_SHADOW_INT_MOV_SS; + ret |= X86_SHADOW_INT_MOV_SS; return ret & mask; } @@ -1049,9 +901,9 @@ static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS); - if (mask & KVM_X86_SHADOW_INT_MOV_SS) + if (mask & X86_SHADOW_INT_MOV_SS) interruptibility |= GUEST_INTR_STATE_MOV_SS; - else if (mask & KVM_X86_SHADOW_INT_STI) + if (mask & X86_SHADOW_INT_STI) interruptibility |= GUEST_INTR_STATE_STI; if ((interruptibility != interruptibility_old)) @@ -1070,20 +922,8 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) vmx_set_interrupt_shadow(vcpu, 0); } -static void vmx_clear_hlt(struct kvm_vcpu *vcpu) -{ - /* Ensure that we clear the HLT state in the VMCS. We don't need to - * explicitly skip the instruction because if the HLT state is set, then - * the instruction is already executing and RIP has already been - * advanced. */ - if (!yield_on_hlt && - vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT) - vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); -} - static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, - bool has_error_code, u32 error_code, - bool reinject) + bool has_error_code, u32 error_code) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 intr_info = nr | INTR_INFO_VALID_MASK; @@ -1094,8 +934,16 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, } if (vmx->rmode.vm86_active) { - if (kvm_inject_realmode_interrupt(vcpu, nr) != EMULATE_DONE) - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + vmx->rmode.irq.pending = true; + vmx->rmode.irq.vector = nr; + vmx->rmode.irq.rip = kvm_rip_read(vcpu); + if (kvm_exception_is_soft(nr)) + vmx->rmode.irq.rip += + vmx->vcpu.arch.event_exit_inst_len; + intr_info |= INTR_TYPE_SOFT_INTR; + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); + kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); return; } @@ -1107,7 +955,6 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, intr_info |= INTR_TYPE_HARD_EXCEPTION; vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); - vmx_clear_hlt(vcpu); } static bool vmx_rdtscp_supported(void) @@ -1154,10 +1001,10 @@ static void setup_msrs(struct vcpu_vmx *vmx) if (index >= 0 && vmx->rdtscp_enabled) move_msr_up(vmx, index, save_nmsrs++); /* - * MSR_STAR is only needed on long mode guests, and only + * MSR_K6_STAR is only needed on long mode guests, and only * if efer.sce is enabled. 
*/ - index = __find_msr_index(vmx, MSR_STAR); + index = __find_msr_index(vmx, MSR_K6_STAR); if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE)) move_msr_up(vmx, index, save_nmsrs++); } @@ -1192,17 +1039,12 @@ static u64 guest_read_tsc(void) } /* - * writes 'offset' into guest's timestamp counter offset register + * writes 'guest_tsc' into guest's timestamp counter "register" + * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc */ -static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) -{ - vmcs_write64(TSC_OFFSET, offset); -} - -static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) +static void guest_write_tsc(u64 guest_tsc, u64 host_tsc) { - u64 offset = vmcs_read64(TSC_OFFSET); - vmcs_write64(TSC_OFFSET, offset + adjustment); + vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc); } /* @@ -1275,6 +1117,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct shared_msr_entry *msr; + u64 host_tsc; int ret = 0; switch (msr_index) { @@ -1304,7 +1147,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) vmcs_writel(GUEST_SYSENTER_ESP, data); break; case MSR_IA32_TSC: - kvm_write_tsc(vcpu, data); + rdtscll(host_tsc); + guest_write_tsc(data, host_tsc); break; case MSR_IA32_CR_PAT: if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { @@ -1373,58 +1217,37 @@ static __init int vmx_disabled_by_bios(void) u64 msr; rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); - if (msr & FEATURE_CONTROL_LOCKED) { - if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) - && kvm_tboot_enabled()) - return 1; - if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) - && !kvm_tboot_enabled()) { - printk(KERN_WARNING "kvm: disable TXT in the BIOS or " - " activate TXT before enabling KVM\n"); - return 1; - } - } - - return 0; + return (msr & (FEATURE_CONTROL_LOCKED | + FEATURE_CONTROL_VMXON_ENABLED)) + == FEATURE_CONTROL_LOCKED; /* locked but not enabled */ } -static void kvm_cpu_vmxon(u64 addr) -{ - asm volatile (ASM_VMX_VMXON_RAX - : : "a"(&addr), "m"(addr) - : "memory", "cc"); -} - static int hardware_enable(void *garbage) { int cpu = raw_smp_processor_id(); u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); - u64 old, test_bits; + u64 old; if (read_cr4() & X86_CR4_VMXE) return -EBUSY; INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); rdmsrl(MSR_IA32_FEATURE_CONTROL, old); - - test_bits = FEATURE_CONTROL_LOCKED; - test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; - if (kvm_tboot_enabled()) - test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX; - - if ((old & test_bits) != test_bits) { + if ((old & (FEATURE_CONTROL_LOCKED | + FEATURE_CONTROL_VMXON_ENABLED)) + != (FEATURE_CONTROL_LOCKED | + FEATURE_CONTROL_VMXON_ENABLED)) /* enable and lock */ - wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); - } + wrmsrl(MSR_IA32_FEATURE_CONTROL, old | + FEATURE_CONTROL_LOCKED | + FEATURE_CONTROL_VMXON_ENABLED); write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ + asm volatile (ASM_VMX_VMXON_RAX + : : "a"(&phys_addr), "m"(phys_addr) + : "memory", "cc"); - if (vmm_exclusive) { - kvm_cpu_vmxon(phys_addr); - ept_sync_global(); - } - - kvm_store_gdt(&__get_cpu_var(host_gdt)); + ept_sync_global(); return 0; } @@ -1446,15 +1269,13 @@ static void vmclear_local_vcpus(void) static void kvm_cpu_vmxoff(void) { asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); + write_cr4(read_cr4() & ~X86_CR4_VMXE); } static void hardware_disable(void *garbage) { - if (vmm_exclusive) { - vmclear_local_vcpus(); - 
kvm_cpu_vmxoff(); - } - write_cr4(read_cr4() & ~X86_CR4_VMXE); + vmclear_local_vcpus(); + kvm_cpu_vmxoff(); } static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, @@ -1476,14 +1297,6 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, return 0; } -static __init bool allow_1_setting(u32 msr, u32 ctl) -{ - u32 vmx_msr_low, vmx_msr_high; - - rdmsr(msr, vmx_msr_low, vmx_msr_high); - return vmx_msr_high & ctl; -} - static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) { u32 vmx_msr_low, vmx_msr_high; @@ -1500,7 +1313,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) &_pin_based_exec_control) < 0) return -EIO; - min = + min = CPU_BASED_HLT_EXITING | #ifdef CONFIG_X86_64 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | @@ -1513,10 +1326,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) CPU_BASED_MWAIT_EXITING | CPU_BASED_MONITOR_EXITING | CPU_BASED_INVLPG_EXITING; - - if (yield_on_hlt) - min |= CPU_BASED_HLT_EXITING; - opt = CPU_BASED_TPR_SHADOW | CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; @@ -1598,12 +1407,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) vmcs_conf->vmexit_ctrl = _vmexit_control; vmcs_conf->vmentry_ctrl = _vmentry_control; - cpu_has_load_ia32_efer = - allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS, - VM_ENTRY_LOAD_IA32_EFER) - && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS, - VM_EXIT_LOAD_IA32_EFER); - return 0; } @@ -1671,8 +1474,7 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_vpid()) enable_vpid = 0; - if (!cpu_has_vmx_ept() || - !cpu_has_vmx_ept_4levels()) { + if (!cpu_has_vmx_ept()) { enable_ept = 0; enable_unrestricted_guest = 0; } @@ -1760,8 +1562,8 @@ static gva_t rmode_tss_base(struct kvm *kvm) struct kvm_memslots *slots; gfn_t base_gfn; - slots = kvm_memslots(kvm); - base_gfn = slots->memslots[0].base_gfn + + slots = rcu_dereference(kvm->memslots); + base_gfn = kvm->memslots->memslots[0].base_gfn + kvm->memslots->memslots[0].npages - 3; return base_gfn << PAGE_SHIFT; } @@ -1777,13 +1579,9 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save) save->limit = vmcs_read32(sf->limit); save->ar = vmcs_read32(sf->ar_bytes); vmcs_write16(sf->selector, save->base >> 4); - vmcs_write32(sf->base, save->base & 0xffff0); + vmcs_write32(sf->base, save->base & 0xfffff); vmcs_write32(sf->limit, 0xffff); vmcs_write32(sf->ar_bytes, 0xf3); - if (save->base & 0xf) - printk_once(KERN_WARNING "kvm: segment base is not paragraph" - " aligned when entering protected mode (seg=%d)", - seg); } static void enter_rmode(struct kvm_vcpu *vcpu) @@ -1881,27 +1679,26 @@ static void enter_lmode(struct kvm_vcpu *vcpu) (guest_tr_ar & ~AR_TYPE_MASK) | AR_TYPE_BUSY_64_TSS); } - vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA); + vcpu->arch.efer |= EFER_LMA; + vmx_set_efer(vcpu, vcpu->arch.efer); } static void exit_lmode(struct kvm_vcpu *vcpu) { + vcpu->arch.efer &= ~EFER_LMA; + vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS) & ~VM_ENTRY_IA32E_MODE); - vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA); } #endif static void vmx_flush_tlb(struct kvm_vcpu *vcpu) { - vpid_sync_context(to_vmx(vcpu)); - if (enable_ept) { - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) - return; + vpid_sync_vcpu_all(to_vmx(vcpu)); + if (enable_ept) ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); - } } static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) @@ -1912,13 +1709,6 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) vcpu->arch.cr0 |= 
vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; } -static void vmx_decache_cr3(struct kvm_vcpu *vcpu) -{ - if (enable_ept && is_paging(vcpu)) - vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); -} - static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) { ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; @@ -1934,20 +1724,20 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu) return; if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { - vmcs_write64(GUEST_PDPTR0, vcpu->arch.mmu.pdptrs[0]); - vmcs_write64(GUEST_PDPTR1, vcpu->arch.mmu.pdptrs[1]); - vmcs_write64(GUEST_PDPTR2, vcpu->arch.mmu.pdptrs[2]); - vmcs_write64(GUEST_PDPTR3, vcpu->arch.mmu.pdptrs[3]); + vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]); + vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]); + vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]); + vmcs_write64(GUEST_PDPTR3, vcpu->arch.pdptrs[3]); } } static void ept_save_pdptrs(struct kvm_vcpu *vcpu) { if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { - vcpu->arch.mmu.pdptrs[0] = vmcs_read64(GUEST_PDPTR0); - vcpu->arch.mmu.pdptrs[1] = vmcs_read64(GUEST_PDPTR1); - vcpu->arch.mmu.pdptrs[2] = vmcs_read64(GUEST_PDPTR2); - vcpu->arch.mmu.pdptrs[3] = vmcs_read64(GUEST_PDPTR3); + vcpu->arch.pdptrs[0] = vmcs_read64(GUEST_PDPTR0); + vcpu->arch.pdptrs[1] = vmcs_read64(GUEST_PDPTR1); + vcpu->arch.pdptrs[2] = vmcs_read64(GUEST_PDPTR2); + vcpu->arch.pdptrs[3] = vmcs_read64(GUEST_PDPTR3); } __set_bit(VCPU_EXREG_PDPTR, @@ -1962,7 +1752,6 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, unsigned long cr0, struct kvm_vcpu *vcpu) { - vmx_decache_cr3(vcpu); if (!(cr0 & X86_CR0_PG)) { /* From paging/starting to nonpaging */ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, @@ -2043,7 +1832,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (enable_ept) { eptp = construct_eptp(cr3); vmcs_write64(EPT_POINTER, eptp); - guest_cr3 = is_paging(vcpu) ? kvm_read_cr3(vcpu) : + guest_cr3 = is_paging(vcpu) ? 
vcpu->arch.cr3 : vcpu->kvm->arch.ept_identity_map_addr; ept_load_pdptrs(vcpu); } @@ -2186,28 +1975,28 @@ static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) *l = (ar >> 13) & 1; } -static void vmx_get_idt(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt) +static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) { - dt->size = vmcs_read32(GUEST_IDTR_LIMIT); - dt->address = vmcs_readl(GUEST_IDTR_BASE); + dt->limit = vmcs_read32(GUEST_IDTR_LIMIT); + dt->base = vmcs_readl(GUEST_IDTR_BASE); } -static void vmx_set_idt(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt) +static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) { - vmcs_write32(GUEST_IDTR_LIMIT, dt->size); - vmcs_writel(GUEST_IDTR_BASE, dt->address); + vmcs_write32(GUEST_IDTR_LIMIT, dt->limit); + vmcs_writel(GUEST_IDTR_BASE, dt->base); } -static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt) +static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) { - dt->size = vmcs_read32(GUEST_GDTR_LIMIT); - dt->address = vmcs_readl(GUEST_GDTR_BASE); + dt->limit = vmcs_read32(GUEST_GDTR_LIMIT); + dt->base = vmcs_readl(GUEST_GDTR_BASE); } -static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct kvm_desc_ptr *dt) +static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) { - vmcs_write32(GUEST_GDTR_LIMIT, dt->size); - vmcs_writel(GUEST_GDTR_BASE, dt->address); + vmcs_write32(GUEST_GDTR_LIMIT, dt->limit); + vmcs_writel(GUEST_GDTR_BASE, dt->base); } static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) @@ -2548,16 +2337,6 @@ static void allocate_vpid(struct vcpu_vmx *vmx) spin_unlock(&vmx_vpid_lock); } -static void free_vpid(struct vcpu_vmx *vmx) -{ - if (!enable_vpid) - return; - spin_lock(&vmx_vpid_lock); - if (vmx->vpid != 0) - __clear_bit(vmx->vpid, vmx_vpid_bitmap); - spin_unlock(&vmx_vpid_lock); -} - static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr) { int f = sizeof(unsigned long); @@ -2594,9 +2373,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) { u32 host_sysenter_cs, msr_low, msr_high; u32 junk; - u64 host_pat; + u64 host_pat, tsc_this, tsc_base; unsigned long a; - struct kvm_desc_ptr dt; + struct descriptor_table dt; int i; unsigned long kvm_vmx_return; u32 exec_control; @@ -2655,15 +2434,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ - vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */ + vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */ vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ - vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ - vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ + vmcs_write16(HOST_FS_SELECTOR, kvm_read_fs()); /* 22.2.4 */ + vmcs_write16(HOST_GS_SELECTOR, kvm_read_gs()); /* 22.2.4 */ vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ #ifdef CONFIG_X86_64 rdmsrl(MSR_FS_BASE, a); @@ -2677,16 +2456,14 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ - kvm_native_store_idt(&dt); - vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ + kvm_get_idt(&dt); + vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ asm("mov $.Lkvm_vmx_return, 
%0" : "=r"(kvm_vmx_return)); vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */ vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); - vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); - vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk); vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs); @@ -2735,34 +2512,33 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); - kvm_write_tsc(&vmx->vcpu, 0); + tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; + rdtscll(tsc_this); + if (tsc_this < vmx->vcpu.kvm->arch.vm_init_tsc) + tsc_base = tsc_this; + + guest_write_tsc(0, tsc_base); return 0; } static int init_rmode(struct kvm *kvm) { - int idx, ret = 0; - - idx = srcu_read_lock(&kvm->srcu); if (!init_rmode_tss(kvm)) - goto exit; + return 0; if (!init_rmode_identity_map(kvm)) - goto exit; - - ret = 1; -exit: - srcu_read_unlock(&kvm->srcu, idx); - return ret; + return 0; + return 1; } static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 msr; - int ret; + int ret, idx; vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); + idx = kvm_srcu_read_lock(&vcpu->kvm->srcu); if (!init_rmode(vmx->vcpu.kvm)) { ret = -ENOMEM; goto out; @@ -2779,9 +2555,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) msr |= MSR_IA32_APICBASE_BSP; kvm_set_apic_base(&vmx->vcpu, msr); - ret = fx_init(&vmx->vcpu); - if (ret != 0) - goto out; + fx_init(&vmx->vcpu); seg_setup(VCPU_SREG_CS); /* @@ -2831,7 +2605,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_IDTR_BASE, 0); vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); - vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); + vmcs_write32(GUEST_ACTIVITY_STATE, 0); vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); @@ -2864,7 +2638,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx_fpu_activate(&vmx->vcpu); update_exception_bitmap(&vmx->vcpu); - vpid_sync_context(vmx); + vpid_sync_vcpu_all(vmx); ret = 0; @@ -2872,6 +2646,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx->emulation_required = 0; out: + kvm_srcu_read_unlock(&vcpu->kvm->srcu, idx); return ret; } @@ -2893,10 +2668,6 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) return; } - if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { - enable_irq_window(vcpu); - return; - } cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); @@ -2912,8 +2683,16 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu) ++vcpu->stat.irq_injections; if (vmx->rmode.vm86_active) { - if (kvm_inject_realmode_interrupt(vcpu, irq) != EMULATE_DONE) - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + vmx->rmode.irq.pending = true; + vmx->rmode.irq.vector = irq; + vmx->rmode.irq.rip = kvm_rip_read(vcpu); + if (vcpu->arch.interrupt.soft) + vmx->rmode.irq.rip += + vmx->vcpu.arch.event_exit_inst_len; + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); + kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); return; } intr = irq | INTR_INFO_VALID_MASK; @@ -2924,7 +2703,6 @@ static void vmx_inject_irq(struct kvm_vcpu 
*vcpu) } else intr |= INTR_TYPE_EXT_INTR; vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); - vmx_clear_hlt(vcpu); } static void vmx_inject_nmi(struct kvm_vcpu *vcpu) @@ -2946,13 +2724,18 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) ++vcpu->stat.nmi_injections; if (vmx->rmode.vm86_active) { - if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR) != EMULATE_DONE) - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + vmx->rmode.irq.pending = true; + vmx->rmode.irq.vector = NMI_VECTOR; + vmx->rmode.irq.rip = kvm_rip_read(vcpu); + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + NMI_VECTOR | INTR_TYPE_SOFT_INTR | + INTR_INFO_VALID_MASK); + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); + kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); return; } vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); - vmx_clear_hlt(vcpu); } static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) @@ -2961,15 +2744,16 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) return 0; return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & - (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI - | GUEST_INTR_STATE_NMI)); + (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_NMI)); } static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) { if (!cpu_has_virtual_nmis()) return to_vmx(vcpu)->soft_vnmi_blocked; - return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; + else + return !!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + GUEST_INTR_STATE_NMI); } static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) @@ -3023,7 +2807,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, * Cause the #SS fault with 0 error code in VM86 mode. */ if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) - if (emulate_instruction(vcpu, 0) == EMULATE_DONE) + if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE) return 1; /* * Forward all other exceptions that are valid in real mode. 
@@ -3120,7 +2904,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) } if (is_invalid_opcode(intr_info)) { - er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD); + er = emulate_instruction(vcpu, 0, 0, EMULTYPE_TRAP_UD); if (er != EMULATE_DONE) kvm_queue_exception(vcpu, UD_VECTOR); return 1; @@ -3139,7 +2923,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) if (kvm_event_needs_reinjection(vcpu)) kvm_mmu_unprotect_page_virt(vcpu, cr2); - return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0); + return kvm_mmu_page_fault(vcpu, cr2, error_code); } if (vmx->rmode.vm86_active && @@ -3204,20 +2988,22 @@ static int handle_io(struct kvm_vcpu *vcpu) int size, in, string; unsigned port; + ++vcpu->stat.io_exits; exit_qualification = vmcs_readl(EXIT_QUALIFICATION); string = (exit_qualification & 16) != 0; - in = (exit_qualification & 8) != 0; - ++vcpu->stat.io_exits; - - if (string || in) - return emulate_instruction(vcpu, 0) == EMULATE_DONE; + if (string) { + if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO) + return 0; + return 1; + } - port = exit_qualification >> 16; size = (exit_qualification & 7) + 1; - skip_emulated_instruction(vcpu); + in = (exit_qualification & 8) != 0; + port = exit_qualification >> 16; - return kvm_fast_pio_out(vcpu, size, port); + skip_emulated_instruction(vcpu); + return kvm_emulate_pio(vcpu, in, size, port); } static void @@ -3236,7 +3022,6 @@ static int handle_cr(struct kvm_vcpu *vcpu) unsigned long exit_qualification, val; int cr; int reg; - int err; exit_qualification = vmcs_readl(EXIT_QUALIFICATION); cr = exit_qualification & 15; @@ -3247,22 +3032,22 @@ static int handle_cr(struct kvm_vcpu *vcpu) trace_kvm_cr_write(cr, val); switch (cr) { case 0: - err = kvm_set_cr0(vcpu, val); - kvm_complete_insn_gp(vcpu, err); + kvm_set_cr0(vcpu, val); + skip_emulated_instruction(vcpu); return 1; case 3: - err = kvm_set_cr3(vcpu, val); - kvm_complete_insn_gp(vcpu, err); + kvm_set_cr3(vcpu, val); + skip_emulated_instruction(vcpu); return 1; case 4: - err = kvm_set_cr4(vcpu, val); - kvm_complete_insn_gp(vcpu, err); + kvm_set_cr4(vcpu, val); + skip_emulated_instruction(vcpu); return 1; case 8: { u8 cr8_prev = kvm_get_cr8(vcpu); u8 cr8 = kvm_register_read(vcpu, reg); - err = kvm_set_cr8(vcpu, cr8); - kvm_complete_insn_gp(vcpu, err); + kvm_set_cr8(vcpu, cr8); + skip_emulated_instruction(vcpu); if (irqchip_in_kernel(vcpu->kvm)) return 1; if (cr8_prev <= cr8) @@ -3281,9 +3066,8 @@ static int handle_cr(struct kvm_vcpu *vcpu) case 1: /*mov from cr*/ switch (cr) { case 3: - val = kvm_read_cr3(vcpu); - kvm_register_write(vcpu, reg, val); - trace_kvm_cr_read(cr, val); + kvm_register_write(vcpu, reg, vcpu->arch.cr3); + trace_kvm_cr_read(cr, vcpu->arch.cr3); skip_emulated_instruction(vcpu); return 1; case 8: @@ -3310,9 +3094,19 @@ static int handle_cr(struct kvm_vcpu *vcpu) return 0; } +static int check_dr_alias(struct kvm_vcpu *vcpu) +{ + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return -1; + } + return 0; +} + static int handle_dr(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; + unsigned long val; int dr, reg; /* Do not handle if the CPL > 0, will trigger GP on re-entry */ @@ -3347,20 +3141,67 @@ static int handle_dr(struct kvm_vcpu *vcpu) dr = exit_qualification & DEBUG_REG_ACCESS_NUM; reg = DEBUG_REG_ACCESS_REG(exit_qualification); if (exit_qualification & TYPE_MOV_FROM_DR) { - unsigned long val; - if (!kvm_get_dr(vcpu, dr, &val)) - kvm_register_write(vcpu, reg, val); - } else - kvm_set_dr(vcpu, dr, 
vcpu->arch.regs[reg]); + switch (dr) { + case 0 ... 3: + val = vcpu->arch.db[dr]; + break; + case 4: + if (check_dr_alias(vcpu) < 0) + return 1; + /* fall through */ + case 6: + val = vcpu->arch.dr6; + break; + case 5: + if (check_dr_alias(vcpu) < 0) + return 1; + /* fall through */ + default: /* 7 */ + val = vcpu->arch.dr7; + break; + } + kvm_register_write(vcpu, reg, val); + } else { + val = vcpu->arch.regs[reg]; + switch (dr) { + case 0 ... 3: + vcpu->arch.db[dr] = val; + if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) + vcpu->arch.eff_db[dr] = val; + break; + case 4: + if (check_dr_alias(vcpu) < 0) + return 1; + /* fall through */ + case 6: + if (val & 0xffffffff00000000ULL) { + kvm_inject_gp(vcpu, 0); + return 1; + } + vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; + break; + case 5: + if (check_dr_alias(vcpu) < 0) + return 1; + /* fall through */ + default: /* 7 */ + if (val & 0xffffffff00000000ULL) { + kvm_inject_gp(vcpu, 0); + return 1; + } + vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; + if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { + vmcs_writel(GUEST_DR7, vcpu->arch.dr7); + vcpu->arch.switch_db_regs = + (val & DR7_BP_EN_MASK); + } + break; + } + } skip_emulated_instruction(vcpu); return 1; } -static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) -{ - vmcs_writel(GUEST_DR7, val); -} - static int handle_cpuid(struct kvm_vcpu *vcpu) { kvm_emulate_cpuid(vcpu); @@ -3406,7 +3247,6 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu) static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) { - kvm_make_request(KVM_REQ_EVENT, vcpu); return 1; } @@ -3419,8 +3259,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu) cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); - kvm_make_request(KVM_REQ_EVENT, vcpu); - ++vcpu->stat.irq_window_exits; /* @@ -3455,11 +3293,6 @@ static int handle_vmx_insn(struct kvm_vcpu *vcpu) return 1; } -static int handle_invd(struct kvm_vcpu *vcpu) -{ - return emulate_instruction(vcpu, 0) == EMULATE_DONE; -} - static int handle_invlpg(struct kvm_vcpu *vcpu) { unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); @@ -3472,31 +3305,34 @@ static int handle_invlpg(struct kvm_vcpu *vcpu) static int handle_wbinvd(struct kvm_vcpu *vcpu) { skip_emulated_instruction(vcpu); - kvm_emulate_wbinvd(vcpu); + /* TODO: Add support for VT-d/pass-through device */ return 1; } -static int handle_xsetbv(struct kvm_vcpu *vcpu) +static int handle_apic_access(struct kvm_vcpu *vcpu) { - u64 new_bv = kvm_read_edx_eax(vcpu); - u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX); + unsigned long exit_qualification; + enum emulation_result er; + unsigned long offset; - if (kvm_set_xcr(vcpu, index, new_bv) == 0) - skip_emulated_instruction(vcpu); - return 1; -} + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + offset = exit_qualification & 0xffful; -static int handle_apic_access(struct kvm_vcpu *vcpu) -{ - return emulate_instruction(vcpu, 0) == EMULATE_DONE; + er = emulate_instruction(vcpu, 0, 0, 0); + + if (er != EMULATE_DONE) { + printk(KERN_ERR + "Fail to handle apic access vmexit! 
Offset is 0x%lx\n", + offset); + return -ENOEXEC; + } + return 1; } static int handle_task_switch(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long exit_qualification; - bool has_error_code = false; - u32 error_code = 0; u16 tss_selector; int reason, type, idt_v; @@ -3519,13 +3355,6 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) kvm_clear_interrupt_queue(vcpu); break; case INTR_TYPE_HARD_EXCEPTION: - if (vmx->idt_vectoring_info & - VECTORING_INFO_DELIVER_CODE_MASK) { - has_error_code = true; - error_code = - vmcs_read32(IDT_VECTORING_ERROR_CODE); - } - /* fall through */ case INTR_TYPE_SOFT_EXCEPTION: kvm_clear_exception_queue(vcpu); break; @@ -3540,13 +3369,8 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) type != INTR_TYPE_NMI_INTR)) skip_emulated_instruction(vcpu); - if (kvm_task_switch(vcpu, tss_selector, reason, - has_error_code, error_code) == EMULATE_FAIL) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; + if (!kvm_task_switch(vcpu, tss_selector, reason)) return 0; - } /* clear all local breakpoint enable flags */ vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55); @@ -3587,7 +3411,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); trace_kvm_page_fault(gpa, exit_qualification); - return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0); + return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0); } static u64 ept_rsvd_mask(u64 spte, int level) @@ -3682,7 +3506,6 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu) cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); ++vcpu->stat.nmi_window_exits; - kvm_make_request(KVM_REQ_EVENT, vcpu); return 1; } @@ -3692,26 +3515,22 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); enum emulation_result err = EMULATE_DONE; int ret = 1; - u32 cpu_exec_ctrl; - bool intr_window_requested; - - cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; while (!guest_state_valid(vcpu)) { - if (intr_window_requested - && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF)) - return handle_interrupt_window(&vmx->vcpu); - - err = emulate_instruction(vcpu, 0); + err = emulate_instruction(vcpu, 0, 0, 0); if (err == EMULATE_DO_MMIO) { ret = 0; goto out; } - if (err != EMULATE_DONE) - return 0; + if (err != EMULATE_DONE) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + ret = 0; + goto out; + } if (signal_pending(current)) goto out; @@ -3760,7 +3579,6 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_MSR_WRITE] = handle_wrmsr, [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, [EXIT_REASON_HLT] = handle_halt, - [EXIT_REASON_INVD] = handle_invd, [EXIT_REASON_INVLPG] = handle_invlpg, [EXIT_REASON_VMCALL] = handle_vmcall, [EXIT_REASON_VMCLEAR] = handle_vmx_insn, @@ -3775,7 +3593,6 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, [EXIT_REASON_APIC_ACCESS] = handle_apic_access, [EXIT_REASON_WBINVD] = handle_wbinvd, - [EXIT_REASON_XSETBV] = handle_xsetbv, [EXIT_REASON_TASK_SWITCH] = handle_task_switch, [EXIT_REASON_MCE_DURING_VMENTRY] = 
handle_machine_check, [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, @@ -3788,12 +3605,6 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { static const int kvm_vmx_max_exit_handlers = ARRAY_SIZE(kvm_vmx_exit_handlers); -static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) -{ - *info1 = vmcs_readl(EXIT_QUALIFICATION); - *info2 = vmcs_read32(VM_EXIT_INTR_INFO); -} - /* * The guest has exited. See if we can fix it or if we need userspace * assistance. @@ -3804,18 +3615,16 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) u32 exit_reason = vmx->exit_reason; u32 vectoring_info = vmx->idt_vectoring_info; - trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX); + trace_kvm_exit(exit_reason, kvm_rip_read(vcpu)); /* If guest state is invalid, start emulating */ if (vmx->emulation_required && emulate_invalid_guest_state) return handle_invalid_guest_state(vcpu); - if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { - vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; - vcpu->run->fail_entry.hardware_entry_failure_reason - = exit_reason; - return 0; - } + /* Access CR3 don't cause VMExit in paging mode, so we need + * to sync with guest real CR3. */ + if (enable_ept && is_paging(vcpu)) + vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); if (unlikely(vmx->fail)) { vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; @@ -3870,9 +3679,18 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) vmcs_write32(TPR_THRESHOLD, irr); } -static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) +static void vmx_complete_interrupts(struct vcpu_vmx *vmx) { - u32 exit_intr_info = vmx->exit_intr_info; + u32 exit_intr_info; + u32 idt_vectoring_info = vmx->idt_vectoring_info; + bool unblock_nmi; + u8 vector; + int type; + bool idtv_info_valid; + + exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + + vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); /* Handle machine checks before interrupts are enabled */ if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY) @@ -3882,21 +3700,10 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) /* We need to handle NMIs before interrupts are enabled */ if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && - (exit_intr_info & INTR_INFO_VALID_MASK)) { - kvm_before_handle_nmi(&vmx->vcpu); + (exit_intr_info & INTR_INFO_VALID_MASK)) asm("int $2"); - kvm_after_handle_nmi(&vmx->vcpu); - } -} - -static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) -{ - u32 exit_intr_info = vmx->exit_intr_info; - bool unblock_nmi; - u8 vector; - bool idtv_info_valid; - idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; + idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; if (cpu_has_virtual_nmis()) { unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; @@ -3918,18 +3725,6 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) } else if (unlikely(vmx->soft_vnmi_blocked)) vmx->vnmi_blocked_time += ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); -} - -static void __vmx_complete_interrupts(struct vcpu_vmx *vmx, - u32 idt_vectoring_info, - int instr_len_field, - int error_code_field) -{ - u8 vector; - int type; - bool idtv_info_valid; - - idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; vmx->vcpu.arch.nmi_injected = false; kvm_clear_exception_queue(&vmx->vcpu); @@ -3938,8 +3733,6 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx, if (!idtv_info_valid) return; - kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); - vector = idt_vectoring_info & 
VECTORING_INFO_VECTOR_MASK; type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; @@ -3956,18 +3749,18 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx, break; case INTR_TYPE_SOFT_EXCEPTION: vmx->vcpu.arch.event_exit_inst_len = - vmcs_read32(instr_len_field); + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); /* fall through */ case INTR_TYPE_HARD_EXCEPTION: if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { - u32 err = vmcs_read32(error_code_field); + u32 err = vmcs_read32(IDT_VECTORING_ERROR_CODE); kvm_queue_exception_e(&vmx->vcpu, vector, err); } else kvm_queue_exception(&vmx->vcpu, vector); break; case INTR_TYPE_SOFT_INTR: vmx->vcpu.arch.event_exit_inst_len = - vmcs_read32(instr_len_field); + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); /* fall through */ case INTR_TYPE_EXT_INTR: kvm_queue_interrupt(&vmx->vcpu, vector, @@ -3978,21 +3771,27 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx, } } -static void vmx_complete_interrupts(struct vcpu_vmx *vmx) -{ - __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info, - VM_EXIT_INSTRUCTION_LEN, - IDT_VECTORING_ERROR_CODE); -} - -static void vmx_cancel_injection(struct kvm_vcpu *vcpu) +/* + * Failure to inject an interrupt should give us the information + * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs + * when fetching the interrupt redirection bitmap in the real-mode + * tss, this doesn't happen. So we do it ourselves. + */ +static void fixup_rmode_irq(struct vcpu_vmx *vmx) { - __vmx_complete_interrupts(to_vmx(vcpu), - vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), - VM_ENTRY_INSTRUCTION_LEN, - VM_ENTRY_EXCEPTION_ERROR_CODE); - - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); + vmx->rmode.irq.pending = 0; + if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip) + return; + kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip); + if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { + vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; + vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR; + return; + } + vmx->idt_vectoring_info = + VECTORING_INFO_VALID_MASK + | INTR_TYPE_EXT_INTR + | vmx->rmode.irq.vector; } #ifdef CONFIG_X86_64 @@ -4029,6 +3828,11 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) vmx_set_interrupt_shadow(vcpu, 0); + /* + * Loading guest fpu may have cleared host cr0.ts + */ + vmcs_writel(HOST_CR0, read_cr0()); + asm( /* Store host registers */ "push %%"R"dx; push %%"R"bp;" @@ -4119,27 +3923,23 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) #endif [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) : "cc", "memory" - , R"ax", R"bx", R"di", R"si" + , R"bx", R"di", R"si" #ifdef CONFIG_X86_64 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" #endif ); vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) - | (1 << VCPU_EXREG_PDPTR) - | (1 << VCPU_EXREG_CR3)); + | (1 << VCPU_EXREG_PDPTR)); vcpu->arch.regs_dirty = 0; vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); + if (vmx->rmode.irq.pending) + fixup_rmode_irq(vmx); asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); vmx->launched = 1; - vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); - vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - - vmx_complete_atomic_exit(vmx); - vmx_recover_nmi_blocking(vmx); vmx_complete_interrupts(vmx); } @@ -4161,26 +3961,16 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - free_vpid(vmx); + spin_lock(&vmx_vpid_lock); + if (vmx->vpid != 0) + __clear_bit(vmx->vpid, vmx_vpid_bitmap); + 
spin_unlock(&vmx_vpid_lock); vmx_free_vmcs(vcpu); kfree(vmx->guest_msrs); kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, vmx); } -static inline void vmcs_init(struct vmcs *vmcs) -{ - u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id())); - - if (!vmm_exclusive) - kvm_cpu_vmxon(phys_addr); - - vmcs_clear(vmcs); - - if (!vmm_exclusive) - kvm_cpu_vmxoff(); -} - static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) { int err; @@ -4206,11 +3996,10 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (!vmx->vmcs) goto free_msrs; - vmcs_init(vmx->vmcs); + vmcs_clear(vmx->vmcs); cpu = get_cpu(); vmx_vcpu_load(&vmx->vcpu, cpu); - vmx->vcpu.cpu = cpu; err = vmx_vcpu_setup(vmx); vmx_vcpu_put(&vmx->vcpu); put_cpu(); @@ -4237,7 +4026,6 @@ free_msrs: uninit_vcpu: kvm_vcpu_uninit(&vmx->vcpu); free_vcpu: - free_vpid(vmx); kmem_cache_free(kvm_vcpu_cache, vmx); return ERR_PTR(err); } @@ -4341,6 +4129,11 @@ static int vmx_get_lpage_level(void) return PT_PDPE_LEVEL; } +static inline u32 bit(int bitno) +{ + return 1 << (bitno & 31); +} + static void vmx_cpuid_update(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -4363,10 +4156,6 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) } } -static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) -{ -} - static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -4394,7 +4183,6 @@ static struct kvm_x86_ops vmx_x86_ops = { .get_cpl = vmx_get_cpl, .get_cs_db_l_bits = vmx_get_cs_db_l_bits, .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, - .decache_cr3 = vmx_decache_cr3, .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, .set_cr0 = vmx_set_cr0, .set_cr3 = vmx_set_cr3, @@ -4404,7 +4192,6 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_idt = vmx_set_idt, .get_gdt = vmx_get_gdt, .set_gdt = vmx_set_gdt, - .set_dr7 = vmx_set_dr7, .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, @@ -4422,7 +4209,6 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_irq = vmx_inject_irq, .set_nmi = vmx_inject_nmi, .queue_exception = vmx_queue_exception, - .cancel_injection = vmx_cancel_injection, .interrupt_allowed = vmx_interrupt_allowed, .nmi_allowed = vmx_nmi_allowed, .get_nmi_mask = vmx_get_nmi_mask, @@ -4435,23 +4221,12 @@ static struct kvm_x86_ops vmx_x86_ops = { .get_tdp_level = get_ept_level, .get_mt_mask = vmx_get_mt_mask, - .get_exit_info = vmx_get_exit_info, .exit_reasons_str = vmx_exit_reasons_str, - .get_lpage_level = vmx_get_lpage_level, .cpuid_update = vmx_cpuid_update, .rdtscp_supported = vmx_rdtscp_supported, - - .set_supported_cpuid = vmx_set_supported_cpuid, - - .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, - - .write_tsc_offset = vmx_write_tsc_offset, - .adjust_tsc_offset = vmx_adjust_tsc_offset, - - .set_tdp_cr3 = vmx_set_cr3, }; static int __init vmx_init(void) @@ -4499,8 +4274,7 @@ static int __init vmx_init(void) set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ - r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), - __alignof__(struct vcpu_vmx), THIS_MODULE); + r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); if (r) goto out3; @@ -4513,6 +4287,8 @@ static int __init vmx_init(void) if (enable_ept) { bypass_guest_pf = 0; + kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | + VMX_EPT_WRITABLE_MASK); kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, VMX_EPT_EXECUTABLE_MASK); kvm_enable_tdp(); diff --git a/linux/x86/x86.c b/linux/x86/x86.c 
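[Illustrative note, not part of the patch] The vmx_free_vcpu() hunk above open-codes what the removed free_vpid() helper did: VPIDs come from a global bitmap guarded by vmx_vpid_lock, VPID 0 is reserved for the host (see the set_bit(0, vmx_vpid_bitmap) call in vmx_init() above), and freeing a vcpu simply clears its bit again under the lock. The stand-alone sketch below models that allocator in plain user-space C so the locking/bitmap shape is easier to see; the bitmap size, the pthread mutex, and the allocation loop are assumptions for illustration only — the allocation side is not shown in this hunk at all.

#include <limits.h>
#include <pthread.h>
#include <stdio.h>

#define NR_VPIDS   (1 << 16)                     /* VPID is a 16-bit field */
#define WORD_BITS  (sizeof(unsigned long) * CHAR_BIT)

static unsigned long vpid_bitmap[NR_VPIDS / WORD_BITS];
static pthread_mutex_t vpid_lock = PTHREAD_MUTEX_INITIALIZER;

/* Reserve VPID 0 for the host, mirroring set_bit(0, vmx_vpid_bitmap). */
static void vpid_init(void)
{
	vpid_bitmap[0] |= 1UL;
}

/* Hand out the first free VPID, or 0 if the space is exhausted
 * (a simplified stand-in for the kernel's allocate_vpid()). */
static unsigned alloc_vpid(void)
{
	unsigned vpid = 0;

	pthread_mutex_lock(&vpid_lock);
	for (unsigned i = 1; i < NR_VPIDS; i++) {
		if (!(vpid_bitmap[i / WORD_BITS] & (1UL << (i % WORD_BITS)))) {
			vpid_bitmap[i / WORD_BITS] |= 1UL << (i % WORD_BITS);
			vpid = i;
			break;
		}
	}
	pthread_mutex_unlock(&vpid_lock);
	return vpid;
}

/* Free side: the same shape as the vmx_free_vcpu() hunk above —
 * take the lock, clear the bit only for a non-zero VPID, drop the lock. */
static void free_vpid(unsigned vpid)
{
	pthread_mutex_lock(&vpid_lock);
	if (vpid != 0)
		vpid_bitmap[vpid / WORD_BITS] &= ~(1UL << (vpid % WORD_BITS));
	pthread_mutex_unlock(&vpid_lock);
}

int main(void)
{
	vpid_init();
	unsigned v = alloc_vpid();
	printf("allocated vpid %u\n", v);    /* prints 1 on the first call */
	free_vpid(v);
	return 0;
}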
index a6233d2..1071014 100644 --- a/linux/x86/x86.c +++ b/linux/x86/x86.c @@ -46,7 +46,6 @@ * Copyright (C) 2006 Qumranet, Inc. * Copyright (C) 2008 Qumranet, Inc. * Copyright IBM Corporation, 2008 - * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Authors: * Avi Kivity <avi@qumranet.com> @@ -81,23 +80,17 @@ #include <linux/user-return-notifier.h> #include <linux/srcu.h> #include <linux/slab.h> -#include <linux/perf_event.h> -#include <linux/uaccess.h> -#include <linux/hash.h> #include <trace/events/kvm.h> - +#undef TRACE_INCLUDE_FILE #define CREATE_TRACE_POINTS #include "trace.h" #include <asm/debugreg.h> +#include <asm/uaccess.h> #include <asm/msr.h> #include <asm/desc.h> #include <asm/mtrr.h> #include <asm/mce.h> -#include <asm/i387.h> -#include <asm/xcr.h> -#include <asm/pvclock.h> -#include <asm/div64.h> #define MAX_IO_MSRS 256 #define CR0_RESERVED_BITS \ @@ -108,13 +101,12 @@ (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ - | X86_CR4_OSXSAVE \ | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) #define KVM_MAX_MCE_BANKS 32 -#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P) +#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P /* EFER defaults: * - enable syscall per default because its emulated by KVM @@ -172,7 +164,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "halt_exits", VCPU_STAT(halt_exits) }, { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "hypercalls", VCPU_STAT(hypercalls) }, - { "request_irq", VCPU_STAT(request_irq_exits) }, + { "kvm_request_irq", VCPU_STAT(request_irq_exits) }, { "irq_exits", VCPU_STAT(irq_exits) }, { "host_state_reload", VCPU_STAT(host_state_reload) }, { "efer_reload", VCPU_STAT(efer_reload) }, @@ -194,15 +186,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { NULL } }; -u64 __read_mostly host_xcr0; - -static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) -{ - int i; - for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++) - vcpu->arch.apf.gfns[i] = ~0; -} - static void kvm_on_user_return(struct kvm_user_return_notifier *urn) { unsigned slot; @@ -280,6 +263,34 @@ static void drop_user_return_notifiers(void *ignore) kvm_on_user_return(&smsr->urn); } +unsigned long segment_base(u16 selector) +{ + struct descriptor_table gdt; + struct kvm_desc_struct *d; + unsigned long table_base; + unsigned long v; + + if (selector == 0) + return 0; + + kvm_get_gdt(&gdt); + table_base = gdt.base; + + if (selector & 4) { /* from ldt */ + u16 ldt_selector = kvm_read_ldt(); + + table_base = segment_base(ldt_selector); + } + d = (struct kvm_desc_struct *)(table_base + (selector & ~7)); + v = kvm_get_desc_base(d); +#ifdef CONFIG_X86_64 + if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) + v |= ((unsigned long)((struct kvm_ldttss_desc64 *)d)->base3) << 32; +#endif + return v; +} +EXPORT_SYMBOL_GPL(segment_base); + u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) { if (irqchip_in_kernel(vcpu->kvm)) @@ -321,21 +332,17 @@ static int exception_class(int vector) } static void kvm_multiple_exception(struct kvm_vcpu *vcpu, - unsigned nr, bool has_error, u32 error_code, - bool reinject) + unsigned nr, bool has_error, u32 error_code) { u32 prev_nr; int class1, class2; - kvm_make_request(KVM_REQ_EVENT, vcpu); - if (!vcpu->arch.exception.pending) { queue: vcpu->arch.exception.pending = true; vcpu->arch.exception.has_error_code = has_error; vcpu->arch.exception.nr = nr; 
vcpu->arch.exception.error_code = error_code; - vcpu->arch.exception.reinject = reinject; return; } @@ -343,7 +350,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, prev_nr = vcpu->arch.exception.nr; if (prev_nr == DF_VECTOR) { /* triple fault -> shutdown */ - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); return; } class1 = exception_class(prev_nr); @@ -364,59 +371,30 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) { - kvm_multiple_exception(vcpu, nr, false, 0, false); + kvm_multiple_exception(vcpu, nr, false, 0); } EXPORT_SYMBOL_GPL(kvm_queue_exception); -void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) -{ - kvm_multiple_exception(vcpu, nr, false, 0, true); -} -EXPORT_SYMBOL_GPL(kvm_requeue_exception); - -void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) -{ - if (err) - kvm_inject_gp(vcpu, 0); - else - kvm_x86_ops->skip_emulated_instruction(vcpu); -} -EXPORT_SYMBOL_GPL(kvm_complete_insn_gp); - -void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) +void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, + u32 error_code) { ++vcpu->stat.pf_guest; - vcpu->arch.cr2 = fault->address; - kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code); -} - -void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) -{ - if (mmu_is_nested(vcpu) && !fault->nested_page_fault) - vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault); - else - vcpu->arch.mmu.inject_page_fault(vcpu, fault); + vcpu->arch.cr2 = addr; + kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); } void kvm_inject_nmi(struct kvm_vcpu *vcpu) { - kvm_make_request(KVM_REQ_EVENT, vcpu); vcpu->arch.nmi_pending = 1; } EXPORT_SYMBOL_GPL(kvm_inject_nmi); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) { - kvm_multiple_exception(vcpu, nr, true, error_code, false); + kvm_multiple_exception(vcpu, nr, true, error_code); } EXPORT_SYMBOL_GPL(kvm_queue_exception_e); -void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) -{ - kvm_multiple_exception(vcpu, nr, true, error_code, true); -} -EXPORT_SYMBOL_GPL(kvm_requeue_exception_e); - /* * Checks if cpl <= required_cpl; if true, return true. Otherwise queue * a #GP and return false. @@ -431,49 +409,18 @@ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) EXPORT_SYMBOL_GPL(kvm_require_cpl); /* - * This function will be used to read from the physical memory of the currently - * running guest. The difference to kvm_read_guest_page is that this function - * can read from guest physical or from the guest's guest physical memory. - */ -int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - gfn_t ngfn, void *data, int offset, int len, - u32 access) -{ - gfn_t real_gfn; - gpa_t ngpa; - - ngpa = gfn_to_gpa(ngfn); - real_gfn = mmu->translate_gpa(vcpu, ngpa, access); - if (real_gfn == UNMAPPED_GVA) - return -EFAULT; - - real_gfn = gpa_to_gfn(real_gfn); - - return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len); -} -EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); - -int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, - void *data, int offset, int len, u32 access) -{ - return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn, - data, offset, len, access); -} - -/* * Load the pae pdptrs. Return true is they are all valid. 
*/ -int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) +int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) { gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; int i; int ret; - u64 pdpte[ARRAY_SIZE(mmu->pdptrs)]; + u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; - ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte, - offset * sizeof(u64), sizeof(pdpte), - PFERR_USER_MASK|PFERR_WRITE_MASK); + ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, + offset * sizeof(u64), sizeof(pdpte)); if (ret < 0) { ret = 0; goto out; @@ -487,7 +434,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) } ret = 1; - memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); + memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); __set_bit(VCPU_EXREG_PDPTR, (unsigned long *)&vcpu->arch.regs_avail); __set_bit(VCPU_EXREG_PDPTR, @@ -500,10 +447,8 @@ EXPORT_SYMBOL_GPL(load_pdptrs); static bool pdptrs_changed(struct kvm_vcpu *vcpu) { - u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)]; + u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; bool changed = true; - int offset; - gfn_t gfn; int r; if (is_long_mode(vcpu) || !is_pae(vcpu)) @@ -513,181 +458,132 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu) (unsigned long *)&vcpu->arch.regs_avail)) return true; - gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT; - offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1); - r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte), - PFERR_USER_MASK | PFERR_WRITE_MASK); + r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); if (r < 0) goto out; - changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0; + changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; out: return changed; } -int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { - unsigned long old_cr0 = kvm_read_cr0(vcpu); - unsigned long update_bits = X86_CR0_PG | X86_CR0_WP | - X86_CR0_CD | X86_CR0_NW; - cr0 |= X86_CR0_ET; #ifdef CONFIG_X86_64 - if (cr0 & 0xffffffff00000000UL) - return 1; + if (cr0 & 0xffffffff00000000UL) { + kvm_inject_gp(vcpu, 0); + return; + } #endif cr0 &= ~CR0_RESERVED_BITS; - if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) - return 1; + if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { + kvm_inject_gp(vcpu, 0); + return; + } - if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) - return 1; + if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { + kvm_inject_gp(vcpu, 0); + return; + } if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { #ifdef CONFIG_X86_64 if ((vcpu->arch.efer & EFER_LME)) { int cs_db, cs_l; - if (!is_pae(vcpu)) - return 1; + if (!is_pae(vcpu)) { + kvm_inject_gp(vcpu, 0); + return; + } kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - if (cs_l) - return 1; + if (cs_l) { + kvm_inject_gp(vcpu, 0); + return; + + } } else #endif - if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, - kvm_read_cr3(vcpu))) - return 1; + if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { + kvm_inject_gp(vcpu, 0); + return; + } + } kvm_x86_ops->set_cr0(vcpu, cr0); + vcpu->arch.cr0 = cr0; - if ((cr0 ^ old_cr0) & X86_CR0_PG) - kvm_clear_async_pf_completion_queue(vcpu); - - if ((cr0 ^ old_cr0) & update_bits) - kvm_mmu_reset_context(vcpu); - return 0; + kvm_mmu_reset_context(vcpu); + return; } EXPORT_SYMBOL_GPL(kvm_set_cr0); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) { - (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, 
~0x0eul) | (msw & 0x0f)); + kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0ful) | (msw & 0x0f)); } EXPORT_SYMBOL_GPL(kvm_lmsw); -int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) +void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - u64 xcr0; - - /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */ - if (index != XCR_XFEATURE_ENABLED_MASK) - return 1; - xcr0 = xcr; - if (kvm_x86_ops->get_cpl(vcpu) != 0) - return 1; - if (!(xcr0 & XSTATE_FP)) - return 1; - if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) - return 1; - if (xcr0 & ~host_xcr0) - return 1; - vcpu->arch.xcr0 = xcr0; - vcpu->guest_xcr0_loaded = 0; - return 0; -} + unsigned long old_cr4 = kvm_read_cr4(vcpu); + unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; -int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) -{ - if (__kvm_set_xcr(vcpu, index, xcr)) { + if (cr4 & CR4_RESERVED_BITS) { kvm_inject_gp(vcpu, 0); - return 1; - } - return 0; -} -EXPORT_SYMBOL_GPL(kvm_set_xcr); - -static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best; - - best = kvm_find_cpuid_entry(vcpu, 1, 0); - return best && (best->ecx & bit(X86_FEATURE_XSAVE)); -} - -static void update_cpuid(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best; - - best = kvm_find_cpuid_entry(vcpu, 1, 0); - if (!best) return; - - /* Update OSXSAVE bit */ - if (kvm_cpu_has_xsave && best->function == 0x1) { - best->ecx &= ~(bit(X86_FEATURE_OSXSAVE)); - if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) - best->ecx |= bit(X86_FEATURE_OSXSAVE); } -} - -int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) -{ - unsigned long old_cr4 = kvm_read_cr4(vcpu); - unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; - - if (cr4 & CR4_RESERVED_BITS) - return 1; - - if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE)) - return 1; if (is_long_mode(vcpu)) { - if (!(cr4 & X86_CR4_PAE)) - return 1; + if (!(cr4 & X86_CR4_PAE)) { + kvm_inject_gp(vcpu, 0); + return; + } } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) && ((cr4 ^ old_cr4) & pdptr_bits) - && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, - kvm_read_cr3(vcpu))) - return 1; - - if (cr4 & X86_CR4_VMXE) - return 1; + && !load_pdptrs(vcpu, vcpu->arch.cr3)) { + kvm_inject_gp(vcpu, 0); + return; + } + if (cr4 & X86_CR4_VMXE) { + kvm_inject_gp(vcpu, 0); + return; + } kvm_x86_ops->set_cr4(vcpu, cr4); - - if ((cr4 ^ old_cr4) & pdptr_bits) - kvm_mmu_reset_context(vcpu); - - if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) - update_cpuid(vcpu); - - return 0; + vcpu->arch.cr4 = cr4; + vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled; + kvm_mmu_reset_context(vcpu); } EXPORT_SYMBOL_GPL(kvm_set_cr4); -int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { - if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) { + if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { kvm_mmu_sync_roots(vcpu); kvm_mmu_flush_tlb(vcpu); - return 0; + return; } if (is_long_mode(vcpu)) { - if (cr3 & CR3_L_MODE_RESERVED_BITS) - return 1; + if (cr3 & CR3_L_MODE_RESERVED_BITS) { + kvm_inject_gp(vcpu, 0); + return; + } } else { if (is_pae(vcpu)) { - if (cr3 & CR3_PAE_RESERVED_BITS) - return 1; - if (is_paging(vcpu) && - !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) - return 1; + if (cr3 & CR3_PAE_RESERVED_BITS) { + kvm_inject_gp(vcpu, 0); + return; + } + if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { + kvm_inject_gp(vcpu, 0); + return; + } } /* * We don't check reserved bits in nonpae mode, 
because @@ -705,23 +601,24 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) * to debug) behavior on the guest side. */ if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) - return 1; - vcpu->arch.cr3 = cr3; - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); - vcpu->arch.mmu.new_cr3(vcpu); - return 0; + kvm_inject_gp(vcpu, 0); + else { + vcpu->arch.cr3 = cr3; + vcpu->arch.mmu.new_cr3(vcpu); + } } EXPORT_SYMBOL_GPL(kvm_set_cr3); -int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) +void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) { - if (cr8 & CR8_RESERVED_BITS) - return 1; + if (cr8 & CR8_RESERVED_BITS) { + kvm_inject_gp(vcpu, 0); + return; + } if (irqchip_in_kernel(vcpu->kvm)) kvm_lapic_set_tpr(vcpu, cr8); else vcpu->arch.cr8 = cr8; - return 0; } EXPORT_SYMBOL_GPL(kvm_set_cr8); @@ -734,90 +631,11 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_get_cr8); -static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) +static inline u32 bit(int bitno) { - switch (dr) { - case 0 ... 3: - vcpu->arch.db[dr] = val; - if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) - vcpu->arch.eff_db[dr] = val; - break; - case 4: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) - return 1; /* #UD */ - /* fall through */ - case 6: - if (val & 0xffffffff00000000ULL) - return -1; /* #GP */ - vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; - break; - case 5: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) - return 1; /* #UD */ - /* fall through */ - default: /* 7 */ - if (val & 0xffffffff00000000ULL) - return -1; /* #GP */ - vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; - if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { - kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7); - vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK); - } - break; - } - - return 0; + return 1 << (bitno & 31); } -int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) -{ - int res; - - res = __kvm_set_dr(vcpu, dr, val); - if (res > 0) - kvm_queue_exception(vcpu, UD_VECTOR); - else if (res < 0) - kvm_inject_gp(vcpu, 0); - - return res; -} -EXPORT_SYMBOL_GPL(kvm_set_dr); - -static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) -{ - switch (dr) { - case 0 ... 3: - *val = vcpu->arch.db[dr]; - break; - case 4: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) - return 1; - /* fall through */ - case 6: - *val = vcpu->arch.dr6; - break; - case 5: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) - return 1; - /* fall through */ - default: /* 7 */ - *val = vcpu->arch.dr7; - break; - } - - return 0; -} - -int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) -{ - if (_kvm_get_dr(vcpu, dr, val)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } - return 0; -} -EXPORT_SYMBOL_GPL(kvm_get_dr); - /* * List of msr numbers which we expose to userspace through KVM_GET_MSRS * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. @@ -827,67 +645,67 @@ EXPORT_SYMBOL_GPL(kvm_get_dr); * kvm-specific. Those are put in the beginning of the list. 
*/ -#define KVM_SAVE_MSRS_BEGIN 8 +#define KVM_SAVE_MSRS_BEGIN 5 static u32 msrs_to_save[] = { MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, - MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, - HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, + HV_X64_MSR_APIC_ASSIST_PAGE, MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, - MSR_STAR, + MSR_K6_STAR, #ifdef CONFIG_X86_64 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif - MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA + MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA }; static unsigned num_msrs_to_save; static u32 emulated_msrs[] = { MSR_IA32_MISC_ENABLE, - MSR_IA32_MCG_STATUS, - MSR_IA32_MCG_CTL, }; -static int set_efer(struct kvm_vcpu *vcpu, u64 efer) +static void set_efer(struct kvm_vcpu *vcpu, u64 efer) { - u64 old_efer = vcpu->arch.efer; - - if (efer & efer_reserved_bits) - return 1; + if (efer & efer_reserved_bits) { + kvm_inject_gp(vcpu, 0); + return; + } if (is_paging(vcpu) - && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) - return 1; + && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) { + kvm_inject_gp(vcpu, 0); + return; + } if (efer & EFER_FFXSR) { struct kvm_cpuid_entry2 *feat; feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); - if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) - return 1; + if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) { + kvm_inject_gp(vcpu, 0); + return; + } } if (efer & EFER_SVME) { struct kvm_cpuid_entry2 *feat; feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); - if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) - return 1; + if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) { + kvm_inject_gp(vcpu, 0); + return; + } } + kvm_x86_ops->set_efer(vcpu, efer); + efer &= ~EFER_LMA; efer |= vcpu->arch.efer & EFER_LMA; - kvm_x86_ops->set_efer(vcpu, efer); + vcpu->arch.efer = efer; vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; - - /* Update reserved bits */ - if ((efer ^ old_efer) & EFER_NX) - kvm_mmu_reset_context(vcpu); - - return 0; + kvm_mmu_reset_context(vcpu); } void kvm_enable_efer_bits(u64 mask) @@ -917,28 +735,20 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) { - int version; - int r; + static int version; struct pvclock_wall_clock wc; struct timespec boot; if (!wall_clock) return; - r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version)); - if (r) - return; - - if (version & 1) - ++version; /* first time write, random junk */ - - ++version; + version++; kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); /* * The guest calculates current wall clock time by adding - * system time (updated by kvm_guest_time_update below) to the + * system time (updated by kvm_write_guest_time below) to the * wall clock specified here. guest system time equals host * system time for us, thus we must fill in host boot time here. 
*/ @@ -966,230 +776,64 @@ static uint32_t div_frac(uint32_t dividend, uint32_t divisor) return quotient; } -static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz, - s8 *pshift, u32 *pmultiplier) +static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock) { - uint64_t scaled64; + uint64_t nsecs = 1000000000LL; int32_t shift = 0; uint64_t tps64; uint32_t tps32; - tps64 = base_khz * 1000LL; - scaled64 = scaled_khz * 1000LL; - while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) { + tps64 = tsc_khz * 1000LL; + while (tps64 > nsecs*2) { tps64 >>= 1; shift--; } tps32 = (uint32_t)tps64; - while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) { - if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000) - scaled64 >>= 1; - else - tps32 <<= 1; + while (tps32 <= (uint32_t)nsecs) { + tps32 <<= 1; shift++; } - *pshift = shift; - *pmultiplier = div_frac(scaled64, tps32); + hv_clock->tsc_shift = shift; + hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); - pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n", - __func__, base_khz, scaled_khz, shift, *pmultiplier); -} - -static inline u64 get_kernel_ns(void) -{ - struct timespec ts; - - WARN_ON(preemptible()); - ktime_get_ts(&ts); - kvm_monotonic_to_bootbased(&ts); - return timespec_to_ns(&ts); + pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", + __func__, tsc_khz, hv_clock->tsc_shift, + hv_clock->tsc_to_system_mul); } static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); -unsigned long max_tsc_khz; - -static inline int kvm_tsc_changes_freq(void) -{ - int cpu = get_cpu(); - int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && - cpufreq_quick_get(cpu) != 0; - put_cpu(); - return ret; -} - -static inline u64 nsec_to_cycles(u64 nsec) -{ - u64 ret; - - WARN_ON(preemptible()); - if (kvm_tsc_changes_freq()) - printk_once(KERN_WARNING - "kvm: unreliable cycle conversion on adjustable rate TSC\n"); - ret = nsec * kvm___this_cpu_read(cpu_tsc_khz); - do_div(ret, USEC_PER_SEC); - return ret; -} -static void kvm_arch_set_tsc_khz(struct kvm *kvm, u32 this_tsc_khz) -{ - /* Compute a scale to convert nanoseconds in TSC cycles */ - kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, - &kvm->arch.virtual_tsc_shift, - &kvm->arch.virtual_tsc_mult); - kvm->arch.virtual_tsc_khz = this_tsc_khz; -} - -static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) -{ - u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec, - vcpu->kvm->arch.virtual_tsc_mult, - vcpu->kvm->arch.virtual_tsc_shift); - tsc += vcpu->arch.last_tsc_write; - return tsc; -} - -void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) -{ - struct kvm *kvm = vcpu->kvm; - u64 offset, ns, elapsed; - unsigned long flags; - s64 sdiff; - - spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); - offset = data - kvm_native_read_tsc(); - ns = get_kernel_ns(); - elapsed = ns - kvm->arch.last_tsc_nsec; - sdiff = data - kvm->arch.last_tsc_write; - if (sdiff < 0) - sdiff = -sdiff; - - /* - * Special case: close write to TSC within 5 seconds of - * another CPU is interpreted as an attempt to synchronize - * The 5 seconds is to accomodate host load / swapping as - * well as any reset of TSC during the boot process. - * - * In that case, for a reliable TSC, we can match TSC offsets, - * or make a best guest using elapsed value. 
- */ - if (sdiff < nsec_to_cycles(5ULL * NSEC_PER_SEC) && - elapsed < 5ULL * NSEC_PER_SEC) { - if (!kvm_check_tsc_unstable()) { - offset = kvm->arch.last_tsc_offset; - pr_debug("kvm: matched tsc offset for %llu\n", data); - } else { - u64 delta = nsec_to_cycles(elapsed); - offset += delta; - pr_debug("kvm: adjusted tsc offset by %llu\n", delta); - } - ns = kvm->arch.last_tsc_nsec; - } - kvm->arch.last_tsc_nsec = ns; - kvm->arch.last_tsc_write = data; - kvm->arch.last_tsc_offset = offset; - kvm_x86_ops->write_tsc_offset(vcpu, offset); - spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); - - /* Reset of TSC must disable overshoot protection below */ - vcpu->arch.hv_clock.tsc_timestamp = 0; - vcpu->arch.last_tsc_write = data; - vcpu->arch.last_tsc_nsec = ns; -} -EXPORT_SYMBOL_GPL(kvm_write_tsc); - -static int kvm_guest_time_update(struct kvm_vcpu *v) +static void kvm_write_guest_time(struct kvm_vcpu *v) { + struct timespec ts; unsigned long flags; struct kvm_vcpu_arch *vcpu = &v->arch; void *shared_kaddr; unsigned long this_tsc_khz; - s64 kernel_ns, max_kernel_ns; - u64 tsc_timestamp; - /* Keep irq disabled to prevent changes to the clock */ - local_irq_save(flags); - kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp); - kernel_ns = get_kernel_ns(); - this_tsc_khz = kvm___this_cpu_read(cpu_tsc_khz); + if ((!vcpu->time_page)) + return; - if (unlikely(this_tsc_khz == 0)) { - local_irq_restore(flags); - kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); - return 1; - } - - /* - * We may have to catch up the TSC to match elapsed wall clock - * time for two reasons, even if kvmclock is used. - * 1) CPU could have been running below the maximum TSC rate - * 2) Broken TSC compensation resets the base at each VCPU - * entry to avoid unknown leaps of TSC even when running - * again on the same CPU. This may cause apparent elapsed - * time to disappear, and the guest to stand still or run - * very slowly. - */ - if (vcpu->tsc_catchup) { - u64 tsc = compute_guest_tsc(v, kernel_ns); - if (tsc > tsc_timestamp) { - kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp); - tsc_timestamp = tsc; - } + this_tsc_khz = get_cpu_var(cpu_tsc_khz); + if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) { + kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock); + vcpu->hv_clock_tsc_khz = this_tsc_khz; } + put_cpu_var(cpu_tsc_khz); + /* Keep irq disabled to prevent changes to the clock */ + local_irq_save(flags); + kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp); + ktime_get_ts(&ts); + kvm_monotonic_to_bootbased(&ts); local_irq_restore(flags); - if (!vcpu->time_page) - return 0; - - /* - * Time as measured by the TSC may go backwards when resetting the base - * tsc_timestamp. The reason for this is that the TSC resolution is - * higher than the resolution of the other clock scales. Thus, many - * possible measurments of the TSC correspond to one measurement of any - * other clock, and so a spread of values is possible. This is not a - * problem for the computation of the nanosecond clock; with TSC rates - * around 1GHZ, there can only be a few cycles which correspond to one - * nanosecond value, and any path through this code will inevitably - * take longer than that. However, with the kernel_ns value itself, - * the precision may be much lower, down to HZ granularity. 
If the - * first sampling of TSC against kernel_ns ends in the low part of the - * range, and the second in the high end of the range, we can get: - * - * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new - * - * As the sampling errors potentially range in the thousands of cycles, - * it is possible such a time value has already been observed by the - * guest. To protect against this, we must compute the system time as - * observed by the guest and ensure the new system time is greater. - */ - max_kernel_ns = 0; - if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) { - max_kernel_ns = vcpu->last_guest_tsc - - vcpu->hv_clock.tsc_timestamp; - max_kernel_ns = pvclock_scale_delta(max_kernel_ns, - vcpu->hv_clock.tsc_to_system_mul, - vcpu->hv_clock.tsc_shift); - max_kernel_ns += vcpu->last_kernel_ns; - } - - if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) { - kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz, - &vcpu->hv_clock.tsc_shift, - &vcpu->hv_clock.tsc_to_system_mul); - vcpu->hw_tsc_khz = this_tsc_khz; - } - - if (max_kernel_ns > kernel_ns) - kernel_ns = max_kernel_ns; - /* With all the info we got, fill in the values */ - vcpu->hv_clock.tsc_timestamp = tsc_timestamp; - vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; - vcpu->last_kernel_ns = kernel_ns; - vcpu->last_guest_tsc = tsc_timestamp; - vcpu->hv_clock.flags = 0; + + vcpu->hv_clock.system_time = ts.tv_nsec + + (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset; /* * The interface expects us to write an even number signaling that the @@ -1206,7 +850,16 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) kunmap_atomic(shared_kaddr, KM_USER0); mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); - return 0; +} + +static int kvm_request_guest_time_update(struct kvm_vcpu *v) +{ + struct kvm_vcpu_arch *vcpu = &v->arch; + + if (!vcpu->time_page) + return 0; + set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests); + return 1; } static bool msr_mtrr_valid(unsigned msr) @@ -1469,38 +1122,14 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) return 0; } -static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) -{ - gpa_t gpa = data & ~0x3f; - - /* Bits 2:5 are resrved, Should be zero */ - if (data & 0x3c) - return 1; - - vcpu->arch.apf.msr_val = data; - - if (!(data & KVM_ASYNC_PF_ENABLED)) { - kvm_clear_async_pf_completion_queue(vcpu); - kvm_async_pf_hash_reset(vcpu); - return 0; - } - - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa)) - return 1; - - vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS); - kvm_async_pf_wakeup_all(vcpu); - return 0; -} - int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) { switch (msr) { case MSR_EFER: - return set_efer(vcpu, data); + set_efer(vcpu, data); + break; case MSR_K7_HWCR: data &= ~(u64)0x40; /* ignore flush filter disable */ - data &= ~(u64)0x100; /* ignore ignne emulation enable */ if (data != 0) { pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", data); @@ -1543,12 +1172,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) case MSR_IA32_MISC_ENABLE: vcpu->arch.ia32_misc_enable_msr = data; break; - case MSR_KVM_WALL_CLOCK_NEW: case MSR_KVM_WALL_CLOCK: vcpu->kvm->arch.wall_clock = data; kvm_write_wall_clock(vcpu->kvm, data); break; - case MSR_KVM_SYSTEM_TIME_NEW: case MSR_KVM_SYSTEM_TIME: { if (vcpu->arch.time_page) { kvm_release_page_dirty(vcpu->arch.time_page); @@ -1556,7 +1183,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 
data) } vcpu->arch.time = data; - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); /* we verify if the enable bit is set... */ if (!(data & 1)) @@ -1572,12 +1198,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) kvm_release_page_clean(vcpu->arch.time_page); vcpu->arch.time_page = NULL; } + + kvm_request_guest_time_update(vcpu); break; } - case MSR_KVM_ASYNC_PF_EN: - if (kvm_pv_enable_async_pf(vcpu, data)) - return 1; - break; case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: @@ -1612,16 +1236,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " "0x%x data 0x%llx\n", msr, data); break; - case MSR_K7_CLK_CTL: - /* - * Ignore all writes to this no longer documented MSR. - * Writes are only relevant for old K7 processors, - * all pre-dating SVM, but a recommended workaround from - * AMD for these chips. It is possible to speicify the - * affected processor models on the command line, hence - * the need to ignore the workaround. - */ - break; case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: if (kvm_hv_msr_partition_wide(msr)) { int r; @@ -1814,20 +1428,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case 0xcd: /* fsb frequency */ data = 3; break; - /* - * MSR_EBC_FREQUENCY_ID - * Conservative value valid for even the basic CPU models. - * Models 0,1: 000 in bits 23:21 indicating a bus speed of - * 100MHz, model 2 000 in bits 18:16 indicating 100MHz, - * and 266MHz for model 3, or 4. Set Core Clock - * Frequency to System Bus Frequency Ratio to 1 (bits - * 31:24) even though these are only valid for CPU - * models > 2, however guests may end up dividing or - * multiplying by zero otherwise. - */ - case MSR_EBC_FREQUENCY_ID: - data = 1 << 24; - break; case MSR_IA32_APICBASE: data = kvm_get_apic_base(vcpu); break; @@ -1847,16 +1447,11 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) data = vcpu->arch.efer; break; case MSR_KVM_WALL_CLOCK: - case MSR_KVM_WALL_CLOCK_NEW: data = vcpu->kvm->arch.wall_clock; break; case MSR_KVM_SYSTEM_TIME: - case MSR_KVM_SYSTEM_TIME_NEW: data = vcpu->arch.time; break; - case MSR_KVM_ASYNC_PF_EN: - data = vcpu->arch.apf.msr_val; - break; case MSR_IA32_P5_MC_ADDR: case MSR_IA32_P5_MC_TYPE: case MSR_IA32_MCG_CAP: @@ -1864,18 +1459,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: return get_msr_mce(vcpu, msr, pdata); - case MSR_K7_CLK_CTL: - /* - * Provide expected ramp-up count for K7. All other - * are set to zero, indicating minimum divisors for - * every field. - * - * This prevents guest kernels on AMD host with CPU - * type 6, model 8 and higher from exploding due to - * the rdmsr failing. - */ - data = 0x20000000; - break; case HV_X64_MSR_GUEST_OS_ID ... 
HV_X64_MSR_SINT15: if (kvm_hv_msr_partition_wide(msr)) { int r; @@ -1913,11 +1496,15 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, { int i, idx; - idx = srcu_read_lock(&vcpu->kvm->srcu); + vcpu_load(vcpu); + + idx = kvm_srcu_read_lock(&vcpu->kvm->srcu); for (i = 0; i < msrs->nmsrs; ++i) if (do_msr(vcpu, entries[i].index, &entries[i].data)) break; - srcu_read_unlock(&vcpu->kvm->srcu, idx); + kvm_srcu_read_unlock(&vcpu->kvm->srcu, idx); + + vcpu_put(vcpu); return i; } @@ -1947,7 +1534,7 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *user_msrs, r = -ENOMEM; size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; - entries = kmalloc(size, GFP_KERNEL); + entries = vmalloc(size); if (!entries) goto out; @@ -1966,7 +1553,7 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *user_msrs, r = n; out_free: - kfree(entries); + vfree(entries); out: return r; } @@ -1988,7 +1575,6 @@ int kvm_dev_ioctl_check_extension(long ext) #ifdef CONFIG_MMU_NOTIFIER case KVM_CAP_SYNC_MMU: #endif - case KVM_CAP_USER_NMI: case KVM_CAP_REINJECT_CONTROL: case KVM_CAP_IRQ_INJECT_STATUS: case KVM_CAP_ASSIGN_DEV_IRQ: @@ -2006,12 +1592,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_HYPERV_VAPIC: case KVM_CAP_HYPERV_SPIN: case KVM_CAP_PCI_SEGMENT: - case KVM_CAP_DEBUGREGS: case KVM_CAP_X86_ROBUST_SINGLESTEP: - case KVM_CAP_XSAVE: -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,38) - case KVM_CAP_ASYNC_PF: -#endif r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -2035,9 +1616,6 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_MCE: r = KVM_MAX_MCE_BANKS; break; - case KVM_CAP_XCRS: - r = kvm_cpu_has_xsave; - break; default: r = 0; break; @@ -2114,51 +1692,22 @@ out: return r; } -static void wbinvd_ipi(void *garbage) -{ - wbinvd(); -} - -static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) -{ - return vcpu->kvm->arch.iommu_domain && - !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY); -} - void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - /* Address WBINVD may be executed by guest */ - if (need_emulate_wbinvd(vcpu)) { - if (kvm_x86_ops->has_wbinvd_exit()) - cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); - else if (vcpu->cpu != -1 && vcpu->cpu != cpu) - smp_call_function_single(vcpu->cpu, - wbinvd_ipi, NULL, 1); - } - kvm_x86_ops->vcpu_load(vcpu, cpu); - if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) { - /* Make sure TSC doesn't go backwards */ - s64 tsc_delta = !vcpu->arch.last_host_tsc ? 
0 : - kvm_native_read_tsc() - vcpu->arch.last_host_tsc; - if (tsc_delta < 0) - mark_tsc_unstable("KVM discovered backwards TSC"); - if (kvm_check_tsc_unstable()) { - kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta); - vcpu->arch.tsc_catchup = 1; - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - } - if (vcpu->cpu != cpu) - kvm_migrate_timers(vcpu); - vcpu->cpu = cpu; + if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) { + unsigned long khz = cpufreq_quick_get(cpu); + if (!khz) + khz = tsc_khz; + per_cpu(cpu_tsc_khz, cpu) = khz; } + kvm_request_guest_time_update(vcpu); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { - kvm_x86_ops->vcpu_put(vcpu); kvm_put_guest_fpu(vcpu); - vcpu->arch.last_host_tsc = kvm_native_read_tsc(); + kvm_x86_ops->vcpu_put(vcpu); } static int is_efer_nx(void) @@ -2207,6 +1756,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, if (copy_from_user(cpuid_entries, entries, cpuid->nent * sizeof(struct kvm_cpuid_entry))) goto out_free; + vcpu_load(vcpu); for (i = 0; i < cpuid->nent; i++) { vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; @@ -2224,7 +1774,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, r = 0; kvm_apic_set_version(vcpu); kvm_x86_ops->cpuid_update(vcpu); - update_cpuid(vcpu); + vcpu_put(vcpu); out_free: vfree(cpuid_entries); @@ -2245,10 +1795,11 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, if (copy_from_user(&vcpu->arch.cpuid_entries, entries, cpuid->nent * sizeof(struct kvm_cpuid_entry2))) goto out; + vcpu_load(vcpu); vcpu->arch.cpuid_nent = cpuid->nent; kvm_apic_set_version(vcpu); kvm_x86_ops->cpuid_update(vcpu); - update_cpuid(vcpu); + vcpu_put(vcpu); return 0; out: @@ -2275,11 +1826,6 @@ out: return r; } -static void cpuid_mask(u32 *word, int wordnum) -{ - *word &= boot_cpu_data.x86_capability[wordnum]; -} - static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, u32 index) { @@ -2328,20 +1874,19 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); /* cpuid 1.ecx */ const u32 kvm_supported_word4_x86_features = - F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | + F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ | 0 /* DS-CPL, VMX, SMX, EST */ | 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | 0 /* Reserved, DCA */ | F(XMM4_1) | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | - 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | - F(F16C); + 0 /* Reserved, XSAVE, OSXSAVE */; /* cpuid 0x80000001.ecx */ const u32 kvm_supported_word6_x86_features = - F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | + F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | - F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) | - 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); + F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) | + 0 /* SKINIT */ | 0 /* WDT */; /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); @@ -2350,13 +1895,11 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, switch (function) { case 0: - entry->eax = min(entry->eax, (u32)0xd); + entry->eax = min(entry->eax, (u32)0xb); break; case 1: entry->edx &= kvm_supported_word0_x86_features; - cpuid_mask(&entry->edx, 0); entry->ecx &= kvm_supported_word4_x86_features; - cpuid_mask(&entry->ecx, 4); /* we support 
x2apic emulation even if host does not support * it since we emulate x2apic in software */ entry->ecx |= F(X2APIC); @@ -2410,51 +1953,14 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } break; } - case 0xd: { - int i; - - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - for (i = 1; *nent < maxnent; ++i) { - if (entry[i - 1].eax == 0 && i != 2) - break; - do_cpuid_1_ent(&entry[i], function, i); - entry[i].flags |= - KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - ++*nent; - } - break; - } - case KVM_CPUID_SIGNATURE: { - char signature[12] = "KVMKVMKVM\0\0"; - u32 *sigptr = (u32 *)signature; - entry->eax = 0; - entry->ebx = sigptr[0]; - entry->ecx = sigptr[1]; - entry->edx = sigptr[2]; - break; - } - case KVM_CPUID_FEATURES: - entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) | - (1 << KVM_FEATURE_NOP_IO_DELAY) | - (1 << KVM_FEATURE_CLOCKSOURCE2) | - (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); - entry->ebx = 0; - entry->ecx = 0; - entry->edx = 0; - break; case 0x80000000: entry->eax = min(entry->eax, 0x8000001a); break; case 0x80000001: entry->edx &= kvm_supported_word1_x86_features; - cpuid_mask(&entry->edx, 1); entry->ecx &= kvm_supported_word6_x86_features; - cpuid_mask(&entry->ecx, 6); break; } - - kvm_x86_ops->set_supported_cpuid(function, entry); - put_cpu(); } @@ -2490,23 +1996,6 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) do_cpuid_ent(&cpuid_entries[nent], func, 0, &nent, cpuid->nent); - - - - r = -E2BIG; - if (nent >= cpuid->nent) - goto out_free; - - do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent, - cpuid->nent); - - r = -E2BIG; - if (nent >= cpuid->nent) - goto out_free; - - do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_FEATURES, 0, &nent, - cpuid->nent); - r = -E2BIG; if (nent >= cpuid->nent) goto out_free; @@ -2527,7 +2016,9 @@ out: static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { + vcpu_load(vcpu); memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); + vcpu_put(vcpu); return 0; } @@ -2535,9 +2026,11 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { + vcpu_load(vcpu); memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); kvm_apic_post_state_restore(vcpu); update_cr8_intercept(vcpu); + vcpu_put(vcpu); return 0; } @@ -2549,16 +2042,20 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, return -EINVAL; if (irqchip_in_kernel(vcpu->kvm)) return -ENXIO; + vcpu_load(vcpu); kvm_queue_interrupt(vcpu, irq->irq, false); - kvm_make_request(KVM_REQ_EVENT, vcpu); + + vcpu_put(vcpu); return 0; } static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) { + vcpu_load(vcpu); kvm_inject_nmi(vcpu); + vcpu_put(vcpu); return 0; } @@ -2624,7 +2121,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, printk(KERN_DEBUG "kvm: set_mce: " "injects mce exception while " "previous one is in progress!\n"); - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); return 0; } if (banks[1] & MCI_STATUS_VAL) @@ -2649,43 +2146,38 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { - events->exception.injected = - vcpu->arch.exception.pending && - !kvm_exception_is_soft(vcpu->arch.exception.nr); + vcpu_load(vcpu); + + events->exception.injected = vcpu->arch.exception.pending; 
events->exception.nr = vcpu->arch.exception.nr; events->exception.has_error_code = vcpu->arch.exception.has_error_code; - events->exception.pad = 0; events->exception.error_code = vcpu->arch.exception.error_code; - events->interrupt.injected = - vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft; + events->interrupt.injected = vcpu->arch.interrupt.pending; events->interrupt.nr = vcpu->arch.interrupt.nr; - events->interrupt.soft = 0; - events->interrupt.shadow = - kvm_x86_ops->get_interrupt_shadow(vcpu, - KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI); + events->interrupt.soft = vcpu->arch.interrupt.soft; events->nmi.injected = vcpu->arch.nmi_injected; events->nmi.pending = vcpu->arch.nmi_pending; events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu); - events->nmi.pad = 0; events->sipi_vector = vcpu->arch.sipi_vector; events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING - | KVM_VCPUEVENT_VALID_SIPI_VECTOR - | KVM_VCPUEVENT_VALID_SHADOW); - memset(&events->reserved, 0, sizeof(events->reserved)); + | KVM_VCPUEVENT_VALID_SIPI_VECTOR); + + vcpu_put(vcpu); } static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING - | KVM_VCPUEVENT_VALID_SIPI_VECTOR - | KVM_VCPUEVENT_VALID_SHADOW)) + | KVM_VCPUEVENT_VALID_SIPI_VECTOR)) return -EINVAL; + vcpu_load(vcpu); + vcpu->arch.exception.pending = events->exception.injected; vcpu->arch.exception.nr = events->exception.nr; vcpu->arch.exception.has_error_code = events->exception.has_error_code; @@ -2696,9 +2188,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.interrupt.soft = events->interrupt.soft; if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm)) kvm_pic_clear_isr_ack(vcpu->kvm); - if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) - kvm_x86_ops->set_interrupt_shadow(vcpu, - events->interrupt.shadow); vcpu->arch.nmi_injected = events->nmi.injected; if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) @@ -2708,134 +2197,34 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) vcpu->arch.sipi_vector = events->sipi_vector; - kvm_make_request(KVM_REQ_EVENT, vcpu); - - return 0; -} - -static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, - struct kvm_debugregs *dbgregs) -{ - memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); - dbgregs->dr6 = vcpu->arch.dr6; - dbgregs->dr7 = vcpu->arch.dr7; - dbgregs->flags = 0; - memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved)); -} - -static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, - struct kvm_debugregs *dbgregs) -{ - if (dbgregs->flags) - return -EINVAL; - - memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); - vcpu->arch.dr6 = dbgregs->dr6; - vcpu->arch.dr7 = dbgregs->dr7; + vcpu_put(vcpu); return 0; } -static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, - struct kvm_xsave *guest_xsave) -{ - if (kvm_cpu_has_xsave) - memcpy(guest_xsave->region, - &vcpu->arch.guest_fpu.state->xsave, - kvm_xstate_size); - else { - memcpy(guest_xsave->region, - &vcpu->arch.guest_fpu.state->fxsave, - sizeof(struct kvm_i387_fxsave_struct)); - *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = - XSTATE_FPSSE; - } -} - -static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, - struct kvm_xsave *guest_xsave) -{ - u64 xstate_bv = - *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; - - if (kvm_cpu_has_xsave) - 
memcpy(&vcpu->arch.guest_fpu.state->xsave, - guest_xsave->region, kvm_xstate_size); - else { - if (xstate_bv & ~XSTATE_FPSSE) - return -EINVAL; - memcpy(&vcpu->arch.guest_fpu.state->fxsave, - guest_xsave->region, sizeof(struct kvm_i387_fxsave_struct)); - } - return 0; -} - -static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, - struct kvm_xcrs *guest_xcrs) -{ - if (!kvm_cpu_has_xsave) { - guest_xcrs->nr_xcrs = 0; - return; - } - - guest_xcrs->nr_xcrs = 1; - guest_xcrs->flags = 0; - guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK; - guest_xcrs->xcrs[0].value = vcpu->arch.xcr0; -} - -static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, - struct kvm_xcrs *guest_xcrs) -{ - int i, r = 0; - - if (!kvm_cpu_has_xsave) - return -EINVAL; - - if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags) - return -EINVAL; - - for (i = 0; i < guest_xcrs->nr_xcrs; i++) - /* Only support XCR0 currently */ - if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) { - r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK, - guest_xcrs->xcrs[0].value); - break; - } - if (r) - r = -EINVAL; - return r; -} - long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void *argp = (void *)arg; int r; - union { - struct kvm_lapic_state *lapic; - struct kvm_xsave *xsave; - struct kvm_xcrs *xcrs; - void *buffer; - } u; + struct kvm_lapic_state *lapic = NULL; - u.buffer = NULL; switch (ioctl) { case KVM_GET_LAPIC: { r = -EINVAL; if (!vcpu->arch.apic) goto out; - u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); + lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); r = -ENOMEM; - if (!u.lapic) + if (!lapic) goto out; - r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic); + r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic); if (r) goto out; r = -EFAULT; - if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state))) + if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state))) goto out; r = 0; break; @@ -2844,14 +2233,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EINVAL; if (!vcpu->arch.apic) goto out; - u.lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); + lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); r = -ENOMEM; - if (!u.lapic) + if (!lapic) goto out; r = -EFAULT; - if (copy_from_user(u.lapic, argp, sizeof(struct kvm_lapic_state))) + if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state))) goto out; - r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic); + r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic); if (r) goto out; r = 0; @@ -2991,90 +2380,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events); break; } - case KVM_GET_DEBUGREGS: { - struct kvm_debugregs dbgregs; - - kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs); - - r = -EFAULT; - if (copy_to_user(argp, &dbgregs, - sizeof(struct kvm_debugregs))) - break; - r = 0; - break; - } - case KVM_SET_DEBUGREGS: { - struct kvm_debugregs dbgregs; - - r = -EFAULT; - if (copy_from_user(&dbgregs, argp, - sizeof(struct kvm_debugregs))) - break; - - r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs); - break; - } - case KVM_GET_XSAVE: { - u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); - r = -ENOMEM; - if (!u.xsave) - break; - - kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave); - - r = -EFAULT; - if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave))) - break; - r = 0; - break; - } - case KVM_SET_XSAVE: { - u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); - r = -ENOMEM; - if 
(!u.xsave) - break; - - r = -EFAULT; - if (copy_from_user(u.xsave, argp, sizeof(struct kvm_xsave))) - break; - - r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); - break; - } - case KVM_GET_XCRS: { - u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); - r = -ENOMEM; - if (!u.xcrs) - break; - - kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs); - - r = -EFAULT; - if (copy_to_user(argp, u.xcrs, - sizeof(struct kvm_xcrs))) - break; - r = 0; - break; - } - case KVM_SET_XCRS: { - u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); - r = -ENOMEM; - if (!u.xcrs) - break; - - r = -EFAULT; - if (copy_from_user(u.xcrs, argp, - sizeof(struct kvm_xcrs))) - break; - - r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); - break; - } default: r = -EINVAL; } out: - kfree(u.buffer); + kfree(lapic); return r; } @@ -3114,7 +2424,116 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) { - return kvm->arch.n_max_mmu_pages; + return kvm->arch.n_alloc_mmu_pages; +} + +gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn) +{ + int i; + struct kvm_mem_alias *alias; + struct kvm_mem_aliases *aliases; + + aliases = rcu_dereference(kvm->arch.aliases); + + for (i = 0; i < aliases->naliases; ++i) { + alias = &aliases->aliases[i]; + if (alias->flags & KVM_ALIAS_INVALID) + continue; + if (gfn >= alias->base_gfn + && gfn < alias->base_gfn + alias->npages) + return alias->target_gfn + gfn - alias->base_gfn; + } + return gfn; +} + +gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) +{ + int i; + struct kvm_mem_alias *alias; + struct kvm_mem_aliases *aliases; + + aliases = rcu_dereference(kvm->arch.aliases); + + for (i = 0; i < aliases->naliases; ++i) { + alias = &aliases->aliases[i]; + if (gfn >= alias->base_gfn + && gfn < alias->base_gfn + alias->npages) + return alias->target_gfn + gfn - alias->base_gfn; + } + return gfn; +} + +/* + * Set a new alias region. Aliases map a portion of physical memory into + * another portion. This is useful for memory windows, for example the PC + * VGA region. 
+ */ +static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, + struct kvm_memory_alias *alias) +{ + int r, n; + struct kvm_mem_alias *p; + struct kvm_mem_aliases *aliases, *old_aliases; + + r = -EINVAL; + /* General sanity checks */ + if (alias->memory_size & (PAGE_SIZE - 1)) + goto out; + if (alias->guest_phys_addr & (PAGE_SIZE - 1)) + goto out; + if (alias->slot >= KVM_ALIAS_SLOTS) + goto out; + if (alias->guest_phys_addr + alias->memory_size + < alias->guest_phys_addr) + goto out; + if (alias->target_phys_addr + alias->memory_size + < alias->target_phys_addr) + goto out; + + r = -ENOMEM; + aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); + if (!aliases) + goto out; + + mutex_lock(&kvm->slots_lock); + + /* invalidate any gfn reference in case of deletion/shrinking */ + memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases)); + aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID; + old_aliases = kvm->arch.aliases; + rcu_assign_pointer(kvm->arch.aliases, aliases); + kvm_synchronize_srcu_expedited(&kvm->srcu); + kvm_mmu_zap_all(kvm); + kfree(old_aliases); + + r = -ENOMEM; + aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); + if (!aliases) + goto out_unlock; + + memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases)); + + p = &aliases->aliases[alias->slot]; + p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; + p->npages = alias->memory_size >> PAGE_SHIFT; + p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; + p->flags &= ~(KVM_ALIAS_INVALID); + + for (n = KVM_ALIAS_SLOTS; n > 0; --n) + if (aliases->aliases[n - 1].npages) + break; + aliases->naliases = n; + + old_aliases = kvm->arch.aliases; + rcu_assign_pointer(kvm->arch.aliases, aliases); + kvm_synchronize_srcu_expedited(&kvm->srcu); + kfree(old_aliases); + r = 0; + +out_unlock: + mutex_unlock(&kvm->slots_lock); +out: + return r; } static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) @@ -3150,18 +2569,18 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) r = 0; switch (chip->chip_id) { case KVM_IRQCHIP_PIC_MASTER: - spin_lock(&pic_irqchip(kvm)->lock); + raw_spin_lock(&pic_irqchip(kvm)->lock); memcpy(&pic_irqchip(kvm)->pics[0], &chip->chip.pic, sizeof(struct kvm_pic_state)); - spin_unlock(&pic_irqchip(kvm)->lock); + raw_spin_unlock(&pic_irqchip(kvm)->lock); break; case KVM_IRQCHIP_PIC_SLAVE: - spin_lock(&pic_irqchip(kvm)->lock); + raw_spin_lock(&pic_irqchip(kvm)->lock); memcpy(&pic_irqchip(kvm)->pics[1], &chip->chip.pic, sizeof(struct kvm_pic_state)); - spin_unlock(&pic_irqchip(kvm)->lock); + raw_spin_unlock(&pic_irqchip(kvm)->lock); break; case KVM_IRQCHIP_IOAPIC: r = kvm_set_ioapic(kvm, &chip->chip.ioapic); @@ -3204,7 +2623,6 @@ static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) sizeof(ps->channels)); ps->flags = kvm->arch.vpit->pit_state.flags; mutex_unlock(&kvm->arch.vpit->pit_state.lock); - memset(&ps->reserved, 0, sizeof(ps->reserved)); return r; } @@ -3246,6 +2664,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot; unsigned long n; unsigned long is_dirty = 0; + unsigned long *dirty_bitmap = NULL; mutex_lock(&kvm->slots_lock); @@ -3260,47 +2679,42 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, n = kvm_dirty_bitmap_bytes(memslot); + r = -ENOMEM; + dirty_bitmap = vmalloc(n); + if (!dirty_bitmap) + goto out; + memset(dirty_bitmap, 0, n); + for (i = 0; !is_dirty && i < n/sizeof(long); i++) is_dirty = memslot->dirty_bitmap[i]; /* If nothing is dirty, don't 
bother messing with page tables. */ if (is_dirty) { struct kvm_memslots *slots, *old_slots; - unsigned long *dirty_bitmap; - dirty_bitmap = memslot->dirty_bitmap_head; - if (memslot->dirty_bitmap == dirty_bitmap) - dirty_bitmap += n / sizeof(long); - memset(dirty_bitmap, 0, n); + spin_lock(&kvm->mmu_lock); + kvm_mmu_slot_remove_write_access(kvm, log->slot); + spin_unlock(&kvm->mmu_lock); - r = -ENOMEM; slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); if (!slots) - goto out; + goto out_free; + memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; - slots->generation++; old_slots = kvm->memslots; rcu_assign_pointer(kvm->memslots, slots); kvm_synchronize_srcu_expedited(&kvm->srcu); dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; kfree(old_slots); - - spin_lock(&kvm->mmu_lock); - kvm_mmu_slot_remove_write_access(kvm, log->slot); - spin_unlock(&kvm->mmu_lock); - - r = -EFAULT; - if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) - goto out; - } else { - r = -EFAULT; - if (clear_user(log->dirty_bitmap, n)) - goto out; } r = 0; + if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) + r = -EFAULT; +out_free: + vfree(dirty_bitmap); out: mutex_unlock(&kvm->slots_lock); return r; @@ -3320,6 +2734,7 @@ long kvm_arch_vm_ioctl(struct file *filp, union { struct kvm_pit_state ps; struct kvm_pit_state2 ps2; + struct kvm_memory_alias alias; struct kvm_pit_config pit_config; } u; @@ -3340,6 +2755,22 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; break; } + case KVM_SET_MEMORY_REGION: { + struct kvm_memory_region kvm_mem; + struct kvm_userspace_memory_region kvm_userspace_mem; + + r = -EFAULT; + if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) + goto out; + kvm_userspace_mem.slot = kvm_mem.slot; + kvm_userspace_mem.flags = kvm_mem.flags; + kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr; + kvm_userspace_mem.memory_size = kvm_mem.memory_size; + r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0); + if (r) + goto out; + break; + } case KVM_SET_NR_MMU_PAGES: r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); if (r) @@ -3348,6 +2779,14 @@ long kvm_arch_vm_ioctl(struct file *filp, case KVM_GET_NR_MMU_PAGES: r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); break; + case KVM_SET_MEMORY_ALIAS: + r = -EFAULT; + if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias))) + goto out; + r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias); + if (r) + goto out; + break; case KVM_CREATE_IRQCHIP: { struct kvm_pic *vpic; @@ -3360,10 +2799,8 @@ long kvm_arch_vm_ioctl(struct file *filp, if (vpic) { r = kvm_ioapic_init(kvm); if (r) { - mutex_lock(&kvm->slots_lock); kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev); - mutex_unlock(&kvm->slots_lock); kfree(vpic); goto create_irqchip_unlock; } @@ -3374,12 +2811,10 @@ long kvm_arch_vm_ioctl(struct file *filp, smp_wmb(); r = kvm_setup_default_irq_routing(kvm); if (r) { - mutex_lock(&kvm->slots_lock); mutex_lock(&kvm->irq_lock); kvm_ioapic_destroy(kvm); kvm_destroy_pic(kvm); mutex_unlock(&kvm->irq_lock); - mutex_unlock(&kvm->slots_lock); } create_irqchip_unlock: mutex_unlock(&kvm->lock); @@ -3412,13 +2847,11 @@ long kvm_arch_vm_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&irq_event, argp, sizeof irq_event)) goto out; - r = -ENXIO; if (irqchip_in_kernel(kvm)) { __s32 status; status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irq_event.irq, irq_event.level); if (ioctl == KVM_IRQ_LINE_STATUS) { - r = -EFAULT; irq_event.status = status; if 
(copy_to_user(argp, &irq_event, sizeof irq_event)) @@ -3555,6 +2988,7 @@ long kvm_arch_vm_ioctl(struct file *filp, break; } case KVM_SET_CLOCK: { + struct timespec now; struct kvm_clock_data user_ns; u64 now_ns; s64 delta; @@ -3568,23 +3002,21 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; r = 0; - local_irq_disable(); - now_ns = get_kernel_ns(); + ktime_get_ts(&now); + now_ns = timespec_to_ns(&now); delta = user_ns.clock - now_ns; - local_irq_enable(); kvm->arch.kvmclock_offset = delta; break; } case KVM_GET_CLOCK: { + struct timespec now; struct kvm_clock_data user_ns; u64 now_ns; - local_irq_disable(); - now_ns = get_kernel_ns(); + ktime_get_ts(&now); + now_ns = timespec_to_ns(&now); user_ns.clock = kvm->arch.kvmclock_offset + now_ns; - local_irq_enable(); user_ns.flags = 0; - memset(&user_ns.pad, 0, sizeof(user_ns.pad)); r = -EFAULT; if (copy_to_user(argp, &user_ns, sizeof(user_ns))) @@ -3635,86 +3067,52 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); } -static void kvm_set_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg) -{ - kvm_x86_ops->set_segment(vcpu, var, seg); -} - -void kvm_get_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg) -{ - kvm_x86_ops->get_segment(vcpu, var, seg); -} - -static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) -{ - return gpa; -} - -static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) -{ - gpa_t t_gpa; - struct x86_exception exception; - - BUG_ON(!mmu_is_nested(vcpu)); - - /* NPT walks are always user-walks */ - access |= PFERR_USER_MASK; - t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &exception); - - return t_gpa; -} - -gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, - struct x86_exception *exception) +gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) { u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; - return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); } - gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, - struct x86_exception *exception) + gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) { u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; access |= PFERR_FETCH_MASK; - return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); } -gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, - struct x86_exception *exception) +gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) { u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? 
PFERR_USER_MASK : 0; access |= PFERR_WRITE_MASK; - return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); } /* uses this to access any guest's mapped memory without checking CPL */ -gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, - struct x86_exception *exception) +gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) { - return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception); + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error); } static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, struct kvm_vcpu *vcpu, u32 access, - struct x86_exception *exception) + u32 *error) { void *data = val; int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access, - exception); + gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error); unsigned offset = addr & (PAGE_SIZE-1); unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; - if (gpa == UNMAPPED_GVA) - return X86EMUL_PROPAGATE_FAULT; + if (gpa == UNMAPPED_GVA) { + r = X86EMUL_PROPAGATE_FAULT; + goto out; + } ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); if (ret < 0) { - r = X86EMUL_IO_NEEDED; + r = X86EMUL_UNHANDLEABLE; goto out; } @@ -3728,52 +3126,46 @@ out: /* used for instruction fetching */ static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, - struct x86_exception *exception) + struct kvm_vcpu *vcpu, u32 *error) { u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, - access | PFERR_FETCH_MASK, - exception); + access | PFERR_FETCH_MASK, error); } static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, - struct x86_exception *exception) + struct kvm_vcpu *vcpu, u32 *error) { u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? 
PFERR_USER_MASK : 0; return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, - exception); + error); } static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, - struct x86_exception *exception) + struct kvm_vcpu *vcpu, u32 *error) { - return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception); + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error); } -static int kvm_write_guest_virt_system(gva_t addr, void *val, - unsigned int bytes, - struct kvm_vcpu *vcpu, - struct x86_exception *exception) +static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, u32 *error) { void *data = val; int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, - PFERR_WRITE_MASK, - exception); + gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error); unsigned offset = addr & (PAGE_SIZE-1); unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; - if (gpa == UNMAPPED_GVA) - return X86EMUL_PROPAGATE_FAULT; + if (gpa == UNMAPPED_GVA) { + r = X86EMUL_PROPAGATE_FAULT; + goto out; + } ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); if (ret < 0) { - r = X86EMUL_IO_NEEDED; + r = X86EMUL_UNHANDLEABLE; goto out; } @@ -3785,13 +3177,14 @@ out: return r; } + static int emulator_read_emulated(unsigned long addr, void *val, unsigned int bytes, - struct x86_exception *exception, struct kvm_vcpu *vcpu) { gpa_t gpa; + u32 error_code; if (vcpu->mmio_read_completed) { memcpy(val, vcpu->mmio_data, bytes); @@ -3801,17 +3194,19 @@ static int emulator_read_emulated(unsigned long addr, return X86EMUL_CONTINUE; } - gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, exception); + gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code); - if (gpa == UNMAPPED_GVA) + if (gpa == UNMAPPED_GVA) { + kvm_inject_page_fault(vcpu, addr, error_code); return X86EMUL_PROPAGATE_FAULT; + } /* For APIC access vmexit */ if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) goto mmio; - if (kvm_read_guest_virt(addr, val, bytes, vcpu, exception) - == X86EMUL_CONTINUE) + if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL) + == X86EMUL_CONTINUE) return X86EMUL_CONTINUE; mmio: @@ -3826,16 +3221,15 @@ mmio: trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); vcpu->mmio_needed = 1; - vcpu->run->exit_reason = KVM_EXIT_MMIO; - vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; - vcpu->run->mmio.len = vcpu->mmio_size = bytes; - vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0; + vcpu->mmio_phys_addr = gpa; + vcpu->mmio_size = bytes; + vcpu->mmio_is_write = 0; - return X86EMUL_IO_NEEDED; + return X86EMUL_UNHANDLEABLE; } int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, - const void *val, int bytes) + const void *val, int bytes) { int ret; @@ -3849,15 +3243,17 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, static int emulator_write_emulated_onepage(unsigned long addr, const void *val, unsigned int bytes, - struct x86_exception *exception, struct kvm_vcpu *vcpu) { gpa_t gpa; + u32 error_code; - gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code); - if (gpa == UNMAPPED_GVA) + if (gpa == UNMAPPED_GVA) { + kvm_inject_page_fault(vcpu, addr, error_code); return X86EMUL_PROPAGATE_FAULT; + } /* For APIC access vmexit */ if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) @@ -3875,185 +3271,72 @@ mmio: return X86EMUL_CONTINUE; vcpu->mmio_needed = 1; - vcpu->run->exit_reason = KVM_EXIT_MMIO; - 
vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; - vcpu->run->mmio.len = vcpu->mmio_size = bytes; - vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; - memcpy(vcpu->run->mmio.data, val, bytes); + vcpu->mmio_phys_addr = gpa; + vcpu->mmio_size = bytes; + vcpu->mmio_is_write = 1; + memcpy(vcpu->mmio_data, val, bytes); return X86EMUL_CONTINUE; } int emulator_write_emulated(unsigned long addr, - const void *val, - unsigned int bytes, - struct x86_exception *exception, - struct kvm_vcpu *vcpu) + const void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu) { /* Crossing a page boundary? */ if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { int rc, now; now = -addr & ~PAGE_MASK; - rc = emulator_write_emulated_onepage(addr, val, now, exception, - vcpu); + rc = emulator_write_emulated_onepage(addr, val, now, vcpu); if (rc != X86EMUL_CONTINUE) return rc; addr += now; val += now; bytes -= now; } - return emulator_write_emulated_onepage(addr, val, bytes, exception, - vcpu); + return emulator_write_emulated_onepage(addr, val, bytes, vcpu); } - -#define CMPXCHG_TYPE(t, ptr, old, new) \ - (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old)) - -#ifdef CONFIG_X86_64 -# define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new) -#else -# define CMPXCHG64(ptr, old, new) \ - (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old)) -#endif +EXPORT_SYMBOL_GPL(emulator_write_emulated); static int emulator_cmpxchg_emulated(unsigned long addr, const void *old, const void *new, unsigned int bytes, - struct x86_exception *exception, struct kvm_vcpu *vcpu) { - gpa_t gpa; - struct page *page; - char *kaddr; - bool exchanged; - - /* guests cmpxchg8b have to be emulated atomically */ - if (bytes > 8 || (bytes & (bytes - 1))) - goto emul_write; - - gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); - - if (gpa == UNMAPPED_GVA || - (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) - goto emul_write; - - if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) - goto emul_write; - - page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); - if (is_error_page(page)) { - kvm_release_page_clean(page); - goto emul_write; - } - - kaddr = kmap_atomic(page, KM_USER0); - kaddr += offset_in_page(gpa); - switch (bytes) { - case 1: - exchanged = CMPXCHG_TYPE(u8, kaddr, old, new); - break; - case 2: - exchanged = CMPXCHG_TYPE(u16, kaddr, old, new); - break; - case 4: - exchanged = CMPXCHG_TYPE(u32, kaddr, old, new); - break; - case 8: - exchanged = CMPXCHG64(kaddr, old, new); - break; - default: - BUG(); - } - kunmap_atomic(kaddr, KM_USER0); - kvm_release_page_dirty(page); - - if (!exchanged) - return X86EMUL_CMPXCHG_FAILED; - - kvm_mmu_pte_write(vcpu, gpa, new, bytes, 1); - - return X86EMUL_CONTINUE; - -emul_write: printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); +#ifndef CONFIG_X86_64 + /* guests cmpxchg8b have to be emulated atomically */ + if (bytes == 8) { + gpa_t gpa; + struct page *page; + char *kaddr; + u64 val; - return emulator_write_emulated(addr, new, bytes, exception, vcpu); -} - -static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) -{ - /* TODO: String I/O for in kernel device */ - int r; - - if (vcpu->arch.pio.in) - r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port, - vcpu->arch.pio.size, pd); - else - r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, - vcpu->arch.pio.port, vcpu->arch.pio.size, - pd); - return r; -} - - -static int emulator_pio_in_emulated(int size, unsigned short port, void *val, - unsigned int count, struct kvm_vcpu *vcpu) -{ - if 
(vcpu->arch.pio.count) - goto data_avail; - - trace_kvm_pio(0, port, size, count); - - vcpu->arch.pio.port = port; - vcpu->arch.pio.in = 1; - vcpu->arch.pio.count = count; - vcpu->arch.pio.size = size; - - if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { - data_avail: - memcpy(val, vcpu->arch.pio_data, size * count); - vcpu->arch.pio.count = 0; - return 1; - } - - vcpu->run->exit_reason = KVM_EXIT_IO; - vcpu->run->io.direction = KVM_EXIT_IO_IN; - vcpu->run->io.size = size; - vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; - vcpu->run->io.count = count; - vcpu->run->io.port = port; + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); - return 0; -} + if (gpa == UNMAPPED_GVA || + (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) + goto emul_write; -static int emulator_pio_out_emulated(int size, unsigned short port, - const void *val, unsigned int count, - struct kvm_vcpu *vcpu) -{ - trace_kvm_pio(1, port, size, count); + if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) + goto emul_write; - vcpu->arch.pio.port = port; - vcpu->arch.pio.in = 0; - vcpu->arch.pio.count = count; - vcpu->arch.pio.size = size; + val = *(u64 *)new; - memcpy(vcpu->arch.pio_data, val, size * count); + page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); - if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { - vcpu->arch.pio.count = 0; - return 1; + kaddr = kmap_atomic(page, KM_USER0); + set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); + kunmap_atomic(kaddr, KM_USER0); + kvm_release_page_dirty(page); } +emul_write: +#endif - vcpu->run->exit_reason = KVM_EXIT_IO; - vcpu->run->io.direction = KVM_EXIT_IO_OUT; - vcpu->run->io.size = size; - vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; - vcpu->run->io.count = count; - vcpu->run->io.port = port; - - return 0; + return emulator_write_emulated(addr, new, bytes, vcpu); } static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) @@ -4067,25 +3350,6 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) return X86EMUL_CONTINUE; } -int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) -{ - if (!need_emulate_wbinvd(vcpu)) - return X86EMUL_CONTINUE; - - if (kvm_x86_ops->has_wbinvd_exit()) { - int cpu = get_cpu(); - - cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); - smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, - wbinvd_ipi, NULL, 1); - put_cpu(); - cpumask_clear(vcpu->arch.wbinvd_dirty_mask); - } else - wbinvd(); - return X86EMUL_CONTINUE; -} -EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); - int emulate_clts(struct kvm_vcpu *vcpu) { kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); @@ -4093,194 +3357,42 @@ int emulate_clts(struct kvm_vcpu *vcpu) return X86EMUL_CONTINUE; } -int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu) -{ - return _kvm_get_dr(vcpu, dr, dest); -} - -int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu) -{ - - return __kvm_set_dr(vcpu, dr, value); -} - -static u64 mk_cr_64(u64 curr_cr, u32 new_val) -{ - return (curr_cr & ~((1ULL << 32) - 1)) | new_val; -} - -static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu) -{ - unsigned long value; - - switch (cr) { - case 0: - value = kvm_read_cr0(vcpu); - break; - case 2: - value = vcpu->arch.cr2; - break; - case 3: - value = kvm_read_cr3(vcpu); - break; - case 4: - value = kvm_read_cr4(vcpu); - break; - case 8: - value = kvm_get_cr8(vcpu); - break; - default: - vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); - return 0; - } - - return value; -} - -static int emulator_set_cr(int cr, unsigned long val, struct 
kvm_vcpu *vcpu) -{ - int res = 0; - - switch (cr) { - case 0: - res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); - break; - case 2: - vcpu->arch.cr2 = val; - break; - case 3: - res = kvm_set_cr3(vcpu, val); - break; - case 4: - res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); - break; - case 8: - res = kvm_set_cr8(vcpu, val); - break; - default: - vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); - res = -1; - } - - return res; -} - -static int emulator_get_cpl(struct kvm_vcpu *vcpu) -{ - return kvm_x86_ops->get_cpl(vcpu); -} - -static void emulator_get_gdt(struct kvm_desc_ptr *dt, struct kvm_vcpu *vcpu) -{ - kvm_x86_ops->get_gdt(vcpu, dt); -} - -static void emulator_get_idt(struct kvm_desc_ptr *dt, struct kvm_vcpu *vcpu) -{ - kvm_x86_ops->get_idt(vcpu, dt); -} - -static unsigned long emulator_get_cached_segment_base(int seg, - struct kvm_vcpu *vcpu) +int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) { - return get_segment_base(vcpu, seg); + return kvm_x86_ops->get_dr(ctxt->vcpu, dr, dest); } -static bool emulator_get_cached_descriptor(struct kvm_desc_struct *desc, int seg, - struct kvm_vcpu *vcpu) +int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) { - struct kvm_segment var; - - kvm_get_segment(vcpu, &var, seg); - - if (var.unusable) - return false; + unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; - if (var.g) - var.limit >>= 12; - kvm_set_desc_limit(desc, var.limit); - kvm_set_desc_base(desc, (unsigned long)var.base); - desc->type = var.type; - desc->s = var.s; - desc->dpl = var.dpl; - desc->p = var.present; - desc->avl = var.avl; - desc->l = var.l; - desc->d = var.db; - desc->g = var.g; - - return true; + return kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask); } -static void emulator_set_cached_descriptor(struct kvm_desc_struct *desc, int seg, - struct kvm_vcpu *vcpu) +void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) { - struct kvm_segment var; - - /* needed to preserve selector */ - kvm_get_segment(vcpu, &var, seg); - - var.base = kvm_get_desc_base(desc); - var.limit = kvm_get_desc_limit(desc); - if (desc->g) - var.limit = (var.limit << 12) | 0xfff; - var.type = desc->type; - var.present = desc->p; - var.dpl = desc->dpl; - var.db = desc->d; - var.s = desc->s; - var.l = desc->l; - var.g = desc->g; - var.avl = desc->avl; - var.present = desc->p; - var.unusable = !var.present; - var.padding = 0; - - kvm_set_segment(vcpu, &var, seg); - return; -} + u8 opcodes[4]; + unsigned long rip = kvm_rip_read(vcpu); + unsigned long rip_linear; -static u16 emulator_get_segment_selector(int seg, struct kvm_vcpu *vcpu) -{ - struct kvm_segment kvm_seg; + if (!printk_ratelimit()) + return; - kvm_get_segment(vcpu, &kvm_seg, seg); - return kvm_seg.selector; -} + rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); -static void emulator_set_segment_selector(u16 sel, int seg, - struct kvm_vcpu *vcpu) -{ - struct kvm_segment kvm_seg; + kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL); - kvm_get_segment(vcpu, &kvm_seg, seg); - kvm_seg.selector = sel; - kvm_set_segment(vcpu, &kvm_seg, seg); + printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", + context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); } +EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); static struct x86_emulate_ops emulate_ops = { .read_std = kvm_read_guest_virt_system, - .write_std = kvm_write_guest_virt_system, .fetch = kvm_fetch_guest_virt, .read_emulated 
= emulator_read_emulated, .write_emulated = emulator_write_emulated, .cmpxchg_emulated = emulator_cmpxchg_emulated, - .pio_in_emulated = emulator_pio_in_emulated, - .pio_out_emulated = emulator_pio_out_emulated, - .get_cached_descriptor = emulator_get_cached_descriptor, - .set_cached_descriptor = emulator_set_cached_descriptor, - .get_segment_selector = emulator_get_segment_selector, - .set_segment_selector = emulator_set_segment_selector, - .get_cached_segment_base = emulator_get_cached_segment_base, - .get_gdt = emulator_get_gdt, - .get_idt = emulator_get_idt, - .get_cr = emulator_get_cr, - .set_cr = emulator_set_cr, - .cpl = emulator_get_cpl, - .get_dr = emulator_get_dr, - .set_dr = emulator_set_dr, - .set_msr = kvm_set_msr, - .get_msr = kvm_get_msr, }; static void cache_all_regs(struct kvm_vcpu *vcpu) @@ -4291,134 +3403,14 @@ static void cache_all_regs(struct kvm_vcpu *vcpu) vcpu->arch.regs_dirty = ~0; } -static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) +int emulate_instruction(struct kvm_vcpu *vcpu, + unsigned long cr2, + u16 error_code, + int emulation_type) { - u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask); - /* - * an sti; sti; sequence only disable interrupts for the first - * instruction. So, if the last instruction, be it emulated or - * not, left the system with the INT_STI flag enabled, it - * means that the last instruction is an sti. We should not - * leave the flag on in this case. The same goes for mov ss - */ - if (!(int_shadow & mask)) - kvm_x86_ops->set_interrupt_shadow(vcpu, mask); -} - -static void inject_emulated_exception(struct kvm_vcpu *vcpu) -{ - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; - if (ctxt->exception.vector == PF_VECTOR) - kvm_propagate_fault(vcpu, &ctxt->exception); - else if (ctxt->exception.error_code_valid) - kvm_queue_exception_e(vcpu, ctxt->exception.vector, - ctxt->exception.error_code); - else - kvm_queue_exception(vcpu, ctxt->exception.vector); -} - -static void init_emulate_ctxt(struct kvm_vcpu *vcpu) -{ - struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; - int cs_db, cs_l; - - cache_all_regs(vcpu); - - kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - - vcpu->arch.emulate_ctxt.vcpu = vcpu; - vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); - vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); - vcpu->arch.emulate_ctxt.mode = - (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : - (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) - ? X86EMUL_MODE_VM86 : cs_l - ? X86EMUL_MODE_PROT64 : cs_db - ? 
X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - memset(c, 0, sizeof(struct decode_cache)); - memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); -} - -int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq) -{ - struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; - int ret; - - init_emulate_ctxt(vcpu); - - vcpu->arch.emulate_ctxt.decode.op_bytes = 2; - vcpu->arch.emulate_ctxt.decode.ad_bytes = 2; - vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip; - ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq); - - if (ret != X86EMUL_CONTINUE) - return EMULATE_FAIL; - - vcpu->arch.emulate_ctxt.eip = c->eip; - memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); - kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); - kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); - - if (irq == NMI_VECTOR) - vcpu->arch.nmi_pending = false; - else - vcpu->arch.interrupt.pending = false; - - return EMULATE_DONE; -} -EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt); - -static int handle_emulation_failure(struct kvm_vcpu *vcpu) -{ - int r = EMULATE_DONE; - - ++vcpu->stat.insn_emulation_fail; - trace_kvm_emulate_insn_failed(vcpu); - if (!is_guest_mode(vcpu)) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; - r = EMULATE_FAIL; - } - kvm_queue_exception(vcpu, UD_VECTOR); - - return r; -} - -static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) -{ - gpa_t gpa; - - if (tdp_enabled) - return false; - - /* - * if emulation was due to access to shadowed page table - * and it failed try to unshadow page and re-entetr the - * guest to let CPU execute the instruction. - */ - if (kvm_mmu_unprotect_page_virt(vcpu, gva)) - return true; - - gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL); - - if (gpa == UNMAPPED_GVA) - return true; /* let cpu generate fault */ - - if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT))) - return true; - - return false; -} - -int x86_emulate_instruction(struct kvm_vcpu *vcpu, - unsigned long cr2, - int emulation_type, - void *insn, - int insn_len) -{ - int r; - struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; + int r, shadow_mask; + struct decode_cache *c; + struct kvm_run *run = vcpu->run; kvm_clear_exception_queue(vcpu); vcpu->arch.mmio_fault_cr2 = cr2; @@ -4430,20 +3422,27 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, */ cache_all_regs(vcpu); + vcpu->mmio_is_write = 0; + vcpu->arch.pio.string = 0; + if (!(emulation_type & EMULTYPE_NO_DECODE)) { - init_emulate_ctxt(vcpu); - vcpu->arch.emulate_ctxt.interruptibility = 0; - vcpu->arch.emulate_ctxt.have_exception = false; - vcpu->arch.emulate_ctxt.perm_ok = false; + int cs_db, cs_l; + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len); - if (r == X86EMUL_PROPAGATE_FAULT) - goto done; + vcpu->arch.emulate_ctxt.vcpu = vcpu; + vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu); + vcpu->arch.emulate_ctxt.mode = + (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : + (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) + ? X86EMUL_MODE_VM86 : cs_l + ? X86EMUL_MODE_PROT64 : cs_db + ? 
X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - trace_kvm_emulate_insn_start(vcpu); + r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); /* Only allow emulation of specific instructions on #UD * (namely VMMCALL, sysenter, sysexit, syscall)*/ + c = &vcpu->arch.emulate_ctxt.decode; if (emulation_type & EMULTYPE_TRAP_UD) { if (!c->twobyte) return EMULATE_FAIL; @@ -4471,11 +3470,10 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, ++vcpu->stat.insn_emulation; if (r) { - if (reexecute_instruction(vcpu, cr2)) + ++vcpu->stat.insn_emulation_fail; + if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) return EMULATE_DONE; - if (emulation_type & EMULTYPE_SKIP) - return EMULATE_FAIL; - return handle_emulation_failure(vcpu); + return EMULATE_FAIL; } } @@ -4484,74 +3482,245 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, return EMULATE_DONE; } - /* this is needed for vmware backdor interface to work since it - changes registers values during IO operation */ - memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); + r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); + shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; + + if (r == 0) + kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); + + if (vcpu->arch.pio.string) + return EMULATE_DO_MMIO; -restart: - r = x86_emulate_insn(&vcpu->arch.emulate_ctxt); + if ((r || vcpu->mmio_is_write) && run) { + run->exit_reason = KVM_EXIT_MMIO; + run->mmio.phys_addr = vcpu->mmio_phys_addr; + memcpy(run->mmio.data, vcpu->mmio_data, 8); + run->mmio.len = vcpu->mmio_size; + run->mmio.is_write = vcpu->mmio_is_write; + } - if (r == EMULATION_FAILED) { - if (reexecute_instruction(vcpu, cr2)) + if (r) { + if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) return EMULATE_DONE; + if (!vcpu->mmio_needed) { + kvm_report_emulation_failure(vcpu, "mmio"); + return EMULATE_FAIL; + } + return EMULATE_DO_MMIO; + } + + kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + + if (vcpu->mmio_is_write) { + vcpu->mmio_needed = 0; + return EMULATE_DO_MMIO; + } + + return EMULATE_DONE; +} +EXPORT_SYMBOL_GPL(emulate_instruction); + +static int pio_copy_data(struct kvm_vcpu *vcpu) +{ + void *p = vcpu->arch.pio_data; + gva_t q = vcpu->arch.pio.guest_gva; + unsigned bytes; + int ret; + u32 error_code; - return handle_emulation_failure(vcpu); - } - -done: - if (vcpu->arch.emulate_ctxt.have_exception) { - inject_emulated_exception(vcpu); - r = EMULATE_DONE; - } else if (vcpu->arch.pio.count) { - if (!vcpu->arch.pio.in) - vcpu->arch.pio.count = 0; - r = EMULATE_DO_MMIO; - } else if (vcpu->mmio_needed) { - if (vcpu->mmio_is_write) - vcpu->mmio_needed = 0; - r = EMULATE_DO_MMIO; - } else if (r == EMULATION_RESTART) - goto restart; + bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; + if (vcpu->arch.pio.in) + ret = kvm_write_guest_virt(q, p, bytes, vcpu, &error_code); else - r = EMULATE_DONE; + ret = kvm_read_guest_virt(q, p, bytes, vcpu, &error_code); + + if (ret == X86EMUL_PROPAGATE_FAULT) + kvm_inject_page_fault(vcpu, q, error_code); + + return ret; +} + +int complete_pio(struct kvm_vcpu *vcpu) +{ + struct kvm_pio_request *io = &vcpu->arch.pio; + long delta; + int r; + unsigned long val; - toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); - kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); - kvm_make_request(KVM_REQ_EVENT, vcpu); - memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); - kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); + if (!io->string) { + if (io->in) { + val = kvm_register_read(vcpu, VCPU_REGS_RAX); + memcpy(&val, 
vcpu->arch.pio_data, io->size); + kvm_register_write(vcpu, VCPU_REGS_RAX, val); + } + } else { + if (io->in) { + r = pio_copy_data(vcpu); + if (r) + goto out; + } + delta = 1; + if (io->rep) { + delta *= io->cur_count; + /* + * The size of the register should really depend on + * current address size. + */ + val = kvm_register_read(vcpu, VCPU_REGS_RCX); + val -= delta; + kvm_register_write(vcpu, VCPU_REGS_RCX, val); + } + if (io->down) + delta = -delta; + delta *= io->size; + if (io->in) { + val = kvm_register_read(vcpu, VCPU_REGS_RDI); + val += delta; + kvm_register_write(vcpu, VCPU_REGS_RDI, val); + } else { + val = kvm_register_read(vcpu, VCPU_REGS_RSI); + val += delta; + kvm_register_write(vcpu, VCPU_REGS_RSI, val); + } + } +out: + io->count -= io->cur_count; + io->cur_count = 0; + + return 0; +} + +static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) +{ + /* TODO: String I/O for in kernel device */ + int r; + + if (vcpu->arch.pio.in) + r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port, + vcpu->arch.pio.size, pd); + else + r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, + vcpu->arch.pio.port, vcpu->arch.pio.size, + pd); return r; } -EXPORT_SYMBOL_GPL(x86_emulate_instruction); -int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) +static int pio_string_write(struct kvm_vcpu *vcpu) { - unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX); - int ret = emulator_pio_out_emulated(size, port, &val, 1, vcpu); - /* do not return to emulator after return from userspace */ - vcpu->arch.pio.count = 0; - return ret; + struct kvm_pio_request *io = &vcpu->arch.pio; + void *pd = vcpu->arch.pio_data; + int i, r = 0; + + for (i = 0; i < io->cur_count; i++) { + if (kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, + io->port, io->size, pd)) { + r = -EOPNOTSUPP; + break; + } + pd += io->size; + } + return r; } -EXPORT_SYMBOL_GPL(kvm_fast_pio_out); -static void tsc_bad(void *info) +int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port) { - kvm___this_cpu_write(cpu_tsc_khz, 0); + unsigned long val; + + trace_kvm_pio(!in, port, size, 1); + + vcpu->run->exit_reason = KVM_EXIT_IO; + vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; + vcpu->run->io.size = vcpu->arch.pio.size = size; + vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; + vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1; + vcpu->run->io.port = vcpu->arch.pio.port = port; + vcpu->arch.pio.in = in; + vcpu->arch.pio.string = 0; + vcpu->arch.pio.down = 0; + vcpu->arch.pio.rep = 0; + + if (!vcpu->arch.pio.in) { + val = kvm_register_read(vcpu, VCPU_REGS_RAX); + memcpy(vcpu->arch.pio_data, &val, 4); + } + + if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { + complete_pio(vcpu); + return 1; + } + return 0; } +EXPORT_SYMBOL_GPL(kvm_emulate_pio); -static void tsc_khz_changed(void *data) +int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in, + int size, unsigned long count, int down, + gva_t address, int rep, unsigned port) { - struct cpufreq_freqs *freq = data; - unsigned long khz = 0; + unsigned now, in_page; + int ret = 0; + + trace_kvm_pio(!in, port, size, count); - if (data) - khz = freq->new; - else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) - khz = cpufreq_quick_get(raw_smp_processor_id()); - if (!khz) - khz = tsc_khz; - kvm___this_cpu_write(cpu_tsc_khz, khz); + vcpu->run->exit_reason = KVM_EXIT_IO; + vcpu->run->io.direction = in ? 
KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; + vcpu->run->io.size = vcpu->arch.pio.size = size; + vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; + vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count; + vcpu->run->io.port = vcpu->arch.pio.port = port; + vcpu->arch.pio.in = in; + vcpu->arch.pio.string = 1; + vcpu->arch.pio.down = down; + vcpu->arch.pio.rep = rep; + + if (!count) { + kvm_x86_ops->skip_emulated_instruction(vcpu); + return 1; + } + + if (!down) + in_page = PAGE_SIZE - offset_in_page(address); + else + in_page = offset_in_page(address) + size; + now = min(count, (unsigned long)in_page / size); + if (!now) + now = 1; + if (down) { + /* + * String I/O in reverse. Yuck. Kill the guest, fix later. + */ + pr_unimpl(vcpu, "guest string pio down\n"); + kvm_inject_gp(vcpu, 0); + return 1; + } + vcpu->run->io.count = now; + vcpu->arch.pio.cur_count = now; + + if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count) + kvm_x86_ops->skip_emulated_instruction(vcpu); + + vcpu->arch.pio.guest_gva = address; + + if (!vcpu->arch.pio.in) { + /* string PIO write */ + ret = pio_copy_data(vcpu); + if (ret == X86EMUL_PROPAGATE_FAULT) + return 1; + if (ret == 0 && !pio_string_write(vcpu)) { + complete_pio(vcpu); + if (vcpu->arch.pio.count == 0) + ret = 1; + } + } + /* no string PIO read support yet */ + + return ret; +} +EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); + +static void bounce_off(void *info) +{ + /* nothing */ } static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, @@ -4562,60 +3731,21 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va struct kvm_vcpu *vcpu; int i, send_ipi = 0; - /* - * We allow guests to temporarily run on slowing clocks, - * provided we notify them after, or to run on accelerating - * clocks, provided we notify them before. Thus time never - * goes backwards. - * - * However, we have a problem. We can't atomically update - * the frequency of a given CPU from this function; it is - * merely a notifier, which can be called from any CPU. - * Changing the TSC frequency at arbitrary points in time - * requires a recomputation of local variables related to - * the TSC for each VCPU. We must flag these local variables - * to be updated and be sure the update takes place with the - * new frequency before any guests proceed. - * - * Unfortunately, the combination of hotplug CPU and frequency - * change creates an intractable locking scenario; the order - * of when these callouts happen is undefined with respect to - * CPU hotplug, and they can race with each other. As such, - * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is - * undefined; you can actually have a CPU frequency change take - * place in between the computation of X and the setting of the - * variable. To protect against this problem, all updates of - * the per_cpu tsc_khz variable are done in an interrupt - * protected IPI, and all callers wishing to update the value - * must wait for a synchronous IPI to complete (which is trivial - * if the caller is on the CPU already). This establishes the - * necessary total order on variable updates. - * - * Note that because a guest time update may take place - * anytime after the setting of the VCPU's request bit, the - * correct TSC value must be set before the request. 
However, - * to ensure the update actually makes it to any guest which - * starts running in hardware virtualization between the set - * and the acquisition of the spinlock, we must also ping the - * CPU after setting the request bit. - * - */ - if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) return 0; if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) return 0; - - smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); + per_cpu(cpu_tsc_khz, freq->cpu) = freq->new; spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { kvm_for_each_vcpu(i, vcpu, kvm) { if (vcpu->cpu != freq->cpu) continue; - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + if (!kvm_request_guest_time_update(vcpu)) + continue; if (vcpu->cpu != smp_processor_id()) - send_ipi = 1; + send_ipi++; } } spin_unlock(&kvm_lock); @@ -4633,106 +3763,34 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va * guest context is entered kvmclock will be updated, * so the guest will not see stale values. */ - smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); + smp_call_function_single(freq->cpu, bounce_off, NULL, 1); } return 0; } static struct notifier_block kvmclock_cpufreq_notifier_block = { - .notifier_call = kvmclock_cpufreq_notifier -}; - -static int kvmclock_cpu_notifier(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - - switch (action) { - case CPU_ONLINE: - case CPU_DOWN_FAILED: - smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); - break; - case CPU_DOWN_PREPARE: - smp_call_function_single(cpu, tsc_bad, NULL, 1); - break; - } - return NOTIFY_OK; -} - -static struct notifier_block kvmclock_cpu_notifier_block = { - .notifier_call = kvmclock_cpu_notifier, - .priority = -INT_MAX + .notifier_call = kvmclock_cpufreq_notifier }; static void kvm_timer_init(void) { int cpu; - max_tsc_khz = tsc_khz; - register_hotcpu_notifier(&kvmclock_cpu_notifier_block); if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { -#ifdef CONFIG_CPU_FREQ - struct cpufreq_policy policy; - memset(&policy, 0, sizeof(policy)); - cpu = get_cpu(); - cpufreq_get_policy(&policy, cpu); - if (policy.cpuinfo.max_freq) - max_tsc_khz = policy.cpuinfo.max_freq; - put_cpu(); -#endif cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); + for_each_online_cpu(cpu) { + unsigned long khz = cpufreq_get(cpu); + if (!khz) + khz = tsc_khz; + per_cpu(cpu_tsc_khz, cpu) = khz; + } + } else { + for_each_possible_cpu(cpu) + per_cpu(cpu_tsc_khz, cpu) = tsc_khz; } - pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz); - for_each_online_cpu(cpu) - smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); -} - -static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); - -static int kvm_is_in_guest(void) -{ - return percpu_read(current_vcpu) != NULL; } -static int kvm_is_user_mode(void) -{ - int user_mode = 3; - - if (percpu_read(current_vcpu)) - user_mode = kvm_x86_ops->get_cpl(percpu_read(current_vcpu)); - - return user_mode != 0; -} - -static unsigned long kvm_get_guest_ip(void) -{ - unsigned long ip = 0; - - if (percpu_read(current_vcpu)) - ip = kvm_rip_read(percpu_read(current_vcpu)); - - return ip; -} - -static struct perf_guest_info_callbacks kvm_guest_cbs = { - .is_in_guest = kvm_is_in_guest, - .is_user_mode = kvm_is_user_mode, - .get_guest_ip = kvm_get_guest_ip, -}; - -void kvm_before_handle_nmi(struct kvm_vcpu *vcpu) -{ - percpu_write(current_vcpu, vcpu); -} -EXPORT_SYMBOL_GPL(kvm_before_handle_nmi); - -void 
kvm_after_handle_nmi(struct kvm_vcpu *vcpu) -{ - percpu_write(current_vcpu, NULL); -} -EXPORT_SYMBOL_GPL(kvm_after_handle_nmi); - int kvm_arch_init(void *opaque) { int r; @@ -4751,10 +3809,6 @@ int kvm_arch_init(void *opaque) } if (ops->disabled_by_bios()) { printk(KERN_ERR "kvm: disabled by bios\n"); -#ifndef KVM_TBOOT_ENABLED_WORKS - - printk(KERN_ERR "kvm: if TXT is enabled in the BIOS, disable it\n"); -#endif r = -EOPNOTSUPP; goto out; } @@ -4765,20 +3819,14 @@ int kvm_arch_init(void *opaque) kvm_init_msr_list(); - kvm_xstate_size_init(); - kvm_x86_ops = ops; kvm_mmu_set_nonpresent_ptes(0ull, 0ull); + kvm_mmu_set_base_ptes(PT_PRESENT_MASK); kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0); kvm_timer_init(); - perf_register_guest_info_callbacks(&kvm_guest_cbs); - - if (kvm_cpu_has_xsave) - host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - return 0; out: @@ -4787,12 +3835,9 @@ out: void kvm_arch_exit(void) { - perf_unregister_guest_info_callbacks(&kvm_guest_cbs); - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); - unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block); kvm_x86_ops = NULL; kvm_mmu_module_exit(); } @@ -4942,23 +3987,88 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu) kvm_x86_ops->patch_hypercall(vcpu, instruction); - return emulator_write_emulated(rip, instruction, 3, NULL, vcpu); + return emulator_write_emulated(rip, instruction, 3, vcpu); +} + +static u64 mk_cr_64(u64 curr_cr, u32 new_val) +{ + return (curr_cr & ~((1ULL << 32) - 1)) | new_val; } void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) { - struct kvm_desc_ptr dt = { limit, base }; + struct descriptor_table dt = { limit, base }; kvm_x86_ops->set_gdt(vcpu, &dt); } void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) { - struct kvm_desc_ptr dt = { limit, base }; + struct descriptor_table dt = { limit, base }; kvm_x86_ops->set_idt(vcpu, &dt); } +void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, + unsigned long *rflags) +{ + kvm_lmsw(vcpu, msw); + *rflags = kvm_get_rflags(vcpu); +} + +unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) +{ + unsigned long value; + + switch (cr) { + case 0: + value = kvm_read_cr0(vcpu); + break; + case 2: + value = vcpu->arch.cr2; + break; + case 3: + value = vcpu->arch.cr3; + break; + case 4: + value = kvm_read_cr4(vcpu); + break; + case 8: + value = kvm_get_cr8(vcpu); + break; + default: + vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); + return 0; + } + + return value; +} + +void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, + unsigned long *rflags) +{ + switch (cr) { + case 0: + kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); + *rflags = kvm_get_rflags(vcpu); + break; + case 2: + vcpu->arch.cr2 = val; + break; + case 3: + kvm_set_cr3(vcpu, val); + break; + case 4: + kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); + break; + case 8: + kvm_set_cr8(vcpu, val & 0xfUL); + break; + default: + vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); + } +} + static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) { struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; @@ -5022,13 +4132,9 @@ int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0); - if (!best || best->eax < 0x80000008) - goto not_found; best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); if (best) 
return best->eax & 0xff; -not_found: return 36; } @@ -5109,10 +4215,10 @@ static void vapic_exit(struct kvm_vcpu *vcpu) if (!apic || !apic->vapic_addr) return; - idx = srcu_read_lock(&vcpu->kvm->srcu); + idx = kvm_srcu_read_lock(&vcpu->kvm->srcu); kvm_release_page_dirty(apic->vapic_page); mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); - srcu_read_unlock(&vcpu->kvm->srcu, idx); + kvm_srcu_read_unlock(&vcpu->kvm->srcu, idx); } static void update_cr8_intercept(struct kvm_vcpu *vcpu) @@ -5142,13 +4248,9 @@ static void inject_pending_event(struct kvm_vcpu *vcpu) { /* try to reinject previous events if any */ if (vcpu->arch.exception.pending) { - trace_kvm_inj_exception(vcpu->arch.exception.nr, - vcpu->arch.exception.has_error_code, - vcpu->arch.exception.error_code); kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, vcpu->arch.exception.has_error_code, - vcpu->arch.exception.error_code, - vcpu->arch.exception.reinject); + vcpu->arch.exception.error_code); return; } @@ -5178,84 +4280,44 @@ static void inject_pending_event(struct kvm_vcpu *vcpu) } } -static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) -{ - if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && - !vcpu->guest_xcr0_loaded) { - /* kvm_set_xcr() also depends on this */ - xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); - vcpu->guest_xcr0_loaded = 1; - } -} - -static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) -{ - if (vcpu->guest_xcr0_loaded) { - if (vcpu->arch.xcr0 != host_xcr0) - xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); - vcpu->guest_xcr0_loaded = 0; - } -} - static int vcpu_enter_guest(struct kvm_vcpu *vcpu) { int r; bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && vcpu->run->request_interrupt_window; - if (vcpu->requests) { - if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) + if (vcpu->requests) + if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) kvm_mmu_unload(vcpu); - if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) + + r = kvm_mmu_reload(vcpu); + if (unlikely(r)) + goto out; + + if (vcpu->requests) { + if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) __kvm_migrate_timers(vcpu); - if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { - r = kvm_guest_time_update(vcpu); - if (unlikely(r)) - goto out; - } - if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) + if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests)) + kvm_write_guest_time(vcpu); + if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) kvm_mmu_sync_roots(vcpu); - if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) + if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) kvm_x86_ops->tlb_flush(vcpu); - if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { + if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, + &vcpu->requests)) { vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; r = 0; goto out; } - if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { + if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; r = 0; goto out; } - if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) { + if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) { vcpu->fpu_active = 0; kvm_x86_ops->fpu_deactivate(vcpu); } - if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) { - /* Page is swapped out. 
Do synthetic halt */ - vcpu->arch.apf.halted = true; - r = 1; - goto out; - } - } - - r = kvm_mmu_reload(vcpu); - if (unlikely(r)) - goto out; - - if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { - inject_pending_event(vcpu); - - /* enable NMI/IRQ window open exits if needed */ - if (vcpu->arch.nmi_pending) - kvm_x86_ops->enable_nmi_window(vcpu); - else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) - kvm_x86_ops->enable_irq_window(vcpu); - - if (kvm_lapic_enabled(vcpu)) { - update_cr8_intercept(vcpu); - kvm_lapic_sync_to_vapic(vcpu); - } } preempt_disable(); @@ -5263,25 +4325,34 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_x86_ops->prepare_guest_switch(vcpu); if (vcpu->fpu_active) kvm_load_guest_fpu(vcpu); - kvm_load_guest_xcr0(vcpu); - - atomic_set(&vcpu->guest_mode, 1); - smp_wmb(); local_irq_disable(); - if (!atomic_read(&vcpu->guest_mode) || vcpu->requests - || need_resched() || signal_pending(current)) { - atomic_set(&vcpu->guest_mode, 0); - smp_wmb(); + clear_bit(KVM_REQ_KICK, &vcpu->requests); + smp_mb__after_clear_bit(); + + if (vcpu->requests || need_resched() || signal_pending(current)) { + set_bit(KVM_REQ_KICK, &vcpu->requests); local_irq_enable(); preempt_enable(); - kvm_x86_ops->cancel_injection(vcpu); r = 1; goto out; } - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + inject_pending_event(vcpu); + + /* enable NMI/IRQ window open exits if needed */ + if (vcpu->arch.nmi_pending) + kvm_x86_ops->enable_nmi_window(vcpu); + else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) + kvm_x86_ops->enable_irq_window(vcpu); + + if (kvm_lapic_enabled(vcpu)) { + update_cr8_intercept(vcpu); + kvm_lapic_sync_to_vapic(vcpu); + } + + kvm_srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); kvm_guest_enter(); @@ -5306,10 +4377,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (hw_breakpoint_active()) hw_breakpoint_restore(); - kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); - - atomic_set(&vcpu->guest_mode, 0); - smp_wmb(); + set_bit(KVM_REQ_KICK, &vcpu->requests); local_irq_enable(); ++vcpu->stat.exits; @@ -5326,7 +4394,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) preempt_enable(); - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + vcpu->srcu_idx = kvm_srcu_read_lock(&vcpu->kvm->srcu); /* * Profile KVM exit RIPs: @@ -5360,26 +4428,24 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } - vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); + vcpu->srcu_idx = kvm_srcu_read_lock(&kvm->srcu); vapic_enter(vcpu); r = 1; while (r > 0) { - if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && - !vcpu->arch.apf.halted) + if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) r = vcpu_enter_guest(vcpu); else { - srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); + kvm_srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_vcpu_block(vcpu); - vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); - if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) + vcpu->srcu_idx = kvm_srcu_read_lock(&kvm->srcu); + if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) { switch(vcpu->arch.mp_state) { case KVM_MP_STATE_HALTED: vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; case KVM_MP_STATE_RUNNABLE: - vcpu->arch.apf.halted = false; break; case KVM_MP_STATE_SIPI_RECEIVED: default: @@ -5401,22 +4467,20 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) vcpu->run->exit_reason = KVM_EXIT_INTR; ++vcpu->stat.request_irq_exits; } - - kvm_check_async_pf_completion(vcpu); - if (signal_pending(current)) { r = -EINTR; vcpu->run->exit_reason = KVM_EXIT_INTR; 
++vcpu->stat.signal_exits; } if (need_resched()) { - srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); + kvm_srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_resched(vcpu); - vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); + vcpu->srcu_idx = kvm_srcu_read_lock(&kvm->srcu); } } - srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); + kvm_srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); + post_kvm_run_save(vcpu); vapic_exit(vcpu); @@ -5428,8 +4492,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) int r; sigset_t sigsaved; - if (!tsk_used_math(current) && kvm_init_fpu(current)) - return -ENOMEM; + vcpu_load(vcpu); if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); @@ -5442,23 +4505,29 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } /* re-sync apic's tpr */ - if (!irqchip_in_kernel(vcpu->kvm)) { - if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) { - r = -EINVAL; + if (!irqchip_in_kernel(vcpu->kvm)) + kvm_set_cr8(vcpu, kvm_run->cr8); + + if (vcpu->arch.pio.cur_count) { + vcpu->srcu_idx = kvm_srcu_read_lock(&vcpu->kvm->srcu); + r = complete_pio(vcpu); + kvm_srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + if (r) goto out; - } } + if (vcpu->mmio_needed) { + memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); + vcpu->mmio_read_completed = 1; + vcpu->mmio_needed = 0; - if (vcpu->arch.pio.count || vcpu->mmio_needed) { - if (vcpu->mmio_needed) { - memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); - vcpu->mmio_read_completed = 1; - vcpu->mmio_needed = 0; - } - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - if (r != EMULATE_DONE) { + vcpu->srcu_idx = kvm_srcu_read_lock(&vcpu->kvm->srcu); + r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0, + EMULTYPE_NO_DECODE); + kvm_srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + if (r == EMULATE_DO_MMIO) { + /* + * Read-modify-write. Back to userspace. 
+ */ r = 0; goto out; } @@ -5470,15 +4539,17 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) r = __vcpu_run(vcpu); out: - post_kvm_run_save(vcpu); if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &sigsaved, NULL); + vcpu_put(vcpu); return r; } int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { + vcpu_load(vcpu); + regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); @@ -5501,11 +4572,15 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) regs->rip = kvm_rip_read(vcpu); regs->rflags = kvm_get_rflags(vcpu); + vcpu_put(vcpu); + return 0; } int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { + vcpu_load(vcpu); + kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); @@ -5530,11 +4605,17 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) vcpu->arch.exception.pending = false; - kvm_make_request(KVM_REQ_EVENT, vcpu); + vcpu_put(vcpu); return 0; } +void kvm_get_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + kvm_x86_ops->get_segment(vcpu, var, seg); +} + void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) { struct kvm_segment cs; @@ -5548,7 +4629,9 @@ EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { - struct kvm_desc_ptr dt; + struct descriptor_table dt; + + vcpu_load(vcpu); kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); @@ -5561,15 +4644,15 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); kvm_x86_ops->get_idt(vcpu, &dt); - sregs->idt.limit = dt.size; - sregs->idt.base = dt.address; + sregs->idt.limit = dt.limit; + sregs->idt.base = dt.base; kvm_x86_ops->get_gdt(vcpu, &dt); - sregs->gdt.limit = dt.size; - sregs->gdt.base = dt.address; + sregs->gdt.limit = dt.limit; + sregs->gdt.base = dt.base; sregs->cr0 = kvm_read_cr0(vcpu); sregs->cr2 = vcpu->arch.cr2; - sregs->cr3 = kvm_read_cr3(vcpu); + sregs->cr3 = vcpu->arch.cr3; sregs->cr4 = kvm_read_cr4(vcpu); sregs->cr8 = kvm_get_cr8(vcpu); sregs->efer = vcpu->arch.efer; @@ -5581,44 +4664,586 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, set_bit(vcpu->arch.interrupt.nr, (unsigned long *)sregs->interrupt_bitmap); + vcpu_put(vcpu); + return 0; } int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { + vcpu_load(vcpu); mp_state->mp_state = vcpu->arch.mp_state; + vcpu_put(vcpu); return 0; } int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { + vcpu_load(vcpu); vcpu->arch.mp_state = mp_state->mp_state; - kvm_make_request(KVM_REQ_EVENT, vcpu); + vcpu_put(vcpu); return 0; } -int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, - bool has_error_code, u32 error_code) +static void kvm_set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + kvm_x86_ops->set_segment(vcpu, var, seg); +} + +static void seg_desct_to_kvm_desct(struct kvm_desc_struct *seg_desc, u16 selector, + struct kvm_segment *kvm_desct) +{ + kvm_desct->base = kvm_get_desc_base(seg_desc); + kvm_desct->limit = kvm_get_desc_limit(seg_desc); + if (seg_desc->g) { + kvm_desct->limit <<= 12; + 
kvm_desct->limit |= 0xfff; + } + kvm_desct->selector = selector; + kvm_desct->type = seg_desc->type; + kvm_desct->present = seg_desc->p; + kvm_desct->dpl = seg_desc->dpl; + kvm_desct->db = seg_desc->d; + kvm_desct->s = seg_desc->s; + kvm_desct->l = seg_desc->l; + kvm_desct->g = seg_desc->g; + kvm_desct->avl = seg_desc->avl; + if (!selector) + kvm_desct->unusable = 1; + else + kvm_desct->unusable = 0; + kvm_desct->padding = 0; +} + +static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu, + u16 selector, + struct descriptor_table *dtable) +{ + if (selector & 1 << 2) { + struct kvm_segment kvm_seg; + + kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR); + + if (kvm_seg.unusable) + dtable->limit = 0; + else + dtable->limit = kvm_seg.limit; + dtable->base = kvm_seg.base; + } + else + kvm_x86_ops->get_gdt(vcpu, dtable); +} + +/* allowed just for 8 bytes segments */ +static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, + struct kvm_desc_struct *seg_desc) { - struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; + struct descriptor_table dtable; + u16 index = selector >> 3; int ret; + u32 err; + gva_t addr; + + get_segment_descriptor_dtable(vcpu, selector, &dtable); + + if (dtable.limit < index * 8 + 7) { + kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); + return X86EMUL_PROPAGATE_FAULT; + } + addr = dtable.base + index * 8; + ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof(*seg_desc), + vcpu, &err); + if (ret == X86EMUL_PROPAGATE_FAULT) + kvm_inject_page_fault(vcpu, addr, err); - init_emulate_ctxt(vcpu); + return ret; +} - ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, - tss_selector, reason, has_error_code, - error_code); +/* allowed just for 8 bytes segments */ +static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, + struct kvm_desc_struct *seg_desc) +{ + struct descriptor_table dtable; + u16 index = selector >> 3; + + get_segment_descriptor_dtable(vcpu, selector, &dtable); + + if (dtable.limit < index * 8 + 7) + return 1; + return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu, NULL); +} +static gpa_t get_tss_base_addr_write(struct kvm_vcpu *vcpu, + struct kvm_desc_struct *seg_desc) +{ + u32 base_addr = kvm_get_desc_base(seg_desc); + + return kvm_mmu_gva_to_gpa_write(vcpu, base_addr, NULL); +} + +static gpa_t get_tss_base_addr_read(struct kvm_vcpu *vcpu, + struct kvm_desc_struct *seg_desc) +{ + u32 base_addr = kvm_get_desc_base(seg_desc); + + return kvm_mmu_gva_to_gpa_read(vcpu, base_addr, NULL); +} + +static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) +{ + struct kvm_segment kvm_seg; + + kvm_get_segment(vcpu, &kvm_seg, seg); + return kvm_seg.selector; +} + +static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) +{ + struct kvm_segment segvar = { + .base = selector << 4, + .limit = 0xffff, + .selector = selector, + .type = 3, + .present = 1, + .dpl = 3, + .db = 0, + .s = 1, + .l = 0, + .g = 0, + .avl = 0, + .unusable = 0, + }; + kvm_x86_ops->set_segment(vcpu, &segvar, seg); + return X86EMUL_CONTINUE; +} + +static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg) +{ + return (seg != VCPU_SREG_LDTR) && + (seg != VCPU_SREG_TR) && + (kvm_get_rflags(vcpu) & X86_EFLAGS_VM); +} + +int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg) +{ + struct kvm_segment kvm_seg; + struct kvm_desc_struct seg_desc; + u8 dpl, rpl, cpl; + unsigned err_vec = GP_VECTOR; + u32 err_code = 0; + bool null_selector = !(selector & ~0x3); /* 
0000-0003 are null */ + int ret; + + if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu)) + return kvm_load_realmode_segment(vcpu, selector, seg); + + /* NULL selector is not valid for TR, CS and SS */ + if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR) + && null_selector) + goto exception; + + /* TR should be in GDT only */ + if (seg == VCPU_SREG_TR && (selector & (1 << 2))) + goto exception; + + ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc); if (ret) - return EMULATE_FAIL; + return ret; - memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); - kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); - kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); - kvm_make_request(KVM_REQ_EVENT, vcpu); - return EMULATE_DONE; + seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg); + + if (null_selector) { /* for NULL selector skip all following checks */ + kvm_seg.unusable = 1; + goto load; + } + + err_code = selector & 0xfffc; + err_vec = GP_VECTOR; + + /* can't load system descriptor into segment selecor */ + if (seg <= VCPU_SREG_GS && !kvm_seg.s) + goto exception; + + if (!kvm_seg.present) { + err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR; + goto exception; + } + + rpl = selector & 3; + dpl = kvm_seg.dpl; + cpl = kvm_x86_ops->get_cpl(vcpu); + + switch (seg) { + case VCPU_SREG_SS: + /* + * segment is not a writable data segment or segment + * selector's RPL != CPL or segment selector's RPL != CPL + */ + if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl) + goto exception; + break; + case VCPU_SREG_CS: + if (!(kvm_seg.type & 8)) + goto exception; + + if (kvm_seg.type & 4) { + /* conforming */ + if (dpl > cpl) + goto exception; + } else { + /* nonconforming */ + if (rpl > cpl || dpl != cpl) + goto exception; + } + /* CS(RPL) <- CPL */ + selector = (selector & 0xfffc) | cpl; + break; + case VCPU_SREG_TR: + if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9)) + goto exception; + break; + case VCPU_SREG_LDTR: + if (kvm_seg.s || kvm_seg.type != 2) + goto exception; + break; + default: /* DS, ES, FS, or GS */ + /* + * segment is not a data or readable code segment or + * ((segment is a data or nonconforming code segment) + * and (both RPL and CPL > DPL)) + */ + if ((kvm_seg.type & 0xa) == 0x8 || + (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl))) + goto exception; + break; + } + + if (!kvm_seg.unusable && kvm_seg.s) { + /* mark segment as accessed */ + kvm_seg.type |= 1; + seg_desc.type |= 1; + save_guest_segment_descriptor(vcpu, selector, &seg_desc); + } +load: + kvm_set_segment(vcpu, &kvm_seg, seg); + return X86EMUL_CONTINUE; +exception: + kvm_queue_exception_e(vcpu, err_vec, err_code); + return X86EMUL_PROPAGATE_FAULT; +} + +static void save_state_to_tss32(struct kvm_vcpu *vcpu, + struct tss_segment_32 *tss) +{ + tss->cr3 = vcpu->arch.cr3; + tss->eip = kvm_rip_read(vcpu); + tss->eflags = kvm_get_rflags(vcpu); + tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX); + tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); + tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX); + tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX); + tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP); + tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP); + tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI); + tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI); + tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); + tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); + tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); + tss->ds = get_segment_selector(vcpu, 
VCPU_SREG_DS); + tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS); + tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS); + tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); +} + +static void kvm_load_segment_selector(struct kvm_vcpu *vcpu, u16 sel, int seg) +{ + struct kvm_segment kvm_seg; + kvm_get_segment(vcpu, &kvm_seg, seg); + kvm_seg.selector = sel; + kvm_set_segment(vcpu, &kvm_seg, seg); +} + +static int load_state_from_tss32(struct kvm_vcpu *vcpu, + struct tss_segment_32 *tss) +{ + kvm_set_cr3(vcpu, tss->cr3); + + kvm_rip_write(vcpu, tss->eip); + kvm_set_rflags(vcpu, tss->eflags | 2); + + kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax); + kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx); + kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx); + kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx); + kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp); + kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp); + kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi); + kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi); + + /* + * SDM says that segment selectors are loaded before segment + * descriptors + */ + kvm_load_segment_selector(vcpu, tss->ldt_selector, VCPU_SREG_LDTR); + kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES); + kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS); + kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS); + kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS); + kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS); + kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS); + + /* + * Now load segment descriptors. If fault happenes at this stage + * it is handled in a context of new task + */ + if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, VCPU_SREG_LDTR)) + return 1; + + if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES)) + return 1; + + if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS)) + return 1; + + if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS)) + return 1; + + if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS)) + return 1; + + if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS)) + return 1; + + if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS)) + return 1; + return 0; +} + +static void save_state_to_tss16(struct kvm_vcpu *vcpu, + struct tss_segment_16 *tss) +{ + tss->ip = kvm_rip_read(vcpu); + tss->flag = kvm_get_rflags(vcpu); + tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX); + tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX); + tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX); + tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX); + tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP); + tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP); + tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI); + tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI); + + tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); + tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); + tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); + tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); + tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR); +} + +static int load_state_from_tss16(struct kvm_vcpu *vcpu, + struct tss_segment_16 *tss) +{ + kvm_rip_write(vcpu, tss->ip); + kvm_set_rflags(vcpu, tss->flag | 2); + kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax); + kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx); + kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx); + kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx); + kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp); + kvm_register_write(vcpu, 
VCPU_REGS_RBP, tss->bp); + kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si); + kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di); + + /* + * SDM says that segment selectors are loaded before segment + * descriptors + */ + kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR); + kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES); + kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS); + kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS); + kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS); + + /* + * Now load segment descriptors. If fault happenes at this stage + * it is handled in a context of new task + */ + if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR)) + return 1; + + if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES)) + return 1; + + if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS)) + return 1; + + if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS)) + return 1; + + if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS)) + return 1; + return 0; +} + +static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, + u16 old_tss_sel, u32 old_tss_base, + struct kvm_desc_struct *nseg_desc) +{ + struct tss_segment_16 tss_segment_16; + int ret = 0; + + if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16, + sizeof tss_segment_16)) + goto out; + + save_state_to_tss16(vcpu, &tss_segment_16); + + if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16, + sizeof tss_segment_16)) + goto out; + + if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc), + &tss_segment_16, sizeof tss_segment_16)) + goto out; + + if (old_tss_sel != 0xffff) { + tss_segment_16.prev_task_link = old_tss_sel; + + if (kvm_write_guest(vcpu->kvm, + get_tss_base_addr_write(vcpu, nseg_desc), + &tss_segment_16.prev_task_link, + sizeof tss_segment_16.prev_task_link)) + goto out; + } + + if (load_state_from_tss16(vcpu, &tss_segment_16)) + goto out; + + ret = 1; +out: + return ret; +} + +static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, + u16 old_tss_sel, u32 old_tss_base, + struct kvm_desc_struct *nseg_desc) +{ + struct tss_segment_32 tss_segment_32; + int ret = 0; + + if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32, + sizeof tss_segment_32)) + goto out; + + save_state_to_tss32(vcpu, &tss_segment_32); + + if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32, + sizeof tss_segment_32)) + goto out; + + if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc), + &tss_segment_32, sizeof tss_segment_32)) + goto out; + + if (old_tss_sel != 0xffff) { + tss_segment_32.prev_task_link = old_tss_sel; + + if (kvm_write_guest(vcpu->kvm, + get_tss_base_addr_write(vcpu, nseg_desc), + &tss_segment_32.prev_task_link, + sizeof tss_segment_32.prev_task_link)) + goto out; + } + + if (load_state_from_tss32(vcpu, &tss_segment_32)) + goto out; + + ret = 1; +out: + return ret; +} + +int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) +{ + struct kvm_segment tr_seg; + struct kvm_desc_struct cseg_desc; + struct kvm_desc_struct nseg_desc; + int ret = 0; + u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR); + u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR); + u32 desc_limit; + + old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL); + + /* FIXME: Handle errors. Failure to read either TSS or their + * descriptors should generate a pagefault. 
+ */ + if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc)) + goto out; + + if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc)) + goto out; + + if (reason != TASK_SWITCH_IRET) { + int cpl; + + cpl = kvm_x86_ops->get_cpl(vcpu); + if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) { + kvm_queue_exception_e(vcpu, GP_VECTOR, 0); + return 1; + } + } + + desc_limit = kvm_get_desc_limit(&nseg_desc); + if (!nseg_desc.p || + ((desc_limit < 0x67 && (nseg_desc.type & 8)) || + desc_limit < 0x2b)) { + kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc); + return 1; + } + + if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { + cseg_desc.type &= ~(1 << 1); //clear the B flag + save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc); + } + + if (reason == TASK_SWITCH_IRET) { + u32 eflags = kvm_get_rflags(vcpu); + kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT); + } + + /* set back link to prev task only if NT bit is set in eflags + note that old_tss_sel is not used afetr this point */ + if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) + old_tss_sel = 0xffff; + + if (nseg_desc.type & 8) + ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel, + old_tss_base, &nseg_desc); + else + ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel, + old_tss_base, &nseg_desc); + + if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { + u32 eflags = kvm_get_rflags(vcpu); + kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT); + } + + if (reason != TASK_SWITCH_IRET) { + nseg_desc.type |= (1 << 1); + save_guest_segment_descriptor(vcpu, tss_selector, + &nseg_desc); + } + + kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0(vcpu) | X86_CR0_TS); + seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg); + tr_seg.type = 11; + kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); +out: + return ret; } EXPORT_SYMBOL_GPL(kvm_task_switch); @@ -5627,19 +5252,20 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, { int mmu_reset_needed = 0; int pending_vec, max_bits; - struct kvm_desc_ptr dt; + struct descriptor_table dt; - dt.size = sregs->idt.limit; - dt.address = sregs->idt.base; + vcpu_load(vcpu); + + dt.limit = sregs->idt.limit; + dt.base = sregs->idt.base; kvm_x86_ops->set_idt(vcpu, &dt); - dt.size = sregs->gdt.limit; - dt.address = sregs->gdt.base; + dt.limit = sregs->gdt.limit; + dt.base = sregs->gdt.base; kvm_x86_ops->set_gdt(vcpu, &dt); vcpu->arch.cr2 = sregs->cr2; - mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3; + mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; vcpu->arch.cr3 = sregs->cr3; - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); kvm_set_cr8(vcpu, sregs->cr8); @@ -5653,10 +5279,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; kvm_x86_ops->set_cr4(vcpu, sregs->cr4); - if (sregs->cr4 & X86_CR4_OSXSAVE) - update_cpuid(vcpu); if (!is_long_mode(vcpu) && is_pae(vcpu)) { - load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); + load_pdptrs(vcpu, vcpu->arch.cr3); mmu_reset_needed = 1; } @@ -5691,7 +5315,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, !is_protmode(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; - kvm_make_request(KVM_REQ_EVENT, vcpu); + vcpu_put(vcpu); return 0; } @@ -5702,10 +5326,12 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, unsigned long rflags; int i, r; + vcpu_load(vcpu); + if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) { r = -EBUSY; if (vcpu->arch.exception.pending) - 
goto out; + goto unlock_out; if (dbg->control & KVM_GUESTDBG_INJECT_DB) kvm_queue_exception(vcpu, DB_VECTOR); else @@ -5733,9 +5359,11 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK); } - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) - vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) + - get_segment_base(vcpu, VCPU_SREG_CS); + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { + vcpu->arch.singlestep_cs = + get_segment_selector(vcpu, VCPU_SREG_CS); + vcpu->arch.singlestep_rip = kvm_rip_read(vcpu); + } /* * Trigger an rflags update that will inject or remove the trace @@ -5747,12 +5375,34 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, r = 0; -out: +unlock_out: + vcpu_put(vcpu); return r; } /* + * fxsave fpu state. Taken from x86_64/processor.h. To be killed when + * we have asm/x86/processor.h + */ +struct fxsave { + u16 cwd; + u16 swd; + u16 twd; + u16 fop; + u64 rip; + u64 rdp; + u32 mxcsr; + u32 mxcsr_mask; + u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ +#ifdef CONFIG_X86_64 + u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ +#else + u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ +#endif +}; + +/* * Translate a guest virtual address to a guest physical address. */ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, @@ -5762,21 +5412,24 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, gpa_t gpa; int idx; - idx = srcu_read_lock(&vcpu->kvm->srcu); + vcpu_load(vcpu); + idx = kvm_srcu_read_lock(&vcpu->kvm->srcu); gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL); - srcu_read_unlock(&vcpu->kvm->srcu, idx); + kvm_srcu_read_unlock(&vcpu->kvm->srcu, idx); tr->physical_address = gpa; tr->valid = gpa != UNMAPPED_GVA; tr->writeable = 1; tr->usermode = 0; + vcpu_put(vcpu); return 0; } int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - struct kvm_i387_fxsave_struct *fxsave = - &vcpu->arch.guest_fpu.state->fxsave; + struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; + + vcpu_load(vcpu); memcpy(fpu->fpr, fxsave->st_space, 128); fpu->fcw = fxsave->cwd; @@ -5787,13 +5440,16 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) fpu->last_dp = fxsave->rdp; memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); + vcpu_put(vcpu); + return 0; } int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - struct kvm_i387_fxsave_struct *fxsave = - &vcpu->arch.guest_fpu.state->fxsave; + struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; + + vcpu_load(vcpu); memcpy(fxsave->st_space, fpu->fpr, 128); fxsave->cwd = fpu->fcw; @@ -5804,63 +5460,61 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) fxsave->rdp = fpu->last_dp; memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); + vcpu_put(vcpu); + return 0; } -int fx_init(struct kvm_vcpu *vcpu) +void fx_init(struct kvm_vcpu *vcpu) { - int err; - - err = kvm_fpu_alloc(&vcpu->arch.guest_fpu); - if (err) - return err; - - kvm_fpu_finit(&vcpu->arch.guest_fpu); + unsigned after_mxcsr_mask; /* - * Ensure guest xcr0 is valid for loading + * Touch the fpu the first time in non atomic context as if + * this is the first fpu instruction the exception handler + * will fire before the instruction returns and it'll have to + * allocate ram with GFP_KERNEL. 
*/ - vcpu->arch.xcr0 = XSTATE_FP; + if (!used_math()) + kvm_fx_save(&vcpu->arch.host_fx_image); - vcpu->arch.cr0 |= X86_CR0_ET; + /* Initialize guest FPU by resetting ours and saving into guest's */ + preempt_disable(); + kvm_fx_save(&vcpu->arch.host_fx_image); + kvm_fx_finit(); + kvm_fx_save(&vcpu->arch.guest_fx_image); + kvm_fx_restore(&vcpu->arch.host_fx_image); + preempt_enable(); - return 0; + vcpu->arch.cr0 |= X86_CR0_ET; + after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); + vcpu->arch.guest_fx_image.mxcsr = 0x1f80; + memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask, + 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask); } EXPORT_SYMBOL_GPL(fx_init); -static void fx_free(struct kvm_vcpu *vcpu) -{ - kvm_fpu_free(&vcpu->arch.guest_fpu); -} - void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { if (vcpu->guest_fpu_loaded) return; - /* - * Restore all possible states in the guest, - * and assume host would use all available bits. - * Guest xcr0 would be loaded later. - */ - kvm_put_guest_xcr0(vcpu); vcpu->guest_fpu_loaded = 1; - unlazy_fpu(current); - kvm_fpu_restore_checking(&vcpu->arch.guest_fpu); + kvm_fx_save(&vcpu->arch.host_fx_image); + kvm_fx_restore(&vcpu->arch.guest_fx_image); trace_kvm_fpu(1); } void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { - kvm_put_guest_xcr0(vcpu); - if (!vcpu->guest_fpu_loaded) return; vcpu->guest_fpu_loaded = 0; - kvm_fpu_save_init(&vcpu->arch.guest_fpu); + kvm_fx_save(&vcpu->arch.guest_fx_image); + kvm_fx_restore(&vcpu->arch.host_fx_image); ++vcpu->stat.fpu_reload; - kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); + set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests); trace_kvm_fpu(0); } @@ -5871,18 +5525,12 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) vcpu->arch.time_page = NULL; } - free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); - fx_free(vcpu); kvm_x86_ops->vcpu_free(vcpu); } struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { - if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) - printk_once(KERN_WARNING - "kvm: SMP vm created on host with unstable TSC; " - "guest TSC will not be reliable\n"); return kvm_x86_ops->vcpu_create(kvm, id); } @@ -5890,6 +5538,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { int r; + /* We do fxsave: this must be aligned. */ + BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF); + vcpu->arch.mtrr_state.have_fixed = 1; vcpu_load(vcpu); r = kvm_arch_vcpu_reset(vcpu); @@ -5907,13 +5558,10 @@ free_vcpu: void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { - vcpu->arch.apf.msr_val = 0; - vcpu_load(vcpu); kvm_mmu_unload(vcpu); vcpu_put(vcpu); - fx_free(vcpu); kvm_x86_ops->vcpu_free(vcpu); } @@ -5927,27 +5575,22 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) vcpu->arch.dr6 = DR6_FIXED_1; vcpu->arch.dr7 = DR7_FIXED_1; - kvm_make_request(KVM_REQ_EVENT, vcpu); - vcpu->arch.apf.msr_val = 0; - - kvm_clear_async_pf_completion_queue(vcpu); - kvm_async_pf_hash_reset(vcpu); - vcpu->arch.apf.halted = false; - return kvm_x86_ops->vcpu_reset(vcpu); } int kvm_arch_hardware_enable(void *garbage) { - struct kvm *kvm; - struct kvm_vcpu *vcpu; - int i; + /* + * Since this may be called from a hotplug notifcation, + * we can't get the CPU frequency directly. 
+ */ + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { + int cpu = raw_smp_processor_id(); + per_cpu(cpu_tsc_khz, cpu) = 0; + } kvm_shared_msr_cpu_online(); - list_for_each_entry(kvm, &vm_list, vm_list) - kvm_for_each_vcpu(i, vcpu, kvm) - if (vcpu->cpu == smp_processor_id()) - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + return kvm_x86_ops->hardware_enable(garbage); } @@ -5981,11 +5624,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) BUG_ON(vcpu->kvm == NULL); kvm = vcpu->kvm; - vcpu->arch.emulate_ctxt.ops = &emulate_ops; - vcpu->arch.walk_mmu = &vcpu->arch.mmu; vcpu->arch.mmu.root_hpa = INVALID_PAGE; - vcpu->arch.mmu.translate_gpa = translate_gpa; - vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; else @@ -5998,9 +5637,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) } vcpu->arch.pio_data = page_address(page); - if (!kvm->arch.virtual_tsc_khz) - kvm_arch_set_tsc_khz(kvm, max_tsc_khz); - r = kvm_mmu_create(vcpu); if (r < 0) goto fail_free_pio_data; @@ -6019,14 +5655,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) } vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; - if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) - goto fail_free_mce_banks; - - kvm_async_pf_hash_reset(vcpu); - return 0; -fail_free_mce_banks: - kfree(vcpu->arch.mce_banks); fail_free_lapic: kvm_free_lapic(vcpu); fail_mmu_destroy: @@ -6043,23 +5672,34 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) kfree(vcpu->arch.mce_banks); kvm_free_lapic(vcpu); - idx = srcu_read_lock(&vcpu->kvm->srcu); + idx = kvm_srcu_read_lock(&vcpu->kvm->srcu); kvm_mmu_destroy(vcpu); - srcu_read_unlock(&vcpu->kvm->srcu, idx); + kvm_srcu_read_unlock(&vcpu->kvm->srcu, idx); free_page((unsigned long)vcpu->arch.pio_data); } -int kvm_arch_init_vm(struct kvm *kvm) +struct kvm *kvm_arch_create_vm(void) { + struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); + + if (!kvm) + return ERR_PTR(-ENOMEM); + + kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); + if (!kvm->arch.aliases) { + kfree(kvm); + return ERR_PTR(-ENOMEM); + } + INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); - spin_lock_init(&kvm->arch.tsc_write_lock); + rdtscll(kvm->arch.vm_init_tsc); - return 0; + return kvm; } static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) @@ -6077,10 +5717,8 @@ static void kvm_free_vcpus(struct kvm *kvm) /* * Unpin any mmu pages first. 
*/ - kvm_for_each_vcpu(i, vcpu, kvm) { - kvm_clear_async_pf_completion_queue(vcpu); + kvm_for_each_vcpu(i, vcpu, kvm) kvm_unload_vcpu_mmu(vcpu); - } kvm_for_each_vcpu(i, vcpu, kvm) kvm_arch_vcpu_free(vcpu); @@ -6095,19 +5733,23 @@ static void kvm_free_vcpus(struct kvm *kvm) void kvm_arch_sync_events(struct kvm *kvm) { kvm_free_all_assigned_devices(kvm); - kvm_free_pit(kvm); } void kvm_arch_destroy_vm(struct kvm *kvm) { kvm_iommu_unmap_guest(kvm); + kvm_free_pit(kvm); kfree(kvm->arch.vpic); kfree(kvm->arch.vioapic); kvm_free_vcpus(kvm); + kvm_free_physmem(kvm); if (kvm->arch.apic_access_page) put_page(kvm->arch.apic_access_page); if (kvm->arch.ept_identity_pagetable) put_page(kvm->arch.ept_identity_pagetable); + kvm_cleanup_srcu_struct(&kvm->srcu); + kfree(kvm->arch.aliases); + kfree(kvm); } int kvm_arch_prepare_memory_region(struct kvm *kvm, @@ -6117,11 +5759,6 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, int user_alloc) { int npages = memslot->npages; - int map_flags = MAP_PRIVATE | MAP_ANONYMOUS; - - /* Prevent internal slot pages from being moved by fork()/COW. */ - if (memslot->id >= KVM_MEMORY_SLOTS) - map_flags = MAP_SHARED | MAP_ANONYMOUS; /*To keep backward compatibility with older userspace, *x86 needs to hanlde !user_alloc case. @@ -6134,7 +5771,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, userspace_addr = do_mmap(NULL, 0, npages * PAGE_SIZE, PROT_READ | PROT_WRITE, - map_flags, + MAP_PRIVATE | MAP_ANONYMOUS, 0); up_write(¤t->mm->mmap_sem); @@ -6188,9 +5825,7 @@ void kvm_arch_flush_shadow(struct kvm *kvm) int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { - return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && - !vcpu->arch.apf.halted) - || !list_empty_careful(&vcpu->async_pf.done) + return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED || vcpu->arch.nmi_pending || (kvm_arch_interrupt_allowed(vcpu) && @@ -6209,7 +5844,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) me = get_cpu(); if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) - if (atomic_xchg(&vcpu->guest_mode, 0)) + if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests)) kvm_smp_send_reschedule(cpu); put_cpu(); } @@ -6219,22 +5854,13 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) return kvm_x86_ops->interrupt_allowed(vcpu); } -bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip) -{ - unsigned long current_rip = kvm_rip_read(vcpu) + - get_segment_base(vcpu, VCPU_SREG_CS); - - return current_rip == linear_rip; -} -EXPORT_SYMBOL_GPL(kvm_is_linear_rip); - unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu) { unsigned long rflags; rflags = kvm_x86_ops->get_rflags(vcpu); if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) - rflags &= ~X86_EFLAGS_TF; + rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF); return rflags; } EXPORT_SYMBOL_GPL(kvm_get_rflags); @@ -6242,154 +5868,14 @@ EXPORT_SYMBOL_GPL(kvm_get_rflags); void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && - kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) - rflags |= X86_EFLAGS_TF; + vcpu->arch.singlestep_cs == + get_segment_selector(vcpu, VCPU_SREG_CS) && + vcpu->arch.singlestep_rip == kvm_rip_read(vcpu)) + rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF; kvm_x86_ops->set_rflags(vcpu, rflags); - kvm_make_request(KVM_REQ_EVENT, vcpu); } EXPORT_SYMBOL_GPL(kvm_set_rflags); -void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) -{ - int r; - - if 
((vcpu->arch.mmu.direct_map != work->arch.direct_map) || - is_error_page(work->page)) - return; - - r = kvm_mmu_reload(vcpu); - if (unlikely(r)) - return; - - if (!vcpu->arch.mmu.direct_map && - work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu)) - return; - - vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true); -} - -static inline u32 kvm_async_pf_hash_fn(gfn_t gfn) -{ - return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU)); -} - -static inline u32 kvm_async_pf_next_probe(u32 key) -{ - return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1); -} - -static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - u32 key = kvm_async_pf_hash_fn(gfn); - - while (vcpu->arch.apf.gfns[key] != ~0) - key = kvm_async_pf_next_probe(key); - - vcpu->arch.apf.gfns[key] = gfn; -} - -static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - int i; - u32 key = kvm_async_pf_hash_fn(gfn); - - for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) && - (vcpu->arch.apf.gfns[key] != gfn && - vcpu->arch.apf.gfns[key] != ~0); i++) - key = kvm_async_pf_next_probe(key); - - return key; -} - -bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn; -} - -static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - u32 i, j, k; - - i = j = kvm_async_pf_gfn_slot(vcpu, gfn); - while (true) { - vcpu->arch.apf.gfns[i] = ~0; - do { - j = kvm_async_pf_next_probe(j); - if (vcpu->arch.apf.gfns[j] == ~0) - return; - k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]); - /* - * k lies cyclically in ]i,j] - * | i.k.j | - * |....j i.k.| or |.k..j i...| - */ - } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j)); - vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j]; - i = j; - } -} - -static int apf_put_user(struct kvm_vcpu *vcpu, u32 val) -{ - - return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val, - sizeof(val)); -} - -void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, - struct kvm_async_pf *work) -{ - struct x86_exception fault; - - trace_kvm_async_pf_not_present(work->arch.token, work->gva); - kvm_add_async_pf_gfn(vcpu, work->arch.gfn); - - if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) || - (vcpu->arch.apf.send_user_only && - kvm_x86_ops->get_cpl(vcpu) == 0)) - kvm_make_request(KVM_REQ_APF_HALT, vcpu); - else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) { - fault.vector = PF_VECTOR; - fault.error_code_valid = true; - fault.error_code = 0; - fault.nested_page_fault = false; - fault.address = work->arch.token; - kvm_inject_page_fault(vcpu, &fault); - } -} - -void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, - struct kvm_async_pf *work) -{ - struct x86_exception fault; - - trace_kvm_async_pf_ready(work->arch.token, work->gva); - if (is_error_page(work->page)) - work->arch.token = ~0; /* broadcast wakeup */ - else - kvm_del_async_pf_gfn(vcpu, work->arch.gfn); - - if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) && - !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) { - fault.vector = PF_VECTOR; - fault.error_code_valid = true; - fault.error_code = 0; - fault.nested_page_fault = false; - fault.address = work->arch.token; - kvm_inject_page_fault(vcpu, &fault); - } - vcpu->arch.apf.halted = false; -} - -bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) -{ - if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED)) - return true; - else - return !kvm_event_needs_reinjection(vcpu) && - kvm_x86_ops->interrupt_allowed(vcpu); -} - 
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
@@ -6401,4 +5887,3 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
-EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
diff --git a/linux/x86/x86.h b/linux/x86/x86.h
index c600da8..2d10163 100644
--- a/linux/x86/x86.h
+++ b/linux/x86/x86.h
@@ -50,11 +50,6 @@ static inline int is_long_mode(struct kvm_vcpu *vcpu)
 #endif
 }
 
-static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
-{
-	return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
-}
-
 static inline int is_pae(struct kvm_vcpu *vcpu)
 {
 	return kvm_read_cr4_bits(vcpu, X86_CR4_PAE);
@@ -70,15 +65,4 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
 	return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
 }
 
-static inline u32 bit(int bitno)
-{
-	return 1 << (bitno & 31);
-}
-
-void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
-void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
-int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq);
-
-void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data);
-
 #endif
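
The kvmclock_cpufreq_notifier hunk near the top of the x86.c diff keeps the 2.6.34-era filter: the notifier bails out before a frequency decrease and after a frequency increase, and only in the remaining two cases writes freq->new into per_cpu(cpu_tsc_khz, freq->cpu). A minimal user-space restatement of that predicate (the transition phases are modeled as a local enum rather than the kernel's cpufreq constants) is:

#include <stdbool.h>
#include <stdio.h>

/* Local stand-ins for the kernel's CPUFREQ_PRECHANGE/CPUFREQ_POSTCHANGE. */
enum transition { PRECHANGE, POSTCHANGE };

/*
 * Act before a frequency increase and after a frequency decrease, so the
 * recorded khz value never falls below what the CPU is actually running at.
 */
static bool should_update_tsc_khz(enum transition t,
				  unsigned int old_khz, unsigned int new_khz)
{
	if (t == PRECHANGE && old_khz > new_khz)
		return false;	/* about to slow down: handle it at POSTCHANGE */
	if (t == POSTCHANGE && old_khz < new_khz)
		return false;	/* just sped up: PRECHANGE already handled it */
	return true;
}

int main(void)
{
	/* 2.4 GHz -> 1.6 GHz transition: skip PRECHANGE, act on POSTCHANGE. */
	printf("%d %d\n",
	       should_update_tsc_khz(PRECHANGE, 2400000, 1600000),
	       should_update_tsc_khz(POSTCHANGE, 2400000, 1600000));
	return 0;
}

Keeping the recorded frequency at or above the real one during the transition is presumably what keeps guest time derived from it from appearing to run backwards.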
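
cpuid_maxphyaddr() is simplified in this diff to read CPUID leaf 0x80000008 directly and fall back to 36 bits; the dropped lines used to check leaf 0x80000000 first to make sure the extended leaf exists at all. The same lookup can be done for the host with GCC/Clang's <cpuid.h>, whose __get_cpuid() already returns 0 for an out-of-range leaf, so the guard comes back for free:

#include <cpuid.h>
#include <stdio.h>

/* Host-side analogue of cpuid_maxphyaddr(): physical address width from
 * CPUID.80000008H:EAX[7:0], defaulting to 36 bits when unavailable. */
static unsigned int host_maxphyaddr(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx))
		return eax & 0xff;
	return 36;
}

int main(void)
{
	printf("MAXPHYADDR = %u bits\n", host_maxphyaddr());
	return 0;
}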
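
Several hunks in vcpu_enter_guest() and kvm_vcpu_kick() replace the newer kvm_check_request()/kvm_make_request() helpers with open-coded test_and_clear_bit()/set_bit() on vcpu->requests, and reuse a KVM_REQ_KICK bit as the "in guest mode" marker. The pattern is atomic bit operations on a shared word; the sketch below restates it with C11 atomics (the bit helpers and request values are illustrative stand-ins, not the kernel's bitops, and seq_cst atomics stand in for the explicit smp_mb__after_clear_bit() barrier):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Request bits; the values are illustrative, not the kernel's KVM_REQ_*. */
enum { REQ_TLB_FLUSH = 0, REQ_KVMCLOCK_UPDATE = 1, REQ_KICK = 2 };

static _Atomic unsigned long requests;

/* Stand-ins for set_bit()/test_and_*_bit() on vcpu->requests. */
static void set_request(int nr)
{
	atomic_fetch_or(&requests, 1UL << nr);
}

static bool test_and_set_request(int nr)
{
	return atomic_fetch_or(&requests, 1UL << nr) & (1UL << nr);
}

static bool test_and_clear_request(int nr)
{
	return atomic_fetch_and(&requests, ~(1UL << nr)) & (1UL << nr);
}

/* Another thread queues work; it only "sends an IPI" when REQ_KICK was
 * still clear, i.e. the vCPU looked like it was inside the guest. */
static void make_request_and_kick(int nr)
{
	set_request(nr);
	if (!test_and_set_request(REQ_KICK))
		printf("would send reschedule IPI\n");
}

/* The vCPU loop drains requests, clears REQ_KICK to mark itself as "in
 * guest mode", and backs out if anything raced in before the entry. */
static void vcpu_enter_guest_sketch(void)
{
	if (test_and_clear_request(REQ_KVMCLOCK_UPDATE))
		printf("refresh kvmclock\n");
	if (test_and_clear_request(REQ_TLB_FLUSH))
		printf("flush TLB\n");

	test_and_clear_request(REQ_KICK);
	if (atomic_load(&requests)) {
		set_request(REQ_KICK);	/* a late request arrived: retry later */
		return;
	}

	printf("enter guest\n");
	set_request(REQ_KICK);		/* back from the guest */
}

int main(void)
{
	make_request_and_kick(REQ_TLB_FLUSH);
	vcpu_enter_guest_sketch();
	return 0;
}

Roughly, a kicker either finds REQ_KICK already set (the vCPU is outside the guest and will notice the request on its next entry) or wins the race, sets it, and sends the IPI.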
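
The restored task-switch path does its own segment decoding: get_segment_descriptor_dtable() picks the GDT or LDT from the selector's table-indicator bit, load_guest_segment_descriptor() rejects selectors whose 8-byte descriptor would run past the table limit, seg_desct_to_kvm_desct() scales page-granular limits, and kvm_load_realmode_segment() falls back to base = selector << 4 outside protected mode. A user-space sketch of just that arithmetic (types and names are local to the example):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Selector layout: bits 1:0 RPL, bit 2 table indicator (0 = GDT, 1 = LDT),
 * bits 15:3 index into the descriptor table. */
static bool selector_uses_ldt(uint16_t sel)
{
	return sel & (1 << 2);
}

static unsigned int selector_index(uint16_t sel)
{
	return sel >> 3;
}

/* Bounds check done before fetching a descriptor: the whole 8-byte entry
 * must lie below the table limit, otherwise the guest gets #GP(selector). */
static bool descriptor_in_bounds(uint16_t sel, uint32_t dtable_limit)
{
	return dtable_limit >= selector_index(sel) * 8u + 7u;
}

/* Limit scaling: with the granularity bit set, the 20-bit raw limit counts
 * 4 KiB pages, so shift by 12 and fill in the low bits. */
static uint32_t effective_limit(uint32_t raw_limit, bool g)
{
	return g ? (raw_limit << 12) | 0xfff : raw_limit;
}

/* Real-mode / VM86 fallback: the base is simply the selector times 16. */
static uint32_t realmode_base(uint16_t sel)
{
	return (uint32_t)sel << 4;
}

int main(void)
{
	uint16_t sel = 0x0028;	/* index 5, GDT, RPL 0 */

	printf("ldt=%d index=%u in_bounds=%d\n", selector_uses_ldt(sel),
	       selector_index(sel), descriptor_in_bounds(sel, 0x7f));
	printf("limit=%#x base=%#x\n",
	       (unsigned int)effective_limit(0xfffff, true),
	       (unsigned int)realmode_base(0xb800));
	return 0;
}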
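
kvm_task_switch() and its 16/32-bit helpers reappear here with a fair amount of TSS bookkeeping that is easy to lose in the diff: the busy bit (descriptor type bit 1) of the old and new TSS descriptors and the NT flag in EFLAGS are handled differently for IRET, JMP, and CALL/gate switches, and the back link to the old TSS is only written for CALL/gate. A compact restatement of just those rules (the reason codes are local stand-ins for the kernel's TASK_SWITCH_* values):

#include <stdbool.h>
#include <stdio.h>

enum ts_reason { TS_IRET, TS_JMP, TS_CALL, TS_GATE };	/* stand-ins */

struct ts_effects {
	bool clear_old_busy;	/* clear busy bit in the old TSS descriptor  */
	bool set_new_busy;	/* set busy bit in the new TSS descriptor    */
	bool link_back;		/* store old TSS selector as the back link   */
	int  nt;		/* -1 clear NT, +1 set NT, 0 leave alone     */
};

/* Mirrors the reason handling in the restored kvm_task_switch(). */
static struct ts_effects task_switch_effects(enum ts_reason r)
{
	struct ts_effects e = {
		.clear_old_busy = (r == TS_IRET || r == TS_JMP),
		.set_new_busy   = (r != TS_IRET),
		.link_back      = (r == TS_CALL || r == TS_GATE),
		.nt             = (r == TS_IRET) ? -1 :
				  (r == TS_CALL || r == TS_GATE) ? +1 : 0,
	};
	return e;
}

int main(void)
{
	struct ts_effects e = task_switch_effects(TS_CALL);

	printf("call: link_back=%d set_new_busy=%d nt=%+d\n",
	       e.link_back, e.set_new_busy, e.nt);
	return 0;
}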
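
The large block removed at the end of x86.c (kvm_add_async_pf_gfn() and friends) is a small open-addressed hash of guest frame numbers: linear probing for insert and lookup, and tombstone-free deletion via the classic back-shift fix-up. The sketch below reproduces that probe and delete logic in user space; the table size and the multiplicative hash are stand-ins for ASYNC_PF_PER_VCPU and the kernel's hash_32():

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TABLE_BITS 6			/* stand-in for order_base_2(ASYNC_PF_PER_VCPU) */
#define TABLE_SIZE (1u << TABLE_BITS)
#define EMPTY_GFN  (~0ULL)		/* same "empty slot" sentinel as the removed code */

static uint64_t gfns[TABLE_SIZE];

static uint32_t hash_gfn(uint64_t gfn)
{
	/* Stand-in for hash_32(gfn & 0xffffffff, TABLE_BITS). */
	return ((uint32_t)gfn * 2654435761u) >> (32 - TABLE_BITS);
}

static uint32_t next_probe(uint32_t key)
{
	return (key + 1) & (TABLE_SIZE - 1);
}

static void add_gfn(uint64_t gfn)
{
	uint32_t key = hash_gfn(gfn);

	while (gfns[key] != EMPTY_GFN)
		key = next_probe(key);
	gfns[key] = gfn;
}

static uint32_t gfn_slot(uint64_t gfn)
{
	uint32_t key = hash_gfn(gfn);
	uint32_t i;

	for (i = 0; i < TABLE_SIZE &&
		    gfns[key] != gfn && gfns[key] != EMPTY_GFN; i++)
		key = next_probe(key);
	return key;
}

static bool find_gfn(uint64_t gfn)
{
	return gfns[gfn_slot(gfn)] == gfn;
}

/* Deletion without tombstones: after emptying slot i, walk the probe chain
 * and pull back any entry whose home slot k lies cyclically outside (i, j],
 * the same fix-up loop the removed kvm_del_async_pf_gfn() used. */
static void del_gfn(uint64_t gfn)
{
	uint32_t i, j, k;

	i = j = gfn_slot(gfn);
	for (;;) {
		gfns[i] = EMPTY_GFN;
		do {
			j = next_probe(j);
			if (gfns[j] == EMPTY_GFN)
				return;
			k = hash_gfn(gfns[j]);
		} while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
		gfns[i] = gfns[j];
		i = j;
	}
}

int main(void)
{
	uint32_t n;

	for (n = 0; n < TABLE_SIZE; n++)
		gfns[n] = EMPTY_GFN;

	add_gfn(0x1000);
	add_gfn(0x2000);
	del_gfn(0x1000);
	printf("0x1000 present: %d\n", find_gfn(0x1000));	/* 0 */
	printf("0x2000 present: %d\n", find_gfn(0x2000));	/* 1 */
	return 0;
}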